varaq-wasm-c/tokenizer.c

#include "tokenizer.h"
#include "common.h"

/*
 Adapted from Section 6.6 of The C Programming Language
 by Brian Kernighan and Dennis Ritchie
*/
/* One chained entry of the keyword table: a keyword string, the token type
   it maps to, and the link to the next entry in the same bucket. */
typedef struct Map
{
  struct Map* next; /* next entry in this bucket's chain, or NULL at the end */
  char* keyword;    /* keyword text; put() stores a strdup'd copy here */
  TokenType token;  /* token type associated with the keyword */
} Map;

/* Number of buckets in the keyword hash table. */
#define HASHSIZE 150
/* Bucket heads of the keyword table; each bucket is a singly linked chain of
   Map entries. Static storage, so every bucket starts out as NULL. */
static Map* hashtab[HASHSIZE];

/*
 Computes the bucket index for the NUL-terminated string s: every character
 is folded into a running value that is multiplied by 31 at each step, and
 the final value is reduced modulo HASHSIZE.
*/
unsigned int
hash(char* s)
{
  unsigned int acc = 0;
  while (*s != '\0') {
    acc = *s + 31 * acc;
    s++;
  }
  return acc % HASHSIZE;
}

/*
 Finds the table entry whose keyword equals s.
 Returns the entry, or NULL when s is not in the table.
*/
Map*
lookup(char* s)
{
  Map* entry = hashtab[hash(s)];
  while (entry != NULL) {
    if (strcmp(s, entry->keyword) == 0)
      return entry;
    entry = entry->next;
  }
  return NULL;
}

/*
 Maps the string s to its token type.
 Returns the stored token when s is a registered keyword, and
 TOKEN_IDENTIFIER for every other string.
*/
TokenType
get(char* s)
{
  Map* entry = hashtab[hash(s)];
  // Walk the bucket chain until it ends or the keyword matches.
  while (entry != NULL && strcmp(s, entry->keyword) != 0)
    entry = entry->next;
  return entry != NULL ? entry->token : TOKEN_IDENTIFIER;
}

/*
 Registers keyword with the given token type.

 If the keyword is already present its token is overwritten; otherwise a new
 entry holding a private copy of the keyword is pushed onto the front of its
 bucket chain.

 Returns the entry, or NULL when memory for the entry or for the keyword copy
 could not be allocated (the table is left unchanged in that case).
*/
Map*
put(char* keyword, TokenType token)
{
  Map* np = lookup(keyword);
  if (np == NULL) {
    np = (Map*)malloc(sizeof(*np));
    if (np == NULL)
      return NULL;
    np->keyword = strdup(keyword);
    if (np->keyword == NULL) {
      // Release the half-built node instead of leaking it.
      free(np);
      return NULL;
    }
    unsigned int hashval = hash(keyword);
    np->next = hashtab[hashval];
    hashtab[hashval] = np;
  }
  np->token = token;
  return np;
}

void
initMap()
{
  put("and", TOKEN_AND);
  put("atan", TOKEN_ATAN);
  put("add", TOKEN_ADD);
  put("bep", TOKEN_COMPLAIN);
  put("complain", TOKEN_COMPLAIN);
  put("compl", TOKEN_COMPL);
  put("compose", TOKEN_COMPOSE);
  put("contradict", TOKEN_CONTRADICT);
  put("cons", TOKEN_CONS);
  put("cos", TOKEN_COS);
  put("chImmoH", TOKEN_CLEAR);
  put("chIm'a'", TOKEN_EMPTY);
  put("cher", TOKEN_SET);
  put("boq", TOKEN_ADD);
  put("choose", TOKEN_CHOOSE);
  put("chov", TOKEN_EVAL);
  put("chuv", TOKEN_MOD);
  put("cha'", TOKEN_DISP);
  put("clear", TOKEN_CLEAR);
  put("dup", TOKEN_DUP);
  put("dump", TOKEN_DUMP);
  put("disp", TOKEN_DISP);
  put("div", TOKEN_DIV);
  put("DuD", TOKEN_MIX);
  put("e", TOKEN_E);
  put("exch", TOKEN_EXCH);
  put("eval", TOKEN_EVAL);
  put("escape", TOKEN_ESCAPE);
  put("empty?", TOKEN_EMPTY);
  put("explode", TOKEN_EXPLODE);
  put("eq?", TOKEN_EQ);
  put("forget", TOKEN_FORGET);
  put("gt?", TOKEN_GT);
  put("ge?", TOKEN_GE);
  put("ghap", TOKEN_XOR);
  put("ghurmI'", TOKEN_E);
  put("ghurtaH", TOKEN_LN);
  put("ghorqu'", TOKEN_SHATTER);
  put("ghobe'chugh", TOKEN_IFNO);
  put("HIja'chugh", TOKEN_IFYES);
  put("Hotlh", TOKEN_DUMP);
  put("HeHmI'", TOKEN_PI);
  put("Habwav", TOKEN_IDIV);
  put("HabmI''a'", TOKEN_INT);
  put("idiv", TOKEN_IDIV);
  put("int?", TOKEN_INT);
  put("isolate", TOKEN_ISOLATE);
  put("ifyes", TOKEN_IFYES);
  put("ifno", TOKEN_IFNO);
  put("je", TOKEN_AND);
  put("jor", TOKEN_EXPLODE);
  put("joq", TOKEN_OR);
  put("ln", TOKEN_LN);
  put("lt?", TOKEN_LT);
  put("le?", TOKEN_LE);
  put("listen", TOKEN_LISTEN);
  put("loS'ar", TOKEN_SQRT);
  put("log", TOKEN_LOG);
  put("log3", TOKEN_LOG3);
  put("latlh", TOKEN_DUP);
  put("law'moH", TOKEN_MUL);
  put("law'qa'moH", TOKEN_POW);
  put("law''a'", TOKEN_GT);
  put("law'rap'a'", TOKEN_GE);
  put("maHghurtaH", TOKEN_LOG);
  put("mix", TOKEN_MIX);
  put("mi'moH", TOKEN_NUMBERIZE);
  put("muv", TOKEN_CONS);
  put("mul", TOKEN_MUL);
  put("mod", TOKEN_MOD);
  put("mobmoH", TOKEN_ISOLATE);
  put("mIScher", TOKEN_SETRAND);
  put("mIS", TOKEN_RAND);
  put("mI''a'", TOKEN_FLOAT);
  put("nIHghoS", TOKEN_SHIFTRIGHT);
  put("ne?", TOKEN_NE);
  put("negative?", TOKEN_NEGATIVE);
  put("name", TOKEN_NAME);
  put("nargh", TOKEN_ESCAPE);
  put("naQmoH", TOKEN_COMPOSE);
  put("number?", TOKEN_ISNUMBER);
  put("numberize", TOKEN_NUMBERIZE);
  put("null?", TOKEN_NULL);
  put("or", TOKEN_OR);
  put("pi", TOKEN_PI);
  put("pagh'a'", TOKEN_NULL);
  put("pop", TOKEN_POP);
  put("pong", TOKEN_NAME);
  put("pow", TOKEN_POW);
  put("poSghoS", TOKEN_SHIFTLEFT);
  put("puS'a'", TOKEN_LT);
  put("puSrap'a'", TOKEN_LE);
  put("qaw", TOKEN_REMEMBER);
  put("qawHa'", TOKEN_FORGET);
  put("qojmI'", TOKEN_TAN);
  put("qojHa'", TOKEN_ATAN);
  put("Qo'moH", TOKEN_COMPL);
  put("remember", TOKEN_REMEMBER);
  put("repeat", TOKEN_REPEAT);
  put("rand", TOKEN_RAND);
  put("rap'a'", TOKEN_EQ);
  put("rapbe'a'", TOKEN_NE);
  put("set", TOKEN_SET);
  put("split", TOKEN_SPLIT);
  put("shatter", TOKEN_SHATTER);
  put("strcut", TOKEN_STRCUT);
  put("streq?", TOKEN_STREQ);
  put("strmeasure", TOKEN_STRMEASURE);
  put("strtie", TOKEN_STRTIE);
  put("tlheghrar", TOKEN_STRTIE);
  put("sub", TOKEN_SUB);
  put("sub1", TOKEN_SUB1);
  put("sqrt", TOKEN_SQRT);
  put("sin", TOKEN_SIN);
  put("clip", TOKEN_CLIP);
  put("poD", TOKEN_CLIP);
  put("smooth", TOKEN_SMOOTH);
  put("Hab", TOKEN_SMOOTH);
  put("howmuch", TOKEN_HOWMUCH);
  put("'ar", TOKEN_HOWMUCH);
  put("setrand", TOKEN_SETRAND);
  put("shift right", TOKEN_SHIFTRIGHT);
  put("shift left", TOKEN_SHIFTLEFT);
  put("SIj", TOKEN_SPLIT);
  put("boqHa'", TOKEN_SUB);
  put("tam", TOKEN_EXCH);
  put("tan", TOKEN_TAN);
  put("taH'a'", TOKEN_NEGATIVE);
  put("tlhoch", TOKEN_CONTRADICT);
  put("tlheghpe'", TOKEN_STRCUT);
  put("tlheghjuv", TOKEN_STRMEASURE);
  put("tlheghrap'a'", TOKEN_STREQ);
  put("vangqa'", TOKEN_REPEAT);
  put("wIv", TOKEN_CHOOSE);
  put("woD", TOKEN_POP);
  put("wav", TOKEN_DIV);
  put("wa'teq", TOKEN_SUB1);
  put("wa'chel", TOKEN_ADD1);
  put("wejghurtaH", TOKEN_LOG3);
  put("xor", TOKEN_XOR);
  put("\'Ij", TOKEN_LISTEN);
  put("time", TOKEN_TIME);
  put("poH", TOKEN_TIME);
  // Wrong word in original spec, old one meant "waving hands or flapping"
  // Also fixes the conflicting joq issue meaning sin or 'or'
  put("yu'eghHa'", TOKEN_COS);
  put("yu'egh", TOKEN_SIN);
  // This one has a special case too as it is the same as the '~' operator
  put("lI'moH", TOKEN_TILDE);
  put("woDHa'", TOKEN_GARBAGE_COLLECT);
  put("gc", TOKEN_GARBAGE_COLLECT);
}

/* Scanner state: a window over the source text plus the current line. */
typedef struct Tokenizer
{
  char* start;   /* first character of the lexeme currently being scanned */
  char* current; /* next character to be consumed */
  int32_t line;  /* current line number; initTokenizer starts it at 1 */
} Tokenizer;

/* Scanner state read and updated by the tokenizing functions below. */
Tokenizer tokenizer;

/*
 Points the tokenizer at the beginning of src and resets the line counter
 to 1. The buffer is not copied; the tokenizer keeps pointers into it.
*/
void
initTokenizer(char* src)
{
  tokenizer.line = 1;
  tokenizer.current = tokenizer.start = src;
}

/*
 Reports whether c may appear in an identifier or keyword: an ASCII letter,
 an underscore, an apostrophe or a question mark.
*/
static bool
isAlpha(char c)
{
  if (c >= 'a' && c <= 'z')
    return true;
  if (c >= 'A' && c <= 'Z')
    return true;
  switch (c) {
    case '_':
    case '\'':
    case '?':
      return true;
    default:
      return false;
  }
}

/*
 Reports whether c is treated as part of a number: a decimal digit or a
 minus sign. Because '-' counts as a digit here, it is accepted at any
 position inside a number or identifier, not only in front.
*/
static bool
isDigit(char c)
{
  if (c == '-')
    return true;
  return c >= '0' && c <= '9';
}

/* True when the next character to consume is the terminating NUL. */
static bool
isAtEnd()
{
  return tokenizer.current[0] == '\0';
}

/*
 Builds a token of the given type covering the current lexeme, i.e. the
 characters from tokenizer.start up to (not including) tokenizer.current.
 The token points into the source buffer; nothing is copied.
*/
static Token
makeToken(TokenType type)
{
  Token result;
  result.start = tokenizer.start;
  result.length = (int32_t)(tokenizer.current - tokenizer.start);
  result.line = tokenizer.line;
  result.type = type;
  return result;
}

/*
 Builds a TOKEN_ERROR token whose text is the message msg instead of a slice
 of the source. The token stores the msg pointer itself, so msg must outlive
 the token.
*/
static Token
errorToken(char* msg)
{
  Token failure;
  failure.start = msg;
  failure.length = (int32_t)strlen(msg);
  failure.line = tokenizer.line;
  failure.type = TOKEN_ERROR;
  return failure;
}

/* Consumes one character of the source and returns it. */
static char
advance()
{
  char consumed = *tokenizer.current;
  tokenizer.current++;
  return consumed;
}

/* Returns the next character without consuming it. */
static char
peek()
{
  return tokenizer.current[0];
}

/*
 Returns the character after the next one without consuming anything, or
 '\0' when the tokenizer is already at the end of the source.
*/
static char
peekNext()
{
  return isAtEnd() ? '\0' : tokenizer.current[1];
}

/*
 Consumes the next character only if it equals expected.
 Returns true when a character was consumed, false otherwise (including at
 the end of the source).
*/
static bool
match(char expected)
{
  if (isAtEnd() || *tokenizer.current != expected)
    return false;
  tokenizer.current++;
  return true;
}

/*
 Advances past everything that does not start a token: spaces, tabs,
 carriage returns, newlines, line comments introduced by "//" and block
 comments delimited by "(*" and "*)". Every newline skipped, including those
 inside block comments, increments tokenizer.line.

 Returns with tokenizer.current on the first character of the next token, or
 on the terminating NUL. An unterminated block comment runs to the end of the
 source.
*/
static void
skipWhitespace()
{
  for (;;) {
    switch (peek()) {
      case ' ':
      case '\r':
      case '\t':
        advance();
        break;
      case '\n':
        tokenizer.line++;
        advance();
        break;
      case '/':
        if (peekNext() != '/')
          return; // a lone '/' is a token
        // Line comment: skip up to, but not including, the newline.
        while (peek() != '\n' && !isAtEnd())
          advance();
        break;
      case '(':
        // A '(' that does not open a comment is a token. Returning here is
        // essential: leaving the switch without consuming it would spin
        // forever on the same character.
        if (peekNext() != '*')
          return;
        advance(); // consume (
        advance(); // consume *
        // Skip until the two-character terminator "*)" is next.
        while (!isAtEnd() && !(peek() == '*' && peekNext() == ')')) {
          if (peek() == '\n')
            tokenizer.line++;
          advance();
        }
        // Only consume the terminator if it is really there, so that an
        // unterminated comment never steps past the end of the buffer.
        if (!isAtEnd()) {
          advance(); // consume *
          advance(); // consume )
        }
        break;
      default:
        return;
    }
  }
}

/*
 Tests whether the current lexeme is exactly start + length characters long
 and its characters from offset start onwards equal the length bytes at rest.
 Returns type on a match and TOKEN_IDENTIFIER otherwise.
*/
static TokenType
checkKeyword(int start, int length, char* rest, TokenType type)
{
  // Compare lengths first so memcmp never reads past the lexeme.
  if (tokenizer.current - tokenizer.start != start + length)
    return TOKEN_IDENTIFIER;
  if (memcmp(tokenizer.start + start, rest, length) != 0)
    return TOKEN_IDENTIFIER;
  return type;
}

/*
 Returns a freshly allocated, NUL-terminated copy of the current lexeme (the
 characters from tokenizer.start up to tokenizer.current).

 The caller owns the result and must free() it. Returns NULL when the
 allocation fails.
*/
static char*
currentTokenToS()
{
  size_t size = (size_t)(tokenizer.current - tokenizer.start);
  // Room for the lexeme plus its terminator. The buffer must be sized by the
  // lexeme length, not by sizeof of the length variable.
  char* str = (char*)malloc(size + 1);
  if (str == NULL)
    return NULL;
  memcpy(str, tokenizer.start, size);
  str[size] = '\0';
  return str;
}

/*
 Classifies the current lexeme: the token type of the matching keyword, or
 TOKEN_IDENTIFIER when the lexeme is not a keyword.
*/
static TokenType
identifierType()
{
  // get() needs a NUL-terminated string, so look up a temporary copy.
  char* lexeme = currentTokenToS();
  TokenType kind = get(lexeme);
  free(lexeme);
  return kind;
}

/*
 Scans the rest of an identifier or keyword whose first character has already
 been consumed, and returns the resulting token.
*/
static Token
identifier()
{
  for (;;) {
    char next = peek();
    if (!isAlpha(next) && !isDigit(next))
      break;
    advance();
  }
  return makeToken(identifierType());
}

/*
 Scans the rest of a numeric literal whose first character has already been
 consumed: a run of isDigit() characters, optionally followed by a '.' and a
 second run. The '.' is only consumed when an isDigit() character follows it.

 Every numeric literal is returned as TOKEN_FLOAT, with or without a
 fractional part.
*/
static Token
number()
{
  while (isDigit(peek()))
    advance();

  // Look for a fractional part.
  if (peek() == '.' && isDigit(peekNext())) {
    // Consume the ".".
    advance();

    while (isDigit(peek()))
      advance();
  }

  return makeToken(TOKEN_FLOAT); // or measure if ends in postscript
}

/*
 Scans the rest of a string literal whose opening quote has already been
 consumed. Newlines inside the literal are allowed and counted.

 Returns a TOKEN_STRING token spanning both quotes, or an error token when
 the source ends before the closing quote.
*/
static Token
string()
{
  for (;;) {
    if (isAtEnd())
      return errorToken("Unterminated string.");
    char ch = peek();
    if (ch == '"')
      break;
    if (ch == '\n')
      tokenizer.line++;
    advance();
  }

  // The closing quote.
  advance();
  return makeToken(TOKEN_STRING);
}

Token
nextToken()
{
  skipWhitespace();
  tokenizer.start = tokenizer.current;
  if (isAtEnd())
    return makeToken(TOKEN_EOF);

  char c = advance();
  if (isAlpha(c))
    return identifier();
  if (isDigit(c))
    return number();
  switch (c) {
    case '(':
      return makeToken(TOKEN_LEFT_PAREN);
    case ')':
      return makeToken(TOKEN_RIGHT_PAREN);
    case '{':
      return makeToken(TOKEN_LEFT_BRACE);
    case '}':
      return makeToken(TOKEN_RIGHT_BRACE);
    case '-':
      return makeToken(TOKEN_NEGATIVE);
    case '~':
      return makeToken(TOKEN_TILDE);
    case '/':
      return makeToken(TOKEN_SLASH);
    case '"':
      return string();
  }

  return errorToken("Unexpected character.");
}

void
debug_printToken(Token t)
{
  char* str = currentTokenToS();

  switch (t.type) {
    case TOKEN_LEFT_PAREN:
      printf("TOKEN_LEFT_PAREN %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_RIGHT_PAREN:
      printf("TOKEN_RIGHT_PAREN %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_LEFT_BRACE:
      printf("TOKEN_LEFT_BRACE %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_RIGHT_BRACE:
      printf("TOKEN_RIGHT_BRACE %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_TILDE:
      printf("TOKEN_TILDE %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_SLASH:
      printf("TOKEN_SLASH %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_MINUS:
      printf("TOKEN_MINUS %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_IDENTIFIER:
      printf("TOKEN_IDENTIFIER %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_STRING:
      printf("TOKEN_STRING %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_FLOAT:
      printf("TOKEN_FLOAT %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_LIST:
      printf("TOKEN_LIST %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_ERROR:
      printf("TOKEN_ERROR %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_FALSE:
      printf("TOKEN_FALSE %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_TRUE:
      printf("TOKEN_TRUE %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_PI:
      printf("TOKEN_PI %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_E:
      printf("TOKEN_E %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_EOF:
      printf("TOKEN_EOF %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_POP:
      printf("TOKEN_POP %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_DUP:
      printf("TOKEN_DUP %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_EXCH:
      printf("TOKEN_EXCH %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_CLEAR:
      printf("TOKEN_CLEAR %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_REMEMBER:
      printf("TOKEN_REMEMBER %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_FORGET:
      printf("TOKEN_FORGET %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_DUMP:
      printf("TOKEN_DUMP %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_NAME:
      printf("TOKEN_NAME %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_SET:
      printf("TOKEN_SET %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_IFYES:
      printf("TOKEN_IFYES %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_IFNO:
      printf("TOKEN_IFNO %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_CHOOSE:
      printf("TOKEN_CHOOSE %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_EVAL:
      printf("TOKEN_EVAL %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_ESCAPE:
      printf("TOKEN_ESCAPE %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_REPEAT:
      printf("TOKEN_REPEAT %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_SPLIT:
      printf("TOKEN_SPLIT %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_CONS:
      printf("TOKEN_CONS %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_SHATTER:
      printf("TOKEN_SHATTER %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_EMPTY:
      printf("TOKEN_EMPTY %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_COMPOSE:
      printf("TOKEN_COMPOSE %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_STREQ:
      printf("TOKEN_STREQ %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_STRCUT:
      printf("TOKEN_STRCUT %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_STRMEASURE:
      printf("TOKEN_STRMEASURE %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_STRTIE:
      printf("TOKEN_STRTIE %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_EXPLODE:
      printf("TOKEN_EXPLODE %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_ADD:
      printf("TOKEN_ADD %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_SUB:
      printf("TOKEN_SUB %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_MUL:
      printf("TOKEN_MUL %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_DIV:
      printf("TOKEN_DIV %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_IDIV:
      printf("TOKEN_IDIV %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_MOD:
      printf("TOKEN_MOD %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_POW:
      printf("TOKEN_POW %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_SQRT:
      printf("TOKEN_SQRT %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_ADD1:
      printf("TOKEN_ADD1 %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_SUB1:
      printf("TOKEN_SUB1 %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_SIN:
      printf("TOKEN_SIN %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_COS:
      printf("TOKEN_COS %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_TAN:
      printf("TOKEN_TAN %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_ATAN:
      printf("TOKEN_ATAN %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_LN:
      printf("TOKEN_LN %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_LOG:
      printf("TOKEN_LOG %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_LOG3:
      printf("TOKEN_LOG3 %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_CLIP:
      printf("TOKEN_CLIP %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_SMOOTH:
      printf("TOKEN_SMOOTH %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_HOWMUCH:
      printf("TOKEN_HOWMUCH %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_SETRAND:
      printf("TOKEN_SETRAND %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_RAND:
      printf("TOKEN_RAND %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_INT:
      printf("TOKEN_INT %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_NUMBERIZE:
      printf("TOKEN_NUMBERIZE %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_ISOLATE:
      printf("TOKEN_ISOLATE %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_MIX:
      printf("TOKEN_MIX %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_CONTRADICT:
      printf("TOKEN_CONTRADICT %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_COMPL:
      printf("TOKEN_COMPL %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_SHIFTRIGHT:
      printf("TOKEN_SHIFTRIGHT %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_SHIFTLEFT:
      printf("TOKEN_SHIFTLEFT %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_GT:
      printf("TOKEN_GT %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_LT:
      printf("TOKEN_LT %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_EQ:
      printf("TOKEN_EQ %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_GE:
      printf("TOKEN_GE %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_LE:
      printf("TOKEN_LE %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_NE:
      printf("TOKEN_NE %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_NULL:
      printf("TOKEN_NULL %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_NEGATIVE:
      printf("TOKEN_NEGATIVE %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_ISNULL:
      printf("TOKEN_ISNULL %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_ISINT:
      printf("TOKEN_ISINT %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_ISNUMBER:
      printf("TOKEN_ISNUMBER %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_AND:
      printf("TOKEN_AND %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_OR:
      printf("TOKEN_OR %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_XOR:
      printf("TOKEN_XOR %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_DISP:
      printf("TOKEN_DISP %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_LISTEN:
      printf("TOKEN_LISTEN %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_COMPLAIN:
      printf("TOKEN_COMPLAIN %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_TIME:
      printf("TOKEN_TIME %s line_no=%d\n", str, t.line);
      break;
    case TOKEN_GARBAGE_COLLECT:
      printf("TOKEN_GARBAGE_COLLECT %s line_no=%d\n", str, t.line);
      break;
  }
  free(str);
}