#include "tokenizer.h"
#include "common.h"

/*
 Adapted from Section 6.6 of The C Programming Language
 by Brian Kernighan and Dennis Ritchie
*/
/* One keyword entry of the chained hash table.  Entries whose keywords hash
   to the same bucket are linked together through `next`.  */
typedef struct Map Map;
struct Map
{
  struct Map *next; // next entry in the same bucket, NULL at the end of chain
  char *keyword;    // key string; put () stores a strdup'ed copy here
  TokenType token;  // token type reported for this keyword
};

// Number of buckets; hash () reduces every hash value modulo this.
#define HASHSIZE 150
// Bucket heads.  Static storage, so every bucket starts out as NULL.
static Map *hashtab[HASHSIZE];

/* Map a NUL-terminated string to a bucket index in [0, HASHSIZE).
   Every character is folded in as acc = c + 31 * acc, with unsigned
   wrap-around, before the final reduction modulo HASHSIZE.  */
unsigned int
hash (char *s)
{
  unsigned int acc = 0;
  while (*s != '\0')
    {
      acc = *s + 31 * acc;
      s++;
    }
  return acc % HASHSIZE;
}

/* Find the table entry whose keyword equals `s`.
   Returns the entry, or NULL when `s` is not in the table.  */
Map *
lookup (char *s)
{
  Map *entry = hashtab[hash (s)];
  // Walk the bucket chain until it ends or the keyword matches.
  while (entry != NULL && strcmp (s, entry->keyword) != 0)
    entry = entry->next;
  return entry;
}

/* Return the token type registered for `s`, or TOKEN_IDENTIFIER when `s` is
   not a known keyword.  */
TokenType
get (char *s)
{
  Map *entry = lookup (s);
  if (entry == NULL)
    return TOKEN_IDENTIFIER;
  return entry->token;
}

/* Register `keyword` with token type `token`.
   If the keyword is already present, only its token is overwritten;
   otherwise a new entry holding a private copy of the keyword is pushed onto
   the front of its bucket.
   Returns the entry, or NULL if memory could not be allocated (in which case
   the table is left unchanged).  */
Map *
put (char *keyword, TokenType token)
{
  Map *np = lookup (keyword);
  if (np == NULL)
    {
      np = malloc (sizeof *np);
      if (np == NULL)
        return NULL;

      np->keyword = strdup (keyword);
      if (np->keyword == NULL)
        {
          // Release the half-built node instead of leaking it.
          free (np);
          return NULL;
        }

      unsigned int hashval = hash (keyword);
      np->next = hashtab[hashval];
      hashtab[hashval] = np;
    }
  np->token = token;
  return np;
}

void
initMap ()
{
  put ("and", TOKEN_AND);
  put ("atan", TOKEN_ATAN);
  put ("add", TOKEN_ADD);
  put ("bep", TOKEN_COMPLAIN);
  put ("complain", TOKEN_COMPLAIN);
  put ("compl", TOKEN_COMPL);
  put ("compose", TOKEN_COMPOSE);
  put ("contradict", TOKEN_CONTRADICT);
  put ("cons", TOKEN_CONS);
  put ("cos", TOKEN_COS);
  put ("chImmoH", TOKEN_CLEAR);
  put ("chIm'a'", TOKEN_EMPTY);
  put ("cher", TOKEN_SET);
  put ("boq", TOKEN_ADD);
  put ("choose", TOKEN_CHOOSE);
  put ("chov", TOKEN_EVAL);
  put ("chuv", TOKEN_MOD);
  put ("cha'", TOKEN_DISP);
  put ("clear", TOKEN_CLEAR);
  put ("dup", TOKEN_DUP);
  put ("dump", TOKEN_DUMP);
  put ("disp", TOKEN_DISP);
  put ("div", TOKEN_DIV);
  put ("DuD", TOKEN_MIX);
  put ("e", TOKEN_E);
  put ("exch", TOKEN_EXCH);
  put ("eval", TOKEN_EVAL);
  put ("escape", TOKEN_ESCAPE);
  put ("empty?", TOKEN_EMPTY);
  put ("explode", TOKEN_EXPLODE);
  put ("eq?", TOKEN_EQ);
  put ("forget", TOKEN_FORGET);
  put ("gt?", TOKEN_GT);
  put ("ge?", TOKEN_GE);
  put ("ghap", TOKEN_XOR);
  put ("ghurmI'", TOKEN_E);
  put ("ghurtaH", TOKEN_LN);
  put ("ghorqu'", TOKEN_SHATTER);
  put ("ghobe'chugh", TOKEN_IFNO);
  put ("HIja'chugh", TOKEN_IFYES);
  put ("Hotlh", TOKEN_DUMP);
  put ("HeHmI'", TOKEN_PI);
  put ("Habwav", TOKEN_IDIV);
  put ("HabmI''a'", TOKEN_INT);
  put ("idiv", TOKEN_IDIV);
  put ("int?", TOKEN_INT);
  put ("isolate", TOKEN_ISOLATE);
  put ("ifyes", TOKEN_IFYES);
  put ("ifno", TOKEN_IFNO);
  put ("je", TOKEN_AND);
  put ("jor", TOKEN_EXPLODE);
  put ("joq", TOKEN_OR);
  put ("ln", TOKEN_LN);
  put ("lt?", TOKEN_LT);
  put ("le?", TOKEN_LE);
  put ("listen", TOKEN_LISTEN);
  put ("loS'ar", TOKEN_SQRT);
  put ("log", TOKEN_LOG);
  put ("log3", TOKEN_LOG3);
  put ("latlh", TOKEN_DUP);
  put ("law'moH", TOKEN_MUL);
  put ("law'qa'moH", TOKEN_POW);
  put ("law''a'", TOKEN_GT);
  put ("law'rap'a'", TOKEN_GE);
  put ("maHghurtaH", TOKEN_LOG);
  put ("mix", TOKEN_MIX);
  put ("mi'moH", TOKEN_NUMBERIZE);
  put ("muv", TOKEN_CONS);
  put ("mul", TOKEN_MUL);
  put ("mod", TOKEN_MOD);
  put ("mobmoH", TOKEN_ISOLATE);
  put ("mIScher", TOKEN_SETRAND);
  put ("mIS", TOKEN_RAND);
  put ("mI''a'", TOKEN_FLOAT);
  put ("nIHghoS", TOKEN_SHIFTRIGHT);
  put ("ne?", TOKEN_NE);
  put ("negative?", TOKEN_NEGATIVE);
  put ("name", TOKEN_NAME);
  put ("nargh", TOKEN_ESCAPE);
  put ("naQmoH", TOKEN_COMPOSE);
  put ("number?", TOKEN_ISNUMBER);
  put ("numberize", TOKEN_NUMBERIZE);
  put ("null?", TOKEN_NULL);
  put ("or", TOKEN_OR);
  put ("pi", TOKEN_PI);
  put ("pagh'a'", TOKEN_NULL);
  put ("pop", TOKEN_POP);
  put ("pong", TOKEN_NAME);
  put ("pow", TOKEN_POW);
  put ("poSghoS", TOKEN_SHIFTLEFT);
  put ("puS'a'", TOKEN_LT);
  put ("puSrap'a'", TOKEN_LE);
  put ("qaw", TOKEN_REMEMBER);
  put ("qawHa'", TOKEN_FORGET);
  put ("qojmI'", TOKEN_TAN);
  put ("qojHa'", TOKEN_ATAN);
  put ("Qo'moH", TOKEN_COMPL);
  put ("remember", TOKEN_REMEMBER);
  put ("repeat", TOKEN_REPEAT);
  put ("rand", TOKEN_RAND);
  put ("rap'a'", TOKEN_EQ);
  put ("rapbe'a'", TOKEN_NE);
  put ("set", TOKEN_SET);
  put ("split", TOKEN_SPLIT);
  put ("shatter", TOKEN_SHATTER);
  put ("strcut", TOKEN_STRCUT);
  put ("streq?", TOKEN_STREQ);
  put ("strmeasure", TOKEN_STRMEASURE);
  put ("strtie", TOKEN_STRTIE);
  put ("tlheghrar", TOKEN_STRTIE);
  put ("sub", TOKEN_SUB);
  put ("sub1", TOKEN_SUB1);
  put ("sqrt", TOKEN_SQRT);
  put ("sin", TOKEN_SIN);
  put ("clip", TOKEN_CLIP);
  put ("poD", TOKEN_CLIP);
  put ("smooth", TOKEN_SMOOTH);
  put ("Hab", TOKEN_SMOOTH);
  put ("howmuch", TOKEN_HOWMUCH);
  put ("'ar", TOKEN_HOWMUCH);
  put ("setrand", TOKEN_SETRAND);
  put ("shift right", TOKEN_SHIFTRIGHT);
  put ("shift left", TOKEN_SHIFTLEFT);
  put ("SIj", TOKEN_SPLIT);
  put ("boqHa'", TOKEN_SUB);
  put ("tam", TOKEN_EXCH);
  put ("tan", TOKEN_TAN);
  put ("taH'a'", TOKEN_NEGATIVE);
  put ("tlhoch", TOKEN_CONTRADICT);
  put ("tlheghpe'", TOKEN_STRCUT);
  put ("tlheghjuv", TOKEN_STRMEASURE);
  put ("tlheghrap'a'", TOKEN_STREQ);
  put ("vangqa'", TOKEN_REPEAT);
  put ("wIv", TOKEN_CHOOSE);
  put ("woD", TOKEN_POP);
  put ("wav", TOKEN_DIV);
  put ("wa'teq", TOKEN_SUB1);
  put ("wa'chel", TOKEN_ADD1);
  put ("wejghurtaH", TOKEN_LOG3);
  put ("xor", TOKEN_XOR);
  put ("\'Ij", TOKEN_LISTEN);
  put ("time", TOKEN_TIME);
  put ("poH", TOKEN_TIME);
  // Wrong word in original spec, old one meant "waving hands or flapping"
  // Also fixes the conflicting joq issue meaning sin or 'or'
  put ("yu'eghHa'", TOKEN_COS);
  put ("yu'egh", TOKEN_SIN);
  // This one has a special case too as it is the same as the '~' operator
  put ("lI'moH", TOKEN_TILDE);
  put ("woDHa'", TOKEN_GARBAGE_COLLECT);
  put ("gc", TOKEN_GARBAGE_COLLECT);
}

/* Scanner state over a NUL-terminated source buffer.  */
typedef struct Tokenizer Tokenizer;
struct Tokenizer
{
  char *start;   // first character of the token currently being scanned
  char *current; // next character to be consumed
  int32_t line;  // current line number; initTokenizer () starts it at 1
};

// The single scanner instance used by every function below.
Tokenizer tokenizer;

/* Point the global tokenizer at the beginning of `src` and reset the line
   counter to 1.  The buffer is not copied.  */
void
initTokenizer (char *src)
{
  tokenizer.line = 1;
  tokenizer.start = tokenizer.current = src;
}

/* True for characters that may appear in a word: ASCII letters, '_', the
   apostrophe and '?'.  */
static bool
isAlpha (char c)
{
  switch (c)
    {
    case '_':
    case '\'':
    case '?':
      return true;
    default:
      break;
    }
  if ('a' <= c && c <= 'z')
    return true;
  return 'A' <= c && c <= 'Z';
}

/* True for the decimal digits and also for '-', so a minus sign is scanned
   as part of a number.  */
static bool
isDigit (char c)
{
  if (c == '-')
    return true;
  return '0' <= c && c <= '9';
}

/* True once the scanner is sitting on the terminating NUL of the source.  */
static bool
isAtEnd ()
{
  return tokenizer.current[0] == '\0';
}

/* Build a token of the given type covering the source text between
   tokenizer.start and tokenizer.current, stamped with the current line.  */
static Token
makeToken (TokenType type)
{
  Token result;
  result.line = tokenizer.line;
  result.start = tokenizer.start;
  result.length = (int32_t)(tokenizer.current - tokenizer.start);
  result.type = type;
  return result;
}

/* Build a TOKEN_ERROR token whose text is the message itself rather than a
   slice of the source.  `msg` is stored by pointer, not copied.  */
static Token
errorToken (char *msg)
{
  Token result;
  result.line = tokenizer.line;
  result.length = (int32_t)strlen (msg);
  result.start = msg;
  result.type = TOKEN_ERROR;
  return result;
}

/* Consume one character and return it.  */
static char
advance ()
{
  return *tokenizer.current++;
}

/* Return the next unconsumed character without consuming it.  */
static char
peek ()
{
  return tokenizer.current[0];
}

/* Return the character after the next one without consuming anything, or
   '\0' when the scanner is already at the end of the source.  */
static char
peekNext ()
{
  return isAtEnd () ? '\0' : tokenizer.current[1];
}

/* Consume the next character only if it equals `expected`.
   Returns true when a character was consumed; always false at end of
   input.  */
static bool
match (char expected)
{
  if (isAtEnd () || *tokenizer.current != expected)
    return false;
  ++tokenizer.current;
  return true;
}

/* Advance past whitespace and comments so that tokenizer.current rests on
   the first character of the next token (or on the terminating NUL).

   Skipped: spaces, tabs, carriage returns, newlines, `//` lines (up to but
   not including the newline) and `(* ... *)` block comments.  Every newline
   skipped, including those inside block comments, increments
   tokenizer.line.  An unterminated block comment consumes the rest of the
   input and leaves the scanner on the NUL.  */
static void
skipWhitespace ()
{
  for (;;)
    {
      switch (peek ())
        {
        case '\n':
          tokenizer.line++;
          advance ();
          break;
        case ' ':
        case '\r':
        case '\t':
          advance ();
          break;
        case '/':
          // A lone '/' is a token; leave it for nextToken ().
          if (peekNext () != '/')
            return;
          // `//` line (used for the preprocessor import): ignore to end of
          // line.  The newline itself is handled by the next iteration.
          while (peek () != '\n' && !isAtEnd ())
            advance ();
          break;
        case '(':
          // A plain '(' is a token.  Returning here is essential: breaking
          // without consuming anything would loop on the same '(' forever.
          if (peekNext () != '*')
            return;
          advance (); // consume (
          advance (); // consume *
          // Stop only on the two-character terminator "*)".
          while (!isAtEnd () && !(peek () == '*' && peekNext () == ')'))
            {
              if (peek () == '\n')
                tokenizer.line++;
              advance ();
            }
          // Never step past the NUL when the comment is unterminated.
          if (!isAtEnd ())
            {
              advance (); // consume *
              advance (); // consume )
            }
          break;
        default:
          return;
        }
    }
}

/* Compare the tail of the current lexeme against a fixed keyword suffix.
   Returns `type` when the lexeme is exactly `start + length` characters long
   and its characters from offset `start` equal the first `length` bytes of
   `rest`; otherwise returns TOKEN_IDENTIFIER.  */
static TokenType
checkKeyword (int start, int length, char *rest, TokenType type)
{
  if (tokenizer.current - tokenizer.start != start + length)
    return TOKEN_IDENTIFIER;
  if (memcmp (tokenizer.start + start, rest, length) != 0)
    return TOKEN_IDENTIFIER;
  return type;
}

static TokenType
identifierType ()
{
  char *check;
  int32_t size = tokenizer.current - tokenizer.start;
  check = (char *)malloc (sizeof (size));
  strncpy (check, tokenizer.start, size);
  check[size] = '\0';
  TokenType t = get (check);
  free (check);
  return t;
}

/* Scan the remainder of a word (any run of isAlpha / isDigit characters) and
   return it as a keyword token or TOKEN_IDENTIFIER.  */
static Token
identifier ()
{
  for (;;)
    {
      char next = peek ();
      if (!isAlpha (next) && !isDigit (next))
        break;
      advance ();
    }
  return makeToken (identifierType ());
}

static Token
number ()
{
  bool is_float = false;
  while (isDigit (peek ()))
    advance ();

  // Look for a fractional part.
  if (peek () == '.' && isDigit (peekNext ()))
    {
      is_float = true;
      // Consume the ".".
      advance ();

      while (isDigit (peek ()))
        advance ();
    }

  return makeToken (is_float ? TOKEN_FLOAT
                             : TOKEN_INT); // or measure if ends in postscript
}

static Token
string ()
{
  while (peek () != '"' && !isAtEnd ())
    {
      if (peek () == '\n')
        tokenizer.line++;
      advance ();
    }

  if (isAtEnd ())
    return errorToken ("Unterminated string.");

  // The closing quote.
  advance ();
  return makeToken (TOKEN_STRING);
}

Token
nextToken ()
{
  skipWhitespace ();
  tokenizer.start = tokenizer.current;
  if (isAtEnd ())
    return makeToken (TOKEN_EOF);

  char c = advance ();
  if (isAlpha (c))
    return identifier ();
  if (isDigit (c))
    return number ();
  switch (c)
    {
    case '(':
      return makeToken (TOKEN_LEFT_PAREN);
    case ')':
      return makeToken (TOKEN_RIGHT_PAREN);
    case '{':
      return makeToken (TOKEN_LEFT_BRACE);
    case '}':
      return makeToken (TOKEN_RIGHT_BRACE);
    case '-':
      return makeToken (TOKEN_NEGATIVE);
    case '~':
      return makeToken (TOKEN_TILDE);
    case '/':
      return makeToken (TOKEN_SLASH);
    case '"':
      return string ();
    }

  return errorToken ("Unexpected character.");
}

/* Debug helper: print "<TOKEN_NAME> line_no=<line>" followed by a newline on
   stdout for the given token.  Token types not listed below print
   nothing.  */
void
debug_printToken (Token t)
{
  const char *label = NULL;

// Expands to a case that records the enumerator's own spelling as the label.
#define DEBUG_TOKEN_CASE(tok) case tok: label = #tok; break;

  switch (t.type)
    {
      DEBUG_TOKEN_CASE (TOKEN_LEFT_PAREN)
      DEBUG_TOKEN_CASE (TOKEN_RIGHT_PAREN)
      DEBUG_TOKEN_CASE (TOKEN_LEFT_BRACE)
      DEBUG_TOKEN_CASE (TOKEN_RIGHT_BRACE)
      DEBUG_TOKEN_CASE (TOKEN_TILDE)
      DEBUG_TOKEN_CASE (TOKEN_SLASH)
      DEBUG_TOKEN_CASE (TOKEN_MINUS)
      DEBUG_TOKEN_CASE (TOKEN_IDENTIFIER)
      DEBUG_TOKEN_CASE (TOKEN_STRING)
      DEBUG_TOKEN_CASE (TOKEN_FLOAT)
      DEBUG_TOKEN_CASE (TOKEN_LIST)
      DEBUG_TOKEN_CASE (TOKEN_ERROR)
      DEBUG_TOKEN_CASE (TOKEN_FALSE)
      DEBUG_TOKEN_CASE (TOKEN_TRUE)
      DEBUG_TOKEN_CASE (TOKEN_PI)
      DEBUG_TOKEN_CASE (TOKEN_E)
      DEBUG_TOKEN_CASE (TOKEN_EOF)
      DEBUG_TOKEN_CASE (TOKEN_POP)
      DEBUG_TOKEN_CASE (TOKEN_DUP)
      DEBUG_TOKEN_CASE (TOKEN_EXCH)
      DEBUG_TOKEN_CASE (TOKEN_CLEAR)
      DEBUG_TOKEN_CASE (TOKEN_REMEMBER)
      DEBUG_TOKEN_CASE (TOKEN_FORGET)
      DEBUG_TOKEN_CASE (TOKEN_DUMP)
      DEBUG_TOKEN_CASE (TOKEN_NAME)
      DEBUG_TOKEN_CASE (TOKEN_SET)
      DEBUG_TOKEN_CASE (TOKEN_IFYES)
      DEBUG_TOKEN_CASE (TOKEN_IFNO)
      DEBUG_TOKEN_CASE (TOKEN_CHOOSE)
      DEBUG_TOKEN_CASE (TOKEN_EVAL)
      DEBUG_TOKEN_CASE (TOKEN_ESCAPE)
      DEBUG_TOKEN_CASE (TOKEN_REPEAT)
      DEBUG_TOKEN_CASE (TOKEN_SPLIT)
      DEBUG_TOKEN_CASE (TOKEN_CONS)
      DEBUG_TOKEN_CASE (TOKEN_SHATTER)
      DEBUG_TOKEN_CASE (TOKEN_EMPTY)
      DEBUG_TOKEN_CASE (TOKEN_COMPOSE)
      DEBUG_TOKEN_CASE (TOKEN_STREQ)
      DEBUG_TOKEN_CASE (TOKEN_STRCUT)
      DEBUG_TOKEN_CASE (TOKEN_STRMEASURE)
      DEBUG_TOKEN_CASE (TOKEN_STRTIE)
      DEBUG_TOKEN_CASE (TOKEN_EXPLODE)
      DEBUG_TOKEN_CASE (TOKEN_ADD)
      DEBUG_TOKEN_CASE (TOKEN_SUB)
      DEBUG_TOKEN_CASE (TOKEN_MUL)
      DEBUG_TOKEN_CASE (TOKEN_DIV)
      DEBUG_TOKEN_CASE (TOKEN_IDIV)
      DEBUG_TOKEN_CASE (TOKEN_MOD)
      DEBUG_TOKEN_CASE (TOKEN_POW)
      DEBUG_TOKEN_CASE (TOKEN_SQRT)
      DEBUG_TOKEN_CASE (TOKEN_ADD1)
      DEBUG_TOKEN_CASE (TOKEN_SUB1)
      DEBUG_TOKEN_CASE (TOKEN_SIN)
      DEBUG_TOKEN_CASE (TOKEN_COS)
      DEBUG_TOKEN_CASE (TOKEN_TAN)
      DEBUG_TOKEN_CASE (TOKEN_ATAN)
      DEBUG_TOKEN_CASE (TOKEN_LN)
      DEBUG_TOKEN_CASE (TOKEN_LOG)
      DEBUG_TOKEN_CASE (TOKEN_LOG3)
      DEBUG_TOKEN_CASE (TOKEN_CLIP)
      DEBUG_TOKEN_CASE (TOKEN_SMOOTH)
      DEBUG_TOKEN_CASE (TOKEN_HOWMUCH)
      DEBUG_TOKEN_CASE (TOKEN_SETRAND)
      DEBUG_TOKEN_CASE (TOKEN_RAND)
      DEBUG_TOKEN_CASE (TOKEN_INT)
      DEBUG_TOKEN_CASE (TOKEN_NUMBERIZE)
      DEBUG_TOKEN_CASE (TOKEN_ISOLATE)
      DEBUG_TOKEN_CASE (TOKEN_MIX)
      DEBUG_TOKEN_CASE (TOKEN_CONTRADICT)
      DEBUG_TOKEN_CASE (TOKEN_COMPL)
      DEBUG_TOKEN_CASE (TOKEN_SHIFTRIGHT)
      DEBUG_TOKEN_CASE (TOKEN_SHIFTLEFT)
      DEBUG_TOKEN_CASE (TOKEN_GT)
      DEBUG_TOKEN_CASE (TOKEN_LT)
      DEBUG_TOKEN_CASE (TOKEN_EQ)
      DEBUG_TOKEN_CASE (TOKEN_GE)
      DEBUG_TOKEN_CASE (TOKEN_LE)
      DEBUG_TOKEN_CASE (TOKEN_NE)
      DEBUG_TOKEN_CASE (TOKEN_NULL)
      DEBUG_TOKEN_CASE (TOKEN_NEGATIVE)
      DEBUG_TOKEN_CASE (TOKEN_ISNULL)
      DEBUG_TOKEN_CASE (TOKEN_ISINT)
      DEBUG_TOKEN_CASE (TOKEN_ISNUMBER)
      DEBUG_TOKEN_CASE (TOKEN_AND)
      DEBUG_TOKEN_CASE (TOKEN_OR)
      DEBUG_TOKEN_CASE (TOKEN_XOR)
      DEBUG_TOKEN_CASE (TOKEN_DISP)
      DEBUG_TOKEN_CASE (TOKEN_LISTEN)
      DEBUG_TOKEN_CASE (TOKEN_COMPLAIN)
      DEBUG_TOKEN_CASE (TOKEN_TIME)
      DEBUG_TOKEN_CASE (TOKEN_GARBAGE_COLLECT)
    }

#undef DEBUG_TOKEN_CASE

  if (label != NULL)
    printf ("%s line_no=%d\n", label, t.line);
}