#include "tokenizer.h"
#include "common.h"

/* Adapted from Section 6.6 of The C Programming Language by Brian Kernighan
   and Dennis Ritchie */

/* One entry of the keyword table: a chained hash bucket node that maps a
   keyword spelling to the token type it produces.  */
typedef struct Map Map;
struct Map
{
  struct Map *next; /* next node in the same bucket, or NULL */
  char *keyword;    /* heap copy of the keyword (made with strdup) */
  TokenType token;  /* token type reported for this keyword */
};

#define HASHSIZE 150
static Map *hashtab[HASHSIZE];

/* Hash the LEN bytes starting at S into a bucket index in [0, HASHSIZE).
   Working on an explicit length lets the tokenizer look up a lexeme
   directly in the source buffer without copying it first.  */
static unsigned int
hashSpan (const char *s, size_t len)
{
  unsigned int hashval = 0;

  for (size_t i = 0; i < len; i++)
    hashval = (unsigned int)s[i] + 31 * hashval;
  return hashval % HASHSIZE;
}

/* Find the table entry whose keyword is exactly the LEN bytes at S.
   Returns NULL when there is no such entry.  */
static Map *
lookupSpan (const char *s, size_t len)
{
  for (Map *np = hashtab[hashSpan (s, len)]; np != NULL; np = np->next)
    {
      if (strlen (np->keyword) == len && memcmp (s, np->keyword, len) == 0)
        return np;
    }
  return NULL;
}

/* Insert KEYWORD -> TOKEN, or overwrite the token of an existing entry.
   Returns the entry, or NULL if memory could not be allocated (in which
   case the table is left unchanged).  */
static Map *
insertKeyword (const char *keyword, TokenType token)
{
  size_t len = strlen (keyword);
  Map *np = lookupSpan (keyword, len);

  if (np == NULL)
    {
      np = malloc (sizeof *np);
      if (np == NULL)
        return NULL;

      np->keyword = strdup (keyword);
      if (np->keyword == NULL)
        {
          /* Do not leak the half-built node.  */
          free (np);
          return NULL;
        }

      /* Push onto the front of the bucket's chain.  */
      unsigned int hashval = hashSpan (keyword, len);
      np->next = hashtab[hashval];
      hashtab[hashval] = np;
    }

  np->token = token;
  return np;
}

/* Hash the NUL-terminated string S into a bucket index in [0, HASHSIZE).  */
unsigned int
hash (char *s)
{
  return hashSpan (s, strlen (s));
}

/* Return the table entry for the NUL-terminated keyword S, or NULL.  */
Map *
lookup (char *s)
{
  return lookupSpan (s, strlen (s));
}

/* Return the token type registered for S, or TOKEN_IDENTIFIER when S is
   not a known keyword.  */
TokenType
get (char *s)
{
  Map *np = lookup (s);

  return np != NULL ? np->token : TOKEN_IDENTIFIER;
}

/* Register KEYWORD as producing TOKEN; an existing entry has its token
   replaced.  Returns the entry, or NULL on allocation failure.  */
Map *
put (char *keyword, TokenType token)
{
  return insertKeyword (keyword, token);
}

/* Populate the keyword table.  Entries are inserted in the order listed.
   Insertion failures (out of memory) are ignored, as this function has no
   way to report them.  */
void
initMap (void)
{
  static const struct
  {
    const char *keyword;
    TokenType token;
  } keywords[] = {
    { "and", TOKEN_AND },
    { "atan", TOKEN_ATAN },
    { "add", TOKEN_ADD },
    { "bep", TOKEN_COMPLAIN },
    { "complain", TOKEN_COMPLAIN },
    { "compl", TOKEN_COMPL },
    { "compose", TOKEN_COMPOSE },
    { "contradict", TOKEN_CONTRADICT },
    { "cons", TOKEN_CONS },
    { "cos", TOKEN_COS },
    { "chImmoH", TOKEN_CLEAR },
    { "chIm'a'", TOKEN_EMPTY },
    { "cher", TOKEN_SET },
    { "boq", TOKEN_ADD },
    { "choose", TOKEN_CHOOSE },
    { "chov", TOKEN_EVAL },
    { "chuv", TOKEN_MOD },
    { "cha'", TOKEN_DISP },
    { "clear", TOKEN_CLEAR },
    { "dup", TOKEN_DUP },
    { "dump", TOKEN_DUMP },
    { "disp", TOKEN_DISP },
    { "div", TOKEN_DIV },
    { "DuD", TOKEN_MIX },
    { "e", TOKEN_E },
    { "exch", TOKEN_EXCH },
    { "eval", TOKEN_EVAL },
    { "escape", TOKEN_ESCAPE },
    { "empty?", TOKEN_EMPTY },
    { "explode", TOKEN_EXPLODE },
    { "eq?", TOKEN_EQ },
    { "forget", TOKEN_FORGET },
    { "gt?", TOKEN_GT },
    { "ge?", TOKEN_GE },
    { "ghap", TOKEN_XOR },
    { "ghurmI'", TOKEN_E },
    { "ghurtaH", TOKEN_LN },
    { "ghorqu'", TOKEN_SHATTER },
    { "ghobe'chugh", TOKEN_IFNO },
    { "HIja'chugh", TOKEN_IFYES },
    { "Hotlh", TOKEN_DUMP },
    { "HeHmI'", TOKEN_PI },
    { "Habwav", TOKEN_IDIV },
    { "HabmI''a'", TOKEN_INT },
    { "idiv", TOKEN_IDIV },
    { "int?", TOKEN_INT },
    { "isolate", TOKEN_ISOLATE },
    { "ifyes", TOKEN_IFYES },
    { "ifno", TOKEN_IFNO },
    { "je", TOKEN_AND },
    { "jor", TOKEN_EXPLODE },
    { "joq", TOKEN_OR },
    { "ln", TOKEN_LN },
    { "lt?", TOKEN_LT },
    { "le?", TOKEN_LE },
    { "listen", TOKEN_LISTEN },
    { "loS'ar", TOKEN_SQRT },
    { "log", TOKEN_LOG },
    { "log3", TOKEN_LOG3 },
    { "latlh", TOKEN_DUP },
    { "law'moH", TOKEN_MUL },
    { "law'qa'moH", TOKEN_POW },
    { "law''a'", TOKEN_GT },
    { "law'rap'a'", TOKEN_GE },
    { "maHghurtaH", TOKEN_LOG },
    { "mix", TOKEN_MIX },
    { "mi'moH", TOKEN_NUMBERIZE },
    { "muv", TOKEN_CONS },
    { "mul", TOKEN_MUL },
    { "mod", TOKEN_MOD },
    { "mobmoH", TOKEN_ISOLATE },
    { "mIScher", TOKEN_SETRAND },
    { "mIS", TOKEN_RAND },
    { "mI''a'", TOKEN_FLOAT },
    { "nIHghoS", TOKEN_SHIFTRIGHT },
    { "ne?", TOKEN_NE },
    { "negative?", TOKEN_NEGATIVE },
    { "name", TOKEN_NAME },
    { "nargh", TOKEN_ESCAPE },
    { "naQmoH", TOKEN_COMPOSE },
    { "number?", TOKEN_ISNUMBER },
    { "numberize", TOKEN_NUMBERIZE },
    { "null?", TOKEN_NULL },
    { "or", TOKEN_OR },
    { "pi", TOKEN_PI },
    { "pagh'a'", TOKEN_NULL },
    { "pop", TOKEN_POP },
    { "pong", TOKEN_NAME },
    { "pow", TOKEN_POW },
    { "poSghoS", TOKEN_SHIFTLEFT },
    { "puS'a'", TOKEN_LT },
    { "puSrap'a'", TOKEN_LE },
    { "qaw", TOKEN_REMEMBER },
    { "qawHa'", TOKEN_FORGET },
    { "qojmI'", TOKEN_TAN },
    { "qojHa'", TOKEN_ATAN },
    { "Qo'moH", TOKEN_COMPL },
    { "remember", TOKEN_REMEMBER },
    { "repeat", TOKEN_REPEAT },
    { "rand", TOKEN_RAND },
    { "rap'a'", TOKEN_EQ },
    { "rapbe'a'", TOKEN_NE },
    { "set", TOKEN_SET },
    { "split", TOKEN_SPLIT },
    { "shatter", TOKEN_SHATTER },
    { "strcut", TOKEN_STRCUT },
    { "streq?", TOKEN_STREQ },
    { "strmeasure", TOKEN_STRMEASURE },
    { "strtie", TOKEN_STRTIE },
    { "tlheghrar", TOKEN_STRTIE },
    { "sub", TOKEN_SUB },
    { "sub1", TOKEN_SUB1 },
    { "sqrt", TOKEN_SQRT },
    { "sin", TOKEN_SIN },
    { "clip", TOKEN_CLIP },
    { "poD", TOKEN_CLIP },
    { "smooth", TOKEN_SMOOTH },
    { "Hab", TOKEN_SMOOTH },
    { "howmuch", TOKEN_HOWMUCH },
    { "'ar", TOKEN_HOWMUCH },
    { "setrand", TOKEN_SETRAND },
    /* NOTE(review): these two spellings contain a space, which
       identifier() never consumes, so the scanner in this file cannot
       produce them as a single lexeme.  */
    { "shift right", TOKEN_SHIFTRIGHT },
    { "shift left", TOKEN_SHIFTLEFT },
    { "SIj", TOKEN_SPLIT },
    { "boqHa'", TOKEN_SUB },
    { "tam", TOKEN_EXCH },
    { "tan", TOKEN_TAN },
    { "taH'a'", TOKEN_NEGATIVE },
    { "tlhoch", TOKEN_CONTRADICT },
    { "tlheghpe'", TOKEN_STRCUT },
    { "tlheghjuv", TOKEN_STRMEASURE },
    { "tlheghrap'a'", TOKEN_STREQ },
    { "vangqa'", TOKEN_REPEAT },
    { "wIv", TOKEN_CHOOSE },
    { "woD", TOKEN_POP },
    { "wav", TOKEN_DIV },
    { "wa'teq", TOKEN_SUB1 },
    { "wa'chel", TOKEN_ADD1 },
    { "wejghurtaH", TOKEN_LOG3 },
    { "xor", TOKEN_XOR },
    { "\'Ij", TOKEN_LISTEN },
    { "time", TOKEN_TIME },
    { "poH", TOKEN_TIME },
    /* Wrong word in original spec, old one meant "waving hands or
       flapping".  Also fixes the conflicting joq issue meaning sin or
       'or'.  */
    { "yu'eghHa'", TOKEN_COS },
    { "yu'egh", TOKEN_SIN },
    /* This one has a special case too as it is the same as the '~'
       operator.  */
    { "lI'moH", TOKEN_TILDE },
    { "woDHa'", TOKEN_GARBAGE_COLLECT },
    { "gc", TOKEN_GARBAGE_COLLECT },
  };

  for (size_t i = 0; i < sizeof keywords / sizeof keywords[0]; i++)
    insertKeyword (keywords[i].keyword, keywords[i].token);
}

/* Scanner state over a NUL-terminated source buffer.  */
typedef struct Tokenizer Tokenizer;
struct Tokenizer
{
  char *start;   /* first character of the token being scanned */
  char *current; /* next character to be consumed */
  int32_t line;  /* current line number, starting at 1 */
};

Tokenizer tokenizer;

/* Reset the scanner to the beginning of SRC.  SRC must stay valid while
   tokens are being produced, because tokens point into it.  */
void
initTokenizer (char *src)
{
  tokenizer.start = src;
  tokenizer.current = src;
  tokenizer.line = 1;
}

/* True for characters that may appear in an identifier or keyword:
   ASCII letters, '_', '\'' and '?'.  */
static bool
isAlpha (char c)
{
  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'
         || c == '\'' || c == '?';
}

/* True for ASCII decimal digits and for '-'.  */
static bool
isDigit (char c)
{
  return (c >= '0' && c <= '9') || c == '-';
}

/* True when the scanner is positioned on the terminating NUL.  */
static bool
isAtEnd (void)
{
  return *tokenizer.current == '\0';
}

/* Build a token of TYPE covering [tokenizer.start, tokenizer.current).  */
static Token
makeToken (TokenType type)
{
  Token token;
  token.type = type;
  token.start = tokenizer.start;
  token.length = (int32_t)(tokenizer.current - tokenizer.start);
  token.line = tokenizer.line;
  return token;
}

/* Build a TOKEN_ERROR token whose text is MSG rather than source text.  */
static Token
errorToken (char *msg)
{
  Token token;
  token.type = TOKEN_ERROR;
  token.start = msg;
  token.length = (int32_t)strlen (msg);
  token.line = tokenizer.line;
  return token;
}

/* Consume one character and return it.  */
static char
advance (void)
{
  tokenizer.current++;
  return tokenizer.current[-1];
}

/* Return the next character without consuming it.  */
static char
peek (void)
{
  return *tokenizer.current;
}

/* Return the character after the next one, or '\0' at end of input.  */
static char
peekNext (void)
{
  if (isAtEnd ())
    return '\0';
  return tokenizer.current[1];
}

/* Consume the next character only if it equals EXPECTED.
   (Not called anywhere in this file.)  */
static bool
match (char expected)
{
  if (isAtEnd ())
    return false;
  if (*tokenizer.current != expected)
    return false;
  tokenizer.current++;
  return true;
}

/* Skip blanks, newlines, "//" line comments and "(* ... *)" block
   comments, keeping tokenizer.line up to date.  Stops at the first
   character that starts a real token.  */
static void
skipWhitespace (void)
{
  for (;;)
    {
      char c = peek ();
      switch (c)
        {
        case ' ':
        case '\r':
        case '\t':
          advance ();
          break;
        case '\n':
          tokenizer.line++;
          advance ();
          break;
        case '/':
          if (peekNext () != '/')
            return; /* a lone '/' is a token */
          /* Line comment: skip up to (not including) the newline, which
             the '\n' case then counts on the next iteration.  */
          while (peek () != '\n' && !isAtEnd ())
            advance ();
          break;
        case '(':
          if (peekNext () != '*')
            return; /* a plain '(' is a token; nothing to skip */
          advance (); /* consume ( */
          advance (); /* consume * */
          /* Consume contents up to the closing "*)" pair.  */
          while (!isAtEnd () && !(peek () == '*' && peekNext () == ')'))
            {
              if (peek () == '\n')
                tokenizer.line++;
              advance ();
            }
          /* An unterminated comment simply runs to end of input; never
             step past the terminating NUL.  */
          if (!isAtEnd ())
            {
              advance (); /* consume * */
              advance (); /* consume ) */
            }
          break;
        default:
          return;
        }
    }
}

/* Return TYPE if the current lexeme has length START + LENGTH and its
   tail beginning at offset START equals REST; otherwise TOKEN_IDENTIFIER.
   (Not called anywhere in this file.)  */
static TokenType
checkKeyword (int start, int length, char *rest, TokenType type)
{
  if (tokenizer.current - tokenizer.start == start + length
      && memcmp (tokenizer.start + start, rest, length) == 0)
    {
      return type;
    }

  return TOKEN_IDENTIFIER;
}

/* Classify the current lexeme: the keyword's token type if it is in the
   keyword table, otherwise TOKEN_IDENTIFIER.  The lookup is done in place
   on the source buffer, so no temporary copy is allocated.  */
static TokenType
identifierType (void)
{
  size_t len = (size_t)(tokenizer.current - tokenizer.start);
  Map *np = lookupSpan (tokenizer.start, len);

  return np != NULL ? np->token : TOKEN_IDENTIFIER;
}

/* Scan the rest of an identifier or keyword.  Note that isDigit() also
   accepts '-', so '-' may appear inside an identifier.  */
static Token
identifier (void)
{
  while (isAlpha (peek ()) || isDigit (peek ()))
    advance ();

  return makeToken (identifierType ());
}

/* Scan the rest of a numeric literal: digits, optionally followed by '.'
   and more digits.  Produces TOKEN_FLOAT when a fractional part is
   present and TOKEN_INT otherwise.  */
static Token
number (void)
{
  bool is_float = false;
  while (isDigit (peek ()))
    advance ();

  // Look for a fractional part.
  if (peek () == '.' && isDigit (peekNext ()))
    {
      is_float = true;
      // Consume the ".".
      advance ();

      while (isDigit (peek ()))
        advance ();
    }

  return makeToken (is_float ? TOKEN_FLOAT : TOKEN_INT);
  // or measure if ends in postscript
}

/* Scan a string literal; the opening quote has already been consumed.
   The resulting token includes both quotes.  Returns an error token if
   the input ends before the closing quote.  */
static Token
string (void)
{
  while (peek () != '"' && !isAtEnd ())
    {
      if (peek () == '\n')
        tokenizer.line++;
      advance ();
    }

  if (isAtEnd ())
    return errorToken ("Unterminated string.");

  // The closing quote.
  advance ();
  return makeToken (TOKEN_STRING);
}

/* Return the next token from the source passed to initTokenizer().
   Returns TOKEN_EOF at end of input and TOKEN_ERROR for a character that
   cannot start any token.  */
Token
nextToken (void)
{
  skipWhitespace ();
  tokenizer.start = tokenizer.current;

  if (isAtEnd ())
    return makeToken (TOKEN_EOF);

  char c = advance ();
  if (isAlpha (c))
    return identifier ();
  if (isDigit (c))
    return number ();

  switch (c)
    {
    case '(':
      return makeToken (TOKEN_LEFT_PAREN);
    case ')':
      return makeToken (TOKEN_RIGHT_PAREN);
    case '{':
      return makeToken (TOKEN_LEFT_BRACE);
    case '}':
      return makeToken (TOKEN_RIGHT_BRACE);
    case '-':
      /* NOTE(review): isDigit() accepts '-', so a '-' is routed to
         number() above and this case is not reached as written.  It also
         yields TOKEN_NEGATIVE although TOKEN_MINUS exists -- confirm
         which is intended.  */
      return makeToken (TOKEN_NEGATIVE);
    case '~':
      return makeToken (TOKEN_TILDE);
    case '/':
      return makeToken (TOKEN_SLASH);
    case '"':
      return string ();
    }

  return errorToken ("Unexpected character.");
}

/* Return the enumerator name of TYPE as a string, or NULL for a value
   that is not listed here.  */
static const char *
tokenTypeName (TokenType type)
{
#define TOKEN_NAME_CASE(tok)                                                  \
  case tok:                                                                   \
    return #tok

  switch (type)
    {
      TOKEN_NAME_CASE (TOKEN_LEFT_PAREN);
      TOKEN_NAME_CASE (TOKEN_RIGHT_PAREN);
      TOKEN_NAME_CASE (TOKEN_LEFT_BRACE);
      TOKEN_NAME_CASE (TOKEN_RIGHT_BRACE);
      TOKEN_NAME_CASE (TOKEN_TILDE);
      TOKEN_NAME_CASE (TOKEN_SLASH);
      TOKEN_NAME_CASE (TOKEN_MINUS);
      TOKEN_NAME_CASE (TOKEN_IDENTIFIER);
      TOKEN_NAME_CASE (TOKEN_STRING);
      TOKEN_NAME_CASE (TOKEN_FLOAT);
      TOKEN_NAME_CASE (TOKEN_LIST);
      TOKEN_NAME_CASE (TOKEN_ERROR);
      TOKEN_NAME_CASE (TOKEN_FALSE);
      TOKEN_NAME_CASE (TOKEN_TRUE);
      TOKEN_NAME_CASE (TOKEN_PI);
      TOKEN_NAME_CASE (TOKEN_E);
      TOKEN_NAME_CASE (TOKEN_EOF);
      TOKEN_NAME_CASE (TOKEN_POP);
      TOKEN_NAME_CASE (TOKEN_DUP);
      TOKEN_NAME_CASE (TOKEN_EXCH);
      TOKEN_NAME_CASE (TOKEN_CLEAR);
      TOKEN_NAME_CASE (TOKEN_REMEMBER);
      TOKEN_NAME_CASE (TOKEN_FORGET);
      TOKEN_NAME_CASE (TOKEN_DUMP);
      TOKEN_NAME_CASE (TOKEN_NAME);
      TOKEN_NAME_CASE (TOKEN_SET);
      TOKEN_NAME_CASE (TOKEN_IFYES);
      TOKEN_NAME_CASE (TOKEN_IFNO);
      TOKEN_NAME_CASE (TOKEN_CHOOSE);
      TOKEN_NAME_CASE (TOKEN_EVAL);
      TOKEN_NAME_CASE (TOKEN_ESCAPE);
      TOKEN_NAME_CASE (TOKEN_REPEAT);
      TOKEN_NAME_CASE (TOKEN_SPLIT);
      TOKEN_NAME_CASE (TOKEN_CONS);
      TOKEN_NAME_CASE (TOKEN_SHATTER);
      TOKEN_NAME_CASE (TOKEN_EMPTY);
      TOKEN_NAME_CASE (TOKEN_COMPOSE);
      TOKEN_NAME_CASE (TOKEN_STREQ);
      TOKEN_NAME_CASE (TOKEN_STRCUT);
      TOKEN_NAME_CASE (TOKEN_STRMEASURE);
      TOKEN_NAME_CASE (TOKEN_STRTIE);
      TOKEN_NAME_CASE (TOKEN_EXPLODE);
      TOKEN_NAME_CASE (TOKEN_ADD);
      TOKEN_NAME_CASE (TOKEN_SUB);
      TOKEN_NAME_CASE (TOKEN_MUL);
      TOKEN_NAME_CASE (TOKEN_DIV);
      TOKEN_NAME_CASE (TOKEN_IDIV);
      TOKEN_NAME_CASE (TOKEN_MOD);
      TOKEN_NAME_CASE (TOKEN_POW);
      TOKEN_NAME_CASE (TOKEN_SQRT);
      TOKEN_NAME_CASE (TOKEN_ADD1);
      TOKEN_NAME_CASE (TOKEN_SUB1);
      TOKEN_NAME_CASE (TOKEN_SIN);
      TOKEN_NAME_CASE (TOKEN_COS);
      TOKEN_NAME_CASE (TOKEN_TAN);
      TOKEN_NAME_CASE (TOKEN_ATAN);
      TOKEN_NAME_CASE (TOKEN_LN);
      TOKEN_NAME_CASE (TOKEN_LOG);
      TOKEN_NAME_CASE (TOKEN_LOG3);
      TOKEN_NAME_CASE (TOKEN_CLIP);
      TOKEN_NAME_CASE (TOKEN_SMOOTH);
      TOKEN_NAME_CASE (TOKEN_HOWMUCH);
      TOKEN_NAME_CASE (TOKEN_SETRAND);
      TOKEN_NAME_CASE (TOKEN_RAND);
      TOKEN_NAME_CASE (TOKEN_INT);
      TOKEN_NAME_CASE (TOKEN_NUMBERIZE);
      TOKEN_NAME_CASE (TOKEN_ISOLATE);
      TOKEN_NAME_CASE (TOKEN_MIX);
      TOKEN_NAME_CASE (TOKEN_CONTRADICT);
      TOKEN_NAME_CASE (TOKEN_COMPL);
      TOKEN_NAME_CASE (TOKEN_SHIFTRIGHT);
      TOKEN_NAME_CASE (TOKEN_SHIFTLEFT);
      TOKEN_NAME_CASE (TOKEN_GT);
      TOKEN_NAME_CASE (TOKEN_LT);
      TOKEN_NAME_CASE (TOKEN_EQ);
      TOKEN_NAME_CASE (TOKEN_GE);
      TOKEN_NAME_CASE (TOKEN_LE);
      TOKEN_NAME_CASE (TOKEN_NE);
      TOKEN_NAME_CASE (TOKEN_NULL);
      TOKEN_NAME_CASE (TOKEN_NEGATIVE);
      TOKEN_NAME_CASE (TOKEN_ISNULL);
      TOKEN_NAME_CASE (TOKEN_ISINT);
      TOKEN_NAME_CASE (TOKEN_ISNUMBER);
      TOKEN_NAME_CASE (TOKEN_AND);
      TOKEN_NAME_CASE (TOKEN_OR);
      TOKEN_NAME_CASE (TOKEN_XOR);
      TOKEN_NAME_CASE (TOKEN_DISP);
      TOKEN_NAME_CASE (TOKEN_LISTEN);
      TOKEN_NAME_CASE (TOKEN_COMPLAIN);
      TOKEN_NAME_CASE (TOKEN_TIME);
      TOKEN_NAME_CASE (TOKEN_GARBAGE_COLLECT);
    default:
      return NULL;
    }

#undef TOKEN_NAME_CASE
}

/* Print the type name and line number of T to stdout, one line per
   token.  Token types without a known name are printed as
   TOKEN_UNKNOWN(<numeric value>) instead of being silently dropped.  */
void
debug_printToken (Token t)
{
  const char *name = tokenTypeName (t.type);

  if (name != NULL)
    printf ("%s line_no=%d\n", name, (int)t.line);
  else
    printf ("TOKEN_UNKNOWN(%d) line_no=%d\n", (int)t.type, (int)t.line);
}