#include "tokenizer.h"
#include "common.h"

/*
 * Keyword table.
 *
 * A fixed-size, separately-chained hash map from keyword spelling to
 * TokenType. Adapted from Section 6.6 of The C Programming Language by
 * Brian Kernighan and Dennis Ritchie.
 */

typedef struct Map Map;

/* One entry in a hash bucket's singly linked chain. */
struct Map {
    struct Map* next; /* next entry in the same bucket, or NULL */
    char* keyword;    /* heap-allocated copy of the keyword spelling */
    TokenType token;  /* token type the keyword maps to */
};

#define HASHSIZE 150

/* Bucket heads; zero-initialised (all NULL) because of static storage. */
static Map* hashtab[HASHSIZE];

/*
 * Hash a NUL-terminated string into a bucket index in [0, HASHSIZE).
 */
unsigned int hash(char* s) {
    unsigned int hashval;

    for (hashval = 0; *s != '\0'; s++)
        hashval = *s + 31 * hashval;
    return hashval % HASHSIZE;
}

/*
 * Find the entry whose keyword equals s.
 * Returns the entry, or NULL when s is not in the table.
 */
Map* lookup(char* s) {
    Map* np;

    for (np = hashtab[hash(s)]; np != NULL; np = np->next)
        if (strcmp(s, np->keyword) == 0)
            return np;
    return NULL;
}

/*
 * Map a spelling to its token type.
 * Returns the registered token type, or TOKEN_IDENTIFIER when s is not a
 * keyword.
 */
TokenType get(char* s) {
    Map* np = lookup(s);

    return np != NULL ? np->token : TOKEN_IDENTIFIER;
}

/*
 * Register keyword -> token, overwriting the token of an existing entry.
 * The keyword string is copied; the caller keeps ownership of its argument.
 * Returns the entry, or NULL when memory could not be allocated (in which
 * case the table is left unchanged).
 */
Map* put(char* keyword, TokenType token) {
    Map* np = lookup(keyword);

    if (np == NULL) {
        unsigned int hashval;

        np = (Map*)malloc(sizeof(*np));
        if (np == NULL)
            return NULL;

        np->keyword = strdup(keyword);
        if (np->keyword == NULL) {
            /* Do not leak the node when only the key copy failed. */
            free(np);
            return NULL;
        }

        /* Push the new entry on the front of its bucket chain. */
        hashval = hash(keyword);
        np->next = hashtab[hashval];
        hashtab[hashval] = np;
    }
    np->token = token;
    return np;
}

/*
 * Populate the keyword table with every built-in word. Several spellings
 * may map to the same token type. Return values of put() are not checked.
 *
 * NOTE(review): TOKEN_ADD1 is only reachable through "wa'chel"; there is no
 * "add1" entry to mirror "sub1" -- confirm whether that is intentional.
 */
void initMap() {
    put("and", TOKEN_AND);
    put("atan", TOKEN_ATAN);
    put("add", TOKEN_ADD);
    put("bep", TOKEN_COMPLAIN);
    put("complain", TOKEN_COMPLAIN);
    put("compl", TOKEN_COMPL);
    put("compose", TOKEN_COMPOSE);
    put("contradict", TOKEN_CONTRADICT);
    put("cons", TOKEN_CONS);
    put("cos", TOKEN_COS);
    put("chImmoH", TOKEN_CLEAR);
    put("chIm'a'", TOKEN_EMPTY);
    put("cher", TOKEN_SET);
    put("boq", TOKEN_ADD);
    put("choose", TOKEN_CHOOSE);
    put("chov", TOKEN_EVAL);
    put("chuv", TOKEN_MOD);
    put("cha'", TOKEN_DISP);
    put("clear", TOKEN_CLEAR);
    put("dup", TOKEN_DUP);
    put("dump", TOKEN_DUMP);
    put("disp", TOKEN_DISP);
    put("div", TOKEN_DIV);
    put("DuD", TOKEN_MIX);
    put("e", TOKEN_E);
    put("exch", TOKEN_EXCH);
    put("eval", TOKEN_EVAL);
    put("escape", TOKEN_ESCAPE);
    put("empty?", TOKEN_EMPTY);
    put("explode", TOKEN_EXPLODE);
    put("eq?", TOKEN_EQ);
    put("forget", TOKEN_FORGET);
    put("gt?", TOKEN_GT);
    put("ge?", TOKEN_GE);
    put("ghap", TOKEN_XOR);
    put("ghurmI'", TOKEN_E);
    put("ghurtaH", TOKEN_LN);
    put("ghorqu'", TOKEN_SHATTER);
    put("ghobe'chugh", TOKEN_IFNO);
    put("HIja'chugh", TOKEN_IFYES);
    put("Hotlh", TOKEN_DUMP);
    put("HeHmI'", TOKEN_PI);
    put("Habwav", TOKEN_IDIV);
    put("HabmI''a'", TOKEN_INT);
    put("idiv", TOKEN_IDIV);
    put("int?", TOKEN_INT);
    put("isolate", TOKEN_ISOLATE);
    put("ifyes", TOKEN_IFYES);
    put("ifno", TOKEN_IFNO);
    put("je", TOKEN_AND);
    put("jor", TOKEN_EXPLODE);
    put("joq", TOKEN_OR);
    put("ln", TOKEN_LN);
    put("lt?", TOKEN_LT);
    put("le?", TOKEN_LE);
    put("listen", TOKEN_LISTEN);
    put("loS'ar", TOKEN_SQRT);
    put("log", TOKEN_LOG);
    put("log3", TOKEN_LOG3);
    put("latlh", TOKEN_DUP);
    put("law'moH", TOKEN_MUL);
    put("law'qa'moH", TOKEN_POW);
    put("law''a'", TOKEN_GT);
    put("law'rap'a'", TOKEN_GE);
    put("maHghurtaH", TOKEN_LOG);
    put("mix", TOKEN_MIX);
    put("mi'moH", TOKEN_NUMBERIZE);
    put("muv", TOKEN_CONS);
    put("mul", TOKEN_MUL);
    put("mod", TOKEN_MOD);
    put("mobmoH", TOKEN_ISOLATE);
    put("mIScher", TOKEN_SETRAND);
    put("mIS", TOKEN_RAND);
    put("mI''a'", TOKEN_FLOAT);
    put("nIHghoS", TOKEN_SHIFTRIGHT);
    put("ne?", TOKEN_NE);
    put("negative?", TOKEN_NEGATIVE);
    put("name", TOKEN_NAME);
    put("nargh", TOKEN_ESCAPE);
    put("naQmoH", TOKEN_COMPOSE);
    put("number?", TOKEN_ISNUMBER);
    put("numberize", TOKEN_NUMBERIZE);
    put("null?", TOKEN_NULL);
    put("or", TOKEN_OR);
    put("pi", TOKEN_PI);
    put("pagh'a'", TOKEN_NULL);
    put("pop", TOKEN_POP);
    put("pong", TOKEN_NAME);
    put("pow", TOKEN_POW);
    put("poSghoS", TOKEN_SHIFTLEFT);
    put("puS'a'", TOKEN_LT);
    put("puSrap'a'", TOKEN_LE);
    put("qaw", TOKEN_REMEMBER);
    put("qawHa'", TOKEN_FORGET);
    put("qojmI'", TOKEN_TAN);
    put("qojHa'", TOKEN_ATAN);
    put("Qo'moH", TOKEN_COMPL);
    put("remember", TOKEN_REMEMBER);
    put("repeat", TOKEN_REPEAT);
    put("rand", TOKEN_RAND);
    put("rap'a'", TOKEN_EQ);
    put("rapbe'a'", TOKEN_NE);
    put("set", TOKEN_SET);
    put("split", TOKEN_SPLIT);
    put("shatter", TOKEN_SHATTER);
    put("strcut", TOKEN_STRCUT);
    put("streq?", TOKEN_STREQ);
    put("strmeasure", TOKEN_STRMEASURE);
    put("strtie", TOKEN_STRTIE);
    put("tlheghrar", TOKEN_STRTIE);
    put("sub", TOKEN_SUB);
    put("sub1", TOKEN_SUB1);
    put("sqrt", TOKEN_SQRT);
    put("sin", TOKEN_SIN);
    put("clip", TOKEN_CLIP);
    put("poD", TOKEN_CLIP);
    put("smooth", TOKEN_SMOOTH);
    put("Hab", TOKEN_SMOOTH);
    put("howmuch", TOKEN_HOWMUCH);
    put("'ar", TOKEN_HOWMUCH);
    put("setrand", TOKEN_SETRAND);
    // NOTE(review): these two keys contain a space, which identifier() never
    // consumes, so the tokenizer in this file cannot produce a lexeme that
    // matches them -- confirm whether another lookup path relies on them.
    put("shift right", TOKEN_SHIFTRIGHT);
    put("shift left", TOKEN_SHIFTLEFT);
    put("SIj", TOKEN_SPLIT);
    put("boqHa'", TOKEN_SUB);
    put("tam", TOKEN_EXCH);
    put("tan", TOKEN_TAN);
    put("taH'a'", TOKEN_NEGATIVE);
    put("tlhoch", TOKEN_CONTRADICT);
    put("tlheghpe'", TOKEN_STRCUT);
    put("tlheghjuv", TOKEN_STRMEASURE);
    put("tlheghrap'a'", TOKEN_STREQ);
    put("vangqa'", TOKEN_REPEAT);
    put("wIv", TOKEN_CHOOSE);
    put("woD", TOKEN_POP);
    put("wav", TOKEN_DIV);
    put("wa'teq", TOKEN_SUB1);
    put("wa'chel", TOKEN_ADD1);
    put("wejghurtaH", TOKEN_LOG3);
    put("xor", TOKEN_XOR);
    put("\'Ij", TOKEN_LISTEN);
    put("time", TOKEN_TIME);
    put("poH", TOKEN_TIME);
    // The word in the original spec was wrong: it meant "waving hands or
    // flapping". Using these spellings also resolves the conflict where
    // "joq" meant both sin and 'or'.
    put("yu'eghHa'", TOKEN_COS);
    put("yu'egh", TOKEN_SIN);
    // This one is a special case too, as it is the same as the '~' operator.
    put("lI'moH", TOKEN_TILDE);
    put("woDHa'", TOKEN_GARBAGE_COLLECT);
    put("gc", TOKEN_GARBAGE_COLLECT);
}

/*
 * Tokenizer state.
 */

typedef struct Tokenizer Tokenizer;

struct Tokenizer {
    char* start;   /* first character of the lexeme being scanned */
    char* current; /* next character to be consumed */
    int32_t line;  /* 1-based line number of `current` */
};

Tokenizer tokenizer;

/*
 * Point the tokenizer at the start of src and reset the line counter to 1.
 * src must be NUL-terminated (the scanner stops at '\0'); the tokenizer
 * keeps pointers into it and does not copy it.
 */
void initTokenizer(char* src) {
    tokenizer.start = src;
    tokenizer.current = src;
    tokenizer.line = 1;
}

/*
 * True for characters that may appear in a word: ASCII letters, '_', and
 * also '\'' and '?', which occur in the keyword spellings.
 */
static bool isAlpha(char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' ||
           c == '\'' || c == '?';
}

/*
 * True for '0'..'9' and for '-'. Because '-' is accepted, it is consumed as
 * part of numbers and of identifiers.
 */
static bool isDigit(char c) {
    return (c >= '0' && c <= '9') || c == '-';
}

/* True when the next character to consume is the terminating NUL. */
static bool isAtEnd(void) {
    return *tokenizer.current == '\0';
}

/*
 * Build a token of the given type covering [tokenizer.start,
 * tokenizer.current) on the current line. The token points into the source
 * buffer; nothing is copied.
 */
static Token makeToken(TokenType type) {
    Token token;

    token.type = type;
    token.start = tokenizer.start;
    token.length = (int32_t)(tokenizer.current - tokenizer.start);
    token.line = tokenizer.line;
    return token;
}

/*
 * Build a TOKEN_ERROR token whose lexeme is msg itself rather than source
 * text. msg is not copied, so it must outlive the token.
 */
static Token errorToken(char* msg) {
    Token token;

    token.type = TOKEN_ERROR;
    token.start = msg;
    token.length = (int32_t)strlen(msg);
    token.line = tokenizer.line;
    return token;
}

/* Consume one character and return it. */
static char advance(void) {
    tokenizer.current++;
    return tokenizer.current[-1];
}

/* Return the next character without consuming it. */
static char peek(void) {
    return *tokenizer.current;
}

/*
 * Return the character after the next one without consuming anything, or
 * '\0' when already at the end of input.
 */
static char peekNext(void) {
    if (isAtEnd())
        return '\0';
    return tokenizer.current[1];
}

/*
 * Consume the next character only if it equals expected.
 * Returns true when a character was consumed.
 */
static bool match(char expected) {
    if (isAtEnd())
        return false;
    if (*tokenizer.current != expected)
        return false;
    tokenizer.current++;
    return true;
}

/*
 * Skip blanks, newlines, `//` line comments and `(* ... *)` block comments,
 * keeping tokenizer.line in step with every newline consumed. Returns with
 * tokenizer.current on the first character of the next token (or at the
 * terminating NUL).
 */
static void skipWhitespace(void) {
    for (;;) {
        char c = peek();

        switch (c) {
        case ' ':
        case '\r':
        case '\t':
            advance();
            break;
        case '\n':
            tokenizer.line++;
            advance();
            break;
        case '/':
            if (peekNext() != '/')
                return; /* a lone '/' is a token, not a comment */
            /* Line comment: skip up to, but not including, the newline. */
            while (peek() != '\n' && !isAtEnd())
                advance();
            break;
        case '(':
            /* A '(' that does not open a comment is a real token. Returning
             * here (rather than looping) is essential: nothing was consumed,
             * so another iteration would see the same '(' forever. */
            if (peekNext() != '*')
                return;
            advance(); /* consume ( */
            advance(); /* consume * */
            /* The comment ends only at the two-character sequence "*)". */
            while (!isAtEnd() && !(peek() == '*' && peekNext() == ')')) {
                if (peek() == '\n')
                    tokenizer.line++;
                advance();
            }
            /* An unterminated comment simply runs to end of input; only
             * consume the closer when it is actually there. */
            if (!isAtEnd()) {
                advance(); /* consume * */
                advance(); /* consume ) */
            }
            break;
        default:
            return;
        }
    }
}

/*
 * Return type when the current lexeme is exactly start + length characters
 * long and its tail, beginning at offset start, equals rest; otherwise
 * return TOKEN_IDENTIFIER.
 *
 * Not called anywhere in this file; keyword classification goes through the
 * hash table (see identifierType).
 */
static TokenType checkKeyword(int start, int length, char* rest,
                              TokenType type) {
    if (tokenizer.current - tokenizer.start == start + length &&
        memcmp(tokenizer.start + start, rest, length) == 0) {
        return type;
    }
    return TOKEN_IDENTIFIER;
}

/*
 * Copy the current lexeme into a freshly allocated NUL-terminated string.
 * The caller owns the result and must free() it.
 * Returns NULL when allocation fails.
 */
static char* currentTokenToS(void) {
    size_t size = (size_t)(tokenizer.current - tokenizer.start);
    /* One extra byte for the terminator. */
    char* str = (char*)malloc(size + 1);

    if (str == NULL)
        return NULL;
    memcpy(str, tokenizer.start, size);
    str[size] = '\0';
    return str;
}

/*
 * Classify the current lexeme: the keyword's token type if it is in the
 * keyword table, TOKEN_IDENTIFIER otherwise.
 */
static TokenType identifierType(void) {
    char* check = currentTokenToS();
    TokenType t;

    /* Without a copy of the lexeme no lookup is possible; treat the word as
     * a plain identifier instead of dereferencing NULL. */
    if (check == NULL)
        return TOKEN_IDENTIFIER;

    t = get(check);
    free(check);
    return t;
}

/*
 * Scan the rest of a word whose first character has already been consumed
 * and return it as a keyword or identifier token.
 */
static Token identifier(void) {
    while (isAlpha(peek()) || isDigit(peek()))
        advance();
    return makeToken(identifierType());
}

/*
 * Scan the rest of a numeric literal whose first character has already been
 * consumed: a run of isDigit() characters, optionally followed by '.' and
 * another such run. The result is always typed TOKEN_FLOAT.
 */
static Token number(void) {
    while (isDigit(peek()))
        advance();

    // Look for a fractional part.
    if (peek() == '.' && isDigit(peekNext())) {
        // Consume the ".".
        advance();

        while (isDigit(peek()))
            advance();
    }

    return makeToken(TOKEN_FLOAT); // or measure if ends in postscript
}

/*
 * Scan a string literal whose opening quote has already been consumed.
 * Newlines are allowed inside the literal and are counted. The returned
 * TOKEN_STRING lexeme includes both quotes. Reaching end of input before
 * the closing quote yields an error token.
 */
static Token string(void) {
    while (peek() != '"' && !isAtEnd()) {
        if (peek() == '\n')
            tokenizer.line++;
        advance();
    }

    if (isAtEnd())
        return errorToken("Unterminated string.");

    // The closing quote.
    advance();
    return makeToken(TOKEN_STRING);
}

/*
 * Scan and return the next token from the source given to initTokenizer().
 * Returns TOKEN_EOF at end of input (and again on every later call) and a
 * TOKEN_ERROR token for a character that starts no token.
 *
 * NOTE(review): isDigit() accepts '-', so a '-' is always routed to number()
 * and the `case '-'` arm below is never reached -- confirm which behaviour
 * is intended before changing either.
 */
Token nextToken() {
    char c;

    skipWhitespace();
    tokenizer.start = tokenizer.current;

    if (isAtEnd())
        return makeToken(TOKEN_EOF);

    c = advance();
    if (isAlpha(c))
        return identifier();
    if (isDigit(c))
        return number();

    switch (c) {
    case '(':
        return makeToken(TOKEN_LEFT_PAREN);
    case ')':
        return makeToken(TOKEN_RIGHT_PAREN);
    case '{':
        return makeToken(TOKEN_LEFT_BRACE);
    case '}':
        return makeToken(TOKEN_RIGHT_BRACE);
    case '-':
        return makeToken(TOKEN_NEGATIVE);
    case '~':
        return makeToken(TOKEN_TILDE);
    case '/':
        return makeToken(TOKEN_SLASH);
    case '"':
        return string();
    }

    return errorToken("Unexpected character.");
}

/*
 * Return the enumerator name of a token type as a string literal, or NULL
 * for a type that has no entry in the list below.
 */
static const char* debugTokenName(TokenType type) {
/* Expands to `case X: return "X"` so name and value cannot drift apart. */
#define DEBUG_TOKEN_NAME(tok) \
    case tok:                 \
        return #tok

    switch (type) {
        DEBUG_TOKEN_NAME(TOKEN_LEFT_PAREN);
        DEBUG_TOKEN_NAME(TOKEN_RIGHT_PAREN);
        DEBUG_TOKEN_NAME(TOKEN_LEFT_BRACE);
        DEBUG_TOKEN_NAME(TOKEN_RIGHT_BRACE);
        DEBUG_TOKEN_NAME(TOKEN_TILDE);
        DEBUG_TOKEN_NAME(TOKEN_SLASH);
        DEBUG_TOKEN_NAME(TOKEN_MINUS);
        DEBUG_TOKEN_NAME(TOKEN_IDENTIFIER);
        DEBUG_TOKEN_NAME(TOKEN_STRING);
        DEBUG_TOKEN_NAME(TOKEN_FLOAT);
        DEBUG_TOKEN_NAME(TOKEN_LIST);
        DEBUG_TOKEN_NAME(TOKEN_ERROR);
        DEBUG_TOKEN_NAME(TOKEN_FALSE);
        DEBUG_TOKEN_NAME(TOKEN_TRUE);
        DEBUG_TOKEN_NAME(TOKEN_PI);
        DEBUG_TOKEN_NAME(TOKEN_E);
        DEBUG_TOKEN_NAME(TOKEN_EOF);
        DEBUG_TOKEN_NAME(TOKEN_POP);
        DEBUG_TOKEN_NAME(TOKEN_DUP);
        DEBUG_TOKEN_NAME(TOKEN_EXCH);
        DEBUG_TOKEN_NAME(TOKEN_CLEAR);
        DEBUG_TOKEN_NAME(TOKEN_REMEMBER);
        DEBUG_TOKEN_NAME(TOKEN_FORGET);
        DEBUG_TOKEN_NAME(TOKEN_DUMP);
        DEBUG_TOKEN_NAME(TOKEN_NAME);
        DEBUG_TOKEN_NAME(TOKEN_SET);
        DEBUG_TOKEN_NAME(TOKEN_IFYES);
        DEBUG_TOKEN_NAME(TOKEN_IFNO);
        DEBUG_TOKEN_NAME(TOKEN_CHOOSE);
        DEBUG_TOKEN_NAME(TOKEN_EVAL);
        DEBUG_TOKEN_NAME(TOKEN_ESCAPE);
        DEBUG_TOKEN_NAME(TOKEN_REPEAT);
        DEBUG_TOKEN_NAME(TOKEN_SPLIT);
        DEBUG_TOKEN_NAME(TOKEN_CONS);
        DEBUG_TOKEN_NAME(TOKEN_SHATTER);
        DEBUG_TOKEN_NAME(TOKEN_EMPTY);
        DEBUG_TOKEN_NAME(TOKEN_COMPOSE);
        DEBUG_TOKEN_NAME(TOKEN_STREQ);
        DEBUG_TOKEN_NAME(TOKEN_STRCUT);
        DEBUG_TOKEN_NAME(TOKEN_STRMEASURE);
        DEBUG_TOKEN_NAME(TOKEN_STRTIE);
        DEBUG_TOKEN_NAME(TOKEN_EXPLODE);
        DEBUG_TOKEN_NAME(TOKEN_ADD);
        DEBUG_TOKEN_NAME(TOKEN_SUB);
        DEBUG_TOKEN_NAME(TOKEN_MUL);
        DEBUG_TOKEN_NAME(TOKEN_DIV);
        DEBUG_TOKEN_NAME(TOKEN_IDIV);
        DEBUG_TOKEN_NAME(TOKEN_MOD);
        DEBUG_TOKEN_NAME(TOKEN_POW);
        DEBUG_TOKEN_NAME(TOKEN_SQRT);
        DEBUG_TOKEN_NAME(TOKEN_ADD1);
        DEBUG_TOKEN_NAME(TOKEN_SUB1);
        DEBUG_TOKEN_NAME(TOKEN_SIN);
        DEBUG_TOKEN_NAME(TOKEN_COS);
        DEBUG_TOKEN_NAME(TOKEN_TAN);
        DEBUG_TOKEN_NAME(TOKEN_ATAN);
        DEBUG_TOKEN_NAME(TOKEN_LN);
        DEBUG_TOKEN_NAME(TOKEN_LOG);
        DEBUG_TOKEN_NAME(TOKEN_LOG3);
        DEBUG_TOKEN_NAME(TOKEN_CLIP);
        DEBUG_TOKEN_NAME(TOKEN_SMOOTH);
        DEBUG_TOKEN_NAME(TOKEN_HOWMUCH);
        DEBUG_TOKEN_NAME(TOKEN_SETRAND);
        DEBUG_TOKEN_NAME(TOKEN_RAND);
        DEBUG_TOKEN_NAME(TOKEN_INT);
        DEBUG_TOKEN_NAME(TOKEN_NUMBERIZE);
        DEBUG_TOKEN_NAME(TOKEN_ISOLATE);
        DEBUG_TOKEN_NAME(TOKEN_MIX);
        DEBUG_TOKEN_NAME(TOKEN_CONTRADICT);
        DEBUG_TOKEN_NAME(TOKEN_COMPL);
        DEBUG_TOKEN_NAME(TOKEN_SHIFTRIGHT);
        DEBUG_TOKEN_NAME(TOKEN_SHIFTLEFT);
        DEBUG_TOKEN_NAME(TOKEN_GT);
        DEBUG_TOKEN_NAME(TOKEN_LT);
        DEBUG_TOKEN_NAME(TOKEN_EQ);
        DEBUG_TOKEN_NAME(TOKEN_GE);
        DEBUG_TOKEN_NAME(TOKEN_LE);
        DEBUG_TOKEN_NAME(TOKEN_NE);
        DEBUG_TOKEN_NAME(TOKEN_NULL);
        DEBUG_TOKEN_NAME(TOKEN_NEGATIVE);
        DEBUG_TOKEN_NAME(TOKEN_ISNULL);
        DEBUG_TOKEN_NAME(TOKEN_ISINT);
        DEBUG_TOKEN_NAME(TOKEN_ISNUMBER);
        DEBUG_TOKEN_NAME(TOKEN_AND);
        DEBUG_TOKEN_NAME(TOKEN_OR);
        DEBUG_TOKEN_NAME(TOKEN_XOR);
        DEBUG_TOKEN_NAME(TOKEN_DISP);
        DEBUG_TOKEN_NAME(TOKEN_LISTEN);
        DEBUG_TOKEN_NAME(TOKEN_COMPLAIN);
        DEBUG_TOKEN_NAME(TOKEN_TIME);
        DEBUG_TOKEN_NAME(TOKEN_GARBAGE_COLLECT);
    default:
        return NULL;
    }

#undef DEBUG_TOKEN_NAME
}

/*
 * Print one line describing t to stdout, in the form
 *     TOKEN_NAME lexeme line_no=N
 * The lexeme is taken from the token itself (t.start / t.length), so an
 * error token shows its message. Token types with no known name print
 * nothing.
 */
void debug_printToken(Token t) {
    const char* name = debugTokenName(t.type);

    if (name == NULL)
        return;

    /* %.*s prints exactly t.length bytes; the lexeme is not NUL-terminated. */
    printf("%s %.*s line_no=%d\n", name, (int)t.length, t.start, (int)t.line);
}