varaq-wasm-c/tokenizer.c

743 lines
18 KiB
C

#include "tokenizer.h"
#include "common.h"
/*
Adapted from Section 6.6 of The C Programming Language
by Brian Kernighan and Dennis Ritchie
*/
/* One entry in a hash bucket's singly linked chain: a keyword spelling and
   the token type it maps to. */
typedef struct Map
{
  struct Map* next; /* next entry in the same bucket, or NULL at chain end */
  char* keyword;    /* keyword text; put() stores a strdup'd copy here */
  TokenType token;  /* token type reported for this keyword */
} Map;
/* Number of buckets in the keyword hash table. */
#define HASHSIZE 150
/* Bucket heads, indexed by hash(); static storage, so every bucket starts
   out as NULL (an empty chain). */
static Map* hashtab[HASHSIZE];
/* Map a NUL-terminated string to a bucket index in [0, HASHSIZE) using the
   multiply-by-31 string hash. */
unsigned int
hash(char* s)
{
  unsigned int h = 0;
  while (*s != '\0') {
    h = *s + 31 * h;
    s++;
  }
  return h % HASHSIZE;
}
/* Find the table entry whose keyword equals s.
   Returns the entry, or NULL when s is not in the table. */
Map*
lookup(char* s)
{
  Map* entry = hashtab[hash(s)];
  // Walk the bucket chain until a match is found or the chain ends.
  while (entry != NULL && strcmp(s, entry->keyword) != 0)
    entry = entry->next;
  return entry;
}
/* Return the token type registered for s, or TOKEN_IDENTIFIER when s is not
   a known keyword. */
TokenType
get(char* s)
{
  Map* entry = lookup(s);
  return entry != NULL ? entry->token : TOKEN_IDENTIFIER;
}
/* Register keyword -> token in the hash table.
   If the keyword is already present only its token is overwritten;
   otherwise a new entry holding a private copy of the keyword is pushed
   onto the front of its bucket chain.
   Returns the entry, or NULL if memory could not be allocated (the table
   is left unchanged in that case). */
Map*
put(char* keyword, TokenType token)
{
  Map* np = lookup(keyword);
  if (np == NULL) {
    np = malloc(sizeof(*np));
    if (np == NULL)
      return NULL;
    np->keyword = strdup(keyword);
    if (np->keyword == NULL) {
      // Release the half-built node so a failed key copy does not leak it.
      free(np);
      return NULL;
    }
    unsigned int hashval = hash(keyword);
    np->next = hashtab[hashval];
    hashtab[hashval] = np;
  }
  np->token = token;
  return np;
}
void
initMap()
{
put("and", TOKEN_AND);
put("atan", TOKEN_ATAN);
put("add", TOKEN_ADD);
put("bep", TOKEN_COMPLAIN);
put("complain", TOKEN_COMPLAIN);
put("compl", TOKEN_COMPL);
put("compose", TOKEN_COMPOSE);
put("contradict", TOKEN_CONTRADICT);
put("cons", TOKEN_CONS);
put("cos", TOKEN_COS);
put("chImmoH", TOKEN_CLEAR);
put("chIm'a'", TOKEN_EMPTY);
put("cher", TOKEN_SET);
put("boq", TOKEN_ADD);
put("choose", TOKEN_CHOOSE);
put("chov", TOKEN_EVAL);
put("chuv", TOKEN_MOD);
put("cha'", TOKEN_DISP);
put("clear", TOKEN_CLEAR);
put("dup", TOKEN_DUP);
put("dump", TOKEN_DUMP);
put("disp", TOKEN_DISP);
put("div", TOKEN_DIV);
put("DuD", TOKEN_MIX);
put("e", TOKEN_E);
put("exch", TOKEN_EXCH);
put("eval", TOKEN_EVAL);
put("escape", TOKEN_ESCAPE);
put("empty?", TOKEN_EMPTY);
put("explode", TOKEN_EXPLODE);
put("eq?", TOKEN_EQ);
put("forget", TOKEN_FORGET);
put("gt?", TOKEN_GT);
put("ge?", TOKEN_GE);
put("ghap", TOKEN_XOR);
put("ghurmI'", TOKEN_E);
put("ghurtaH", TOKEN_LN);
put("ghorqu'", TOKEN_SHATTER);
put("ghobe'chugh", TOKEN_IFNO);
put("HIja'chugh", TOKEN_IFYES);
put("Hotlh", TOKEN_DUMP);
put("HeHmI'", TOKEN_PI);
put("Habwav", TOKEN_IDIV);
put("HabmI''a'", TOKEN_INT);
put("idiv", TOKEN_IDIV);
put("int?", TOKEN_INT);
put("isolate", TOKEN_ISOLATE);
put("ifyes", TOKEN_IFYES);
put("ifno", TOKEN_IFNO);
put("je", TOKEN_AND);
put("jor", TOKEN_EXPLODE);
put("joq", TOKEN_OR);
put("ln", TOKEN_LN);
put("lt?", TOKEN_LT);
put("le?", TOKEN_LE);
put("listen", TOKEN_LISTEN);
put("loS'ar", TOKEN_SQRT);
put("log", TOKEN_LOG);
put("log3", TOKEN_LOG3);
put("latlh", TOKEN_DUP);
put("law'moH", TOKEN_MUL);
put("law'qa'moH", TOKEN_POW);
put("law''a'", TOKEN_GT);
put("law'rap'a'", TOKEN_GE);
put("maHghurtaH", TOKEN_LOG);
put("mix", TOKEN_MIX);
put("mi'moH", TOKEN_NUMBERIZE);
put("muv", TOKEN_CONS);
put("mul", TOKEN_MUL);
put("mod", TOKEN_MOD);
put("mobmoH", TOKEN_ISOLATE);
put("mIScher", TOKEN_SETRAND);
put("mIS", TOKEN_RAND);
put("mI''a'", TOKEN_FLOAT);
put("nIHghoS", TOKEN_SHIFTRIGHT);
put("ne?", TOKEN_NE);
put("negative?", TOKEN_NEGATIVE);
put("name", TOKEN_NAME);
put("nargh", TOKEN_ESCAPE);
put("naQmoH", TOKEN_COMPOSE);
put("number?", TOKEN_ISNUMBER);
put("numberize", TOKEN_NUMBERIZE);
put("null?", TOKEN_NULL);
put("or", TOKEN_OR);
put("pi", TOKEN_PI);
put("pagh'a'", TOKEN_NULL);
put("pop", TOKEN_POP);
put("pong", TOKEN_NAME);
put("pow", TOKEN_POW);
put("poSghoS", TOKEN_SHIFTLEFT);
put("puS'a'", TOKEN_LT);
put("puSrap'a'", TOKEN_LE);
put("qaw", TOKEN_REMEMBER);
put("qawHa'", TOKEN_FORGET);
put("qojmI'", TOKEN_TAN);
put("qojHa'", TOKEN_ATAN);
put("Qo'moH", TOKEN_COMPL);
put("remember", TOKEN_REMEMBER);
put("repeat", TOKEN_REPEAT);
put("rand", TOKEN_RAND);
put("rap'a'", TOKEN_EQ);
put("rapbe'a'", TOKEN_NE);
put("set", TOKEN_SET);
put("split", TOKEN_SPLIT);
put("shatter", TOKEN_SHATTER);
put("strcut", TOKEN_STRCUT);
put("streq?", TOKEN_STREQ);
put("strmeasure", TOKEN_STRMEASURE);
put("strtie", TOKEN_STRTIE);
put("tlheghrar", TOKEN_STRTIE);
put("sub", TOKEN_SUB);
put("sub1", TOKEN_SUB1);
put("sqrt", TOKEN_SQRT);
put("sin", TOKEN_SIN);
put("clip", TOKEN_CLIP);
put("poD", TOKEN_CLIP);
put("smooth", TOKEN_SMOOTH);
put("Hab", TOKEN_SMOOTH);
put("howmuch", TOKEN_HOWMUCH);
put("'ar", TOKEN_HOWMUCH);
put("setrand", TOKEN_SETRAND);
put("shift right", TOKEN_SHIFTRIGHT);
put("shift left", TOKEN_SHIFTLEFT);
put("SIj", TOKEN_SPLIT);
put("boqHa'", TOKEN_SUB);
put("tam", TOKEN_EXCH);
put("tan", TOKEN_TAN);
put("taH'a'", TOKEN_NEGATIVE);
put("tlhoch", TOKEN_CONTRADICT);
put("tlheghpe'", TOKEN_STRCUT);
put("tlheghjuv", TOKEN_STRMEASURE);
put("tlheghrap'a'", TOKEN_STREQ);
put("vangqa'", TOKEN_REPEAT);
put("wIv", TOKEN_CHOOSE);
put("woD", TOKEN_POP);
put("wav", TOKEN_DIV);
put("wa'teq", TOKEN_SUB1);
put("wa'chel", TOKEN_ADD1);
put("wejghurtaH", TOKEN_LOG3);
put("xor", TOKEN_XOR);
put("\'Ij", TOKEN_LISTEN);
put("time", TOKEN_TIME);
put("poH", TOKEN_TIME);
// Wrong word in original spec, old one meant "waving hands or flapping"
// Also fixes the conflicting joq issue meaning sin or 'or'
put("yu'eghHa'", TOKEN_COS);
put("yu'egh", TOKEN_SIN);
// This one has a special case too as it is the same as the '~' operator
put("lI'moH", TOKEN_TILDE);
put("woDHa'", TOKEN_GARBAGE_COLLECT);
put("gc", TOKEN_GARBAGE_COLLECT);
}
/* Scanner state over a NUL-terminated source buffer. */
typedef struct Tokenizer
{
  char* start;   /* first character of the token currently being scanned */
  char* current; /* next character to be consumed */
  int32_t line;  /* current line number; initTokenizer() starts it at 1 */
} Tokenizer;

/* The single scanner instance used by every function in this file. */
Tokenizer tokenizer;
/* Point the global tokenizer at the beginning of src and reset the line
   counter to 1. The buffer is not copied; the tokenizer keeps pointers
   into it. */
void
initTokenizer(char* src)
{
  tokenizer.line = 1;
  tokenizer.start = tokenizer.current = src;
}
/* True for characters that may appear in a word: ASCII letters plus '_',
   the apostrophe and '?'. */
static bool
isAlpha(char c)
{
  if (c >= 'a' && c <= 'z')
    return true;
  if (c >= 'A' && c <= 'Z')
    return true;
  switch (c) {
    case '_':
    case '\'':
    case '?':
      return true;
    default:
      return false;
  }
}
/* True for the decimal digits '0'..'9' and also for '-', which this
   scanner treats as a digit character. */
static bool
isDigit(char c)
{
  if (c == '-')
    return true;
  return c >= '0' && c <= '9';
}
/* True when the scanner is positioned on the terminating NUL of the
   source buffer. */
static bool
isAtEnd()
{
  return tokenizer.current[0] == '\0';
}
/* Build a token of the given type spanning [tokenizer.start,
   tokenizer.current) on the current line. The token points into the source
   buffer; no text is copied. */
static Token
makeToken(TokenType type)
{
  Token result = {
    .type = type,
    .start = tokenizer.start,
    .length = (int32_t)(tokenizer.current - tokenizer.start),
    .line = tokenizer.line,
  };
  return result;
}
static Token
errorToken(char* msg)
{
Token token;
token.type = TOKEN_ERROR;
token.start = msg;
token.length = (int32_t)strlen(msg);
token.line = tokenizer.line;
return token;
}
/* Consume one character and return it. Does not check for end of input. */
static char
advance()
{
  char consumed = *tokenizer.current;
  tokenizer.current++;
  return consumed;
}
/* Return the next unconsumed character without consuming it. */
static char
peek()
{
  return tokenizer.current[0];
}
/* Return the character after the next one without consuming anything, or
   '\0' when the scanner is already at end of input. */
static char
peekNext()
{
  return isAtEnd() ? '\0' : tokenizer.current[1];
}
/* Consume the next character only if it equals expected.
   Returns true when a character was consumed, false otherwise (including
   at end of input). */
static bool
match(char expected)
{
  if (isAtEnd() || *tokenizer.current != expected)
    return false;
  tokenizer.current++;
  return true;
}
/* Advance past everything that is not part of a token: spaces, tabs,
   carriage returns, newlines, "//" line comments and "(* ... *)" block
   comments. tokenizer.line is incremented for every newline skipped,
   including newlines inside block comments. An unterminated block comment
   silently consumes the rest of the input. On return the scanner sits on
   the first character of the next token, or on the terminating NUL. */
static void
skipWhitespace()
{
  for (;;) {
    switch (peek()) {
      case ' ':
      case '\r':
      case '\t':
        advance();
        break;
      case '\n':
        tokenizer.line++;
        advance();
        break;
      case '/':
        if (peekNext() != '/')
          return; // a lone '/' is a token, not a comment
        // Line comment: skip up to (not including) the newline so the
        // '\n' case above counts the line on the next iteration.
        while (peek() != '\n' && !isAtEnd())
          advance();
        break;
      case '(':
        // A '(' that does not open a comment is a real token. Returning
        // here (instead of falling back into the loop without advancing)
        // is what keeps the scanner from spinning forever on it.
        if (peekNext() != '*')
          return;
        advance(); // consume (
        advance(); // consume *
        // The comment ends only at the two-character sequence "*)"; a
        // lone '*' or ')' inside the comment is ordinary content.
        while (!isAtEnd() && !(peek() == '*' && peekNext() == ')')) {
          if (peek() == '\n')
            tokenizer.line++;
          advance();
        }
        // Only consume the terminator if it is actually there, so an
        // unterminated comment never steps past the end of the buffer.
        if (!isAtEnd()) {
          advance(); // consume *
          advance(); // consume )
        }
        break;
      default:
        return;
    }
  }
}
/* Compare the tail of the current lexeme against a keyword suffix.
   Returns type when the lexeme is exactly start + length characters long
   and its bytes from offset start equal the first length bytes of rest;
   otherwise returns TOKEN_IDENTIFIER. */
static TokenType
checkKeyword(int start, int length, char* rest, TokenType type)
{
  // Length check first: memcmp must not run on a lexeme of the wrong size.
  if (tokenizer.current - tokenizer.start != start + length)
    return TOKEN_IDENTIFIER;
  if (memcmp(tokenizer.start + start, rest, length) != 0)
    return TOKEN_IDENTIFIER;
  return type;
}
/* Return a freshly allocated, NUL-terminated copy of the current lexeme
   (the characters in [tokenizer.start, tokenizer.current)).
   The caller owns the result and must free() it.
   Returns NULL if the allocation fails. */
static char*
currentTokenToS()
{
  size_t size = (size_t)(tokenizer.current - tokenizer.start);
  // Allocate the lexeme length plus the terminator. The previous
  // malloc(sizeof(size)) always reserved sizeof(int32_t) bytes, which
  // overflowed the heap for any lexeme of that many characters or more.
  char* str = malloc(size + 1);
  if (str == NULL)
    return NULL;
  memcpy(str, tokenizer.start, size);
  str[size] = '\0';
  return str;
}
static TokenType
identifierType()
{
char* check = currentTokenToS();
TokenType t = get(check);
free(check);
return t;
}
/* Scan the rest of a word whose first character has already been consumed,
   then return it as a keyword token or TOKEN_IDENTIFIER. A word continues
   for as long as isAlpha() or isDigit() accepts the next character. */
static Token
identifier()
{
  for (;;) {
    char next = peek();
    if (!isAlpha(next) && !isDigit(next))
      break;
    advance();
  }
  return makeToken(identifierType());
}
/* True only for the decimal digits '0'..'9' (unlike isDigit(), '-' is not
   accepted). */
static bool
isPlainDigit(char c)
{
  return c >= '0' && c <= '9';
}

/* Scan the rest of a numeric literal whose first character (a digit or a
   leading '-') has already been consumed by the caller, and return it as
   TOKEN_FLOAT. The literal is digits, optionally followed by '.' and at
   least one more digit.
   A '-' is accepted only as that already-consumed first character: the
   continuation uses a strict digit test, so input such as "1-2" or "1.-5"
   is no longer swallowed into one malformed literal.
   NOTE(review): a lone "-" with no digits after it still comes back as a
   TOKEN_FLOAT with lexeme "-", because the caller dispatches here for
   '-'. */
static Token
number()
{
  while (isPlainDigit(peek()))
    advance();
  // Look for a fractional part.
  if (peek() == '.' && isPlainDigit(peekNext())) {
    // Consume the ".".
    advance();
    while (isPlainDigit(peek()))
      advance();
  }
  return makeToken(TOKEN_FLOAT); // or measure if ends in postscript
}
/* Scan a string literal whose opening quote has already been consumed.
   Returns TOKEN_STRING spanning both quotes, or an error token if the input
   ends before a closing quote. Newlines inside the literal are allowed and
   are counted in tokenizer.line. */
static Token
string()
{
  for (;;) {
    if (isAtEnd())
      return errorToken("Unterminated string.");
    char c = peek();
    if (c == '"')
      break;
    if (c == '\n')
      tokenizer.line++;
    advance();
  }
  // The closing quote.
  advance();
  return makeToken(TOKEN_STRING);
}
Token
nextToken()
{
skipWhitespace();
tokenizer.start = tokenizer.current;
if (isAtEnd())
return makeToken(TOKEN_EOF);
char c = advance();
if (isAlpha(c))
return identifier();
if (isDigit(c))
return number();
switch (c) {
case '(':
return makeToken(TOKEN_LEFT_PAREN);
case ')':
return makeToken(TOKEN_RIGHT_PAREN);
case '{':
return makeToken(TOKEN_LEFT_BRACE);
case '}':
return makeToken(TOKEN_RIGHT_BRACE);
case '-':
return makeToken(TOKEN_NEGATIVE);
case '~':
return makeToken(TOKEN_TILDE);
case '/':
return makeToken(TOKEN_SLASH);
case '"':
return string();
}
return errorToken("Unexpected character.");
}
void
debug_printToken(Token t)
{
char* str = currentTokenToS();
switch (t.type) {
case TOKEN_LEFT_PAREN:
printf("TOKEN_LEFT_PAREN %s line_no=%d\n", str, t.line);
break;
case TOKEN_RIGHT_PAREN:
printf("TOKEN_RIGHT_PAREN %s line_no=%d\n", str, t.line);
break;
case TOKEN_LEFT_BRACE:
printf("TOKEN_LEFT_BRACE %s line_no=%d\n", str, t.line);
break;
case TOKEN_RIGHT_BRACE:
printf("TOKEN_RIGHT_BRACE %s line_no=%d\n", str, t.line);
break;
case TOKEN_TILDE:
printf("TOKEN_TILDE %s line_no=%d\n", str, t.line);
break;
case TOKEN_SLASH:
printf("TOKEN_SLASH %s line_no=%d\n", str, t.line);
break;
case TOKEN_MINUS:
printf("TOKEN_MINUS %s line_no=%d\n", str, t.line);
break;
case TOKEN_IDENTIFIER:
printf("TOKEN_IDENTIFIER %s line_no=%d\n", str, t.line);
break;
case TOKEN_STRING:
printf("TOKEN_STRING %s line_no=%d\n", str, t.line);
break;
case TOKEN_FLOAT:
printf("TOKEN_FLOAT %s line_no=%d\n", str, t.line);
break;
case TOKEN_LIST:
printf("TOKEN_LIST %s line_no=%d\n", str, t.line);
break;
case TOKEN_ERROR:
printf("TOKEN_ERROR %s line_no=%d\n", str, t.line);
break;
case TOKEN_FALSE:
printf("TOKEN_FALSE %s line_no=%d\n", str, t.line);
break;
case TOKEN_TRUE:
printf("TOKEN_TRUE %s line_no=%d\n", str, t.line);
break;
case TOKEN_PI:
printf("TOKEN_PI %s line_no=%d\n", str, t.line);
break;
case TOKEN_E:
printf("TOKEN_E %s line_no=%d\n", str, t.line);
break;
case TOKEN_EOF:
printf("TOKEN_EOF %s line_no=%d\n", str, t.line);
break;
case TOKEN_POP:
printf("TOKEN_POP %s line_no=%d\n", str, t.line);
break;
case TOKEN_DUP:
printf("TOKEN_DUP %s line_no=%d\n", str, t.line);
break;
case TOKEN_EXCH:
printf("TOKEN_EXCH %s line_no=%d\n", str, t.line);
break;
case TOKEN_CLEAR:
printf("TOKEN_CLEAR %s line_no=%d\n", str, t.line);
break;
case TOKEN_REMEMBER:
printf("TOKEN_REMEMBER %s line_no=%d\n", str, t.line);
break;
case TOKEN_FORGET:
printf("TOKEN_FORGET %s line_no=%d\n", str, t.line);
break;
case TOKEN_DUMP:
printf("TOKEN_DUMP %s line_no=%d\n", str, t.line);
break;
case TOKEN_NAME:
printf("TOKEN_NAME %s line_no=%d\n", str, t.line);
break;
case TOKEN_SET:
printf("TOKEN_SET %s line_no=%d\n", str, t.line);
break;
case TOKEN_IFYES:
printf("TOKEN_IFYES %s line_no=%d\n", str, t.line);
break;
case TOKEN_IFNO:
printf("TOKEN_IFNO %s line_no=%d\n", str, t.line);
break;
case TOKEN_CHOOSE:
printf("TOKEN_CHOOSE %s line_no=%d\n", str, t.line);
break;
case TOKEN_EVAL:
printf("TOKEN_EVAL %s line_no=%d\n", str, t.line);
break;
case TOKEN_ESCAPE:
printf("TOKEN_ESCAPE %s line_no=%d\n", str, t.line);
break;
case TOKEN_REPEAT:
printf("TOKEN_REPEAT %s line_no=%d\n", str, t.line);
break;
case TOKEN_SPLIT:
printf("TOKEN_SPLIT %s line_no=%d\n", str, t.line);
break;
case TOKEN_CONS:
printf("TOKEN_CONS %s line_no=%d\n", str, t.line);
break;
case TOKEN_SHATTER:
printf("TOKEN_SHATTER %s line_no=%d\n", str, t.line);
break;
case TOKEN_EMPTY:
printf("TOKEN_EMPTY %s line_no=%d\n", str, t.line);
break;
case TOKEN_COMPOSE:
printf("TOKEN_COMPOSE %s line_no=%d\n", str, t.line);
break;
case TOKEN_STREQ:
printf("TOKEN_STREQ %s line_no=%d\n", str, t.line);
break;
case TOKEN_STRCUT:
printf("TOKEN_STRCUT %s line_no=%d\n", str, t.line);
break;
case TOKEN_STRMEASURE:
printf("TOKEN_STRMEASURE %s line_no=%d\n", str, t.line);
break;
case TOKEN_STRTIE:
printf("TOKEN_STRTIE %s line_no=%d\n", str, t.line);
break;
case TOKEN_EXPLODE:
printf("TOKEN_EXPLODE %s line_no=%d\n", str, t.line);
break;
case TOKEN_ADD:
printf("TOKEN_ADD %s line_no=%d\n", str, t.line);
break;
case TOKEN_SUB:
printf("TOKEN_SUB %s line_no=%d\n", str, t.line);
break;
case TOKEN_MUL:
printf("TOKEN_MUL %s line_no=%d\n", str, t.line);
break;
case TOKEN_DIV:
printf("TOKEN_DIV %s line_no=%d\n", str, t.line);
break;
case TOKEN_IDIV:
printf("TOKEN_IDIV %s line_no=%d\n", str, t.line);
break;
case TOKEN_MOD:
printf("TOKEN_MOD %s line_no=%d\n", str, t.line);
break;
case TOKEN_POW:
printf("TOKEN_POW %s line_no=%d\n", str, t.line);
break;
case TOKEN_SQRT:
printf("TOKEN_SQRT %s line_no=%d\n", str, t.line);
break;
case TOKEN_ADD1:
printf("TOKEN_ADD1 %s line_no=%d\n", str, t.line);
break;
case TOKEN_SUB1:
printf("TOKEN_SUB1 %s line_no=%d\n", str, t.line);
break;
case TOKEN_SIN:
printf("TOKEN_SIN %s line_no=%d\n", str, t.line);
break;
case TOKEN_COS:
printf("TOKEN_COS %s line_no=%d\n", str, t.line);
break;
case TOKEN_TAN:
printf("TOKEN_TAN %s line_no=%d\n", str, t.line);
break;
case TOKEN_ATAN:
printf("TOKEN_ATAN %s line_no=%d\n", str, t.line);
break;
case TOKEN_LN:
printf("TOKEN_LN %s line_no=%d\n", str, t.line);
break;
case TOKEN_LOG:
printf("TOKEN_LOG %s line_no=%d\n", str, t.line);
break;
case TOKEN_LOG3:
printf("TOKEN_LOG3 %s line_no=%d\n", str, t.line);
break;
case TOKEN_CLIP:
printf("TOKEN_CLIP %s line_no=%d\n", str, t.line);
break;
case TOKEN_SMOOTH:
printf("TOKEN_SMOOTH %s line_no=%d\n", str, t.line);
break;
case TOKEN_HOWMUCH:
printf("TOKEN_HOWMUCH %s line_no=%d\n", str, t.line);
break;
case TOKEN_SETRAND:
printf("TOKEN_SETRAND %s line_no=%d\n", str, t.line);
break;
case TOKEN_RAND:
printf("TOKEN_RAND %s line_no=%d\n", str, t.line);
break;
case TOKEN_INT:
printf("TOKEN_INT %s line_no=%d\n", str, t.line);
break;
case TOKEN_NUMBERIZE:
printf("TOKEN_NUMBERIZE %s line_no=%d\n", str, t.line);
break;
case TOKEN_ISOLATE:
printf("TOKEN_ISOLATE %s line_no=%d\n", str, t.line);
break;
case TOKEN_MIX:
printf("TOKEN_MIX %s line_no=%d\n", str, t.line);
break;
case TOKEN_CONTRADICT:
printf("TOKEN_CONTRADICT %s line_no=%d\n", str, t.line);
break;
case TOKEN_COMPL:
printf("TOKEN_COMPL %s line_no=%d\n", str, t.line);
break;
case TOKEN_SHIFTRIGHT:
printf("TOKEN_SHIFTRIGHT %s line_no=%d\n", str, t.line);
break;
case TOKEN_SHIFTLEFT:
printf("TOKEN_SHIFTLEFT %s line_no=%d\n", str, t.line);
break;
case TOKEN_GT:
printf("TOKEN_GT %s line_no=%d\n", str, t.line);
break;
case TOKEN_LT:
printf("TOKEN_LT %s line_no=%d\n", str, t.line);
break;
case TOKEN_EQ:
printf("TOKEN_EQ %s line_no=%d\n", str, t.line);
break;
case TOKEN_GE:
printf("TOKEN_GE %s line_no=%d\n", str, t.line);
break;
case TOKEN_LE:
printf("TOKEN_LE %s line_no=%d\n", str, t.line);
break;
case TOKEN_NE:
printf("TOKEN_NE %s line_no=%d\n", str, t.line);
break;
case TOKEN_NULL:
printf("TOKEN_NULL %s line_no=%d\n", str, t.line);
break;
case TOKEN_NEGATIVE:
printf("TOKEN_NEGATIVE %s line_no=%d\n", str, t.line);
break;
case TOKEN_ISNULL:
printf("TOKEN_ISNULL %s line_no=%d\n", str, t.line);
break;
case TOKEN_ISINT:
printf("TOKEN_ISINT %s line_no=%d\n", str, t.line);
break;
case TOKEN_ISNUMBER:
printf("TOKEN_ISNUMBER %s line_no=%d\n", str, t.line);
break;
case TOKEN_AND:
printf("TOKEN_AND %s line_no=%d\n", str, t.line);
break;
case TOKEN_OR:
printf("TOKEN_OR %s line_no=%d\n", str, t.line);
break;
case TOKEN_XOR:
printf("TOKEN_XOR %s line_no=%d\n", str, t.line);
break;
case TOKEN_DISP:
printf("TOKEN_DISP %s line_no=%d\n", str, t.line);
break;
case TOKEN_LISTEN:
printf("TOKEN_LISTEN %s line_no=%d\n", str, t.line);
break;
case TOKEN_COMPLAIN:
printf("TOKEN_COMPLAIN %s line_no=%d\n", str, t.line);
break;
case TOKEN_TIME:
printf("TOKEN_TIME %s line_no=%d\n", str, t.line);
break;
case TOKEN_GARBAGE_COLLECT:
printf("TOKEN_GARBAGE_COLLECT %s line_no=%d\n", str, t.line);
break;
}
free(str);
}