/*
 * varaq-wasm-c/tokenizer.c
 *
 * Repository web-view header (not part of the program):
 *   747 lines, 18 KiB, C -- Raw / Normal View / History
 *   2023-02-04 14:06:00 -05:00
 */
#include "tokenizer.h"
#include "common.h"
/*
Adapted from Section 6.6 of The C Programming Language
by Brian Kernighan and Dennis Ritchie
*/
/* One keyword-table entry: a keyword spelling bound to a token type.
   Entries that hash to the same bucket are chained through `next`.  */
typedef struct Map
{
  struct Map *next; /* next entry in the same bucket, or NULL */
  char *keyword;    /* keyword text (set from strdup() in put()) */
  TokenType token;  /* token type returned for this keyword */
} Map;

/* Number of buckets in the keyword hash table.  */
#define HASHSIZE 150

/* Bucket heads; static storage, so every bucket starts out NULL.  */
static Map *hashtab[HASHSIZE];
/* Compute the bucket index for the NUL-terminated string `s`.

   Each character is folded into a running value as `31 * h + c`
   (unsigned arithmetic, so it wraps), and the result is reduced
   modulo HASHSIZE.  Returns a value in [0, HASHSIZE).  */
unsigned int
hash (char *s)
{
  unsigned int h = 0;

  while (*s != '\0')
    {
      h = 31 * h + *s;
      s++;
    }
  return h % HASHSIZE;
}
/* Find the keyword-table entry whose keyword equals `s`.

   Walks the bucket chain selected by hash (s) and compares with
   strcmp().  Returns the matching entry, or NULL if there is none.  */
Map *
lookup (char *s)
{
  Map *entry = hashtab[hash (s)];

  while (entry != NULL && strcmp (s, entry->keyword) != 0)
    entry = entry->next;
  return entry;
}
/* Return the token type registered for keyword `s`.

   Walks the bucket chain selected by hash (s).  Any string that is
   not in the table is classified as TOKEN_IDENTIFIER.  */
TokenType
get (char *s)
{
  Map *entry = hashtab[hash (s)];

  while (entry != NULL)
    {
      if (strcmp (s, entry->keyword) == 0)
        return entry->token;
      entry = entry->next;
    }
  return TOKEN_IDENTIFIER;
}
/* Bind `keyword` to `token` in the keyword table.

   If the keyword is already present its token is overwritten;
   otherwise a new entry holding a private copy of the keyword is
   pushed onto the front of its bucket chain.

   Returns the entry, or NULL if memory could not be allocated (the
   table is left unchanged in that case).  */
Map *
put (char *keyword, TokenType token)
{
  Map *np = lookup (keyword);

  if (np == NULL)
    {
      np = malloc (sizeof *np);
      if (np == NULL)
        return NULL;

      np->keyword = strdup (keyword);
      if (np->keyword == NULL)
        {
          /* Do not leak the node when only the key copy failed.  */
          free (np);
          return NULL;
        }

      unsigned int hashval = hash (keyword);
      np->next = hashtab[hashval];
      hashtab[hashval] = np;
    }
  np->token = token;
  return np;
}
/* Populate the keyword table.

   Each put() call binds one spelling to a token type; several tokens
   have more than one spelling (for example "dup" and "latlh" both map
   to TOKEN_DUP).  The return value of put() is ignored, so a keyword
   whose allocation fails is silently absent from the table.

   NOTE(review): "int?" maps to TOKEN_INT and "null?" to TOKEN_NULL,
   while debug_printToken() also knows TOKEN_ISINT and TOKEN_ISNULL,
   which nothing in this table produces -- confirm which is intended.  */
void
initMap ()
{
  put ("and", TOKEN_AND);
  put ("atan", TOKEN_ATAN);
  put ("add", TOKEN_ADD);
  put ("bep", TOKEN_COMPLAIN);
  put ("complain", TOKEN_COMPLAIN);
  put ("compl", TOKEN_COMPL);
  put ("compose", TOKEN_COMPOSE);
  put ("contradict", TOKEN_CONTRADICT);
  put ("cons", TOKEN_CONS);
  put ("cos", TOKEN_COS);
  put ("chImmoH", TOKEN_CLEAR);
  put ("chIm'a'", TOKEN_EMPTY);
  put ("cher", TOKEN_SET);
  put ("boq", TOKEN_ADD);
  put ("choose", TOKEN_CHOOSE);
  put ("chov", TOKEN_EVAL);
  put ("chuv", TOKEN_MOD);
  put ("cha'", TOKEN_DISP);
  put ("clear", TOKEN_CLEAR);
  put ("dup", TOKEN_DUP);
  put ("dump", TOKEN_DUMP);
  put ("disp", TOKEN_DISP);
  put ("div", TOKEN_DIV);
  put ("DuD", TOKEN_MIX);
  put ("e", TOKEN_E);
  put ("exch", TOKEN_EXCH);
  put ("eval", TOKEN_EVAL);
  put ("escape", TOKEN_ESCAPE);
  put ("empty?", TOKEN_EMPTY);
  put ("explode", TOKEN_EXPLODE);
  put ("eq?", TOKEN_EQ);
  put ("forget", TOKEN_FORGET);
  put ("gt?", TOKEN_GT);
  put ("ge?", TOKEN_GE);
  put ("ghap", TOKEN_XOR);
  put ("ghurmI'", TOKEN_E);
  put ("ghurtaH", TOKEN_LN);
  put ("ghorqu'", TOKEN_SHATTER);
  put ("ghobe'chugh", TOKEN_IFNO);
  put ("HIja'chugh", TOKEN_IFYES);
  put ("Hotlh", TOKEN_DUMP);
  put ("HeHmI'", TOKEN_PI);
  put ("Habwav", TOKEN_IDIV);
  put ("HabmI''a'", TOKEN_INT);
  put ("idiv", TOKEN_IDIV);
  put ("int?", TOKEN_INT);
  put ("isolate", TOKEN_ISOLATE);
  put ("ifyes", TOKEN_IFYES);
  put ("ifno", TOKEN_IFNO);
  put ("je", TOKEN_AND);
  put ("jor", TOKEN_EXPLODE);
  put ("joq", TOKEN_OR);
  put ("ln", TOKEN_LN);
  put ("lt?", TOKEN_LT);
  put ("le?", TOKEN_LE);
  put ("listen", TOKEN_LISTEN);
  put ("loS'ar", TOKEN_SQRT);
  put ("log", TOKEN_LOG);
  put ("log3", TOKEN_LOG3);
  put ("latlh", TOKEN_DUP);
  put ("law'moH", TOKEN_MUL);
  put ("law'qa'moH", TOKEN_POW);
  put ("law''a'", TOKEN_GT);
  put ("law'rap'a'", TOKEN_GE);
  put ("maHghurtaH", TOKEN_LOG);
  put ("mix", TOKEN_MIX);
  put ("mi'moH", TOKEN_NUMBERIZE);
  put ("muv", TOKEN_CONS);
  put ("mul", TOKEN_MUL);
  put ("mod", TOKEN_MOD);
  put ("mobmoH", TOKEN_ISOLATE);
  put ("mIScher", TOKEN_SETRAND);
  put ("mIS", TOKEN_RAND);
  put ("mI''a'", TOKEN_FLOAT);
  put ("nIHghoS", TOKEN_SHIFTRIGHT);
  put ("ne?", TOKEN_NE);
  put ("negative?", TOKEN_NEGATIVE);
  put ("name", TOKEN_NAME);
  put ("nargh", TOKEN_ESCAPE);
  put ("naQmoH", TOKEN_COMPOSE);
  put ("number?", TOKEN_ISNUMBER);
  put ("numberize", TOKEN_NUMBERIZE);
  put ("null?", TOKEN_NULL);
  put ("or", TOKEN_OR);
  put ("pi", TOKEN_PI);
  put ("pagh'a'", TOKEN_NULL);
  put ("pop", TOKEN_POP);
  put ("pong", TOKEN_NAME);
  put ("pow", TOKEN_POW);
  put ("poSghoS", TOKEN_SHIFTLEFT);
  put ("puS'a'", TOKEN_LT);
  put ("puSrap'a'", TOKEN_LE);
  put ("qaw", TOKEN_REMEMBER);
  put ("qawHa'", TOKEN_FORGET);
  put ("qojmI'", TOKEN_TAN);
  put ("qojHa'", TOKEN_ATAN);
  put ("Qo'moH", TOKEN_COMPL);
  put ("remember", TOKEN_REMEMBER);
  put ("repeat", TOKEN_REPEAT);
  put ("rand", TOKEN_RAND);
  put ("rap'a'", TOKEN_EQ);
  put ("rapbe'a'", TOKEN_NE);
  put ("set", TOKEN_SET);
  put ("split", TOKEN_SPLIT);
  put ("shatter", TOKEN_SHATTER);
  put ("strcut", TOKEN_STRCUT);
  put ("streq?", TOKEN_STREQ);
  put ("strmeasure", TOKEN_STRMEASURE);
  put ("strtie", TOKEN_STRTIE);
  put ("tlheghrar", TOKEN_STRTIE);
  put ("sub", TOKEN_SUB);
  put ("sub1", TOKEN_SUB1);
  put ("sqrt", TOKEN_SQRT);
  put ("sin", TOKEN_SIN);
  put ("clip", TOKEN_CLIP);
  put ("poD", TOKEN_CLIP);
  put ("smooth", TOKEN_SMOOTH);
  put ("Hab", TOKEN_SMOOTH);
  put ("howmuch", TOKEN_HOWMUCH);
  put ("'ar", TOKEN_HOWMUCH);
  put ("setrand", TOKEN_SETRAND);
  // The two spellings below contain a space.  The scanner ends an
  // identifier at whitespace, so they can never match a lexeme; they are
  // kept only so direct get()/lookup() callers see no change.
  put ("shift right", TOKEN_SHIFTRIGHT);
  put ("shift left", TOKEN_SHIFTLEFT);
  // Space-free spellings that the scanner can actually produce.
  put ("shiftright", TOKEN_SHIFTRIGHT);
  put ("shiftleft", TOKEN_SHIFTLEFT);
  put ("SIj", TOKEN_SPLIT);
  put ("boqHa'", TOKEN_SUB);
  put ("tam", TOKEN_EXCH);
  put ("tan", TOKEN_TAN);
  put ("taH'a'", TOKEN_NEGATIVE);
  put ("tlhoch", TOKEN_CONTRADICT);
  put ("tlheghpe'", TOKEN_STRCUT);
  put ("tlheghjuv", TOKEN_STRMEASURE);
  put ("tlheghrap'a'", TOKEN_STREQ);
  put ("vangqa'", TOKEN_REPEAT);
  put ("wIv", TOKEN_CHOOSE);
  put ("woD", TOKEN_POP);
  put ("wav", TOKEN_DIV);
  put ("wa'teq", TOKEN_SUB1);
  put ("wa'chel", TOKEN_ADD1);
  // English counterpart of wa'chel, mirroring "sub1" / wa'teq above.
  put ("add1", TOKEN_ADD1);
  put ("wejghurtaH", TOKEN_LOG3);
  put ("xor", TOKEN_XOR);
  put ("\'Ij", TOKEN_LISTEN);
  put ("time", TOKEN_TIME);
  put ("poH", TOKEN_TIME);
  // Wrong word in original spec, old one meant "waving hands or flapping"
  // Also fixes the conflicting joq issue meaning sin or 'or'
  put ("yu'eghHa'", TOKEN_COS);
  put ("yu'egh", TOKEN_SIN);
  // This one has a special case too as it is the same as the '~' operator
  put ("lI'moH", TOKEN_TILDE);
  put ("woDHa'", TOKEN_GARBAGE_COLLECT);
  put ("gc", TOKEN_GARBAGE_COLLECT);
}
/* Scanner state: a cursor pair into the source text plus the current
   line number.  */
typedef struct Tokenizer
{
  char *start;   /* first character of the token being scanned */
  char *current; /* next character to be consumed */
  int32_t line;  /* line number recorded in emitted tokens */
} Tokenizer;

/* The single scanner instance used by every function in this file.  */
Tokenizer tokenizer;
/* Point the global tokenizer at the beginning of `src` and reset the
   line counter to 1.  The pointer is stored, not copied.  */
void
initTokenizer (char *src)
{
  tokenizer.line = 1;
  tokenizer.start = tokenizer.current = src;
}
/* True for characters that may appear in an identifier or keyword:
   ASCII letters, underscore, apostrophe and question mark.  */
static bool
isAlpha (char c)
{
  if (c >= 'a' && c <= 'z')
    return true;
  if (c >= 'A' && c <= 'Z')
    return true;

  switch (c)
    {
    case '_':
    case '\'':
    case '?':
      return true;
    default:
      return false;
    }
}
/* True for the ASCII digits '0'..'9' and also for '-', so a minus sign
   is scanned as part of a numeric literal.  */
static bool
isDigit (char c)
{
  if (c == '-')
    return true;
  return '0' <= c && c <= '9';
}
/* True when the cursor sits on the terminating NUL of the source.  */
static bool
isAtEnd ()
{
  return tokenizer.current[0] == '\0';
}
/* Build a token of the given type whose lexeme is the source text
   between tokenizer.start and tokenizer.current, stamped with the
   current line number.  The token points into the source buffer.  */
static Token
makeToken (TokenType type)
{
  Token result;
  int32_t lexeme_length = (int32_t)(tokenizer.current - tokenizer.start);

  result.line = tokenizer.line;
  result.length = lexeme_length;
  result.start = tokenizer.start;
  result.type = type;
  return result;
}
/* Build a TOKEN_ERROR token whose "lexeme" is the message `msg`
   rather than source text.  The message pointer is stored as-is, so
   it must outlive the token.  */
static Token
errorToken (char *msg)
{
  Token result;
  int32_t msg_length = (int32_t)strlen (msg);

  result.line = tokenizer.line;
  result.length = msg_length;
  result.start = msg;
  result.type = TOKEN_ERROR;
  return result;
}
/* Consume one character: return the character under the cursor and
   move the cursor past it.  No end-of-input check is made here.  */
static char
advance ()
{
  return *tokenizer.current++;
}
/* Return the character under the cursor without consuming it.  */
static char
peek ()
{
  return tokenizer.current[0];
}
/* Return the character one past the cursor without consuming
   anything, or '\0' when the cursor is already at end of input.  */
static char
peekNext ()
{
  return isAtEnd () ? '\0' : *(tokenizer.current + 1);
}
/* Consume the character under the cursor only if it equals
   `expected`.  Returns true when a character was consumed; always
   false at end of input.  */
static bool
match (char expected)
{
  if (isAtEnd () || *tokenizer.current != expected)
    return false;

  tokenizer.current++;
  return true;
}
/* Advance the cursor past everything that does not start a token:

     - spaces, tabs and carriage returns;
     - newlines (each one bumps tokenizer.line);
     - `//` lines, skipped up to but not including the newline;
     - `(* ... *)` block comments, which may span lines.

   Returns with the cursor on the first character of the next token,
   or on the terminating NUL.  An unterminated block comment consumes
   the rest of the input and stops exactly at the NUL.  */
static void
skipWhitespace ()
{
  for (;;)
    {
      switch (peek ())
        {
        case ' ':
        case '\r':
        case '\t':
          advance ();
          break;

        case '\n':
          tokenizer.line++;
          advance ();
          break;

        case '/':
          if (peekNext () != '/')
            return; /* a lone '/' is a token */
          // Ignore the preprocessor import until end of the line.
          while (peek () != '\n' && !isAtEnd ())
            advance ();
          break;

        case '(':
          /* A '(' that does not open a comment is a token.  Returning
             here is essential: nothing was consumed, so looping again
             would never terminate.  */
          if (peekNext () != '*')
            return;

          advance (); // consume (
          advance (); // consume *

          /* Stop only on the two-character terminator "*)", not on a
             stray '*' or ')' inside the comment text.  */
          while (!isAtEnd () && !(peek () == '*' && peekNext () == ')'))
            {
              if (peek () == '\n')
                tokenizer.line++; /* keep line numbers accurate */
              advance ();
            }

          /* Only consume the terminator if it is really there, so the
             cursor never moves past the NUL.  */
          if (!isAtEnd ())
            {
              advance (); // consume *
              advance (); // consume )
            }
          break;

        default:
          return;
        }
    }
}
/* Compare the tail of the current lexeme against a keyword suffix.

   Returns `type` when the lexeme is exactly `start + length`
   characters long and its characters from offset `start` equal the
   first `length` bytes of `rest`; otherwise TOKEN_IDENTIFIER.  */
static TokenType
checkKeyword (int start, int length, char *rest, TokenType type)
{
  if (tokenizer.current - tokenizer.start != start + length)
    return TOKEN_IDENTIFIER;
  if (memcmp (tokenizer.start + start, rest, length) != 0)
    return TOKEN_IDENTIFIER;
  return type;
}
static TokenType
identifierType ()
{
char *check;
int32_t size = tokenizer.current - tokenizer.start;
check = (char *)malloc (sizeof (size));
strncpy (check, tokenizer.start, size);
check[size] = '\0';
TokenType t = get (check);
free (check);
return t;
}
/* Scan the remainder of an identifier or keyword.  Consumes
   characters for as long as isAlpha() or isDigit() accepts them, then
   emits a token typed by identifierType().  */
static Token
identifier ()
{
  for (;;)
    {
      char c = peek ();
      if (!isAlpha (c) && !isDigit (c))
        break;
      advance ();
    }
  return makeToken (identifierType ());
}
/* Scan the remainder of a numeric literal.

   Consumes a run of isDigit() characters, then, if a '.' is followed
   by another isDigit() character, the '.' and the run after it.
   Emits TOKEN_FLOAT when a fractional part was consumed, TOKEN_INT
   otherwise.  */
static Token
number ()
{
  while (isDigit (peek ()))
    advance ();

  // Look for a fractional part.
  bool has_fraction = peek () == '.' && isDigit (peekNext ());
  if (has_fraction)
    {
      advance (); // Consume the ".".
      while (isDigit (peek ()))
        advance ();
    }

  // or measure if ends in postscript
  if (has_fraction)
    return makeToken (TOKEN_FLOAT);
  return makeToken (TOKEN_INT);
}
/* Scan the remainder of a string literal; the opening quote has
   already been consumed.

   Newlines inside the literal bump tokenizer.line.  Emits
   TOKEN_STRING whose lexeme includes both quotes, or an error token
   if the input ends before the closing quote.  */
static Token
string ()
{
  for (;;)
    {
      if (isAtEnd ())
        return errorToken ("Unterminated string.");

      char c = peek ();
      if (c == '"')
        break;
      if (c == '\n')
        tokenizer.line++;
      advance ();
    }

  // The closing quote.
  advance ();
  return makeToken (TOKEN_STRING);
}
/* Scan and return the next token from the source.

   Skips whitespace and comments, marks the start of the lexeme, then
   dispatches on the first character: identifiers/keywords, numeric
   literals, single-character punctuation and string literals.
   Returns TOKEN_EOF at end of input and an error token for any
   character that starts no token.  */
Token
nextToken ()
{
  skipWhitespace ();
  tokenizer.start = tokenizer.current;

  if (isAtEnd ())
    return makeToken (TOKEN_EOF);

  char c = advance ();

  if (isAlpha (c))
    return identifier ();
  if (isDigit (c))
    return number ();

  if (c == '(')
    return makeToken (TOKEN_LEFT_PAREN);
  if (c == ')')
    return makeToken (TOKEN_RIGHT_PAREN);
  if (c == '{')
    return makeToken (TOKEN_LEFT_BRACE);
  if (c == '}')
    return makeToken (TOKEN_RIGHT_BRACE);
  /* NOTE(review): isDigit() accepts '-', so a '-' is always routed to
     number() above and this branch cannot be reached -- confirm what a
     lone '-' is meant to produce.  */
  if (c == '-')
    return makeToken (TOKEN_NEGATIVE);
  if (c == '~')
    return makeToken (TOKEN_TILDE);
  if (c == '/')
    return makeToken (TOKEN_SLASH);
  if (c == '"')
    return string ();

  return errorToken ("Unexpected character.");
}
/* Debug helper: print the token type's enumerator name and its line
   number to stdout as "<NAME> line_no=<line>\n".  Token types not
   listed below print nothing.  */
void
debug_printToken (Token t)
{
  const char *name = NULL;

/* Expands to a case label that records the enumerator's own spelling. */
#define PRINT_CASE(tok)                                                       \
  case tok:                                                                   \
    name = #tok;                                                              \
    break;

  switch (t.type)
    {
      PRINT_CASE (TOKEN_LEFT_PAREN)
      PRINT_CASE (TOKEN_RIGHT_PAREN)
      PRINT_CASE (TOKEN_LEFT_BRACE)
      PRINT_CASE (TOKEN_RIGHT_BRACE)
      PRINT_CASE (TOKEN_TILDE)
      PRINT_CASE (TOKEN_SLASH)
      PRINT_CASE (TOKEN_MINUS)
      PRINT_CASE (TOKEN_IDENTIFIER)
      PRINT_CASE (TOKEN_STRING)
      PRINT_CASE (TOKEN_FLOAT)
      PRINT_CASE (TOKEN_LIST)
      PRINT_CASE (TOKEN_ERROR)
      PRINT_CASE (TOKEN_FALSE)
      PRINT_CASE (TOKEN_TRUE)
      PRINT_CASE (TOKEN_PI)
      PRINT_CASE (TOKEN_E)
      PRINT_CASE (TOKEN_EOF)
      PRINT_CASE (TOKEN_POP)
      PRINT_CASE (TOKEN_DUP)
      PRINT_CASE (TOKEN_EXCH)
      PRINT_CASE (TOKEN_CLEAR)
      PRINT_CASE (TOKEN_REMEMBER)
      PRINT_CASE (TOKEN_FORGET)
      PRINT_CASE (TOKEN_DUMP)
      PRINT_CASE (TOKEN_NAME)
      PRINT_CASE (TOKEN_SET)
      PRINT_CASE (TOKEN_IFYES)
      PRINT_CASE (TOKEN_IFNO)
      PRINT_CASE (TOKEN_CHOOSE)
      PRINT_CASE (TOKEN_EVAL)
      PRINT_CASE (TOKEN_ESCAPE)
      PRINT_CASE (TOKEN_REPEAT)
      PRINT_CASE (TOKEN_SPLIT)
      PRINT_CASE (TOKEN_CONS)
      PRINT_CASE (TOKEN_SHATTER)
      PRINT_CASE (TOKEN_EMPTY)
      PRINT_CASE (TOKEN_COMPOSE)
      PRINT_CASE (TOKEN_STREQ)
      PRINT_CASE (TOKEN_STRCUT)
      PRINT_CASE (TOKEN_STRMEASURE)
      PRINT_CASE (TOKEN_STRTIE)
      PRINT_CASE (TOKEN_EXPLODE)
      PRINT_CASE (TOKEN_ADD)
      PRINT_CASE (TOKEN_SUB)
      PRINT_CASE (TOKEN_MUL)
      PRINT_CASE (TOKEN_DIV)
      PRINT_CASE (TOKEN_IDIV)
      PRINT_CASE (TOKEN_MOD)
      PRINT_CASE (TOKEN_POW)
      PRINT_CASE (TOKEN_SQRT)
      PRINT_CASE (TOKEN_ADD1)
      PRINT_CASE (TOKEN_SUB1)
      PRINT_CASE (TOKEN_SIN)
      PRINT_CASE (TOKEN_COS)
      PRINT_CASE (TOKEN_TAN)
      PRINT_CASE (TOKEN_ATAN)
      PRINT_CASE (TOKEN_LN)
      PRINT_CASE (TOKEN_LOG)
      PRINT_CASE (TOKEN_LOG3)
      PRINT_CASE (TOKEN_CLIP)
      PRINT_CASE (TOKEN_SMOOTH)
      PRINT_CASE (TOKEN_HOWMUCH)
      PRINT_CASE (TOKEN_SETRAND)
      PRINT_CASE (TOKEN_RAND)
      PRINT_CASE (TOKEN_INT)
      PRINT_CASE (TOKEN_NUMBERIZE)
      PRINT_CASE (TOKEN_ISOLATE)
      PRINT_CASE (TOKEN_MIX)
      PRINT_CASE (TOKEN_CONTRADICT)
      PRINT_CASE (TOKEN_COMPL)
      PRINT_CASE (TOKEN_SHIFTRIGHT)
      PRINT_CASE (TOKEN_SHIFTLEFT)
      PRINT_CASE (TOKEN_GT)
      PRINT_CASE (TOKEN_LT)
      PRINT_CASE (TOKEN_EQ)
      PRINT_CASE (TOKEN_GE)
      PRINT_CASE (TOKEN_LE)
      PRINT_CASE (TOKEN_NE)
      PRINT_CASE (TOKEN_NULL)
      PRINT_CASE (TOKEN_NEGATIVE)
      PRINT_CASE (TOKEN_ISNULL)
      PRINT_CASE (TOKEN_ISINT)
      PRINT_CASE (TOKEN_ISNUMBER)
      PRINT_CASE (TOKEN_AND)
      PRINT_CASE (TOKEN_OR)
      PRINT_CASE (TOKEN_XOR)
      PRINT_CASE (TOKEN_DISP)
      PRINT_CASE (TOKEN_LISTEN)
      PRINT_CASE (TOKEN_COMPLAIN)
      PRINT_CASE (TOKEN_TIME)
      PRINT_CASE (TOKEN_GARBAGE_COLLECT)
    }

#undef PRINT_CASE

  if (name != NULL)
    printf ("%s line_no=%d\n", name, t.line);
}