undar-lang/src/parser.c

#include "parser.h"

/* Number of buckets in the keyword hash table. */
#define HASHSIZE 150
/* Keyword table: each bucket heads a singly linked chain of TokenMap entries
 * (linked through ->next). File-scope static storage, so every bucket starts
 * out as NULL. */
static TokenMap *hashtab_Token[HASHSIZE];

/* Maps a NUL-terminated string to a bucket index in [0, HASHSIZE).
 * Uses a multiply-by-31 rolling hash over every character of s. */
unsigned int hash_Token(char *s) {
  unsigned int acc = 0;
  const char *cursor = s;

  while (*cursor != '\0') {
    acc = 31 * acc + *cursor;
    cursor++;
  }
  return acc % HASHSIZE;
}

/* Finds the table entry whose keyword equals s.
 * Returns the entry, or NULL when s is not in the table. */
TokenMap *lookup_Token(char *s) {
  TokenMap *entry = hashtab_Token[hash_Token(s)];

  /* Walk the bucket's chain until a match is found or the chain ends. */
  while (entry != NULL && strcmp(s, entry->keyword) != 0)
    entry = entry->next;
  return entry;
}

/* Returns the token type registered for keyword s, or TOKEN_IDENTIFIER when
 * s is not a registered keyword. */
TokenType get_Token(char *s) {
  TokenMap *entry = hashtab_Token[hash_Token(s)];

  while (entry != NULL) {
    if (strcmp(s, entry->keyword) == 0)
      return entry->token;
    entry = entry->next;
  }
  return TOKEN_IDENTIFIER;
}

/* Returns a heap-allocated copy of the NUL-terminated string s (terminator
 * included), or NULL if the allocation fails. The caller owns the result and
 * releases it with free(). */
char *strdup(const char *s) {
  size_t bytes = strlen(s) + 1;
  char *dup = malloc(bytes);

  if (dup == NULL)
    return NULL;
  return memcpy(dup, s, bytes);
}

/* Registers keyword with the given token type, or updates the token type if
 * the keyword is already present.
 *
 * The keyword text is copied, so the caller keeps ownership of its argument.
 * Returns the table entry, or NULL if memory could not be allocated (in which
 * case the table is left unchanged). */
TokenMap *put_Token(char *keyword, TokenType token) {
  TokenMap *np = lookup_Token(keyword);

  if (np == NULL) {
    np = malloc(sizeof *np);
    if (np == NULL)
      return NULL;

    np->keyword = strdup(keyword);
    if (np->keyword == NULL) {
      /* Do not leak the node when only the keyword copy failed. */
      free(np);
      return NULL;
    }

    /* Push the new entry onto the front of its bucket's chain. */
    unsigned int hashval = hash_Token(keyword);
    np->next = hashtab_Token[hashval];
    hashtab_Token[hashval] = np;
  }
  np->token = token;
  return np;
}

void new_TokenMap() {
  put_Token("nil", TOKEN_NULL);
  put_Token("and", TOKEN_AND);
  put_Token("or", TOKEN_OR);
  put_Token("xor", TOKEN_XOR);
  put_Token("mod", TOKEN_MOD);
  put_Token("eq", TOKEN_EQ);
  put_Token("ge", TOKEN_GE);
  put_Token("lt", TOKEN_LT);
  put_Token("le", TOKEN_LE);
  put_Token("ne", TOKEN_NE);
  put_Token("gt", TOKEN_GT);
  put_Token("ge", TOKEN_GE);
  put_Token("srl", TOKEN_SHIFTRIGHT);
  put_Token("sll", TOKEN_SHIFTLEFT);
  put_Token("int", TOKEN_INT);
  put_Token("print", TOKEN_PRINT);
}

/* Scanner state shared by the tokenizer functions in this file. */
typedef struct Tokenizer {
  char *start;   /* first character of the token currently being scanned */
  char *current; /* next character to be consumed */
  int32_t line;  /* current line number; set to 1 by new_Tokenizer() */
} Tokenizer;

/* The single tokenizer instance used by this file's scanning functions. */
Tokenizer tokenizer;

/* Resets the global tokenizer to scan src from its first character, with the
 * line counter set to 1. src is not copied. */
void new_Tokenizer(char *src) {
  tokenizer.line = 1;
  tokenizer.start = tokenizer.current = src;
}

/* Returns true for characters treated as letters by the scanner: ASCII
 * letters plus '_', '\'' and '?'. */
static bool isAlpha(char c) {
  if (c >= 'a' && c <= 'z')
    return true;
  if (c >= 'A' && c <= 'Z')
    return true;

  switch (c) {
  case '_':
  case '\'':
  case '?':
    return true;
  default:
    return false;
  }
}

/* Returns true for '0'..'9' and also for '-'.
 * Note that '-' counts as a digit here, so identifier() accepts it inside
 * names. */
static bool isDigit(char c) {
  if (c == '-')
    return true;
  return '0' <= c && c <= '9';
}

/* Returns true when the tokenizer's read position is on the terminating NUL
 * of the source text. */
static bool isAtEnd() {
  return tokenizer.current[0] == '\0';
}

/* Builds a token of the given type covering the source text from
 * tokenizer.start up to (not including) tokenizer.current, stamped with the
 * current line number. The token points into the source; nothing is copied. */
static Token makeToken(TokenType type) {
  Token tok;

  tok.line = tokenizer.line;
  tok.start = tokenizer.start;
  tok.length = (int32_t)(tokenizer.current - tokenizer.start);
  tok.type = type;
  return tok;
}

/* Builds a TOKEN_ERROR token whose text is msg itself (not source text),
 * stamped with the current line number. msg is referenced, not copied. */
static Token errorToken(char *msg) {
  Token tok;

  tok.line = tokenizer.line;
  tok.length = (int32_t)strlen(msg);
  tok.start = msg;
  tok.type = TOKEN_ERROR;
  return tok;
}

/* Consumes one character: returns the character at the read position and
 * moves the read position forward by one. */
static char advance() {
  return *tokenizer.current++;
}

/* Returns the character at the read position without consuming it. */
static char peek() {
  return tokenizer.current[0];
}

/* Returns the character one past the read position without consuming
 * anything, or '\0' when the read position is already at the end of input
 * (so the source is never read beyond its terminator). */
static char peekNext() {
  return isAtEnd() ? '\0' : tokenizer.current[1];
}

/* Advances past spaces, tabs, carriage returns, newlines and '!' comments.
 * A '!' starts a comment that runs up to (not including) the next newline or
 * the end of input. Each newline consumed bumps the line counter. */
static void skipWhitespace() {
  for (;;) {
    char ch = peek();

    if (ch == ' ' || ch == '\r' || ch == '\t') {
      advance();
    } else if (ch == '\n') {
      tokenizer.line++;
      advance();
    } else if (ch == '!') {
      /* Leave the newline itself for the next iteration so it is counted. */
      while (!isAtEnd() && peek() != '\n')
        advance();
    } else {
      return;
    }
  }
}

/* Returns a freshly allocated, NUL-terminated copy of the text between
 * tokenizer.start and tokenizer.current.
 *
 * The caller owns the result and must free() it. Returns NULL if the
 * allocation fails. */
static char *currentTokenToS() {
  size_t size = (size_t)(tokenizer.current - tokenizer.start);
  /* Room for the token text plus its terminator; sizing by the token length
   * (not by the size of the length variable) keeps long tokens in bounds. */
  char *str = malloc(size + 1);

  if (str == NULL)
    return NULL;
  memcpy(str, tokenizer.start, size);
  str[size] = '\0';
  return str;
}

/* Classifies the text of the token being scanned: returns the keyword's token
 * type if the text is a registered keyword, otherwise whatever get_Token()
 * reports for unknown words. */
static TokenType identifierType() {
  char *lexeme = currentTokenToS();
  TokenType kind = get_Token(lexeme);

  /* The temporary copy is only needed for the table lookup. */
  free(lexeme);
  return kind;
}

/* Scans the rest of an identifier or keyword (its first character has
 * already been consumed) and returns the resulting token. Characters accepted
 * by isAlpha() or isDigit() continue the identifier. */
static Token identifier() {
  for (;;) {
    char ch = peek();

    if (!isAlpha(ch) && !isDigit(ch))
      break;
    advance();
  }
  return makeToken(identifierType());
}

/* Returns true only for the decimal digits '0'..'9'. */
static bool isDecimalDigit(char c) { return c >= '0' && c <= '9'; }

/* Scans the rest of a numeric literal; the first character (a digit or a
 * leading '-') has already been consumed by the caller.
 *
 * Returns TOKEN_FLOAT when a '.' followed by at least one digit is present,
 * otherwise TOKEN_INT. Only real decimal digits continue the literal, so a
 * '-' after the first character ends the number instead of being absorbed
 * into it (e.g. "1-2" no longer scans as one token). */
static Token number() {
  bool is_float = false;

  while (isDecimalDigit(peek()))
    advance();

  /* Look for a fractional part. */
  if (peek() == '.' && isDecimalDigit(peekNext())) {
    is_float = true;
    /* Consume the ".". */
    advance();

    while (isDecimalDigit(peek()))
      advance();
  }

  return makeToken(is_float ? TOKEN_FLOAT
                            : TOKEN_INT); /* or measure if ends in postscript */
}

/* Scans a string literal whose opening quote has already been consumed.
 * Returns a TOKEN_STRING spanning both quotes, or an error token if the input
 * ends before the closing quote. Newlines inside the string are allowed and
 * counted. */
static Token string() {
  for (;;) {
    if (isAtEnd())
      return errorToken("Unterminated string.");

    char ch = peek();
    if (ch == '"')
      break;
    if (ch == '\n')
      tokenizer.line++;
    advance();
  }

  /* Consume the closing quote so it is part of the token. */
  advance();
  return makeToken(TOKEN_STRING);
}

/* Scans and returns the next token from the global tokenizer.
 *
 * Leading whitespace and '!' comments are skipped first. Returns TOKEN_EOF at
 * the end of input and a TOKEN_ERROR token for characters that start no known
 * token. */
Token nextToken() {
  skipWhitespace();
  tokenizer.start = tokenizer.current;
  if (isAtEnd())
    return makeToken(TOKEN_EOF);

  char c = advance();
  if (isAlpha(c))
    return identifier();

  if (c == '-') {
    /* isDigit() accepts '-', so without this check every '-' would be handed
     * to number() and come back as TOKEN_INT. Treat it as the start of a
     * number only when a digit (or ".digit") actually follows; otherwise it
     * is the subtraction operator. */
    char next = peek();
    bool digit_follows = next >= '0' && next <= '9';
    bool fraction_follows = false;
    if (next == '.') {
      char after = peekNext();
      fraction_follows = after >= '0' && after <= '9';
    }
    if (!digit_follows && !fraction_follows)
      return makeToken(TOKEN_SUB);
    return number();
  }

  if (isDigit(c))
    return number();

  switch (c) {
  case '(':
    return makeToken(TOKEN_LEFT_PAREN);
  case ')':
    return makeToken(TOKEN_RIGHT_PAREN);
  case '{':
    return makeToken(TOKEN_LEFT_BRACE);
  case '}':
    return makeToken(TOKEN_RIGHT_BRACE);
  case '+':
    return makeToken(TOKEN_ADD);
  case '/':
    return makeToken(TOKEN_DIV);
  case '*':
    return makeToken(TOKEN_MUL);
  case ';':
    return makeToken(TOKEN_SEMICOLON);
  case '"':
    return string();
  default:
    break;
  }

  return errorToken("Unexpected character.");
}

void debug_printToken(Token t) {
  char *str = currentTokenToS();

  switch (t.type) {
  case TOKEN_LEFT_PAREN:
    printf("TOKEN_LEFT_PAREN %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_RIGHT_PAREN:
    printf("TOKEN_RIGHT_PAREN %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_LEFT_BRACE:
    printf("TOKEN_LEFT_BRACE %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_RIGHT_BRACE:
    printf("TOKEN_RIGHT_BRACE %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_IDENTIFIER:
    printf("TOKEN_IDENTIFIER %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_STRING:
    printf("TOKEN_STRING %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_FLOAT:
    printf("TOKEN_FLOAT %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_ERROR:
    printf("TOKEN_ERROR %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_FALSE:
    printf("TOKEN_FALSE %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_TRUE:
    printf("TOKEN_TRUE %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_EOF:
    printf("TOKEN_EOF %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_ADD:
    printf("TOKEN_ADD %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_SUB:
    printf("TOKEN_SUB %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_MUL:
    printf("TOKEN_MUL %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_DIV:
    printf("TOKEN_DIV %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_MOD:
    printf("TOKEN_MOD %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_INT:
    printf("TOKEN_INT %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_UINT:
    printf("TOKEN_UINT %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_SHIFTRIGHT:
    printf("TOKEN_SHIFTRIGHT %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_SHIFTLEFT:
    printf("TOKEN_SHIFTLEFT %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_GT:
    printf("TOKEN_GT %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_LT:
    printf("TOKEN_LT %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_EQ:
    printf("TOKEN_EQ %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_GE:
    printf("TOKEN_GE %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_LE:
    printf("TOKEN_LE %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_NE:
    printf("TOKEN_NE %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_NULL:
    printf("TOKEN_NULL %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_AND:
    printf("TOKEN_AND %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_OR:
    printf("TOKEN_OR %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_XOR:
    printf("TOKEN_XOR %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_SEMICOLON:
    printf("TOKEN_SEMICOLON %s line_no=%d\n", str, t.line);
    break;
  case TOKEN_PRINT:
    printf("TOKEN_PRINT %s line_no=%d\n", str, t.line);
    break;
  }
  free(str);
}