undar-lang/tools/assembler/lexer.c

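/*
 * lexer.c: hand-written scanner used by the undar-lang assembler.
 *
 * init_lexer() points the scanner at a NUL-terminated source buffer;
 * next_token() then hands back one Token at a time until TOKEN_EOF.
 * Tokens do not copy text: each one borrows a pointer into the source
 * buffer plus a length (error tokens point at a static message instead).
 */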
#include <string.h>
#include "../../vm/libc.h"
#include "lexer.h"
typedef struct {
const char *start;
const char *current;
i32 line;
} Lexer;
Lexer lexer;
void init_lexer(const char *source) {
lexer.start = source;
lexer.current = source;
lexer.line = 1;
}
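/*
 * Cursor helpers: advance() consumes and returns the current character,
 * peek()/peek_next() look ahead without consuming, and match() consumes
 * the current character only when it equals `expected`.
 */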
static bool is_alpha(char c) {
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
}
static bool is_digit(char c) { return c >= '0' && c <= '9'; }
static bool is_at_end() { return *lexer.current == '\0'; }
static char advance() {
lexer.current++;
return lexer.current[-1];
}
static char peek() { return *lexer.current; }
static char peek_next() {
if (is_at_end())
return '\0';
return lexer.current[1];
}
static bool match(char expected) {
if (is_at_end())
return false;
if (*lexer.current != expected)
return false;
lexer.current++;
return true;
}
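/*
 * make_token() builds a token that borrows its text from the source buffer
 * (pointer plus length, no copy); error_token() instead points the token at
 * a static message string so the caller can report it directly.
 */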
static Token make_token(TokenType type) {
Token token;
token.type = type;
token.start = lexer.start;
token.length = (i32)(lexer.current - lexer.start);
token.line = lexer.line;
return token;
}
static Token error_token(const char *message) {
Token token;
token.type = TOKEN_ERROR;
token.start = message;
token.length = (i32)strlen(message);
token.line = lexer.line;
return token;
}
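/*
 * Skip spaces, tabs, carriage returns and newlines (bumping the line
 * counter), as well as single-line and unnested multi-line comments.
 * A '/' that does not start a comment is left for next_token() to handle.
 */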
static void skip_whitespace() {
for (;;) {
char c = peek();
switch (c) {
case ' ':
case '\r':
case '\t':
advance();
break;
case '\n':
lexer.line++;
advance();
break;
case '/':
if (peek_next() == '/') {
// Single-line comment: skip until newline or end of file
advance();
while (peek() != '\n' && !is_at_end())
advance();
} else if (peek_next() == '*') {
// Multi-line comment: skip until '*/' or end of file
advance();
advance();
while (!is_at_end()) {
if (peek() == '\n')
lexer.line++;
if (peek() == '*' && peek_next() == '/') {
advance();
advance();
break; // Exit loop, comment ended
}
advance();
}
} else {
return; // Not a comment, let tokenization handle it
}
break;
default:
return;
}
}
}
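/*
 * Keyword recognition is a small hard-coded trie: identifier_type() switches
 * on the leading characters of the lexeme and check_keyword() then verifies
 * that the remaining `length` characters match `rest` and that the lexeme
 * ends exactly there. Anything that falls through is a TOKEN_IDENTIFIER.
 */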
static TokenType check_keyword(i32 start, i32 length, const char *rest,
TokenType type) {
if (lexer.current - lexer.start == start + length &&
memcmp(lexer.start + start, rest, length) == 0) {
return type;
}
return TOKEN_IDENTIFIER;
}
static TokenType identifier_type() {
switch (lexer.start[0]) {
case 'a':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'n':
return check_keyword(2, 1, "d", TOKEN_OPERATOR_AND);
case 's':
return check_keyword(2, 0, "", TOKEN_KEYWORD_AS);
}
}
break;
case 'c':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'l':
return check_keyword(2, 3, "ose", TOKEN_KEYWORD_CLOSE);
case 'o':
return check_keyword(2, 3, "nst", TOKEN_KEYWORD_CONST);
}
}
break;
case 'e':
return check_keyword(1, 3, "lse", TOKEN_KEYWORD_ELSE);
case 'f':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'a':
return check_keyword(2, 3, "lse", TOKEN_KEYWORD_FALSE);
case 'o':
return check_keyword(2, 1, "r", TOKEN_KEYWORD_FOR);
case '3':
return check_keyword(2, 1, "2", TOKEN_TYPE_REAL); /* "f32" */
}
return check_keyword(1, 7, "unction", TOKEN_KEYWORD_FN);
}
break;
case 'i':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'f':
return check_keyword(2, 0, "", TOKEN_KEYWORD_IF);
case 's':
return check_keyword(2, 0, "", TOKEN_KEYWORD_IS);
case '8':
return check_keyword(2, 0, "", TOKEN_TYPE_I8);
case '1':
return check_keyword(2, 1, "6", TOKEN_TYPE_I16);
case '3':
return check_keyword(2, 1, "2", TOKEN_TYPE_INT);
case 'n':
if (lexer.current - lexer.start > 2) {
switch (lexer.start[2]) {
case 'i':
return check_keyword(3, 1, "t", TOKEN_KEYWORD_INIT); /* "init" */
case 't':
return check_keyword(3, 0, "", TOKEN_TYPE_INT);
}
}
break;
}
}
break;
case 'n':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'a':
return check_keyword(2, 1, "t", TOKEN_TYPE_NAT);
case 'i':
return check_keyword(2, 1, "l", TOKEN_KEYWORD_NIL);
}
}
break;
case 'o':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'p':
return check_keyword(2, 2, "en", TOKEN_KEYWORD_OPEN);
case 'r':
return check_keyword(2, 0, "", TOKEN_OPERATOR_OR);
}
}
break;
case 'p':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'l':
return check_keyword(2, 2, "ex", TOKEN_KEYWORD_PLEX);
}
}
break;
case 'r':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'e':
if (lexer.current - lexer.start > 2) {
switch (lexer.start[2]) {
case 'f':
return check_keyword(3, 4, "resh", TOKEN_KEYWORD_REFRESH);
case 't':
return check_keyword(3, 3, "urn", TOKEN_KEYWORD_RETURN);
case 'a':
if (lexer.current - lexer.start > 3) {
switch(lexer.start[3]) {
case 'd':
return check_keyword(4, 0, "", TOKEN_KEYWORD_READ);
case 'l':
return check_keyword(4, 0, "", TOKEN_TYPE_REAL);
}
}
}
}
break;
}
}
break;
case 's':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 't':
return check_keyword(2, 1, "r", TOKEN_TYPE_STR);
}
}
break;
case 't':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'h':
return check_keyword(2, 2, "is", TOKEN_KEYWORD_THIS);
case 'r':
return check_keyword(2, 2, "ue", TOKEN_KEYWORD_TRUE);
}
}
break;
case 'u':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 's':
return check_keyword(2, 1, "e", TOKEN_KEYWORD_USE);
case '8':
return check_keyword(2, 0, "", TOKEN_TYPE_U8);
case '1':
return check_keyword(2, 1, "6", TOKEN_TYPE_U16);
case '3':
return check_keyword(2, 1, "2", TOKEN_TYPE_NAT);
}
}
break;
case 'w':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'h':
return check_keyword(2, 3, "ile", TOKEN_KEYWORD_WHILE);
case 'r':
return check_keyword(2, 3, "ite", TOKEN_KEYWORD_WRITE);
}
}
break;
case 'b':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'y':
return check_keyword(2, 2, "te", TOKEN_TYPE_U8);
case 'o':
return check_keyword(2, 2, "ol", TOKEN_TYPE_U8);
}
}
break;
case 'g':
return check_keyword(1, 5, "lobal", TOKEN_KEYWORD_GLOBAL);
case 'l':
return check_keyword(1, 3, "oop", TOKEN_KEYWORD_LOOP);
case 'd':
return check_keyword(1, 1, "o", TOKEN_KEYWORD_DO);
case 'v':
return check_keyword(1, 3, "oid", TOKEN_TYPE_VOID);
}
return TOKEN_IDENTIFIER;
}
static Token identifier() {
while (is_alpha(peek()) || is_digit(peek()))
advance();
return make_token(identifier_type());
}
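/*
 * Scan an integer or real literal. A leading '-', if any, was already
 * consumed by next_token(). A '.' only begins a fractional part when a
 * digit follows, so "1." lexes as the integer 1 followed by a '.' token.
 */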
static Token number() {
while (is_digit(peek()))
advance();
/* Look for a fractional part. */
if (peek() == '.' && is_digit(peek_next())) {
/* Consume the ".". */
advance();
while (is_digit(peek()))
advance();
return make_token(TOKEN_LITERAL_REAL);
}
return make_token(TOKEN_LITERAL_INT);
}
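/*
 * Scan a string literal. The token spans both quotes and may cross newlines
 * (the line counter is still updated). Escape sequences are not interpreted
 * here, and a missing closing quote produces an error token.
 */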
static Token string() {
while (peek() != '"' && !is_at_end()) {
if (peek() == '\n')
lexer.line++;
advance();
}
if (is_at_end())
return error_token("Unterminated string.");
/* The closing quote. */
advance();
return make_token(TOKEN_LITERAL_STR);
}
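/*
 * Public entry point: returns the next token from the buffer handed to
 * init_lexer(). A minimal driver loop (a sketch only; it assumes the Token
 * and TokenType definitions from lexer.h) might look like:
 *
 *     init_lexer(source);
 *     Token tok;
 *     do {
 *         tok = next_token();
 *         // feed tok to the parser / assembler here
 *     } while (tok.type != TOKEN_EOF && tok.type != TOKEN_ERROR);
 */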
Token next_token() {
skip_whitespace();
lexer.start = lexer.current;
if (is_at_end())
return make_token(TOKEN_EOF);
char c = advance();
if (is_alpha(c))
return identifier();
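/* A '-' immediately followed by a digit is folded into the numeric literal,
   so "-5" is a single token; note this also means "a-1" lexes as the
   identifier "a" followed by the literal "-1". */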
char next = peek();
if ((c == '-' && is_digit(next)) || is_digit(c))
return number();
switch (c) {
case '(':
return make_token(TOKEN_LPAREN);
case ')':
return make_token(TOKEN_RPAREN);
case '{':
return make_token(TOKEN_LBRACE);
case '}':
return make_token(TOKEN_RBRACE);
case '[':
return make_token(TOKEN_LBRACKET);
case ']':
return make_token(TOKEN_RBRACKET);
case ';':
return make_token(TOKEN_SEMICOLON);
case ',':
return make_token(TOKEN_COMMA);
case '.':
return make_token(TOKEN_DOT);
case '-':
return make_token(match('>') ? TOKEN_ARROW_RIGHT : TOKEN_MINUS);
case '+':
return make_token(TOKEN_PLUS);
case '/':
return make_token(TOKEN_SLASH);
case '&':
return make_token(match('&') ? TOKEN_AND_AND : TOKEN_AND);
case '#':
return make_token(TOKEN_MESH);
case '$':
return make_token(TOKEN_BIG_MONEY);
case '*':
return make_token(TOKEN_STAR);
case '!':
return make_token(match('=') ? TOKEN_BANG_EQ : TOKEN_BANG);
case '=':
return make_token(match('=') ? TOKEN_EQ_EQ : TOKEN_EQ);
case '<':
return make_token(match('=') ? TOKEN_LTE : TOKEN_LT);
case '>':
return make_token(match('=') ? TOKEN_GTE : TOKEN_GT);
case '"':
return string();
}
return error_token("Unexpected character.");
}