undar-lang/tools/assembler/lexer.c

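/*
 * lexer.c: hand-written scanner used by the undar-lang assembler.
 *
 * init_lexer() points the scanner at a NUL-terminated source buffer;
 * next_token() then hands back one Token at a time until TOKEN_EOF.
 * Tokens do not copy text: each one borrows a pointer into the source
 * buffer plus a length (error tokens point at a static message instead).
 */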
#include <string.h>
#include "../../vm/libc.h"
#include "lexer.h"
typedef struct {
const char *start;
const char *current;
i32 line;
} Lexer;
Lexer lexer;
void init_lexer(const char *source) {
lexer.start = source;
lexer.current = source;
lexer.line = 1;
}
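/*
 * Cursor helpers: advance() consumes and returns the current character,
 * peek()/peek_next() look ahead without consuming, and match() consumes
 * the current character only when it equals `expected`.
 */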
static bool is_alpha(char c) {
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
}
static bool is_digit(char c) { return c >= '0' && c <= '9'; }
static bool is_at_end() { return *lexer.current == '\0'; }
static char advance() {
lexer.current++;
return lexer.current[-1];
}
static char peek() { return *lexer.current; }
static char peek_next() {
if (is_at_end())
return '\0';
return lexer.current[1];
}
static bool match(char expected) {
if (is_at_end())
return false;
if (*lexer.current != expected)
return false;
lexer.current++;
return true;
}
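/*
 * make_token() builds a token that borrows its text from the source buffer
 * (pointer plus length, no copy); error_token() instead points the token at
 * a static message string so the caller can report it directly.
 */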
static Token make_token(TokenType type) {
Token token;
token.type = type;
token.start = lexer.start;
token.length = (i32)(lexer.current - lexer.start);
token.line = lexer.line;
return token;
}
static Token error_token(const char *message) {
Token token;
token.type = TOKEN_ERROR;
token.start = message;
token.length = (i32)strlen(message);
token.line = lexer.line;
return token;
}
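/*
 * Skip spaces, tabs, carriage returns and newlines (bumping the line
 * counter), as well as single-line and unnested multi-line comments.
 * A '/' that does not start a comment is left for next_token() to handle.
 */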
static void skip_whitespace() {
for (;;) {
char c = peek();
switch (c) {
case ' ':
case '\r':
case '\t':
advance();
break;
case '\n':
lexer.line++;
advance();
break;
case '/':
if (peek_next() == '/') {
// Single-line comment: skip until newline or end of file
advance();
while (peek() != '\n' && !is_at_end())
advance();
} else if (peek_next() == '*') {
// Multi-line comment: skip until '*/' or end of file
advance();
advance();
while (!is_at_end()) {
if (peek() == '\n')
lexer.line++;
if (peek() == '*' && peek_next() == '/') {
advance();
advance();
break; // Exit loop, comment ended
}
advance();
}
} else {
return; // Not a comment, let tokenization handle it
}
break;
default:
return;
}
}
}
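/*
 * Keyword recognition is a small hard-coded trie: identifier_type() switches
 * on the leading characters of the lexeme and check_keyword() then verifies
 * that the remaining `length` characters match `rest` and that the lexeme
 * ends exactly there. Anything that falls through is a TOKEN_IDENTIFIER.
 */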
static TokenType check_keyword(i32 start, i32 length, const char *rest,
TokenType type) {
if (lexer.current - lexer.start == start + length &&
memcmp(lexer.start + start, rest, length) == 0) {
return type;
}
return TOKEN_IDENTIFIER;
}
static TokenType identifier_type() {
switch (lexer.start[0]) {
case 'a':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'n':
return check_keyword(2, 1, "d", TOKEN_OPERATOR_AND);
case 's':
return check_keyword(2, 0, "", TOKEN_KEYWORD_AS);
}
}
break;
case 'c':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'l':
return check_keyword(2, 3, "ose", TOKEN_KEYWORD_CLOSE);
case 'o':
return check_keyword(2, 3, "nst", TOKEN_KEYWORD_CONST);
}
}
break;
case 'e':
return check_keyword(1, 3, "lse", TOKEN_KEYWORD_ELSE);
case 'f':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'a':
return check_keyword(2, 3, "lse", TOKEN_KEYWORD_FALSE);
case 'o':
return check_keyword(2, 1, "r", TOKEN_KEYWORD_FOR);
case '3':
return check_keyword(2, 1, "2", TOKEN_TYPE_REAL); /* "f32" */
}
return check_keyword(1, 7, "unction", TOKEN_KEYWORD_FN);
}
break;
case 'i':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'f':
return check_keyword(2, 0, "", TOKEN_KEYWORD_IF);
case 's':
return check_keyword(2, 0, "", TOKEN_KEYWORD_IS);
case '8':
return check_keyword(2, 0, "", TOKEN_TYPE_I8);
case '1':
return check_keyword(2, 1, "6", TOKEN_TYPE_I16);
case '3':
return check_keyword(2, 1, "2", TOKEN_TYPE_INT);
case 'n':
if (lexer.current - lexer.start > 2) {
switch (lexer.start[2]) {
case 'i':
return check_keyword(3, 1, "t", TOKEN_KEYWORD_INIT); /* "init" */
case 't':
return check_keyword(3, 0, "", TOKEN_TYPE_INT);
}
}
break;
}
}
break;
case 'n':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'a':
return check_keyword(2, 1, "t", TOKEN_TYPE_NAT);
case 'i':
return check_keyword(2, 1, "l", TOKEN_KEYWORD_NIL);
}
}
break;
case 'o':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'p':
return check_keyword(2, 2, "en", TOKEN_KEYWORD_OPEN);
case 'r':
return check_keyword(2, 0, "", TOKEN_OPERATOR_OR);
}
}
break;
case 'p':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'l':
return check_keyword(2, 2, "ex", TOKEN_KEYWORD_PLEX);
}
}
break;
case 'r':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'e':
if (lexer.current - lexer.start > 2) {
switch (lexer.start[2]) {
case 'f':
return check_keyword(3, 4, "resh", TOKEN_KEYWORD_REFRESH);
case 't':
return check_keyword(3, 3, "urn", TOKEN_KEYWORD_RETURN);
case 'a':
if (lexer.current - lexer.start > 3) {
switch(lexer.start[3]) {
case 'd':
return check_keyword(4, 0, "", TOKEN_KEYWORD_READ);
case 'l':
return check_keyword(4, 0, "", TOKEN_TYPE_REAL);
}
}
}
}
break;
}
}
break;
case 's':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 't':
return check_keyword(2, 1, "r", TOKEN_TYPE_STR);
}
}
break;
case 't':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'h':
return check_keyword(2, 2, "is", TOKEN_KEYWORD_THIS);
case 'r':
return check_keyword(2, 2, "ue", TOKEN_KEYWORD_TRUE);
}
}
break;
case 'u':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 's':
return check_keyword(2, 1, "e", TOKEN_KEYWORD_USE);
case '8':
return check_keyword(2, 0, "", TOKEN_TYPE_U8);
case '1':
return check_keyword(2, 1, "6", TOKEN_TYPE_U16);
case '3':
return check_keyword(2, 1, "2", TOKEN_TYPE_NAT);
}
}
break;
case 'w':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'h':
return check_keyword(2, 3, "ile", TOKEN_KEYWORD_WHILE);
case 'r':
return check_keyword(2, 3, "ite", TOKEN_KEYWORD_WRITE);
}
}
break;
case 'b':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'y':
return check_keyword(2, 2, "te", TOKEN_TYPE_U8);
case 'o':
return check_keyword(2, 2, "ol", TOKEN_TYPE_U8);
}
}
break;
case 'g':
return check_keyword(1, 5, "lobal", TOKEN_KEYWORD_GLOBAL);
case 'l':
return check_keyword(1, 3, "oop", TOKEN_KEYWORD_LOOP);
case 'd':
return check_keyword(1, 1, "o", TOKEN_KEYWORD_DO);
case 'v':
return check_keyword(1, 3, "oid", TOKEN_TYPE_VOID);
}
return TOKEN_IDENTIFIER;
}
static Token identifier() {
while (is_alpha(peek()) || is_digit(peek()))
advance();
return make_token(identifier_type());
}
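/*
 * Scan an integer or real literal. A leading '-', if any, was already
 * consumed by next_token(). A '.' only begins a fractional part when a
 * digit follows, so "1." lexes as the integer 1 followed by a '.' token.
 */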
static Token number() {
while (is_digit(peek()))
advance();
/* Look for a fractional part. */
if (peek() == '.' && is_digit(peek_next())) {
/* Consume the ".". */
advance();
while (is_digit(peek()))
advance();
return make_token(TOKEN_LITERAL_REAL);
}
return make_token(TOKEN_LITERAL_INT);
}
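/*
 * Scan a string literal. The token spans both quotes and may cross newlines
 * (the line counter is still updated). Escape sequences are not interpreted
 * here, and a missing closing quote produces an error token.
 */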
static Token string() {
while (peek() != '"' && !is_at_end()) {
if (peek() == '\n')
lexer.line++;
advance();
}
if (is_at_end())
return error_token("Unterminated string.");
/* The closing quote. */
advance();
return make_token(TOKEN_LITERAL_STR);
}
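/*
 * Public entry point: returns the next token from the buffer handed to
 * init_lexer(). A minimal driver loop (a sketch only; it assumes the Token
 * and TokenType definitions from lexer.h) might look like:
 *
 *     init_lexer(source);
 *     Token tok;
 *     do {
 *         tok = next_token();
 *         // feed tok to the parser / assembler here
 *     } while (tok.type != TOKEN_EOF && tok.type != TOKEN_ERROR);
 */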
Token next_token() {
skip_whitespace();
lexer.start = lexer.current;
if (is_at_end())
return make_token(TOKEN_EOF);
char c = advance();
if (is_alpha(c))
return identifier();
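/* A '-' immediately followed by a digit is folded into the numeric literal,
   so "-5" is a single token; note this also means "a-1" lexes as the
   identifier "a" followed by the literal "-1". */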
char next = peek();
if ((c == '-' && is_digit(next)) || is_digit(c))
return number();
switch (c) {
case '(':
return make_token(TOKEN_LPAREN);
case ')':
return make_token(TOKEN_RPAREN);
case '{':
return make_token(TOKEN_LBRACE);
case '}':
return make_token(TOKEN_RBRACE);
case '[':
return make_token(TOKEN_LBRACKET);
case ']':
return make_token(TOKEN_RBRACKET);
case ';':
return make_token(TOKEN_SEMICOLON);
case ',':
return make_token(TOKEN_COMMA);
case '.':
return make_token(TOKEN_DOT);
case '-':
return make_token(match('>') ? TOKEN_ARROW_RIGHT : TOKEN_MINUS);
case '+':
return make_token(TOKEN_PLUS);
case '/':
return make_token(TOKEN_SLASH);
case '&':
return make_token(match('&') ? TOKEN_AND_AND : TOKEN_AND);
case '#':
return make_token(TOKEN_MESH);
case '$':
return make_token(TOKEN_BIG_MONEY);
case '*':
return make_token(TOKEN_STAR);
case '!':
return make_token(match('=') ? TOKEN_BANG_EQ : TOKEN_BANG);
case '=':
return make_token(match('=') ? TOKEN_EQ_EQ : TOKEN_EQ);
case '<':
return make_token(match('=') ? TOKEN_LTE : TOKEN_LT);
case '>':
return make_token(match('=') ? TOKEN_GTE : TOKEN_GT);
case '"':
return string();
}
return error_token("Unexpected character.");
}