#include #include "../../vm/common.h" #include "lexer.h" typedef struct { const char *start; const char *current; int line; } Lexer; Lexer lexer; void initLexer(const char *source) { lexer.start = source; lexer.current = source; lexer.line = 1; } static bool isAlpha(char c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'; } static bool isDigit(char c) { return c >= '0' && c <= '9'; } static bool isAtEnd() { return *lexer.current == '\0'; } static char advance() { lexer.current++; return lexer.current[-1]; } static char peek() { return *lexer.current; } static char peekNext() { if (isAtEnd()) return '\0'; return lexer.current[1]; } static bool match(char expected) { if (isAtEnd()) return false; if (*lexer.current != expected) return false; lexer.current++; return true; } static Token makeToken(TokenType type) { Token token; token.type = type; token.start = lexer.start; token.length = (int)(lexer.current - lexer.start); token.line = lexer.line; return token; } static Token errorToken(const char *message) { Token token; token.type = TOKEN_ERROR; token.start = message; token.length = (int)strlen(message); token.line = lexer.line; return token; } static void skipWhitespace() { for (;;) { char c = peek(); switch (c) { case ' ': case '\r': case '\t': advance(); break; case '\n': lexer.line++; advance(); break; case '/': if (peekNext() == '/') { // Single-line comment: skip until newline or end of file advance(); while (peek() != '\n' && !isAtEnd()) advance(); } else if (peekNext() == '*') { // Multi-line comment: skip until '*/' or end of file advance(); advance(); while (!isAtEnd()) { if (peek() == '\n') lexer.line++; if (peek() == '*' && peekNext() == '/') { advance(); advance(); break; // Exit loop, comment ended } advance(); } } else { return; // Not a comment, let tokenization handle it } break; default: return; } } } static TokenType checkKeyword(int start, int length, const char *rest, TokenType type) { if (lexer.current - lexer.start == start + length && memcmp(lexer.start + start, rest, length) == 0) { return type; } return TOKEN_IDENTIFIER; } static TokenType identifierType() { switch (lexer.start[0]) { case 'a': if (lexer.current - lexer.start > 1) { switch (lexer.start[1]) { case 'n': return checkKeyword(2, 1, "d", TOKEN_OPERATOR_AND); case 's': return checkKeyword(2, 0, "", TOKEN_KEYWORD_AS); } } break; case 'c': if (lexer.current - lexer.start > 1) { switch (lexer.start[1]) { case 'l': return checkKeyword(2, 3, "ose", TOKEN_KEYWORD_CLOSE); case 'o': return checkKeyword(2, 3, "nst", TOKEN_KEYWORD_CONST); } } break; case 'e': return checkKeyword(1, 3, "lse", TOKEN_KEYWORD_ELSE); case 'f': if (lexer.current - lexer.start > 1) { switch (lexer.start[1]) { case 'a': return checkKeyword(2, 3, "lse", TOKEN_KEYWORD_FALSE); case 'o': return checkKeyword(2, 1, "r", TOKEN_KEYWORD_FOR); case '3': return checkKeyword(2, 1, "2", TOKEN_TYPE_REAL); } return checkKeyword(1, 7, "unction", TOKEN_KEYWORD_FN); } break; case 'i': if (lexer.current - lexer.start > 1) { switch (lexer.start[1]) { case 'f': return checkKeyword(2, 0, "", TOKEN_KEYWORD_IF); case 's': return checkKeyword(2, 0, "", TOKEN_KEYWORD_IS); case '8': return checkKeyword(2, 0, "", TOKEN_TYPE_I8); case '1': return checkKeyword(2, 1, "6", TOKEN_TYPE_I16); case '3': return checkKeyword(2, 1, "2", TOKEN_TYPE_INT); case 'n': if (lexer.current - lexer.start > 2) { switch (lexer.start[2]) { case 'i': return checkKeyword(3, 2, "t", TOKEN_KEYWORD_INIT); case 't': return checkKeyword(3, 0, "", TOKEN_TYPE_INT); } } break; } } break; case 'n': if (lexer.current - lexer.start > 1) { switch (lexer.start[1]) { case 'a': return checkKeyword(2, 1, "t", TOKEN_TYPE_NAT); case 'i': return checkKeyword(2, 1, "l", TOKEN_KEYWORD_NIL); } } break; case 'o': if (lexer.current - lexer.start > 1) { switch (lexer.start[1]) { case 'p': return checkKeyword(2, 2, "en", TOKEN_KEYWORD_OPEN); case 'r': return checkKeyword(2, 0, "", TOKEN_OPERATOR_OR); } } break; case 'p': if (lexer.current - lexer.start > 1) { switch (lexer.start[1]) { case 'l': return checkKeyword(2, 2, "ex", TOKEN_KEYWORD_PLEX); } } break; case 'r': if (lexer.current - lexer.start > 1) { switch (lexer.start[1]) { case 'e': if (lexer.current - lexer.start > 2) { switch (lexer.start[2]) { case 'a': return checkKeyword(3, 1, "d", TOKEN_KEYWORD_READ); case 'f': return checkKeyword(3, 4, "resh", TOKEN_KEYWORD_REFRESH); case 't': return checkKeyword(3, 3, "urn", TOKEN_KEYWORD_RETURN); } } break; } } break; case 's': if (lexer.current - lexer.start > 1) { switch (lexer.start[1]) { case 't': return checkKeyword(2, 1, "r", TOKEN_TYPE_STR); } } break; case 't': if (lexer.current - lexer.start > 1) { switch (lexer.start[1]) { case 'h': return checkKeyword(2, 2, "is", TOKEN_KEYWORD_THIS); case 'r': return checkKeyword(2, 2, "ue", TOKEN_KEYWORD_TRUE); } } break; case 'u': if (lexer.current - lexer.start > 1) { switch (lexer.start[1]) { case 's': return checkKeyword(2, 1, "e", TOKEN_KEYWORD_USE); case '8': return checkKeyword(2, 0, "", TOKEN_TYPE_U8); case '1': return checkKeyword(2, 1, "6", TOKEN_TYPE_U16); case '3': return checkKeyword(2, 1, "2", TOKEN_TYPE_NAT); } } break; case 'w': if (lexer.current - lexer.start > 1) { switch (lexer.start[1]) { case 'h': return checkKeyword(2, 3, "ile", TOKEN_KEYWORD_WHILE); case 'r': return checkKeyword(2, 3, "ite", TOKEN_KEYWORD_WRITE); } } break; case 'g': return checkKeyword(1, 5, "lobal", TOKEN_KEYWORD_GLOBAL); } return TOKEN_IDENTIFIER; } static Token identifier() { while (isAlpha(peek()) || isDigit(peek())) advance(); return makeToken(identifierType()); } static Token number() { while (isDigit(peek())) advance(); /* Look for a fractional part. */ if (peek() == '.' && isDigit(peekNext())) { /* Consume the ".". */ advance(); while (isDigit(peek())) advance(); return makeToken(TOKEN_LITERAL_REAL); } return makeToken(TOKEN_LITERAL_INT); } static Token string() { while (peek() != '"' && !isAtEnd()) { if (peek() == '\n') lexer.line++; advance(); } if (isAtEnd()) return errorToken("Unterminated string."); /* The closing quote. */ advance(); return makeToken(TOKEN_LITERAL_STR); } Token next_token() { skipWhitespace(); lexer.start = lexer.current; if (isAtEnd()) return makeToken(TOKEN_EOF); char c = advance(); if (isAlpha(c)) return identifier(); if (isDigit(c)) return number(); switch (c) { case '(': return makeToken(TOKEN_LPAREN); case ')': return makeToken(TOKEN_RPAREN); case '{': return makeToken(TOKEN_LBRACE); case '}': return makeToken(TOKEN_RBRACE); case '[': return makeToken(TOKEN_LBRACKET); case ']': return makeToken(TOKEN_RBRACKET); case ';': return makeToken(TOKEN_SEMICOLON); case ',': return makeToken(TOKEN_COMMA); case '.': return makeToken(TOKEN_DOT); case '-': return makeToken(match('>') ? TOKEN_ARROW_LEFT : TOKEN_MINUS); case '+': return makeToken(TOKEN_PLUS); case '/': return makeToken(TOKEN_SLASH); case '&': return makeToken(match('&') ? TOKEN_AND_AND : TOKEN_AND); case '#': return makeToken(TOKEN_MESH); case '$': return makeToken(TOKEN_BIG_MONEY); case '*': return makeToken(TOKEN_STAR); case '!': return makeToken(match('=') ? TOKEN_BANG_EQ : TOKEN_BANG); case '=': return makeToken(match('=') ? TOKEN_EQ_EQ : TOKEN_EQ); case '<': return makeToken(match('=') ? TOKEN_LTE : TOKEN_LT); case '>': return makeToken(match('=') ? TOKEN_GTE : TOKEN_GT); case '"': return string(); } return errorToken("Unexpected character."); } const char *token_type_to_string(TokenType type) { switch (type) { case TOKEN_EOF: return "EOF"; case TOKEN_IDENTIFIER: return "IDENTIFIER"; case TOKEN_LITERAL_INT: return "LITERAL_INT"; case TOKEN_LITERAL_NAT: return "LITERAL_NAT"; case TOKEN_LITERAL_REAL: return "LITERAL_REAL"; case TOKEN_LITERAL_STR: return "LITERAL_STR"; case TOKEN_TYPE_INT: return "TYPE_INT"; case TOKEN_TYPE_NAT: return "TYPE_NAT"; case TOKEN_TYPE_REAL: return "TYPE_REAL"; case TOKEN_TYPE_STR: return "TYPE_STR"; case TOKEN_KEYWORD_PLEX: return "KEYWORD_PLEX"; case TOKEN_KEYWORD_FN: return "KEYWORD_FN"; case TOKEN_KEYWORD_CONST: return "KEYWORD_CONST"; case TOKEN_KEYWORD_IF: return "KEYWORD_IF"; case TOKEN_KEYWORD_IS: return "IS"; case TOKEN_KEYWORD_AS: return "AS"; case TOKEN_KEYWORD_ELSE: return "KEYWORD_ELSE"; case TOKEN_KEYWORD_WHILE: return "KEYWORD_WHILE"; case TOKEN_KEYWORD_FOR: return "KEYWORD_FOR"; case TOKEN_KEYWORD_RETURN: return "KEYWORD_RETURN"; case TOKEN_KEYWORD_USE: return "KEYWORD_USE"; case TOKEN_KEYWORD_INIT: return "KEYWORD_INIT"; case TOKEN_KEYWORD_THIS: return "KEYWORD_THIS"; case TOKEN_KEYWORD_OPEN: return "TOKEN_KEYWORD_OPEN"; case TOKEN_KEYWORD_READ: return "TOKEN_KEYWORD_READ"; case TOKEN_KEYWORD_WRITE: return "TOKEN_KEYWORD_WRITE"; case TOKEN_KEYWORD_REFRESH: return "TOKEN_KEYWORD_REFRESH"; case TOKEN_KEYWORD_CLOSE: return "TOKEN_KEYWORD_CLOSE"; case TOKEN_KEYWORD_NIL: return "KEYWORD_NIL"; case TOKEN_KEYWORD_TRUE: return "KEYWORD_TRUE"; case TOKEN_KEYWORD_FALSE: return "KEYWORD_FALSE"; case TOKEN_KEYWORD_GLOBAL: return "KEYWORD_GLOBAL"; case TOKEN_OPERATOR_NOT: return "OPERATOR_NOT"; case TOKEN_OPERATOR_AND: return "OPERATOR_AND"; case TOKEN_OPERATOR_OR: return "OPERATOR_OR"; case TOKEN_BANG: return "BANG"; case TOKEN_BANG_EQ: return "BANG_EQ"; case TOKEN_EQ: return "EQ"; case TOKEN_EQ_EQ: return "EQ_EQ"; case TOKEN_GT: return "GT"; case TOKEN_LT: return "LT"; case TOKEN_GTE: return "GTE"; case TOKEN_LTE: return "LTE"; case TOKEN_DOT: return "DOT"; case TOKEN_COMMA: return "COMMA"; case TOKEN_COLON: return "COLON"; case TOKEN_SEMICOLON: return "SEMICOLON"; case TOKEN_PLUS: return "PLUS"; case TOKEN_MINUS: return "MINUS"; case TOKEN_STAR: return "STAR"; case TOKEN_SLASH: return "SLASH"; case TOKEN_LPAREN: return "LPAREN"; case TOKEN_RPAREN: return "RPAREN"; case TOKEN_LBRACE: return "LBRACE"; case TOKEN_RBRACE: return "RBRACE"; case TOKEN_LBRACKET: return "LBRACKET"; case TOKEN_RBRACKET: return "RBRACKET"; case TOKEN_ARROW_LEFT: return "ARROW_LEFT"; case TOKEN_MESH: return "MESH"; case TOKEN_BIG_MONEY: return "BIG_MONEY"; case TOKEN_AND: return "AND"; case TOKEN_AND_AND: return "AND_AND"; case TOKEN_ERROR: return "ERROR"; default: return "UNKNOWN_TOKEN"; } }