511 lines
12 KiB
C
511 lines
12 KiB
C
#include <string.h>
|
|
|
|
#include "../../vm/common.h"
|
|
#include "lexer.h"
|
|
|
|
/*
 * Scanner state: a two-pointer cursor over a NUL-terminated source
 * buffer. The pointers reference memory owned by the caller of
 * initLexer(); the lexer never copies or frees the source.
 */
typedef struct {
  const char *start;   /* First character of the lexeme being scanned. */
  const char *current; /* Next character to be consumed. */
  int line;            /* 1-based line number, for diagnostics. */
} Lexer;

/* Single global scanner instance — the lexer API is not reentrant. */
Lexer lexer;
void initLexer(const char *source) {
|
|
lexer.start = source;
|
|
lexer.current = source;
|
|
lexer.line = 1;
|
|
}
|
|
|
|
/* True for ASCII letters and underscore — the characters that may
 * begin an identifier. */
static bool isAlpha(char c) {
  return c == '_' || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
}
|
|
|
|
/* True for the ASCII decimal digits '0'..'9'. */
static bool isDigit(char c) { return '0' <= c && c <= '9'; }
|
|
|
|
/* True once the cursor has reached the terminating NUL. */
static bool isAtEnd() { return lexer.current[0] == '\0'; }
|
|
|
|
static char advance() {
|
|
lexer.current++;
|
|
return lexer.current[-1];
|
|
}
|
|
|
|
/* Look at the current character without consuming it. */
static char peek() { return lexer.current[0]; }
|
|
|
|
static char peekNext() {
|
|
if (isAtEnd())
|
|
return '\0';
|
|
return lexer.current[1];
|
|
}
|
|
|
|
static bool match(char expected) {
|
|
if (isAtEnd())
|
|
return false;
|
|
if (*lexer.current != expected)
|
|
return false;
|
|
lexer.current++;
|
|
return true;
|
|
}
|
|
|
|
/* Build a token of the given type spanning the current lexeme
 * [lexer.start, lexer.current). The token borrows the source buffer;
 * it owns no memory. */
static Token makeToken(TokenType type) {
  Token token = {
      .type = type,
      .start = lexer.start,
      .length = (int)(lexer.current - lexer.start),
      .line = lexer.line,
  };
  return token;
}
|
|
|
|
static Token errorToken(const char *message) {
|
|
Token token;
|
|
token.type = TOKEN_ERROR;
|
|
token.start = message;
|
|
token.length = (int)strlen(message);
|
|
token.line = lexer.line;
|
|
return token;
|
|
}
|
|
|
|
static void skipWhitespace() {
|
|
for (;;) {
|
|
char c = peek();
|
|
switch (c) {
|
|
case ' ':
|
|
case '\r':
|
|
case '\t':
|
|
advance();
|
|
break;
|
|
case '\n':
|
|
lexer.line++;
|
|
advance();
|
|
break;
|
|
case '/':
|
|
if (peekNext() == '/') {
|
|
// Single-line comment: skip until newline or end of file
|
|
advance();
|
|
while (peek() != '\n' && !isAtEnd())
|
|
advance();
|
|
} else if (peekNext() == '*') {
|
|
// Multi-line comment: skip until '*/' or end of file
|
|
advance();
|
|
advance();
|
|
while (!isAtEnd()) {
|
|
if (peek() == '\n')
|
|
lexer.line++;
|
|
if (peek() == '*' && peekNext() == '/') {
|
|
advance();
|
|
advance();
|
|
break; // Exit loop, comment ended
|
|
}
|
|
advance();
|
|
}
|
|
} else {
|
|
return; // Not a comment, let tokenization handle it
|
|
}
|
|
break;
|
|
default:
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
 * Trie leaf check: the current lexeme matches a keyword when its
 * total length equals `start + length` and its tail (from offset
 * `start`) equals `rest`. Returns `type` on a match, otherwise
 * TOKEN_IDENTIFIER.
 */
static TokenType checkKeyword(int start, int length, const char *rest,
                              TokenType type) {
  if (lexer.current - lexer.start != start + length)
    return TOKEN_IDENTIFIER;
  if (memcmp(lexer.start + start, rest, length) != 0)
    return TOKEN_IDENTIFIER;
  return type;
}
|
|
|
|
static TokenType identifierType() {
|
|
switch (lexer.start[0]) {
|
|
case 'a':
|
|
if (lexer.current - lexer.start > 1) {
|
|
switch (lexer.start[1]) {
|
|
case 'n':
|
|
return checkKeyword(2, 1, "d", TOKEN_OPERATOR_AND);
|
|
case 's':
|
|
return checkKeyword(2, 0, "", TOKEN_KEYWORD_AS);
|
|
}
|
|
}
|
|
break;
|
|
case 'c':
|
|
if (lexer.current - lexer.start > 1) {
|
|
switch (lexer.start[1]) {
|
|
case 'l':
|
|
return checkKeyword(2, 3, "ose", TOKEN_KEYWORD_CLOSE);
|
|
case 'o':
|
|
return checkKeyword(2, 3, "nst", TOKEN_KEYWORD_CONST);
|
|
}
|
|
}
|
|
break;
|
|
case 'e':
|
|
return checkKeyword(1, 3, "lse", TOKEN_KEYWORD_ELSE);
|
|
case 'f':
|
|
if (lexer.current - lexer.start > 1) {
|
|
switch (lexer.start[1]) {
|
|
case 'a':
|
|
return checkKeyword(2, 3, "lse", TOKEN_KEYWORD_FALSE);
|
|
case 'o':
|
|
return checkKeyword(2, 1, "r", TOKEN_KEYWORD_FOR);
|
|
case '3':
|
|
return checkKeyword(2, 1, "2", TOKEN_TYPE_REAL);
|
|
}
|
|
return checkKeyword(1, 7, "unction", TOKEN_KEYWORD_FN);
|
|
}
|
|
break;
|
|
case 'i':
|
|
if (lexer.current - lexer.start > 1) {
|
|
switch (lexer.start[1]) {
|
|
case 'f':
|
|
return checkKeyword(2, 0, "", TOKEN_KEYWORD_IF);
|
|
case 's':
|
|
return checkKeyword(2, 0, "", TOKEN_KEYWORD_IS);
|
|
case '8':
|
|
return checkKeyword(2, 0, "", TOKEN_TYPE_I8);
|
|
case '1':
|
|
return checkKeyword(2, 1, "6", TOKEN_TYPE_I16);
|
|
case '3':
|
|
return checkKeyword(2, 1, "2", TOKEN_TYPE_INT);
|
|
case 'n':
|
|
if (lexer.current - lexer.start > 2) {
|
|
switch (lexer.start[2]) {
|
|
case 'i':
|
|
return checkKeyword(3, 2, "t", TOKEN_KEYWORD_INIT);
|
|
case 't':
|
|
return checkKeyword(3, 0, "", TOKEN_TYPE_INT);
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
break;
|
|
case 'n':
|
|
if (lexer.current - lexer.start > 1) {
|
|
switch (lexer.start[1]) {
|
|
case 'a':
|
|
return checkKeyword(2, 1, "t", TOKEN_TYPE_NAT);
|
|
case 'i':
|
|
return checkKeyword(2, 1, "l", TOKEN_KEYWORD_NIL);
|
|
}
|
|
}
|
|
break;
|
|
case 'o':
|
|
if (lexer.current - lexer.start > 1) {
|
|
switch (lexer.start[1]) {
|
|
case 'p':
|
|
return checkKeyword(2, 2, "en", TOKEN_KEYWORD_OPEN);
|
|
case 'r':
|
|
return checkKeyword(2, 0, "", TOKEN_OPERATOR_OR);
|
|
}
|
|
}
|
|
break;
|
|
case 'p':
|
|
if (lexer.current - lexer.start > 1) {
|
|
switch (lexer.start[1]) {
|
|
case 'l':
|
|
return checkKeyword(2, 2, "ex", TOKEN_KEYWORD_PLEX);
|
|
}
|
|
}
|
|
break;
|
|
case 'r':
|
|
if (lexer.current - lexer.start > 1) {
|
|
switch (lexer.start[1]) {
|
|
case 'e':
|
|
if (lexer.current - lexer.start > 2) {
|
|
switch (lexer.start[2]) {
|
|
case 'a':
|
|
return checkKeyword(3, 1, "d", TOKEN_KEYWORD_READ);
|
|
case 'f':
|
|
return checkKeyword(3, 4, "resh", TOKEN_KEYWORD_REFRESH);
|
|
case 't':
|
|
return checkKeyword(3, 3, "urn", TOKEN_KEYWORD_RETURN);
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
break;
|
|
case 's':
|
|
if (lexer.current - lexer.start > 1) {
|
|
switch (lexer.start[1]) {
|
|
case 't':
|
|
return checkKeyword(2, 1, "r", TOKEN_TYPE_STR);
|
|
}
|
|
}
|
|
break;
|
|
case 't':
|
|
if (lexer.current - lexer.start > 1) {
|
|
switch (lexer.start[1]) {
|
|
case 'h':
|
|
return checkKeyword(2, 2, "is", TOKEN_KEYWORD_THIS);
|
|
case 'r':
|
|
return checkKeyword(2, 2, "ue", TOKEN_KEYWORD_TRUE);
|
|
}
|
|
}
|
|
break;
|
|
case 'u':
|
|
if (lexer.current - lexer.start > 1) {
|
|
switch (lexer.start[1]) {
|
|
case 's':
|
|
return checkKeyword(2, 1, "e", TOKEN_KEYWORD_USE);
|
|
case '8':
|
|
return checkKeyword(2, 0, "", TOKEN_TYPE_U8);
|
|
case '1':
|
|
return checkKeyword(2, 1, "6", TOKEN_TYPE_U16);
|
|
case '3':
|
|
return checkKeyword(2, 1, "2", TOKEN_TYPE_NAT);
|
|
}
|
|
}
|
|
break;
|
|
case 'w':
|
|
if (lexer.current - lexer.start > 1) {
|
|
switch (lexer.start[1]) {
|
|
case 'h':
|
|
return checkKeyword(2, 3, "ile", TOKEN_KEYWORD_WHILE);
|
|
case 'r':
|
|
return checkKeyword(2, 3, "ite", TOKEN_KEYWORD_WRITE);
|
|
}
|
|
}
|
|
break;
|
|
case 'g':
|
|
return checkKeyword(1, 5, "lobal", TOKEN_KEYWORD_GLOBAL);
|
|
}
|
|
|
|
return TOKEN_IDENTIFIER;
|
|
}
|
|
|
|
static Token identifier() {
|
|
while (isAlpha(peek()) || isDigit(peek()))
|
|
advance();
|
|
return makeToken(identifierType());
|
|
}
|
|
|
|
static Token number() {
|
|
while (isDigit(peek()))
|
|
advance();
|
|
|
|
/* Look for a fractional part. */
|
|
if (peek() == '.' && isDigit(peekNext())) {
|
|
/* Consume the ".". */
|
|
advance();
|
|
|
|
while (isDigit(peek()))
|
|
advance();
|
|
|
|
return makeToken(TOKEN_LITERAL_REAL);
|
|
}
|
|
|
|
return makeToken(TOKEN_LITERAL_INT);
|
|
}
|
|
|
|
static Token string() {
|
|
while (peek() != '"' && !isAtEnd()) {
|
|
if (peek() == '\n')
|
|
lexer.line++;
|
|
advance();
|
|
}
|
|
|
|
if (isAtEnd())
|
|
return errorToken("Unterminated string.");
|
|
|
|
/* The closing quote. */
|
|
advance();
|
|
return makeToken(TOKEN_LITERAL_STR);
|
|
}
|
|
|
|
Token next_token() {
|
|
skipWhitespace();
|
|
lexer.start = lexer.current;
|
|
|
|
if (isAtEnd())
|
|
return makeToken(TOKEN_EOF);
|
|
|
|
char c = advance();
|
|
if (isAlpha(c))
|
|
return identifier();
|
|
if (isDigit(c))
|
|
return number();
|
|
|
|
switch (c) {
|
|
case '(':
|
|
return makeToken(TOKEN_LPAREN);
|
|
case ')':
|
|
return makeToken(TOKEN_RPAREN);
|
|
case '{':
|
|
return makeToken(TOKEN_LBRACE);
|
|
case '}':
|
|
return makeToken(TOKEN_RBRACE);
|
|
case '[':
|
|
return makeToken(TOKEN_LBRACKET);
|
|
case ']':
|
|
return makeToken(TOKEN_RBRACKET);
|
|
case ';':
|
|
return makeToken(TOKEN_SEMICOLON);
|
|
case ',':
|
|
return makeToken(TOKEN_COMMA);
|
|
case '.':
|
|
return makeToken(TOKEN_DOT);
|
|
case '-':
|
|
return makeToken(match('>') ? TOKEN_ARROW_RIGHT : TOKEN_MINUS);
|
|
case '+':
|
|
return makeToken(TOKEN_PLUS);
|
|
case '/':
|
|
return makeToken(TOKEN_SLASH);
|
|
case '&':
|
|
return makeToken(match('&') ? TOKEN_AND_AND : TOKEN_AND);
|
|
case '#':
|
|
return makeToken(TOKEN_MESH);
|
|
case '$':
|
|
return makeToken(TOKEN_BIG_MONEY);
|
|
case '*':
|
|
return makeToken(TOKEN_STAR);
|
|
case '!':
|
|
return makeToken(match('=') ? TOKEN_BANG_EQ : TOKEN_BANG);
|
|
case '=':
|
|
return makeToken(match('=') ? TOKEN_EQ_EQ : TOKEN_EQ);
|
|
case '<':
|
|
return makeToken(match('=') ? TOKEN_LTE : TOKEN_LT);
|
|
case '>':
|
|
return makeToken(match('=') ? TOKEN_GTE : TOKEN_GT);
|
|
case '"':
|
|
return string();
|
|
}
|
|
|
|
return errorToken("Unexpected character.");
|
|
}
|
|
|
|
/*
 * Map a TokenType to a stable human-readable name for diagnostics
 * and debug output.
 *
 * Fixes in this revision:
 *  - Added the previously missing TYPE_I8/I16/U8/U16 cases, which
 *    identifierType produces but which formerly printed as
 *    "UNKNOWN_TOKEN".
 *  - Normalized inconsistent names ("TOKEN_KEYWORD_OPEN", bare "IS"
 *    and "AS") to the convention every other case already uses
 *    (enum name minus the "TOKEN_" prefix).
 */
const char *token_type_to_string(TokenType type) {
  switch (type) {
  case TOKEN_EOF:
    return "EOF";
  case TOKEN_IDENTIFIER:
    return "IDENTIFIER";
  case TOKEN_LITERAL_INT:
    return "LITERAL_INT";
  case TOKEN_LITERAL_NAT:
    return "LITERAL_NAT";
  case TOKEN_LITERAL_REAL:
    return "LITERAL_REAL";
  case TOKEN_LITERAL_STR:
    return "LITERAL_STR";
  case TOKEN_TYPE_INT:
    return "TYPE_INT";
  case TOKEN_TYPE_NAT:
    return "TYPE_NAT";
  case TOKEN_TYPE_REAL:
    return "TYPE_REAL";
  case TOKEN_TYPE_STR:
    return "TYPE_STR";
  case TOKEN_TYPE_I8:
    return "TYPE_I8";
  case TOKEN_TYPE_I16:
    return "TYPE_I16";
  case TOKEN_TYPE_U8:
    return "TYPE_U8";
  case TOKEN_TYPE_U16:
    return "TYPE_U16";
  case TOKEN_KEYWORD_PLEX:
    return "KEYWORD_PLEX";
  case TOKEN_KEYWORD_FN:
    return "KEYWORD_FN";
  case TOKEN_KEYWORD_CONST:
    return "KEYWORD_CONST";
  case TOKEN_KEYWORD_IF:
    return "KEYWORD_IF";
  case TOKEN_KEYWORD_IS:
    return "KEYWORD_IS";
  case TOKEN_KEYWORD_AS:
    return "KEYWORD_AS";
  case TOKEN_KEYWORD_ELSE:
    return "KEYWORD_ELSE";
  case TOKEN_KEYWORD_WHILE:
    return "KEYWORD_WHILE";
  case TOKEN_KEYWORD_FOR:
    return "KEYWORD_FOR";
  case TOKEN_KEYWORD_RETURN:
    return "KEYWORD_RETURN";
  case TOKEN_KEYWORD_USE:
    return "KEYWORD_USE";
  case TOKEN_KEYWORD_INIT:
    return "KEYWORD_INIT";
  case TOKEN_KEYWORD_THIS:
    return "KEYWORD_THIS";
  case TOKEN_KEYWORD_OPEN:
    return "KEYWORD_OPEN";
  case TOKEN_KEYWORD_READ:
    return "KEYWORD_READ";
  case TOKEN_KEYWORD_WRITE:
    return "KEYWORD_WRITE";
  case TOKEN_KEYWORD_REFRESH:
    return "KEYWORD_REFRESH";
  case TOKEN_KEYWORD_CLOSE:
    return "KEYWORD_CLOSE";
  case TOKEN_KEYWORD_NIL:
    return "KEYWORD_NIL";
  case TOKEN_KEYWORD_TRUE:
    return "KEYWORD_TRUE";
  case TOKEN_KEYWORD_FALSE:
    return "KEYWORD_FALSE";
  case TOKEN_KEYWORD_GLOBAL:
    return "KEYWORD_GLOBAL";
  case TOKEN_OPERATOR_NOT:
    return "OPERATOR_NOT";
  case TOKEN_OPERATOR_AND:
    return "OPERATOR_AND";
  case TOKEN_OPERATOR_OR:
    return "OPERATOR_OR";
  case TOKEN_BANG:
    return "BANG";
  case TOKEN_BANG_EQ:
    return "BANG_EQ";
  case TOKEN_EQ:
    return "EQ";
  case TOKEN_EQ_EQ:
    return "EQ_EQ";
  case TOKEN_GT:
    return "GT";
  case TOKEN_LT:
    return "LT";
  case TOKEN_GTE:
    return "GTE";
  case TOKEN_LTE:
    return "LTE";
  case TOKEN_DOT:
    return "DOT";
  case TOKEN_COMMA:
    return "COMMA";
  case TOKEN_COLON:
    return "COLON";
  case TOKEN_SEMICOLON:
    return "SEMICOLON";
  case TOKEN_PLUS:
    return "PLUS";
  case TOKEN_MINUS:
    return "MINUS";
  case TOKEN_STAR:
    return "STAR";
  case TOKEN_SLASH:
    return "SLASH";
  case TOKEN_LPAREN:
    return "LPAREN";
  case TOKEN_RPAREN:
    return "RPAREN";
  case TOKEN_LBRACE:
    return "LBRACE";
  case TOKEN_RBRACE:
    return "RBRACE";
  case TOKEN_LBRACKET:
    return "LBRACKET";
  case TOKEN_RBRACKET:
    return "RBRACKET";
  case TOKEN_ARROW_RIGHT:
    return "ARROW_RIGHT";
  case TOKEN_MESH:
    return "MESH";
  case TOKEN_BIG_MONEY:
    return "BIG_MONEY";
  case TOKEN_AND:
    return "AND";
  case TOKEN_AND_AND:
    return "AND_AND";
  case TOKEN_ERROR:
    return "ERROR";
  default:
    return "UNKNOWN_TOKEN";
  }
}
|