Replace the generated keyword table with keyword matching inside lexer.c

What this patch does:
  * src/Makefile: removes the gen_keywords / keywords.h rules and the
    clean-generated target; `clean` no longer has prerequisites.
  * src/arch/linux/main.c: reads the file named by argv[1], prints every
    token produced by next_token(), and returns before the VM test calls.
  * src/lexer.c, src/lexer.h: defines `lexer` in lexer.c instead of the
    header, adds identifier(), the int/nat/real/str type tokens, the
    init/this keywords and the != == < > <= >= operators, and declares
    token_type_name() in the header.
  * src/keywords.h, src/tools/gen_keywords, src/tools/gen_keywords.c: deleted.

Fixes applied to the added lines while restoring the patch layout:
  * identifier(): the "real" keyword was tested with length == 3 against a
    4-byte strncmp, so it could never match; now length == 4.
  * is_alpha()/is_digit(): cast to unsigned char before calling
    isalpha()/isdigit().
  * token_type_name(): names for TOKEN_BANG_EQ, TOKEN_EQ_EQ, TOKEN_GT,
    TOKEN_LT, TOKEN_GTE and TOKEN_LTE (previously "unknown").
  * main.c: ftell(), malloc() and fread() results are checked/used.

Review notes:
  * NOTE(review): header names on the bare `#include` lines (and any
    <...> text in the usage string) are absent from the source of this
    patch and are left exactly as found - restore from the tree.
  * NOTE(review): indentation and blank context lines were reconstructed
    from the hunk line counts; `index` hashes are unchanged and no longer
    describe the post-image of main.c and lexer.c.
  * NOTE(review): skip_whitespace() consumes from any '!' to end of line
    before next_token() dispatches, so the '!' / "!=" cases in
    next_token() look unreachable - confirm the intended comment syntax.
  * NOTE(review): the statements after `return 0;` in main() are
    unreachable; presumably temporary while the lexer is being debugged.

diff --git a/src/Makefile b/src/Makefile
index c55e539..954b848 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -12,21 +12,6 @@ CFLAGS_WASM = -g -std=c89 -Wall -Wextra -Werror -Wno-unused-parameter -I.
 LDFLAGS_WASM = -s WASM=1 -g -s USE_SDL=2
 LDLIBS_WASM =
 
-TOOLS_DIR := tools
-GENERATOR := $(TOOLS_DIR)/gen_keywords
-GENERATOR_SRC := $(GENERATOR).c
-KEYWORDS_H := keywords.h
-
-# Rule to generate keywords.h
-$(KEYWORDS_H): $(GENERATOR) $(GENERATOR_SRC)
-	@echo "Generating keywords.h..."
-	@$(GENERATOR) > $(KEYWORDS_H)
-
-# Rule to build the generator
-$(GENERATOR): $(GENERATOR_SRC)
-	@echo "Compiling keyword generator..."
-	@$(CC) -o $@ $<
-
 # Source and build configuration
 # ----------------------------
 COMMON_SRC = $(wildcard *.c)
@@ -45,7 +30,7 @@ OBJ_NATIVE = $(addprefix $(OBJ_DIR_NATIVE)/,$(notdir $(COMMON_SRC:.c=.o)))
 OBJ_WASM = $(addprefix $(OBJ_DIR_WASM)/,$(notdir $(COMMON_SRC:.c=.o)))
 
 # Phony targets
-.PHONY: all clean clean_generated install wasm native emscripten linux macos
+.PHONY: all clean install wasm native emscripten linux macos
 
 # Default target builds the native version
 all: native
@@ -80,14 +65,9 @@ $(OBJ_DIR_WASM)/%.o: %.c
 
 # Clean build artifacts
 # ---------------------
-clean: clean-generated
+clean:
 	rm -rf $(OBJ_DIR_NATIVE) $(OBJ_DIR_WASM) $(EXEC_NATIVE) $(EXEC_WASM)
 
-# Clean rule for deleting generated keyword binary and header
-clean-generated:
-	@echo "Removing generated files..."
-	@rm -f $(KEYWORDS_H) $(GENERATOR)
-
 # Install target (example)
 # ------------------------
 install: native
diff --git a/src/arch/linux/main.c b/src/arch/linux/main.c
index 49084b9..f727605 100644
--- a/src/arch/linux/main.c
+++ b/src/arch/linux/main.c
@@ -1,6 +1,7 @@
+#include "../../vm.h"
 #include "../../debug.h"
 #include "../../test.h"
-#include "../../vm.h"
+#include "../../lexer.h"
 #include
 
 int main(int argc, char **argv) {
@@ -10,6 +11,49 @@ int main(int argc, char **argv) {
   vm.stack_size = STACK_SIZE;
   vm.memory_size = MEMORY_SIZE;
 
+  if (argc < 2) {
+    fprintf(stderr, "Usage: %s \n", argv[0]);
+    return 1;
+  }
+
+  FILE *f = fopen(argv[1], "rb");
+  if (!f) {
+    perror("fopen");
+    return 1;
+  }
+
+  fseek(f, 0, SEEK_END);
+  long len = ftell(f);
+  fseek(f, 0, SEEK_SET);
+  if (len < 0) {
+    perror("ftell");
+    fclose(f);
+    return 1;
+  }
+
+  char *source = (char *)malloc((size_t)len + 1);
+  if (!source) {
+    perror("malloc");
+    fclose(f);
+    return 1;
+  }
+
+  size_t nread = fread(source, 1, (size_t)len, f);
+  source[nread] = '\0'; /* terminate at the bytes actually read */
+  fclose(f);
+
+  init_lexer(source);
+
+  for (;;) {
+    Token token = next_token();
+    printf("[%d] %-18s: '%.*s'\n", token.line, token_type_name(token.type), token.length, token.start);
+    if (token.type == TOKEN_EOF) break;
+  }
+
+  free(source);
+  return 0;
+
+
   test_hello_world_compile(&vm);
   /* test_add_compile(&vm); */
   /* test_add_function_compile(&vm); */
diff --git a/src/keywords.h b/src/keywords.h
deleted file mode 100644
index 688dcb9..0000000
--- a/src/keywords.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef KEYWORDS_H
-#define KEYWORDS_H
-
-#include "lexer.h"
-
-static TokenType check_keyword(int start, int length, const char *rest, TokenType type) {
-    if ((lexer.current - lexer.start) == start + length &&
-        memcmp(lexer.start + start, rest, length) == 0) return type;
-    return TOKEN_IDENTIFIER;
-}
-
-static TokenType identifier_type(void) {
-    switch (lexer.start[0]) {
-        case 'c':
-            return check_keyword(1, 4, "onst", TOKEN_KEYWORD_CONST);
-        case 'e':
-            return check_keyword(1, 3, "lse", TOKEN_KEYWORD_ELSE);
-        case 'f':
-            return check_keyword(1, 1, "n", TOKEN_KEYWORD_FN);
-            return check_keyword(1, 2, "or", TOKEN_KEYWORD_FOR);
-        case 'i':
-            return check_keyword(1, 1, "f", TOKEN_KEYWORD_IF);
-            return check_keyword(1, 1, "s", TOKEN_OPERATOR_IS);
-        case 'l':
-            return check_keyword(1, 2, "et", TOKEN_KEYWORD_LET);
-        case 'r':
-            return check_keyword(1, 5, "eturn", TOKEN_KEYWORD_RETURN);
-        case 't':
-            return check_keyword(1, 3, "ype", TOKEN_KEYWORD_TYPE);
-        case 'u':
-            return check_keyword(1, 2, "se", TOKEN_KEYWORD_USE);
-        case 'w':
-            return check_keyword(1, 4, "hile", TOKEN_KEYWORD_WHILE);
-    }
-    return TOKEN_IDENTIFIER;
-}
-
-#endif // KEYWORDS_H
diff --git a/src/lexer.c b/src/lexer.c
index d858912..bf0a340 100644
--- a/src/lexer.c
+++ b/src/lexer.c
@@ -1,4 +1,6 @@
-#include "keywords.h"
+#include "lexer.h"
+
+Lexer lexer;
 
 void init_lexer(const char *source) {
   lexer.start = source;
@@ -6,25 +8,21 @@ void init_lexer(const char *source) {
   lexer.line = 1;
 }
 
-int is_at_end() {
-  return *lexer.current == '\0';
-}
+int is_at_end() { return *lexer.current == '\0'; }
 
-char advance() {
-  return *lexer.current++;
-}
+char advance() { return *lexer.current++; }
 
-char peek() {
-  return *lexer.current;
-}
+char peek() { return *lexer.current; }
 
 char peek_next() {
-  if (is_at_end()) return '\0';
+  if (is_at_end())
+    return '\0';
   return lexer.current[1];
 }
 
 int match(char expected) {
-  if (*lexer.current != expected) return 0;
+  if (*lexer.current != expected)
+    return 0;
   lexer.current++;
   return 1;
 }
@@ -33,24 +31,26 @@ void skip_whitespace() {
   for (;;) {
     char c = peek();
     switch (c) {
-      case ' ':
-      case '\r':
-      case '\t':
-        advance();
-        break;
-      case '\n':
-        lexer.line++;
-        advance();
-        break;
-      case '!':
-        if (peek_next() == '!') {
-          while (peek() != '\n' && !is_at_end()) advance();
-        } else {
-          while (peek() != '\n' && !is_at_end()) advance();
-        }
-        break;
-      default:
-        return;
+    case ' ':
+    case '\r':
+    case '\t':
+      advance();
+      break;
+    case '\n':
+      lexer.line++;
+      advance();
+      break;
+    case '!':
+      if (peek_next() == '!') {
+        while (peek() != '\n' && !is_at_end())
+          advance();
+      } else {
+        while (peek() != '\n' && !is_at_end())
+          advance();
+      }
+      break;
+    default:
+      return;
     }
   }
 }
@@ -73,20 +73,18 @@ Token error_token(const char *message) {
   return token;
 }
 
-int is_alpha(char c) {
-  return isalpha(c) || c == '_';
-}
+int is_alpha(char c) { return isalpha((unsigned char)c) || c == '_'; }
 
-int is_digit(char c) {
-  return isdigit(c);
-}
+int is_digit(char c) { return isdigit((unsigned char)c); }
 
 Token number() {
-  while (is_digit(peek())) advance();
+  while (is_digit(peek()))
+    advance();
 
   if (peek() == '.' && is_digit(peek_next())) {
     advance();
-    while (is_digit(peek())) advance();
+    while (is_digit(peek()))
+      advance();
     return make_token(TOKEN_FLOAT_LITERAL);
   }
 
@@ -95,47 +93,116 @@ Token number() {
 
 Token string() {
   while (peek() != '"' && !is_at_end()) {
-    if (peek() == '\n') lexer.line++;
+    if (peek() == '\n')
+      lexer.line++;
     advance();
   }
 
-  if (is_at_end()) return error_token("Unterminated string.");
+  if (is_at_end())
+    return error_token("Unterminated string.");
 
-  advance(); // Consume closing quote
+  advance();
   return make_token(TOKEN_STRING_LITERAL);
 }
 
+Token identifier() {
+  while (is_alpha(peek()) || is_digit(peek()))
+    advance();
+
+  int length = (int)(lexer.current - lexer.start);
+  const char *text = lexer.start;
+
+  if (length == 4 && strncmp(text, "init", 4) == 0)
+    return make_token(TOKEN_KEYWORD_INIT);
+  if (length == 4 && strncmp(text, "this", 4) == 0)
+    return make_token(TOKEN_KEYWORD_THIS);
+  if (length == 4 && strncmp(text, "type", 4) == 0)
+    return make_token(TOKEN_KEYWORD_TYPE);
+  if (length == 2 && strncmp(text, "fn", 2) == 0)
+    return make_token(TOKEN_KEYWORD_FN);
+  if (length == 3 && strncmp(text, "let", 3) == 0)
+    return make_token(TOKEN_KEYWORD_LET);
+  if (length == 5 && strncmp(text, "const", 5) == 0)
+    return make_token(TOKEN_KEYWORD_CONST);
+  if (length == 2 && strncmp(text, "if", 2) == 0)
+    return make_token(TOKEN_KEYWORD_IF);
+  if (length == 4 && strncmp(text, "else", 4) == 0)
+    return make_token(TOKEN_KEYWORD_ELSE);
+  if (length == 5 && strncmp(text, "while", 5) == 0)
+    return make_token(TOKEN_KEYWORD_WHILE);
+  if (length == 3 && strncmp(text, "for", 3) == 0)
+    return make_token(TOKEN_KEYWORD_FOR);
+  if (length == 6 && strncmp(text, "return", 6) == 0)
+    return make_token(TOKEN_KEYWORD_RETURN);
+  if (length == 3 && strncmp(text, "use", 3) == 0)
+    return make_token(TOKEN_KEYWORD_USE);
+  if (length == 2 && strncmp(text, "is", 2) == 0)
+    return make_token(TOKEN_OPERATOR_IS);
+  if (length == 3 && strncmp(text, "int", 3) == 0)
+    return make_token(TOKEN_TYPE_INT);
+  if (length == 3 && strncmp(text, "nat", 3) == 0)
+    return make_token(TOKEN_TYPE_NAT);
+  if (length == 3 && strncmp(text, "str", 3) == 0)
+    return make_token(TOKEN_TYPE_STR);
+  if (length == 4 && strncmp(text, "real", 4) == 0)
+    return make_token(TOKEN_TYPE_REAL);
+
+  return make_token(TOKEN_IDENTIFIER);
+}
+
 Token next_token() {
   skip_whitespace();
   lexer.start = lexer.current;
 
-  if (is_at_end()) return make_token(TOKEN_EOF);
+  if (is_at_end())
+    return make_token(TOKEN_EOF);
 
   char c = advance();
 
-  if (is_alpha(c)) return identifier();
-  if (is_digit(c)) return number();
+  if (is_alpha(c))
+    return identifier();
+  if (is_digit(c))
+    return number();
 
   switch (c) {
-    case '(': return make_token(TOKEN_LPAREN);
-    case ')': return make_token(TOKEN_RPAREN);
-    case '{': return make_token(TOKEN_LBRACE);
-    case '}': return make_token(TOKEN_RBRACE);
-    case '[': return make_token(TOKEN_LBRACKET);
-    case ']': return make_token(TOKEN_RBRACKET);
-    case ',': return make_token(TOKEN_COMMA);
-    case '.': return make_token(TOKEN_DOT);
-    case ':': return make_token(TOKEN_COLON);
-    case ';': return make_token(TOKEN_SEMICOLON);
-    case '+': return make_token(TOKEN_PLUS);
-    case '-': return make_token(TOKEN_MINUS);
-    case '*': return make_token(TOKEN_STAR);
-    case '/': return make_token(TOKEN_SLASH);
-    case '=': return make_token(TOKEN_EQ);
-    case '"': return string();
-    case '!':
-      if (match('!')) return make_token(TOKEN_DOUBLE_BANG);
-      return make_token(TOKEN_BANG);
+  case '(':
+    return make_token(TOKEN_LPAREN);
+  case ')':
+    return make_token(TOKEN_RPAREN);
+  case '{':
+    return make_token(TOKEN_LBRACE);
+  case '}':
+    return make_token(TOKEN_RBRACE);
+  case '[':
+    return make_token(TOKEN_LBRACKET);
+  case ']':
+    return make_token(TOKEN_RBRACKET);
+  case ',':
+    return make_token(TOKEN_COMMA);
+  case '.':
+    return make_token(TOKEN_DOT);
+  case ':':
+    return make_token(TOKEN_COLON);
+  case ';':
+    return make_token(TOKEN_SEMICOLON);
+  case '+':
+    return make_token(TOKEN_PLUS);
+  case '-':
+    return make_token(TOKEN_MINUS);
+  case '*':
+    return make_token(TOKEN_STAR);
+  case '/':
+    return make_token(TOKEN_SLASH);
+  case '!':
+    return make_token(match('=') ? TOKEN_BANG_EQ : TOKEN_BANG);
+  case '=':
+    return make_token(match('=') ? TOKEN_EQ_EQ : TOKEN_EQ);
+  case '<':
+    return make_token(match('=') ? TOKEN_LTE : TOKEN_LT);
+  case '>':
+    return make_token(match('=') ? TOKEN_GTE : TOKEN_GT);
+  case '"':
+    return string();
   }
 
   return error_token("Unexpected character.");
@@ -143,42 +210,97 @@ Token next_token() {
 
 const char *token_type_name(TokenType type) {
   switch (type) {
-    case TOKEN_IDENTIFIER: return "identifier";
-    case TOKEN_INT_LITERAL: return "int";
-    case TOKEN_FLOAT_LITERAL: return "float";
-    case TOKEN_STRING_LITERAL: return "string";
-    case TOKEN_KEYWORD_TYPE: return "type";
-    case TOKEN_KEYWORD_FN: return "fn";
-    case TOKEN_KEYWORD_LET: return "let";
-    case TOKEN_KEYWORD_CONST: return "const";
-    case TOKEN_KEYWORD_IF: return "if";
-    case TOKEN_KEYWORD_ELSE: return "else";
-    case TOKEN_KEYWORD_WHILE: return "while";
-    case TOKEN_KEYWORD_FOR: return "for";
-    case TOKEN_KEYWORD_RETURN: return "return";
-    case TOKEN_KEYWORD_USE: return "use";
-    case TOKEN_OPERATOR_IS: return "is";
-    case TOKEN_BANG: return "!";
-    case TOKEN_DOUBLE_BANG: return "!!";
-    case TOKEN_EQ: return "=";
-    case TOKEN_DOT: return ".";
-    case TOKEN_COMMA: return ",";
-    case TOKEN_COLON: return ":";
-    case TOKEN_SEMICOLON: return ";";
-    case TOKEN_PLUS: return "+";
-    case TOKEN_MINUS: return "-";
-    case TOKEN_STAR: return "*";
-    case TOKEN_SLASH: return "/";
-    case TOKEN_LPAREN: return "(";
-    case TOKEN_RPAREN: return ")";
-    case TOKEN_LBRACE: return "{";
-    case TOKEN_RBRACE: return "}";
-    case TOKEN_LBRACKET: return "[";
-    case TOKEN_RBRACKET: return "]";
-    case TOKEN_EOF: return "eof";
-    case TOKEN_ERROR: return "error";
-    default: return "unknown";
+  case TOKEN_IDENTIFIER:
+    return "identifier";
+  case TOKEN_INT_LITERAL:
+    return "int literal";
+  case TOKEN_FLOAT_LITERAL:
+    return "real literal";
+  case TOKEN_STRING_LITERAL:
+    return "string literal";
+  case TOKEN_TYPE_INT:
+    return "int";
+  case TOKEN_TYPE_REAL:
+    return "real";
+  case TOKEN_TYPE_STR:
+    return "str";
+  case TOKEN_TYPE_NAT:
+    return "nat";
+  case TOKEN_KEYWORD_THIS:
+    return "this";
+  case TOKEN_KEYWORD_TYPE:
+    return "type";
+  case TOKEN_KEYWORD_FN:
+    return "fn";
+  case TOKEN_KEYWORD_LET:
+    return "let";
+  case TOKEN_KEYWORD_CONST:
+    return "const";
+  case TOKEN_KEYWORD_IF:
+    return "if";
+  case TOKEN_KEYWORD_ELSE:
+    return "else";
+  case TOKEN_KEYWORD_WHILE:
+    return "while";
+  case TOKEN_KEYWORD_FOR:
+    return "for";
+  case TOKEN_KEYWORD_RETURN:
+    return "return";
+  case TOKEN_KEYWORD_INIT:
+    return "init";
+  case TOKEN_KEYWORD_USE:
+    return "use";
+  case TOKEN_OPERATOR_IS:
+    return "is";
+  case TOKEN_BANG:
+    return "!";
+  case TOKEN_BANG_EQ:
+    return "!=";
+  case TOKEN_EQ:
+    return "=";
+  case TOKEN_EQ_EQ:
+    return "==";
+  case TOKEN_GT:
+    return ">";
+  case TOKEN_LT:
+    return "<";
+  case TOKEN_GTE:
+    return ">=";
+  case TOKEN_LTE:
+    return "<=";
+  case TOKEN_DOT:
+    return ".";
+  case TOKEN_COMMA:
+    return ",";
+  case TOKEN_COLON:
+    return ":";
+  case TOKEN_SEMICOLON:
+    return ";";
+  case TOKEN_PLUS:
+    return "+";
+  case TOKEN_MINUS:
+    return "-";
+  case TOKEN_STAR:
+    return "*";
+  case TOKEN_SLASH:
+    return "/";
+  case TOKEN_LPAREN:
+    return "(";
+  case TOKEN_RPAREN:
+    return ")";
+  case TOKEN_LBRACE:
+    return "{";
+  case TOKEN_RBRACE:
+    return "}";
+  case TOKEN_LBRACKET:
+    return "[";
+  case TOKEN_RBRACKET:
+    return "]";
+  case TOKEN_EOF:
+    return "eof";
+  case TOKEN_ERROR:
+    return "error";
+  default:
+    return "unknown";
   }
 }
-
-
diff --git a/src/lexer.h b/src/lexer.h
index 5be3edd..55c2c02 100644
--- a/src/lexer.h
+++ b/src/lexer.h
@@ -1,5 +1,5 @@
-#ifndef ZRL_VM_H
-#define ZRL_VM_H
+#ifndef ZRL_LEXER_H
+#define ZRL_LEXER_H
 
 #include
 #include
@@ -11,6 +11,10 @@ typedef enum {
   TOKEN_INT_LITERAL,
   TOKEN_FLOAT_LITERAL,
   TOKEN_STRING_LITERAL,
+  TOKEN_TYPE_INT,
+  TOKEN_TYPE_NAT,
+  TOKEN_TYPE_REAL,
+  TOKEN_TYPE_STR,
   TOKEN_KEYWORD_TYPE,
   TOKEN_KEYWORD_FN,
   TOKEN_KEYWORD_LET,
@@ -21,10 +25,17 @@ typedef enum {
   TOKEN_KEYWORD_FOR,
   TOKEN_KEYWORD_RETURN,
   TOKEN_KEYWORD_USE,
+  TOKEN_KEYWORD_INIT,
+  TOKEN_KEYWORD_THIS,
   TOKEN_OPERATOR_IS,
-  TOKEN_DOUBLE_BANG,
   TOKEN_BANG,
+  TOKEN_BANG_EQ,
   TOKEN_EQ,
+  TOKEN_EQ_EQ,
+  TOKEN_GT,
+  TOKEN_LT,
+  TOKEN_GTE,
+  TOKEN_LTE,
   TOKEN_DOT,
   TOKEN_COMMA,
   TOKEN_COLON,
@@ -60,9 +71,8 @@ typedef struct {
   int line;
 } Lexer;
 
-Lexer lexer;
-
 void init_lexer(const char *source);
+const char *token_type_name(TokenType type);
 Token next_token();
 
 #endif
diff --git a/src/tools/gen_keywords b/src/tools/gen_keywords
deleted file mode 100755
index 97880c7..0000000
Binary files a/src/tools/gen_keywords and /dev/null differ
diff --git a/src/tools/gen_keywords.c b/src/tools/gen_keywords.c
deleted file mode 100644
index 14f26fa..0000000
--- a/src/tools/gen_keywords.c
+++ /dev/null
@@ -1,90 +0,0 @@
-#include
-#include
-
-typedef enum {
-    TOKEN_IDENTIFIER,
-    TOKEN_KEYWORD_TYPE,
-    TOKEN_KEYWORD_FN,
-    TOKEN_KEYWORD_LET,
-    TOKEN_KEYWORD_CONST,
-    TOKEN_KEYWORD_IF,
-    TOKEN_KEYWORD_ELSE,
-    TOKEN_KEYWORD_WHILE,
-    TOKEN_KEYWORD_FOR,
-    TOKEN_KEYWORD_RETURN,
-    TOKEN_KEYWORD_USE,
-    TOKEN_OPERATOR_IS
-} TokenType;
-
-typedef struct {
-    const char *keyword;
-    TokenType token;
-} Keyword;
-
-Keyword keywords[] = {
-    {"type", TOKEN_KEYWORD_TYPE},
-    {"fn", TOKEN_KEYWORD_FN},
-    {"let", TOKEN_KEYWORD_LET},
-    {"const", TOKEN_KEYWORD_CONST},
-    {"if", TOKEN_KEYWORD_IF},
-    {"else", TOKEN_KEYWORD_ELSE},
-    {"while", TOKEN_KEYWORD_WHILE},
-    {"for", TOKEN_KEYWORD_FOR},
-    {"return", TOKEN_KEYWORD_RETURN},
-    {"use", TOKEN_KEYWORD_USE},
-    {"is", TOKEN_OPERATOR_IS},
-};
-
-void emit_keyword_header(FILE *out) {
-    fprintf(out, "#ifndef KEYWORDS_H\n");
-    fprintf(out, "#define KEYWORDS_H\n\n");
-    fprintf(out, "#include \"lexer.h\"\n\n");
-
-    fprintf(out, "static TokenType check_keyword(int start, int length, const char *rest, TokenType type) {\n");
-    fprintf(out, " if ((lexer.current - lexer.start) == start + length &&\n");
-    fprintf(out, " memcmp(lexer.start + start, rest, length) == 0) return type;\n");
-    fprintf(out, " return TOKEN_IDENTIFIER;\n");
-    fprintf(out, "}\n\n");
-
-    fprintf(out, "static TokenType identifier_type(void) {\n");
-    fprintf(out, " switch (lexer.start[0]) {\n");
-
-    for (char ch = 'a'; ch <= 'z'; ++ch) {
-        int printed = 0;
-        for (int i = 0; i < sizeof(keywords) / sizeof(Keyword); ++i) {
-            const char *kw = keywords[i].keyword;
-            if (kw[0] == ch) {
-                if (!printed) {
-                    fprintf(out, " case '%c':\n", ch);
-                    printed = 1;
-                }
-
-                int len = (int)strlen(kw);
-                fprintf(out, " return check_keyword(%d, %d, \"%s\", %s);\n",
-                        1, len - 1, kw + 1,
-                        (keywords[i].token == TOKEN_IDENTIFIER ? "TOKEN_IDENTIFIER" :
-                         keywords[i].token == TOKEN_OPERATOR_IS ? "TOKEN_OPERATOR_IS" :
-                         keywords[i].token == TOKEN_KEYWORD_RETURN ? "TOKEN_KEYWORD_RETURN" :
-                         keywords[i].token == TOKEN_KEYWORD_WHILE ? "TOKEN_KEYWORD_WHILE" :
-                         keywords[i].token == TOKEN_KEYWORD_CONST ? "TOKEN_KEYWORD_CONST" :
-                         keywords[i].token == TOKEN_KEYWORD_TYPE ? "TOKEN_KEYWORD_TYPE" :
-                         keywords[i].token == TOKEN_KEYWORD_FN ? "TOKEN_KEYWORD_FN" :
-                         keywords[i].token == TOKEN_KEYWORD_IF ? "TOKEN_KEYWORD_IF" :
-                         keywords[i].token == TOKEN_KEYWORD_FOR ? "TOKEN_KEYWORD_FOR" :
-                         keywords[i].token == TOKEN_KEYWORD_LET ? "TOKEN_KEYWORD_LET" :
-                         keywords[i].token == TOKEN_KEYWORD_ELSE ? "TOKEN_KEYWORD_ELSE" :
-                         keywords[i].token == TOKEN_KEYWORD_USE ? "TOKEN_KEYWORD_USE" : "TOKEN_IDENTIFIER"));
-            }
-        }
-    }
-
-    fprintf(out, " }\n return TOKEN_IDENTIFIER;\n");
-    fprintf(out, "}\n\n");
-
-    fprintf(out, "#endif // KEYWORDS_H\n");
-}
-
-int main(void) {
-    emit_keyword_header(stdout);
-    return 0;
-}