Fix lexer: add built-in type and comparison tokens, move keyword matching inline, and remove the generated keywords.h plus its generator from the build.

This commit is contained in:
zongor 2025-07-27 21:07:39 -04:00
parent 88dfbb098d
commit 7e6a063bdb
7 changed files with 260 additions and 256 deletions

View File

@ -12,21 +12,6 @@ CFLAGS_WASM = -g -std=c89 -Wall -Wextra -Werror -Wno-unused-parameter -I.
LDFLAGS_WASM = -s WASM=1 -g -s USE_SDL=2
LDLIBS_WASM =
TOOLS_DIR := tools
GENERATOR := $(TOOLS_DIR)/gen_keywords
GENERATOR_SRC := $(GENERATOR).c
KEYWORDS_H := keywords.h
# Rule to generate keywords.h
$(KEYWORDS_H): $(GENERATOR) $(GENERATOR_SRC)
@echo "Generating keywords.h..."
@$(GENERATOR) > $(KEYWORDS_H)
# Rule to build the generator
$(GENERATOR): $(GENERATOR_SRC)
@echo "Compiling keyword generator..."
@$(CC) -o $@ $<
# Source and build configuration
# ----------------------------
COMMON_SRC = $(wildcard *.c)
@ -45,7 +30,7 @@ OBJ_NATIVE = $(addprefix $(OBJ_DIR_NATIVE)/,$(notdir $(COMMON_SRC:.c=.o)))
OBJ_WASM = $(addprefix $(OBJ_DIR_WASM)/,$(notdir $(COMMON_SRC:.c=.o)))
# Phony targets
.PHONY: all clean clean_generated install wasm native emscripten linux macos
.PHONY: all clean install wasm native emscripten linux macos
# Default target builds the native version
all: native
@ -80,14 +65,9 @@ $(OBJ_DIR_WASM)/%.o: %.c
# Clean build artifacts
# ---------------------
clean: clean-generated
clean:
rm -rf $(OBJ_DIR_NATIVE) $(OBJ_DIR_WASM) $(EXEC_NATIVE) $(EXEC_WASM)
# Clean rule for deleting generated keyword binary and header
clean-generated:
@echo "Removing generated files..."
@rm -f $(KEYWORDS_H) $(GENERATOR)
# Install target (example)
# ------------------------
install: native

View File

@ -1,6 +1,7 @@
#include "../../vm.h"
#include "../../debug.h"
#include "../../test.h"
#include "../../vm.h"
#include "../../lexer.h"
#include <SDL2/SDL.h>
int main(int argc, char **argv) {
@ -10,6 +11,37 @@ int main(int argc, char **argv) {
vm.stack_size = STACK_SIZE;
vm.memory_size = MEMORY_SIZE;
if (argc < 2) {
fprintf(stderr, "Usage: %s <file.zrl>\n", argv[0]);
return 1;
}
FILE *f = fopen(argv[1], "rb");
if (!f) {
perror("fopen");
return 1;
}
fseek(f, 0, SEEK_END);
long len = ftell(f);
fseek(f, 0, SEEK_SET);
char *source = (char *)malloc(len + 1);
fread(source, 1, len, f);
source[len] = '\0';
fclose(f);
init_lexer(source);
for (;;) {
Token token = next_token();
printf("[%d] %-18s: '%.*s'\n", token.line, token_type_name(token.type), token.length, token.start);
if (token.type == TOKEN_EOF) break;
}
free(source);
return 0;
test_hello_world_compile(&vm);
/* test_add_compile(&vm); */
/* test_add_function_compile(&vm); */

View File

@ -1,38 +0,0 @@
#ifndef KEYWORDS_H
#define KEYWORDS_H
#include "lexer.h"
/* Compare the tail of the current token (starting `start` chars in)
 * against `rest`.  Returns `type` on an exact match of both length and
 * bytes, otherwise TOKEN_IDENTIFIER. */
static TokenType check_keyword(int start, int length, const char *rest, TokenType type) {
  int token_len = (int)(lexer.current - lexer.start);
  if (token_len != start + length)
    return TOKEN_IDENTIFIER;
  if (memcmp(lexer.start + start, rest, (size_t)length) != 0)
    return TOKEN_IDENTIFIER;
  return type;
}
/* Classify the identifier spanned by [lexer.start, lexer.current) as a
 * keyword token, or TOKEN_IDENTIFIER if it matches none.
 * BUG FIX: the 'f' and 'i' cases previously contained two consecutive
 * `return check_keyword(...)` statements, so the second keyword of each
 * pair ("for", "is") was unreachable and always lexed as a plain
 * identifier.  Keywords sharing a first letter now dispatch on the
 * second character. */
static TokenType identifier_type(void) {
  switch (lexer.start[0]) {
    case 'c':
      return check_keyword(1, 4, "onst", TOKEN_KEYWORD_CONST);
    case 'e':
      return check_keyword(1, 3, "lse", TOKEN_KEYWORD_ELSE);
    case 'f':
      /* "fn" vs "for": pick the branch by the second character. */
      if (lexer.current - lexer.start > 1) {
        switch (lexer.start[1]) {
          case 'n': return check_keyword(1, 1, "n", TOKEN_KEYWORD_FN);
          case 'o': return check_keyword(1, 2, "or", TOKEN_KEYWORD_FOR);
        }
      }
      break;
    case 'i':
      /* "if" vs "is": pick the branch by the second character. */
      if (lexer.current - lexer.start > 1) {
        switch (lexer.start[1]) {
          case 'f': return check_keyword(1, 1, "f", TOKEN_KEYWORD_IF);
          case 's': return check_keyword(1, 1, "s", TOKEN_OPERATOR_IS);
        }
      }
      break;
    case 'l':
      return check_keyword(1, 2, "et", TOKEN_KEYWORD_LET);
    case 'r':
      return check_keyword(1, 5, "eturn", TOKEN_KEYWORD_RETURN);
    case 't':
      return check_keyword(1, 3, "ype", TOKEN_KEYWORD_TYPE);
    case 'u':
      return check_keyword(1, 2, "se", TOKEN_KEYWORD_USE);
    case 'w':
      return check_keyword(1, 4, "hile", TOKEN_KEYWORD_WHILE);
  }
  return TOKEN_IDENTIFIER;
}
#endif // KEYWORDS_H

View File

@ -1,4 +1,6 @@
#include "keywords.h"
#include "lexer.h"
Lexer lexer;
void init_lexer(const char *source) {
lexer.start = source;
@ -6,25 +8,21 @@ void init_lexer(const char *source) {
lexer.line = 1;
}
int is_at_end() {
return *lexer.current == '\0';
}
int is_at_end() { return *lexer.current == '\0'; }
char advance() {
return *lexer.current++;
}
char advance() { return *lexer.current++; }
char peek() {
return *lexer.current;
}
char peek() { return *lexer.current; }
char peek_next() {
if (is_at_end()) return '\0';
if (is_at_end())
return '\0';
return lexer.current[1];
}
int match(char expected) {
if (*lexer.current != expected) return 0;
if (*lexer.current != expected)
return 0;
lexer.current++;
return 1;
}
@ -44,9 +42,11 @@ void skip_whitespace() {
break;
case '!':
if (peek_next() == '!') {
while (peek() != '\n' && !is_at_end()) advance();
while (peek() != '\n' && !is_at_end())
advance();
} else {
while (peek() != '\n' && !is_at_end()) advance();
while (peek() != '\n' && !is_at_end())
advance();
}
break;
default:
@ -73,20 +73,18 @@ Token error_token(const char *message) {
return token;
}
int is_alpha(char c) {
return isalpha(c) || c == '_';
}
int is_alpha(char c) { return isalpha(c) || c == '_'; }
int is_digit(char c) {
return isdigit(c);
}
int is_digit(char c) { return isdigit(c); }
Token number() {
while (is_digit(peek())) advance();
while (is_digit(peek()))
advance();
if (peek() == '.' && is_digit(peek_next())) {
advance();
while (is_digit(peek())) advance();
while (is_digit(peek()))
advance();
return make_token(TOKEN_FLOAT_LITERAL);
}
@ -95,47 +93,116 @@ Token number() {
Token string() {
while (peek() != '"' && !is_at_end()) {
if (peek() == '\n') lexer.line++;
if (peek() == '\n')
lexer.line++;
advance();
}
if (is_at_end()) return error_token("Unterminated string.");
if (is_at_end())
return error_token("Unterminated string.");
advance(); // Consume closing quote
advance();
return make_token(TOKEN_STRING_LITERAL);
}
Token identifier() {
while (is_alpha(peek()) || is_digit(peek()))
advance();
int length = (int)(lexer.current - lexer.start);
const char *text = lexer.start;
if (length == 4 && strncmp(text, "init", 4) == 0)
return make_token(TOKEN_KEYWORD_INIT);
if (length == 4 && strncmp(text, "this", 4) == 0)
return make_token(TOKEN_KEYWORD_THIS);
if (length == 4 && strncmp(text, "type", 4) == 0)
return make_token(TOKEN_KEYWORD_TYPE);
if (length == 2 && strncmp(text, "fn", 2) == 0)
return make_token(TOKEN_KEYWORD_FN);
if (length == 3 && strncmp(text, "let", 3) == 0)
return make_token(TOKEN_KEYWORD_LET);
if (length == 5 && strncmp(text, "const", 5) == 0)
return make_token(TOKEN_KEYWORD_CONST);
if (length == 2 && strncmp(text, "if", 2) == 0)
return make_token(TOKEN_KEYWORD_IF);
if (length == 4 && strncmp(text, "else", 4) == 0)
return make_token(TOKEN_KEYWORD_ELSE);
if (length == 5 && strncmp(text, "while", 5) == 0)
return make_token(TOKEN_KEYWORD_WHILE);
if (length == 3 && strncmp(text, "for", 3) == 0)
return make_token(TOKEN_KEYWORD_FOR);
if (length == 6 && strncmp(text, "return", 6) == 0)
return make_token(TOKEN_KEYWORD_RETURN);
if (length == 3 && strncmp(text, "use", 3) == 0)
return make_token(TOKEN_KEYWORD_USE);
if (length == 2 && strncmp(text, "is", 2) == 0)
return make_token(TOKEN_OPERATOR_IS);
if (length == 3 && strncmp(text, "int", 3) == 0)
return make_token(TOKEN_TYPE_INT);
if (length == 3 && strncmp(text, "nat", 3) == 0)
return make_token(TOKEN_TYPE_NAT);
if (length == 3 && strncmp(text, "str", 3) == 0)
return make_token(TOKEN_TYPE_STR);
if (length == 3 && strncmp(text, "real", 4) == 0)
return make_token(TOKEN_TYPE_REAL);
return make_token(TOKEN_IDENTIFIER);
}
Token next_token() {
skip_whitespace();
lexer.start = lexer.current;
if (is_at_end()) return make_token(TOKEN_EOF);
if (is_at_end())
return make_token(TOKEN_EOF);
char c = advance();
if (is_alpha(c)) return identifier();
if (is_digit(c)) return number();
if (is_alpha(c))
return identifier();
if (is_digit(c))
return number();
switch (c) {
case '(': return make_token(TOKEN_LPAREN);
case ')': return make_token(TOKEN_RPAREN);
case '{': return make_token(TOKEN_LBRACE);
case '}': return make_token(TOKEN_RBRACE);
case '[': return make_token(TOKEN_LBRACKET);
case ']': return make_token(TOKEN_RBRACKET);
case ',': return make_token(TOKEN_COMMA);
case '.': return make_token(TOKEN_DOT);
case ':': return make_token(TOKEN_COLON);
case ';': return make_token(TOKEN_SEMICOLON);
case '+': return make_token(TOKEN_PLUS);
case '-': return make_token(TOKEN_MINUS);
case '*': return make_token(TOKEN_STAR);
case '/': return make_token(TOKEN_SLASH);
case '=': return make_token(TOKEN_EQ);
case '"': return string();
case '(':
return make_token(TOKEN_LPAREN);
case ')':
return make_token(TOKEN_RPAREN);
case '{':
return make_token(TOKEN_LBRACE);
case '}':
return make_token(TOKEN_RBRACE);
case '[':
return make_token(TOKEN_LBRACKET);
case ']':
return make_token(TOKEN_RBRACKET);
case ',':
return make_token(TOKEN_COMMA);
case '.':
return make_token(TOKEN_DOT);
case ':':
return make_token(TOKEN_COLON);
case ';':
return make_token(TOKEN_SEMICOLON);
case '+':
return make_token(TOKEN_PLUS);
case '-':
return make_token(TOKEN_MINUS);
case '*':
return make_token(TOKEN_STAR);
case '/':
return make_token(TOKEN_SLASH);
case '!':
if (match('!')) return make_token(TOKEN_DOUBLE_BANG);
return make_token(TOKEN_BANG);
return make_token(match('=') ? TOKEN_BANG_EQ : TOKEN_BANG);
case '=':
return make_token(match('=') ? TOKEN_EQ_EQ : TOKEN_EQ);
case '<':
return make_token(match('=') ? TOKEN_LTE : TOKEN_LT);
case '>':
return make_token(match('=') ? TOKEN_GTE : TOKEN_GT);
case '"':
return string();
}
return error_token("Unexpected character.");
@ -143,42 +210,85 @@ Token next_token() {
const char *token_type_name(TokenType type) {
switch (type) {
case TOKEN_IDENTIFIER: return "identifier";
case TOKEN_INT_LITERAL: return "int";
case TOKEN_FLOAT_LITERAL: return "float";
case TOKEN_STRING_LITERAL: return "string";
case TOKEN_KEYWORD_TYPE: return "type";
case TOKEN_KEYWORD_FN: return "fn";
case TOKEN_KEYWORD_LET: return "let";
case TOKEN_KEYWORD_CONST: return "const";
case TOKEN_KEYWORD_IF: return "if";
case TOKEN_KEYWORD_ELSE: return "else";
case TOKEN_KEYWORD_WHILE: return "while";
case TOKEN_KEYWORD_FOR: return "for";
case TOKEN_KEYWORD_RETURN: return "return";
case TOKEN_KEYWORD_USE: return "use";
case TOKEN_OPERATOR_IS: return "is";
case TOKEN_BANG: return "!";
case TOKEN_DOUBLE_BANG: return "!!";
case TOKEN_EQ: return "=";
case TOKEN_DOT: return ".";
case TOKEN_COMMA: return ",";
case TOKEN_COLON: return ":";
case TOKEN_SEMICOLON: return ";";
case TOKEN_PLUS: return "+";
case TOKEN_MINUS: return "-";
case TOKEN_STAR: return "*";
case TOKEN_SLASH: return "/";
case TOKEN_LPAREN: return "(";
case TOKEN_RPAREN: return ")";
case TOKEN_LBRACE: return "{";
case TOKEN_RBRACE: return "}";
case TOKEN_LBRACKET: return "[";
case TOKEN_RBRACKET: return "]";
case TOKEN_EOF: return "eof";
case TOKEN_ERROR: return "error";
default: return "unknown";
case TOKEN_IDENTIFIER:
return "identifier";
case TOKEN_INT_LITERAL:
return "int literal";
case TOKEN_FLOAT_LITERAL:
return "real literal";
case TOKEN_STRING_LITERAL:
return "string literal";
case TOKEN_TYPE_INT:
return "int";
case TOKEN_TYPE_REAL:
return "real";
case TOKEN_TYPE_STR:
return "str";
case TOKEN_TYPE_NAT:
return "nat";
case TOKEN_KEYWORD_THIS:
return "this";
case TOKEN_KEYWORD_TYPE:
return "type";
case TOKEN_KEYWORD_FN:
return "fn";
case TOKEN_KEYWORD_LET:
return "let";
case TOKEN_KEYWORD_CONST:
return "const";
case TOKEN_KEYWORD_IF:
return "if";
case TOKEN_KEYWORD_ELSE:
return "else";
case TOKEN_KEYWORD_WHILE:
return "while";
case TOKEN_KEYWORD_FOR:
return "for";
case TOKEN_KEYWORD_RETURN:
return "return";
case TOKEN_KEYWORD_INIT:
return "init";
case TOKEN_KEYWORD_USE:
return "use";
case TOKEN_OPERATOR_IS:
return "is";
case TOKEN_BANG:
return "!";
case TOKEN_EQ:
return "=";
case TOKEN_DOT:
return ".";
case TOKEN_COMMA:
return ",";
case TOKEN_COLON:
return ":";
case TOKEN_SEMICOLON:
return ";";
case TOKEN_PLUS:
return "+";
case TOKEN_MINUS:
return "-";
case TOKEN_STAR:
return "*";
case TOKEN_SLASH:
return "/";
case TOKEN_LPAREN:
return "(";
case TOKEN_RPAREN:
return ")";
case TOKEN_LBRACE:
return "{";
case TOKEN_RBRACE:
return "}";
case TOKEN_LBRACKET:
return "[";
case TOKEN_RBRACKET:
return "]";
case TOKEN_EOF:
return "eof";
case TOKEN_ERROR:
return "error";
default:
return "unknown";
}
}

View File

@ -1,5 +1,5 @@
#ifndef ZRL_VM_H
#define ZRL_VM_H
#ifndef ZRL_LEXER_H
#define ZRL_LEXER_H
#include <stdio.h>
#include <string.h>
@ -11,6 +11,10 @@ typedef enum {
TOKEN_INT_LITERAL,
TOKEN_FLOAT_LITERAL,
TOKEN_STRING_LITERAL,
TOKEN_TYPE_INT,
TOKEN_TYPE_NAT,
TOKEN_TYPE_REAL,
TOKEN_TYPE_STR,
TOKEN_KEYWORD_TYPE,
TOKEN_KEYWORD_FN,
TOKEN_KEYWORD_LET,
@ -21,10 +25,17 @@ typedef enum {
TOKEN_KEYWORD_FOR,
TOKEN_KEYWORD_RETURN,
TOKEN_KEYWORD_USE,
TOKEN_KEYWORD_INIT,
TOKEN_KEYWORD_THIS,
TOKEN_OPERATOR_IS,
TOKEN_DOUBLE_BANG,
TOKEN_BANG,
TOKEN_BANG_EQ,
TOKEN_EQ,
TOKEN_EQ_EQ,
TOKEN_GT,
TOKEN_LT,
TOKEN_GTE,
TOKEN_LTE,
TOKEN_DOT,
TOKEN_COMMA,
TOKEN_COLON,
@ -60,9 +71,8 @@ typedef struct {
int line;
} Lexer;
Lexer lexer;
void init_lexer(const char *source);
const char *token_type_name(TokenType type);
Token next_token();
#endif

Binary file not shown.

View File

@ -1,90 +0,0 @@
#include <stdio.h>
#include <string.h>
/* Token kinds the generator knows how to name.  This is a trimmed,
 * standalone copy of the lexer's TokenType enum containing only the
 * keyword-related members; it must stay in sync with lexer.h
 * (enumerator names are emitted verbatim into the generated header). */
typedef enum {
TOKEN_IDENTIFIER,
TOKEN_KEYWORD_TYPE,
TOKEN_KEYWORD_FN,
TOKEN_KEYWORD_LET,
TOKEN_KEYWORD_CONST,
TOKEN_KEYWORD_IF,
TOKEN_KEYWORD_ELSE,
TOKEN_KEYWORD_WHILE,
TOKEN_KEYWORD_FOR,
TOKEN_KEYWORD_RETURN,
TOKEN_KEYWORD_USE,
TOKEN_OPERATOR_IS
} TokenType;
/* One table entry: the keyword's spelling and the token it lexes to. */
typedef struct {
const char *keyword;
TokenType token;
} Keyword;
/* Master keyword list driving code generation.  Adding an entry here
 * (plus its enumerator above) adds it to the generated identifier_type(). */
Keyword keywords[] = {
{"type", TOKEN_KEYWORD_TYPE},
{"fn", TOKEN_KEYWORD_FN},
{"let", TOKEN_KEYWORD_LET},
{"const", TOKEN_KEYWORD_CONST},
{"if", TOKEN_KEYWORD_IF},
{"else", TOKEN_KEYWORD_ELSE},
{"while", TOKEN_KEYWORD_WHILE},
{"for", TOKEN_KEYWORD_FOR},
{"return", TOKEN_KEYWORD_RETURN},
{"use", TOKEN_KEYWORD_USE},
{"is", TOKEN_OPERATOR_IS},
};
void emit_keyword_header(FILE *out) {
fprintf(out, "#ifndef KEYWORDS_H\n");
fprintf(out, "#define KEYWORDS_H\n\n");
fprintf(out, "#include \"lexer.h\"\n\n");
fprintf(out, "static TokenType check_keyword(int start, int length, const char *rest, TokenType type) {\n");
fprintf(out, " if ((lexer.current - lexer.start) == start + length &&\n");
fprintf(out, " memcmp(lexer.start + start, rest, length) == 0) return type;\n");
fprintf(out, " return TOKEN_IDENTIFIER;\n");
fprintf(out, "}\n\n");
fprintf(out, "static TokenType identifier_type(void) {\n");
fprintf(out, " switch (lexer.start[0]) {\n");
for (char ch = 'a'; ch <= 'z'; ++ch) {
int printed = 0;
for (int i = 0; i < sizeof(keywords) / sizeof(Keyword); ++i) {
const char *kw = keywords[i].keyword;
if (kw[0] == ch) {
if (!printed) {
fprintf(out, " case '%c':\n", ch);
printed = 1;
}
int len = (int)strlen(kw);
fprintf(out, " return check_keyword(%d, %d, \"%s\", %s);\n",
1, len - 1, kw + 1,
(keywords[i].token == TOKEN_IDENTIFIER ? "TOKEN_IDENTIFIER" :
keywords[i].token == TOKEN_OPERATOR_IS ? "TOKEN_OPERATOR_IS" :
keywords[i].token == TOKEN_KEYWORD_RETURN ? "TOKEN_KEYWORD_RETURN" :
keywords[i].token == TOKEN_KEYWORD_WHILE ? "TOKEN_KEYWORD_WHILE" :
keywords[i].token == TOKEN_KEYWORD_CONST ? "TOKEN_KEYWORD_CONST" :
keywords[i].token == TOKEN_KEYWORD_TYPE ? "TOKEN_KEYWORD_TYPE" :
keywords[i].token == TOKEN_KEYWORD_FN ? "TOKEN_KEYWORD_FN" :
keywords[i].token == TOKEN_KEYWORD_IF ? "TOKEN_KEYWORD_IF" :
keywords[i].token == TOKEN_KEYWORD_FOR ? "TOKEN_KEYWORD_FOR" :
keywords[i].token == TOKEN_KEYWORD_LET ? "TOKEN_KEYWORD_LET" :
keywords[i].token == TOKEN_KEYWORD_ELSE ? "TOKEN_KEYWORD_ELSE" :
keywords[i].token == TOKEN_KEYWORD_USE ? "TOKEN_KEYWORD_USE" : "TOKEN_IDENTIFIER"));
}
}
}
fprintf(out, " }\n return TOKEN_IDENTIFIER;\n");
fprintf(out, "}\n\n");
fprintf(out, "#endif // KEYWORDS_H\n");
}
/* Entry point: write the generated keywords.h to stdout so the build
 * can redirect it into a file. */
int main(void) {
  FILE *dest = stdout;
  emit_keyword_header(dest);
  return 0;
}