Add initial lexer back again, fix docs

zongor 2025-10-18 18:15:54 -07:00
parent 6c1bf1ff8c
commit 790d7e8509
5 changed files with 502 additions and 37 deletions


@@ -86,11 +86,13 @@ ifeq ($(BUILD_MODE), release)
 PLATFORM_SOURCE := $(ARCH_DIR)/main.c \
 	$(ARCH_DIR)/devices.c\
 	$(SRC_DIR)/tools/parser.c \
+	$(SRC_DIR)/tools/lexer.c \
 	$(SRC_DIR)/tools/assembler.c
 else
 PLATFORM_SOURCE := $(ARCH_DIR)/main.c \
 	$(ARCH_DIR)/devices.c \
 	$(SRC_DIR)/tools/parser.c \
+	$(SRC_DIR)/tools/lexer.c \
 	$(SRC_DIR)/tools/assembler.c
 endif


@@ -58,14 +58,26 @@ The Undâr compiler will be written in Sċieppan, as well as core VM tests.
 #+BEGIN_SRC lisp
 ((code
   (label main
-   (load-immediate $0 &terminal-namespace) ; load terminal namespace
-   (load-immediate $1 &hello-str) ; load hello string ptr
-   (string-length $2 $1) ; get length to write to stdout
-   (syscall WRITE $0 $1 $2) ; do the write syscall
-   (halt))) ; done
+   (load-immediate $1 &hello-str) ; load hello string ptr
+   (push $1)
+   (call &pln)
+   (halt)) ; done
+  (label pln
+   (load-immediate $0 &terminal-namespace) ; get terminal device
+   (load-immediate $11 0)
+   (syscall OPEN $0 $0 $11)
+   (load-immediate $3 &new-line)
+   (pop $1)
+   (load-offset-32 $7 $0 4) ; load handle
+   (string-length $2 $1)
+   (syscall WRITE $7 $1 $2)
+   (string-length $4 $3)
+   (syscall WRITE $7 $3 $4)
+   (return)))
  (data
   (label terminal-namespace "/dev/term/0")
-  (label hello-str "nuqneH 'u'?\n")))
+  (label new-line "\n")
+  (label hello-str "nuqneH 'u'?")))
 #+END_SRC

 #+BEGIN_SRC sh
@@ -81,37 +93,39 @@ memory is managed via frame based arenas. function scopes defines a memory frame
 heap allocations using the internal malloc opcode push pointers within this frame. when a frame exits, the pointer is reset like stack based gc.
 #+BEGIN_SRC lisp
 ((code
-  (label main ; this example adds 2 numbers together
-   (load-immediate $0 1) ; pushes 1 onto the stack for the function call
-   (push $0)
-   (load-immediate $0 1)
-   (push $0)
-   (call &add) ; here a new frame is generated
-   (pop $0) ; the element is returned and the memory for the pln is "freed" automatically because the child frame is done
-   (halt))
-  (label add
-   (pop $0)
-   (pop $1)
-   (add-int $2 $1 $0) ; add the arguments
-   (int-to-string $3 $2) ; convert to a string (heap allocation)
-   (push $3)
-   (call &pln) ; call print function
-   (push $2)
-   (return)) ; return to main function
-  (label pln
-   (load-immediate $0 &terminal-namespace) ; load the namespace for the terminal
-   (load-immediate $3 &new-line) ; and a newline char
-   (pop $1) ; pointer to string
-   (string-length $2 $1) ; get the length
-   (syscall WRITE $0 $1 $2) ; write the string
+  (label main
+   (load-immediate $0 &terminal-namespace) ; get terminal device
+   (load-immediate $11 0)
+   (syscall OPEN $0 $0 $11)
+   (load-immediate $1 &help) ; print help message
+   (push $0)
+   (push $1)
+   (call &pln)
+   (load-immediate $1 32) ; read in a string of max 32 char length
+   (malloc $4 $1) ; allocate memory for the string
+   (load-offset-32 $7 $0 4) ; load handle
+   (syscall READ $7 $2 $1 $4) ; read the string
+   (push $0)
+   (push $4)
+   (call &pln) ; print the string
+   (halt))
+  (label pln
+   (load-immediate $3 &new-line)
+   (pop $1)
+   (pop $0)
+   (load-offset-32 $7 $0 4) ; load handle
+   (string-length $2 $1)
+   (syscall WRITE $7 $1 $2)
    (string-length $4 $3)
-   (syscall WRITE $0 $3 $4)
-   (return))) ; return back to add function
- (data ; allocates strings at compile time
+   (syscall WRITE $7 $3 $4)
+   (return)))
+ (data
   (label terminal-namespace "/dev/term/0")
+  (label help "Enter a string: ")
   (label new-line "\n")))
 #+END_SRC
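The frame-based arena scheme described above is only stated in prose in the docs. The following is a minimal C sketch of the idea, under stated assumptions: the names Arena, FrameMark, frame_enter, frame_exit, and arena_alloc are hypothetical and do not appear in the VM source; they only illustrate how a per-frame bump pointer can be saved on call entry and rewound on return, which is the "reset like stack based gc" behaviour.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical sketch of a frame-based arena, not the VM's implementation. */
typedef struct {
  uint8_t *base;   /* backing memory for the arena */
  size_t capacity; /* total size of the backing memory */
  size_t offset;   /* bump pointer: next free byte */
} Arena;

typedef size_t FrameMark;

/* Entering a function frame just records the current bump offset. */
FrameMark frame_enter(const Arena *a) { return a->offset; }

/* Leaving the frame rewinds the offset: everything allocated inside the
 * frame is reclaimed at once, with no per-allocation free. */
void frame_exit(Arena *a, FrameMark mark) { a->offset = mark; }

/* An internal malloc would bump the offset within the current frame. */
void *arena_alloc(Arena *a, size_t size) {
  size = (size + 7u) & ~(size_t)7u; /* keep allocations 8-byte aligned */
  if (a->offset + size > a->capacity)
    return NULL;
  void *p = a->base + a->offset;
  a->offset += size;
  return p;
}

/* usage: FrameMark m = frame_enter(&a); ... arena_alloc(&a, n); ... frame_exit(&a, m); */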


@@ -1,5 +1,6 @@
 #include "../../tools/assembler.h"
 #include "../../tools/parser.h"
+#include "../../tools/lexer.h"
 #include "../../vm/vm.h"
 #include "devices.h"
 #include <SDL2/SDL.h>
@@ -122,6 +123,50 @@ bool loadVM(const char *filename, VM *vm) {
 // Function to compile and optionally save
 bool compileAndSave(const char *source_file, const char *output_file, VM *vm) {
+  USED(vm);
+  USED(output_file);
+
+  FILE *f = fopen(source_file, "rb");
+  if (!f) {
+    perror("fopen");
+    return false;
+  }
+
+  static char source[MAX_SRC_SIZE + 1];
+  fseek(f, 0, SEEK_END);
+  long len = ftell(f);
+  fseek(f, 0, SEEK_SET);
+  if (len >= MAX_SRC_SIZE) {
+    fprintf(stderr, "Source is larger than buffer\n");
+    fclose(f);
+    return false;
+  }
+  size_t read = fread(source, 1, len, f);
+  source[read] = '\0';
+  fclose(f);
+
+  initLexer(source);
+  Token token;
+  do {
+    token = nextToken();
+    if (token.type == TOKEN_ERROR) {
+      printf("ERROR at line %d: %.*s\n", token.line, token.length, token.start);
+      break; // Stop on error, or continue if you want to see more
+    }
+    if (token.type != TOKEN_EOF) {
+      printf("Line %d [%s]: %.*s\n",
+             token.line,
+             tokenTypeToString(token.type),
+             token.length,
+             token.start);
+    }
+  } while (token.type != TOKEN_EOF);
+
+  return true;
+}
+
+// Function to assemble and optionally save
+bool assembleAndSave(const char *source_file, const char *output_file, VM *vm) {
   FILE *f = fopen(source_file, "rb");
   if (!f) {
     perror("fopen");
@@ -377,7 +422,7 @@ i32 main(i32 argc, char *argv[]) {
   bool dump_rom = false;
   char *input_file = nil;
   char *output_file = nil;
-  bool is_rom = false;
+  bool is_rom = false, is_assembly = false;

   // Parse command line arguments
   for (i32 i = 1; i < argc; i++) {
@@ -395,6 +440,9 @@ i32 main(i32 argc, char *argv[]) {
       if (ext && (strcmp(ext, ".rom") == 0)) {
         is_rom = true;
       }
+      if (ext && (strcmp(ext, ".asm.lisp") == 0)) {
+        is_assembly = true;
+      }
     } else if (output_file == nil && dump_rom) {
       // This is the output file for -o flag
       output_file = argv[i];
@@ -408,8 +456,14 @@ i32 main(i32 argc, char *argv[]) {
   if (is_rom) {
     // Load ROM file directly
     compilation_success = loadVM(input_file, &vm);
-  } else {
+  } else if (is_assembly) {
     // Compile Lisp file
+    if (dump_rom && output_file) {
+      compilation_success = assembleAndSave(input_file, output_file, &vm);
+    } else {
+      compilation_success = assembleAndSave(input_file, nil, &vm);
+    }
+  } else {
     if (dump_rom && output_file) {
       compilation_success = compileAndSave(input_file, output_file, &vm);
     } else {
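For reference, the new compileAndSave path is only a token dump for now; nothing is compiled yet. Below is a minimal standalone sketch of the same lexing loop, usable as a quick smoke test. The hard-coded input string, the bare include of lexer.h, and the expected-output comment are illustrative assumptions, not part of the commit.

#include <stdio.h>
#include "lexer.h" /* assumption: built alongside src/tools/lexer.c */

int main(void) {
  initLexer("let x = 4 / 2; // halved");

  Token token;
  do {
    token = nextToken();
    if (token.type == TOKEN_ERROR) {
      printf("ERROR at line %d: %.*s\n", token.line, token.length, token.start);
      break;
    }
    if (token.type != TOKEN_EOF)
      printf("Line %d [%s]: %.*s\n", token.line, tokenTypeToString(token.type),
             token.length, token.start);
  } while (token.type != TOKEN_EOF);
  return 0;
}

/* Expected output:
 *   Line 1 [KEYWORD_LET]: let
 *   Line 1 [IDENTIFIER]: x
 *   Line 1 [EQ]: =
 *   Line 1 [INT_LITERAL]: 4
 *   Line 1 [SLASH]: /
 *   Line 1 [INT_LITERAL]: 2
 *   Line 1 [SEMICOLON]: ;
 */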

src/tools/lexer.c (new file, 324 lines)

@@ -0,0 +1,324 @@
#include <string.h>
#include "../vm/common.h"
#include "lexer.h"
typedef struct {
const char *start;
const char *current;
int line;
} Lexer;
Lexer lexer;
void initLexer(const char *source) {
lexer.start = source;
lexer.current = source;
lexer.line = 1;
}
static bool isAlpha(char c) {
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
}
static bool isDigit(char c) { return c >= '0' && c <= '9'; }
static bool isAtEnd() { return *lexer.current == '\0'; }
static char advance() {
lexer.current++;
return lexer.current[-1];
}
static char peek() { return *lexer.current; }
static char peekNext() {
if (isAtEnd())
return '\0';
return lexer.current[1];
}
static bool match(char expected) {
if (isAtEnd())
return false;
if (*lexer.current != expected)
return false;
lexer.current++;
return true;
}
static Token makeToken(TokenType type) {
Token token;
token.type = type;
token.start = lexer.start;
token.length = (int)(lexer.current - lexer.start);
token.line = lexer.line;
return token;
}
static Token errorToken(const char *message) {
Token token;
token.type = TOKEN_ERROR;
token.start = message;
token.length = (int)strlen(message);
token.line = lexer.line;
return token;
}
static void skipWhitespace() {
for (;;) {
char c = peek();
switch (c) {
case ' ':
case '\r':
case '\t':
advance();
break;
case '\n':
lexer.line++;
advance();
break;
case '/':
if (peekNext() == '/') {
// Single-line comment: skip until newline or end of file
advance();
while (peek() != '\n' && !isAtEnd())
advance();
} else if (peekNext() == '*') {
// Multi-line comment: skip until '*/' or end of file
advance();
advance();
while (!isAtEnd()) {
if (peek() == '*' && peekNext() == '/') {
advance();
advance();
break; // Exit loop, comment ended
}
advance();
}
} else {
return; // Not a comment, let tokenization handle it
}
break;
default:
return;
}
}
}
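/* Keyword recognition below is a small hand-rolled trie: identifierType()
 * branches on the first character or two of the lexeme and checkKeyword()
 * finishes the comparison with one length check plus memcmp, falling back
 * to TOKEN_IDENTIFIER on any mismatch. */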
static TokenType checkKeyword(int start, int length, const char *rest,
TokenType type) {
if (lexer.current - lexer.start == start + length &&
memcmp(lexer.start + start, rest, length) == 0) {
return type;
}
return TOKEN_IDENTIFIER;
}
static TokenType identifierType() {
switch (lexer.start[0]) {
case 'a':
return checkKeyword(1, 2, "nd", TOKEN_OPERATOR_AND);
case 'e':
return checkKeyword(1, 3, "lse", TOKEN_KEYWORD_ELSE);
case 'f':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'a':
return checkKeyword(2, 3, "lse", TOKEN_KEYWORD_FALSE);
case 'o':
return checkKeyword(2, 1, "r", TOKEN_KEYWORD_FOR);
}
return checkKeyword(1, 7, "unction", TOKEN_KEYWORD_FN);
}
break;
case 'i':
return checkKeyword(1, 1, "f", TOKEN_KEYWORD_IF);
case 'n':
return checkKeyword(1, 2, "il", TOKEN_KEYWORD_NIL);
case 'o':
return checkKeyword(1, 1, "r", TOKEN_OPERATOR_OR);
case 'p':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'l':
return checkKeyword(2, 2, "ex", TOKEN_KEYWORD_PLEX);
case 'r':
return checkKeyword(2, 3, "int", TOKEN_KEYWORD_PRINT);
}
}
break;
case 'r':
return checkKeyword(1, 5, "eturn", TOKEN_KEYWORD_RETURN);
case 't':
if (lexer.current - lexer.start > 1) {
switch (lexer.start[1]) {
case 'h':
return checkKeyword(2, 2, "is", TOKEN_KEYWORD_THIS);
case 'r':
return checkKeyword(2, 2, "ue", TOKEN_KEYWORD_TRUE);
}
}
break;
case 'l':
return checkKeyword(1, 2, "et", TOKEN_KEYWORD_LET);
case 'w':
return checkKeyword(1, 4, "hile", TOKEN_KEYWORD_WHILE);
}
return TOKEN_IDENTIFIER;
}
static Token identifier() {
while (isAlpha(peek()) || isDigit(peek()))
advance();
return makeToken(identifierType());
}
static Token number() {
while (isDigit(peek()))
advance();
/* Look for a fractional part. */
if (peek() == '.' && isDigit(peekNext())) {
/* Consume the ".". */
advance();
while (isDigit(peek()))
advance();
return makeToken(TOKEN_FLOAT_LITERAL);
}
return makeToken(TOKEN_INT_LITERAL);
}
static Token string() {
while (peek() != '"' && !isAtEnd()) {
if (peek() == '\n')
lexer.line++;
advance();
}
if (isAtEnd())
return errorToken("Unterminated string.");
/* The closing quote. */
advance();
return makeToken(TOKEN_STRING_LITERAL);
}
Token nextToken() {
skipWhitespace();
lexer.start = lexer.current;
if (isAtEnd())
return makeToken(TOKEN_EOF);
char c = advance();
if (isAlpha(c))
return identifier();
if (isDigit(c))
return number();
switch (c) {
case '(':
return makeToken(TOKEN_LPAREN);
case ')':
return makeToken(TOKEN_RPAREN);
case '{':
return makeToken(TOKEN_LBRACE);
case '}':
return makeToken(TOKEN_RBRACE);
case '[':
return makeToken(TOKEN_LBRACKET);
case ']':
return makeToken(TOKEN_RBRACKET);
case ';':
return makeToken(TOKEN_SEMICOLON);
case ',':
return makeToken(TOKEN_COMMA);
case '.':
return makeToken(TOKEN_DOT);
case '-':
return makeToken(TOKEN_MINUS);
case '+':
return makeToken(TOKEN_PLUS);
case '/':
return makeToken(TOKEN_SLASH);
case '*':
return makeToken(TOKEN_STAR);
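/* Two-character operators: match() consumes the trailing '=' only when it is
 * present, so "!=", "==", "<=" and ">=" lex as single tokens while the bare
 * forms fall through to their one-character token types. */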
case '!':
return makeToken(match('=') ? TOKEN_BANG_EQ : TOKEN_BANG);
case '=':
return makeToken(match('=') ? TOKEN_EQ_EQ : TOKEN_EQ);
case '<':
return makeToken(match('=') ? TOKEN_LTE : TOKEN_LT);
case '>':
return makeToken(match('=') ? TOKEN_GTE : TOKEN_GT);
case '"':
return string();
}
return errorToken("Unexpected character.");
}
const char* tokenTypeToString(TokenType type) {
switch (type) {
case TOKEN_EOF: return "EOF";
case TOKEN_IDENTIFIER: return "IDENTIFIER";
case TOKEN_INT_LITERAL: return "INT_LITERAL";
case TOKEN_UINT_LITERAL: return "UINT_LITERAL";
case TOKEN_FLOAT_LITERAL: return "FLOAT_LITERAL";
case TOKEN_STRING_LITERAL: return "STRING_LITERAL";
case TOKEN_TYPE_INT: return "TYPE_INT";
case TOKEN_TYPE_NAT: return "TYPE_NAT";
case TOKEN_TYPE_REAL: return "TYPE_REAL";
case TOKEN_TYPE_STR: return "TYPE_STR";
case TOKEN_KEYWORD_PLEX: return "KEYWORD_PLEX";
case TOKEN_KEYWORD_FN: return "KEYWORD_FN";
case TOKEN_KEYWORD_LET: return "KEYWORD_LET";
case TOKEN_KEYWORD_CONST: return "KEYWORD_CONST";
case TOKEN_KEYWORD_IF: return "KEYWORD_IF";
case TOKEN_KEYWORD_ELSE: return "KEYWORD_ELSE";
case TOKEN_KEYWORD_WHILE: return "KEYWORD_WHILE";
case TOKEN_KEYWORD_FOR: return "KEYWORD_FOR";
case TOKEN_KEYWORD_RETURN: return "KEYWORD_RETURN";
case TOKEN_KEYWORD_USE: return "KEYWORD_USE";
case TOKEN_KEYWORD_INIT: return "KEYWORD_INIT";
case TOKEN_KEYWORD_THIS: return "KEYWORD_THIS";
case TOKEN_KEYWORD_PRINT: return "KEYWORD_PRINT";
case TOKEN_KEYWORD_NIL: return "KEYWORD_NIL";
case TOKEN_KEYWORD_TRUE: return "KEYWORD_TRUE";
case TOKEN_KEYWORD_FALSE: return "KEYWORD_FALSE";
case TOKEN_OPERATOR_IS: return "OPERATOR_IS";
case TOKEN_OPERATOR_NOT: return "OPERATOR_NOT";
case TOKEN_OPERATOR_AND: return "OPERATOR_AND";
case TOKEN_OPERATOR_OR: return "OPERATOR_OR";
case TOKEN_BANG: return "BANG";
case TOKEN_BANG_EQ: return "BANG_EQ";
case TOKEN_EQ: return "EQ";
case TOKEN_EQ_EQ: return "EQ_EQ";
case TOKEN_GT: return "GT";
case TOKEN_LT: return "LT";
case TOKEN_GTE: return "GTE";
case TOKEN_LTE: return "LTE";
case TOKEN_DOT: return "DOT";
case TOKEN_COMMA: return "COMMA";
case TOKEN_COLON: return "COLON";
case TOKEN_SEMICOLON: return "SEMICOLON";
case TOKEN_PLUS: return "PLUS";
case TOKEN_MINUS: return "MINUS";
case TOKEN_STAR: return "STAR";
case TOKEN_SLASH: return "SLASH";
case TOKEN_LPAREN: return "LPAREN";
case TOKEN_RPAREN: return "RPAREN";
case TOKEN_LBRACE: return "LBRACE";
case TOKEN_RBRACE: return "RBRACE";
case TOKEN_LBRACKET: return "LBRACKET";
case TOKEN_RBRACKET: return "RBRACKET";
case TOKEN_ERROR: return "ERROR";
default: return "UNKNOWN_TOKEN";
}
}

src/tools/lexer.h (new file, 71 lines)

@@ -0,0 +1,71 @@
#ifndef UNDAR_LEXER_H
#define UNDAR_LEXER_H
typedef enum {
TOKEN_EOF,
TOKEN_IDENTIFIER,
TOKEN_INT_LITERAL,
TOKEN_UINT_LITERAL,
TOKEN_FLOAT_LITERAL,
TOKEN_STRING_LITERAL,
TOKEN_TYPE_INT,
TOKEN_TYPE_NAT,
TOKEN_TYPE_REAL,
TOKEN_TYPE_STR,
TOKEN_KEYWORD_PLEX,
TOKEN_KEYWORD_FN,
TOKEN_KEYWORD_LET,
TOKEN_KEYWORD_CONST,
TOKEN_KEYWORD_IF,
TOKEN_KEYWORD_ELSE,
TOKEN_KEYWORD_WHILE,
TOKEN_KEYWORD_FOR,
TOKEN_KEYWORD_RETURN,
TOKEN_KEYWORD_USE,
TOKEN_KEYWORD_INIT,
TOKEN_KEYWORD_THIS,
TOKEN_KEYWORD_PRINT,
TOKEN_KEYWORD_NIL,
TOKEN_KEYWORD_TRUE,
TOKEN_KEYWORD_FALSE,
TOKEN_OPERATOR_IS,
TOKEN_OPERATOR_NOT,
TOKEN_OPERATOR_AND,
TOKEN_OPERATOR_OR,
TOKEN_BANG,
TOKEN_BANG_EQ,
TOKEN_EQ,
TOKEN_EQ_EQ,
TOKEN_GT,
TOKEN_LT,
TOKEN_GTE,
TOKEN_LTE,
TOKEN_DOT,
TOKEN_COMMA,
TOKEN_COLON,
TOKEN_SEMICOLON,
TOKEN_PLUS,
TOKEN_MINUS,
TOKEN_STAR,
TOKEN_SLASH,
TOKEN_LPAREN,
TOKEN_RPAREN,
TOKEN_LBRACE,
TOKEN_RBRACE,
TOKEN_LBRACKET,
TOKEN_RBRACKET,
TOKEN_ERROR
} TokenType;
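/* A Token is a non-owning slice: start points into the buffer passed to
 * initLexer() (or at a static message for TOKEN_ERROR) and length bounds the
 * lexeme, so the source text must outlive any Token still in use. */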
typedef struct {
TokenType type;
const char *start;
int length;
int line;
} Token;
void initLexer(const char *source);
Token nextToken();
const char* tokenTypeToString(TokenType type);
#endif