diff --git a/Makefile b/Makefile index 3c913ca..c5ce8ef 100644 --- a/Makefile +++ b/Makefile @@ -86,11 +86,13 @@ ifeq ($(BUILD_MODE), release) PLATFORM_SOURCE := $(ARCH_DIR)/main.c \ $(ARCH_DIR)/devices.c\ $(SRC_DIR)/tools/parser.c \ + $(SRC_DIR)/tools/lexer.c \ $(SRC_DIR)/tools/assembler.c else PLATFORM_SOURCE := $(ARCH_DIR)/main.c \ $(ARCH_DIR)/devices.c \ $(SRC_DIR)/tools/parser.c \ + $(SRC_DIR)/tools/lexer.c \ $(SRC_DIR)/tools/assembler.c endif diff --git a/README.org b/README.org index 56778d8..f51ebe2 100644 --- a/README.org +++ b/README.org @@ -58,14 +58,26 @@ The Undâr compiler will be written in Sċieppan, as well as core VM tests. #+BEGIN_SRC lisp ((code (label main - (load-immediate $0 &terminal-namespace) ; load terminal namespace - (load-immediate $1 &hello-str) ; load hello string ptr - (string-length $2 $1) ; get length to write to stdout - (syscall WRITE $0 $1 $2) ; do the write syscall - (halt))) ; done + (load-immediate $1 &hello-str) ; load hello string ptr + (push $1) + (call &pln) + (halt)) ; done + (label pln + (load-immediate $0 &terminal-namespace) ; get terminal device + (load-immediate $11 0) + (syscall OPEN $0 $0 $11) + (load-immediate $3 &new-line) + (pop $1) + (load-offset-32 $7 $0 4) ; load handle + (string-length $2 $1) + (syscall WRITE $7 $1 $2) + (string-length $4 $3) + (syscall WRITE $7 $3 $4) + (return))) (data (label terminal-namespace "/dev/term/0") - (label hello-str "nuqneH 'u'?\n"))) + (label new-line "\n") + (label hello-str "nuqneH 'u'?"))) #+END_SRC #+BEGIN_SRC sh @@ -81,37 +93,39 @@ memory is managed via frame based arenas. function scopes defines a memory frame heap allocations using the internal malloc opcode push pointers within this frame. when a frame exits, the pointer is reset like stack based gc. 
#+BEGIN_SRC lisp -((code - (label main ; this example adds 2 numbers together - (load-immediate $0 1) ; pushes 1 onto the stack for the function call - (push $0) - (load-immediate $0 1) - (push $0) - (call &add) ; here a new frame is generated - (pop $0) ; the element is returned and the memory for the pln is "freed" automatically because the child frame is done - (halt)) - - (label add - (pop $0) - (pop $1) - (add-int $2 $1 $0) ; add the arguments - (int-to-string $3 $2) ; convert to a string (heap allocation) - (push $3) - (call &pln) ; call print function - (push $2) - (return)) ; return to main function +((code + (label main + (load-immediate $0 &terminal-namespace) ; get terminal device + (load-immediate $11 0) + (syscall OPEN $0 $0 $11) - (label pln - (load-immediate $0 &terminal-namespace) ; load the namespace for the terminal - (load-immediate $3 &new-line) ; and a newline char - (pop $1) ; pointer to string - (string-length $2 $1) ; get the length - (syscall WRITE $0 $1 $2) ; write the string + (load-immediate $1 &help) ; print help message + (push $0) + (push $1) + (call &pln) + + (load-immediate $1 32) ; read in a string of max 32 char length + (malloc $4 $1) ; allocate memory for the string + (load-offset-32 $7 $0 4) ; load handle + (syscall READ $7 $2 $1 $4) ; read the string + + (push $0) + (push $4) + (call &pln) ; print the string + (halt)) + (label pln + (load-immediate $3 &new-line) + (pop $1) + (pop $0) + (load-offset-32 $7 $0 4) ; load handle + (string-length $2 $1) + (syscall WRITE $7 $1 $2) (string-length $4 $3) - (syscall WRITE $0 $3 $4) - (return))) ; return back to add function -(data ; allocates strings at compile time - (label terminal-namespace "/dev/term/0") + (syscall WRITE $7 $3 $4) + (return))) +(data + (label terminal-namespace "/dev/term/0") + (label help "Enter a string: ") (label new-line "\n"))) #+END_SRC diff --git a/src/arch/linux/main.c b/src/arch/linux/main.c index 480a99a..bfabc37 100644 --- a/src/arch/linux/main.c +++ 
b/src/arch/linux/main.c
@@ -1,4 +1,5 @@
 #include "../../tools/assembler.h"
 #include "../../tools/parser.h"
+#include "../../tools/lexer.h"
 #include "../../vm/vm.h"
 #include "devices.h"
@@ -122,6 +123,61 @@ bool loadVM(const char *filename, VM *vm) {
 
 // Function to compile and optionally save
 bool compileAndSave(const char *source_file, const char *output_file, VM *vm) {
+  /* Front end is still a token dump: the source is lexed and each token is
+   * printed; vm and output_file are not otherwise used here. */
+  USED(vm);
+  USED(output_file);
+  FILE *f = fopen(source_file, "rb");
+  if (!f) {
+    perror("fopen");
+    return false;
+  }
+
+  static char source[MAX_SRC_SIZE + 1];
+
+  /* Measure the file. A failed seek/tell (ftell() returns -1) must not reach
+   * fread(): converted to size_t it would request a huge read and overrun
+   * the fixed-size buffer. */
+  long len = -1;
+  if (fseek(f, 0, SEEK_END) == 0) {
+    len = ftell(f);
+  }
+  if (len < 0 || fseek(f, 0, SEEK_SET) != 0) {
+    perror("fseek");
+    fclose(f);
+    return false;
+  }
+  if (len >= MAX_SRC_SIZE) {
+    fprintf(stderr, "Source is larger than buffer\n");
+    fclose(f);
+    return false;
+  }
+  size_t nread = fread(source, 1, (size_t)len, f);
+  source[nread] = '\0';
+  fclose(f);
+
+  initLexer(source);
+  Token token;
+  do {
+    token = nextToken();
+    if (token.type == TOKEN_ERROR) {
+      printf("ERROR at line %d: %.*s\n", token.line, token.length, token.start);
+      return false; /* a lexical error means the source did not compile */
+    }
+    if (token.type != TOKEN_EOF) {
+      printf("Line %d [%s]: %.*s\n",
+             token.line,
+             tokenTypeToString(token.type),
+             token.length,
+             token.start);
+    }
+  } while (token.type != TOKEN_EOF);
+
+  return true;
+}
+
+// Function to assemble and optionally save
+bool assembleAndSave(const char *source_file, const char *output_file, VM *vm) {
   FILE *f = fopen(source_file, "rb");
   if (!f) {
     perror("fopen");
@@ -377,7 +433,8 @@ i32 main(i32 argc, char *argv[]) {
   bool dump_rom = false;
   char *input_file = nil;
   char *output_file = nil;
   bool is_rom = false;
+  bool is_assembly = false;
 
   // Parse command line arguments
   for (i32 i = 1; i < argc; i++) {
@@ -395,6 +452,17 @@ i32 main(i32 argc, char *argv[]) {
       if (ext && (strcmp(ext, ".rom") == 0)) {
         is_rom = true;
       }
+      /* Match the two-part ".asm.lisp" suffix against the whole argument:
+       * an extension taken from the last '.' would only ever be ".lisp".
+       * NOTE(review): assumes argv[i] is the input path that ext was derived
+       * from -- confirm against the surrounding code. */
+      static const char asm_ext[] = ".asm.lisp";
+      size_t arg_len = strlen(argv[i]);
+      size_t ext_len = sizeof(asm_ext) - 1;
+      if (arg_len >= ext_len &&
+          strcmp(argv[i] + arg_len - ext_len, asm_ext) == 0) {
+        is_assembly = true;
+      }
     } else if (output_file == nil && dump_rom) {
       // This is the output file for -o flag
       output_file = argv[i];
@@ -408,8 +476,14 @@ i32 main(i32 argc, char *argv[]) {
   if (is_rom) {
     // Load ROM file directly
     compilation_success = loadVM(input_file, &vm);
-  } else {
+  } else if (is_assembly) {
     // Compile Lisp file
+    if (dump_rom && output_file) {
+      compilation_success = assembleAndSave(input_file, output_file, &vm);
+    } else {
+      compilation_success = assembleAndSave(input_file, nil, &vm);
+    }
+  } else {
     if (dump_rom && output_file) {
       compilation_success = compileAndSave(input_file, output_file, &vm);
     } else {
diff --git a/src/tools/lexer.c b/src/tools/lexer.c
new file mode 100644
index 0000000..2831f64
--- /dev/null
+++ b/src/tools/lexer.c
@@ -0,0 +1,353 @@
+#include <string.h>
+
+#include "../vm/common.h"
+#include "lexer.h"
+
+/* Scanner state: `start` is the first character of the token being scanned,
+ * `current` is the next character to read, `line` is the 1-based line. */
+typedef struct {
+  const char *start;
+  const char *current;
+  int line;
+} Lexer;
+
+/* Single scanner instance; static so the symbol stays private to this file. */
+static Lexer lexer;
+
+/* Point the lexer at a NUL-terminated source buffer and restart at line 1.
+ * Tokens keep pointers into `source`, so it must outlive them. */
+void initLexer(const char *source) {
+  lexer.start = source;
+  lexer.current = source;
+  lexer.line = 1;
+}
+
+static bool isAlpha(char c) {
+  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
+}
+
+static bool isDigit(char c) { return c >= '0' && c <= '9'; }
+
+static bool isAtEnd(void) { return *lexer.current == '\0'; }
+
+/* Consume one character and return it. */
+static char advance(void) {
+  lexer.current++;
+  return lexer.current[-1];
+}
+
+static char peek(void) { return *lexer.current; }
+
+/* Character after the current one, or '\0' when already at the end. */
+static char peekNext(void) {
+  if (isAtEnd())
+    return '\0';
+  return lexer.current[1];
+}
+
+/* Consume the current character only if it equals `expected`. */
+static bool match(char expected) {
+  if (isAtEnd())
+    return false;
+  if (*lexer.current != expected)
+    return false;
+  lexer.current++;
+  return true;
+}
+
+/* Build a token covering the source text from lexer.start to lexer.current. */
+static Token makeToken(TokenType type) {
+  Token token;
+  token.type = type;
+  token.start = lexer.start;
+  token.length = (int)(lexer.current - lexer.start);
+  token.line = lexer.line;
+  return token;
+}
+
+/* Build a TOKEN_ERROR whose text is `message` instead of source text. */
+static Token errorToken(const char *message) {
+  Token token;
+  token.type = TOKEN_ERROR;
+  token.start = message;
+  token.length = (int)strlen(message);
+  token.line = lexer.line;
+  return token;
+}
+
+/* Skip spaces, tabs, newlines, line comments and block comments. */
+static void skipWhitespace(void) {
+  for (;;) {
+    char c = peek();
+    switch (c) {
+    case ' ':
+    case '\r':
+    case '\t':
+      advance();
+      break;
+    case '\n':
+      lexer.line++;
+      advance();
+      break;
+    case '/':
+      if (peekNext() == '/') {
+        // Single-line comment: skip until newline or end of file
+        advance();
+        while (peek() != '\n' && !isAtEnd())
+          advance();
+      } else if (peekNext() == '*') {
+        // Multi-line comment: skip until '*/' or end of file
+        advance();
+        advance();
+        while (!isAtEnd()) {
+          if (peek() == '*' && peekNext() == '/') {
+            advance();
+            advance();
+            break; // Exit loop, comment ended
+          }
+          // Count newlines inside the comment so later tokens report the
+          // correct line number
+          if (advance() == '\n')
+            lexer.line++;
+        }
+      } else {
+        return; // Not a comment, let tokenization handle it
+      }
+      break;
+    default:
+      return;
+    }
+  }
+}
+
+/* Return `type` when the current lexeme is exactly `start + length` long and
+ * its tail, beginning at offset `start`, equals `rest`; otherwise the lexeme
+ * is an ordinary identifier. */
+static TokenType checkKeyword(int start, int length, const char *rest,
+                              TokenType type) {
+  if (lexer.current - lexer.start == start + length &&
+      memcmp(lexer.start + start, rest, length) == 0) {
+    return type;
+  }
+
+  return TOKEN_IDENTIFIER;
+}
+
+/* Classify the current lexeme as a keyword or identifier by branching on its
+ * leading characters. */
+static TokenType identifierType(void) {
+  switch (lexer.start[0]) {
+  case 'a':
+    return checkKeyword(1, 2, "nd", TOKEN_OPERATOR_AND);
+  case 'e':
+    return checkKeyword(1, 3, "lse", TOKEN_KEYWORD_ELSE);
+  case 'f':
+    if (lexer.current - lexer.start > 1) {
+      switch (lexer.start[1]) {
+      case 'a':
+        return checkKeyword(2, 3, "lse", TOKEN_KEYWORD_FALSE);
+      case 'o':
+        return checkKeyword(2, 1, "r", TOKEN_KEYWORD_FOR);
+      }
+      return checkKeyword(1, 7, "unction", TOKEN_KEYWORD_FN);
+    }
+    break;
+  case 'i':
+    return checkKeyword(1, 1, "f", TOKEN_KEYWORD_IF);
+  case 'n':
+    return checkKeyword(1, 2, "il", TOKEN_KEYWORD_NIL);
+  case 'o':
+    return checkKeyword(1, 1, "r", TOKEN_OPERATOR_OR);
+  case 'p':
+    if (lexer.current - lexer.start > 1) {
+      switch (lexer.start[1]) {
+      case 'l':
+        return checkKeyword(2, 2, "ex", TOKEN_KEYWORD_PLEX);
+      case 'r':
+        return checkKeyword(2, 3, "int", TOKEN_KEYWORD_PRINT);
+      }
+    }
+    break;
+  case 'r':
+    return checkKeyword(1, 5, "eturn", TOKEN_KEYWORD_RETURN);
+  case 't':
+    if (lexer.current - lexer.start > 1) {
+      switch (lexer.start[1]) {
+      case 'h':
+        return checkKeyword(2, 2, "is", TOKEN_KEYWORD_THIS);
+      case 'r':
+        return checkKeyword(2, 2, "ue", TOKEN_KEYWORD_TRUE);
+      }
+    }
+    break;
+  case 'l':
+    return checkKeyword(1, 2, "et", TOKEN_KEYWORD_LET);
+  case 'w':
+    return checkKeyword(1, 4, "hile", TOKEN_KEYWORD_WHILE);
+  }
+
+  return TOKEN_IDENTIFIER;
+}
+
+/* Scan the rest of an identifier or keyword (letters, digits, '_'). */
+static Token identifier(void) {
+  while (isAlpha(peek()) || isDigit(peek()))
+    advance();
+  return makeToken(identifierType());
+}
+
+/* Scan an integer literal, or a float literal when digits follow a '.'. */
+static Token number(void) {
+  while (isDigit(peek()))
+    advance();
+
+  /* Look for a fractional part. */
+  if (peek() == '.' && isDigit(peekNext())) {
+    /* Consume the ".". */
+    advance();
+
+    while (isDigit(peek()))
+      advance();
+
+    return makeToken(TOKEN_FLOAT_LITERAL);
+  }
+
+  return makeToken(TOKEN_INT_LITERAL);
+}
+
+/* Scan a string literal up to the closing quote; the token text includes
+ * both quotes. Reaching end of input first yields an error token. */
+static Token string(void) {
+  while (peek() != '"' && !isAtEnd()) {
+    if (peek() == '\n')
+      lexer.line++;
+    advance();
+  }
+
+  if (isAtEnd())
+    return errorToken("Unterminated string.");
+
+  /* The closing quote. */
+  advance();
+  return makeToken(TOKEN_STRING_LITERAL);
+}
+
+/* Return the next token from the source given to initLexer(). Yields
+ * TOKEN_EOF at the end of input and TOKEN_ERROR for an unexpected character
+ * or an unterminated string. */
+Token nextToken(void) {
+  skipWhitespace();
+  lexer.start = lexer.current;
+
+  if (isAtEnd())
+    return makeToken(TOKEN_EOF);
+
+  char c = advance();
+  if (isAlpha(c))
+    return identifier();
+  if (isDigit(c))
+    return number();
+
+  switch (c) {
+  case '(':
+    return makeToken(TOKEN_LPAREN);
+  case ')':
+    return makeToken(TOKEN_RPAREN);
+  case '{':
+    return makeToken(TOKEN_LBRACE);
+  case '}':
+    return makeToken(TOKEN_RBRACE);
+  case '[':
+    return makeToken(TOKEN_LBRACKET);
+  case ']':
+    return makeToken(TOKEN_RBRACKET);
+  case ';':
+    return makeToken(TOKEN_SEMICOLON);
+  case ':':
+    return makeToken(TOKEN_COLON);
+  case ',':
+    return makeToken(TOKEN_COMMA);
+  case '.':
+    return makeToken(TOKEN_DOT);
+  case '-':
+    return makeToken(TOKEN_MINUS);
+  case '+':
+    return makeToken(TOKEN_PLUS);
+  case '/':
+    return makeToken(TOKEN_SLASH);
+  case '*':
+    return makeToken(TOKEN_STAR);
+  case '!':
+    return makeToken(match('=') ? TOKEN_BANG_EQ : TOKEN_BANG);
+  case '=':
+    return makeToken(match('=') ? TOKEN_EQ_EQ : TOKEN_EQ);
+  case '<':
+    return makeToken(match('=') ? TOKEN_LTE : TOKEN_LT);
+  case '>':
+    return makeToken(match('=') ? TOKEN_GTE : TOKEN_GT);
+  case '"':
+    return string();
+  }
+
+  return errorToken("Unexpected character.");
+}
+
+/* Map a token type to a printable name; unknown values give "UNKNOWN_TOKEN". */
+const char *tokenTypeToString(TokenType type) {
+  switch (type) {
+    case TOKEN_EOF: return "EOF";
+    case TOKEN_IDENTIFIER: return "IDENTIFIER";
+    case TOKEN_INT_LITERAL: return "INT_LITERAL";
+    case TOKEN_UINT_LITERAL: return "UINT_LITERAL";
+    case TOKEN_FLOAT_LITERAL: return "FLOAT_LITERAL";
+    case TOKEN_STRING_LITERAL: return "STRING_LITERAL";
+    case TOKEN_TYPE_INT: return "TYPE_INT";
+    case TOKEN_TYPE_NAT: return "TYPE_NAT";
+    case TOKEN_TYPE_REAL: return "TYPE_REAL";
+    case TOKEN_TYPE_STR: return "TYPE_STR";
+    case TOKEN_KEYWORD_PLEX: return "KEYWORD_PLEX";
+    case TOKEN_KEYWORD_FN: return "KEYWORD_FN";
+    case TOKEN_KEYWORD_LET: return "KEYWORD_LET";
+    case TOKEN_KEYWORD_CONST: return "KEYWORD_CONST";
+    case TOKEN_KEYWORD_IF: return "KEYWORD_IF";
+    case TOKEN_KEYWORD_ELSE: return "KEYWORD_ELSE";
+    case TOKEN_KEYWORD_WHILE: return "KEYWORD_WHILE";
+    case TOKEN_KEYWORD_FOR: return "KEYWORD_FOR";
+    case TOKEN_KEYWORD_RETURN: return "KEYWORD_RETURN";
+    case TOKEN_KEYWORD_USE: return "KEYWORD_USE";
+    case TOKEN_KEYWORD_INIT: return "KEYWORD_INIT";
+    case TOKEN_KEYWORD_THIS: return "KEYWORD_THIS";
+    case TOKEN_KEYWORD_PRINT: return "KEYWORD_PRINT";
+    case TOKEN_KEYWORD_NIL: return "KEYWORD_NIL";
+    case TOKEN_KEYWORD_TRUE: return "KEYWORD_TRUE";
+    case TOKEN_KEYWORD_FALSE: return "KEYWORD_FALSE";
+    case TOKEN_OPERATOR_IS: return "OPERATOR_IS";
+    case TOKEN_OPERATOR_NOT: return "OPERATOR_NOT";
+    case TOKEN_OPERATOR_AND: return "OPERATOR_AND";
+    case TOKEN_OPERATOR_OR: return "OPERATOR_OR";
+    case TOKEN_BANG: return "BANG";
+    case TOKEN_BANG_EQ: return "BANG_EQ";
+    case TOKEN_EQ: return "EQ";
+    case TOKEN_EQ_EQ: return "EQ_EQ";
+    case TOKEN_GT: return "GT";
+    case TOKEN_LT: return "LT";
+    case TOKEN_GTE: return "GTE";
+    case TOKEN_LTE: return "LTE";
+    case TOKEN_DOT: return "DOT";
+    case TOKEN_COMMA: return "COMMA";
+    case TOKEN_COLON: return "COLON";
+    case TOKEN_SEMICOLON: return "SEMICOLON";
+    case TOKEN_PLUS: return "PLUS";
+    case TOKEN_MINUS: return "MINUS";
+    case TOKEN_STAR: return "STAR";
+    case TOKEN_SLASH: return "SLASH";
+    case TOKEN_LPAREN: return "LPAREN";
+    case TOKEN_RPAREN: return "RPAREN";
+    case TOKEN_LBRACE: return "LBRACE";
+    case TOKEN_RBRACE: return "RBRACE";
+    case TOKEN_LBRACKET: return "LBRACKET";
+    case TOKEN_RBRACKET: return "RBRACKET";
+    case TOKEN_ERROR: return "ERROR";
+    default: return "UNKNOWN_TOKEN";
+  }
+}
diff --git a/src/tools/lexer.h b/src/tools/lexer.h
new file mode 100644
index 0000000..7dbae73
--- /dev/null
+++ b/src/tools/lexer.h
@@ -0,0 +1,78 @@
+#ifndef UNDAR_LEXER_H
+#define UNDAR_LEXER_H
+
+/* Every kind of token the lexer can report. */
+typedef enum {
+  TOKEN_EOF,
+  TOKEN_IDENTIFIER,
+  TOKEN_INT_LITERAL,
+  TOKEN_UINT_LITERAL,
+  TOKEN_FLOAT_LITERAL,
+  TOKEN_STRING_LITERAL,
+  TOKEN_TYPE_INT,
+  TOKEN_TYPE_NAT,
+  TOKEN_TYPE_REAL,
+  TOKEN_TYPE_STR,
+  TOKEN_KEYWORD_PLEX,
+  TOKEN_KEYWORD_FN,
+  TOKEN_KEYWORD_LET,
+  TOKEN_KEYWORD_CONST,
+  TOKEN_KEYWORD_IF,
+  TOKEN_KEYWORD_ELSE,
+  TOKEN_KEYWORD_WHILE,
+  TOKEN_KEYWORD_FOR,
+  TOKEN_KEYWORD_RETURN,
+  TOKEN_KEYWORD_USE,
+  TOKEN_KEYWORD_INIT,
+  TOKEN_KEYWORD_THIS,
+  TOKEN_KEYWORD_PRINT,
+  TOKEN_KEYWORD_NIL,
+  TOKEN_KEYWORD_TRUE,
+  TOKEN_KEYWORD_FALSE,
+  TOKEN_OPERATOR_IS,
+  TOKEN_OPERATOR_NOT,
+  TOKEN_OPERATOR_AND,
+  TOKEN_OPERATOR_OR,
+  TOKEN_BANG,
+  TOKEN_BANG_EQ,
+  TOKEN_EQ,
+  TOKEN_EQ_EQ,
+  TOKEN_GT,
+  TOKEN_LT,
+  TOKEN_GTE,
+  TOKEN_LTE,
+  TOKEN_DOT,
+  TOKEN_COMMA,
+  TOKEN_COLON,
+  TOKEN_SEMICOLON,
+  TOKEN_PLUS,
+  TOKEN_MINUS,
+  TOKEN_STAR,
+  TOKEN_SLASH,
+  TOKEN_LPAREN,
+  TOKEN_RPAREN,
+  TOKEN_LBRACE,
+  TOKEN_RBRACE,
+  TOKEN_LBRACKET,
+  TOKEN_RBRACKET,
+  TOKEN_ERROR
+} TokenType;
+
+/* A lexeme: `start` points at `length` characters (always use the length;
+ * source lexemes are not NUL-terminated). For TOKEN_ERROR the text is an
+ * error message. `line` is the lexer's line counter at token creation. */
+typedef struct {
+  TokenType type;
+  const char *start;
+  int length;
+  int line;
+} Token;
+
+/* Start lexing the NUL-terminated `source`; it must outlive the tokens. */
+void initLexer(const char *source);
+/* Return the next token; TOKEN_EOF once the input is exhausted. */
+Token nextToken(void);
+/* Printable name of a token type, or "UNKNOWN_TOKEN". */
+const char *tokenTypeToString(TokenType type);
+
+#endif