Add initial lexer back again, fix docs

2025-10-18 18:15:54 -07:00 · 2025-10-18 18:15:54 -07:00 · 790d7e8509
parent 6c1bf1ff8c
commit 790d7e8509
5 changed files with 502 additions and 37 deletions
--- a/2
+++ b/2
@ -86,11 +86,13 @@ ifeq ($(BUILD_MODE), release)
 	PLATFORM_SOURCE := $(ARCH_DIR)/main.c \
 		$(ARCH_DIR)/devices.c\
        $(SRC_DIR)/tools/parser.c \
        $(SRC_DIR)/tools/lexer.c \
 		$(SRC_DIR)/tools/assembler.c
 else
 	PLATFORM_SOURCE := $(ARCH_DIR)/main.c \
 		$(ARCH_DIR)/devices.c \
        $(SRC_DIR)/tools/parser.c \
        $(SRC_DIR)/tools/lexer.c \
 		$(SRC_DIR)/tools/assembler.c
 endif
--- a/README.org
+++ b/README.org
@ -58,14 +58,26 @@ The Undâr compiler will be written in Sċieppan, as well as core VM tests.
 #+BEGIN_SRC lisp
 ((code 
 	(label main	
 		(load-immediate $0 &terminal-namespace) ; load terminal namespace
 		(load-immediate $1 &hello-str) ; load hello string ptr
-    	(string-length $2 $1)                   ; get length to write to stdout
+		(push $1)
-		(syscall WRITE $0 $1 $2)                ; do the write syscall
+    	(call &pln)
-		(halt)))                                ; done
+		(halt)) ; done
  	(label pln 
 		(load-immediate $0 &terminal-namespace) ; get terminal device
 		(load-immediate $11 0)
 		(syscall OPEN $0 $0 $11)	
 		(load-immediate $3 &new-line)
 		(pop $1)
    	(load-offset-32 $7 $0 4) ; load handle
 		(string-length $2 $1)
 		(syscall WRITE $7 $1 $2)
 		(string-length $4 $3)
 		(syscall WRITE $7 $3 $4)
 		(return)))
 (data 
 	(label terminal-namespace "/dev/term/0")
-    (label hello-str "nuqneH 'u'?\n")))
+	(label new-line "\n")
    (label hello-str "nuqneH 'u'?")))
 #+END_SRC
 #+BEGIN_SRC sh
@ -82,36 +94,38 @@ heap allocations using the internal malloc opcode push pointers within this fram
 #+BEGIN_SRC lisp
 ((code 
-	(label main                  ; this example adds 2 numbers together
+	(label main
-		(load-immediate $0 1)    ; pushes 1 onto the stack for the function call
+		(load-immediate $0 &terminal-namespace) ; get terminal device
 		(load-immediate $11 0)
 		(syscall OPEN $0 $0 $11)
 		(load-immediate $1 &help) ; print help message
 		(push $0)
-		(load-immediate $0 1) 
+        (push $1)
 		(call &pln)
 		(load-immediate $1 32) ; read in a string of max 32 char length 
 		(malloc $4 $1) ; allocate memory for the string
 		(load-offset-32 $7 $0 4) ; load handle
 		(syscall READ $7 $2 $1 $4) ; read the string
 		(push $0)
-		(call &add)              ; here a new frame is generated
+		(push $4)
-		(pop $0)                 ; the element is returned and the memory for the pln is "freed" automatically because the child frame is done
+		(call &pln) ; print the string
 		(halt))
 	(label add 
 		(pop $0)
 		(pop $1)
 		(add-int $2 $1 $0)      ; add the arguments
 		(int-to-string $3 $2)   ; convert to a string (heap allocation)
 		(push $3)
 		(call &pln)         ; call print function
 		(push $2)
 		(return))               ; return to main function
  	(label pln 
-		(load-immediate $0 &terminal-namespace) ; load the namespace for the terminal
+		(load-immediate $3 &new-line)
-		(load-immediate $3 &new-line)           ; and a newline char
+		(pop $1)
-		(pop $1)                                ; pointer to string
+    	(pop $0)
-		(string-length $2 $1)                   ; get the length
+    	(load-offset-32 $7 $0 4) ; load handle
-		(syscall WRITE $0 $1 $2)                ; write the string
+		(string-length $2 $1)
 		(syscall WRITE $7 $1 $2)
 		(string-length $4 $3)
-		(syscall WRITE $0 $3 $4)
+		(syscall WRITE $7 $3 $4)
-		(return)))                              ; return back to add function
+		(return)))
-(data                                           ; allocates strings at compile time
+(data 
 	(label terminal-namespace "/dev/term/0")
    (label help "Enter a string: ")
 	(label new-line "\n")))
 #+END_SRC
--- a/src/arch/linux/main.c
+++ b/src/arch/linux/main.c
@ -1,5 +1,6 @@
 #include "../../tools/assembler.h"
 #include "../../tools/parser.h"
 #include "../../tools/lexer.h"
 #include "../../vm/vm.h"
 #include "devices.h"
 #include <SDL2/SDL.h>
@ -122,6 +123,50 @@ bool loadVM(const char *filename, VM *vm) {
 // Lex `source_file` and print every token to stdout (debug front end).
 //
 // The whole file is read into a static buffer of MAX_SRC_SIZE + 1 bytes and
 // NUL-terminated before being handed to the lexer. `output_file` and `vm`
 // are accepted for signature parity with assembleAndSave() but are not used
 // yet.
 //
 // Returns true when the file was read and lexed up to EOF; false when the
 // file cannot be opened, sized or read, is too large for the buffer, or the
 // lexer reports an error token.
 bool compileAndSave(const char *source_file, const char *output_file, VM *vm) {
  USED(vm);
  USED(output_file);
  FILE *f = fopen(source_file, "rb");
  if (!f) {
    perror("fopen");
    return false;
  }
  static char source[MAX_SRC_SIZE + 1];
  // ftell() returns -1 on failure. Passing that on to fread() as a size_t
  // would request a gigantic read into the fixed buffer, so reject it here.
  long len = -1;
  if (fseek(f, 0, SEEK_END) == 0) {
    len = ftell(f);
  }
  if (len < 0 || fseek(f, 0, SEEK_SET) != 0) {
    perror("fseek");
    fclose(f);
    return false;
  }
  if (len >= MAX_SRC_SIZE) {
    fprintf(stderr, "Source is larger than buffer\n");
    fclose(f);
    return false;
  }
  size_t bytes_read = fread(source, 1, (size_t)len, f);
  bool read_failed = ferror(f) != 0;
  fclose(f);
  if (read_failed) {
    fprintf(stderr, "Failed to read source file\n");
    return false;
  }
  // bytes_read <= len < MAX_SRC_SIZE, so the terminator always fits.
  source[bytes_read] = '\0';
  initLexer(source);
  for (;;) {
    Token token = nextToken();
    if (token.type == TOKEN_EOF) {
      return true;
    }
    if (token.type == TOKEN_ERROR) {
      // For error tokens `start` points at the lexer's message text.
      printf("ERROR at line %d: %.*s\n", token.line, token.length, token.start);
      return false; // a lexical error means the compile did not succeed
    }
    printf("Line %d [%s]: %.*s\n",
          token.line,
          tokenTypeToString(token.type),
          token.length,
          token.start);
  }
 }
 // Function to assemble and optionally save
 bool assembleAndSave(const char *source_file, const char *output_file, VM *vm) {
  FILE *f = fopen(source_file, "rb");
  if (!f) {
    perror("fopen");
@ -377,7 +422,7 @@ i32 main(i32 argc, char *argv[]) {
  bool dump_rom = false;
  char *input_file = nil;
  char *output_file = nil;
-  bool is_rom = false;
+  bool is_rom, is_assembly = false;
  // Parse command line arguments
  for (i32 i = 1; i < argc; i++) {
@ -395,6 +440,9 @@ i32 main(i32 argc, char *argv[]) {
      if (ext && (strcmp(ext, ".rom") == 0)) {
        is_rom = true;
      }
      if (ext && (strcmp(ext, ".asm.lisp") == 0)) {
        is_assembly = true;
      }
    } else if (output_file == nil && dump_rom) {
      // This is the output file for -o flag
      output_file = argv[i];
@ -408,8 +456,14 @@ i32 main(i32 argc, char *argv[]) {
    if (is_rom) {
      // Load ROM file directly
      compilation_success = loadVM(input_file, &vm);
-    } else {
+    } else if (is_assembly) {
      // Compile Lisp file
      if (dump_rom && output_file) {
        compilation_success = assembleAndSave(input_file, output_file, &vm);
      } else {
        compilation_success = assembleAndSave(input_file, nil, &vm);
      }
    } else {
      if (dump_rom && output_file) {
        compilation_success = compileAndSave(input_file, output_file, &vm);
      } else {
--- a/src/tools/lexer.c
+++ b/src/tools/lexer.c
@ -0,0 +1,324 @@
 #include <string.h>
 #include "../vm/common.h"
 #include "lexer.h"
 /* Scanner state for the single source buffer currently being lexed. */
 typedef struct {
  /* First character of the token being scanned; nextToken() resets it to
     `current` after skipping whitespace. */
  const char *start;
  /* Next character to be consumed. */
  const char *current;
  /* Line counter: starts at 1 in initLexer() and is bumped on '\n'. */
  int line;
 } Lexer;
 /* The one lexer instance shared by every function in this file. */
 Lexer lexer;
 /* Point the global lexer at the beginning of `source` and restart line
    counting at 1. */
 void initLexer(const char *source) {
  lexer.line = 1;
  lexer.current = source;
  lexer.start = source;
 }
 /* True for ASCII letters and underscore, i.e. characters that may start an
    identifier. */
 static bool isAlpha(char c) {
  if (c == '_')
    return true;
  if (c >= 'A' && c <= 'Z')
    return true;
  return c >= 'a' && c <= 'z';
 }
 /* True for the ASCII decimal digits '0' through '9'. */
 static bool isDigit(char c) {
  return !(c < '0' || c > '9');
 }
 /* True once the cursor sits on the source buffer's NUL terminator. */
 static bool isAtEnd() {
  return lexer.current[0] == '\0';
 }
 /* Consume one character and return it. */
 static char advance() {
  char consumed = *lexer.current;
  lexer.current++;
  return consumed;
 }
 /* Return the character under the cursor without consuming it. */
 static char peek() {
  return lexer.current[0];
 }
 /* Return the character one past the cursor, or '\0' when the cursor is
    already on the terminator (so the read never goes beyond it). */
 static char peekNext() {
  return isAtEnd() ? '\0' : lexer.current[1];
 }
 /* Consume the current character only if it equals `expected`; report
    whether it was consumed. Never matches at end of input. */
 static bool match(char expected) {
  if (!isAtEnd() && *lexer.current == expected) {
    lexer.current++;
    return true;
  }
  return false;
 }
 /* Build a token of kind `type` spanning [lexer.start, lexer.current) and
    stamped with the current line. */
 static Token makeToken(TokenType type) {
  Token result = {
    .type = type,
    .start = lexer.start,
    .length = (int)(lexer.current - lexer.start),
    .line = lexer.line,
  };
  return result;
 }
 static Token errorToken(const char *message) {
  Token token;
  token.type = TOKEN_ERROR;
  token.start = message;
  token.length = (int)strlen(message);
  token.line = lexer.line;
  return token;
 }
 /* Advance past blanks, newlines, `//` line comments and block comments,
    keeping lexer.line in sync with every newline consumed, including
    newlines that occur inside a block comment.

    A lone '/' is left in place for nextToken() to turn into TOKEN_SLASH.
    An unterminated block comment silently consumes the rest of the input. */
 static void skipWhitespace() {
  for (;;) {
    switch (peek()) {
    case ' ':
    case '\r':
    case '\t':
      advance();
      break;
    case '\n':
      lexer.line++;
      advance();
      break;
    case '/':
      if (peekNext() == '/') {
        // Line comment: stop on the newline so the '\n' case counts it.
        while (peek() != '\n' && !isAtEnd())
          advance();
      } else if (peekNext() == '*') {
        // Block comment: skip the opener, then scan for the closer.
        advance();
        advance();
        while (!isAtEnd()) {
          if (peek() == '*' && peekNext() == '/') {
            advance();
            advance();
            break;
          }
          // Count lines spanned by the comment so later tokens report the
          // correct line number.
          if (peek() == '\n')
            lexer.line++;
          advance();
        }
      } else {
        return; // Not a comment, let tokenization handle it
      }
      break;
    default:
      return;
    }
  }
 }
 /* Compare the tail of the current lexeme against a keyword suffix.

    `start` is the number of leading characters the caller already matched,
    `rest`/`length` are the remaining keyword characters. Returns `type` when
    the lexeme is exactly start + length long and its tail equals `rest`;
    otherwise TOKEN_IDENTIFIER. */
 static TokenType checkKeyword(int start, int length, const char *rest,
                              TokenType type) {
  bool sameLength = (lexer.current - lexer.start) == start + length;
  if (!sameLength)
    return TOKEN_IDENTIFIER;
  if (memcmp(lexer.start + start, rest, length) != 0)
    return TOKEN_IDENTIFIER;
  return type;
 }
 /* Classify the lexeme in [lexer.start, lexer.current) as a reserved word or
    a plain identifier.

    Dispatches on the first (and, where several keywords share it, second)
    character, then lets checkKeyword() verify the remainder. Recognised
    words: and, else, false, for, function, if, nil, or, plex, print, return,
    this, true, let, while. */
 static TokenType identifierType() {
  const char *word = lexer.start;
  bool hasSecond = (lexer.current - lexer.start) > 1;

  switch (word[0]) {
  case 'a':
    return checkKeyword(1, 2, "nd", TOKEN_OPERATOR_AND);
  case 'e':
    return checkKeyword(1, 3, "lse", TOKEN_KEYWORD_ELSE);
  case 'f':
    if (!hasSecond)
      break;
    if (word[1] == 'a')
      return checkKeyword(2, 3, "lse", TOKEN_KEYWORD_FALSE);
    if (word[1] == 'o')
      return checkKeyword(2, 1, "r", TOKEN_KEYWORD_FOR);
    return checkKeyword(1, 7, "unction", TOKEN_KEYWORD_FN);
  case 'i':
    return checkKeyword(1, 1, "f", TOKEN_KEYWORD_IF);
  case 'l':
    return checkKeyword(1, 2, "et", TOKEN_KEYWORD_LET);
  case 'n':
    return checkKeyword(1, 2, "il", TOKEN_KEYWORD_NIL);
  case 'o':
    return checkKeyword(1, 1, "r", TOKEN_OPERATOR_OR);
  case 'p':
    if (hasSecond && word[1] == 'l')
      return checkKeyword(2, 2, "ex", TOKEN_KEYWORD_PLEX);
    if (hasSecond && word[1] == 'r')
      return checkKeyword(2, 3, "int", TOKEN_KEYWORD_PRINT);
    break;
  case 'r':
    return checkKeyword(1, 5, "eturn", TOKEN_KEYWORD_RETURN);
  case 't':
    if (hasSecond && word[1] == 'h')
      return checkKeyword(2, 2, "is", TOKEN_KEYWORD_THIS);
    if (hasSecond && word[1] == 'r')
      return checkKeyword(2, 2, "ue", TOKEN_KEYWORD_TRUE);
    break;
  case 'w':
    return checkKeyword(1, 4, "hile", TOKEN_KEYWORD_WHILE);
  default:
    break;
  }
  return TOKEN_IDENTIFIER;
 }
 /* Consume the remaining letters, digits and underscores of a word whose
    first character was already consumed, then emit it as a keyword or
    identifier token. */
 static Token identifier() {
  for (;;) {
    char next = peek();
    if (!isAlpha(next) && !isDigit(next))
      break;
    advance();
  }
  return makeToken(identifierType());
 }
 static Token number() {
  while (isDigit(peek()))
    advance();
  /*  Look for a fractional part. */
  if (peek() == '.' && isDigit(peekNext())) {
    /*  Consume the ".". */
    advance();
    while (isDigit(peek()))
      advance();
    return makeToken(TOKEN_FLOAT_LITERAL);
  }
  return makeToken(TOKEN_INT_LITERAL);
 }
 /* Scan a string literal whose opening quote was already consumed. The
    token spans both quotes. Newlines inside the literal are allowed and
    counted; reaching end of input first yields an error token. */
 static Token string() {
  for (;;) {
    if (isAtEnd())
      return errorToken("Unterminated string.");
    char c = peek();
    if (c == '"')
      break;
    if (c == '\n')
      lexer.line++;
    advance();
  }
  advance(); /* the closing quote */
  return makeToken(TOKEN_STRING_LITERAL);
 }
 Token nextToken() {
  skipWhitespace();
  lexer.start = lexer.current;
  if (isAtEnd())
    return makeToken(TOKEN_EOF);
  char c = advance();
  if (isAlpha(c))
    return identifier();
  if (isDigit(c))
    return number();
  switch (c) {
  case '(':
    return makeToken(TOKEN_LPAREN);
  case ')':
    return makeToken(TOKEN_RPAREN);
  case '{':
    return makeToken(TOKEN_LBRACE);
  case '}':
    return makeToken(TOKEN_RBRACE);
  case '[':
    return makeToken(TOKEN_LBRACKET);
  case ']':
    return makeToken(TOKEN_RBRACKET);
  case ';':
    return makeToken(TOKEN_SEMICOLON);
  case ',':
    return makeToken(TOKEN_COMMA);
  case '.':
    return makeToken(TOKEN_DOT);
  case '-':
    return makeToken(TOKEN_MINUS);
  case '+':
    return makeToken(TOKEN_PLUS);
  case '/':
    return makeToken(TOKEN_SLASH);
  case '*':
    return makeToken(TOKEN_STAR);
  case '!':
    return makeToken(match('=') ? TOKEN_BANG_EQ : TOKEN_BANG);
  case '=':
    return makeToken(match('=') ? TOKEN_EQ_EQ : TOKEN_EQ);
  case '<':
    return makeToken(match('=') ? TOKEN_LTE : TOKEN_LT);
  case '>':
    return makeToken(match('=') ? TOKEN_GTE : TOKEN_GT);
  case '"':
    return string();
  }
  return errorToken("Unexpected character.");
 }
 /* Map a token kind to a printable name (the enumerator without its TOKEN_
    prefix). Values that are not a known TokenType map to "UNKNOWN_TOKEN".
    The returned pointer refers to a string literal. */
 const char* tokenTypeToString(TokenType type) {
  static const char *const names[] = {
    [TOKEN_EOF] = "EOF",
    [TOKEN_IDENTIFIER] = "IDENTIFIER",
    [TOKEN_INT_LITERAL] = "INT_LITERAL",
    [TOKEN_UINT_LITERAL] = "UINT_LITERAL",
    [TOKEN_FLOAT_LITERAL] = "FLOAT_LITERAL",
    [TOKEN_STRING_LITERAL] = "STRING_LITERAL",
    [TOKEN_TYPE_INT] = "TYPE_INT",
    [TOKEN_TYPE_NAT] = "TYPE_NAT",
    [TOKEN_TYPE_REAL] = "TYPE_REAL",
    [TOKEN_TYPE_STR] = "TYPE_STR",
    [TOKEN_KEYWORD_PLEX] = "KEYWORD_PLEX",
    [TOKEN_KEYWORD_FN] = "KEYWORD_FN",
    [TOKEN_KEYWORD_LET] = "KEYWORD_LET",
    [TOKEN_KEYWORD_CONST] = "KEYWORD_CONST",
    [TOKEN_KEYWORD_IF] = "KEYWORD_IF",
    [TOKEN_KEYWORD_ELSE] = "KEYWORD_ELSE",
    [TOKEN_KEYWORD_WHILE] = "KEYWORD_WHILE",
    [TOKEN_KEYWORD_FOR] = "KEYWORD_FOR",
    [TOKEN_KEYWORD_RETURN] = "KEYWORD_RETURN",
    [TOKEN_KEYWORD_USE] = "KEYWORD_USE",
    [TOKEN_KEYWORD_INIT] = "KEYWORD_INIT",
    [TOKEN_KEYWORD_THIS] = "KEYWORD_THIS",
    [TOKEN_KEYWORD_PRINT] = "KEYWORD_PRINT",
    [TOKEN_KEYWORD_NIL] = "KEYWORD_NIL",
    [TOKEN_KEYWORD_TRUE] = "KEYWORD_TRUE",
    [TOKEN_KEYWORD_FALSE] = "KEYWORD_FALSE",
    [TOKEN_OPERATOR_IS] = "OPERATOR_IS",
    [TOKEN_OPERATOR_NOT] = "OPERATOR_NOT",
    [TOKEN_OPERATOR_AND] = "OPERATOR_AND",
    [TOKEN_OPERATOR_OR] = "OPERATOR_OR",
    [TOKEN_BANG] = "BANG",
    [TOKEN_BANG_EQ] = "BANG_EQ",
    [TOKEN_EQ] = "EQ",
    [TOKEN_EQ_EQ] = "EQ_EQ",
    [TOKEN_GT] = "GT",
    [TOKEN_LT] = "LT",
    [TOKEN_GTE] = "GTE",
    [TOKEN_LTE] = "LTE",
    [TOKEN_DOT] = "DOT",
    [TOKEN_COMMA] = "COMMA",
    [TOKEN_COLON] = "COLON",
    [TOKEN_SEMICOLON] = "SEMICOLON",
    [TOKEN_PLUS] = "PLUS",
    [TOKEN_MINUS] = "MINUS",
    [TOKEN_STAR] = "STAR",
    [TOKEN_SLASH] = "SLASH",
    [TOKEN_LPAREN] = "LPAREN",
    [TOKEN_RPAREN] = "RPAREN",
    [TOKEN_LBRACE] = "LBRACE",
    [TOKEN_RBRACE] = "RBRACE",
    [TOKEN_LBRACKET] = "LBRACKET",
    [TOKEN_RBRACKET] = "RBRACKET",
    [TOKEN_ERROR] = "ERROR",
  };
  /* Going through unsigned makes a negative value land out of range. */
  unsigned index = (unsigned)type;
  unsigned count = (unsigned)(sizeof(names) / sizeof(names[0]));
  if (index >= count || !names[index])
    return "UNKNOWN_TOKEN";
  return names[index];
 }
--- a/src/tools/lexer.h
+++ b/src/tools/lexer.h
@ -0,0 +1,71 @@
 #ifndef UNDAR_LEXER_H
 #define UNDAR_LEXER_H
 typedef enum {
  TOKEN_EOF,
  TOKEN_IDENTIFIER,
  TOKEN_INT_LITERAL,
  TOKEN_UINT_LITERAL,
  TOKEN_FLOAT_LITERAL,
  TOKEN_STRING_LITERAL,
  TOKEN_TYPE_INT,
  TOKEN_TYPE_NAT,
  TOKEN_TYPE_REAL,
  TOKEN_TYPE_STR,
  TOKEN_KEYWORD_PLEX,
  TOKEN_KEYWORD_FN,
  TOKEN_KEYWORD_LET,
  TOKEN_KEYWORD_CONST,
  TOKEN_KEYWORD_IF,
  TOKEN_KEYWORD_ELSE,
  TOKEN_KEYWORD_WHILE,
  TOKEN_KEYWORD_FOR,
  TOKEN_KEYWORD_RETURN,
  TOKEN_KEYWORD_USE,
  TOKEN_KEYWORD_INIT,
  TOKEN_KEYWORD_THIS,
  TOKEN_KEYWORD_PRINT,
  TOKEN_KEYWORD_NIL,
  TOKEN_KEYWORD_TRUE,
  TOKEN_KEYWORD_FALSE,
  TOKEN_OPERATOR_IS,
  TOKEN_OPERATOR_NOT,
  TOKEN_OPERATOR_AND,
  TOKEN_OPERATOR_OR,
  TOKEN_BANG,
  TOKEN_BANG_EQ,
  TOKEN_EQ,
  TOKEN_EQ_EQ,
  TOKEN_GT,
  TOKEN_LT,
  TOKEN_GTE,
  TOKEN_LTE,
  TOKEN_DOT,
  TOKEN_COMMA,
  TOKEN_COLON,
  TOKEN_SEMICOLON,
  TOKEN_PLUS,
  TOKEN_MINUS,
  TOKEN_STAR,
  TOKEN_SLASH,
  TOKEN_LPAREN,
  TOKEN_RPAREN,
  TOKEN_LBRACE,
  TOKEN_RBRACE,
  TOKEN_LBRACKET,
  TOKEN_RBRACKET,
  TOKEN_ERROR
 } TokenType;
 /* One lexeme produced by nextToken(). The text is a (pointer, length) view
    and is not NUL-terminated at `length`; print it with "%.*s". */
 typedef struct {
  /* Kind of token. */
  TokenType type;
  /* First character of the lexeme inside the source buffer, or, for
     TOKEN_ERROR, the start of the error message. Not owned by the token. */
  const char *start;
  /* Number of characters in the lexeme (or message). */
  int length;
  /* Lexer line counter at the moment the token was created; for a string
     literal that spans lines this is the line it ends on. */
  int line;
 } Token;
 /* Start lexing `source`, a NUL-terminated buffer. Tokens returned later
    point into this buffer, so it must stay alive and unmodified while they
    are in use. Resets the line counter to 1. */
 void initLexer(const char *source);
 /* Return the next token, skipping whitespace and comments. Yields TOKEN_EOF
    at end of input and TOKEN_ERROR (with a message as its text) on a
    character or string it cannot lex. */
 Token nextToken();
 /* Return a printable name for `type`, or "UNKNOWN_TOKEN" for a value that
    is not a known TokenType. */
 const char* tokenTypeToString(TokenType type);
 #endif