#include "compiler.h"
#include "parser.h"
/* NOTE(review): the names of the two system headers were lost in this copy of
 * the file. <stdio.h> and <stdlib.h> cover every libc call made below
 * (printf/fprintf, exit, atoi, atof, strtoul) -- confirm against the repo. */
#include <stdio.h>
#include <stdlib.h>

/* TODO:
 * For expressions we should use the shunting yard algorithm.
 * This will be useful because it will make it trivial to track types.
 * If the type is a literal, we just read it; if it is a variable we read
 * the variable type from the info.
 *
 * During the first pass we count the number of variables.
 * We assign a local to each variable.
 *
 * When parsing an expression, we assign any function call or literal
 * to a temp variable slot (3 maybe?).
 * First one goes in 0, then 2nd in 1, then do operation.
 * Store the operation in 0.
 *
 * If it is a function call, use 1 to load and push the args,
 * then use 1 for the return variable.
 *
 * If both happen to be function calls use 0, 1, 2,
 * where 1 & 2 are the function calls and the result operation gets
 * stored in 0.
 */

#define DEBUG_COMPILER

/**
 * Store a single byte at code[cp].
 *
 * NOTE(review): cp is not advanced here, so back-to-back calls write to the
 * same slot -- confirm whether callers are expected to bump cp themselves.
 */
void emit_byte(u8 byte) {
#ifdef DEBUG_COMPILER
  printf("code[%d] = %d\n", cp, byte);
#endif
  code[cp] = byte;
}

/**
 * Store a 32-bit value through a u32 pointer taken at &code[cp / 4].
 *
 * NOTE(review): like emit_byte, this does not advance cp. The cp / 4 index
 * is kept exactly as written; confirm it matches the element type of code.
 */
void emit_u32(u32 value) {
#ifdef DEBUG_COMPILER
  printf("code[%d..%d] = %d\n", cp, cp + 3, value);
#endif
  u32 *c = (u32 *)(&code[cp / 4]);
  c[0] = value;
}

/**
 * Find a symbol by name, starting in scope `scope_ref` and walking up the
 * chain of parent scopes.
 *
 * @param table     scope table to search
 * @param name      symbol name (need not be NUL-terminated)
 * @param length    number of characters in `name`
 * @param scope_ref index of the scope to start from
 * @return pointer to the symbol stored inside `table`, or nil when no scope
 *         on the parent chain contains it
 */
Symbol *symbol_table_lookup(ScopeTable *table, const char *name, u32 length,
                            i32 scope_ref) {
  for (;;) {
    /* Work through a pointer instead of copying the whole scope. */
    SymbolTable *scope = &table->scopes[scope_ref];

    for (u32 i = 0; i < scope->count; i++) {
      Symbol *candidate = &scope->symbols[i];
      /* Cheap length test first, full comparison only on a length match. */
      if (candidate->name_length == length &&
          sleq(candidate->name, name, length)) {
        return candidate;
      }
    }

    /* A negative parent marks the outermost scope. */
    if (scope->parent < 0)
      return nil;
    scope_ref = scope->parent;
  }
}

/**
 * Append symbol `s` to the current scope (table->scope_ref).
 *
 * Exits the process when the name is already visible, when the scope already
 * holds 255 symbols, or when table_realloc() reports failure.
 *
 * For VAR symbols the reference is assigned here (the slot index within the
 * scope). Any other symbol keeps the reference its caller stored in `s.ref`:
 * a memory address for globals, a code address for functions and branches.
 * Previously the slot index overwrote those caller-supplied references too.
 *
 * NOTE(review): the duplicate check uses symbol_table_lookup(), which also
 * searches parent scopes, so shadowing an outer name is rejected even though
 * the message suggests creating a new scope -- confirm intended behaviour.
 *
 * @return the slot index of the new symbol within the current scope
 */
u8 symbol_table_add(ScopeTable *table, Symbol s) {
  Symbol *sym =
      symbol_table_lookup(table, s.name, s.name_length, table->scope_ref);
  if (sym != nil) {
    fprintf(stderr,
            "Error: Symbol '%.*s' already defined, in this scope"
            " please pick a different variable name or create a new scope.\n",
            s.name_length, s.name);
    exit(1);
  }

  u8 current_index = table->scopes[table->scope_ref].count;
  if (current_index + 1 > 255) {
    fprintf(stderr, "Error: Only 255 symbols are allowed per scope"
                    " first off: impressive; secondly:"
                    " just create a new scope and keep going.\n");
    exit(1);
  }

  if (!table_realloc(table)) {
    fprintf(stderr,
            "Error: Symbol table is out of memory! This is likely because you "
            " built the assembler in static mode, increase the static size."
            " if you built using malloc, that means your computer is out of"
            " memory. Close a few tabs in your web browser and try again."
            " Count was %d, while capacity was %d\n",
            table->count, table->capacity);
    exit(1);
  }

  /* Only locals take their reference from the slot index; globals and code
   * symbols arrive with the address already stored in s.ref. */
  if (s.scope == VAR) {
    s.ref = current_index;
  }

#ifdef DEBUG_COMPILER
  if (s.scope == VAR) {
    printf("$%d = %s\n", s.ref, s.name);
  } else if (s.scope == GLOBAL) {
    printf("memory[%d] = %s\n", s.ref, s.name);
  } else {
    printf("code[%d] = %s\n", s.ref, s.name);
  }
#endif

  table->scopes[table->scope_ref].symbols[current_index] = s;
  table->scopes[table->scope_ref].count++;
  return current_index;
}

/**
 * Resolve `name` (searching from the current scope outwards) and return the
 * symbol's reference. Exits the process when the name is unknown.
 */
u32 get_ref(ScopeTable *st, const char *name, u32 length) {
  Symbol *sym = symbol_table_lookup(st, name, length, st->scope_ref);
  if (!sym) {
    fprintf(stderr, "Error: Assembler has no idea what Symbol '%.*s' means.\n",
            length, name);
    exit(1);
    return 0;
  }
  return sym->ref;
}

/**
 * Turn a token into a numeric address: identifiers are resolved through the
 * symbol table, INT and NAT literals are parsed as decimal numbers.
 * Exits the process for any other token type or for a malformed NAT literal.
 */
u32 get_ptr(Token token, ScopeTable *st) {
  if (token.type == TOKEN_IDENTIFIER) {
    return get_ref(st, token.start, token.length);
  }

  if (token.type == TOKEN_LITERAL_INT) {
    return atoi(token.start);
  }

  if (token.type == TOKEN_LITERAL_NAT) {
    char *endptr;
    u32 out = (u32)strtoul(token.start, &endptr, 10);
    /* The token text is a slice of the source (start + length), not a
     * NUL-terminated string, so the parse must stop exactly at the token's
     * end. Testing *endptr against '\0' rejected every literal that was
     * followed by more source text. */
    if (endptr == token.start || endptr != token.start + token.length) {
      fprintf(stderr, "Invalid decimal literal at line %d: %.*s\n", token.line,
              token.length, token.start);
      exit(1);
    }
    return out;
  }

  fprintf(stderr, "Error: Not a pointer or symbol at line %d: %.*s\n",
          token.line, token.length, token.start);
  exit(1);
}

/**
 * Turn a token into a register number: identifiers are resolved through the
 * symbol table; a TOKEN_BIG_MONEY token is followed by one more token whose
 * text is parsed with atoi(). Exits the process for anything else.
 */
u32 get_reg(Token token, ScopeTable *st) {
  if (token.type == TOKEN_IDENTIFIER) {
    return get_ref(st, token.start, token.length);
  }

  if (token.type == TOKEN_BIG_MONEY) {
    token = next_token();
    return atoi(token.start);
  }

  fprintf(stderr, "Error: Not a register or symbol at line %d: %.*s\n",
          token.line, token.length, token.start);
  exit(1);
}

/**
 * Read the next token and require it to be an identifier or a register
 * (TOKEN_BIG_MONEY followed by one more token, which is what gets returned).
 * Exits the process otherwise.
 */
Token next_id_or_reg() {
  Token token = next_token();
  if (token.type == TOKEN_IDENTIFIER) {
    return token;
  }

  if (token.type == TOKEN_BIG_MONEY) {
    token = next_token();
    return token;
  }

  printf("Not an ID or register at line %d: %.*s\n", token.line, token.length,
         token.start);
  exit(1);

  return token;
}

/**
 * Read the next token and require it to be an identifier or a numeric
 * literal (NAT, INT or REAL). Exits the process otherwise.
 */
Token next_id_or_ptr() {
  Token token = next_token();

  if (token.type != TOKEN_IDENTIFIER && token.type != TOKEN_LITERAL_NAT &&
      token.type != TOKEN_LITERAL_INT && token.type != TOKEN_LITERAL_REAL) {
    printf("Not an ID or register at line %d: %.*s\n", token.line, token.length,
           token.start);
    exit(1);
  }
  return token;
}

/**
 * Read the next token and require it to have type `type`.
 * Exits the process on a mismatch; otherwise returns the token.
 */
Token next_token_is(TokenType type) {
  Token token = next_token();
  if (token.type != type) {
    printf("ERROR at line %d: %.*s\n", token.line, token.length, token.start);
    exit(1);
  }
  return token;
}

/**
 * Global.
 *
 * Parses `<type> <name> = <literal> ;` (the type token has already been read
 * and is passed in as `regType`), writes the initial value at the current
 * `mp` offset, advances `mp`, and registers the symbol with scope GLOBAL.
 *
 * String values are stored as a 4-byte length, the unescaped characters, and
 * a trailing NUL.
 *
 * @return false when the type token, the name length or the initializer is
 *         not supported (tokens read up to that point stay consumed);
 *         true once the symbol has been added.
 */
bool define_global(ScopeTable *st, Token regType) {
  /* NOTE(review): not referenced directly below; kept in case the WRITE_*
   * macros rely on it -- confirm before removing. */
  u32 *globals = (u32 *)(mem);
  Symbol s;

  switch (regType.type) {
  case TOKEN_TYPE_BOOL:
    s.type = BOOL;
    s.size = 1;
    break;
  case TOKEN_TYPE_I8:
    s.type = I8;
    s.size = 1;
    break;
  case TOKEN_TYPE_U8:
    s.type = U8;
    s.size = 1;
    break;
  case TOKEN_TYPE_I16:
    s.type = I16;
    s.size = 2;
    break;
  case TOKEN_TYPE_U16:
    s.type = U16;
    s.size = 2;
    break;
  case TOKEN_TYPE_INT:
    s.type = I32;
    s.size = 4;
    break;
  case TOKEN_TYPE_NAT:
    s.type = U32;
    s.size = 4;
    break;
  case TOKEN_TYPE_REAL:
    s.type = F32;
    s.size = 4;
    break;
  case TOKEN_TYPE_STR:
    s.type = STR;
    /* The real size is computed from the string literal below; start from a
     * defined value so a non-string initializer cannot advance mp by an
     * indeterminate amount. */
    s.size = 0;
    break;
  default:
    return false;
  }

  Token name = next_token_is(TOKEN_IDENTIFIER);
  /* NOTE(review): a name of exactly MAX_SYMBOL_NAME_LENGTH characters needs
   * room for the terminator written below -- confirm the size of s.name. */
  if (name.length > MAX_SYMBOL_NAME_LENGTH) {
    return false;
  }

  mcpy(s.name, (char *)name.start, name.length);
  s.name_length = name.length;
  s.name[name.length] = '\0';

  /* The global lives at the current top of the data area. */
  u32 addr = mp;
  s.ref = addr;
  s.scope = GLOBAL;

  next_token_is(TOKEN_EQ);

  Token value = next_token();
  switch (value.type) {
  case TOKEN_KEYWORD_TRUE: {
    u32 at = mp;
    WRITE_U8(at, 1);
    mp += s.size;
    break;
  }
  case TOKEN_KEYWORD_FALSE: {
    u32 at = mp;
    WRITE_U8(at, 0);
    mp += s.size;
    break;
  }
  case TOKEN_LITERAL_INT: {
    i32 out = atoi(value.start);
    u32 at = mp;
    WRITE_U32(at, out);
    mp += s.size;
    break;
  }
  case TOKEN_LITERAL_NAT: {
    char *endptr;
    u32 out = (u32)strtoul(value.start, &endptr, 10);
    /* The token is a slice of the source, so a successful parse ends at
     * value.start + value.length, not at a NUL byte. */
    if (endptr == value.start || endptr != value.start + value.length) {
      /* Print only the token, not the rest of the source buffer. */
      fprintf(stderr, "Invalid decimal literal: %.*s\n", value.length,
              value.start);
      exit(1);
    }
    u32 at = mp;
    WRITE_U32(at, out);
    mp += s.size;
    break;
  }
  case TOKEN_LITERAL_REAL: {
    i32 out = FLOAT_TO_REAL(atof(value.start));
    u32 at = mp;
    WRITE_U32(at, out);
    mp += s.size;
    break;
  }
  case TOKEN_LITERAL_STR: {
    const char *src = value.start;
    i32 len = 0;
    i32 i = 0;

    while (i < value.length) {
      char c = src[i++];
      if (c == '"') {
        /* Bare quotes are the literal's delimiters; they are not stored. */
        continue;
      }
      if (c == '\\' && i < value.length) {
        char esc = src[i++];
        switch (esc) {
        case 'n':
          c = '\n';
          break;
        case 't':
          c = '\t';
          break;
        case 'r':
          c = '\r';
          break;
        case '\\':
        case '"':
        case '\'':
          /* Store the escaped character itself. These cases used to leave
           * `c` as the backslash, so \" and \' were stored as '\\'. */
          c = esc;
          break;
        default:
          i--; /* Rewind for unknown escapes */
          break;
        }
      }
      /* Characters go after the 4-byte length prefix. */
      WRITE_U8(addr + 4 + len, c);
      len++;
    }

    u32 size = len + 5; /* 4 (len) + dst_len + 1 (null) */
    s.size = size;
    mp += size;

    WRITE_U32(addr, len);
    WRITE_U8(addr + 4 + len, '\0');
    break;
  }
  default:
    return false;
  }

  next_token_is(TOKEN_SEMICOLON);

  symbol_table_add(st, s);
  return true;
}

/**
 * Var .
*/
/*
 * define_var: register a local variable in the current scope.
 *
 * `regType` is the already-consumed type token; the variable's name is read
 * from the lexer here. The symbol is added with scope VAR, and
 * symbol_table_add() exits the process if the name is already visible.
 * This function itself exits when the type token is not a known type or when
 * the name is longer than MAX_SYMBOL_NAME_LENGTH.
 */
void define_var(ScopeTable *st, Token regType) {
  Symbol s;
  s.scope = VAR;

  switch (regType.type) {
  case TOKEN_TYPE_I8:
    s.type = I8;
    s.size = 1;
    break;
  case TOKEN_TYPE_I16:
    s.type = I16;
    s.size = 2;
    break;
  case TOKEN_TYPE_INT:
    s.type = I32;
    s.size = 4;
    break;
  case TOKEN_TYPE_U8:
    s.type = U8;
    s.size = 1;
    break;
  case TOKEN_TYPE_U16:
    s.type = U16;
    s.size = 2;
    break;
  case TOKEN_TYPE_NAT:
    s.type = U32;
    s.size = 4;
    break;
  case TOKEN_TYPE_REAL:
    s.type = F32;
    s.size = 4;
    break;
  case TOKEN_TYPE_BOOL:
    s.type = BOOL;
    s.size = 1;
    break;
  case TOKEN_TYPE_STR:
    s.type = STR;
    s.size = 4; /* not really this type, pointer alias which is 4 */
    break;
  default:
    printf("ERROR at line %d: %.*s\n", regType.line, regType.length,
           regType.start);
    exit(1);
  }

  Token name = next_token_is(TOKEN_IDENTIFIER);
  /* NOTE(review): a name of exactly MAX_SYMBOL_NAME_LENGTH characters needs
   * room for the terminator written below -- confirm the size of s.name. */
  if (name.length > MAX_SYMBOL_NAME_LENGTH) {
    /* Report the offending name token; the type token was printed here
     * before, which pointed the user at the wrong text. */
    printf("VARIABLE NAME TOO LONG at line %d: %.*s\n", name.line, name.length,
           name.start);
    exit(1);
  }

  mcpy(s.name, (void *)name.start, name.length);
  s.name[name.length] = '\0';
  s.name_length = name.length;

  symbol_table_add(st, s);
}

/**
 * Function.
*/ void define_function(ScopeTable *st) { Symbol s; s.scope = LOCAL; s.type = FUNCTION; Token name = next_token_is(TOKEN_IDENTIFIER); if (name.length > MAX_SYMBOL_NAME_LENGTH) { printf("FUNCITON NAME TOO LONG at line %d: %.*s\n", name.line, name.length, name.start); exit(1); } mcpy(s.name, (void *)name.start, name.length); s.name[name.length] = '\0'; s.name_length = name.length; next_token_is(TOKEN_LPAREN); i32 temp = st->scope_ref; st->count++; st->scopes[st->count].parent = st->scope_ref; st->scope_ref = (i32)st->count; Token next = next_token(); while (next.type != TOKEN_RPAREN) { define_var(st, next); next = next_token(); if (next.type == TOKEN_COMMA) { next = next_token(); continue; } else if (next.type == TOKEN_RPAREN) { break; } else { printf("ERROR at line %d: %.*s\n", next.line, next.length, next.start); exit(1); } } s.ref = cp; next = next_token_is(TOKEN_LBRACE); st->scope_ref = temp; // need to add to the parents scope symbol_table_add(st, s); st->scope_ref = (i32)st->count; } /** * Plex. */ void define_plex(ScopeTable *st) { Symbol s; s.scope = GLOBAL; s.type = PLEX; Token name = next_token_is(TOKEN_IDENTIFIER); if (name.length > MAX_SYMBOL_NAME_LENGTH) { printf("PLEX NAME TOO LONG at line %d: %.*s\n", name.line, name.length, name.start); exit(1); } mcpy(s.name, (void *)name.start, name.length); s.name[name.length] = '\0'; s.name_length = name.length; next_token_is(TOKEN_LPAREN); } /** * Branch. */ void define_branch(ScopeTable *st) { Symbol s; s.scope = LOCAL; s.type = VOID; Token name = next_token_is(TOKEN_IDENTIFIER); if (name.length > MAX_SYMBOL_NAME_LENGTH) { printf("BRANCH NAME TOO LONG at line %d: %.*s\n", name.line, name.length, name.start); exit(1); } mcpy(s.name, (void *)name.start, name.length); s.name_length = name.length; s.name[name.length] = '\0'; s.ref = cp; symbol_table_add(st, s); } /** * Define a loop */ void define_loop(ScopeTable *st) {} /** * Build the symbol table and calculate the types/size/offsets of all values. 
*/ void build_symbol_table(char *source, ScopeTable *st) { Token token; init_lexer(source); do { token = next_token(); if (token.type == TOKEN_ERROR) { printf("ERROR at line %d: %.*s\n", token.line, token.length, token.start); exit(1); } if (token.type != TOKEN_EOF) { if (token.type == TOKEN_LBRACE) { st->count++; st->scopes[st->count].parent = st->scope_ref; st->scope_ref = (i32)st->count; st->depth++; continue; } if (token.type == TOKEN_RBRACE) { i32 current_scope = st->scope_ref; i32 parent = st->scopes[current_scope].parent; if (parent < 0) parent = 0; st->scope_ref = parent; st->depth--; continue; } if (token.type == TOKEN_KEYWORD_PLEX) { if (st->depth != 0) { printf("I'm letting it slide, but generally plexes are declared " "outside of a scope %d: %.*s\n", token.line, token.length, token.start); } define_plex(st); continue; } if (token.type == TOKEN_KEYWORD_FN) { if (st->depth != 0) { printf("Functions can only be declared outside of a scope %d: %.*s\n", token.line, token.length, token.start); exit(1); } define_function(st); continue; } if (token.type == TOKEN_KEYWORD_CONST) { // FIXME: add consts, for now just make everything next_token(); continue; } if (token.type == TOKEN_TYPE_I8 || token.type == TOKEN_TYPE_I16 || token.type == TOKEN_TYPE_INT || token.type == TOKEN_TYPE_U8 || token.type == TOKEN_TYPE_U16 || token.type == TOKEN_TYPE_NAT || token.type == TOKEN_TYPE_REAL || token.type == TOKEN_TYPE_STR || token.type == TOKEN_TYPE_BOOL) { if (st->depth == 0) { define_global(st, token); continue; } define_var(st, token); next_token_is(TOKEN_SEMICOLON); continue; } if (token.type == TOKEN_KEYWORD_IF) { define_loop(st); continue; } if (token.type == TOKEN_KEYWORD_LOOP || token.type == TOKEN_KEYWORD_DO || token.type == TOKEN_KEYWORD_FOR) { define_branch(st); continue; } if (token.type == TOKEN_KEYWORD_RETURN) { Token next = next_token(); if (next.type == TOKEN_SEMICOLON) { /* put 0xFF as return register */ cp++; continue; } get_reg(next, st); cp++; 
next_token_is(TOKEN_SEMICOLON); continue; } #ifdef DEBUG_COMPILER printf("-- %.*s --\n", token.length, token.start); #endif } } while (token.type != TOKEN_EOF); } /** * 2nd pass, emit the bytecode */ void emit_bytecode(char *source, ScopeTable *st) { Token token; init_lexer(source); do { token = next_token(); if (token.type == TOKEN_ERROR) { printf("ERROR at line %d: %.*s\n", token.line, token.length, token.start); break; } if (token.type != TOKEN_EOF) { if (token.type == TOKEN_LBRACE) { st->count++; st->scopes[st->count].parent = st->scope_ref; st->scope_ref = (i32)st->count; st->depth++; continue; } if (token.type == TOKEN_RBRACE) { i32 current_scope = st->scope_ref; i32 parent = st->scopes[current_scope].parent; if (parent < 0) parent = 0; st->scope_ref = parent; st->depth--; continue; } if (token.type == TOKEN_KEYWORD_FN) { /* ignore, already processed */ Token next = next_token(); while (next.type != TOKEN_RPAREN) { next = next_token(); } continue; } if (token.type == TOKEN_KEYWORD_PLEX) { /* ignore, already processed */ Token next = next_token(); while (next.type != TOKEN_RPAREN) { next = next_token(); } continue; } if (token.type == TOKEN_KEYWORD_CONST) { /* ignore, already processed */ next_token(); /* type */ next_token(); /* var */ next_token(); /* reg */ next_token(); /* ; */ continue; } if (token.type == TOKEN_TYPE_I8 || token.type == TOKEN_TYPE_I16 || token.type == TOKEN_TYPE_INT || token.type == TOKEN_TYPE_U8 || token.type == TOKEN_TYPE_U16 || token.type == TOKEN_TYPE_NAT || token.type == TOKEN_TYPE_REAL || token.type == TOKEN_TYPE_STR) { /* ignore, already processed */ next_token(); /* var */ next_token(); /* reg */ next_token(); /* ; */ continue; } if (token.type == TOKEN_KEYWORD_LOOP || token.type == TOKEN_KEYWORD_IF || token.type == TOKEN_KEYWORD_ELSE || token.type == TOKEN_KEYWORD_DO || token.type == TOKEN_KEYWORD_FOR) { /* ignore, already processed */ next_token(); /* id */ } if (token.type == TOKEN_KEYWORD_RETURN) { Token next = next_token(); 
if (next.type == TOKEN_SEMICOLON) { /* put 0xFF as return register */ code[cp++] = ENCODE_B(OP_RETURN, 255, 0); continue; } u32 reg = get_reg(next, st); code[cp++] = ENCODE_B(OP_RETURN, reg, 0); next_token_is(TOKEN_SEMICOLON); continue; } #ifdef DEBUG_COMPILER printf("-- %.*s --\n", token.length, token.start); #endif if (token.type == TOKEN_IDENTIFIER) { /*} else { some other identifier printf("Unknown id at line %d: %.*s\n", token.line, token.length, token.start); exit(1); } */ } } } while (token.type != TOKEN_EOF); } /** * Compile. */ bool compile(ScopeTable *st, char *source) { build_symbol_table(source, st); cp = 0; /* actually start emitting code */ st->count = 0; emit_bytecode(source, st); return true; }