1
0
Fork 0
undar-lang-old/src/tools/lexer.c

406 lines
10 KiB
C

#include <string.h>
#include "../vm/common.h"
#include "lexer.h"
/*
 * Cursor state for the single, file-wide lexer instance.
 * The lexeme of the token being scanned is the half-open range
 * [start, current) inside the source string given to initLexer().
 */
typedef struct {
/* First character of the token currently being scanned. */
const char *start;
/* Next character to be consumed. */
const char *current;
/* 1-based source line of `current`; incremented whenever a '\n' is consumed. */
int line;
} Lexer;
/* The one lexer instance that every function in this file reads and mutates. */
Lexer lexer;
/*
 * Reset the global lexer so that scanning begins at the first character of
 * `source` on line 1.
 *
 * The pointer is stored, not copied: tokens produced afterwards point into
 * `source`, so the string must stay alive while those tokens are in use.
 */
void initLexer(const char *source) {
  lexer.line = 1;
  lexer.start = lexer.current = source;
}
/*
 * Return true when `c` may appear in an identifier as a non-digit character:
 * an ASCII letter of either case, or an underscore.
 */
static bool isAlpha(char c) {
  if (c == '_') {
    return true;
  }
  if ('a' <= c && c <= 'z') {
    return true;
  }
  return 'A' <= c && c <= 'Z';
}
/* Return true when `c` is one of the ASCII decimal digits '0' through '9'. */
static bool isDigit(char c) {
  if (c < '0') {
    return false;
  }
  return c <= '9';
}
/* Return true when the cursor sits on the source string's NUL terminator. */
static bool isAtEnd() {
  return lexer.current[0] == '\0';
}
/*
 * Consume one character and return it.
 * No end-of-input check is made here; callers test isAtEnd() or peek() first.
 */
static char advance() {
  const char consumed = *lexer.current;
  lexer.current += 1;
  return consumed;
}
/* Return the character under the cursor without consuming it. */
static char peek() {
  return lexer.current[0];
}
/*
 * Return the character one past the cursor without consuming anything.
 * Yields '\0' when the cursor is already on the terminator, so the read never
 * goes beyond the end of the source string.
 */
static char peekNext() {
  return isAtEnd() ? '\0' : lexer.current[1];
}
/*
 * Conditionally consume one character.
 * Returns true and advances the cursor only when input remains and the
 * character under the cursor equals `expected`; otherwise leaves the cursor
 * where it is and returns false.
 */
static bool match(char expected) {
  if (isAtEnd() || *lexer.current != expected) {
    return false;
  }
  lexer.current += 1;
  return true;
}
/*
 * Build a token of kind `type` whose lexeme is the source range
 * [lexer.start, lexer.current) and whose line is the lexer's current line.
 * The token refers to the source text in place; nothing is copied.
 */
static Token makeToken(TokenType type) {
  Token result;
  result.line = lexer.line;
  result.type = type;
  result.start = lexer.start;
  result.length = (int)(lexer.current - lexer.start);
  return result;
}
/*
 * Build a TOKEN_ERROR token carrying a diagnostic instead of source text.
 * `start`/`length` describe `message` itself (the pointer is stored, not
 * copied), and `line` is the lexer's current line.
 */
static Token errorToken(const char *message) {
  Token failure;
  failure.line = lexer.line;
  failure.type = TOKEN_ERROR;
  failure.length = (int)strlen(message);
  failure.start = message;
  return failure;
}
/*
 * Advance the cursor past everything that is not part of a token: spaces,
 * tabs, carriage returns, newlines, `// ...` line comments and block comments.
 * Every consumed '\n' bumps lexer.line, including those inside block comments.
 *
 * Returns with the cursor on the first character of the next token (or on the
 * terminator). A lone '/' is left in place for the tokenizer. A block comment
 * that is never closed simply runs to the end of input.
 */
static void skipWhitespace() {
  for (;;) {
    const char c = peek();

    if (c == ' ' || c == '\r' || c == '\t') {
      advance();
      continue;
    }

    if (c == '\n') {
      lexer.line++;
      advance();
      continue;
    }

    if (c != '/') {
      return;
    }

    const char following = peekNext();

    if (following == '/') {
      /* Line comment: stop on the newline so the branch above counts it. */
      while (peek() != '\n' && !isAtEnd()) {
        advance();
      }
      continue;
    }

    if (following != '*') {
      /* A plain '/' is an operator, not a comment. */
      return;
    }

    /* Block comment: step over the opening delimiter, then scan for the end. */
    advance();
    advance();
    while (!isAtEnd()) {
      if (peek() == '*' && peekNext() == '/') {
        advance();
        advance();
        break;
      }
      if (peek() == '\n') {
        lexer.line++;
      }
      advance();
    }
  }
}
/*
 * Finish matching a keyword whose first `start` characters the caller has
 * already verified.
 *
 * start  - number of leading lexeme characters already matched
 * length - number of characters in `rest`
 * rest   - the remaining characters the keyword must end with
 * type   - token type to return on a full match
 *
 * Returns `type` only when the current lexeme is exactly `start + length`
 * characters long and its tail equals `rest`; otherwise TOKEN_IDENTIFIER.
 */
static TokenType checkKeyword(int start, int length, const char *rest,
                              TokenType type) {
  if (lexer.current - lexer.start != start + length) {
    return TOKEN_IDENTIFIER;
  }
  if (memcmp(lexer.start + start, rest, length) != 0) {
    return TOKEN_IDENTIFIER;
  }
  return type;
}
/*
 * Classify the lexeme [lexer.start, lexer.current) as a reserved word or a
 * plain identifier.
 *
 * The leading one to three characters select a branch; checkKeyword() then
 * verifies the exact total length and the remaining characters. Each
 * checkKeyword(start, length, rest, ...) call must satisfy
 * length == strlen(rest) and start + length == strlen(keyword).
 *
 * Fixed here: "init" and "int" previously passed lengths one too large
 * (2 for "t", 1 for ""), so neither could ever match and both were lexed as
 * TOKEN_IDENTIFIER.
 *
 * NOTE(review): TOKEN_OPERATOR_IS, TOKEN_OPERATOR_NOT and TOKEN_TYPE_REAL are
 * named in tokenTypeToString() but no spelling maps to them here - confirm
 * whether that is intentional.
 */
static TokenType identifierType() {
  const int lexemeLength = (int)(lexer.current - lexer.start);

  switch (lexer.start[0]) {
  case 'a':
    return checkKeyword(1, 2, "nd", TOKEN_OPERATOR_AND);
  case 'c':
    if (lexemeLength > 1) {
      switch (lexer.start[1]) {
      case 'l':
        return checkKeyword(2, 3, "ose", TOKEN_KEYWORD_CLOSE);
      case 'o':
        return checkKeyword(2, 3, "nst", TOKEN_KEYWORD_CONST);
      }
    }
    break;
  case 'e':
    return checkKeyword(1, 3, "lse", TOKEN_KEYWORD_ELSE);
  case 'f':
    if (lexemeLength > 1) {
      switch (lexer.start[1]) {
      case 'a':
        return checkKeyword(2, 3, "lse", TOKEN_KEYWORD_FALSE);
      case 'o':
        return checkKeyword(2, 1, "r", TOKEN_KEYWORD_FOR);
      }
      /* Any other second character: the only candidate left is "function". */
      return checkKeyword(1, 7, "unction", TOKEN_KEYWORD_FN);
    }
    break;
  case 'i':
    if (lexemeLength > 1) {
      switch (lexer.start[1]) {
      case 'f':
        return checkKeyword(2, 0, "", TOKEN_KEYWORD_IF);
      case 'n':
        if (lexemeLength > 2) {
          switch (lexer.start[2]) {
          case 'i':
            /* "init": three characters matched, one ("t") remaining. */
            return checkKeyword(3, 1, "t", TOKEN_KEYWORD_INIT);
          case 't':
            /* "int": all three characters matched, nothing remaining. */
            return checkKeyword(3, 0, "", TOKEN_TYPE_INT);
          }
        }
        break;
      }
    }
    break;
  case 'n':
    if (lexemeLength > 1) {
      switch (lexer.start[1]) {
      case 'a':
        return checkKeyword(2, 1, "t", TOKEN_TYPE_NAT);
      case 'i':
        return checkKeyword(2, 1, "l", TOKEN_KEYWORD_NIL);
      }
    }
    break;
  case 'o':
    if (lexemeLength > 1) {
      switch (lexer.start[1]) {
      case 'p':
        return checkKeyword(2, 2, "en", TOKEN_KEYWORD_OPEN);
      case 'r':
        return checkKeyword(2, 0, "", TOKEN_OPERATOR_OR);
      }
    }
    break;
  case 'p':
    if (lexemeLength > 1) {
      switch (lexer.start[1]) {
      case 'l':
        return checkKeyword(2, 2, "ex", TOKEN_KEYWORD_PLEX);
      }
    }
    break;
  case 'r':
    if (lexemeLength > 1) {
      switch (lexer.start[1]) {
      case 'e':
        if (lexemeLength > 2) {
          switch (lexer.start[2]) {
          case 'a':
            return checkKeyword(3, 1, "d", TOKEN_KEYWORD_READ);
          case 'f':
            return checkKeyword(3, 4, "resh", TOKEN_KEYWORD_REFRESH);
          case 't':
            return checkKeyword(3, 3, "urn", TOKEN_KEYWORD_RETURN);
          }
        }
        break;
      }
    }
    break;
  case 's':
    if (lexemeLength > 1) {
      switch (lexer.start[1]) {
      case 't':
        return checkKeyword(2, 1, "r", TOKEN_TYPE_STR);
      }
    }
    break;
  case 't':
    if (lexemeLength > 1) {
      switch (lexer.start[1]) {
      case 'h':
        return checkKeyword(2, 2, "is", TOKEN_KEYWORD_THIS);
      case 'r':
        return checkKeyword(2, 2, "ue", TOKEN_KEYWORD_TRUE);
      }
    }
    break;
  case 'u':
    if (lexemeLength > 1) {
      switch (lexer.start[1]) {
      case 's':
        return checkKeyword(2, 1, "e", TOKEN_KEYWORD_USE);
      }
    }
    break;
  case 'w':
    if (lexemeLength > 1) {
      switch (lexer.start[1]) {
      case 'h':
        return checkKeyword(2, 3, "ile", TOKEN_KEYWORD_WHILE);
      case 'r':
        return checkKeyword(2, 3, "ite", TOKEN_KEYWORD_WRITE);
      }
    }
    break;
  }
  return TOKEN_IDENTIFIER;
}
static Token identifier() {
while (isAlpha(peek()) || isDigit(peek()))
advance();
return makeToken(identifierType());
}
/*
 * Scan the remainder of a numeric literal whose first digit the caller has
 * already consumed.
 *
 * Returns TOKEN_FLOAT_LITERAL when the digits are followed by '.' and at
 * least one more digit; otherwise TOKEN_INT_LITERAL, leaving any trailing '.'
 * unconsumed.
 */
static Token number() {
  while (isDigit(peek())) {
    advance();
  }

  /* A '.' only belongs to the number when a digit follows it. */
  if (peek() != '.' || !isDigit(peekNext())) {
    return makeToken(TOKEN_INT_LITERAL);
  }

  advance(); /* the '.' */
  while (isDigit(peek())) {
    advance();
  }
  return makeToken(TOKEN_FLOAT_LITERAL);
}
/*
 * Scan the remainder of a string literal; the opening '"' has already been
 * consumed by the caller.
 *
 * Consumes through the closing '"' and returns TOKEN_STRING_LITERAL whose
 * lexeme includes both quotes. Newlines inside the literal are allowed and
 * counted in lexer.line. Reaching the end of input first yields an
 * "Unterminated string." error token. No escape sequences are interpreted.
 */
static Token string() {
  for (;;) {
    if (isAtEnd()) {
      return errorToken("Unterminated string.");
    }
    const char consumed = advance();
    if (consumed == '"') {
      return makeToken(TOKEN_STRING_LITERAL);
    }
    if (consumed == '\n') {
      lexer.line++;
    }
  }
}
Token nextToken() {
skipWhitespace();
lexer.start = lexer.current;
if (isAtEnd())
return makeToken(TOKEN_EOF);
char c = advance();
if (isAlpha(c))
return identifier();
if (isDigit(c))
return number();
switch (c) {
case '(':
return makeToken(TOKEN_LPAREN);
case ')':
return makeToken(TOKEN_RPAREN);
case '{':
return makeToken(TOKEN_LBRACE);
case '}':
return makeToken(TOKEN_RBRACE);
case '[':
return makeToken(TOKEN_LBRACKET);
case ']':
return makeToken(TOKEN_RBRACKET);
case ';':
return makeToken(TOKEN_SEMICOLON);
case ',':
return makeToken(TOKEN_COMMA);
case '.':
return makeToken(TOKEN_DOT);
case '-':
return makeToken(TOKEN_MINUS);
case '+':
return makeToken(TOKEN_PLUS);
case '/':
return makeToken(TOKEN_SLASH);
case '*':
return makeToken(TOKEN_STAR);
case '!':
return makeToken(match('=') ? TOKEN_BANG_EQ : TOKEN_BANG);
case '=':
return makeToken(match('=') ? TOKEN_EQ_EQ : TOKEN_EQ);
case '<':
return makeToken(match('=') ? TOKEN_LTE : TOKEN_LT);
case '>':
return makeToken(match('=') ? TOKEN_GTE : TOKEN_GT);
case '"':
return string();
}
return errorToken("Unexpected character.");
}
/*
 * Map a token type to a printable name.
 *
 * Returns a pointer to a string literal (never NULL, must not be freed):
 * the enumerator name with its "TOKEN_" prefix removed, or "UNKNOWN_TOKEN"
 * for a value that has no case below.
 *
 * The OPEN/READ/WRITE/REFRESH/CLOSE entries previously kept the "TOKEN_"
 * prefix, unlike every other entry; they now follow the same convention.
 */
const char* tokenTypeToString(TokenType type) {
  switch (type) {
  case TOKEN_EOF: return "EOF";
  case TOKEN_IDENTIFIER: return "IDENTIFIER";
  case TOKEN_INT_LITERAL: return "INT_LITERAL";
  case TOKEN_UINT_LITERAL: return "UINT_LITERAL";
  case TOKEN_FLOAT_LITERAL: return "FLOAT_LITERAL";
  case TOKEN_STRING_LITERAL: return "STRING_LITERAL";
  case TOKEN_TYPE_INT: return "TYPE_INT";
  case TOKEN_TYPE_NAT: return "TYPE_NAT";
  case TOKEN_TYPE_REAL: return "TYPE_REAL";
  case TOKEN_TYPE_STR: return "TYPE_STR";
  case TOKEN_KEYWORD_PLEX: return "KEYWORD_PLEX";
  case TOKEN_KEYWORD_FN: return "KEYWORD_FN";
  case TOKEN_KEYWORD_CONST: return "KEYWORD_CONST";
  case TOKEN_KEYWORD_IF: return "KEYWORD_IF";
  case TOKEN_KEYWORD_ELSE: return "KEYWORD_ELSE";
  case TOKEN_KEYWORD_WHILE: return "KEYWORD_WHILE";
  case TOKEN_KEYWORD_FOR: return "KEYWORD_FOR";
  case TOKEN_KEYWORD_RETURN: return "KEYWORD_RETURN";
  case TOKEN_KEYWORD_USE: return "KEYWORD_USE";
  case TOKEN_KEYWORD_INIT: return "KEYWORD_INIT";
  case TOKEN_KEYWORD_THIS: return "KEYWORD_THIS";
  case TOKEN_KEYWORD_OPEN: return "KEYWORD_OPEN";
  case TOKEN_KEYWORD_READ: return "KEYWORD_READ";
  case TOKEN_KEYWORD_WRITE: return "KEYWORD_WRITE";
  case TOKEN_KEYWORD_REFRESH: return "KEYWORD_REFRESH";
  case TOKEN_KEYWORD_CLOSE: return "KEYWORD_CLOSE";
  case TOKEN_KEYWORD_NIL: return "KEYWORD_NIL";
  case TOKEN_KEYWORD_TRUE: return "KEYWORD_TRUE";
  case TOKEN_KEYWORD_FALSE: return "KEYWORD_FALSE";
  case TOKEN_OPERATOR_IS: return "OPERATOR_IS";
  case TOKEN_OPERATOR_NOT: return "OPERATOR_NOT";
  case TOKEN_OPERATOR_AND: return "OPERATOR_AND";
  case TOKEN_OPERATOR_OR: return "OPERATOR_OR";
  case TOKEN_BANG: return "BANG";
  case TOKEN_BANG_EQ: return "BANG_EQ";
  case TOKEN_EQ: return "EQ";
  case TOKEN_EQ_EQ: return "EQ_EQ";
  case TOKEN_GT: return "GT";
  case TOKEN_LT: return "LT";
  case TOKEN_GTE: return "GTE";
  case TOKEN_LTE: return "LTE";
  case TOKEN_DOT: return "DOT";
  case TOKEN_COMMA: return "COMMA";
  case TOKEN_COLON: return "COLON";
  case TOKEN_SEMICOLON: return "SEMICOLON";
  case TOKEN_PLUS: return "PLUS";
  case TOKEN_MINUS: return "MINUS";
  case TOKEN_STAR: return "STAR";
  case TOKEN_SLASH: return "SLASH";
  case TOKEN_LPAREN: return "LPAREN";
  case TOKEN_RPAREN: return "RPAREN";
  case TOKEN_LBRACE: return "LBRACE";
  case TOKEN_RBRACE: return "RBRACE";
  case TOKEN_LBRACKET: return "LBRACKET";
  case TOKEN_RBRACKET: return "RBRACKET";
  case TOKEN_ERROR: return "ERROR";
  default: return "UNKNOWN_TOKEN";
  }
}