undar-lang/src/lexer.c

#include <ctype.h>
#include <string.h>

#include "lexer.h"

Lexer lexer;

void init_lexer(const char *source) {
    lexer.start = source;
    lexer.current = source;
    lexer.line = 1;
}
int is_at_end() { return *lexer.current == '\0'; }

char advance() { return *lexer.current++; }

char peek() { return *lexer.current; }

char peek_next() {
    if (is_at_end())
        return '\0';
    return lexer.current[1];
}

int match(char expected) {
    if (*lexer.current != expected)
        return 0;
    lexer.current++;
    return 1;
}
void skip_whitespace() {
    for (;;) {
        char c = peek();
        switch (c) {
        case ' ':
        case '\r':
        case '\t':
            advance();
            break;
        case '\n':
            lexer.line++;
            advance();
            break;
        case '!':
            // "!!" starts a line comment; a lone '!' is an operator and must
            // be left for next_token() to handle.
            if (peek_next() == '!') {
                while (peek() != '\n' && !is_at_end())
                    advance();
            } else {
                return;
            }
            break;
        default:
            return;
        }
    }
}
Token make_token(TokenType type) {
    Token token;
    token.type = type;
    token.start = lexer.start;
    token.length = (int)(lexer.current - lexer.start);
    token.line = lexer.line;
    return token;
}

Token error_token(const char *message) {
    Token token;
    token.type = TOKEN_ERROR;
    token.start = message;
    token.length = (int)strlen(message);
    token.line = lexer.line;
    return token;
}
int is_alpha(char c) { return isalpha((unsigned char)c) || c == '_'; }

int is_digit(char c) { return isdigit((unsigned char)c); }

Token number() {
    while (is_digit(peek()))
        advance();
    // Only consume a fractional part if a digit follows the '.'.
    if (peek() == '.' && is_digit(peek_next())) {
        advance();
        while (is_digit(peek()))
            advance();
        return make_token(TOKEN_FLOAT_LITERAL);
    }
    return make_token(TOKEN_INT_LITERAL);
}
Token string() {
    while (peek() != '"' && !is_at_end()) {
        if (peek() == '\n')
            lexer.line++;
        advance();
    }
    if (is_at_end())
        return error_token("Unterminated string.");
    advance();
    return make_token(TOKEN_STRING_LITERAL);
}
Token identifier() {
    while (is_alpha(peek()) || is_digit(peek()))
        advance();
    int length = (int)(lexer.current - lexer.start);
    const char *text = lexer.start;
    // Keywords and built-in type names are matched by exact length and spelling.
    if (length == 4 && strncmp(text, "init", 4) == 0)
        return make_token(TOKEN_KEYWORD_INIT);
    if (length == 4 && strncmp(text, "this", 4) == 0)
        return make_token(TOKEN_KEYWORD_THIS);
    if (length == 4 && strncmp(text, "type", 4) == 0)
        return make_token(TOKEN_KEYWORD_TYPE);
    if (length == 2 && strncmp(text, "fn", 2) == 0)
        return make_token(TOKEN_KEYWORD_FN);
    if (length == 3 && strncmp(text, "let", 3) == 0)
        return make_token(TOKEN_KEYWORD_LET);
    if (length == 5 && strncmp(text, "const", 5) == 0)
        return make_token(TOKEN_KEYWORD_CONST);
    if (length == 2 && strncmp(text, "if", 2) == 0)
        return make_token(TOKEN_KEYWORD_IF);
    if (length == 4 && strncmp(text, "else", 4) == 0)
        return make_token(TOKEN_KEYWORD_ELSE);
    if (length == 5 && strncmp(text, "while", 5) == 0)
        return make_token(TOKEN_KEYWORD_WHILE);
    if (length == 3 && strncmp(text, "for", 3) == 0)
        return make_token(TOKEN_KEYWORD_FOR);
    if (length == 6 && strncmp(text, "return", 6) == 0)
        return make_token(TOKEN_KEYWORD_RETURN);
    if (length == 3 && strncmp(text, "use", 3) == 0)
        return make_token(TOKEN_KEYWORD_USE);
    if (length == 2 && strncmp(text, "is", 2) == 0)
        return make_token(TOKEN_OPERATOR_IS);
    if (length == 3 && strncmp(text, "int", 3) == 0)
        return make_token(TOKEN_TYPE_INT);
    if (length == 3 && strncmp(text, "nat", 3) == 0)
        return make_token(TOKEN_TYPE_NAT);
    if (length == 3 && strncmp(text, "str", 3) == 0)
        return make_token(TOKEN_TYPE_STR);
    if (length == 4 && strncmp(text, "real", 4) == 0)
        return make_token(TOKEN_TYPE_REAL);
    return make_token(TOKEN_IDENTIFIER);
}
Token next_token() {
    skip_whitespace();
    lexer.start = lexer.current;
    if (is_at_end())
        return make_token(TOKEN_EOF);
    char c = advance();
    if (is_alpha(c))
        return identifier();
    if (is_digit(c))
        return number();
    switch (c) {
    case '(':
        return make_token(TOKEN_LPAREN);
    case ')':
        return make_token(TOKEN_RPAREN);
    case '{':
        return make_token(TOKEN_LBRACE);
    case '}':
        return make_token(TOKEN_RBRACE);
    case '[':
        return make_token(TOKEN_LBRACKET);
    case ']':
        return make_token(TOKEN_RBRACKET);
    case ',':
        return make_token(TOKEN_COMMA);
    case '.':
        return make_token(TOKEN_DOT);
    case ':':
        return make_token(TOKEN_COLON);
    case ';':
        return make_token(TOKEN_SEMICOLON);
    case '+':
        return make_token(TOKEN_PLUS);
    case '-':
        return make_token(TOKEN_MINUS);
    case '*':
        return make_token(TOKEN_STAR);
    case '/':
        return make_token(TOKEN_SLASH);
    case '!':
        return make_token(match('=') ? TOKEN_BANG_EQ : TOKEN_BANG);
    case '=':
        return make_token(match('=') ? TOKEN_EQ_EQ : TOKEN_EQ);
    case '<':
        return make_token(match('=') ? TOKEN_LTE : TOKEN_LT);
    case '>':
        return make_token(match('=') ? TOKEN_GTE : TOKEN_GT);
    case '"':
        return string();
    }
    return error_token("Unexpected character.");
}
const char *token_type_name(TokenType type) {
    switch (type) {
    case TOKEN_IDENTIFIER:
        return "identifier";
    case TOKEN_INT_LITERAL:
        return "int literal";
    case TOKEN_FLOAT_LITERAL:
        return "real literal";
    case TOKEN_STRING_LITERAL:
        return "string literal";
    case TOKEN_TYPE_INT:
        return "int";
    case TOKEN_TYPE_REAL:
        return "real";
    case TOKEN_TYPE_STR:
        return "str";
    case TOKEN_TYPE_NAT:
        return "nat";
    case TOKEN_KEYWORD_THIS:
        return "this";
    case TOKEN_KEYWORD_TYPE:
        return "type";
    case TOKEN_KEYWORD_FN:
        return "fn";
    case TOKEN_KEYWORD_LET:
        return "let";
    case TOKEN_KEYWORD_CONST:
        return "const";
    case TOKEN_KEYWORD_IF:
        return "if";
    case TOKEN_KEYWORD_ELSE:
        return "else";
    case TOKEN_KEYWORD_WHILE:
        return "while";
    case TOKEN_KEYWORD_FOR:
        return "for";
    case TOKEN_KEYWORD_RETURN:
        return "return";
    case TOKEN_KEYWORD_INIT:
        return "init";
    case TOKEN_KEYWORD_USE:
        return "use";
    case TOKEN_OPERATOR_IS:
        return "is";
    case TOKEN_BANG:
        return "!";
    case TOKEN_BANG_EQ:
        return "!=";
    case TOKEN_EQ:
        return "=";
    case TOKEN_EQ_EQ:
        return "==";
    case TOKEN_LT:
        return "<";
    case TOKEN_LTE:
        return "<=";
    case TOKEN_GT:
        return ">";
    case TOKEN_GTE:
        return ">=";
    case TOKEN_DOT:
        return ".";
    case TOKEN_COMMA:
        return ",";
    case TOKEN_COLON:
        return ":";
    case TOKEN_SEMICOLON:
        return ";";
    case TOKEN_PLUS:
        return "+";
    case TOKEN_MINUS:
        return "-";
    case TOKEN_STAR:
        return "*";
    case TOKEN_SLASH:
        return "/";
    case TOKEN_LPAREN:
        return "(";
    case TOKEN_RPAREN:
        return ")";
    case TOKEN_LBRACE:
        return "{";
    case TOKEN_RBRACE:
        return "}";
    case TOKEN_LBRACKET:
        return "[";
    case TOKEN_RBRACKET:
        return "]";
    case TOKEN_EOF:
        return "eof";
    case TOKEN_ERROR:
        return "error";
    default:
        return "unknown";
    }
}
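
/*
 * Minimal usage sketch (illustrative, not part of the original file): drives
 * the lexer over an inline source string and prints each token through
 * token_type_name. It assumes lexer.h declares Token, TokenType, and the
 * functions defined above; the LEXER_DEMO guard and the sample program text
 * are hypothetical and only here to show how the API fits together.
 */
#ifdef LEXER_DEMO
#include <stdio.h>

int main(void) {
    init_lexer("let x = 1 + 2; !! this trailing comment is skipped");
    for (;;) {
        Token t = next_token();
        printf("%-14s '%.*s' (line %d)\n", token_type_name(t.type), t.length,
               t.start, t.line);
        if (t.type == TOKEN_EOF || t.type == TOKEN_ERROR)
            break;
    }
    return 0;
}
#endif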