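// Lexer for the language: scans source text into a flat stream of tokens.
// The lexer state is a single global, initialized with init_lexer() and
// consumed one token at a time with next_token(). Tokens point back into the
// source buffer rather than owning copies of their text.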
#include "lexer.h"

#include <ctype.h>
#include <string.h>

Lexer lexer;

void init_lexer(const char *source) {
  lexer.start = source;
  lexer.current = source;
  lexer.line = 1;
}

int is_at_end() { return *lexer.current == '\0'; }

char advance() { return *lexer.current++; }

char peek() { return *lexer.current; }

char peek_next() {
  if (is_at_end())
    return '\0';
  return lexer.current[1];
}

int match(char expected) {
  if (*lexer.current != expected)
    return 0;
  lexer.current++;
  return 1;
}

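// Consume whitespace and comments between tokens, counting newlines so that
// tokens carry accurate line numbers.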
void skip_whitespace() {
  for (;;) {
    char c = peek();
    switch (c) {
    case ' ':
    case '\r':
    case '\t':
      advance();
      break;
    case '\n':
      lexer.line++;
      advance();
      break;
    case '!':
      if (peek_next() == '!') {
        // "!!" starts a comment that runs to the end of the line.
        while (peek() != '\n' && !is_at_end())
          advance();
      } else {
        // A lone '!' is an operator; leave it for next_token().
        return;
      }
      break;
    default:
      return;
    }
  }
}

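// make_token() builds a token for the lexeme between lexer.start and
// lexer.current; error_token() points the token at a static message string
// instead of the source text.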
Token make_token(TokenType type) {
  Token token;
  token.type = type;
  token.start = lexer.start;
  token.length = (int)(lexer.current - lexer.start);
  token.line = lexer.line;
  return token;
}

Token error_token(const char *message) {
  Token token;
  token.type = TOKEN_ERROR;
  token.start = message;
  token.length = (int)strlen(message);
  token.line = lexer.line;
  return token;
}

int is_alpha(char c) { return isalpha((unsigned char)c) || c == '_'; }

int is_digit(char c) { return isdigit((unsigned char)c); }

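// Scan an integer literal, or a real literal if a '.' followed by a digit
// appears; a trailing '.' with no digit after it is left for the dot token.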
Token number() {
  while (is_digit(peek()))
    advance();

  if (peek() == '.' && is_digit(peek_next())) {
    advance();
    while (is_digit(peek()))
      advance();
    return make_token(TOKEN_FLOAT_LITERAL);
  }

  return make_token(TOKEN_INT_LITERAL);
}

Token string() {
  while (peek() != '"' && !is_at_end()) {
    if (peek() == '\n')
      lexer.line++;
    advance();
  }

  if (is_at_end())
    return error_token("Unterminated string.");

  advance();
  return make_token(TOKEN_STRING_LITERAL);
}

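// Scan an identifier, then check it against the keyword and built-in type
// names by length and strncmp(); anything that does not match is a plain
// identifier.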
Token identifier() {
  while (is_alpha(peek()) || is_digit(peek()))
    advance();

  int length = (int)(lexer.current - lexer.start);
  const char *text = lexer.start;

  if (length == 4 && strncmp(text, "init", 4) == 0)
    return make_token(TOKEN_KEYWORD_INIT);
  if (length == 4 && strncmp(text, "this", 4) == 0)
    return make_token(TOKEN_KEYWORD_THIS);
  if (length == 4 && strncmp(text, "type", 4) == 0)
    return make_token(TOKEN_KEYWORD_TYPE);
  if (length == 2 && strncmp(text, "fn", 2) == 0)
    return make_token(TOKEN_KEYWORD_FN);
  if (length == 3 && strncmp(text, "let", 3) == 0)
    return make_token(TOKEN_KEYWORD_LET);
  if (length == 5 && strncmp(text, "const", 5) == 0)
    return make_token(TOKEN_KEYWORD_CONST);
  if (length == 2 && strncmp(text, "if", 2) == 0)
    return make_token(TOKEN_KEYWORD_IF);
  if (length == 4 && strncmp(text, "else", 4) == 0)
    return make_token(TOKEN_KEYWORD_ELSE);
  if (length == 5 && strncmp(text, "while", 5) == 0)
    return make_token(TOKEN_KEYWORD_WHILE);
  if (length == 3 && strncmp(text, "for", 3) == 0)
    return make_token(TOKEN_KEYWORD_FOR);
  if (length == 6 && strncmp(text, "return", 6) == 0)
    return make_token(TOKEN_KEYWORD_RETURN);
  if (length == 3 && strncmp(text, "use", 3) == 0)
    return make_token(TOKEN_KEYWORD_USE);
  if (length == 2 && strncmp(text, "is", 2) == 0)
    return make_token(TOKEN_OPERATOR_IS);
  if (length == 3 && strncmp(text, "int", 3) == 0)
    return make_token(TOKEN_TYPE_INT);
  if (length == 3 && strncmp(text, "nat", 3) == 0)
    return make_token(TOKEN_TYPE_NAT);
  if (length == 3 && strncmp(text, "str", 3) == 0)
    return make_token(TOKEN_TYPE_STR);
  if (length == 4 && strncmp(text, "real", 4) == 0)
    return make_token(TOKEN_TYPE_REAL);

  return make_token(TOKEN_IDENTIFIER);
}

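// Return the next token from the source passed to init_lexer(). A typical
// driver loop (sketch, assuming a caller-supplied NUL-terminated `source`
// string and the Token/TokenType definitions in lexer.h):
//
//   init_lexer(source);
//   for (Token t = next_token(); t.type != TOKEN_EOF; t = next_token())
//     printf("%s '%.*s'\n", token_type_name(t.type), t.length, t.start);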
Token next_token() {
  skip_whitespace();
  lexer.start = lexer.current;

  if (is_at_end())
    return make_token(TOKEN_EOF);

  char c = advance();

  if (is_alpha(c))
    return identifier();
  if (is_digit(c))
    return number();

  switch (c) {
  case '(':
    return make_token(TOKEN_LPAREN);
  case ')':
    return make_token(TOKEN_RPAREN);
  case '{':
    return make_token(TOKEN_LBRACE);
  case '}':
    return make_token(TOKEN_RBRACE);
  case '[':
    return make_token(TOKEN_LBRACKET);
  case ']':
    return make_token(TOKEN_RBRACKET);
  case ',':
    return make_token(TOKEN_COMMA);
  case '.':
    return make_token(TOKEN_DOT);
  case ':':
    return make_token(TOKEN_COLON);
  case ';':
    return make_token(TOKEN_SEMICOLON);
  case '+':
    return make_token(TOKEN_PLUS);
  case '-':
    return make_token(TOKEN_MINUS);
  case '*':
    return make_token(TOKEN_STAR);
  case '/':
    return make_token(TOKEN_SLASH);
  case '!':
    return make_token(match('=') ? TOKEN_BANG_EQ : TOKEN_BANG);
  case '=':
    return make_token(match('=') ? TOKEN_EQ_EQ : TOKEN_EQ);
  case '<':
    return make_token(match('=') ? TOKEN_LTE : TOKEN_LT);
  case '>':
    return make_token(match('=') ? TOKEN_GTE : TOKEN_GT);
  case '"':
    return string();
  }

  return error_token("Unexpected character.");
}

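// Human-readable name for a token type; keywords and operators map to their
// lexeme.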
const char *token_type_name(TokenType type) {
  switch (type) {
  case TOKEN_IDENTIFIER:
    return "identifier";
  case TOKEN_INT_LITERAL:
    return "int literal";
  case TOKEN_FLOAT_LITERAL:
    return "real literal";
  case TOKEN_STRING_LITERAL:
    return "string literal";
  case TOKEN_TYPE_INT:
    return "int";
  case TOKEN_TYPE_REAL:
    return "real";
  case TOKEN_TYPE_STR:
    return "str";
  case TOKEN_TYPE_NAT:
    return "nat";
  case TOKEN_KEYWORD_THIS:
    return "this";
  case TOKEN_KEYWORD_TYPE:
    return "type";
  case TOKEN_KEYWORD_FN:
    return "fn";
  case TOKEN_KEYWORD_LET:
    return "let";
  case TOKEN_KEYWORD_CONST:
    return "const";
  case TOKEN_KEYWORD_IF:
    return "if";
  case TOKEN_KEYWORD_ELSE:
    return "else";
  case TOKEN_KEYWORD_WHILE:
    return "while";
  case TOKEN_KEYWORD_FOR:
    return "for";
  case TOKEN_KEYWORD_RETURN:
    return "return";
  case TOKEN_KEYWORD_INIT:
    return "init";
  case TOKEN_KEYWORD_USE:
    return "use";
  case TOKEN_OPERATOR_IS:
    return "is";
  case TOKEN_BANG:
    return "!";
  case TOKEN_BANG_EQ:
    return "!=";
  case TOKEN_EQ:
    return "=";
  case TOKEN_EQ_EQ:
    return "==";
  case TOKEN_LT:
    return "<";
  case TOKEN_LTE:
    return "<=";
  case TOKEN_GT:
    return ">";
  case TOKEN_GTE:
    return ">=";
  case TOKEN_DOT:
    return ".";
  case TOKEN_COMMA:
    return ",";
  case TOKEN_COLON:
    return ":";
  case TOKEN_SEMICOLON:
    return ";";
  case TOKEN_PLUS:
    return "+";
  case TOKEN_MINUS:
    return "-";
  case TOKEN_STAR:
    return "*";
  case TOKEN_SLASH:
    return "/";
  case TOKEN_LPAREN:
    return "(";
  case TOKEN_RPAREN:
    return ")";
  case TOKEN_LBRACE:
    return "{";
  case TOKEN_RBRACE:
    return "}";
  case TOKEN_LBRACKET:
    return "[";
  case TOKEN_RBRACKET:
    return "]";
  case TOKEN_EOF:
    return "eof";
  case TOKEN_ERROR:
    return "error";
  default:
    return "unknown";
  }
}