// complexpr/src/lexer.rs
//
// 281 lines
// 11 KiB
// Rust

use std::rc::Rc;
use crate::{ParserError, Position, token::{Token, TokenType}};
/// Lexer state: the source characters, a cursor into them, the line/column
/// counters maintained by `advance`, and the tokens produced so far.
pub struct Lexer {
// name of file being lexed
filename: Option<Rc<str>>,
// line counter; `new` starts it at 1 and `advance` bumps it on a newline
line: usize,
// column counter; `new` starts it at 1 and `advance` resets it to 1 on a newline
col: usize,
// tokens emitted so far by `add_token`; handed out by `into_tokens`
tokens: Vec<Token>,
// index into `code` where the token currently being lexed begins (set in `lex`)
start: usize,
// index into `code` of the next character to be read
current: usize,
// the source text, stored as individual chars so it can be indexed by position
code: Vec<char>
}
impl Lexer {
/// Builds a lexer over `code`, starting at line 1, column 1 with an empty
/// token list.
///
/// `filename`, when present, is converted to an `Rc<str>` so every emitted
/// position can share it cheaply.
pub fn new(code: &str, filename: Option<String>) -> Self {
    let shared_name: Option<Rc<str>> = filename.map(Rc::<str>::from);
    let chars: Vec<char> = code.chars().collect();
    Self {
        filename: shared_name,
        line: 1,
        col: 1,
        tokens: Vec::new(),
        start: 0,
        current: 0,
        code: chars,
    }
}
pub fn into_tokens(self) -> Vec<Token> {
self.tokens
}
/// Reads the character under the cursor and steps past it, updating the
/// line/column counters.
///
/// Indexes `code` directly, so it panics if the cursor is already at the end
/// of input; callers check `at_end` first.
fn next(&mut self) -> char {
    let ch = self.code[self.current];
    let is_newline = matches!(ch, '\n');
    self.advance(is_newline);
    ch
}
/// Consumes the next character if it is one of `chars`.
///
/// Returns the matched character, or `None` (without moving the cursor) when
/// the input is exhausted or the next character is not in `chars`.
fn expect(&mut self, chars: &[char]) -> Option<char> {
    // `get` yields `None` exactly when the cursor is at or past the end.
    let upcoming = *self.code.get(self.current)?;
    let hit = chars.iter().copied().find(|&candidate| candidate == upcoming)?;
    self.advance(hit == '\n');
    Some(hit)
}
/// Reports whether the cursor has reached (or passed) the end of the input.
fn at_end(&self) -> bool {
    self.code.len() <= self.current
}
/// Returns the character under the cursor without consuming it.
///
/// Indexes `code` directly, so it panics when the cursor is at the end of
/// input.
fn peek(&self) -> char {
    let cursor = self.current;
    self.code[cursor]
}
/// Appends a token of type `ty` with source text `text`, positioned at the
/// start of the span `start..current`.
///
/// The recorded line and column are those of the token's first character.
/// For a token on a single line the start column is the current column minus
/// the token length. A token that spans one or more newlines (e.g. a
/// multi-line string) cannot use that formula, because `col` was reset at
/// each newline and the subtraction would underflow; in that case the start
/// line is recovered by subtracting the newlines inside the token, and the
/// start column by counting back from `start` to the preceding newline.
fn add_token<S>(&mut self, ty: TokenType, text: S) where S: Into<String> {
    let span = &self.code[self.start..self.current];
    let newlines = span.iter().filter(|&&c| c == '\n').count();
    let (line, col) = if newlines == 0 {
        (self.line, self.col - span.len())
    } else {
        // Columns are 1-based: one more than the number of characters
        // between the previous newline (or start of input) and the token.
        let preceding = self.code[..self.start]
            .iter()
            .rev()
            .take_while(|&&c| c != '\n')
            .count();
        (self.line - newlines, preceding + 1)
    };
    self.tokens.push(Token {
        ty,
        text: text.into(),
        pos: Position {
            file: self.filename.clone(),
            pos: self.start,
            line,
            col
        }
    });
}
/// Builds a `ParserError` carrying `msg`.
///
/// The error position uses the start index of the token being lexed together
/// with the lexer's current line and column.
fn mk_error<S>(&self, msg: S) -> ParserError where S: Into<String> {
    let message: String = msg.into();
    let pos = Position {
        file: self.filename.clone(),
        pos: self.start,
        line: self.line,
        col: self.col,
    };
    ParserError { pos, message }
}
/// Returns the source text of the span `start..current` as an owned string.
fn collect_literal(&self) -> String {
    let span = &self.code[self.start..self.current];
    let mut text = String::with_capacity(span.len());
    text.extend(span.iter());
    text
}
/// Moves the cursor forward one character and updates the position counters.
///
/// `newline` must be true when the character being stepped over is a line
/// break: the line counter is bumped and the column resets to 1. Otherwise
/// the column is incremented.
fn advance(&mut self, newline: bool) {
    self.current += 1;
    if !newline {
        self.col += 1;
        return;
    }
    self.line += 1;
    self.col = 1;
}
pub fn lex(&mut self) -> Result<(), ParserError> {
while !self.at_end() {
self.start = self.current;
match self.next() {
'+' => match self.expect(&['=']) {
Some('=') => self.add_token(TokenType::PlusEqual, "+="),
_ => self.add_token(TokenType::Plus, "+"),
},
'-' => match self.expect(&['=', '>']) {
Some('=') => self.add_token(TokenType::MinusEqual, "-="),
Some('>') => self.add_token(TokenType::Arrow, "->"),
_ => self.add_token(TokenType::Minus, "-"),
},
'*' => match self.expect(&['=']) {
Some('=') => self.add_token(TokenType::StarEqual, "*="),
_ => self.add_token(TokenType::Star, "*"),
},
'%' => match self.expect(&['=']) {
Some('=') => self.add_token(TokenType::PercentEqual, "%="),
_ => self.add_token(TokenType::Percent, "%"),
},
'/' => match self.expect(&['=', '/']) {
Some('=') => self.add_token(TokenType::SlashEqual, "/="),
Some('/') => match self.expect(&['=']) {
Some('=') => self.add_token(TokenType::DoubleSlashEqual, "//="),
_ => self.add_token(TokenType::DoubleSlash, "//")
}
_ => self.add_token(TokenType::Slash, "/"),
},
'^' => match self.expect(&['=']) {
Some('=') => self.add_token(TokenType::CaretEqual, "^="),
_ => self.add_token(TokenType::Caret, "^"),
},
'=' => match self.expect(&['=']) {
Some('=') => self.add_token(TokenType::DoubleEqual, "=="),
_ => self.add_token(TokenType::Equal, "=")
},
'!' => match self.expect(&['=']) {
Some('=') => self.add_token(TokenType::BangEqual, "!="),
_ => self.add_token(TokenType::Bang, "!")
},
'>' => match self.expect(&['=']) {
Some('=') => self.add_token(TokenType::GreaterEqual, ">="),
_ => self.add_token(TokenType::Greater, ">")
},
'<' => match self.expect(&['=']) {
Some('=') => match self.expect(&['>']) {
Some('>') => self.add_token(TokenType::Spaceship, "<=>"),
_ => self.add_token(TokenType::LessEqual, "<="),
}
_ => self.add_token(TokenType::Less, "<")
},
'&' => match self.expect(&['&']) {
Some('&') => self.add_token(TokenType::DoubleAmper, "&&"),
_ => self.add_token(TokenType::Amper, "&"),
},
'|' => match self.expect(&['|', ':', '?', '>', '&']) {
Some('|') => self.add_token(TokenType::DoublePipe, "||"),
Some(':') => self.add_token(TokenType::PipeColon, "|:"),
Some('?') => self.add_token(TokenType::PipeQuestion, "|?"),
Some('>') => self.add_token(TokenType::PipePoint, "|>"),
Some('&') => self.add_token(TokenType::PipeAmper, "|&"),
_ => self.add_token(TokenType::Pipe, "|"),
},
',' => self.add_token(TokenType::Comma, ","),
';' => self.add_token(TokenType::Semicolon, ";"),
':' => self.add_token(TokenType::Colon, ":"),
'(' => self.add_token(TokenType::LParen, "("),
')' => self.add_token(TokenType::RParen, ")"),
'[' => self.add_token(TokenType::LBrack, "["),
']' => self.add_token(TokenType::RBrack, "]"),
'{' => self.add_token(TokenType::LBrace, "{"),
'}' => self.add_token(TokenType::RBrace, "}"),
'#' => {
while !self.at_end() && self.peek() != '\n' {
self.advance(false);
}
self.advance(true);
},
'"' => self.string()?,
'\'' => self.char()?,
' ' | '\t' | '\r' | '\n' => (),
'0'..='9' => self.number()?,
'a'..='z' | 'A'..='Z' | '_' => self.ident()?,
c => return Err(self.mk_error(format!("Unexpected character: {}", c)))
}
}
Ok(())
}
/// Lexes a character literal; the opening `'` has already been consumed.
///
/// Accepts either a single raw character or a backslash escape. The escapes
/// are the same set `string` understands (`\0 \n \t \r \e \\ \"`) plus `\'`,
/// so that the quote character itself can be written.
///
/// # Errors
///
/// Returns a `ParserError` for an empty literal (`''`), an unknown escape,
/// a missing closing `'`, or end of input inside the literal.
fn char(&mut self) -> Result<(), ParserError> {
    if self.at_end() { return Err(self.mk_error("Unexpected EOF in character literal")) }
    let c = match self.next() {
        '\'' => return Err(self.mk_error("Empty character literal")),
        '\\' => {
            if self.at_end() { return Err(self.mk_error("Unexpected EOF in character literal")) }
            // TODO more escape codes! \xHH, \u{HH..} or maybe \uhhhh \Uhhhhhhhh
            match self.next() {
                '0' => '\0',
                'n' => '\n',
                't' => '\t',
                'r' => '\r',
                'e' => '\x1b',
                '\\' => '\\',
                '\'' => '\'',
                '"' => '"',
                other => return Err(self.mk_error(format!("Unknown escape code \\{}", other)))
            }
        },
        other => other,
    };
    if self.at_end() { return Err(self.mk_error("Unexpected EOF in character literal")) }
    self.expect(&['\'']).ok_or_else(|| self.mk_error("Expected ' to terminate character literal"))?;
    self.add_token(TokenType::Char(c), self.collect_literal());
    Ok(())
}
/// Lexes a string literal; the opening `"` has already been consumed.
///
/// Characters are copied into the string value until the closing `"`.
/// A backslash introduces an escape: `\0 \n \t \r \e \\ \"` map to the
/// corresponding character, and a backslash followed by a real newline
/// contributes nothing to the value. The token text is the raw source span,
/// quotes included.
///
/// # Errors
///
/// Returns a `ParserError` on an unknown escape or when the input ends
/// before the closing quote.
fn string(&mut self) -> Result<(), ParserError> {
    let mut contents = String::new();
    loop {
        if self.at_end() {
            return Err(self.mk_error("Unexpected EOF in string literal"))
        }
        let ch = self.peek();
        if ch == '"' {
            break;
        }
        if ch != '\\' {
            // Ordinary character (raw newlines are allowed inside strings).
            contents.push(ch);
            self.advance(ch == '\n');
            continue;
        }
        // Step over the backslash, then decode the escape that follows.
        self.advance(false);
        if self.at_end() {
            return Err(self.mk_error("Unexpected EOF in string literal"))
        }
        // TODO more escape codes! \xHH, \u{HH..} or maybe \uhhhh \Uhhhhhhhh
        let escape = self.peek();
        let decoded = match escape {
            '0' => Some('\0'),
            'n' => Some('\n'),
            't' => Some('\t'),
            'r' => Some('\r'),
            'e' => Some('\x1b'),
            '\\' => Some('\\'),
            '"' => Some('"'),
            '\n' => None,
            other => return Err(self.mk_error(format!("Unknown escape code \\{}", other)))
        };
        if let Some(value) = decoded {
            contents.push(value);
        }
        self.advance(escape == '\n');
    }
    // Consume the closing quote.
    self.advance(false);
    self.add_token(TokenType::String(Rc::from(contents)), self.collect_literal());
    Ok(())
}
/// Lexes a numeric literal; its first digit has already been consumed.
///
/// Consumes ASCII digits and at most one `.`; a second `.` ends the literal.
/// A trailing `i` marks an imaginary literal. The token produced is:
/// `ImFloat` when the `i` suffix is present, `Float` when the literal
/// contains a `.`, and `Int` otherwise.
///
/// Only ASCII digits are accepted: other Unicode numeric characters (for
/// example superscripts or fractions) are not valid in `f64`/`i64` syntax and
/// are left for the main loop to report as unexpected characters.
///
/// # Errors
///
/// Returns a `ParserError` when the collected text fails to parse as the
/// selected numeric type (for example an integer that overflows `i64`).
fn number(&mut self) -> Result<(), ParserError> {
    let mut has_dot = false;
    while !self.at_end() {
        match self.peek() {
            '.' if !has_dot => has_dot = true,
            c if c.is_ascii_digit() => (),
            _ => break,
        }
        self.advance(false);
    }
    let is_imag = !self.at_end() && self.peek() == 'i';
    if is_imag { self.advance(false); }
    let literal = self.collect_literal();
    if is_imag {
        // Drop the trailing `i` (one byte) before parsing the magnitude.
        let num = literal[..literal.len() - 1]
            .parse::<f64>()
            .map_err(|e| self.mk_error(format!("Error parsing float: {}", e)))?;
        self.add_token(TokenType::ImFloat(num), literal);
    } else if has_dot {
        let num = literal
            .parse::<f64>()
            .map_err(|e| self.mk_error(format!("Error parsing float: {}", e)))?;
        self.add_token(TokenType::Float(num), literal);
    } else {
        let num = literal
            .parse::<i64>()
            .map_err(|e| self.mk_error(format!("Error parsing integer: {}", e)))?;
        self.add_token(TokenType::Int(num), literal);
    }
    Ok(())
}
/// Lexes an identifier or keyword; its first character has already been
/// consumed.
///
/// Consumes ASCII letters, digits and underscores, then emits the matching
/// keyword token, or `Ident` with the word itself when it is not a keyword.
/// Never fails; the `Result` return matches the other sub-lexers.
fn ident(&mut self) -> Result<(), ParserError> {
    while self
        .code
        .get(self.current)
        .map_or(false, |ch| *ch == '_' || ch.is_ascii_alphanumeric())
    {
        self.advance(false);
    }
    let word = self.collect_literal();
    let kind = match word.as_str() {
        "true" => TokenType::True,
        "false" => TokenType::False,
        "nil" => TokenType::Nil,
        "if" => TokenType::If,
        "elif" => TokenType::Elif,
        "else" => TokenType::Else,
        "while" => TokenType::While,
        "for" => TokenType::For,
        "let" => TokenType::Let,
        "break" => TokenType::Break,
        "continue" => TokenType::Continue,
        "return" => TokenType::Return,
        name => TokenType::Ident(Rc::from(name))
    };
    self.add_token(kind, word);
    Ok(())
}
}