commit 8b5123e6050f1d24f2f6f3a000fef4665b878a2b from: Murilo Ijanc date: Thu Sep 18 14:15:00 2025 UTC Extend lexer with strings, keywords and identifiers Add support for string literals with escapes, float literals, identifiers, keywords (let, fn, if, etc), two-char tokens (==, !=, ->, <=, >=), comments (#), and line/column tracking for error messages. commit - 6004e88422764c3bf275c6a4f1be37220f847d97 commit + 8b5123e6050f1d24f2f6f3a000fef4665b878a2b blob - 2f8635a833b62092ba29d4dda0a37d510dd77cb1 blob + 2dd48afc7d320f5e104d79c933f6e2527a264729 --- src/lexer/mod.rs +++ src/lexer/mod.rs @@ -24,6 +24,8 @@ use token::{Token, TokenKind}; pub struct Lexer { source: Vec<char>, pos: usize, + line: usize, + col: usize, } impl Lexer { @@ -31,6 +33,8 @@ impl Lexer { Self { source: source.chars().collect(), pos: 0, + line: 1, + col: 1, } } @@ -40,10 +44,27 @@ impl Lexer { fn advance(&mut self) -> Option<char> { let ch = self.source.get(self.pos).copied(); + if let Some(c) = ch { + if c == '\n' { + self.line += 1; + self.col = 1; + } else { + self.col += 1; + } + } self.pos += 1; ch } + fn match_next(&mut self, expected: char) -> bool { + if self.peek() == Some(expected) { + self.advance(); + true + } else { + false + } + } + fn skip_whitespace(&mut self) { while let Some(ch) = self.peek() { if ch.is_ascii_whitespace() { @@ -54,6 +75,15 @@ impl Lexer { } } + fn skip_comment(&mut self) { + while let Some(ch) = self.peek() { + if ch == '\n' { + break; + } + self.advance(); + } + } + fn read_number(&mut self, start: usize) -> Token { while let Some(ch) = self.peek() { if ch.is_ascii_digit() { @@ -63,16 +93,113 @@ impl Lexer { } } + if self.peek() == Some('.') { + self.advance(); + while let Some(ch) = self.peek() { + if ch.is_ascii_digit() { + self.advance(); + } else { + break; + } + } + let text: String = + self.source[start..self.pos].iter().collect(); + let value: f64 = text.parse().unwrap(); + return Token::new( + TokenKind::FloatLit(value), + Span::new(start, self.pos), + ); + } + let 
text: String = self.source[start..self.pos].iter().collect(); let value: i64 = text.parse().unwrap(); - Token::new( TokenKind::IntLit(value), Span::new(start, self.pos), ) } + fn read_string( + &mut self, + start: usize, + ) -> Result<Token, OlangError> { + let mut value = String::new(); + + loop { + match self.advance() { + None => { + return Err(OlangError::new( + "unterminated string", + Span::new(start, self.pos), + )); + } + Some('"') => break, + Some('\\') => match self.advance() { + Some('n') => value.push('\n'), + Some('t') => value.push('\t'), + Some('\\') => value.push('\\'), + Some('"') => value.push('"'), + Some(c) => { + return Err(OlangError::new( + format!("invalid escape '\\{c}'"), + Span::new(self.pos - 2, self.pos), + )); + } + None => { + return Err(OlangError::new( + "unterminated escape", + Span::new(start, self.pos), + )); + } + }, + Some(c) => value.push(c), + } + } + + Ok(Token::new( + TokenKind::StrLit(value), + Span::new(start, self.pos), + )) + } + + fn read_identifier(&mut self, start: usize) -> Token { + while let Some(ch) = self.peek() { + if ch.is_ascii_alphanumeric() || ch == '_' { + self.advance(); + } else { + break; + } + } + + let text: String = + self.source[start..self.pos].iter().collect(); + let span = Span::new(start, self.pos); + + let kind = match text.as_str() { + "let" => TokenKind::Let, + "mut" => TokenKind::Mut, + "fn" => TokenKind::Fn, + "if" => TokenKind::If, + "else" => TokenKind::Else, + "for" => TokenKind::For, + "in" => TokenKind::In, + "while" => TokenKind::While, + "return" => TokenKind::Return, + "print" => TokenKind::Print, + "range" => TokenKind::Range, + "true" => TokenKind::BoolLit(true), + "false" => TokenKind::BoolLit(false), + "int" => TokenKind::IntType, + "float" => TokenKind::FloatType, + "str" => TokenKind::StrType, + "bool" => TokenKind::BoolType, + _ => TokenKind::Ident(text), + }; + + Token::new(kind, span) + } + pub fn tokenize( &mut self, ) -> Result<Vec<Token>, OlangError> { @@ -80,6 +207,7 @@ impl Lexer { loop { 
self.skip_whitespace(); + let start = self.pos; let Some(ch) = self.advance() else { @@ -90,15 +218,16 @@ impl Lexer { break; }; + if ch == '#' { + self.skip_comment(); + continue; + } + let token = match ch { '+' => Token::new( TokenKind::Plus, Span::new(start, self.pos), ), - '-' => Token::new( - TokenKind::Minus, - Span::new(start, self.pos), - ), '*' => Token::new( TokenKind::Star, Span::new(start, self.pos), @@ -107,6 +236,10 @@ impl Lexer { TokenKind::Slash, Span::new(start, self.pos), ), + '%' => Token::new( + TokenKind::Percent, + Span::new(start, self.pos), + ), '(' => Token::new( TokenKind::LParen, Span::new(start, self.pos), @@ -115,17 +248,106 @@ impl Lexer { TokenKind::RParen, Span::new(start, self.pos), ), + '{' => Token::new( + TokenKind::LBrace, + Span::new(start, self.pos), + ), + '}' => Token::new( + TokenKind::RBrace, + Span::new(start, self.pos), + ), + ',' => Token::new( + TokenKind::Comma, + Span::new(start, self.pos), + ), + ':' => Token::new( + TokenKind::Colon, + Span::new(start, self.pos), + ), ';' => Token::new( TokenKind::Semicolon, Span::new(start, self.pos), ), + '-' => { + if self.match_next('>') { + Token::new( + TokenKind::Arrow, + Span::new(start, self.pos), + ) + } else { + Token::new( + TokenKind::Minus, + Span::new(start, self.pos), + ) + } + } + '=' => { + if self.match_next('=') { + Token::new( + TokenKind::EqEq, + Span::new(start, self.pos), + ) + } else { + Token::new( + TokenKind::Eq, + Span::new(start, self.pos), + ) + } + } + '!' => { + if self.match_next('=') { + Token::new( + TokenKind::BangEq, + Span::new(start, self.pos), + ) + } else { + return Err(OlangError::new( + format!( + "unexpected char '!' 
at {}:{}", + self.line, self.col + ), + Span::new(start, self.pos), + )); + } + } + '<' => { + if self.match_next('=') { + Token::new( + TokenKind::LessEq, + Span::new(start, self.pos), + ) + } else { + Token::new( + TokenKind::Less, + Span::new(start, self.pos), + ) + } + } + '>' => { + if self.match_next('=') { + Token::new( + TokenKind::GreaterEq, + Span::new(start, self.pos), + ) + } else { + Token::new( + TokenKind::Greater, + Span::new(start, self.pos), + ) + } + } + '"' => self.read_string(start)?, c if c.is_ascii_digit() => { self.read_number(start) } + c if c.is_ascii_alphabetic() || c == '_' => { + self.read_identifier(start) + } c => { return Err(OlangError::new( format!( - "unexpected char '{c}'" + "unexpected char '{c}' at {}:{}", + self.line, self.col ), Span::new(start, self.pos), )); blob - 79d4169f9a10218c78f90f1669eb903f8e53a8cd blob + 475f5863b5f3a624a4344329bd3dc0061f7112a5 --- src/lexer/token.rs +++ src/lexer/token.rs @@ -19,14 +19,56 @@ use crate::span::Span; #[derive(Debug, Clone, PartialEq)] pub enum TokenKind { + // Literals IntLit(i64), + FloatLit(f64), + StrLit(String), + BoolLit(bool), + Ident(String), + + // Operators Plus, Minus, Star, Slash, + Percent, + Eq, + EqEq, + BangEq, + Less, + LessEq, + Greater, + GreaterEq, + Arrow, + + // Delimiters LParen, RParen, + LBrace, + RBrace, + Comma, + Colon, Semicolon, + + // Keywords + Let, + Mut, + Fn, + If, + Else, + For, + In, + While, + Return, + Print, + Range, + + // Types + IntType, + FloatType, + StrType, + BoolType, + Eof, }