commit 6004e88422764c3bf275c6a4f1be37220f847d97
from: Murilo Ijanc
date: Fri Aug 22 19:45:00 2025 UTC

Add basic lexer with tokenize command

Lexer tokenizes numbers, arithmetic operators, parentheses
and semicolons. CLI tokenize subcommand reads .ol files
and prints token stream.

commit - 2f626af47071623ff393a568fdf765526342fb27
commit + 6004e88422764c3bf275c6a4f1be37220f847d97
blob - cc1182a89a75f06c6d393998c54b36e0df043a9f
blob + e91891d1c7e74a8aa6eb5a5fd993596ad05f0eb4
--- src/main.rs
+++ src/main.rs
@@ -15,6 +15,64 @@
 // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 //
 
+mod error;
+mod lexer;
+mod span;
+
+use std::env;
+use std::fs;
+use std::process;
+
+/// Entry point: expects `ol <command> <file>`, reads the file and runs
+/// the requested command on its contents. Exits with status 1 on failure.
 fn main() {
-    println!("Hello, world!");
+    let args: Vec<String> = env::args().collect();
+
+    if args.len() < 3 {
+        eprintln!("usage: ol <command> <file>");
+        eprintln!("commands: tokenize");
+        process::exit(1);
+    }
+
+    let command = &args[1];
+    let filename = &args[2];
+
+    let source = match fs::read_to_string(filename) {
+        Ok(s) => s,
+        Err(e) => {
+            eprintln!(
+                "error reading '{filename}': {e}"
+            );
+            process::exit(1);
+        }
+    };
+
+    match command.as_str() {
+        "tokenize" => cmd_tokenize(&source),
+        _ => {
+            eprintln!("unknown command: {command}");
+            process::exit(1);
+        }
+    }
 }
+
+/// Lexes `source` and prints the `Debug` form of every token kind on one
+/// line, separated by spaces. On a lexer error, prints it to stderr and
+/// exits with status 1.
+fn cmd_tokenize(source: &str) {
+    let mut lexer = lexer::Lexer::new(source);
+
+    match lexer.tokenize() {
+        Ok(tokens) => {
+            let parts: Vec<String> = tokens
+                .iter()
+                .map(|t| format!("{:?}", t.kind))
+                .collect();
+            println!("{}", parts.join(" "));
+        }
+        Err(e) => {
+            eprintln!("{e}");
+            process::exit(1);
+        }
+    }
+}
blob - /dev/null
blob + 2f8635a833b62092ba29d4dda0a37d510dd77cb1 (mode 644)
--- /dev/null
+++ src/lexer/mod.rs
@@ -0,0 +1,157 @@
+// vim: set tw=79 cc=80 ts=4 sw=4 sts=4 et :
+//
+// Copyright (c) 2025-2026 Murilo Ijanc'
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+//
+
+pub mod token;
+
+use crate::error::OlangError;
+use crate::span::Span;
+use token::{Token, TokenKind};
+
+/// Scanner that turns source text into a flat list of tokens.
+pub struct Lexer {
+    /// Source text split into `char`s; positions are char indices.
+    source: Vec<char>,
+    /// Index into `source` of the next character to read.
+    pos: usize,
+}
+
+impl Lexer {
+    /// Creates a lexer positioned at the start of `source`.
+    pub fn new(source: &str) -> Self {
+        Self {
+            source: source.chars().collect(),
+            pos: 0,
+        }
+    }
+
+    /// Returns the next character without consuming it.
+    fn peek(&self) -> Option<char> {
+        self.source.get(self.pos).copied()
+    }
+
+    /// Consumes and returns the next character. At end of input this
+    /// returns `None` and leaves `pos` where it was.
+    fn advance(&mut self) -> Option<char> {
+        let ch = self.peek()?;
+        self.pos += 1;
+        Some(ch)
+    }
+
+    /// Skips a run of ASCII whitespace, if any.
+    fn skip_whitespace(&mut self) {
+        while let Some(ch) = self.peek() {
+            if !ch.is_ascii_whitespace() {
+                break;
+            }
+            self.advance();
+        }
+    }
+
+    /// Reads the rest of a decimal integer literal whose first digit,
+    /// at index `start`, has already been consumed.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when the digits do not fit in an `i64`.
+    fn read_number(
+        &mut self,
+        start: usize,
+    ) -> Result<Token, OlangError> {
+        while let Some(ch) = self.peek() {
+            if !ch.is_ascii_digit() {
+                break;
+            }
+            self.advance();
+        }
+
+        let text: String =
+            self.source[start..self.pos].iter().collect();
+
+        // The text is all ASCII digits, so parsing can only fail
+        // when the value is too large for `i64`.
+        match text.parse::<i64>() {
+            Ok(value) => Ok(Token::new(
+                TokenKind::IntLit(value),
+                Span::new(start, self.pos),
+            )),
+            Err(_) => Err(OlangError::new(
+                format!(
+                    "integer literal '{text}' out of range"
+                ),
+                Span::new(start, self.pos),
+            )),
+        }
+    }
+
+    /// Lexes the remaining input into tokens, ending with an `Eof`
+    /// token.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error at the first character that cannot start a
+    /// token, or at an integer literal that does not fit in an `i64`.
+    pub fn tokenize(
+        &mut self,
+    ) -> Result<Vec<Token>, OlangError> {
+        let mut tokens = Vec::new();
+
+        loop {
+            self.skip_whitespace();
+            let start = self.pos;
+
+            let Some(ch) = self.advance() else {
+                tokens.push(Token::new(
+                    TokenKind::Eof,
+                    Span::new(start, start),
+                ));
+                break;
+            };
+
+            let kind = match ch {
+                '+' => TokenKind::Plus,
+                '-' => TokenKind::Minus,
+                '*' => TokenKind::Star,
+                '/' => TokenKind::Slash,
+                '(' => TokenKind::LParen,
+                ')' => TokenKind::RParen,
+                ';' => TokenKind::Semicolon,
+                c if c.is_ascii_digit() => {
+                    let number =
+                        self.read_number(start)?;
+                    tokens.push(number);
+                    continue;
+                }
+                c => {
+                    return Err(OlangError::new(
+                        format!(
+                            "unexpected char '{c}'"
+                        ),
+                        Span::new(start, self.pos),
+                    ));
+                }
+            };
+
+            // Every operator and delimiter is a single character.
+            tokens.push(Token::new(
+                kind,
+                Span::new(start, self.pos),
+            ));
+        }
+
+        Ok(tokens)
+    }
+}
blob - /dev/null
blob + 79d4169f9a10218c78f90f1669eb903f8e53a8cd (mode 644)
--- /dev/null
+++ src/lexer/token.rs
@@ -0,0 +1,55 @@
+// vim: set tw=79 cc=80 ts=4 sw=4 sts=4 et :
+//
+// Copyright (c) 2025-2026 Murilo Ijanc'
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+//
+
+use crate::span::Span;
+
+/// The kinds of token the lexer produces.
+#[derive(Debug, Clone, PartialEq)]
+pub enum TokenKind {
+    /// Decimal integer literal with its parsed value.
+    IntLit(i64),
+    /// `+`
+    Plus,
+    /// `-`
+    Minus,
+    /// `*`
+    Star,
+    /// `/`
+    Slash,
+    /// `(`
+    LParen,
+    /// `)`
+    RParen,
+    /// `;`
+    Semicolon,
+    /// End of input.
+    Eof,
+}
+
+/// A token kind together with the source span it was read from.
+#[derive(Debug, Clone, PartialEq)]
+pub struct Token {
+    pub kind: TokenKind,
+    pub span: Span,
+}
+
+impl Token {
+    /// Builds a token from its kind and span.
+    pub fn new(kind: TokenKind, span: Span) -> Self {
+        Self { kind, span }
+    }
+}