feat: add basic lexing(scanning)

just following the Scanning section of the book. Partially implemented
This commit is contained in:
Seymur Bagirov 2024-11-16 23:04:16 +04:00
parent ef7d7bcbf8
commit 5988cd84ef
5 changed files with 357 additions and 1 deletions

51
src/lib.rs Normal file
View File

@ -0,0 +1,51 @@
use std::{
error::Error,
fs,
io::{self, Write},
};
use token::Token;
mod scanner;
mod token;
mod utils;
pub fn run_file(path: &str) -> Result<(), Box<dyn Error>> {
let file = fs::read_to_string(path)?;
Ok(())
}
pub fn run(src: &str) {
let tokens: Vec<Token> = Vec::new();
}
pub fn run_prompt() -> Result<(), Box<dyn Error>> {
let stdin = io::stdin();
let input = &mut String::new();
print!("> ");
io::stdout().flush()?;
loop {
input.clear();
let _ = stdin.read_line(input)?;
print!("> ");
io::stdout().flush()?;
}
}
#[derive(Debug)]
pub struct RloxError {
msg: String,
line: usize,
}
impl RloxError {
pub fn error(line: i32, message: &str) {
report(line, "", message);
}
}
pub fn report(line: i32, location: &str, message: &str) {
eprintln!("[line {line}] Error {location}: {message}");
}

View File

@ -1 +1,28 @@
fn main() {} use std::{env::args_os, ffi::OsString, process::ExitCode};
use izanami::{run_file, run_prompt};
fn main() -> ExitCode {
let args: Vec<OsString> = args_os().collect();
if args.len() > 2 {
println!("usage: izanami [script]");
return ExitCode::from(64);
} else if args.len() == 2 {
let result = run_file(args[1].to_str().unwrap());
if let Err(res) = result {
println!("Couldn't read the file. Reason: {}", &*res);
return ExitCode::from(1);
}
} else {
let result = run_prompt();
if let Err(res) = result {
println!("Error while processing the repl. Reason: {}", &*res);
return ExitCode::from(1);
}
}
ExitCode::SUCCESS
}

180
src/scanner.rs Normal file
View File

@ -0,0 +1,180 @@
use std::{iter::Peekable, mem, str::Chars};
use crate::{
token::{Token, TokenType},
utils::StringUtils,
RloxError,
};
pub struct Scanner {
source: String,
tokens: Vec<Token>,
iter: Peekable<Chars<'static>>,
start: usize,
current: usize,
line: usize,
}
impl Scanner {
fn new(source: String) -> Self {
// the reason for using unsafe here is to have the ability to use utf-8 symbols
// rust doesn't allow having both the iterator and iterable inside one
// structure(understandably so bcs of reference invalidation)
let chars = unsafe {
mem::transmute::<std::str::Chars<'_>, std::str::Chars<'static>>(source.chars())
};
Self {
source,
iter: chars.peekable(),
tokens: Vec::new(),
start: 0,
current: 0,
line: 1,
}
}
// this is so awful for me to write. This function needs to be not mutable in theory and it
// could be accomplished. TODO!
fn scan_tokens(&mut self) -> Result<&Vec<Token>, Vec<RloxError>> {
let mut errors = Vec::new();
while let Some(character) = self.advance() {
self.start = self.current;
let result = self.scan_token(character);
if let Err(e) = result {
errors.push(RloxError {
msg: e.to_string(),
line: self.line,
});
}
}
self.tokens.push(Token {
t_type: TokenType::EOF,
lexeme: "".to_string(),
literal: None,
line: self.line,
});
if !errors.is_empty() {
return Err(errors);
}
Ok(&self.tokens)
}
//fn is_at_end(&self) -> bool {
// self.current >= self.source.len()
//}
fn scan_token(&mut self, token: char) -> Result<(), &'static str> {
let mut error = Ok(());
match token {
'(' => self.add_token(TokenType::LeftParen),
')' => self.add_token(TokenType::RightParen),
'{' => self.add_token(TokenType::LeftBrace),
'}' => self.add_token(TokenType::RightBrace),
',' => self.add_token(TokenType::Comma),
'.' => self.add_token(TokenType::Dot),
'-' => self.add_token(TokenType::Minus),
'+' => self.add_token(TokenType::Plus),
';' => self.add_token(TokenType::Semicolon),
'*' => self.add_token(TokenType::Star),
'!' if self.peek_and_match('=') => self.add_token(TokenType::BangEqual),
'!' => self.add_token(TokenType::Bang),
'=' if self.peek_and_match('=') => self.add_token(TokenType::EqualEqual),
'=' => self.add_token(TokenType::Equal),
'<' if self.peek_and_match('=') => self.add_token(TokenType::LessEqual),
'<' => self.add_token(TokenType::Less),
'>' if self.peek_and_match('>') => self.add_token(TokenType::GreaterEqual),
'>' => self.add_token(TokenType::Greater),
// checking for comments and just advance the iterator
'/' if self.peek_and_match('/') => {
while self.peek().is_some_and(|x| x != '\n') {
self.advance();
}
}
'/' => self.add_token(TokenType::Slash),
' ' | '\r' | '\t' => (),
'\n' => self.line += 1,
_ => error = Err("Unexpected character"),
};
error
}
fn advance(&mut self) -> Option<char> {
self.current += 1;
self.iter.next()
}
fn add_token(&mut self, t_type: TokenType) {
self.add_token_literal(t_type, None)
}
fn add_token_literal(&mut self, t_type: TokenType, literal: Option<Box<dyn std::any::Any>>) {
let text = self.source.substring(self.start, self.current);
self.tokens.push(Token {
t_type,
lexeme: text.to_string(),
literal,
line: self.line,
});
}
fn peek(&mut self) -> Option<char> {
self.iter.peek().copied()
}
fn peek_and_match(&mut self, expected: char) -> bool {
let peek = self.peek();
if peek.is_some_and(|x| x == expected) {
self.advance();
return true;
}
false
}
}
#[cfg(test)]
mod tests {
use super::*;
use TokenType::*;
fn do_cols_match<T: PartialEq>(a: &[T], b: &[T]) -> bool {
let matching = a.iter().zip(b.iter()).filter(|&(a, b)| a == b).count();
matching == a.len() && matching == b.len()
}
#[test]
fn should_be_equal() {
let value = r#"
// this is a comment
(( )){} // grouping stuff
!*+-/=<> <= == // operators
"#;
let mut scanner = Scanner::new(value.to_string());
let expected_tokens = vec![
LeftParen, LeftParen, RightParen, RightParen, LeftBrace, RightBrace, Bang, Star, Plus,
Minus, Slash, Equal, Less, Greater, LessEqual, EqualEqual, EOF,
];
let actual_tokens: Vec<TokenType> = scanner
.scan_tokens()
.unwrap()
.iter()
.map(|x| x.t_type)
.collect();
println!("actual: {:?}", actual_tokens);
println!("expected: {:?}", expected_tokens);
assert!(do_cols_match(&actual_tokens, &expected_tokens));
}
}

62
src/token.rs Normal file
View File

@ -0,0 +1,62 @@
use std::fmt::Display;
#[derive(Debug, PartialEq, Clone, Copy)]
pub enum TokenType {
LeftParen,
RightParen,
LeftBrace,
RightBrace,
Comma,
Dot,
Minus,
Plus,
Semicolon,
Slash,
Star,
Bang,
BangEqual,
Equal,
EqualEqual,
Greater,
GreaterEqual,
Less,
LessEqual,
Identifier,
String,
Number,
And,
Class,
Else,
False,
Fun,
For,
If,
Nil,
OR,
Print,
Return,
Super,
This,
True,
Var,
While,
EOF,
}
#[derive(Debug)]
pub struct Token {
pub t_type: TokenType,
pub lexeme: String,
pub literal: Option<Box<dyn std::any::Any>>,
pub line: usize,
}
impl Display for Token {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:?} {} {:?}", self.t_type, self.lexeme, self.literal)
}
}

36
src/utils.rs Normal file
View File

@ -0,0 +1,36 @@
pub trait StringUtils {
fn substring(&self, start: usize, end: usize) -> &str;
}
impl StringUtils for String {
fn substring(&self, start: usize, len: usize) -> &str {
let mut char_pos = 0;
let mut byte_start = 0;
let mut it = self.chars();
loop {
if char_pos == start {
break;
}
if let Some(c) = it.next() {
char_pos += 1;
byte_start += c.len_utf8();
} else {
break;
}
}
char_pos = 0;
let mut byte_end = byte_start;
loop {
if char_pos == len {
break;
}
if let Some(c) = it.next() {
char_pos += 1;
byte_end += c.len_utf8();
} else {
break;
}
}
&self[byte_start..byte_end]
}
}