From 5988cd84ef8f5d55be56a70b7573d5f625edc69f Mon Sep 17 00:00:00 2001
From: Seymur Bagirov <seymur.baghirov@proton.me>
Date: Sat, 16 Nov 2024 23:04:16 +0400
Subject: [PATCH] feat: add basic lexing (scanning)

Follows the Scanning section of the book. The scanner is only partially implemented so far.
---
 src/lib.rs     |  51 ++++++++++++++
 src/main.rs    |  29 +++++++-
 src/scanner.rs | 180 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/token.rs   |  62 +++++++++++++++++
 src/utils.rs   |  36 ++++++++++
 5 files changed, 357 insertions(+), 1 deletion(-)
 create mode 100644 src/lib.rs
 create mode 100644 src/scanner.rs
 create mode 100644 src/token.rs
 create mode 100644 src/utils.rs

diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..ee01e65
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,84 @@
+use std::{
+    error::Error,
+    fmt::Display,
+    fs,
+    io::{self, Write},
+};
+
+use token::Token;
+
+mod scanner;
+mod token;
+mod utils;
+
+/// Reads the script at `path` and hands its contents to [`run`].
+///
+/// # Errors
+///
+/// Returns the error produced by reading the file.
+pub fn run_file(path: &str) -> Result<(), Box<dyn Error>> {
+    let source = fs::read_to_string(path)?;
+    run(&source);
+
+    Ok(())
+}
+
+/// Entry point for interpreting one chunk of source code.
+///
+/// Still a placeholder: it only creates an empty token list and does not look at `src` yet.
+pub fn run(src: &str) {
+    let _tokens: Vec<Token> = Vec::new();
+}
+
+/// Runs the interactive prompt, passing every line read from standard input to [`run`].
+///
+/// Returns once standard input reaches end of file.
+///
+/// # Errors
+///
+/// Returns the I/O error if reading standard input or flushing standard output fails.
+pub fn run_prompt() -> Result<(), Box<dyn Error>> {
+    let stdin = io::stdin();
+    let mut input = String::new();
+    loop {
+        print!("> ");
+        io::stdout().flush()?;
+
+        input.clear();
+        // `read_line` reports zero bytes only at end of input (e.g. Ctrl-D); without this check
+        // the loop would spin forever printing prompts.
+        if stdin.read_line(&mut input)? == 0 {
+            return Ok(());
+        }
+
+        run(&input);
+    }
+}
+
+/// An error found in the source code, together with the line it was found on.
+#[derive(Debug)]
+pub struct RloxError {
+    /// Description of the problem.
+    msg: String,
+    /// Line on which the problem was detected.
+    line: usize,
+}
+
+impl RloxError {
+    /// Reports `message` for `line` on standard error, without a location.
+    pub fn error(line: impl Display, message: &str) {
+        report(line, "", message);
+    }
+}
+
+/// Prints `[line N] Error LOCATION: MESSAGE` to standard error.
+///
+/// `line` accepts any displayable value so that both `i32` and `usize` line numbers (as stored
+/// in [`RloxError`]) can be reported. An empty `location` is left out entirely.
+pub fn report(line: impl Display, location: &str, message: &str) {
+    if location.is_empty() {
+        eprintln!("[line {line}] Error: {message}");
+    } else {
+        eprintln!("[line {line}] Error {location}: {message}");
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index f328e4d..5422fa7 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1 +1,39 @@
-fn main() {}
+use std::{env::args_os, ffi::OsString, process::ExitCode};
+
+use izanami::{run_file, run_prompt};
+
+/// Runs the script named by the single optional argument, or the interactive prompt without one.
+///
+/// Exits with status 64 when too many arguments are given and with status 1 when the script or
+/// the prompt fails.
+fn main() -> ExitCode {
+    let args: Vec<OsString> = args_os().collect();
+
+    match args.as_slice() {
+        [_, script] => {
+            // `run_file` takes a `&str`, so a path that is not valid UTF-8 is reported as an
+            // error here instead of panicking on `unwrap`.
+            let Some(path) = script.to_str() else {
+                println!("Couldn't read the file. Reason: the path is not valid UTF-8");
+                return ExitCode::from(1);
+            };
+
+            if let Err(res) = run_file(path) {
+                println!("Couldn't read the file. Reason: {}", &*res);
+                return ExitCode::from(1);
+            }
+        }
+        [] | [_] => {
+            if let Err(res) = run_prompt() {
+                println!("Error while processing the repl. Reason: {}", &*res);
+                return ExitCode::from(1);
+            }
+        }
+        _ => {
+            println!("usage: izanami [script]");
+            return ExitCode::from(64);
+        }
+    }
+
+    ExitCode::SUCCESS
+}
diff --git a/src/scanner.rs b/src/scanner.rs
new file mode 100644
index 0000000..c50b85a
--- /dev/null
+++ b/src/scanner.rs
@@ -0,0 +1,226 @@
+use std::{iter::Peekable, mem, str::Chars};
+
+use crate::{
+    token::{Token, TokenType},
+    utils::StringUtils,
+    RloxError,
+};
+
+/// Turns source text into a list of [`Token`]s.
+pub struct Scanner {
+    /// The complete source text being scanned.
+    source: String,
+    /// Tokens produced so far.
+    tokens: Vec<Token>,
+    /// Iterator over the characters of `source`; `new` explains the `'static` lifetime.
+    iter: Peekable<Chars<'static>>,
+    /// Character offset at which the lexeme being scanned starts.
+    start: usize,
+    /// Number of characters consumed so far.
+    current: usize,
+    /// Current line number, starting at 1.
+    line: usize,
+}
+
+impl Scanner {
+    /// Creates a scanner positioned at the first character of `source`.
+    fn new(source: String) -> Self {
+        // `Chars` borrows the string it iterates over, and a struct cannot hold both the owner
+        // and a borrow of it, so the lifetime of the borrow is erased. Iterating over `char`s
+        // rather than bytes is what allows UTF-8 input.
+        //
+        // SAFETY: the iterator points into the heap buffer of `source`, which stays in place
+        // when the `String` is moved into the struct. `source` is private and never mutated or
+        // replaced by this impl, so the buffer is valid whenever `iter` is used.
+        let chars = unsafe {
+            mem::transmute::<std::str::Chars<'_>, std::str::Chars<'static>>(source.chars())
+        };
+        Self {
+            source,
+            iter: chars.peekable(),
+            tokens: Vec::new(),
+            start: 0,
+            current: 0,
+            line: 1,
+        }
+    }
+
+    /// Scans the whole source and returns every token found, followed by an `EOF` token.
+    ///
+    /// Scanning does not stop at the first problem, so all of them are reported together.
+    ///
+    /// # Errors
+    ///
+    /// Returns one [`RloxError`] per unexpected character, carrying the line it is on.
+    // TODO: in theory this should not need `&mut self`.
+    fn scan_tokens(&mut self) -> Result<&Vec<Token>, Vec<RloxError>> {
+        let mut errors = Vec::new();
+        loop {
+            // A lexeme starts at the character that is about to be consumed.
+            self.start = self.current;
+            let Some(character) = self.advance() else {
+                break;
+            };
+            if let Err(e) = self.scan_token(character) {
+                errors.push(RloxError {
+                    msg: e.to_string(),
+                    line: self.line,
+                });
+            }
+        }
+
+        self.tokens.push(Token {
+            t_type: TokenType::EOF,
+            lexeme: "".to_string(),
+            literal: None,
+            line: self.line,
+        });
+
+        if errors.is_empty() {
+            Ok(&self.tokens)
+        } else {
+            Err(errors)
+        }
+    }
+
+    /// Emits the token, if any, that begins with the already consumed character `token`.
+    ///
+    /// # Errors
+    ///
+    /// Returns a description of the problem when `token` cannot start any lexeme.
+    fn scan_token(&mut self, token: char) -> Result<(), &'static str> {
+        match token {
+            '(' => self.add_token(TokenType::LeftParen),
+            ')' => self.add_token(TokenType::RightParen),
+            '{' => self.add_token(TokenType::LeftBrace),
+            '}' => self.add_token(TokenType::RightBrace),
+            ',' => self.add_token(TokenType::Comma),
+            '.' => self.add_token(TokenType::Dot),
+            '-' => self.add_token(TokenType::Minus),
+            '+' => self.add_token(TokenType::Plus),
+            ';' => self.add_token(TokenType::Semicolon),
+            '*' => self.add_token(TokenType::Star),
+            '!' if self.peek_and_match('=') => self.add_token(TokenType::BangEqual),
+            '!' => self.add_token(TokenType::Bang),
+            '=' if self.peek_and_match('=') => self.add_token(TokenType::EqualEqual),
+            '=' => self.add_token(TokenType::Equal),
+            '<' if self.peek_and_match('=') => self.add_token(TokenType::LessEqual),
+            '<' => self.add_token(TokenType::Less),
+            '>' if self.peek_and_match('=') => self.add_token(TokenType::GreaterEqual),
+            '>' => self.add_token(TokenType::Greater),
+            // A line comment runs up to, but does not include, the newline.
+            '/' if self.peek_and_match('/') => {
+                while self.peek().is_some_and(|x| x != '\n') {
+                    self.advance();
+                }
+            }
+            '/' => self.add_token(TokenType::Slash),
+
+            ' ' | '\r' | '\t' => (),
+            '\n' => self.line += 1,
+
+            _ => return Err("Unexpected character"),
+        }
+
+        Ok(())
+    }
+
+    /// Consumes and returns the next character, or `None` at the end of the input.
+    fn advance(&mut self) -> Option<char> {
+        let character = self.iter.next()?;
+        // `current` counts consumed characters, so it stays put at the end of the input.
+        self.current += 1;
+        Some(character)
+    }
+
+    /// Records a token of type `t_type` that carries no literal value.
+    fn add_token(&mut self, t_type: TokenType) {
+        self.add_token_literal(t_type, None)
+    }
+
+    /// Records a token of type `t_type` whose lexeme is the text consumed since `start`.
+    fn add_token_literal(&mut self, t_type: TokenType, literal: Option<Box<dyn std::any::Any>>) {
+        // `substring` takes a start offset and a length, both counted in characters.
+        let text = self.source.substring(self.start, self.current - self.start);
+        self.tokens.push(Token {
+            t_type,
+            lexeme: text.to_string(),
+            literal,
+            line: self.line,
+        });
+    }
+
+    /// Returns the next character without consuming it.
+    fn peek(&mut self) -> Option<char> {
+        self.iter.peek().copied()
+    }
+
+    /// Consumes the next character only if it is `expected`, and reports whether it did.
+    fn peek_and_match(&mut self, expected: char) -> bool {
+        let matched = self.peek() == Some(expected);
+        if matched {
+            self.advance();
+        }
+        matched
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use TokenType::*;
+
+    /// Scans `source` and returns the type of every token produced.
+    fn scan_types(source: &str) -> Vec<TokenType> {
+        let mut scanner = Scanner::new(source.to_string());
+        let tokens = scanner.scan_tokens().unwrap();
+        tokens.iter().map(|x| x.t_type).collect()
+    }
+
+    #[test]
+    fn should_be_equal() {
+        let value = r#"
+            // this is a comment
+            (( )){} // grouping stuff
+            !*+-/=<> <= == // operators
+        "#;
+
+        let expected_tokens = vec![
+            LeftParen, LeftParen, RightParen, RightParen, LeftBrace, RightBrace, Bang, Star, Plus,
+            Minus, Slash, Equal, Less, Greater, LessEqual, EqualEqual, EOF,
+        ];
+
+        assert_eq!(scan_types(value), expected_tokens);
+    }
+
+    #[test]
+    fn two_character_operators() {
+        assert_eq!(
+            scan_types("!= == <= >= >>"),
+            vec![BangEqual, EqualEqual, LessEqual, GreaterEqual, Greater, Greater, EOF]
+        );
+    }
+
+    #[test]
+    fn lexemes_match_the_source_text() {
+        let mut scanner = Scanner::new("// héllo\n( !=\n>=".to_string());
+        let lexemes: Vec<(&str, usize)> = scanner
+            .scan_tokens()
+            .unwrap()
+            .iter()
+            .map(|token| (token.lexeme.as_str(), token.line))
+            .collect();
+
+        assert_eq!(lexemes, vec![("(", 2), ("!=", 2), (">=", 3), ("", 3)]);
+    }
+
+    #[test]
+    fn reports_unexpected_characters_with_their_line() {
+        let mut scanner = Scanner::new("(\n@".to_string());
+        let errors = scanner.scan_tokens().unwrap_err();
+
+        assert_eq!(errors.len(), 1);
+        assert_eq!(errors[0].msg, "Unexpected character");
+        assert_eq!(errors[0].line, 2);
+    }
+}
diff --git a/src/token.rs b/src/token.rs
new file mode 100644
index 0000000..246f7fa
--- /dev/null
+++ b/src/token.rs
@@ -0,0 +1,78 @@
+use std::fmt::Display;
+
+/// The kind of a [`Token`].
+#[derive(Debug, PartialEq, Clone, Copy)]
+pub enum TokenType {
+    // Single-character tokens.
+    LeftParen,
+    RightParen,
+    LeftBrace,
+    RightBrace,
+    Comma,
+    Dot,
+    Minus,
+    Plus,
+    Semicolon,
+    Slash,
+    Star,
+
+    // One- or two-character tokens.
+    Bang,
+    BangEqual,
+    Equal,
+    EqualEqual,
+    Greater,
+    GreaterEqual,
+    Less,
+    LessEqual,
+
+    // Literals.
+    Identifier,
+    String,
+    Number,
+
+    // Keywords.
+    And,
+    Class,
+    Else,
+    False,
+    Fun,
+    For,
+    If,
+    Nil,
+    OR,
+    Print,
+    Return,
+    Super,
+    This,
+    True,
+    Var,
+    While,
+
+    // End of input.
+    EOF,
+}
+
+/// A single token: its kind, its text, an optional literal value and a line number.
+#[derive(Debug)]
+pub struct Token {
+    /// The kind of token.
+    pub t_type: TokenType,
+    /// The text of the token.
+    pub lexeme: String,
+    /// Value carried by the token, if any.
+    pub literal: Option<Box<dyn std::any::Any>>,
+    /// Line number recorded for the token.
+    pub line: usize,
+}
+
+impl Display for Token {
+    /// Writes the token as `<type> <lexeme> <literal>`, using debug formatting for the type and
+    /// the literal.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let kind = self.t_type;
+        let text = &self.lexeme;
+        let literal = &self.literal;
+        write!(f, "{kind:?} {text} {literal:?}")
+    }
+}
diff --git a/src/utils.rs b/src/utils.rs
new file mode 100644
index 0000000..2bda673
--- /dev/null
+++ b/src/utils.rs
@@ -0,0 +1,23 @@
+/// String helpers that count in characters instead of bytes.
+pub trait StringUtils {
+    /// Returns at most `len` characters of `self`, beginning at character offset `start`.
+    ///
+    /// Both arguments count `char`s, not bytes. The result is cut short at the end of the string
+    /// instead of panicking, so it is empty when `start` is past the last character.
+    fn substring(&self, start: usize, len: usize) -> &str;
+}
+
+impl StringUtils for String {
+    fn substring(&self, start: usize, len: usize) -> &str {
+        /// Byte offset of the character at index `chars` in `text`, or `text.len()` when `text`
+        /// has fewer characters than that.
+        fn byte_offset(text: &str, chars: usize) -> usize {
+            text.char_indices()
+                .nth(chars)
+                .map_or(text.len(), |(index, _)| index)
+        }
+
+        let tail = &self[byte_offset(self, start)..];
+        &tail[..byte_offset(tail, len)]
+    }
+}