You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

387 lines
13 KiB
Rust

use std::borrow::Cow;
use std::fmt::Debug;
use std::iter::Peekable;
use std::str::CharIndices;
use crate::span::{LineOffset, LineSpan, Span};
/// A single lexical token produced by the lexer.
///
/// Atoms and escape-free strings borrow directly from the input text.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Token<'a> {
    /// Opening parenthesis, `(`.
    LParen,
    /// Closing parenthesis, `)`.
    RParen,
    /// A bare word, borrowed from the input.
    Atom(&'a str),
    /// The contents of a double-quoted string, without the surrounding quotes.
    /// Borrowed when the literal had no escapes, owned when escapes were decoded.
    String(Cow<'a, str>),
    /// Block comment opener, `#|`.
    LComment,
    /// Block comment closer, `|#`.
    RComment,
}
/// Failures the lexer can report while reading a string literal.
///
/// `O` is the offset representation chosen by the lexer's tracker.
#[derive(Debug, PartialEq, Eq)]
pub enum Error<'a, O> {
    /// A backslash inside a string literal was followed by a character that is
    /// not a recognised escape.
    UnsupportedEscape {
        /// The complete text handed to the lexer.
        input: &'a str,
        /// Position of the backslash that introduced the escape.
        offset: O,
        /// The character that followed the backslash.
        character: char,
        /// Span of the literal up to the offending escape; its value is the
        /// string content that was read before the backslash.
        span: Span<Cow<'a, str>, O>,
    },
    /// The input ended before the closing `"` of a string literal.
    MissingEndQuote {
        /// The complete text handed to the lexer.
        input: &'a str,
        /// Position where the closing quote was expected (the end of `span`).
        offset: O,
        /// Span of the unterminated literal; its value is the content read so far.
        span: Span<Cow<'a, str>, O>,
    },
}
/// Streaming tokenizer over a borrowed input string.
///
/// The tracker type `T` decides how token positions are reported; the default
/// [`NoTracker`] reports none.
#[derive(Clone, Debug)]
pub struct Lexer<'a, T: Tracker = NoTracker> {
    /// The complete text being lexed; tokens borrow from it.
    pub(crate) input: &'a str,
    /// Cursor over `input`, yielding `(byte offset, char)` pairs with one item of lookahead.
    reader: Peekable<CharIndices<'a>>,
    /// Turns byte offsets into the spans attached to tokens and errors.
    tracker: T,
}
impl<'a> Lexer<'a> {
    /// Creates a lexer over `input` that records no position information.
    #[inline]
    pub fn new(input: &'a str) -> Self {
        Self::with_tracker::<NoTracker>(input)
    }

    /// Creates a lexer over `input` whose positions are produced by a
    /// default-constructed tracker of type `T`.
    pub fn with_tracker<T: Default + Tracker>(input: &'a str) -> Lexer<'a, T> {
        let reader = input.char_indices().peekable();
        let tracker = T::default();
        Lexer {
            input,
            reader,
            tracker,
        }
    }
}
impl<'a, T: Tracker> Lexer<'a, T> {
pub fn next(&mut self) -> Option<Result<Span<Token<'a>, T::Offset>, Error<'a, T::Offset>>> {
while let Some((start_offset, c)) = self.reader.next() {
self.tracker.start(start_offset);
if c.is_whitespace() {
if c == '\n' {
self.tracker.process_newline(start_offset);
}
let mut last_offset = start_offset;
let mut last_char = c;
while let Some((offset, c)) = self.reader.next_if(|(_, c)| c.is_whitespace()) {
if c == '\n' {
self.tracker.process_newline(offset);
}
last_offset = offset;
last_char = c;
}
self.tracker.end(last_offset + last_char.len_utf8(), ());
continue;
}
match c {
'(' => return Some(Ok(self.tracker.end(start_offset + 1, Token::LParen))),
')' => return Some(Ok(self.tracker.end(start_offset + 1, Token::RParen))),
'"' => {
let mut backup = String::with_capacity(256);
// " is always 1 byte
let mut last_offset = start_offset + 1;
let mut used_backup = false;
let mut filled_backup = false;
let mut escape = false;
while let Some((offset, c)) = self.reader.next() {
if escape {
let prev_offset = last_offset;
// only supports escapes 1 byte wide for now
last_offset = offset + 1;
#[inline(always)]
fn fill_backup(filled_backup: &mut bool, backup: &mut String, prev_offset: usize, offset: usize, input: &str) {
if !*filled_backup {
backup.push_str(&input[prev_offset..offset - 1]);
*filled_backup = true;
}
}
match c {
'n' => {
fill_backup(&mut filled_backup, &mut backup, prev_offset, offset, self.input);
backup.push('\n');
}
'\\' => {
fill_backup(&mut filled_backup, &mut backup, prev_offset, offset, self.input);
backup.push('\\');
}
'"' => {
fill_backup(&mut filled_backup, &mut backup, prev_offset, offset, self.input);
backup.push('"');
}
_ => {
let span = self.tracker.end(offset + 1, if filled_backup {
Cow::Owned(backup)
} else {
Cow::Borrowed(&self.input[start_offset + 1..offset - 1])
});
return Some(Err(Error::UnsupportedEscape {
input: self.input,
offset: self.tracker.get_offset(offset - 1),
character: c,
span,
}));
}
}
escape = false;
continue;
}
match c {
'\\' => {
escape = true;
used_backup = true;
if filled_backup {
backup.push_str(&self.input[last_offset..offset]);
}
}
'"' => {
return Some(Ok(self.tracker.end(offset + 1, if used_backup {
backup.push_str(&self.input[last_offset..offset]);
Token::String(Cow::Owned(backup))
} else {
Token::String(Cow::Borrowed(&self.input[last_offset..offset]))
})));
}
_ => continue,
}
}
let span = self.tracker.end(self.input.len(), if used_backup {
backup.push_str(&self.input[last_offset..]);
Cow::Owned(backup)
} else {
Cow::Borrowed(&self.input[last_offset..])
});
return Some(Err(Error::MissingEndQuote {
input: self.input,
offset: span.end,
span,
}));
}
'#' => {
if let Some((_, '|')) = self.reader.peek() {
// consume the pipe
self.reader.next();
return Some(Ok(self.tracker.end(start_offset + 2, Token::LComment)));
}
// fall through to atom
}
'|' => {
if let Some((_, '#')) = self.reader.peek() {
// consume the pipe
self.reader.next();
return Some(Ok(self.tracker.end(start_offset + 2, Token::RComment)));
}
// fall through to atom
}
_ => {}
}
while let Some((end_offset, c)) = self.reader.peek().copied() {
if c == ')' || c == '(' || c.is_whitespace() || c == '"' || (c == '|' && self.input.as_bytes()[end_offset + 1] == b'#') || (c == '#' && self.input.as_bytes()[end_offset + 1] == b'|') {
return Some(Ok(self.tracker.end(end_offset, Token::Atom(&self.input[start_offset..end_offset]))));
}
self.reader.next();
}
return Some(Ok(self.tracker.end(self.input.len(), Token::Atom(&self.input[start_offset..]))));
}
None
}
}
/// Strategy for turning the byte offsets seen by the lexer into the positions
/// attached to tokens and errors.
pub trait Tracker {
    /// Position representation produced by this tracker.
    type Offset: Copy + Debug;

    /// Called with the byte offset of each `\n` the lexer reports. Does nothing by default.
    fn process_newline(&mut self, _offset: usize) {}

    /// Called with the byte offset of the first character of each token or
    /// whitespace run. Does nothing by default.
    fn start(&mut self, _offset: usize) {}

    /// Wraps `value` in a span ending at the byte offset `offset`.
    fn end<T>(&mut self, offset: usize, value: T) -> Span<T, Self::Offset>;

    /// Converts the byte offset `offset` into this tracker's position type.
    fn get_offset(&self, offset: usize) -> Self::Offset;
}
/// Tracker that records nothing: every position is the unit value `()`.
#[derive(Clone, Copy, Debug, Default)]
pub struct NoTracker;

impl Tracker for NoTracker {
    type Offset = ();

    /// Wraps `value` in a span whose start and end are both `()`.
    fn end<T>(&mut self, _offset: usize, value: T) -> Span<T, Self::Offset> {
        Span::new((), (), value)
    }

    /// Ignores `_offset`; the position is always `()`.
    fn get_offset(&self, _offset: usize) -> Self::Offset {}
}
/// Tracker that reports positions as absolute byte offsets into the input.
///
/// Derives the same traits as [`NoTracker`] so that a `Lexer` using this tracker
/// can be cloned and debug-printed. The derived `Default` starts at offset `0`,
/// exactly like the hand-written implementation it replaces.
#[derive(Default, Debug, Copy, Clone)]
pub struct OffsetTracker {
    /// Byte offset passed to the most recent `start` call.
    start: usize,
}

impl Tracker for OffsetTracker {
    type Offset = usize;

    /// Remembers `offset` as the start of the span currently being built.
    fn start(&mut self, offset: usize) {
        self.start = offset;
    }

    /// Builds a span from the remembered start offset to `offset`, carrying `value`.
    fn end<T>(&mut self, offset: usize, value: T) -> Span<T> {
        Span {
            start: self.start,
            end: offset,
            value,
        }
    }

    /// Byte offsets are already the position type, so `offset` is returned unchanged.
    fn get_offset(&self, offset: usize) -> Self::Offset {
        offset
    }
}
/// Tracker that reports positions as a line number, an offset within that line,
/// and the absolute byte offset.
///
/// Derives `Debug`, `Copy` and `Clone` so that a `Lexer` using this tracker can be
/// cloned and debug-printed; `LineOffset` is `Copy + Debug` because it serves as
/// this tracker's `Offset`.
#[derive(Debug, Copy, Clone)]
pub struct LineTracker {
    /// Position computed by the most recent `start` call.
    start: LineOffset,
    /// Byte offset just past the most recently reported newline (`0` before any).
    last_line_offset: usize,
    /// Number of newlines reported so far, i.e. the zero-based current line.
    line_n: usize,
}

impl Default for LineTracker {
    /// Starts at line 0, line offset 0, absolute offset 0.
    fn default() -> Self {
        LineTracker {
            start: LineOffset {
                line: 0,
                line_offset: 0,
                absolute_offset: 0,
            },
            last_line_offset: 0,
            line_n: 0,
        }
    }
}

impl Tracker for LineTracker {
    type Offset = LineOffset;

    /// Advances to the next line; `offset` is the byte offset of the newline, so
    /// the new line begins one byte later.
    fn process_newline(&mut self, offset: usize) {
        self.line_n += 1;
        self.last_line_offset = offset + 1;
    }

    /// Remembers the position of `offset` as the start of the span being built.
    fn start(&mut self, offset: usize) {
        self.start = self.get_offset(offset)
    }

    /// Builds a span from the remembered start to the position of `offset`, carrying `value`.
    fn end<T>(&mut self, offset: usize, value: T) -> LineSpan<T> {
        Span {
            start: self.start,
            end: self.get_offset(offset),
            value,
        }
    }

    /// Expresses `offset` relative to the current line.
    ///
    /// `offset` must not precede the start of the current line, otherwise the
    /// subtraction underflows.
    fn get_offset(&self, offset: usize) -> Self::Offset {
        LineOffset {
            line: self.line_n,
            line_offset: offset - self.last_line_offset,
            absolute_offset: offset,
        }
    }
}
#[cfg(test)]
mod tests {
    use std::borrow::Cow;
    use crate::lexer::{Error, Lexer, LineTracker, OffsetTracker, Token};
    use crate::span::{LineOffset, LineSpan, Span};

    /// Every token kind is recognised and trailing whitespace yields no token.
    #[test]
    fn test_simple() {
        let mut lex = Lexer::new("( ) #| |# \"hello\" hello \"hello\\n\"\n \n");
        let expected = [
            Token::LParen,
            Token::RParen,
            Token::LComment,
            Token::RComment,
            Token::String(Cow::Borrowed("hello")),
            Token::Atom("hello"),
            Token::String(Cow::Owned("hello\n".to_string())),
        ];
        for token in expected.iter() {
            assert_eq!(lex.next().unwrap().unwrap().value, *token);
        }
        assert!(lex.next().is_none())
    }

    /// Byte-offset spans cover each token exactly, including escaped strings.
    #[test]
    fn test_offset_tracker() {
        let mut lex = Lexer::with_tracker::<OffsetTracker>(r#"( ) #| |# "hello" hello "hello\n\\\"""#);
        let expected = [
            Span::new(0, 1, Token::LParen),
            Span::new(2, 3, Token::RParen),
            Span::new(4, 6, Token::LComment),
            Span::new(7, 9, Token::RComment),
            Span::new(10, 17, Token::String(Cow::Borrowed("hello"))),
            Span::new(18, 23, Token::Atom("hello")),
            Span::new(24, 37, Token::String(Cow::Owned("hello\n\\\"".to_string()))),
        ];
        for span in expected.iter() {
            assert_eq!(lex.next().unwrap().unwrap(), *span);
        }
        assert!(lex.next().is_none())
    }

    /// Line and column information advances across a newline.
    #[test]
    fn test_line_tracker() {
        let mut lex = Lexer::with_tracker::<LineTracker>("(\n )");
        let expected = [
            LineSpan::new(LineOffset::new(0, 0, 0), LineOffset::new(0, 1, 1), Token::LParen),
            LineSpan::new(LineOffset::new(1, 1, 3), LineOffset::new(1, 2, 4), Token::RParen),
        ];
        for span in expected.iter() {
            assert_eq!(lex.next().unwrap().unwrap(), *span);
        }
        assert!(lex.next().is_none())
    }

    /// Both string errors are reported and can be debug-formatted.
    #[test]
    fn test_error() {
        let unterminated = "\"hello";
        let item = Lexer::new(unterminated).next();
        // Only checks that `Debug` formatting works; the text itself is irrelevant.
        let _ = format!("{:?}", item);
        let expected = Error::MissingEndQuote {
            input: "\"hello",
            offset: (),
            span: Span::new((), (), Cow::Borrowed("hello")),
        };
        assert_eq!(item, Some(Err(expected)));

        let bad_escape = "\"hello\\x";
        let item = Lexer::new(bad_escape).next();
        let _ = format!("{:?}", item);
        let expected = Error::UnsupportedEscape {
            input: "\"hello\\x",
            offset: (),
            character: 'x',
            span: Span::new((), (), Cow::Borrowed("hello")),
        };
        assert_eq!(item, Some(Err(expected)));
    }
}