You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
387 lines
13 KiB
Rust
387 lines
13 KiB
Rust
use std::borrow::Cow;
|
|
use std::fmt::Debug;
|
|
use std::iter::Peekable;
|
|
use std::str::CharIndices;
|
|
use crate::span::{LineOffset, LineSpan, Span};
|
|
|
|
/// A single lexical token produced by the lexer.
///
/// Variants that carry text borrow from the lexer input for `'a` whenever possible.
#[derive(Debug, Eq, PartialEq, Clone)]
pub enum Token<'a> {
    /// The `(` character.
    LParen,
    /// The `)` character.
    RParen,
    /// A bare atom: a slice of the input that ends at whitespace, a parenthesis, a `"` or a
    /// `#|` / `|#` marker.
    Atom(&'a str),
    /// The contents of a `"string"` literal without the surrounding quotes.
    ///
    /// Borrowed from the input when the literal contains no escape sequences, owned when
    /// escapes had to be decoded.
    String(Cow<'a, str>),
    /// The `#|` marker.
    LComment,
    /// The `|#` marker.
    RComment,
}
|
|
|
|
#[derive(Debug, Eq, PartialEq)]
|
|
pub enum Error<'a, O> {
|
|
UnsupportedEscape {
|
|
input: &'a str,
|
|
offset: O,
|
|
character: char,
|
|
span: Span<Cow<'a, str>, O>,
|
|
},
|
|
MissingEndQuote {
|
|
input: &'a str,
|
|
offset: O,
|
|
span: Span<Cow<'a, str>, O>,
|
|
},
|
|
}
|
|
|
|
#[derive(Debug, Clone)]
|
|
pub struct Lexer<'a, T: Tracker = NoTracker> {
|
|
pub(crate) input: &'a str,
|
|
reader: Peekable<CharIndices<'a>>,
|
|
tracker: T,
|
|
}
|
|
|
|
impl<'a> Lexer<'a> {
|
|
#[inline]
|
|
pub fn new(input: &'a str) -> Self {
|
|
Lexer::with_tracker::<NoTracker>(input)
|
|
}
|
|
|
|
pub fn with_tracker<T: Default + Tracker>(input: &'a str) -> Lexer<'a, T> {
|
|
Lexer {
|
|
input,
|
|
reader: input.char_indices().peekable(),
|
|
tracker: T::default(),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl<'a, T: Tracker> Lexer<'a, T> {
|
|
pub fn next(&mut self) -> Option<Result<Span<Token<'a>, T::Offset>, Error<'a, T::Offset>>> {
|
|
while let Some((start_offset, c)) = self.reader.next() {
|
|
self.tracker.start(start_offset);
|
|
if c.is_whitespace() {
|
|
if c == '\n' {
|
|
self.tracker.process_newline(start_offset);
|
|
}
|
|
|
|
let mut last_offset = start_offset;
|
|
let mut last_char = c;
|
|
while let Some((offset, c)) = self.reader.next_if(|(_, c)| c.is_whitespace()) {
|
|
if c == '\n' {
|
|
self.tracker.process_newline(offset);
|
|
}
|
|
last_offset = offset;
|
|
last_char = c;
|
|
}
|
|
self.tracker.end(last_offset + last_char.len_utf8(), ());
|
|
continue;
|
|
}
|
|
|
|
|
|
match c {
|
|
'(' => return Some(Ok(self.tracker.end(start_offset + 1, Token::LParen))),
|
|
')' => return Some(Ok(self.tracker.end(start_offset + 1, Token::RParen))),
|
|
'"' => {
|
|
let mut backup = String::with_capacity(256);
|
|
// " is always 1 byte
|
|
let mut last_offset = start_offset + 1;
|
|
let mut used_backup = false;
|
|
let mut filled_backup = false;
|
|
let mut escape = false;
|
|
|
|
while let Some((offset, c)) = self.reader.next() {
|
|
if escape {
|
|
let prev_offset = last_offset;
|
|
// only supports escapes 1 byte wide for now
|
|
last_offset = offset + 1;
|
|
|
|
#[inline(always)]
|
|
fn fill_backup(filled_backup: &mut bool, backup: &mut String, prev_offset: usize, offset: usize, input: &str) {
|
|
if !*filled_backup {
|
|
backup.push_str(&input[prev_offset..offset - 1]);
|
|
*filled_backup = true;
|
|
}
|
|
}
|
|
|
|
match c {
|
|
'n' => {
|
|
fill_backup(&mut filled_backup, &mut backup, prev_offset, offset, self.input);
|
|
backup.push('\n');
|
|
}
|
|
'\\' => {
|
|
fill_backup(&mut filled_backup, &mut backup, prev_offset, offset, self.input);
|
|
backup.push('\\');
|
|
}
|
|
'"' => {
|
|
fill_backup(&mut filled_backup, &mut backup, prev_offset, offset, self.input);
|
|
backup.push('"');
|
|
}
|
|
_ => {
|
|
let span = self.tracker.end(offset + 1, if filled_backup {
|
|
Cow::Owned(backup)
|
|
} else {
|
|
Cow::Borrowed(&self.input[start_offset + 1..offset - 1])
|
|
});
|
|
|
|
return Some(Err(Error::UnsupportedEscape {
|
|
input: self.input,
|
|
offset: self.tracker.get_offset(offset - 1),
|
|
character: c,
|
|
span,
|
|
}));
|
|
}
|
|
}
|
|
|
|
escape = false;
|
|
continue;
|
|
}
|
|
|
|
match c {
|
|
'\\' => {
|
|
escape = true;
|
|
used_backup = true;
|
|
if filled_backup {
|
|
backup.push_str(&self.input[last_offset..offset]);
|
|
}
|
|
}
|
|
'"' => {
|
|
return Some(Ok(self.tracker.end(offset + 1, if used_backup {
|
|
backup.push_str(&self.input[last_offset..offset]);
|
|
Token::String(Cow::Owned(backup))
|
|
} else {
|
|
Token::String(Cow::Borrowed(&self.input[last_offset..offset]))
|
|
})));
|
|
}
|
|
|
|
_ => continue,
|
|
}
|
|
}
|
|
|
|
let span = self.tracker.end(self.input.len(), if used_backup {
|
|
backup.push_str(&self.input[last_offset..]);
|
|
Cow::Owned(backup)
|
|
} else {
|
|
Cow::Borrowed(&self.input[last_offset..])
|
|
});
|
|
|
|
return Some(Err(Error::MissingEndQuote {
|
|
input: self.input,
|
|
offset: span.end,
|
|
span,
|
|
}));
|
|
}
|
|
'#' => {
|
|
if let Some((_, '|')) = self.reader.peek() {
|
|
// consume the pipe
|
|
self.reader.next();
|
|
return Some(Ok(self.tracker.end(start_offset + 2, Token::LComment)));
|
|
}
|
|
|
|
// fall through to atom
|
|
}
|
|
|
|
'|' => {
|
|
if let Some((_, '#')) = self.reader.peek() {
|
|
// consume the pipe
|
|
self.reader.next();
|
|
return Some(Ok(self.tracker.end(start_offset + 2, Token::RComment)));
|
|
}
|
|
|
|
// fall through to atom
|
|
}
|
|
|
|
_ => {}
|
|
}
|
|
|
|
while let Some((end_offset, c)) = self.reader.peek().copied() {
|
|
if c == ')' || c == '(' || c.is_whitespace() || c == '"' || (c == '|' && self.input.as_bytes()[end_offset + 1] == b'#') || (c == '#' && self.input.as_bytes()[end_offset + 1] == b'|') {
|
|
return Some(Ok(self.tracker.end(end_offset, Token::Atom(&self.input[start_offset..end_offset]))));
|
|
}
|
|
|
|
self.reader.next();
|
|
}
|
|
|
|
return Some(Ok(self.tracker.end(self.input.len(), Token::Atom(&self.input[start_offset..]))));
|
|
}
|
|
|
|
None
|
|
}
|
|
}
|
|
|
|
pub trait Tracker {
|
|
type Offset: Copy + Debug;
|
|
|
|
fn process_newline(&mut self, _offset: usize) {}
|
|
|
|
fn start(&mut self, _offset: usize) {}
|
|
|
|
fn end<T>(&mut self, offset: usize, value: T) -> Span<T, Self::Offset>;
|
|
|
|
fn get_offset(&self, offset: usize) -> Self::Offset;
|
|
}
|
|
|
|
#[derive(Default, Debug, Copy, Clone)]
|
|
pub struct NoTracker;
|
|
|
|
impl Tracker for NoTracker {
|
|
type Offset = ();
|
|
|
|
fn end<T>(&mut self, _offset: usize, value: T) -> Span<T, Self::Offset> {
|
|
Span::new((), (), value)
|
|
}
|
|
|
|
fn get_offset(&self, _offset: usize) -> Self::Offset {
|
|
()
|
|
}
|
|
}
|
|
|
|
pub struct OffsetTracker {
|
|
start: usize,
|
|
}
|
|
|
|
impl Default for OffsetTracker {
|
|
fn default() -> Self {
|
|
OffsetTracker {
|
|
start: 0,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Tracker for OffsetTracker {
|
|
type Offset = usize;
|
|
|
|
fn start(&mut self, offset: usize) {
|
|
self.start = offset;
|
|
}
|
|
|
|
fn end<T>(&mut self, offset: usize, value: T) -> Span<T> {
|
|
Span {
|
|
start: self.start,
|
|
end: offset,
|
|
value,
|
|
}
|
|
}
|
|
|
|
fn get_offset(&self, offset: usize) -> Self::Offset {
|
|
offset
|
|
}
|
|
}
|
|
|
|
pub struct LineTracker {
|
|
start: LineOffset,
|
|
last_line_offset: usize,
|
|
line_n: usize,
|
|
}
|
|
|
|
impl Default for LineTracker {
|
|
fn default() -> Self {
|
|
LineTracker {
|
|
start: LineOffset {
|
|
line: 0,
|
|
line_offset: 0,
|
|
absolute_offset: 0,
|
|
},
|
|
last_line_offset: 0,
|
|
line_n: 0,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Tracker for LineTracker {
|
|
type Offset = LineOffset;
|
|
|
|
fn process_newline(&mut self, offset: usize) {
|
|
self.line_n += 1;
|
|
self.last_line_offset = offset + 1;
|
|
}
|
|
|
|
fn start(&mut self, offset: usize) {
|
|
self.start = self.get_offset(offset)
|
|
}
|
|
|
|
fn end<T>(&mut self, offset: usize, value: T) -> LineSpan<T> {
|
|
Span {
|
|
start: self.start,
|
|
end: self.get_offset(offset),
|
|
value,
|
|
}
|
|
}
|
|
|
|
fn get_offset(&self, offset: usize) -> Self::Offset {
|
|
LineOffset {
|
|
line: self.line_n,
|
|
line_offset: offset - self.last_line_offset,
|
|
absolute_offset: offset,
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
#[cfg(test)]
mod tests {
    use std::borrow::Cow;
    use crate::lexer::{Error, Lexer, LineTracker, OffsetTracker, Token};
    use crate::span::{LineOffset, LineSpan, Span};

    /// Every token kind is recognised; trailing whitespace yields no further token.
    #[test]
    fn test_simple() {
        let mut lex = Lexer::new("( ) #| |# \"hello\" hello \"hello\\n\"\n \n");

        assert_eq!(lex.next().unwrap().unwrap().value, Token::LParen);
        assert_eq!(lex.next().unwrap().unwrap().value, Token::RParen);
        assert_eq!(lex.next().unwrap().unwrap().value, Token::LComment);
        assert_eq!(lex.next().unwrap().unwrap().value, Token::RComment);
        assert_eq!(lex.next().unwrap().unwrap().value, Token::String(Cow::Borrowed("hello")));
        assert_eq!(lex.next().unwrap().unwrap().value, Token::Atom("hello"));
        assert_eq!(lex.next().unwrap().unwrap().value, Token::String(Cow::Owned("hello\n".to_string())));
        assert!(lex.next().is_none())
    }

    /// Spans carry absolute byte offsets when `OffsetTracker` is used.
    #[test]
    fn test_offset_tracker() {
        let mut lex = Lexer::with_tracker::<OffsetTracker>(r#"( ) #| |# "hello" hello "hello\n\\\"""#);

        assert_eq!(lex.next().unwrap().unwrap(), Span::new(0, 1, Token::LParen));
        assert_eq!(lex.next().unwrap().unwrap(), Span::new(2, 3, Token::RParen));
        assert_eq!(lex.next().unwrap().unwrap(), Span::new(4, 6, Token::LComment));
        assert_eq!(lex.next().unwrap().unwrap(), Span::new(7, 9, Token::RComment));
        assert_eq!(lex.next().unwrap().unwrap(), Span::new(10, 17, Token::String(Cow::Borrowed("hello"))));
        assert_eq!(lex.next().unwrap().unwrap(), Span::new(18, 23, Token::Atom("hello")));
        assert_eq!(lex.next().unwrap().unwrap(), Span::new(24, 37, Token::String(Cow::Owned("hello\n\\\"".to_string()))));
        assert!(lex.next().is_none())
    }

    /// Spans carry line-relative positions when `LineTracker` is used.
    #[test]
    fn test_line_tracker() {
        let mut lex = Lexer::with_tracker::<LineTracker>("(\n )");

        assert_eq!(lex.next().unwrap().unwrap(), LineSpan::new(LineOffset::new(0, 0, 0), LineOffset::new(0, 1, 1), Token::LParen));
        assert_eq!(lex.next().unwrap().unwrap(), LineSpan::new(LineOffset::new(1, 1, 3), LineOffset::new(1, 2, 4), Token::RParen));
        assert!(lex.next().is_none())
    }

    /// Unterminated strings and unknown escapes are reported as errors.
    #[test]
    fn test_error() {
        let mut lex = Lexer::new("\"hello");
        let item = lex.next();
        // Formatting exercises the `Debug` implementation; the text itself is not inspected.
        let _ = format!("{:?}", item);
        assert_eq!(item, Some(Err(Error::MissingEndQuote {
            input: "\"hello",
            offset: (),
            span: Span::new((), (), Cow::Borrowed("hello")),
        })));

        let mut lex = Lexer::new("\"hello\\x");
        let item = lex.next();
        let _ = format!("{:?}", item);
        assert_eq!(item, Some(Err(Error::UnsupportedEscape {
            input: "\"hello\\x",
            offset: (),
            character: 'x',
            span: Span::new((), (), Cow::Borrowed("hello")),
        })));
    }
}
|