From dd4fc0ca8f9e16568787623bbdae2aec847dee76 Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Sat, 12 Jul 2025 03:23:49 +0200 Subject: [PATCH] wip --- src/highlighter/lang.rs | 597 +++++++++++++++++++--------------------- src/highlighter/mod.rs | 4 +- 2 files changed, 284 insertions(+), 317 deletions(-) diff --git a/src/highlighter/lang.rs b/src/highlighter/lang.rs index 7b16868..fd12ccf 100644 --- a/src/highlighter/lang.rs +++ b/src/highlighter/lang.rs @@ -10,7 +10,6 @@ use regex_syntax::hir::{Class, ClassBytes, ClassBytesRange, Hir, HirKind, Look}; use super::{Action, Consume, HighlightKind}; use crate::arena::{Arena, ArenaString, scratch_arena}; -use crate::cell::SemiRefCell; use crate::highlighter::{CharsetFormatter, Transition}; pub struct LanguageDefinition { @@ -48,7 +47,8 @@ pub const JSON: LanguageDefinition = { // Strings (r#"""#, String, Push("string")), // Numbers (start: minus or digit) - (r#"-?\d*(?:\.\d+)?(?:[eE][+-]?\d+)?"#, Number, Pop(1)), + (r#"-\d*(?:\.\d+)?(?:[eE][+-]?\d+)?"#, Number, Pop(1)), + (r#"\d*(?:\.\d+)?(?:[eE][+-]?\d+)?"#, Number, Pop(1)), // Booleans/null (r#"true\b"#, Keyword, Pop(1)), (r#"false\b"#, Keyword, Pop(1)), @@ -65,348 +65,315 @@ pub const JSON: LanguageDefinition = { } }; -type NodeCell<'a> = SemiRefCell>; - -// Nodes form a DFA graph which is mostly shaped like a tree. -// Each group of sibling nodes represent the edges coming out of a DFA state. -struct Node<'a> { - edge_first: Option<&'a EdgeCell<'a>>, - edge_last: Option<&'a EdgeCell<'a>>, +struct WipState { + transitions: Vec, } -impl<'a> Node<'a> { - fn new_in(arena: &'a Arena) -> &'a mut NodeCell<'a> { - arena.alloc_uninit().write(NodeCell::new(Node { edge_first: None, edge_last: None })) - } +#[derive(Clone, Copy, PartialEq, Eq)] +enum WipAction { + Change(usize), + Push(usize), + Pop(usize), } -type EdgeCell<'a> = SemiRefCell>; - -struct Edge<'a> { - edge_next: Option<&'a EdgeCell<'a>>, - dst: &'a NodeCell<'a>, - test: Consume<'a>, +#[derive(PartialEq, Eq)] +enum WipConsume { + Chars(usize), + Prefix(String), + PrefixInsensitive(String), + Charset(Box<[bool; 256]>), + Line, } -fn add_edge<'a>( - arena: &'a Arena, - src: &'a NodeCell<'a>, - dst: &'a NodeCell<'a>, - test: Consume<'a>, -) -> &'a NodeCell<'a> { - let mut src = src.borrow_mut(); +struct WipTransition { + test: WipConsume, + kind: HighlightKind, + action: WipAction, +} - // Check if the edge already exists. - { - let mut edge = src.edge_first; - while let Some(e) = edge { - let e = e.borrow(); - if e.test == test { - return e.dst; - } - edge = e.edge_next; - } +struct WipContext<'a> { + states: &'a mut Vec, + kind: HighlightKind, +} + +impl WipContext<'_> { + fn add_state(&mut self) -> usize { + self.states.push(WipState { transitions: Vec::new() }); + self.states.len() - 1 } - let edge = arena.alloc_uninit().write(EdgeCell::new(Edge { edge_next: None, dst, test })); + fn add_transition(&mut self, src: usize, dst: WipAction, test: WipConsume) -> WipAction { + let src = &mut self.states[src].transitions; - if let Some(last) = src.edge_last { - last.borrow_mut().edge_next = Some(edge); - } else { - src.edge_first = Some(edge); - } - - src.edge_last = Some(edge); - dst -} - -fn transform<'a>( - arena: &'a Arena, - src: &'a NodeCell<'a>, - dst: &'a NodeCell<'a>, - hir: &Hir, -) -> &'a NodeCell<'a> { - fn is_any_class(class: &ClassBytes) -> bool { - class.ranges() == [ClassBytesRange::new(0, 255)] - } - - match hir.kind() { - HirKind::Literal(lit) => transform_literal(arena, src, dst, &lit.0), - HirKind::Class(Class::Bytes(class)) if is_any_class(class) => { - transform_any(arena, src, dst) - } - HirKind::Class(Class::Bytes(class)) => transform_class(arena, src, dst, class), - HirKind::Look(Look::WordAscii) => dst, - HirKind::Repetition(rep) => match (rep.min, rep.max, rep.sub.kind()) { - (0, None, HirKind::Class(Class::Bytes(class))) if is_any_class(class) => { - transform_any_star(arena, src, dst) - } - (0, None, HirKind::Class(Class::Bytes(class))) => { - let dst = transform_class_plus(arena, src, dst, class); - transform_option(arena, src, dst); - dst - } - (0, Some(1), _) => { - let dst = transform(arena, src, dst, &rep.sub); - transform_option(arena, src, dst); - dst - } - (1, None, HirKind::Class(Class::Bytes(class))) => { - transform_class_plus(arena, src, dst, class) - } - _ => panic!("Unsupported HIR: {hir:?}"), - }, - HirKind::Concat(hirs) if hirs.len() >= 2 => transform_concat(arena, src, dst, hirs), - HirKind::Alternation(hirs) if hirs.len() >= 2 => transform_alt(arena, src, dst, hirs), - _ => panic!("Unsupported HIR: {hir:?}"), - } -} - -// string -fn transform_literal<'a>( - arena: &'a Arena, - src: &'a NodeCell<'a>, - dst: &'a NodeCell<'a>, - lit: &[u8], -) -> &'a NodeCell<'a> { - let copy = arena.alloc_uninit_slice(lit.len()).write_clone_of_slice(lit); - let copy = str::from_utf8(copy).unwrap(); - add_edge(arena, src, dst, Consume::Prefix(copy)) -} - -// [a-z]+ -fn transform_class_plus<'a>( - arena: &'a Arena, - src: &'a NodeCell<'a>, - dst: &'a NodeCell<'a>, - class: &ClassBytes, -) -> &'a NodeCell<'a> { - let charset = class_to_charset(arena, class); - add_edge(arena, src, dst, Consume::Charset(charset)) -} - -// [eE] -fn transform_class<'a>( - arena: &'a Arena, - src: &'a NodeCell<'a>, - dst: &'a NodeCell<'a>, - class: &ClassBytes, -) -> &'a NodeCell<'a> { - let charset = class_to_charset(arena, class); - let mut actual_dst = None; - - for i in 0..256 { - if !charset[i] { - continue; - } - - if i >= 128 { - panic!("Invalid non-ASCII class character {i}"); - } - - let ch = i as u8; - let copy = arena.alloc_uninit().write(ch.to_ascii_lowercase()); - let copy = str::from_utf8(slice::from_ref(copy)).unwrap(); - - // NOTE: Uppercase chars have a lower numeric value than lowercase chars. - // As such, we need to test for `is_ascii_uppercase`. - let test = if ch.is_ascii_uppercase() - && let upper = ch.to_ascii_lowercase() as usize - && charset[upper] - { - charset[upper] = false; - Consume::PrefixInsensitive(copy) - } else { - Consume::Prefix(copy) - }; - - let node = add_edge(arena, src, dst, test); - if !ptr::eq(node, *actual_dst.get_or_insert(node)) { - panic!("Diverging destinations for class transformer: {class:?}"); - } - } - - actual_dst.unwrap_or(dst) -} - -// .? -fn transform_option<'a>( - arena: &'a Arena, - src: &'a NodeCell<'a>, - dst: &'a NodeCell<'a>, -) -> &'a NodeCell<'a> { - add_edge(arena, src, dst, Consume::Chars(0)) -} - -// .* -fn transform_any_star<'a>( - arena: &'a Arena, - src: &'a NodeCell<'a>, - dst: &'a NodeCell<'a>, -) -> &'a NodeCell<'a> { - add_edge(arena, src, dst, Consume::Line) -} - -// . -fn transform_any<'a>( - arena: &'a Arena, - src: &'a NodeCell<'a>, - dst: &'a NodeCell<'a>, -) -> &'a NodeCell<'a> { - add_edge(arena, src, dst, Consume::Chars(1)) -} - -fn transform_concat<'a>( - arena: &'a Arena, - mut src: &'a NodeCell<'a>, - dst: &'a NodeCell<'a>, - hirs: &[Hir], -) -> &'a NodeCell<'a> { - fn check_lowercase_literal(hir: &Hir) -> Option { - if let HirKind::Class(Class::Bytes(class)) = hir.kind() - && let ranges = class.ranges() - && ranges.len() == 2 - && ranges[0].len() == 1 - && ranges[1].len() == 1 - && let lower_a = ranges[0].start().to_ascii_lowercase() - && let lower_b = ranges[1].start().to_ascii_lowercase() - && lower_a == lower_b - { - Some(lower_a) - } else { - None - } - } - - let mut it = hirs.iter().peekable(); - - while let Some(mut hir) = it.next() { - if let Some(ch) = check_lowercase_literal(hir) { - // Transform [aA][bB][cC] into PrefixInsensitive("abc") - let mut str = ManuallyDrop::new(ArenaString::new_in(arena)); - str.push(ch as char); - - while let Some(next_hir) = it.peek() { - if let Some(next_ch) = check_lowercase_literal(next_hir) { - str.push(next_ch as char); - it.next(); - } else { - break; + // Check if the edge already exists. + for t in src.iter() { + if t.test == test { + match t.action { + WipAction::Change(_) => return t.action, + _ => panic!("Existing edge with non-change action"), } } - - let next = if it.peek().is_some() { Node::new_in(arena) } else { dst }; - let str: &'a str = unsafe { mem::transmute(str.as_str()) }; - src = add_edge(arena, src, next, Consume::PrefixInsensitive(str)); - } else { - let next = if it.peek().is_some() { Node::new_in(arena) } else { dst }; - src = transform(arena, src, next, hir); } + + src.push(WipTransition { test, kind: self.kind, action: dst }); + dst } - src -} - -fn transform_alt<'a>( - arena: &'a Arena, - src: &'a NodeCell<'a>, - dst: &'a NodeCell<'a>, - hirs: &[Hir], -) -> &'a NodeCell<'a> { - let mut actual_dst = None; - - for hir in hirs { - let node = transform(arena, src, dst, hir); - if !ptr::eq(node, *actual_dst.get_or_insert(node)) { - panic!("Diverging destinations for alternation transformer: {hirs:?}"); + fn transform(&mut self, src: usize, dst: WipAction, hir: &Hir) -> WipAction { + fn is_any_class(class: &ClassBytes) -> bool { + class.ranges() == [ClassBytesRange::new(0, 255)] } - } - actual_dst.unwrap_or(dst) -} - -fn class_to_charset<'a>(arena: &'a Arena, class: &ClassBytes) -> &'a mut [bool; 256] { - let mut charset = arena.alloc_uninit().write([false; 256]); - - for r in class.iter() { - charset[r.start() as usize..=r.end() as usize].fill(true); - } - - // If the class includes \w, we also set any non-ASCII characters. - // That's not how Unicode works, but it simplifies the implementation. - if [(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')] - .iter() - .all(|&(beg, end)| charset[beg as usize..=end as usize].iter().all(|&b| b)) - { - charset[0x80..=0xFF].fill(true); - } - - charset -} - -fn print_mermaid<'a>(root: &'a NodeCell<'a>) { - fn node_id<'a, 'v>( - visited: &'v mut HashMap<*const NodeCell<'a>, (usize, bool)>, - ptr: &'a NodeCell<'a>, - ) -> &'v mut (usize, bool) { - let num = visited.len(); - match visited.entry(ptr as *const _) { - Entry::Occupied(mut e) => e.into_mut(), - Entry::Vacant(mut e) => e.insert((num, false)), - } - } - - fn walk<'a>( - node: &'a NodeCell<'a>, - visited: &mut HashMap<*const NodeCell<'a>, (usize, bool)>, - out: &mut String, - ) { - let node_ptr = node as *const _; - let src_id = match node_id(visited, node) { - (num, visited) if !*visited => { - *visited = true; - *num + match hir.kind() { + HirKind::Literal(lit) => self.transform_literal(src, dst, &lit.0), + HirKind::Class(Class::Bytes(class)) if is_any_class(class) => { + self.transform_any(src, dst) } - _ => return, // Already visited - }; + HirKind::Class(Class::Bytes(class)) => self.transform_class(src, dst, class), + HirKind::Look(Look::WordAscii) => dst, + HirKind::Repetition(rep) => match (rep.min, rep.max, rep.sub.kind()) { + (0, None, HirKind::Class(Class::Bytes(class))) if is_any_class(class) => { + self.transform_any_star(src, dst) + } + (0, None, HirKind::Class(Class::Bytes(class))) => { + let dst = self.transform_class_plus(src, dst, class); + self.transform_option(src, dst); + dst + } + (0, Some(1), _) => { + let dst = self.transform(src, dst, &rep.sub); + self.transform_option(src, dst); + dst + } + (1, None, HirKind::Class(Class::Bytes(class))) => { + self.transform_class_plus(src, dst, class) + } + _ => panic!("Unsupported HIR: {hir:?}"), + }, + HirKind::Concat(hirs) if hirs.len() >= 2 => self.transform_concat(src, dst, hirs), + HirKind::Alternation(hirs) if hirs.len() >= 2 => self.transform_alt(src, dst, hirs), + _ => panic!("Unsupported HIR: {hir:?}"), + } + } - let node_ref = node.borrow(); - let mut edge = node_ref.edge_first; + // string + fn transform_literal(&mut self, src: usize, dst: WipAction, lit: &[u8]) -> WipAction { + self.add_transition(src, dst, WipConsume::Prefix(String::from_utf8(lit.to_vec()).unwrap())) + } - while let Some(edge_cell) = edge { - let edge_ref = edge_cell.borrow(); - let &mut (dst_id, _) = node_id(visited, edge_ref.dst); - let label = match &edge_ref.test { - Consume::Prefix(s) => format!("Prefix({s})"), - Consume::PrefixInsensitive(s) => format!("PrefixInsensitive({s})"), - Consume::Charset(c) => format!("Charset({:?})", CharsetFormatter(c)), - Consume::Chars(n) => format!("Chars({n})"), - Consume::Line => "Line".to_string(), + // [a-z]+ + fn transform_class_plus( + &mut self, + src: usize, + dst: WipAction, + class: &ClassBytes, + ) -> WipAction { + let charset = self.class_to_charset(class); + self.add_transition(src, dst, WipConsume::Charset(charset)) + } + + // [eE] + fn transform_class(&mut self, src: usize, dst: WipAction, class: &ClassBytes) -> WipAction { + let mut charset = self.class_to_charset(class); + let mut actual_dst = None; + + for i in 0..256 { + if !charset[i] { + continue; + } + + if i >= 128 { + panic!("Invalid non-ASCII class character {i}"); + } + + let ch = i as u8; + let str = String::from_utf8(slice::from_ref(&ch).to_vec()).unwrap(); + + // NOTE: Uppercase chars have a lower numeric value than lowercase chars. + // As such, we need to test for `is_ascii_uppercase`. + let test = if ch.is_ascii_uppercase() + && let upper = ch.to_ascii_lowercase() as usize + && charset[upper] + { + charset[upper] = false; + WipConsume::PrefixInsensitive(str) + } else { + WipConsume::Prefix(str) + }; + + let d = self.add_transition(src, dst, test); + if d != *actual_dst.get_or_insert(d) { + panic!("Diverging destinations for class transformer: {class:?}"); + } + } + + actual_dst.unwrap_or(dst) + } + + // .? + fn transform_option(&mut self, src: usize, dst: WipAction) -> WipAction { + self.add_transition(src, dst, WipConsume::Chars(0)) + } + + // .* + fn transform_any_star(&mut self, src: usize, dst: WipAction) -> WipAction { + self.add_transition(src, dst, WipConsume::Line) + } + + // . + fn transform_any(&mut self, src: usize, dst: WipAction) -> WipAction { + self.add_transition(src, dst, WipConsume::Chars(1)) + } + + fn transform_concat(&mut self, src: usize, dst: WipAction, hirs: &[Hir]) -> WipAction { + fn check_lowercase_literal(hir: &Hir) -> Option { + if let HirKind::Class(Class::Bytes(class)) = hir.kind() + && let ranges = class.ranges() + && ranges.len() == 2 + && ranges[0].len() == 1 + && ranges[1].len() == 1 + && let lower_a = ranges[0].start().to_ascii_lowercase() + && let lower_b = ranges[1].start().to_ascii_lowercase() + && lower_a == lower_b + { + Some(lower_a) + } else { + None + } + } + + let mut it = hirs.iter().peekable(); + let mut src = WipAction::Change(src); + + while let Some(mut hir) = it.next() { + let src_idx = match src { + WipAction::Change(idx) => idx, + _ => panic!("Unexpected action in transform_concat"), + }; + + if let Some(ch) = check_lowercase_literal(hir) { + // Transform [aA][bB][cC] into PrefixInsensitive("abc"). + let mut str = String::new(); + str.push(ch as char); + + while let Some(next_hir) = it.peek() { + if let Some(next_ch) = check_lowercase_literal(next_hir) { + str.push(next_ch as char); + it.next(); + } else { + break; + } + } + + let next = + if it.peek().is_some() { WipAction::Change(self.add_state()) } else { dst }; + src = self.add_transition(src_idx, next, WipConsume::PrefixInsensitive(str)); + } else { + // Any other sequence is simply concatenated. + let next = + if it.peek().is_some() { WipAction::Change(self.add_state()) } else { dst }; + src = self.transform(src_idx, next, hir); + } + } + + src + } + + fn transform_alt(&mut self, src: usize, dst: WipAction, hirs: &[Hir]) -> WipAction { + let mut actual_dst = None; + + for hir in hirs { + let d = self.transform(src, dst, hir); + if d != *actual_dst.get_or_insert(d) { + panic!("Diverging destinations for alternation transformer: {hirs:?}"); + } + } + + actual_dst.unwrap_or(dst) + } + + fn class_to_charset(&mut self, class: &ClassBytes) -> Box<[bool; 256]> { + let mut charset = Box::new([false; 256]); + + for r in class.iter() { + charset[r.start() as usize..=r.end() as usize].fill(true); + } + + // If the class includes \w, we also set any non-ASCII characters. + // That's not how Unicode works, but it simplifies the implementation. + if [(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')] + .iter() + .all(|&(beg, end)| charset[beg as usize..=end as usize].iter().all(|&b| b)) + { + charset[0x80..=0xFF].fill(true); + } + + charset + } +} + +fn print_mermaid(def_states: &[StateDefinition], states: &[WipState]) { + // Print header for Mermaid graph + println!("%%{{init:{{'fontFamily':'monospace','flowchart':{{'defaultRenderer':'elk'}}}}}}%%"); + println!("graph TD"); + + // Print nodes (states) + for (idx, _state) in states.iter().enumerate() { + println!( + " {idx}[\"{}\"]", + match def_states.get(idx) { + Some(state) => state.name, + None => &format!("{idx}"), + } + ); + } + + // Print edges (transitions) + for (src_idx, state) in states.iter().enumerate() { + for t in &state.transitions { + let dst = match t.action { + WipAction::Change(idx) => format!("{idx}"), + WipAction::Push(idx) => { + format!("push{}[/\"Push({})\"/]", src_idx << 16 | idx, def_states[idx].name) + } + WipAction::Pop(count) => { + format!("pop{}[/\"Pop({count})\"/]", src_idx << 16 | count) + } + }; + let label = match &t.test { + WipConsume::Prefix(s) => format!("Prefix({s})"), + WipConsume::PrefixInsensitive(s) => format!("PrefixInsensitive({s})"), + WipConsume::Charset(c) => format!("Charset({:?})", CharsetFormatter(c)), + WipConsume::Chars(n) => format!("Chars({n})"), + WipConsume::Line => "Line".to_string(), }; let label = label.replace('"', """); - out.push_str(&format!(" {src_id} -->|\"{label}\"| {dst_id}\n")); - - walk(edge_ref.dst, visited, out); - - edge = edge_ref.edge_next; + let label = label.replace('\\', r#"\\"#); + println!(" {src_idx} -->|\"{label}\"| {dst}"); } } - - let mut out = String::from( - "%%{init:{'fontFamily':'monospace','flowchart':{'defaultRenderer':'elk'}}}%%\ngraph TD\n", - ); - let mut visited = HashMap::new(); - walk(root, &mut visited, &mut out); - println!("{out}"); } #[allow(dead_code)] pub fn parse_language_definition(def: &LanguageDefinition) { - let scratch = scratch_arena(None); - let root = Node::new_in(&scratch); + let mut state_names = HashMap::new(); + let mut states = Vec::new(); for state in def.states { + state_names.insert(state.name, states.len()); + states.push(WipState { transitions: Vec::new() }); + } + + for (ground_idx, state) in def.states.iter().enumerate() { for (pattern, kind, action) in state.rules { + let mut ctx = WipContext { states: &mut states, kind: *kind }; + let dst = match action { + ActionDefinition::Push(name) => match state_names.get(name) { + Some(&idx) => WipAction::Push(idx), + None => panic!("Unknown state name: {name}"), + }, + ActionDefinition::Pop(count) => WipAction::Pop(*count), + }; let hir = regex_syntax::ParserBuilder::new() .utf8(false) .unicode(false) @@ -414,11 +381,11 @@ pub fn parse_language_definition(def: &LanguageDefinition) { .build() .parse(pattern) .unwrap(); - transform(&scratch, root, root, &hir); + ctx.transform(ground_idx, dst, &hir); } } - print_mermaid(root); + print_mermaid(def.states, &states); } #[cfg(test)] diff --git a/src/highlighter/mod.rs b/src/highlighter/mod.rs index 1fc7206..b0ccec8 100644 --- a/src/highlighter/mod.rs +++ b/src/highlighter/mod.rs @@ -83,7 +83,7 @@ impl Language { struct Transition<'s> { test: Consume<'s>, kind: HighlightKind, - state: Action, + action: Action, } #[derive(PartialEq, Eq)] @@ -337,7 +337,7 @@ impl<'doc> Highlighter<'doc> { } } - match t.state { + match t.action { Action::Change(to) => { if let Some(last) = res.last_mut() { last.kind = t.kind;