This commit is contained in:
Leonard Hecker
2025-07-12 03:23:49 +02:00
parent f377e9f4bf
commit dd4fc0ca8f
2 changed files with 284 additions and 317 deletions

View File

@@ -10,7 +10,6 @@ use regex_syntax::hir::{Class, ClassBytes, ClassBytesRange, Hir, HirKind, Look};
use super::{Action, Consume, HighlightKind};
use crate::arena::{Arena, ArenaString, scratch_arena};
use crate::cell::SemiRefCell;
use crate::highlighter::{CharsetFormatter, Transition};
pub struct LanguageDefinition {
@@ -48,7 +47,8 @@ pub const JSON: LanguageDefinition = {
// Strings
(r#"""#, String, Push("string")),
// Numbers (start: minus or digit)
(r#"-?\d*(?:\.\d+)?(?:[eE][+-]?\d+)?"#, Number, Pop(1)),
(r#"-\d*(?:\.\d+)?(?:[eE][+-]?\d+)?"#, Number, Pop(1)),
(r#"\d*(?:\.\d+)?(?:[eE][+-]?\d+)?"#, Number, Pop(1)),
// Booleans/null
(r#"true\b"#, Keyword, Pop(1)),
(r#"false\b"#, Keyword, Pop(1)),
@@ -65,348 +65,315 @@ pub const JSON: LanguageDefinition = {
}
};
type NodeCell<'a> = SemiRefCell<Node<'a>>;
// Nodes form a DFA graph which is mostly shaped like a tree.
// Each group of sibling nodes represents the edges coming out of a DFA state.
struct Node<'a> {
edge_first: Option<&'a EdgeCell<'a>>,
edge_last: Option<&'a EdgeCell<'a>>,
/// One state of the work-in-progress state machine.
///
/// States are stored in a `Vec` and referred to by their index in it.
struct WipState {
/// Outgoing transitions of this state, in the order they were added.
transitions: Vec<WipTransition>,
}
impl<'a> Node<'a> {
fn new_in(arena: &'a Arena) -> &'a mut NodeCell<'a> {
arena.alloc_uninit().write(NodeCell::new(Node { edge_first: None, edge_last: None }))
}
/// The action attached to a transition (see `WipTransition::action`).
#[derive(Clone, Copy, PartialEq, Eq)]
enum WipAction {
/// Continue in the state with the given index in the state list.
Change(usize),
/// Built from `ActionDefinition::Push`; carries the index of the named state.
Push(usize),
/// Built from `ActionDefinition::Pop`; carries the pop count.
Pop(usize),
}
type EdgeCell<'a> = SemiRefCell<Edge<'a>>;
struct Edge<'a> {
edge_next: Option<&'a EdgeCell<'a>>,
dst: &'a NodeCell<'a>,
test: Consume<'a>,
/// The test a transition performs on the input, using owned data.
#[derive(PartialEq, Eq)]
enum WipConsume {
/// A fixed character count; the transformers use 0 for `.?` and 1 for `.`.
Chars(usize),
/// An exact literal prefix.
Prefix(String),
/// A case-insensitive literal prefix.
// NOTE(review): `transform_concat` stores this lowercased — confirm the matcher relies on that.
PrefixInsensitive(String),
/// A 256-entry membership table indexed by byte value.
Charset(Box<[bool; 256]>),
/// Emitted for `.*`.
// NOTE(review): presumably consumes the rest of the line — confirm in the highlighter.
Line,
}
fn add_edge<'a>(
arena: &'a Arena,
src: &'a NodeCell<'a>,
dst: &'a NodeCell<'a>,
test: Consume<'a>,
) -> &'a NodeCell<'a> {
let mut src = src.borrow_mut();
/// A single outgoing edge of a `WipState`.
struct WipTransition {
/// The input test guarding this edge.
test: WipConsume,
/// The highlight kind recorded for this edge.
kind: HighlightKind,
/// The action stored for this edge.
action: WipAction,
}
// Check if the edge already exists.
{
let mut edge = src.edge_first;
while let Some(e) = edge {
let e = e.borrow();
if e.test == test {
return e.dst;
}
edge = e.edge_next;
}
/// Builder context used while lowering one rule's regex into transitions.
struct WipContext<'a> {
/// The shared state list that new states and transitions are appended to.
states: &'a mut Vec<WipState>,
/// The highlight kind stamped onto every transition added through this context.
kind: HighlightKind,
}
impl WipContext<'_> {
/// Appends a fresh state without transitions and returns its index in `states`.
fn add_state(&mut self) -> usize {
    // The new state lands at the current end of the list.
    let index = self.states.len();
    self.states.push(WipState { transitions: Vec::new() });
    index
}
let edge = arena.alloc_uninit().write(EdgeCell::new(Edge { edge_next: None, dst, test }));
fn add_transition(&mut self, src: usize, dst: WipAction, test: WipConsume) -> WipAction {
let src = &mut self.states[src].transitions;
if let Some(last) = src.edge_last {
last.borrow_mut().edge_next = Some(edge);
} else {
src.edge_first = Some(edge);
}
src.edge_last = Some(edge);
dst
}
/// Lowers one regex HIR node into edges leading from `src` towards `dst`.
///
/// Returns the node that is actually reached after the construct. This is
/// normally `dst`, but `add_edge` may hand back the destination of an
/// already-existing identical edge instead.
///
/// # Panics
///
/// Panics with "Unsupported HIR" for any construct not listed below.
fn transform<'a>(
    arena: &'a Arena,
    src: &'a NodeCell<'a>,
    dst: &'a NodeCell<'a>,
    hir: &Hir,
) -> &'a NodeCell<'a> {
    // A byte class covering 0..=255 is how `.` shows up in the HIR.
    fn covers_all_bytes(class: &ClassBytes) -> bool {
        class.ranges() == [ClassBytesRange::new(0, 255)]
    }

    match hir.kind() {
        HirKind::Literal(lit) => transform_literal(arena, src, dst, &lit.0),
        HirKind::Class(Class::Bytes(class)) => {
            if covers_all_bytes(class) {
                transform_any(arena, src, dst)
            } else {
                transform_class(arena, src, dst, class)
            }
        }
        // Word boundaries add no edge; the destination is passed through.
        HirKind::Look(Look::WordAscii) => dst,
        HirKind::Repetition(rep) => match (rep.min, rep.max, rep.sub.kind()) {
            // `.*` and `[...]*`
            (0, None, HirKind::Class(Class::Bytes(class))) => {
                if covers_all_bytes(class) {
                    transform_any_star(arena, src, dst)
                } else {
                    // `[...]*` is lowered as `[...]+` plus a zero-width bypass.
                    let reached = transform_class_plus(arena, src, dst, class);
                    transform_option(arena, src, reached);
                    reached
                }
            }
            // `x?`: lower `x`, then add a zero-width bypass to the same node.
            (0, Some(1), _) => {
                let reached = transform(arena, src, dst, &rep.sub);
                transform_option(arena, src, reached);
                reached
            }
            // `[...]+`
            (1, None, HirKind::Class(Class::Bytes(class))) => {
                transform_class_plus(arena, src, dst, class)
            }
            _ => panic!("Unsupported HIR: {hir:?}"),
        },
        HirKind::Concat(hirs) if hirs.len() >= 2 => transform_concat(arena, src, dst, hirs),
        HirKind::Alternation(hirs) if hirs.len() >= 2 => transform_alt(arena, src, dst, hirs),
        _ => panic!("Unsupported HIR: {hir:?}"),
    }
}
/// Lowers a literal (e.g. `string`) into a single `Consume::Prefix` edge.
///
/// The literal bytes are copied into `arena` so the edge can borrow them.
///
/// # Panics
///
/// Panics if `lit` is not valid UTF-8.
fn transform_literal<'a>(
    arena: &'a Arena,
    src: &'a NodeCell<'a>,
    dst: &'a NodeCell<'a>,
    lit: &[u8],
) -> &'a NodeCell<'a> {
    let bytes = arena.alloc_uninit_slice(lit.len()).write_clone_of_slice(lit);
    let text = str::from_utf8(bytes).unwrap();
    let test = Consume::Prefix(text);
    add_edge(arena, src, dst, test)
}
/// Lowers a repeated class such as `[a-z]+` into a single `Consume::Charset` edge.
fn transform_class_plus<'a>(
    arena: &'a Arena,
    src: &'a NodeCell<'a>,
    dst: &'a NodeCell<'a>,
    class: &ClassBytes,
) -> &'a NodeCell<'a> {
    add_edge(arena, src, dst, Consume::Charset(class_to_charset(arena, class)))
}
/// Lowers a small byte class such as `[eE]` into one prefix edge per member.
///
/// An uppercase ASCII letter whose lowercase form is also in the class is
/// emitted once as `Consume::PrefixInsensitive` carrying the lowercase letter.
/// Every other member becomes a `Consume::Prefix` carrying the byte as written.
///
/// Returns the node reached by the edges, or `dst` if the class is empty.
///
/// # Panics
///
/// Panics if the class contains a non-ASCII byte, or if the individual edges
/// end up at different destination nodes.
fn transform_class<'a>(
    arena: &'a Arena,
    src: &'a NodeCell<'a>,
    dst: &'a NodeCell<'a>,
    class: &ClassBytes,
) -> &'a NodeCell<'a> {
    let charset = class_to_charset(arena, class);
    let mut actual_dst = None;

    for i in 0..256 {
        if !charset[i] {
            continue;
        }
        if i >= 128 {
            panic!("Invalid non-ASCII class character {i}");
        }

        // Lossless: `i` is known to be below 128 here.
        let ch = i as u8;
        let lower = ch.to_ascii_lowercase();

        // NOTE: Uppercase chars have a lower numeric value than lowercase chars,
        // so the uppercase member of a pair is always visited first.
        let insensitive = ch.is_ascii_uppercase() && charset[lower as usize];

        // Only the case-insensitive form stores the lowercase letter. A plain
        // prefix must keep the byte as written, otherwise `[A]` would match "a".
        let stored = arena.alloc_uninit().write(if insensitive { lower } else { ch });
        let text = str::from_utf8(slice::from_ref(stored)).unwrap();

        let test = if insensitive {
            // The lowercase partner is covered by this edge; skip it later on.
            charset[lower as usize] = false;
            Consume::PrefixInsensitive(text)
        } else {
            Consume::Prefix(text)
        };

        let node = add_edge(arena, src, dst, test);
        if !ptr::eq(node, *actual_dst.get_or_insert(node)) {
            panic!("Diverging destinations for class transformer: {class:?}");
        }
    }

    actual_dst.unwrap_or(dst)
}
/// Adds the bypass edge used for optional constructs (`.?`): a
/// `Consume::Chars(0)` edge from `src` to `dst`.
fn transform_option<'a>(
    arena: &'a Arena,
    src: &'a NodeCell<'a>,
    dst: &'a NodeCell<'a>,
) -> &'a NodeCell<'a> {
    let zero_width = Consume::Chars(0);
    add_edge(arena, src, dst, zero_width)
}
/// Lowers `.*` into a single `Consume::Line` edge from `src` to `dst`.
fn transform_any_star<'a>(
    arena: &'a Arena,
    src: &'a NodeCell<'a>,
    dst: &'a NodeCell<'a>,
) -> &'a NodeCell<'a> {
    let whole_line = Consume::Line;
    add_edge(arena, src, dst, whole_line)
}
/// Lowers `.` into a single `Consume::Chars(1)` edge from `src` to `dst`.
fn transform_any<'a>(
    arena: &'a Arena,
    src: &'a NodeCell<'a>,
    dst: &'a NodeCell<'a>,
) -> &'a NodeCell<'a> {
    let one_char = Consume::Chars(1);
    add_edge(arena, src, dst, one_char)
}
fn transform_concat<'a>(
arena: &'a Arena,
mut src: &'a NodeCell<'a>,
dst: &'a NodeCell<'a>,
hirs: &[Hir],
) -> &'a NodeCell<'a> {
fn check_lowercase_literal(hir: &Hir) -> Option<u8> {
if let HirKind::Class(Class::Bytes(class)) = hir.kind()
&& let ranges = class.ranges()
&& ranges.len() == 2
&& ranges[0].len() == 1
&& ranges[1].len() == 1
&& let lower_a = ranges[0].start().to_ascii_lowercase()
&& let lower_b = ranges[1].start().to_ascii_lowercase()
&& lower_a == lower_b
{
Some(lower_a)
} else {
None
}
}
let mut it = hirs.iter().peekable();
while let Some(mut hir) = it.next() {
if let Some(ch) = check_lowercase_literal(hir) {
// Transform [aA][bB][cC] into PrefixInsensitive("abc")
let mut str = ManuallyDrop::new(ArenaString::new_in(arena));
str.push(ch as char);
while let Some(next_hir) = it.peek() {
if let Some(next_ch) = check_lowercase_literal(next_hir) {
str.push(next_ch as char);
it.next();
} else {
break;
// Check if the edge already exists.
for t in src.iter() {
if t.test == test {
match t.action {
WipAction::Change(_) => return t.action,
_ => panic!("Existing edge with non-change action"),
}
}
let next = if it.peek().is_some() { Node::new_in(arena) } else { dst };
let str: &'a str = unsafe { mem::transmute(str.as_str()) };
src = add_edge(arena, src, next, Consume::PrefixInsensitive(str));
} else {
let next = if it.peek().is_some() { Node::new_in(arena) } else { dst };
src = transform(arena, src, next, hir);
}
src.push(WipTransition { test, kind: self.kind, action: dst });
dst
}
src
}
fn transform_alt<'a>(
arena: &'a Arena,
src: &'a NodeCell<'a>,
dst: &'a NodeCell<'a>,
hirs: &[Hir],
) -> &'a NodeCell<'a> {
let mut actual_dst = None;
for hir in hirs {
let node = transform(arena, src, dst, hir);
if !ptr::eq(node, *actual_dst.get_or_insert(node)) {
panic!("Diverging destinations for alternation transformer: {hirs:?}");
fn transform(&mut self, src: usize, dst: WipAction, hir: &Hir) -> WipAction {
fn is_any_class(class: &ClassBytes) -> bool {
class.ranges() == [ClassBytesRange::new(0, 255)]
}
}
actual_dst.unwrap_or(dst)
}
/// Expands a byte class into an arena-allocated 256-entry membership table,
/// indexed by byte value.
fn class_to_charset<'a>(arena: &'a Arena, class: &ClassBytes) -> &'a mut [bool; 256] {
    // The ranges that together make up `\w` in ASCII.
    const WORD_RANGES: [(u8, u8); 4] = [(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')];

    let table = arena.alloc_uninit().write([false; 256]);

    for range in class.iter() {
        let (lo, hi) = (range.start() as usize, range.end() as usize);
        table[lo..=hi].fill(true);
    }

    // If the class includes \w, we also set any non-ASCII characters.
    // That's not how Unicode works, but it simplifies the implementation.
    let includes_word = WORD_RANGES
        .iter()
        .all(|&(lo, hi)| !table[lo as usize..=hi as usize].contains(&false));
    if includes_word {
        table[0x80..=0xFF].fill(true);
    }

    table
}
fn print_mermaid<'a>(root: &'a NodeCell<'a>) {
fn node_id<'a, 'v>(
visited: &'v mut HashMap<*const NodeCell<'a>, (usize, bool)>,
ptr: &'a NodeCell<'a>,
) -> &'v mut (usize, bool) {
let num = visited.len();
match visited.entry(ptr as *const _) {
Entry::Occupied(mut e) => e.into_mut(),
Entry::Vacant(mut e) => e.insert((num, false)),
}
}
fn walk<'a>(
node: &'a NodeCell<'a>,
visited: &mut HashMap<*const NodeCell<'a>, (usize, bool)>,
out: &mut String,
) {
let node_ptr = node as *const _;
let src_id = match node_id(visited, node) {
(num, visited) if !*visited => {
*visited = true;
*num
match hir.kind() {
HirKind::Literal(lit) => self.transform_literal(src, dst, &lit.0),
HirKind::Class(Class::Bytes(class)) if is_any_class(class) => {
self.transform_any(src, dst)
}
_ => return, // Already visited
};
HirKind::Class(Class::Bytes(class)) => self.transform_class(src, dst, class),
HirKind::Look(Look::WordAscii) => dst,
HirKind::Repetition(rep) => match (rep.min, rep.max, rep.sub.kind()) {
(0, None, HirKind::Class(Class::Bytes(class))) if is_any_class(class) => {
self.transform_any_star(src, dst)
}
(0, None, HirKind::Class(Class::Bytes(class))) => {
let dst = self.transform_class_plus(src, dst, class);
self.transform_option(src, dst);
dst
}
(0, Some(1), _) => {
let dst = self.transform(src, dst, &rep.sub);
self.transform_option(src, dst);
dst
}
(1, None, HirKind::Class(Class::Bytes(class))) => {
self.transform_class_plus(src, dst, class)
}
_ => panic!("Unsupported HIR: {hir:?}"),
},
HirKind::Concat(hirs) if hirs.len() >= 2 => self.transform_concat(src, dst, hirs),
HirKind::Alternation(hirs) if hirs.len() >= 2 => self.transform_alt(src, dst, hirs),
_ => panic!("Unsupported HIR: {hir:?}"),
}
}
let node_ref = node.borrow();
let mut edge = node_ref.edge_first;
/// Lowers a literal (e.g. `string`) into a single `WipConsume::Prefix` transition.
///
/// # Panics
///
/// Panics if `lit` is not valid UTF-8.
fn transform_literal(&mut self, src: usize, dst: WipAction, lit: &[u8]) -> WipAction {
    let text = String::from_utf8(lit.to_owned()).unwrap();
    let test = WipConsume::Prefix(text);
    self.add_transition(src, dst, test)
}
while let Some(edge_cell) = edge {
let edge_ref = edge_cell.borrow();
let &mut (dst_id, _) = node_id(visited, edge_ref.dst);
let label = match &edge_ref.test {
Consume::Prefix(s) => format!("Prefix({s})"),
Consume::PrefixInsensitive(s) => format!("PrefixInsensitive({s})"),
Consume::Charset(c) => format!("Charset({:?})", CharsetFormatter(c)),
Consume::Chars(n) => format!("Chars({n})"),
Consume::Line => "Line".to_string(),
/// Lowers a repeated class such as `[a-z]+` into a single
/// `WipConsume::Charset` transition.
fn transform_class_plus(
    &mut self,
    src: usize,
    dst: WipAction,
    class: &ClassBytes,
) -> WipAction {
    let test = WipConsume::Charset(self.class_to_charset(class));
    self.add_transition(src, dst, test)
}
// [eE]
fn transform_class(&mut self, src: usize, dst: WipAction, class: &ClassBytes) -> WipAction {
let mut charset = self.class_to_charset(class);
let mut actual_dst = None;
for i in 0..256 {
if !charset[i] {
continue;
}
if i >= 128 {
panic!("Invalid non-ASCII class character {i}");
}
let ch = i as u8;
let str = String::from_utf8(slice::from_ref(&ch).to_vec()).unwrap();
// NOTE: Uppercase chars have a lower numeric value than lowercase chars.
// As such, we need to test for `is_ascii_uppercase`.
let test = if ch.is_ascii_uppercase()
&& let upper = ch.to_ascii_lowercase() as usize
&& charset[upper]
{
charset[upper] = false;
WipConsume::PrefixInsensitive(str)
} else {
WipConsume::Prefix(str)
};
let d = self.add_transition(src, dst, test);
if d != *actual_dst.get_or_insert(d) {
panic!("Diverging destinations for class transformer: {class:?}");
}
}
actual_dst.unwrap_or(dst)
}
/// Adds the bypass transition used for optional constructs (`.?`): a
/// `WipConsume::Chars(0)` transition from `src` to `dst`.
fn transform_option(&mut self, src: usize, dst: WipAction) -> WipAction {
    let zero_width = WipConsume::Chars(0);
    self.add_transition(src, dst, zero_width)
}
/// Lowers `.*` into a single `WipConsume::Line` transition from `src` to `dst`.
fn transform_any_star(&mut self, src: usize, dst: WipAction) -> WipAction {
    let whole_line = WipConsume::Line;
    self.add_transition(src, dst, whole_line)
}
/// Lowers `.` into a single `WipConsume::Chars(1)` transition from `src` to `dst`.
fn transform_any(&mut self, src: usize, dst: WipAction) -> WipAction {
    let one_char = WipConsume::Chars(1);
    self.add_transition(src, dst, one_char)
}
/// Lowers a concatenation by chaining its parts through intermediate states.
///
/// Every part except the last targets a freshly added state; the last part
/// targets `dst`. Runs of two-element classes like `[aA][bB][cC]` are folded
/// into a single `PrefixInsensitive("abc")` transition.
///
/// Returns the action produced by the last part, or `Change(src)` if `hirs`
/// is empty.
///
/// # Panics
///
/// Panics if a part that is followed by another part yields anything other
/// than `WipAction::Change`.
fn transform_concat(&mut self, src: usize, dst: WipAction, hirs: &[Hir]) -> WipAction {
    // Returns the shared lowercase byte if `hir` is a byte class made of exactly
    // two single-byte ranges whose ASCII-lowercase forms are equal (e.g. `[aA]`).
    fn case_pair(hir: &Hir) -> Option<u8> {
        let HirKind::Class(Class::Bytes(class)) = hir.kind() else {
            return None;
        };
        let ranges = class.ranges();
        if ranges.len() != 2 || ranges[0].len() != 1 || ranges[1].len() != 1 {
            return None;
        }
        let first = ranges[0].start().to_ascii_lowercase();
        let second = ranges[1].start().to_ascii_lowercase();
        (first == second).then_some(first)
    }

    let mut parts = hirs.iter().peekable();
    let mut cursor = WipAction::Change(src);

    while let Some(part) = parts.next() {
        let WipAction::Change(from) = cursor else {
            panic!("Unexpected action in transform_concat");
        };

        // Transform [aA][bB][cC] into PrefixInsensitive("abc") by swallowing
        // every directly following case pair as well.
        let folded = case_pair(part).map(|first| {
            let mut text = String::new();
            text.push(first as char);
            while let Some(next) = parts.peek().and_then(|hir| case_pair(hir)) {
                text.push(next as char);
                parts.next();
            }
            text
        });

        // Only the final part may lead to the caller's destination;
        // everything before it needs an intermediate state.
        let target = if parts.peek().is_some() {
            WipAction::Change(self.add_state())
        } else {
            dst
        };

        cursor = match folded {
            Some(text) => self.add_transition(from, target, WipConsume::PrefixInsensitive(text)),
            // Any other sequence is simply concatenated.
            None => self.transform(from, target, part),
        };
    }

    cursor
}
/// Lowers an alternation by lowering every branch from `src` towards `dst`.
///
/// Returns the action all branches agree on, or `dst` if `hirs` is empty.
///
/// # Panics
///
/// Panics if two branches end up with different actions.
fn transform_alt(&mut self, src: usize, dst: WipAction, hirs: &[Hir]) -> WipAction {
    let mut agreed: Option<WipAction> = None;

    for branch in hirs {
        let reached = self.transform(src, dst, branch);
        match agreed {
            // The first branch decides what all the others must match.
            None => agreed = Some(reached),
            Some(expected) if expected != reached => {
                panic!("Diverging destinations for alternation transformer: {hirs:?}");
            }
            Some(_) => {}
        }
    }

    agreed.unwrap_or(dst)
}
/// Expands a byte class into a boxed 256-entry membership table, indexed by
/// byte value.
fn class_to_charset(&mut self, class: &ClassBytes) -> Box<[bool; 256]> {
    // The ranges that together make up `\w` in ASCII.
    const WORD_RANGES: [(u8, u8); 4] = [(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')];

    let mut table = Box::new([false; 256]);

    for range in class.iter() {
        let (lo, hi) = (range.start() as usize, range.end() as usize);
        table[lo..=hi].fill(true);
    }

    // If the class includes \w, we also set any non-ASCII characters.
    // That's not how Unicode works, but it simplifies the implementation.
    let includes_word = WORD_RANGES
        .iter()
        .all(|&(lo, hi)| !table[lo as usize..=hi as usize].contains(&false));
    if includes_word {
        table[0x80..=0xFF].fill(true);
    }

    table
}
}
fn print_mermaid(def_states: &[StateDefinition], states: &[WipState]) {
// Print header for Mermaid graph
println!("%%{{init:{{'fontFamily':'monospace','flowchart':{{'defaultRenderer':'elk'}}}}}}%%");
println!("graph TD");
// Print nodes (states)
for (idx, _state) in states.iter().enumerate() {
println!(
" {idx}[\"{}\"]",
match def_states.get(idx) {
Some(state) => state.name,
None => &format!("{idx}"),
}
);
}
// Print edges (transitions)
for (src_idx, state) in states.iter().enumerate() {
for t in &state.transitions {
let dst = match t.action {
WipAction::Change(idx) => format!("{idx}"),
WipAction::Push(idx) => {
format!("push{}[/\"Push({})\"/]", src_idx << 16 | idx, def_states[idx].name)
}
WipAction::Pop(count) => {
format!("pop{}[/\"Pop({count})\"/]", src_idx << 16 | count)
}
};
let label = match &t.test {
WipConsume::Prefix(s) => format!("Prefix({s})"),
WipConsume::PrefixInsensitive(s) => format!("PrefixInsensitive({s})"),
WipConsume::Charset(c) => format!("Charset({:?})", CharsetFormatter(c)),
WipConsume::Chars(n) => format!("Chars({n})"),
WipConsume::Line => "Line".to_string(),
};
let label = label.replace('"', "&quot;");
out.push_str(&format!(" {src_id} -->|\"{label}\"| {dst_id}\n"));
walk(edge_ref.dst, visited, out);
edge = edge_ref.edge_next;
let label = label.replace('\\', r#"\\"#);
println!(" {src_idx} -->|\"{label}\"| {dst}");
}
}
let mut out = String::from(
"%%{init:{'fontFamily':'monospace','flowchart':{'defaultRenderer':'elk'}}}%%\ngraph TD\n",
);
let mut visited = HashMap::new();
walk(root, &mut visited, &mut out);
println!("{out}");
}
#[allow(dead_code)]
pub fn parse_language_definition(def: &LanguageDefinition) {
let scratch = scratch_arena(None);
let root = Node::new_in(&scratch);
let mut state_names = HashMap::new();
let mut states = Vec::new();
for state in def.states {
state_names.insert(state.name, states.len());
states.push(WipState { transitions: Vec::new() });
}
for (ground_idx, state) in def.states.iter().enumerate() {
for (pattern, kind, action) in state.rules {
let mut ctx = WipContext { states: &mut states, kind: *kind };
let dst = match action {
ActionDefinition::Push(name) => match state_names.get(name) {
Some(&idx) => WipAction::Push(idx),
None => panic!("Unknown state name: {name}"),
},
ActionDefinition::Pop(count) => WipAction::Pop(*count),
};
let hir = regex_syntax::ParserBuilder::new()
.utf8(false)
.unicode(false)
@@ -414,11 +381,11 @@ pub fn parse_language_definition(def: &LanguageDefinition) {
.build()
.parse(pattern)
.unwrap();
transform(&scratch, root, root, &hir);
ctx.transform(ground_idx, dst, &hir);
}
}
print_mermaid(root);
print_mermaid(def.states, &states);
}
#[cfg(test)]

View File

@@ -83,7 +83,7 @@ impl Language {
struct Transition<'s> {
test: Consume<'s>,
kind: HighlightKind,
state: Action,
action: Action,
}
#[derive(PartialEq, Eq)]
@@ -337,7 +337,7 @@ impl<'doc> Highlighter<'doc> {
}
}
match t.state {
match t.action {
Action::Change(to) => {
if let Some(last) = res.last_mut() {
last.kind = t.kind;