mirror of
https://github.com/microsoft/edit.git
synced 2026-06-15 21:16:40 -05:00
wip
This commit is contained in:
@@ -10,7 +10,6 @@ use regex_syntax::hir::{Class, ClassBytes, ClassBytesRange, Hir, HirKind, Look};
|
||||
|
||||
use super::{Action, Consume, HighlightKind};
|
||||
use crate::arena::{Arena, ArenaString, scratch_arena};
|
||||
use crate::cell::SemiRefCell;
|
||||
use crate::highlighter::{CharsetFormatter, Transition};
|
||||
|
||||
pub struct LanguageDefinition {
|
||||
@@ -48,7 +47,8 @@ pub const JSON: LanguageDefinition = {
|
||||
// Strings
|
||||
(r#"""#, String, Push("string")),
|
||||
// Numbers (start: minus or digit)
|
||||
(r#"-?\d*(?:\.\d+)?(?:[eE][+-]?\d+)?"#, Number, Pop(1)),
|
||||
(r#"-\d*(?:\.\d+)?(?:[eE][+-]?\d+)?"#, Number, Pop(1)),
|
||||
(r#"\d*(?:\.\d+)?(?:[eE][+-]?\d+)?"#, Number, Pop(1)),
|
||||
// Booleans/null
|
||||
(r#"true\b"#, Keyword, Pop(1)),
|
||||
(r#"false\b"#, Keyword, Pop(1)),
|
||||
@@ -65,348 +65,315 @@ pub const JSON: LanguageDefinition = {
|
||||
}
|
||||
};
|
||||
|
||||
type NodeCell<'a> = SemiRefCell<Node<'a>>;
|
||||
|
||||
// Nodes form a DFA graph which is mostly shaped like a tree.
|
||||
// Each group of sibling nodes represent the edges coming out of a DFA state.
|
||||
struct Node<'a> {
|
||||
edge_first: Option<&'a EdgeCell<'a>>,
|
||||
edge_last: Option<&'a EdgeCell<'a>>,
|
||||
struct WipState {
|
||||
transitions: Vec<WipTransition>,
|
||||
}
|
||||
|
||||
impl<'a> Node<'a> {
|
||||
fn new_in(arena: &'a Arena) -> &'a mut NodeCell<'a> {
|
||||
arena.alloc_uninit().write(NodeCell::new(Node { edge_first: None, edge_last: None }))
|
||||
}
|
||||
#[derive(Clone, Copy, PartialEq, Eq)]
|
||||
enum WipAction {
|
||||
Change(usize),
|
||||
Push(usize),
|
||||
Pop(usize),
|
||||
}
|
||||
|
||||
type EdgeCell<'a> = SemiRefCell<Edge<'a>>;
|
||||
|
||||
struct Edge<'a> {
|
||||
edge_next: Option<&'a EdgeCell<'a>>,
|
||||
dst: &'a NodeCell<'a>,
|
||||
test: Consume<'a>,
|
||||
#[derive(PartialEq, Eq)]
|
||||
enum WipConsume {
|
||||
Chars(usize),
|
||||
Prefix(String),
|
||||
PrefixInsensitive(String),
|
||||
Charset(Box<[bool; 256]>),
|
||||
Line,
|
||||
}
|
||||
|
||||
fn add_edge<'a>(
|
||||
arena: &'a Arena,
|
||||
src: &'a NodeCell<'a>,
|
||||
dst: &'a NodeCell<'a>,
|
||||
test: Consume<'a>,
|
||||
) -> &'a NodeCell<'a> {
|
||||
let mut src = src.borrow_mut();
|
||||
struct WipTransition {
|
||||
test: WipConsume,
|
||||
kind: HighlightKind,
|
||||
action: WipAction,
|
||||
}
|
||||
|
||||
// Check if the edge already exists.
|
||||
{
|
||||
let mut edge = src.edge_first;
|
||||
while let Some(e) = edge {
|
||||
let e = e.borrow();
|
||||
if e.test == test {
|
||||
return e.dst;
|
||||
}
|
||||
edge = e.edge_next;
|
||||
}
|
||||
struct WipContext<'a> {
|
||||
states: &'a mut Vec<WipState>,
|
||||
kind: HighlightKind,
|
||||
}
|
||||
|
||||
impl WipContext<'_> {
|
||||
fn add_state(&mut self) -> usize {
|
||||
self.states.push(WipState { transitions: Vec::new() });
|
||||
self.states.len() - 1
|
||||
}
|
||||
|
||||
let edge = arena.alloc_uninit().write(EdgeCell::new(Edge { edge_next: None, dst, test }));
|
||||
fn add_transition(&mut self, src: usize, dst: WipAction, test: WipConsume) -> WipAction {
|
||||
let src = &mut self.states[src].transitions;
|
||||
|
||||
if let Some(last) = src.edge_last {
|
||||
last.borrow_mut().edge_next = Some(edge);
|
||||
} else {
|
||||
src.edge_first = Some(edge);
|
||||
}
|
||||
|
||||
src.edge_last = Some(edge);
|
||||
dst
|
||||
}
|
||||
|
||||
fn transform<'a>(
|
||||
arena: &'a Arena,
|
||||
src: &'a NodeCell<'a>,
|
||||
dst: &'a NodeCell<'a>,
|
||||
hir: &Hir,
|
||||
) -> &'a NodeCell<'a> {
|
||||
fn is_any_class(class: &ClassBytes) -> bool {
|
||||
class.ranges() == [ClassBytesRange::new(0, 255)]
|
||||
}
|
||||
|
||||
match hir.kind() {
|
||||
HirKind::Literal(lit) => transform_literal(arena, src, dst, &lit.0),
|
||||
HirKind::Class(Class::Bytes(class)) if is_any_class(class) => {
|
||||
transform_any(arena, src, dst)
|
||||
}
|
||||
HirKind::Class(Class::Bytes(class)) => transform_class(arena, src, dst, class),
|
||||
HirKind::Look(Look::WordAscii) => dst,
|
||||
HirKind::Repetition(rep) => match (rep.min, rep.max, rep.sub.kind()) {
|
||||
(0, None, HirKind::Class(Class::Bytes(class))) if is_any_class(class) => {
|
||||
transform_any_star(arena, src, dst)
|
||||
}
|
||||
(0, None, HirKind::Class(Class::Bytes(class))) => {
|
||||
let dst = transform_class_plus(arena, src, dst, class);
|
||||
transform_option(arena, src, dst);
|
||||
dst
|
||||
}
|
||||
(0, Some(1), _) => {
|
||||
let dst = transform(arena, src, dst, &rep.sub);
|
||||
transform_option(arena, src, dst);
|
||||
dst
|
||||
}
|
||||
(1, None, HirKind::Class(Class::Bytes(class))) => {
|
||||
transform_class_plus(arena, src, dst, class)
|
||||
}
|
||||
_ => panic!("Unsupported HIR: {hir:?}"),
|
||||
},
|
||||
HirKind::Concat(hirs) if hirs.len() >= 2 => transform_concat(arena, src, dst, hirs),
|
||||
HirKind::Alternation(hirs) if hirs.len() >= 2 => transform_alt(arena, src, dst, hirs),
|
||||
_ => panic!("Unsupported HIR: {hir:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
// string
|
||||
fn transform_literal<'a>(
|
||||
arena: &'a Arena,
|
||||
src: &'a NodeCell<'a>,
|
||||
dst: &'a NodeCell<'a>,
|
||||
lit: &[u8],
|
||||
) -> &'a NodeCell<'a> {
|
||||
let copy = arena.alloc_uninit_slice(lit.len()).write_clone_of_slice(lit);
|
||||
let copy = str::from_utf8(copy).unwrap();
|
||||
add_edge(arena, src, dst, Consume::Prefix(copy))
|
||||
}
|
||||
|
||||
// [a-z]+
|
||||
fn transform_class_plus<'a>(
|
||||
arena: &'a Arena,
|
||||
src: &'a NodeCell<'a>,
|
||||
dst: &'a NodeCell<'a>,
|
||||
class: &ClassBytes,
|
||||
) -> &'a NodeCell<'a> {
|
||||
let charset = class_to_charset(arena, class);
|
||||
add_edge(arena, src, dst, Consume::Charset(charset))
|
||||
}
|
||||
|
||||
// [eE]
|
||||
fn transform_class<'a>(
|
||||
arena: &'a Arena,
|
||||
src: &'a NodeCell<'a>,
|
||||
dst: &'a NodeCell<'a>,
|
||||
class: &ClassBytes,
|
||||
) -> &'a NodeCell<'a> {
|
||||
let charset = class_to_charset(arena, class);
|
||||
let mut actual_dst = None;
|
||||
|
||||
for i in 0..256 {
|
||||
if !charset[i] {
|
||||
continue;
|
||||
}
|
||||
|
||||
if i >= 128 {
|
||||
panic!("Invalid non-ASCII class character {i}");
|
||||
}
|
||||
|
||||
let ch = i as u8;
|
||||
let copy = arena.alloc_uninit().write(ch.to_ascii_lowercase());
|
||||
let copy = str::from_utf8(slice::from_ref(copy)).unwrap();
|
||||
|
||||
// NOTE: Uppercase chars have a lower numeric value than lowercase chars.
|
||||
// As such, we need to test for `is_ascii_uppercase`.
|
||||
let test = if ch.is_ascii_uppercase()
|
||||
&& let upper = ch.to_ascii_lowercase() as usize
|
||||
&& charset[upper]
|
||||
{
|
||||
charset[upper] = false;
|
||||
Consume::PrefixInsensitive(copy)
|
||||
} else {
|
||||
Consume::Prefix(copy)
|
||||
};
|
||||
|
||||
let node = add_edge(arena, src, dst, test);
|
||||
if !ptr::eq(node, *actual_dst.get_or_insert(node)) {
|
||||
panic!("Diverging destinations for class transformer: {class:?}");
|
||||
}
|
||||
}
|
||||
|
||||
actual_dst.unwrap_or(dst)
|
||||
}
|
||||
|
||||
// .?
|
||||
fn transform_option<'a>(
|
||||
arena: &'a Arena,
|
||||
src: &'a NodeCell<'a>,
|
||||
dst: &'a NodeCell<'a>,
|
||||
) -> &'a NodeCell<'a> {
|
||||
add_edge(arena, src, dst, Consume::Chars(0))
|
||||
}
|
||||
|
||||
// .*
|
||||
fn transform_any_star<'a>(
|
||||
arena: &'a Arena,
|
||||
src: &'a NodeCell<'a>,
|
||||
dst: &'a NodeCell<'a>,
|
||||
) -> &'a NodeCell<'a> {
|
||||
add_edge(arena, src, dst, Consume::Line)
|
||||
}
|
||||
|
||||
// .
|
||||
fn transform_any<'a>(
|
||||
arena: &'a Arena,
|
||||
src: &'a NodeCell<'a>,
|
||||
dst: &'a NodeCell<'a>,
|
||||
) -> &'a NodeCell<'a> {
|
||||
add_edge(arena, src, dst, Consume::Chars(1))
|
||||
}
|
||||
|
||||
fn transform_concat<'a>(
|
||||
arena: &'a Arena,
|
||||
mut src: &'a NodeCell<'a>,
|
||||
dst: &'a NodeCell<'a>,
|
||||
hirs: &[Hir],
|
||||
) -> &'a NodeCell<'a> {
|
||||
fn check_lowercase_literal(hir: &Hir) -> Option<u8> {
|
||||
if let HirKind::Class(Class::Bytes(class)) = hir.kind()
|
||||
&& let ranges = class.ranges()
|
||||
&& ranges.len() == 2
|
||||
&& ranges[0].len() == 1
|
||||
&& ranges[1].len() == 1
|
||||
&& let lower_a = ranges[0].start().to_ascii_lowercase()
|
||||
&& let lower_b = ranges[1].start().to_ascii_lowercase()
|
||||
&& lower_a == lower_b
|
||||
{
|
||||
Some(lower_a)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
let mut it = hirs.iter().peekable();
|
||||
|
||||
while let Some(mut hir) = it.next() {
|
||||
if let Some(ch) = check_lowercase_literal(hir) {
|
||||
// Transform [aA][bB][cC] into PrefixInsensitive("abc")
|
||||
let mut str = ManuallyDrop::new(ArenaString::new_in(arena));
|
||||
str.push(ch as char);
|
||||
|
||||
while let Some(next_hir) = it.peek() {
|
||||
if let Some(next_ch) = check_lowercase_literal(next_hir) {
|
||||
str.push(next_ch as char);
|
||||
it.next();
|
||||
} else {
|
||||
break;
|
||||
// Check if the edge already exists.
|
||||
for t in src.iter() {
|
||||
if t.test == test {
|
||||
match t.action {
|
||||
WipAction::Change(_) => return t.action,
|
||||
_ => panic!("Existing edge with non-change action"),
|
||||
}
|
||||
}
|
||||
|
||||
let next = if it.peek().is_some() { Node::new_in(arena) } else { dst };
|
||||
let str: &'a str = unsafe { mem::transmute(str.as_str()) };
|
||||
src = add_edge(arena, src, next, Consume::PrefixInsensitive(str));
|
||||
} else {
|
||||
let next = if it.peek().is_some() { Node::new_in(arena) } else { dst };
|
||||
src = transform(arena, src, next, hir);
|
||||
}
|
||||
|
||||
src.push(WipTransition { test, kind: self.kind, action: dst });
|
||||
dst
|
||||
}
|
||||
|
||||
src
|
||||
}
|
||||
|
||||
fn transform_alt<'a>(
|
||||
arena: &'a Arena,
|
||||
src: &'a NodeCell<'a>,
|
||||
dst: &'a NodeCell<'a>,
|
||||
hirs: &[Hir],
|
||||
) -> &'a NodeCell<'a> {
|
||||
let mut actual_dst = None;
|
||||
|
||||
for hir in hirs {
|
||||
let node = transform(arena, src, dst, hir);
|
||||
if !ptr::eq(node, *actual_dst.get_or_insert(node)) {
|
||||
panic!("Diverging destinations for alternation transformer: {hirs:?}");
|
||||
fn transform(&mut self, src: usize, dst: WipAction, hir: &Hir) -> WipAction {
|
||||
fn is_any_class(class: &ClassBytes) -> bool {
|
||||
class.ranges() == [ClassBytesRange::new(0, 255)]
|
||||
}
|
||||
}
|
||||
|
||||
actual_dst.unwrap_or(dst)
|
||||
}
|
||||
|
||||
fn class_to_charset<'a>(arena: &'a Arena, class: &ClassBytes) -> &'a mut [bool; 256] {
|
||||
let mut charset = arena.alloc_uninit().write([false; 256]);
|
||||
|
||||
for r in class.iter() {
|
||||
charset[r.start() as usize..=r.end() as usize].fill(true);
|
||||
}
|
||||
|
||||
// If the class includes \w, we also set any non-ASCII characters.
|
||||
// That's not how Unicode works, but it simplifies the implementation.
|
||||
if [(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')]
|
||||
.iter()
|
||||
.all(|&(beg, end)| charset[beg as usize..=end as usize].iter().all(|&b| b))
|
||||
{
|
||||
charset[0x80..=0xFF].fill(true);
|
||||
}
|
||||
|
||||
charset
|
||||
}
|
||||
|
||||
fn print_mermaid<'a>(root: &'a NodeCell<'a>) {
|
||||
fn node_id<'a, 'v>(
|
||||
visited: &'v mut HashMap<*const NodeCell<'a>, (usize, bool)>,
|
||||
ptr: &'a NodeCell<'a>,
|
||||
) -> &'v mut (usize, bool) {
|
||||
let num = visited.len();
|
||||
match visited.entry(ptr as *const _) {
|
||||
Entry::Occupied(mut e) => e.into_mut(),
|
||||
Entry::Vacant(mut e) => e.insert((num, false)),
|
||||
}
|
||||
}
|
||||
|
||||
fn walk<'a>(
|
||||
node: &'a NodeCell<'a>,
|
||||
visited: &mut HashMap<*const NodeCell<'a>, (usize, bool)>,
|
||||
out: &mut String,
|
||||
) {
|
||||
let node_ptr = node as *const _;
|
||||
let src_id = match node_id(visited, node) {
|
||||
(num, visited) if !*visited => {
|
||||
*visited = true;
|
||||
*num
|
||||
match hir.kind() {
|
||||
HirKind::Literal(lit) => self.transform_literal(src, dst, &lit.0),
|
||||
HirKind::Class(Class::Bytes(class)) if is_any_class(class) => {
|
||||
self.transform_any(src, dst)
|
||||
}
|
||||
_ => return, // Already visited
|
||||
};
|
||||
HirKind::Class(Class::Bytes(class)) => self.transform_class(src, dst, class),
|
||||
HirKind::Look(Look::WordAscii) => dst,
|
||||
HirKind::Repetition(rep) => match (rep.min, rep.max, rep.sub.kind()) {
|
||||
(0, None, HirKind::Class(Class::Bytes(class))) if is_any_class(class) => {
|
||||
self.transform_any_star(src, dst)
|
||||
}
|
||||
(0, None, HirKind::Class(Class::Bytes(class))) => {
|
||||
let dst = self.transform_class_plus(src, dst, class);
|
||||
self.transform_option(src, dst);
|
||||
dst
|
||||
}
|
||||
(0, Some(1), _) => {
|
||||
let dst = self.transform(src, dst, &rep.sub);
|
||||
self.transform_option(src, dst);
|
||||
dst
|
||||
}
|
||||
(1, None, HirKind::Class(Class::Bytes(class))) => {
|
||||
self.transform_class_plus(src, dst, class)
|
||||
}
|
||||
_ => panic!("Unsupported HIR: {hir:?}"),
|
||||
},
|
||||
HirKind::Concat(hirs) if hirs.len() >= 2 => self.transform_concat(src, dst, hirs),
|
||||
HirKind::Alternation(hirs) if hirs.len() >= 2 => self.transform_alt(src, dst, hirs),
|
||||
_ => panic!("Unsupported HIR: {hir:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
let node_ref = node.borrow();
|
||||
let mut edge = node_ref.edge_first;
|
||||
// string
|
||||
fn transform_literal(&mut self, src: usize, dst: WipAction, lit: &[u8]) -> WipAction {
|
||||
self.add_transition(src, dst, WipConsume::Prefix(String::from_utf8(lit.to_vec()).unwrap()))
|
||||
}
|
||||
|
||||
while let Some(edge_cell) = edge {
|
||||
let edge_ref = edge_cell.borrow();
|
||||
let &mut (dst_id, _) = node_id(visited, edge_ref.dst);
|
||||
let label = match &edge_ref.test {
|
||||
Consume::Prefix(s) => format!("Prefix({s})"),
|
||||
Consume::PrefixInsensitive(s) => format!("PrefixInsensitive({s})"),
|
||||
Consume::Charset(c) => format!("Charset({:?})", CharsetFormatter(c)),
|
||||
Consume::Chars(n) => format!("Chars({n})"),
|
||||
Consume::Line => "Line".to_string(),
|
||||
// [a-z]+
|
||||
fn transform_class_plus(
|
||||
&mut self,
|
||||
src: usize,
|
||||
dst: WipAction,
|
||||
class: &ClassBytes,
|
||||
) -> WipAction {
|
||||
let charset = self.class_to_charset(class);
|
||||
self.add_transition(src, dst, WipConsume::Charset(charset))
|
||||
}
|
||||
|
||||
// [eE]
|
||||
fn transform_class(&mut self, src: usize, dst: WipAction, class: &ClassBytes) -> WipAction {
|
||||
let mut charset = self.class_to_charset(class);
|
||||
let mut actual_dst = None;
|
||||
|
||||
for i in 0..256 {
|
||||
if !charset[i] {
|
||||
continue;
|
||||
}
|
||||
|
||||
if i >= 128 {
|
||||
panic!("Invalid non-ASCII class character {i}");
|
||||
}
|
||||
|
||||
let ch = i as u8;
|
||||
let str = String::from_utf8(slice::from_ref(&ch).to_vec()).unwrap();
|
||||
|
||||
// NOTE: Uppercase chars have a lower numeric value than lowercase chars.
|
||||
// As such, we need to test for `is_ascii_uppercase`.
|
||||
let test = if ch.is_ascii_uppercase()
|
||||
&& let upper = ch.to_ascii_lowercase() as usize
|
||||
&& charset[upper]
|
||||
{
|
||||
charset[upper] = false;
|
||||
WipConsume::PrefixInsensitive(str)
|
||||
} else {
|
||||
WipConsume::Prefix(str)
|
||||
};
|
||||
|
||||
let d = self.add_transition(src, dst, test);
|
||||
if d != *actual_dst.get_or_insert(d) {
|
||||
panic!("Diverging destinations for class transformer: {class:?}");
|
||||
}
|
||||
}
|
||||
|
||||
actual_dst.unwrap_or(dst)
|
||||
}
|
||||
|
||||
// .?
|
||||
fn transform_option(&mut self, src: usize, dst: WipAction) -> WipAction {
|
||||
self.add_transition(src, dst, WipConsume::Chars(0))
|
||||
}
|
||||
|
||||
// .*
|
||||
fn transform_any_star(&mut self, src: usize, dst: WipAction) -> WipAction {
|
||||
self.add_transition(src, dst, WipConsume::Line)
|
||||
}
|
||||
|
||||
// .
|
||||
fn transform_any(&mut self, src: usize, dst: WipAction) -> WipAction {
|
||||
self.add_transition(src, dst, WipConsume::Chars(1))
|
||||
}
|
||||
|
||||
fn transform_concat(&mut self, src: usize, dst: WipAction, hirs: &[Hir]) -> WipAction {
|
||||
fn check_lowercase_literal(hir: &Hir) -> Option<u8> {
|
||||
if let HirKind::Class(Class::Bytes(class)) = hir.kind()
|
||||
&& let ranges = class.ranges()
|
||||
&& ranges.len() == 2
|
||||
&& ranges[0].len() == 1
|
||||
&& ranges[1].len() == 1
|
||||
&& let lower_a = ranges[0].start().to_ascii_lowercase()
|
||||
&& let lower_b = ranges[1].start().to_ascii_lowercase()
|
||||
&& lower_a == lower_b
|
||||
{
|
||||
Some(lower_a)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
let mut it = hirs.iter().peekable();
|
||||
let mut src = WipAction::Change(src);
|
||||
|
||||
while let Some(mut hir) = it.next() {
|
||||
let src_idx = match src {
|
||||
WipAction::Change(idx) => idx,
|
||||
_ => panic!("Unexpected action in transform_concat"),
|
||||
};
|
||||
|
||||
if let Some(ch) = check_lowercase_literal(hir) {
|
||||
// Transform [aA][bB][cC] into PrefixInsensitive("abc").
|
||||
let mut str = String::new();
|
||||
str.push(ch as char);
|
||||
|
||||
while let Some(next_hir) = it.peek() {
|
||||
if let Some(next_ch) = check_lowercase_literal(next_hir) {
|
||||
str.push(next_ch as char);
|
||||
it.next();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let next =
|
||||
if it.peek().is_some() { WipAction::Change(self.add_state()) } else { dst };
|
||||
src = self.add_transition(src_idx, next, WipConsume::PrefixInsensitive(str));
|
||||
} else {
|
||||
// Any other sequence is simply concatenated.
|
||||
let next =
|
||||
if it.peek().is_some() { WipAction::Change(self.add_state()) } else { dst };
|
||||
src = self.transform(src_idx, next, hir);
|
||||
}
|
||||
}
|
||||
|
||||
src
|
||||
}
|
||||
|
||||
/// Adds every alternative in `hirs` as its own path from state `src` towards `dst`.
///
/// Returns the action all alternatives resolved to, or `dst` if `hirs` is empty.
///
/// # Panics
///
/// Panics if two alternatives resolve to different actions.
fn transform_alt(&mut self, src: usize, dst: WipAction, hirs: &[Hir]) -> WipAction {
    let mut shared: Option<WipAction> = None;

    for branch in hirs {
        let reached = self.transform(src, dst, branch);
        match shared {
            // The first alternative defines the destination all others must agree with.
            None => shared = Some(reached),
            Some(expected) if expected != reached => {
                panic!("Diverging destinations for alternation transformer: {hirs:?}");
            }
            Some(_) => {}
        }
    }

    shared.unwrap_or(dst)
}
|
||||
|
||||
/// Converts a byte class into a 256-entry lookup table in which entry `b` is
/// `true` exactly when byte `b` belongs to the class.
///
/// As a deliberate simplification, a class that covers all of `\w`
/// (`0-9`, `A-Z`, `_`, `a-z`) also gets every byte from 0x80 to 0xFF enabled.
fn class_to_charset(&mut self, class: &ClassBytes) -> Box<[bool; 256]> {
    const WORD_RANGES: [(u8, u8); 4] = [(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')];

    let mut table = Box::new([false; 256]);

    for range in class.iter() {
        let lo = range.start() as usize;
        let hi = range.end() as usize;
        table[lo..=hi].fill(true);
    }

    // If the class includes \w, we also set any non-ASCII characters.
    // That's not how Unicode works, but it simplifies the implementation.
    let covers_word = WORD_RANGES
        .iter()
        .all(|&(lo, hi)| table[lo as usize..=hi as usize].iter().all(|&set| set));
    if covers_word {
        table[0x80..=0xFF].fill(true);
    }

    table
}
|
||||
}
|
||||
|
||||
fn print_mermaid(def_states: &[StateDefinition], states: &[WipState]) {
|
||||
// Print header for Mermaid graph
|
||||
println!("%%{{init:{{'fontFamily':'monospace','flowchart':{{'defaultRenderer':'elk'}}}}}}%%");
|
||||
println!("graph TD");
|
||||
|
||||
// Print nodes (states)
|
||||
for (idx, _state) in states.iter().enumerate() {
|
||||
println!(
|
||||
" {idx}[\"{}\"]",
|
||||
match def_states.get(idx) {
|
||||
Some(state) => state.name,
|
||||
None => &format!("{idx}"),
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
// Print edges (transitions)
|
||||
for (src_idx, state) in states.iter().enumerate() {
|
||||
for t in &state.transitions {
|
||||
let dst = match t.action {
|
||||
WipAction::Change(idx) => format!("{idx}"),
|
||||
WipAction::Push(idx) => {
|
||||
format!("push{}[/\"Push({})\"/]", src_idx << 16 | idx, def_states[idx].name)
|
||||
}
|
||||
WipAction::Pop(count) => {
|
||||
format!("pop{}[/\"Pop({count})\"/]", src_idx << 16 | count)
|
||||
}
|
||||
};
|
||||
let label = match &t.test {
|
||||
WipConsume::Prefix(s) => format!("Prefix({s})"),
|
||||
WipConsume::PrefixInsensitive(s) => format!("PrefixInsensitive({s})"),
|
||||
WipConsume::Charset(c) => format!("Charset({:?})", CharsetFormatter(c)),
|
||||
WipConsume::Chars(n) => format!("Chars({n})"),
|
||||
WipConsume::Line => "Line".to_string(),
|
||||
};
|
||||
let label = label.replace('"', """);
|
||||
out.push_str(&format!(" {src_id} -->|\"{label}\"| {dst_id}\n"));
|
||||
|
||||
walk(edge_ref.dst, visited, out);
|
||||
|
||||
edge = edge_ref.edge_next;
|
||||
let label = label.replace('\\', r#"\\"#);
|
||||
println!(" {src_idx} -->|\"{label}\"| {dst}");
|
||||
}
|
||||
}
|
||||
|
||||
let mut out = String::from(
|
||||
"%%{init:{'fontFamily':'monospace','flowchart':{'defaultRenderer':'elk'}}}%%\ngraph TD\n",
|
||||
);
|
||||
let mut visited = HashMap::new();
|
||||
walk(root, &mut visited, &mut out);
|
||||
println!("{out}");
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub fn parse_language_definition(def: &LanguageDefinition) {
|
||||
let scratch = scratch_arena(None);
|
||||
let root = Node::new_in(&scratch);
|
||||
let mut state_names = HashMap::new();
|
||||
let mut states = Vec::new();
|
||||
|
||||
for state in def.states {
|
||||
state_names.insert(state.name, states.len());
|
||||
states.push(WipState { transitions: Vec::new() });
|
||||
}
|
||||
|
||||
for (ground_idx, state) in def.states.iter().enumerate() {
|
||||
for (pattern, kind, action) in state.rules {
|
||||
let mut ctx = WipContext { states: &mut states, kind: *kind };
|
||||
let dst = match action {
|
||||
ActionDefinition::Push(name) => match state_names.get(name) {
|
||||
Some(&idx) => WipAction::Push(idx),
|
||||
None => panic!("Unknown state name: {name}"),
|
||||
},
|
||||
ActionDefinition::Pop(count) => WipAction::Pop(*count),
|
||||
};
|
||||
let hir = regex_syntax::ParserBuilder::new()
|
||||
.utf8(false)
|
||||
.unicode(false)
|
||||
@@ -414,11 +381,11 @@ pub fn parse_language_definition(def: &LanguageDefinition) {
|
||||
.build()
|
||||
.parse(pattern)
|
||||
.unwrap();
|
||||
transform(&scratch, root, root, &hir);
|
||||
ctx.transform(ground_idx, dst, &hir);
|
||||
}
|
||||
}
|
||||
|
||||
print_mermaid(root);
|
||||
print_mermaid(def.states, &states);
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -83,7 +83,7 @@ impl Language {
|
||||
struct Transition<'s> {
|
||||
test: Consume<'s>,
|
||||
kind: HighlightKind,
|
||||
state: Action,
|
||||
action: Action,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq)]
|
||||
@@ -337,7 +337,7 @@ impl<'doc> Highlighter<'doc> {
|
||||
}
|
||||
}
|
||||
|
||||
match t.state {
|
||||
match t.action {
|
||||
Action::Change(to) => {
|
||||
if let Some(last) = res.last_mut() {
|
||||
last.kind = t.kind;
|
||||
|
||||
Reference in New Issue
Block a user