terminal/tools/Generate-CodepointWidthsFromUCD.ps1
Josh Soref 774f74258f
ci: upgrade to check-spelling v0.0.24 (#18261)
This upgrades to [check-spelling v0.0.24].

A number of GitHub APIs are being turned off shortly, so we need to
upgrade or various uncertain outcomes will occur.

There are some minor bugs that I'm aware of and which I've fixed since
this release (including a couple I discovered while preparing this PR).

There's a new accessibility forbidden pattern:

#### Should be `cannot` (or `can't`)

See https://www.grammarly.com/blog/cannot-or-can-not/
> Don't use `can not` when you mean `cannot`. The only time you're
likely to see `can not` written as separate words is when the word `can`
happens to precede some other phrase that happens to start with `not`.
> `Can't` is a contraction of `cannot`, and it's best suited for
informal writing.
> In formal writing and where contractions are frowned upon, use
`cannot`.
> It is possible to write `can not`, but you generally find it only as
part of some other construction, such as `not only . . . but also.`
- if you encounter such a case, add a pattern for that case to
patterns.txt.
```
\b[Cc]an not\b
```

[check-spelling v0.0.24]: https://github.com/check-spelling/check-spelling/releases/tag/v0.0.24

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>
2024-12-04 12:06:31 -06:00

277 lines
9.1 KiB
PowerShell

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#Requires -Version 7
# (we use the null coalescing operator)
################################################################################
# This script generates an array suitable for replacing the body of
# src/types/CodepointWidthDetector.cpp from a Unicode UCD XML document[1]
# compliant with UAX#42[2].
#
# This script supports a quasi-mandatory "overrides" file, overrides.xml.
# If you do not have overrides, supply the -NoOverrides parameter. This was
# developed for use with the CodepointWidthDetector, which has some override
# ranges.
#
# This script was developed against the flat "no han unification" UCD
# "ucd.nounihan.flat.xml".
# It does not support the grouped database format.
# significantly smaller, which would provide a performance win on the admittedly
# extremely rare occasion that we should need to regenerate our table.
#
# Invoke this script from the root of this repository as:
# .\tools\Generate-CodepointWidthsFromUCD.ps1 -Path .\path\to\ucd.nounihan.flat.xml -OverridePath .\src\types\unicode_width_overrides.xml -Pack
#
# [1]: https://www.unicode.org/Public/UCD/latest/ucdxml/
# [2]: https://www.unicode.org/reports/tr42/
[Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSAvoidUsingPositionalParameters', '')]
[Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSUseProcessBlockForPipelineCommand', '')]
[CmdletBinding()]
Param(
[Parameter(Position=0, ValueFromPipeline=$true, ParameterSetName="Parsed")]
[System.Xml.XmlDocument]$InputObject,
[Parameter(Position=1, ValueFromPipeline=$true, ParameterSetName="Parsed")]
[System.Xml.XmlDocument]$OverrideObject,
[Parameter(Position=0, ValueFromPipelineByPropertyName=$true, ParameterSetName="Unparsed")]
[string]$Path = "ucd.nounihan.flat.xml",
[Parameter(Position=1, ValueFromPipelineByPropertyName=$true, ParameterSetName="Unparsed")]
[string]$OverridePath = "overrides.xml",
[switch]$Pack, # Pack tightly based on width
[switch]$NoOverrides # Do not include overrides
)
Enum CodepointWidth {
Narrow;
Wide;
Ambiguous;
}
# UCD Functions {{{
Function Get-UCDEntryRange($entry) {
$s = $e = 0
if ($null -ne $v.cp) {
# Individual Codepoint
$s = $e = [int]("0x"+$v.cp)
} ElseIf ($null -ne $v."first-cp") {
# Range of Codepoints
$s = [int]("0x"+$v."first-cp")
$e = [int]("0x"+$v."last-cp")
}
$s
$e
}
Function Get-UCDEntryWidth($entry) {
If ($entry.Emoji -eq "Y" -and $entry.EPres -eq "Y") {
[CodepointWidth]::Wide
Return
}
Switch($entry.ea) {
"N" { [CodepointWidth]::Narrow; Return }
"Na" { [CodepointWidth]::Narrow; Return }
"H" { [CodepointWidth]::Narrow; Return }
"W" { [CodepointWidth]::Wide; Return }
"F" { [CodepointWidth]::Wide; Return }
"A" { [CodepointWidth]::Ambiguous; Return }
default { throw "Unexpected East_Asian_Width property" }
}
}
Function Get-UCDEntryFlags($entry) {
If ($script:Pack) {
# If we're "pack"ing entries, only the computed width matters for telling them apart
Get-UCDEntryWidth $entry
Return
}
$normalizedEAWidth = $entry.ea
$normalizedEAWidth = $normalizedEAWidth -eq "F" ? "W" : $normalizedEAWidth;
"{0}{1}{2}" -f $normalizedEAWidth, $entry.Emoji, $entry.EPres
}
# }}}
Class UnicodeRange : System.IComparable {
[int]$Start
[int]$End
[CodepointWidth]$Width
[string]$Flags
[string]$Comment
UnicodeRange([System.Xml.XmlElement]$ucdEntry) {
$this.Start, $this.End = Get-UCDEntryRange $ucdEntry
$this.Width = Get-UCDEntryWidth $ucdEntry
$this.Flags = Get-UCDEntryFlags $ucdEntry
If (-not $script:Pack -and $ucdEntry.Emoji -eq "Y" -and $ucdEntry.EPres -eq "Y") {
$this.Comment = "Emoji=Y EPres=Y"
}
If ($null -ne $ucdEntry.comment) {
$this.Comment = $ucdEntry.comment
}
}
[int] CompareTo([object]$Other) {
If ($Other -is [int]) {
Return $this.Start - $Other
}
Return $this.Start - $Other.Start
}
[bool] Merge([UnicodeRange]$Other) {
# If there's more than one codepoint between them, don't merge
If (($Other.Start - $this.End) -gt 1) {
Return $false
}
# Comments are different: do not merge
If ($this.Comment -ne $Other.Comment) {
Return $false
}
# Flags are different: do not merge
If ($this.Flags -ne $Other.Flags) {
Return $false
}
$this.End = $Other.End
Return $true
}
[int] Length() {
return $this.End - $this.Start + 1
}
}
Class UnicodeRangeList : System.Collections.Generic.List[Object] {
UnicodeRangeList([int]$Capacity) : base($Capacity) { }
[int] hidden _FindInsertionPoint([int]$codepoint) {
$l = $this.BinarySearch($codepoint)
If ($l -lt 0) {
# Return value <0: value was not found, return value is bitwise complement the index of the first >= value
Return -bNOT $l
}
Return $l
}
ReplaceUnicodeRange([UnicodeRange]$newRange) {
$subset = [System.Collections.Generic.List[Object]]::New(3)
$subset.Add($newRange)
$i = $this._FindInsertionPoint($newRange.Start)
# Left overlap can only ever be one (_FindInsertionPoint always returns the
# index immediately after the range whose Start is <= than ours).
$prev = $null
If($i -gt 0 -and $this[$i - 1].End -ge $newRange.Start) {
$prev = $i - 1
}
# Right overlap can be Infinite (because we didn't account for End)
# Find extent of right overlap
For($next = $i; ($next -lt $this.Count - 1) -and ($this[$next+1].Start -le $newRange.End); $next++) { }
If ($this[$next].Start -gt $newRange.End) {
# It turns out we didn't damage the following range; clear it
$next = $null
}
If ($null -ne $next) {
# Replace damaged elements after I with a truncated range
$last = $this[$next]
$this.RemoveRange($i, $next - $i + 1) # Remove damaged elements after I
$last.Start = $newRange.End + 1
If ($last.Start -le $last.End) {
$subset.Add($last)
}
}
If ($null -ne $prev) {
# Replace damaged elements before I with a truncated range
$first = $this[$prev]
$this.RemoveRange($prev, $i - $prev) # Remove damaged elements (b/c we may not need to re-add them!)
$first.End = $newRange.Start - 1
If ($first.End -ge $first.Start) {
$subset.Insert(0, $first)
}
$i = $prev # Update the insertion cursor
}
$this.InsertRange($i, $subset)
}
}
# Ingest UCD
If ($null -eq $InputObject) {
$InputObject = [xml](Get-Content $Path)
}
$UCDRepertoire = $InputObject.ucd.repertoire.ChildNodes | Sort-Object {
# Sort by either cp or first-cp (for ranges)
if ($null -ne $_.cp) {
[int]("0x"+$_.cp)
} ElseIf ($null -ne $_."first-cp") {
[int]("0x"+$_."first-cp")
}
}
$ranges = [UnicodeRangeList]::New(1024)
ForEach($v in $UCDRepertoire) {
$range = [UnicodeRange]::new($v)
If ($ranges.Count -gt 0 -and $ranges[$ranges.Count - 1].Merge($range)) {
# Merged into last entry
Continue
}
$ranges.Add([object]$range)
}
If (-not $NoOverrides) {
If ($null -eq $OverrideObject) {
$OverrideObject = [xml](Get-Content $OverridePath)
}
$OverrideRepertoire = $OverrideObject.ucd.repertoire.ChildNodes
$overrideCount = 0
ForEach($v in $OverrideRepertoire) {
$range = [UnicodeRange]::new($v)
$overrideCount += $range.Length()
$range.Comment = $range.Comment ?? "overridden without comment"
$ranges.ReplaceUnicodeRange($range)
}
}
$ranges.RemoveAll({ $args[0].Width -eq [CodepointWidth]::Narrow }) | Out-Null
$c = 0
ForEach($_ in $ranges) {
$c += $_.End - $_.Start + 1
}
# Emit Code
" // Generated by {0} -Pack:{1} -Full:{2} -NoOverrides:{3}" -f $MyInvocation.MyCommand.Name, $Pack, $Full, $NoOverrides
" // on {0} from {1}." -f (Get-Date -AsUTC -Format "u"), $InputObject.ucd.description
" // {0} (0x{0:X}) codepoints covered." -f $c
If (-not $NoOverrides) {
" // {0} (0x{0:X}) codepoints overridden." -f $overrideCount
" // Override path: {0}" -f $OverridePath
}
" static constexpr std::array<UnicodeRange, {0}> s_wideAndAmbiguousTable{{" -f $ranges.Count
ForEach($_ in $ranges) {
$isAmbiguous = $_.Width -eq [CodepointWidth]::Ambiguous
$comment = ""
if ($null -ne $_.Comment) {
# We only vend comments when we aren't packing tightly
$comment = " // {0}" -f $_.Comment
}
" UnicodeRange{{ 0x{0:x}, 0x{1:x}, {2} }},{3}" -f $_.Start, $_.End, [int]$isAmbiguous, $comment
}
" };"