mirror of
https://github.com/microsoft/terminal.git
synced 2025-12-10 00:48:23 -06:00
This upgrades to [check-spelling v0.0.24]. A number of GitHub APIs are being turned off shortly, so we need to upgrade or various uncertain outcomes will occur. There are some minor bugs that I'm aware of and which I've fixed since this release (including a couple I discovered while preparing this PR). There's a new accessibility forbidden pattern: #### Should be `cannot` (or `can't`) See https://www.grammarly.com/blog/cannot-or-can-not/ > Don't use `can not` when you mean `cannot`. The only time you're likely to see `can not` written as separate words is when the word `can` happens to precede some other phrase that happens to start with `not`. > `Can't` is a contraction of `cannot`, and it's best suited for informal writing. > In formal writing and where contractions are frowned upon, use `cannot`. > It is possible to write `can not`, but you generally find it only as part of some other construction, such as `not only . . . but also.` - if you encounter such a case, add a pattern for that case to patterns.txt. ``` \b[Cc]an not\b ``` [check-spelling v0.0.24]: https://github.com/check-spelling/check-spelling/releases/tag/v0.0.24 Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>
277 lines
9.1 KiB
PowerShell
277 lines
9.1 KiB
PowerShell
# Copyright (c) Microsoft Corporation.
|
|
# Licensed under the MIT license.
|
|
|
|
#Requires -Version 7
|
|
# (we use the null coalescing operator)
|
|
|
|
################################################################################
|
|
# This script generates an array suitable for replacing the body of
|
|
# src/types/CodepointWidthDetector.cpp from a Unicode UCD XML document[1]
|
|
# compliant with UAX#42[2].
|
|
#
|
|
# This script supports a quasi-mandatory "overrides" file, overrides.xml.
|
|
# If you do not have overrides, supply the -NoOverrides parameter. This was
|
|
# developed for use with the CodepointWidthDetector, which has some override
|
|
# ranges.
|
|
#
|
|
# This script was developed against the flat "no han unification" UCD
|
|
# "ucd.nounihan.flat.xml".
|
|
# It does not support the grouped database format.
|
|
# significantly smaller, which would provide a performance win on the admittedly
|
|
# extremely rare occasion that we should need to regenerate our table.
|
|
#
|
|
# Invoke this script from the root of this repository as:
|
|
# .\tools\Generate-CodepointWidthsFromUCD.ps1 -Path .\path\to\ucd.nounihan.flat.xml -OverridePath .\src\types\unicode_width_overrides.xml -Pack
|
|
#
|
|
# [1]: https://www.unicode.org/Public/UCD/latest/ucdxml/
|
|
# [2]: https://www.unicode.org/reports/tr42/
|
|
|
|
[Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSAvoidUsingPositionalParameters', '')]
|
|
[Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSUseProcessBlockForPipelineCommand', '')]
|
|
[CmdletBinding()]
|
|
Param(
|
|
[Parameter(Position=0, ValueFromPipeline=$true, ParameterSetName="Parsed")]
|
|
[System.Xml.XmlDocument]$InputObject,
|
|
|
|
[Parameter(Position=1, ValueFromPipeline=$true, ParameterSetName="Parsed")]
|
|
[System.Xml.XmlDocument]$OverrideObject,
|
|
|
|
[Parameter(Position=0, ValueFromPipelineByPropertyName=$true, ParameterSetName="Unparsed")]
|
|
[string]$Path = "ucd.nounihan.flat.xml",
|
|
|
|
[Parameter(Position=1, ValueFromPipelineByPropertyName=$true, ParameterSetName="Unparsed")]
|
|
[string]$OverridePath = "overrides.xml",
|
|
|
|
[switch]$Pack, # Pack tightly based on width
|
|
[switch]$NoOverrides # Do not include overrides
|
|
)
|
|
|
|
Enum CodepointWidth {
|
|
Narrow;
|
|
Wide;
|
|
Ambiguous;
|
|
}
|
|
|
|
# UCD Functions {{{
|
|
Function Get-UCDEntryRange($entry) {
|
|
$s = $e = 0
|
|
if ($null -ne $v.cp) {
|
|
# Individual Codepoint
|
|
$s = $e = [int]("0x"+$v.cp)
|
|
} ElseIf ($null -ne $v."first-cp") {
|
|
# Range of Codepoints
|
|
$s = [int]("0x"+$v."first-cp")
|
|
$e = [int]("0x"+$v."last-cp")
|
|
}
|
|
$s
|
|
$e
|
|
}
|
|
|
|
Function Get-UCDEntryWidth($entry) {
|
|
If ($entry.Emoji -eq "Y" -and $entry.EPres -eq "Y") {
|
|
[CodepointWidth]::Wide
|
|
Return
|
|
}
|
|
|
|
Switch($entry.ea) {
|
|
"N" { [CodepointWidth]::Narrow; Return }
|
|
"Na" { [CodepointWidth]::Narrow; Return }
|
|
"H" { [CodepointWidth]::Narrow; Return }
|
|
"W" { [CodepointWidth]::Wide; Return }
|
|
"F" { [CodepointWidth]::Wide; Return }
|
|
"A" { [CodepointWidth]::Ambiguous; Return }
|
|
default { throw "Unexpected East_Asian_Width property" }
|
|
}
|
|
}
|
|
|
|
Function Get-UCDEntryFlags($entry) {
|
|
If ($script:Pack) {
|
|
# If we're "pack"ing entries, only the computed width matters for telling them apart
|
|
Get-UCDEntryWidth $entry
|
|
Return
|
|
}
|
|
|
|
$normalizedEAWidth = $entry.ea
|
|
$normalizedEAWidth = $normalizedEAWidth -eq "F" ? "W" : $normalizedEAWidth;
|
|
"{0}{1}{2}" -f $normalizedEAWidth, $entry.Emoji, $entry.EPres
|
|
}
|
|
# }}}
|
|
|
|
Class UnicodeRange : System.IComparable {
|
|
[int]$Start
|
|
[int]$End
|
|
[CodepointWidth]$Width
|
|
[string]$Flags
|
|
[string]$Comment
|
|
|
|
UnicodeRange([System.Xml.XmlElement]$ucdEntry) {
|
|
$this.Start, $this.End = Get-UCDEntryRange $ucdEntry
|
|
$this.Width = Get-UCDEntryWidth $ucdEntry
|
|
$this.Flags = Get-UCDEntryFlags $ucdEntry
|
|
|
|
If (-not $script:Pack -and $ucdEntry.Emoji -eq "Y" -and $ucdEntry.EPres -eq "Y") {
|
|
$this.Comment = "Emoji=Y EPres=Y"
|
|
}
|
|
|
|
If ($null -ne $ucdEntry.comment) {
|
|
$this.Comment = $ucdEntry.comment
|
|
}
|
|
}
|
|
|
|
[int] CompareTo([object]$Other) {
|
|
If ($Other -is [int]) {
|
|
Return $this.Start - $Other
|
|
}
|
|
Return $this.Start - $Other.Start
|
|
}
|
|
|
|
[bool] Merge([UnicodeRange]$Other) {
|
|
# If there's more than one codepoint between them, don't merge
|
|
If (($Other.Start - $this.End) -gt 1) {
|
|
Return $false
|
|
}
|
|
|
|
# Comments are different: do not merge
|
|
If ($this.Comment -ne $Other.Comment) {
|
|
Return $false
|
|
}
|
|
|
|
# Flags are different: do not merge
|
|
If ($this.Flags -ne $Other.Flags) {
|
|
Return $false
|
|
}
|
|
|
|
$this.End = $Other.End
|
|
Return $true
|
|
}
|
|
|
|
[int] Length() {
|
|
return $this.End - $this.Start + 1
|
|
}
|
|
}
|
|
|
|
Class UnicodeRangeList : System.Collections.Generic.List[Object] {
|
|
UnicodeRangeList([int]$Capacity) : base($Capacity) { }
|
|
|
|
[int] hidden _FindInsertionPoint([int]$codepoint) {
|
|
$l = $this.BinarySearch($codepoint)
|
|
If ($l -lt 0) {
|
|
# Return value <0: value was not found, return value is bitwise complement the index of the first >= value
|
|
Return -bNOT $l
|
|
}
|
|
Return $l
|
|
}
|
|
|
|
ReplaceUnicodeRange([UnicodeRange]$newRange) {
|
|
$subset = [System.Collections.Generic.List[Object]]::New(3)
|
|
$subset.Add($newRange)
|
|
|
|
$i = $this._FindInsertionPoint($newRange.Start)
|
|
|
|
# Left overlap can only ever be one (_FindInsertionPoint always returns the
|
|
# index immediately after the range whose Start is <= than ours).
|
|
$prev = $null
|
|
If($i -gt 0 -and $this[$i - 1].End -ge $newRange.Start) {
|
|
$prev = $i - 1
|
|
}
|
|
|
|
# Right overlap can be Infinite (because we didn't account for End)
|
|
# Find extent of right overlap
|
|
For($next = $i; ($next -lt $this.Count - 1) -and ($this[$next+1].Start -le $newRange.End); $next++) { }
|
|
If ($this[$next].Start -gt $newRange.End) {
|
|
# It turns out we didn't damage the following range; clear it
|
|
$next = $null
|
|
}
|
|
|
|
If ($null -ne $next) {
|
|
# Replace damaged elements after I with a truncated range
|
|
$last = $this[$next]
|
|
$this.RemoveRange($i, $next - $i + 1) # Remove damaged elements after I
|
|
$last.Start = $newRange.End + 1
|
|
If ($last.Start -le $last.End) {
|
|
$subset.Add($last)
|
|
}
|
|
}
|
|
|
|
If ($null -ne $prev) {
|
|
# Replace damaged elements before I with a truncated range
|
|
$first = $this[$prev]
|
|
$this.RemoveRange($prev, $i - $prev) # Remove damaged elements (b/c we may not need to re-add them!)
|
|
$first.End = $newRange.Start - 1
|
|
If ($first.End -ge $first.Start) {
|
|
$subset.Insert(0, $first)
|
|
}
|
|
$i = $prev # Update the insertion cursor
|
|
}
|
|
|
|
$this.InsertRange($i, $subset)
|
|
}
|
|
}
|
|
|
|
# Ingest UCD
|
|
If ($null -eq $InputObject) {
|
|
$InputObject = [xml](Get-Content $Path)
|
|
}
|
|
|
|
$UCDRepertoire = $InputObject.ucd.repertoire.ChildNodes | Sort-Object {
|
|
# Sort by either cp or first-cp (for ranges)
|
|
if ($null -ne $_.cp) {
|
|
[int]("0x"+$_.cp)
|
|
} ElseIf ($null -ne $_."first-cp") {
|
|
[int]("0x"+$_."first-cp")
|
|
}
|
|
}
|
|
|
|
$ranges = [UnicodeRangeList]::New(1024)
|
|
|
|
ForEach($v in $UCDRepertoire) {
|
|
$range = [UnicodeRange]::new($v)
|
|
If ($ranges.Count -gt 0 -and $ranges[$ranges.Count - 1].Merge($range)) {
|
|
# Merged into last entry
|
|
Continue
|
|
}
|
|
$ranges.Add([object]$range)
|
|
}
|
|
|
|
If (-not $NoOverrides) {
|
|
If ($null -eq $OverrideObject) {
|
|
$OverrideObject = [xml](Get-Content $OverridePath)
|
|
}
|
|
|
|
$OverrideRepertoire = $OverrideObject.ucd.repertoire.ChildNodes
|
|
$overrideCount = 0
|
|
ForEach($v in $OverrideRepertoire) {
|
|
$range = [UnicodeRange]::new($v)
|
|
$overrideCount += $range.Length()
|
|
$range.Comment = $range.Comment ?? "overridden without comment"
|
|
$ranges.ReplaceUnicodeRange($range)
|
|
}
|
|
}
|
|
|
|
$ranges.RemoveAll({ $args[0].Width -eq [CodepointWidth]::Narrow }) | Out-Null
|
|
|
|
$c = 0
|
|
ForEach($_ in $ranges) {
|
|
$c += $_.End - $_.Start + 1
|
|
}
|
|
|
|
# Emit Code
|
|
" // Generated by {0} -Pack:{1} -Full:{2} -NoOverrides:{3}" -f $MyInvocation.MyCommand.Name, $Pack, $Full, $NoOverrides
|
|
" // on {0} from {1}." -f (Get-Date -AsUTC -Format "u"), $InputObject.ucd.description
|
|
" // {0} (0x{0:X}) codepoints covered." -f $c
|
|
If (-not $NoOverrides) {
|
|
" // {0} (0x{0:X}) codepoints overridden." -f $overrideCount
|
|
" // Override path: {0}" -f $OverridePath
|
|
}
|
|
" static constexpr std::array<UnicodeRange, {0}> s_wideAndAmbiguousTable{{" -f $ranges.Count
|
|
ForEach($_ in $ranges) {
|
|
$isAmbiguous = $_.Width -eq [CodepointWidth]::Ambiguous
|
|
$comment = ""
|
|
if ($null -ne $_.Comment) {
|
|
# We only vend comments when we aren't packing tightly
|
|
$comment = " // {0}" -f $_.Comment
|
|
}
|
|
" UnicodeRange{{ 0x{0:x}, 0x{1:x}, {2} }},{3}" -f $_.Start, $_.End, [int]$isAmbiguous, $comment
|
|
}
|
|
" };"
|