forked from ebhomengo/niki
1
0
Fork 0
niki/vendor/golang.org/x/text/unicode/bidi/bracket.go

590 lines
11 KiB
Go
Raw Normal View History

// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package bidi
import (
"container/list"
"fmt"
"sort"
)
// This file contains a port of the reference implementation of the
// Bidi Parentheses Algorithm:
// https://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/BidiPBAReference.java
//
// The implementation in this file covers definitions BD14-BD16 and rule N0
// of UAX#9.
//
// Some preprocessing is done for each rune before data is passed to this
// algorithm:
//   - opening and closing brackets are identified
//   - a bracket pair type, like '(' and ')' is assigned a unique identifier that
//     is identical for the opening and closing bracket. It is left to the
//     caller to do these mappings.
//   - The BPA algorithm requires that bracket characters that are canonical
//     equivalents of each other be able to be substituted for each other.
//     It is the responsibility of the caller to do this canonicalization.
//
// In implementing BD16, this implementation departs slightly from the "logical"
// algorithm defined in UAX#9. In particular, the stack referenced there
// supports operations that go beyond a "basic" stack. An equivalent
// implementation based on a linked list is used here.

// bracketType is the Bidi_Paired_Bracket_Type property value of a rune.
//
// BD14. An opening paired bracket is a character whose
// Bidi_Paired_Bracket_Type property value is Open.
//
// BD15. A closing paired bracket is a character whose
// Bidi_Paired_Bracket_Type property value is Close.
type bracketType byte

const (
	bpNone  bracketType = iota // not a paired bracket; skipped when locating pairs
	bpOpen                     // opening paired bracket (BD14)
	bpClose                    // closing paired bracket (BD15)
)
// bracketPair holds the positions of the opening and the closing bracket of
// one bracket pair. Both values are positions within the isolating run
// sequence (that is, indexes into bracketPairer.indexes), not offsets into
// the original text.
type bracketPair struct {
	opener int // run position of the opening bracket
	closer int // run position of the closing bracket
}

// String returns the pair formatted as "(opener, closer)".
func (b *bracketPair) String() string {
	return fmt.Sprintf("(%v, %v)", b.opener, b.closer)
}
// bracketPairs is a slice of bracketPairs with a sort.Interface implementation.
2024-02-18 10:42:21 +00:00
type bracketPairs []bracketPair
func (b bracketPairs) Len() int { return len(b) }
func (b bracketPairs) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
2024-02-18 10:42:21 +00:00
func (b bracketPairs) Less(i, j int) bool { return b[i].opener < b[j].opener }
// resolvePairedBrackets runs the paired bracket part of the UBA algorithm.
2024-02-18 10:42:21 +00:00
//
2024-02-18 10:42:21 +00:00
// For each rune, it takes the indexes into the original string, the class the
2024-02-18 10:42:21 +00:00
// bracket type (in pairTypes) and the bracket identifier (pairValues). It also
2024-02-18 10:42:21 +00:00
// takes the direction type for the start-of-sentence and the embedding level.
2024-02-18 10:42:21 +00:00
//
2024-02-18 10:42:21 +00:00
// The identifiers for bracket types are the rune of the canonicalized opening
2024-02-18 10:42:21 +00:00
// bracket for brackets (open or close) or 0 for runes that are not brackets.
2024-02-18 10:42:21 +00:00
func resolvePairedBrackets(s *isolatingRunSequence) {
2024-02-18 10:42:21 +00:00
p := bracketPairer{
sos: s.sos,
openers: list.New(),
2024-02-18 10:42:21 +00:00
codesIsolatedRun: s.types,
indexes: s.indexes,
2024-02-18 10:42:21 +00:00
}
2024-02-18 10:42:21 +00:00
dirEmbed := L
2024-02-18 10:42:21 +00:00
if s.level&1 != 0 {
2024-02-18 10:42:21 +00:00
dirEmbed = R
2024-02-18 10:42:21 +00:00
}
2024-02-18 10:42:21 +00:00
p.locateBrackets(s.p.pairTypes, s.p.pairValues)
2024-02-18 10:42:21 +00:00
p.resolveBrackets(dirEmbed, s.p.initialTypes)
2024-02-18 10:42:21 +00:00
}
type bracketPairer struct {
sos Class // direction corresponding to start of sequence
// The following is a restatement of BD 16 using non-algorithmic language.
2024-02-18 10:42:21 +00:00
//
2024-02-18 10:42:21 +00:00
// A bracket pair is a pair of characters consisting of an opening
2024-02-18 10:42:21 +00:00
// paired bracket and a closing paired bracket such that the
2024-02-18 10:42:21 +00:00
// Bidi_Paired_Bracket property value of the former equals the latter,
2024-02-18 10:42:21 +00:00
// subject to the following constraints.
2024-02-18 10:42:21 +00:00
// - both characters of a pair occur in the same isolating run sequence
2024-02-18 10:42:21 +00:00
// - the closing character of a pair follows the opening character
2024-02-18 10:42:21 +00:00
// - any bracket character can belong at most to one pair, the earliest possible one
2024-02-18 10:42:21 +00:00
// - any bracket character not part of a pair is treated like an ordinary character
2024-02-18 10:42:21 +00:00
// - pairs may nest properly, but their spans may not overlap otherwise
// Bracket characters with canonical decompositions are supposed to be
2024-02-18 10:42:21 +00:00
// treated as if they had been normalized, to allow normalized and non-
2024-02-18 10:42:21 +00:00
// normalized text to give the same result. In this implementation that step
2024-02-18 10:42:21 +00:00
// is pushed out to the caller. The caller has to ensure that the pairValue
2024-02-18 10:42:21 +00:00
// slices contain the rune of the opening bracket after normalization for
2024-02-18 10:42:21 +00:00
// any opening or closing bracket.
openers *list.List // list of positions for opening brackets
// bracket pair positions sorted by location of opening bracket
2024-02-18 10:42:21 +00:00
pairPositions bracketPairs
codesIsolatedRun []Class // directional bidi codes for an isolated run
indexes []int // array of index values into the original string
2024-02-18 10:42:21 +00:00
}
// matchOpener reports whether characters at given positions form a matching
2024-02-18 10:42:21 +00:00
// bracket pair.
2024-02-18 10:42:21 +00:00
func (p *bracketPairer) matchOpener(pairValues []rune, opener, closer int) bool {
2024-02-18 10:42:21 +00:00
return pairValues[p.indexes[opener]] == pairValues[p.indexes[closer]]
2024-02-18 10:42:21 +00:00
}
// maxPairingDepth is the number of pending opening brackets at which
// locateBrackets gives up: when a further opening bracket is met while this
// many are already pending, the pending openers are discarded and scanning
// of the run stops.
// NOTE(review): presumably mirrors the stack size given in BD16 of UAX#9 -
// confirm against the specification.
const maxPairingDepth = 63
// locateBrackets locates matching bracket pairs according to BD16.
2024-02-18 10:42:21 +00:00
//
2024-02-18 10:42:21 +00:00
// This implementation uses a linked list instead of a stack, because, while
2024-02-18 10:42:21 +00:00
// elements are added at the front (like a push) they are not generally removed
2024-02-18 10:42:21 +00:00
// in atomic 'pop' operations, reducing the benefit of the stack archetype.
2024-02-18 10:42:21 +00:00
func (p *bracketPairer) locateBrackets(pairTypes []bracketType, pairValues []rune) {
2024-02-18 10:42:21 +00:00
// traverse the run
2024-02-18 10:42:21 +00:00
// do that explicitly (not in a for-each) so we can record position
2024-02-18 10:42:21 +00:00
for i, index := range p.indexes {
// look at the bracket type for each character
2024-02-18 10:42:21 +00:00
if pairTypes[index] == bpNone || p.codesIsolatedRun[i] != ON {
2024-02-18 10:42:21 +00:00
// continue scanning
2024-02-18 10:42:21 +00:00
continue
2024-02-18 10:42:21 +00:00
}
2024-02-18 10:42:21 +00:00
switch pairTypes[index] {
2024-02-18 10:42:21 +00:00
case bpOpen:
2024-02-18 10:42:21 +00:00
// check if maximum pairing depth reached
2024-02-18 10:42:21 +00:00
if p.openers.Len() == maxPairingDepth {
2024-02-18 10:42:21 +00:00
p.openers.Init()
2024-02-18 10:42:21 +00:00
return
2024-02-18 10:42:21 +00:00
}
2024-02-18 10:42:21 +00:00
// remember opener location, most recent first
2024-02-18 10:42:21 +00:00
p.openers.PushFront(i)
case bpClose:
2024-02-18 10:42:21 +00:00
// see if there is a match
2024-02-18 10:42:21 +00:00
count := 0
2024-02-18 10:42:21 +00:00
for elem := p.openers.Front(); elem != nil; elem = elem.Next() {
2024-02-18 10:42:21 +00:00
count++
2024-02-18 10:42:21 +00:00
opener := elem.Value.(int)
2024-02-18 10:42:21 +00:00
if p.matchOpener(pairValues, opener, i) {
2024-02-18 10:42:21 +00:00
// if the opener matches, add nested pair to the ordered list
2024-02-18 10:42:21 +00:00
p.pairPositions = append(p.pairPositions, bracketPair{opener, i})
2024-02-18 10:42:21 +00:00
// remove up to and including matched opener
2024-02-18 10:42:21 +00:00
for ; count > 0; count-- {
2024-02-18 10:42:21 +00:00
p.openers.Remove(p.openers.Front())
2024-02-18 10:42:21 +00:00
}
2024-02-18 10:42:21 +00:00
break
2024-02-18 10:42:21 +00:00
}
2024-02-18 10:42:21 +00:00
}
2024-02-18 10:42:21 +00:00
sort.Sort(p.pairPositions)
2024-02-18 10:42:21 +00:00
// if we get here, the closing bracket matched no openers
2024-02-18 10:42:21 +00:00
// and gets ignored
2024-02-18 10:42:21 +00:00
}
2024-02-18 10:42:21 +00:00
}
2024-02-18 10:42:21 +00:00
}
// Bracket pairs within an isolating run sequence are processed as units so
// that both the opening and the closing paired bracket in a pair resolve to
// the same direction.
//
// N0. Process bracket pairs in an isolating run sequence sequentially in
// the logical order of the text positions of the opening paired brackets
// using the logic given below. Within this scope, bidirectional types EN
// and AN are treated as R.
//
// Identify the bracket pairs in the current isolating run sequence
// according to BD16. For each bracket-pair element in the list of pairs of
// text positions:
//
// a Inspect the bidirectional types of the characters enclosed within the
// bracket pair.
//
// b If any strong type (either L or R) matching the embedding direction is
// found, set the type for both brackets in the pair to match the embedding
// direction.
//
//	o [ e ] o -> o e e e o
//
//	o [ o e ] -> o e o e e
//
//	o [ NI e ] -> o e NI e e
//
// c Otherwise, if a strong type (opposite the embedding direction) is
// found, test for adjacent strong types as follows: 1 First, check
// backwards before the opening paired bracket until the first strong type
// (L, R, or sos) is found. If that first preceding strong type is opposite
// the embedding direction, then set the type for both brackets in the pair
// to that type. 2 Otherwise, set the type for both brackets in the pair to
// the embedding direction.
//
//	o [ o ] e -> o o o o e
//
//	o [ o NI ] o -> o o o NI o o
//
//	e [ o ] o -> e e o e o
//
//	e [ o ] e -> e e o e e
//
//	e ( o [ o ] NI ) e -> e e o o o o NI e e
//
// d Otherwise, do not set the type for the current bracket pair. Note that
// if the enclosed text contains no strong types the paired brackets will
// both resolve to the same level when resolved individually using rules N1
// and N2.
//
//	e ( NI ) o -> e ( NI ) o

// getStrongTypeN0 maps character's directional code to strong type as required
2024-02-18 10:42:21 +00:00
// by rule N0.
2024-02-18 10:42:21 +00:00
//
2024-02-18 10:42:21 +00:00
// TODO: have separate type for "strong" directionality.
2024-02-18 10:42:21 +00:00
func (p *bracketPairer) getStrongTypeN0(index int) Class {
2024-02-18 10:42:21 +00:00
switch p.codesIsolatedRun[index] {
2024-02-18 10:42:21 +00:00
// in the scope of N0, number types are treated as R
2024-02-18 10:42:21 +00:00
case EN, AN, AL, R:
2024-02-18 10:42:21 +00:00
return R
2024-02-18 10:42:21 +00:00
case L:
2024-02-18 10:42:21 +00:00
return L
2024-02-18 10:42:21 +00:00
default:
2024-02-18 10:42:21 +00:00
return ON
2024-02-18 10:42:21 +00:00
}
2024-02-18 10:42:21 +00:00
}
// classifyPairContent reports the strong types contained inside a Bracket Pair,
2024-02-18 10:42:21 +00:00
// assuming the given embedding direction.
2024-02-18 10:42:21 +00:00
//
2024-02-18 10:42:21 +00:00
// It returns ON if no strong type is found. If a single strong type is found,
2024-02-18 10:42:21 +00:00
// it returns this type. Otherwise it returns the embedding direction.
2024-02-18 10:42:21 +00:00
//
2024-02-18 10:42:21 +00:00
// TODO: use separate type for "strong" directionality.
2024-02-18 10:42:21 +00:00
func (p *bracketPairer) classifyPairContent(loc bracketPair, dirEmbed Class) Class {
2024-02-18 10:42:21 +00:00
dirOpposite := ON
2024-02-18 10:42:21 +00:00
for i := loc.opener + 1; i < loc.closer; i++ {
2024-02-18 10:42:21 +00:00
dir := p.getStrongTypeN0(i)
2024-02-18 10:42:21 +00:00
if dir == ON {
2024-02-18 10:42:21 +00:00
continue
2024-02-18 10:42:21 +00:00
}
2024-02-18 10:42:21 +00:00
if dir == dirEmbed {
2024-02-18 10:42:21 +00:00
return dir // type matching embedding direction found
2024-02-18 10:42:21 +00:00
}
2024-02-18 10:42:21 +00:00
dirOpposite = dir
2024-02-18 10:42:21 +00:00
}
2024-02-18 10:42:21 +00:00
// return ON if no strong type found, or class opposite to dirEmbed
2024-02-18 10:42:21 +00:00
return dirOpposite
2024-02-18 10:42:21 +00:00
}
// classBeforePair determines which strong types are present before a Bracket
2024-02-18 10:42:21 +00:00
// Pair. Return R or L if strong type found, otherwise ON.
2024-02-18 10:42:21 +00:00
func (p *bracketPairer) classBeforePair(loc bracketPair) Class {
2024-02-18 10:42:21 +00:00
for i := loc.opener - 1; i >= 0; i-- {
2024-02-18 10:42:21 +00:00
if dir := p.getStrongTypeN0(i); dir != ON {
2024-02-18 10:42:21 +00:00
return dir
2024-02-18 10:42:21 +00:00
}
2024-02-18 10:42:21 +00:00
}
2024-02-18 10:42:21 +00:00
// no strong types found, return sos
2024-02-18 10:42:21 +00:00
return p.sos
2024-02-18 10:42:21 +00:00
}
// assignBracketType implements rule N0 for a single bracket pair.
2024-02-18 10:42:21 +00:00
func (p *bracketPairer) assignBracketType(loc bracketPair, dirEmbed Class, initialTypes []Class) {
2024-02-18 10:42:21 +00:00
// rule "N0, a", inspect contents of pair
2024-02-18 10:42:21 +00:00
dirPair := p.classifyPairContent(loc, dirEmbed)
// dirPair is now L, R, or N (no strong type found)
// the following logical tests are performed out of order compared to
2024-02-18 10:42:21 +00:00
// the statement of the rules but yield the same results
2024-02-18 10:42:21 +00:00
if dirPair == ON {
2024-02-18 10:42:21 +00:00
return // case "d" - nothing to do
2024-02-18 10:42:21 +00:00
}
if dirPair != dirEmbed {
2024-02-18 10:42:21 +00:00
// case "c": strong type found, opposite - check before (c.1)
2024-02-18 10:42:21 +00:00
dirPair = p.classBeforePair(loc)
2024-02-18 10:42:21 +00:00
if dirPair == dirEmbed || dirPair == ON {
2024-02-18 10:42:21 +00:00
// no strong opposite type found before - use embedding (c.2)
2024-02-18 10:42:21 +00:00
dirPair = dirEmbed
2024-02-18 10:42:21 +00:00
}
2024-02-18 10:42:21 +00:00
}
2024-02-18 10:42:21 +00:00
// else: case "b", strong type found matching embedding,
2024-02-18 10:42:21 +00:00
// no explicit action needed, as dirPair is already set to embedding
2024-02-18 10:42:21 +00:00
// direction
// set the bracket types to the type found
2024-02-18 10:42:21 +00:00
p.setBracketsToType(loc, dirPair, initialTypes)
2024-02-18 10:42:21 +00:00
}
func (p *bracketPairer) setBracketsToType(loc bracketPair, dirPair Class, initialTypes []Class) {
2024-02-18 10:42:21 +00:00
p.codesIsolatedRun[loc.opener] = dirPair
2024-02-18 10:42:21 +00:00
p.codesIsolatedRun[loc.closer] = dirPair
for i := loc.opener + 1; i < loc.closer; i++ {
2024-02-18 10:42:21 +00:00
index := p.indexes[i]
2024-02-18 10:42:21 +00:00
if initialTypes[index] != NSM {
2024-02-18 10:42:21 +00:00
break
2024-02-18 10:42:21 +00:00
}
2024-02-18 10:42:21 +00:00
p.codesIsolatedRun[i] = dirPair
2024-02-18 10:42:21 +00:00
}
for i := loc.closer + 1; i < len(p.indexes); i++ {
2024-02-18 10:42:21 +00:00
index := p.indexes[i]
2024-02-18 10:42:21 +00:00
if initialTypes[index] != NSM {
2024-02-18 10:42:21 +00:00
break
2024-02-18 10:42:21 +00:00
}
2024-02-18 10:42:21 +00:00
p.codesIsolatedRun[i] = dirPair
2024-02-18 10:42:21 +00:00
}
2024-02-18 10:42:21 +00:00
}
// resolveBrackets implements rule N0 for a list of pairs.
2024-02-18 10:42:21 +00:00
func (p *bracketPairer) resolveBrackets(dirEmbed Class, initialTypes []Class) {
2024-02-18 10:42:21 +00:00
for _, loc := range p.pairPositions {
2024-02-18 10:42:21 +00:00
p.assignBracketType(loc, dirEmbed, initialTypes)
2024-02-18 10:42:21 +00:00
}
2024-02-18 10:42:21 +00:00
}