// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
|
|
||
|
package cldr
|
||
|
|
||
|
import (
|
||
|
"bufio"
|
||
|
"encoding/xml"
|
||
|
"errors"
|
||
|
"fmt"
|
||
|
"strconv"
|
||
|
"strings"
|
||
|
"unicode"
|
||
|
"unicode/utf8"
|
||
|
)
|
||
|
|
||
|
// RuleProcessor can be passed to Collation's Process method, which
// parses the rules and calls the respective method for each rule found.
type RuleProcessor interface {
	// Reset is called for every reset rule. anchor is either the literal
	// anchor string or a logical position formatted as "<name/>". before is
	// the level given by a "before" qualifier, or 0 when there is none.
	Reset(anchor string, before int) error

	// Insert is called for every relation. level is 1 through 4 for the
	// '<' relations (one per '<') and 5 for the identity relation. str is
	// the string being ordered; context and extend hold the optional
	// strings following '|' and '/' respectively and are empty when absent.
	Insert(level int, str, context, extend string) error

	// Index is called instead of Insert for a rule whose string starts with
	// the index marker U+FDD0; id is the text following that marker.
	Index(id string)
}
|
||
|
|
||
|
const (
	// cldrIndex is a Unicode-reserved sentinel value used to mark the start
	// of a grouping within an index.
	// A rule string that starts with this rune is not inserted; the text
	// following it is passed to RuleProcessor.Index instead.
	// See https://unicode.org/reports/tr35/#Collation_Elements for details.
	cldrIndex = "\uFDD0"

	// specialAnchor is the format in which to represent logical reset positions,
	// such as "first tertiary ignorable". The %s verb receives the position's
	// name, yielding a string of the form "<name/>".
	specialAnchor = "<%s/>"
)
|
||
|
|
||
|
// Process parses the rules for the tailorings of this collation
|
||
|
// and calls the respective methods of p for each rule found.
|
||
|
func (c Collation) Process(p RuleProcessor) (err error) {
|
||
|
if len(c.Cr) > 0 {
|
||
|
if len(c.Cr) > 1 {
|
||
|
return fmt.Errorf("multiple cr elements, want 0 or 1")
|
||
|
}
|
||
|
return processRules(p, c.Cr[0].Data())
|
||
|
}
|
||
|
if c.Rules.Any != nil {
|
||
|
return c.processXML(p)
|
||
|
}
|
||
|
return errors.New("no tailoring data")
|
||
|
}
|
||
|
|
||
|
// processRules parses rules in the Collation Rule Syntax defined in
|
||
|
// https://www.unicode.org/reports/tr35/tr35-collation.html#Collation_Tailorings.
|
||
|
func processRules(p RuleProcessor, s string) (err error) {
|
||
|
chk := func(s string, e error) string {
|
||
|
if err == nil {
|
||
|
err = e
|
||
|
}
|
||
|
return s
|
||
|
}
|
||
|
i := 0 // Save the line number for use after the loop.
|
||
|
scanner := bufio.NewScanner(strings.NewReader(s))
|
||
|
for ; scanner.Scan() && err == nil; i++ {
|
||
|
for s := skipSpace(scanner.Text()); s != "" && s[0] != '#'; s = skipSpace(s) {
|
||
|
level := 5
|
||
|
var ch byte
|
||
|
switch ch, s = s[0], s[1:]; ch {
|
||
|
case '&': // followed by <anchor> or '[' <key> ']'
|
||
|
if s = skipSpace(s); consume(&s, '[') {
|
||
|
s = chk(parseSpecialAnchor(p, s))
|
||
|
} else {
|
||
|
s = chk(parseAnchor(p, 0, s))
|
||
|
}
|
||
|
case '<': // sort relation '<'{1,4}, optionally followed by '*'.
|
||
|
for level = 1; consume(&s, '<'); level++ {
|
||
|
}
|
||
|
if level > 4 {
|
||
|
err = fmt.Errorf("level %d > 4", level)
|
||
|
}
|
||
|
fallthrough
|
||
|
case '=': // identity relation, optionally followed by *.
|
||
|
if consume(&s, '*') {
|
||
|
s = chk(parseSequence(p, level, s))
|
||
|
} else {
|
||
|
s = chk(parseOrder(p, level, s))
|
||
|
}
|
||
|
default:
|
||
|
chk("", fmt.Errorf("illegal operator %q", ch))
|
||
|
break
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
if chk("", scanner.Err()); err != nil {
|
||
|
return fmt.Errorf("%d: %v", i, err)
|
||
|
}
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// parseSpecialAnchor parses the anchor syntax which is either of the form
|
||
|
// ['before' <level>] <anchor>
|
||
|
// or
|
||
|
// [<label>]
|
||
|
// The starting should already be consumed.
|
||
|
func parseSpecialAnchor(p RuleProcessor, s string) (tail string, err error) {
|
||
|
i := strings.IndexByte(s, ']')
|
||
|
if i == -1 {
|
||
|
return "", errors.New("unmatched bracket")
|
||
|
}
|
||
|
a := strings.TrimSpace(s[:i])
|
||
|
s = s[i+1:]
|
||
|
if strings.HasPrefix(a, "before ") {
|
||
|
l, err := strconv.ParseUint(skipSpace(a[len("before "):]), 10, 3)
|
||
|
if err != nil {
|
||
|
return s, err
|
||
|
}
|
||
|
return parseAnchor(p, int(l), s)
|
||
|
}
|
||
|
return s, p.Reset(fmt.Sprintf(specialAnchor, a), 0)
|
||
|
}
|
||
|
|
||
|
func parseAnchor(p RuleProcessor, level int, s string) (tail string, err error) {
|
||
|
anchor, s, err := scanString(s)
|
||
|
if err != nil {
|
||
|
return s, err
|
||
|
}
|
||
|
return s, p.Reset(anchor, level)
|
||
|
}
|
||
|
|
||
|
func parseOrder(p RuleProcessor, level int, s string) (tail string, err error) {
|
||
|
var value, context, extend string
|
||
|
if value, s, err = scanString(s); err != nil {
|
||
|
return s, err
|
||
|
}
|
||
|
if strings.HasPrefix(value, cldrIndex) {
|
||
|
p.Index(value[len(cldrIndex):])
|
||
|
return
|
||
|
}
|
||
|
if consume(&s, '|') {
|
||
|
if context, s, err = scanString(s); err != nil {
|
||
|
return s, errors.New("missing string after context")
|
||
|
}
|
||
|
}
|
||
|
if consume(&s, '/') {
|
||
|
if extend, s, err = scanString(s); err != nil {
|
||
|
return s, errors.New("missing string after extension")
|
||
|
}
|
||
|
}
|
||
|
return s, p.Insert(level, value, context, extend)
|
||
|
}
|
||
|
|
||
|
// scanString scans a single input string.
|
||
|
func scanString(s string) (str, tail string, err error) {
|
||
|
if s = skipSpace(s); s == "" {
|
||
|
return s, s, errors.New("missing string")
|
||
|
}
|
||
|
buf := [16]byte{} // small but enough to hold most cases.
|
||
|
value := buf[:0]
|
||
|
for s != "" {
|
||
|
if consume(&s, '\'') {
|
||
|
i := strings.IndexByte(s, '\'')
|
||
|
if i == -1 {
|
||
|
return "", "", errors.New(`unmatched single quote`)
|
||
|
}
|
||
|
if i == 0 {
|
||
|
value = append(value, '\'')
|
||
|
} else {
|
||
|
value = append(value, s[:i]...)
|
||
|
}
|
||
|
s = s[i+1:]
|
||
|
continue
|
||
|
}
|
||
|
r, sz := utf8.DecodeRuneInString(s)
|
||
|
if unicode.IsSpace(r) || strings.ContainsRune("&<=#", r) {
|
||
|
break
|
||
|
}
|
||
|
value = append(value, s[:sz]...)
|
||
|
s = s[sz:]
|
||
|
}
|
||
|
return string(value), skipSpace(s), nil
|
||
|
}
|
||
|
|
||
|
func parseSequence(p RuleProcessor, level int, s string) (tail string, err error) {
|
||
|
if s = skipSpace(s); s == "" {
|
||
|
return s, errors.New("empty sequence")
|
||
|
}
|
||
|
last := rune(0)
|
||
|
for s != "" {
|
||
|
r, sz := utf8.DecodeRuneInString(s)
|
||
|
s = s[sz:]
|
||
|
|
||
|
if r == '-' {
|
||
|
// We have a range. The first element was already written.
|
||
|
if last == 0 {
|
||
|
return s, errors.New("range without starter value")
|
||
|
}
|
||
|
r, sz = utf8.DecodeRuneInString(s)
|
||
|
s = s[sz:]
|
||
|
if r == utf8.RuneError || r < last {
|
||
|
return s, fmt.Errorf("invalid range %q-%q", last, r)
|
||
|
}
|
||
|
for i := last + 1; i <= r; i++ {
|
||
|
if err := p.Insert(level, string(i), "", ""); err != nil {
|
||
|
return s, err
|
||
|
}
|
||
|
}
|
||
|
last = 0
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
if unicode.IsSpace(r) || unicode.IsPunct(r) {
|
||
|
break
|
||
|
}
|
||
|
|
||
|
// normal case
|
||
|
if err := p.Insert(level, string(r), "", ""); err != nil {
|
||
|
return s, err
|
||
|
}
|
||
|
last = r
|
||
|
}
|
||
|
return s, nil
|
||
|
}
|
||
|
|
||
|
// skipSpace returns s without its leading white space, as defined by
// unicode.IsSpace. It returns the empty string if s consists of white space
// only.
func skipSpace(s string) string {
	start := strings.IndexFunc(s, func(r rune) bool { return !unicode.IsSpace(r) })
	if start < 0 {
		return ""
	}
	return s[start:]
}
|
||
|
|
||
|
// consume reports whether the first byte of *s is ch. If so, it gobbles that
// byte by advancing *s past it; otherwise *s is left unchanged.
func consume(s *string, ch byte) (ok bool) {
	rest := *s
	if len(rest) > 0 && rest[0] == ch {
		*s = rest[1:]
		return true
	}
	return false
}
|
||
|
|
||
|
// The following code parses Collation rules of CLDR version 24 and before.
|
||
|
|
||
|
// lmap maps the first letter of a legacy XML rule tag ("p", "s", "t", "i"
// and their "pc", "sc", "tc", "ic" variants) to the level that is passed to
// RuleProcessor.Insert for that rule.
var lmap = map[byte]int{
	'p': 1,
	's': 2,
	't': 3,
	'i': 5,
}
|
||
|
|
||
|
// rulesElem describes an XML element that contains a "rules" child element.
// The children of "rules" are captured generically in Any, each with its
// element name and its rule content.
// NOTE(review): rulesElem is not referenced in this part of the file; it
// presumably describes the shape of Collation's Rules field — confirm
// against xml.go.
type rulesElem struct {
	Rules struct {
		Common
		Any []*struct {
			XMLName xml.Name
			rule
		} `xml:",any"`
	} `xml:"rules"`
}
|
||
|
|
||
|
// rule holds the content of a single XML collation rule element.
type rule struct {
	// Value is the character data of the element.
	Value string `xml:",chardata"`
	// Before is the value of the element's "before" attribute.
	Before string `xml:"before,attr"`
	// Any holds the child elements, each with its element name and its own
	// rule content.
	Any []*struct {
		XMLName xml.Name
		rule
	} `xml:",any"`
}
|
||
|
|
||
|
// emptyValueError is returned by rule.value for a rule that has no character
// data and does not contain exactly one child element.
var emptyValueError = errors.New("cldr: empty rule value")
|
||
|
|
||
|
func (r *rule) value() (string, error) {
|
||
|
// Convert hexadecimal Unicode codepoint notation to a string.
|
||
|
s := charRe.ReplaceAllStringFunc(r.Value, replaceUnicode)
|
||
|
r.Value = s
|
||
|
if s == "" {
|
||
|
if len(r.Any) != 1 {
|
||
|
return "", emptyValueError
|
||
|
}
|
||
|
r.Value = fmt.Sprintf(specialAnchor, r.Any[0].XMLName.Local)
|
||
|
r.Any = nil
|
||
|
} else if len(r.Any) != 0 {
|
||
|
return "", fmt.Errorf("cldr: XML elements found in collation rule: %v", r.Any)
|
||
|
}
|
||
|
return r.Value, nil
|
||
|
}
|
||
|
|
||
|
func (r rule) process(p RuleProcessor, name, context, extend string) error {
|
||
|
v, err := r.value()
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
switch name {
|
||
|
case "p", "s", "t", "i":
|
||
|
if strings.HasPrefix(v, cldrIndex) {
|
||
|
p.Index(v[len(cldrIndex):])
|
||
|
return nil
|
||
|
}
|
||
|
if err := p.Insert(lmap[name[0]], v, context, extend); err != nil {
|
||
|
return err
|
||
|
}
|
||
|
case "pc", "sc", "tc", "ic":
|
||
|
level := lmap[name[0]]
|
||
|
for _, s := range v {
|
||
|
if err := p.Insert(level, string(s), context, extend); err != nil {
|
||
|
return err
|
||
|
}
|
||
|
}
|
||
|
default:
|
||
|
return fmt.Errorf("cldr: unsupported tag: %q", name)
|
||
|
}
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// processXML parses the format of CLDR versions 24 and older.
|
||
|
func (c Collation) processXML(p RuleProcessor) (err error) {
|
||
|
// Collation is generated and defined in xml.go.
|
||
|
var v string
|
||
|
for _, r := range c.Rules.Any {
|
||
|
switch r.XMLName.Local {
|
||
|
case "reset":
|
||
|
level := 0
|
||
|
switch r.Before {
|
||
|
case "primary", "1":
|
||
|
level = 1
|
||
|
case "secondary", "2":
|
||
|
level = 2
|
||
|
case "tertiary", "3":
|
||
|
level = 3
|
||
|
case "":
|
||
|
default:
|
||
|
return fmt.Errorf("cldr: unknown level %q", r.Before)
|
||
|
}
|
||
|
v, err = r.value()
|
||
|
if err == nil {
|
||
|
err = p.Reset(v, level)
|
||
|
}
|
||
|
case "x":
|
||
|
var context, extend string
|
||
|
for _, r1 := range r.Any {
|
||
|
v, err = r1.value()
|
||
|
switch r1.XMLName.Local {
|
||
|
case "context":
|
||
|
context = v
|
||
|
case "extend":
|
||
|
extend = v
|
||
|
}
|
||
|
}
|
||
|
for _, r1 := range r.Any {
|
||
|
if t := r1.XMLName.Local; t == "context" || t == "extend" {
|
||
|
continue
|
||
|
}
|
||
|
r1.rule.process(p, r1.XMLName.Local, context, extend)
|
||
|
}
|
||
|
default:
|
||
|
err = r.rule.process(p, r.XMLName.Local, "", "")
|
||
|
}
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
}
|
||
|
return nil
|
||
|
}
|