// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build icu

package cases

import (
	"path"
	"strings"
	"testing"

	"golang.org/x/text/internal/testtext"
	"golang.org/x/text/language"
	"golang.org/x/text/unicode/norm"
)

func TestICUConformance(t *testing.T) {
	// Build test set.
	input := []string{
		"a.a a_a",
		"a\u05d0a",
		"\u05d0'a",
		"a\u03084a",
		"a\u0308a",
		"a3\u30a3a",
		"a\u303aa",
		"a_\u303a_a",
		"1_a..a",
		"1_a.a",
		"a..a.",
		"a--a-",
		"a-a-",
		"a\u200ba",
		"a\u200b\u200ba",
		"a\u00ad\u00ada", // Format
		"a\u00ada",
		"a''a", // SingleQuote
		"a'a",
		"a::a", // MidLetter
		"a:a",
		"a..a", // MidNumLet
		"a.a",
		"a;;a", // MidNum
		"a;a",
		"a__a", // ExtendNumlet
		"a_a",
		"ΟΣ''a",
	}
	add := func(x interface{}) {
		switch v := x.(type) {
		case string:
			input = append(input, v)
		case []string:
			for _, s := range v {
				input = append(input, s)
			}
		}
	}
	for _, tc := range testCases {
		add(tc.src)
		add(tc.lower)
		add(tc.upper)
		add(tc.title)
	}
	for _, tc := range bufferTests {
		add(tc.src)
	}
	for _, tc := range breakTest {
		add(strings.Replace(tc, "|", "", -1))
	}
	for _, tc := range foldTestCases {
		add(tc)
	}

	// Compare ICU to Go.
	for _, c := range []string{"lower", "upper", "title", "fold"} {
		for _, tag := range []string{
			"und", "af", "az", "el", "lt", "nl", "tr",
		} {
			for _, s := range input {
				if exclude(c, tag, s) {
					continue
				}
				testtext.Run(t, path.Join(c, tag, s), func(t *testing.T) {
					want := doICU(tag, c, s)
					got := doGo(tag, c, s)
					if norm.NFC.String(got) != norm.NFC.String(want) {
						t.Errorf("\n    in %[3]q (%+[3]q)\n   got %[1]q (%+[1]q)\n  want %[2]q (%+[2]q)", got, want, s)
					}
				})
			}
		}
	}
}

// exclude indicates if a string should be excluded from testing.
func exclude(cm, tag, s string) bool {
	list := []struct{ cm, tags, pattern string }{
		// TODO: Go does not handle certain esoteric breaks correctly. This will be
		// fixed once we have a real word break iterator. Alternatively, it
		// seems like we're not too far off from making it work, so we could
		// fix these last steps. But first verify that using a separate word
		// breaker does not hurt performance.
		{"title", "af nl", "a''a"},
		{"", "", "א'a"},

		// All the exclusions below seem to be issues with the ICU
		// implementation (at version 57) and thus are not marked as TODO.

		// ICU does not handle leading apostrophe for Dutch and
		// Afrikaans correctly. See https://unicode.org/cldr/trac/ticket/7078.
		{"title", "af nl", "'n"},
		{"title", "af nl", "'N"},

		// Go terminates the final sigma check after a fixed number of
		// ignorables have been found. This ensures that the algorithm can make
		// progress in a streaming scenario.
		{"lower title", "", "\u039f\u03a3...............................a"},
		// This also applies to upper in Greek.
		// NOTE: we could fix the following two cases by adding state to elUpper
		// and aztrLower. However, considering a modifier to not belong to the
		// preceding letter after the maximum modifiers count is reached is
		// consistent with the behavior of unicode/norm.
		{"upper", "el", "\u03bf" + strings.Repeat("\u0321", 29) + "\u0313"},
		{"lower", "az tr lt", "I" + strings.Repeat("\u0321", 30) + "\u0307\u0300"},
		{"upper", "lt", "i" + strings.Repeat("\u0321", 30) + "\u0307\u0300"},
		{"lower", "lt", "I" + strings.Repeat("\u0321", 30) + "\u0300"},

		// ICU title case seems to erroneously remove \u0307 from an upper case
		// I unconditionally, instead of only when lowercasing. The ICU
		// transform algorithm transforms these cases consistently with our
		// implementation.
		{"title", "az tr", "\u0307"},

		// The spec says to remove \u0307 after Soft-Dotted characters. ICU
		// transforms conform but ucasemap_utf8ToUpper does not.
		{"upper title", "lt", "i\u0307"},
		{"upper title", "lt", "i" + strings.Repeat("\u0321", 29) + "\u0307\u0300"},

		// Both Unicode and CLDR prescribe an extra explicit dot above after a
		// Soft_Dotted character if there are other modifiers.
		// ucasemap_utf8ToUpper does not do this; ICU transforms do.
		// The issue with ucasemap_utf8ToUpper seems to be that it does not
		// consider the modifiers that are part of composition in the evaluation
		// of More_Above. For instance, according to the More_Above rule for lt,
		// a dotted capital I (U+0130) becomes i\u0307\u0307 (a small i with
		// two additional dots). This seems odd, but is correct. ICU is
		// definitely not correct as it produces different results for different
		// normal forms. For instance, for an İ:
		//    \u0130  (NFC) -> i\u0307       (incorrect)
		//    I\u0307 (NFD) -> i\u0307\u0307 (correct)
		// We could argue that we should not add a \u0307 if there already is
		// one, but this may be hard to get correct and would not conform to the
		// standard.
		{"lower title", "lt", "\u0130"},
		{"lower title", "lt", "\u00cf"},

		// We would conform to ICU's ucasemap_utf8ToUpper if we removed support
		// for elUpper. However, that clearly does not conform to the spec.
		// Moreover, the ICU transforms _do_ implement this transform and
		// produce results consistent with our implementation. Note that we
		// still prefer to use ucasemap_utf8ToUpper instead of transforms as
		// the latter have inconsistencies in the word breaking algorithm.
		{"upper", "el", "\u0386"}, // GREEK CAPITAL LETTER ALPHA WITH TONOS
		{"upper", "el", "\u0389"}, // GREEK CAPITAL LETTER ETA WITH TONOS
		{"upper", "el", "\u038A"}, // GREEK CAPITAL LETTER IOTA WITH TONOS

{"upper", "el", "\u0391"}, // GREEK CAPITAL LETTER ALPHA
|
||
|
{"upper", "el", "\u0397"}, // GREEK CAPITAL LETTER ETA
|
||
|
{"upper", "el", "\u0399"}, // GREEK CAPITAL LETTER IOTA
|
||
|
|
||
|
{"upper", "el", "\u03AC"}, // GREEK SMALL LETTER ALPHA WITH TONOS
|
||
|
{"upper", "el", "\u03AE"}, // GREEK SMALL LETTER ALPHA WITH ETA
|
||
|
{"upper", "el", "\u03AF"}, // GREEK SMALL LETTER ALPHA WITH IOTA
|
||
|
|
||
|
{"upper", "el", "\u03B1"}, // GREEK SMALL LETTER ALPHA
|
||
|
{"upper", "el", "\u03B7"}, // GREEK SMALL LETTER ETA
|
||
|
{"upper", "el", "\u03B9"}, // GREEK SMALL LETTER IOTA
|
||
|
}
|
||
|
for _, x := range list {
|
||
|
if x.cm != "" && strings.Index(x.cm, cm) == -1 {
|
||
|
continue
|
||
|
}
|
||
|
if x.tags != "" && strings.Index(x.tags, tag) == -1 {
|
||
|
continue
|
||
|
}
|
||
|
if strings.Index(s, x.pattern) != -1 {
|
||
|
return true
|
||
|
}
|
||
|
}
|
||
|
return false
|
||
|
}
|
||
|
|
||
|
// doGo applies the requested case operation ("lower", "upper", "title", or
// "fold") to input using this package's Caser for the given language tag.
func doGo(tag, caser, input string) string {
	var c Caser
	t := language.MustParse(tag)
	switch caser {
	case "lower":
		c = Lower(t)
	case "upper":
		c = Upper(t)
	case "title":
		c = Title(t)
	case "fold":
		c = Fold()
	}
	return c.String(input)
}
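
// The doICU helper called in TestICUConformance is defined in a separate cgo
// file of this package, also guarded by the icu build tag. As an illustration
// only, the sketch below shows how such a companion file could dispatch to
// ICU's C API via the ucasemap_* functions (which are real ICU calls); the
// doICUSketch name, buffer sizing, and error handling are assumptions for the
// sketch, not the package's actual implementation.

// +build icu

package cases

/*
#cgo LDFLAGS: -licuuc -licui18n
#include <stdlib.h>
#include <unicode/ucasemap.h>
*/
import "C"

import "unsafe"

// doICUSketch applies the named case operation to input using ICU's ucasemap
// C API for the given locale and returns the resulting UTF-8 string.
func doICUSketch(tag, caser, input string) string {
	var status C.UErrorCode // zero value is U_ZERO_ERROR
	loc := C.CString(tag)
	defer C.free(unsafe.Pointer(loc))
	csm := C.ucasemap_open(loc, 0, &status)
	defer C.ucasemap_close(csm)

	src := C.CString(input)
	defer C.free(unsafe.Pointer(src))
	// A generous destination buffer; a real implementation would retry with
	// the size ICU reports on U_BUFFER_OVERFLOW_ERROR.
	buf := make([]byte, 4*len(input)+64)
	dst := (*C.char)(unsafe.Pointer(&buf[0]))
	dstCap := C.int32_t(len(buf))
	srcLen := C.int32_t(len(input))

	var n C.int32_t
	switch caser {
	case "lower":
		n = C.ucasemap_utf8ToLower(csm, dst, dstCap, src, srcLen, &status)
	case "upper":
		n = C.ucasemap_utf8ToUpper(csm, dst, dstCap, src, srcLen, &status)
	case "title":
		n = C.ucasemap_utf8ToTitle(csm, dst, dstCap, src, srcLen, &status)
	case "fold":
		n = C.ucasemap_utf8FoldCase(csm, dst, dstCap, src, srcLen, &status)
	}
	if status > 0 { // positive values are errors; negative ones are warnings
		return ""
	}
	return string(buf[:n])
}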