youtubebeat/vendor/golang.org/x/text/cases/icu_test.go

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build icu

package cases

import (
	"path"
	"strings"
	"testing"

	"golang.org/x/text/internal/testtext"
	"golang.org/x/text/language"
	"golang.org/x/text/unicode/norm"
)

func TestICUConformance(t *testing.T) {
	// Build test set.
	input := []string{
		"a.a a_a",
		"a\u05d0a",
		"\u05d0'a",
		"a\u03084a",
		"a\u0308a",
		"a3\u30a3a",
		"a\u303aa",
		"a_\u303a_a",
		"1_a..a",
		"1_a.a",
		"a..a.",
		"a--a-",
		"a-a-",
		"a\u200ba",
		"a\u200b\u200ba",
		"a\u00ad\u00ada", // Format
		"a\u00ada",
		"a''a", // SingleQuote
		"a'a",
		"a::a", // MidLetter
		"a:a",
		"a..a", // MidNumLet
		"a.a",
		"a;;a", // MidNum
		"a;a",
		"a__a", // ExtendNumlet
		"a_a",
		"ΟΣ''a",
	}
	add := func(x interface{}) {
		switch v := x.(type) {
		case string:
			input = append(input, v)
		case []string:
			for _, s := range v {
				input = append(input, s)
			}
		}
	}
	for _, tc := range testCases {
		add(tc.src)
		add(tc.lower)
		add(tc.upper)
		add(tc.title)
	}
	for _, tc := range bufferTests {
		add(tc.src)
	}
	for _, tc := range breakTest {
		add(strings.Replace(tc, "|", "", -1))
	}
	for _, tc := range foldTestCases {
		add(tc)
	}

	// Compare ICU to Go.
	for _, c := range []string{"lower", "upper", "title", "fold"} {
		for _, tag := range []string{
			"und", "af", "az", "el", "lt", "nl", "tr",
		} {
			for _, s := range input {
				if exclude(c, tag, s) {
					continue
				}
				testtext.Run(t, path.Join(c, tag, s), func(t *testing.T) {
					want := doICU(tag, c, s)
					got := doGo(tag, c, s)
					if norm.NFC.String(got) != norm.NFC.String(want) {
						t.Errorf("\n    in %[3]q (%+[3]q)\n   got %[1]q (%+[1]q)\n  want %[2]q (%+[2]q)", got, want, s)
					}
				})
			}
		}
	}
}

// exclude indicates if a string should be excluded from testing.
func exclude(cm, tag, s string) bool {
	list := []struct{ cm, tags, pattern string }{
		// TODO: Go does not handle certain esoteric breaks correctly. This will be
		// fixed once we have a real word break iterator. Alternatively, it
		// seems like we're not too far off from making it work, so we could
		// fix these last steps. But first verify that using a separate word
		// breaker does not hurt performance.
		{"title", "af nl", "a''a"},
		{"", "", "א'a"},

		// All the exclusions below seem to be issues with the ICU
		// implementation (at version 57) and thus are not marked as TODO.

		// ICU does not handle leading apostrophe for Dutch and
		// Afrikaans correctly. See https://unicode.org/cldr/trac/ticket/7078.
		{"title", "af nl", "'n"},
		{"title", "af nl", "'N"},

		// Go terminates the final sigma check after a fixed number of
		// ignorables have been found. This ensures that the algorithm can make
		// progress in a streaming scenario.
		{"lower title", "", "\u039f\u03a3...............................a"},
		// This also applies to upper in Greek.
		// NOTE: we could fix the following two cases by adding state to elUpper
		// and aztrLower. However, considering a modifier to not belong to the
		// preceding letter after the maximum modifiers count is reached is
		// consistent with the behavior of unicode/norm.
		{"upper", "el", "\u03bf" + strings.Repeat("\u0321", 29) + "\u0313"},
		{"lower", "az tr lt", "I" + strings.Repeat("\u0321", 30) + "\u0307\u0300"},
		{"upper", "lt", "i" + strings.Repeat("\u0321", 30) + "\u0307\u0300"},
		{"lower", "lt", "I" + strings.Repeat("\u0321", 30) + "\u0300"},

		// ICU title case seems to erroneously removes \u0307 from an upper case
		// I unconditionally, instead of only when lowercasing. The ICU
		// transform algorithm transforms these cases consistently with our
		// implementation.
		{"title", "az tr", "\u0307"},

		// The spec says to remove \u0307 after Soft-Dotted characters. ICU
		// transforms conform but ucasemap_utf8ToUpper does not.
		{"upper title", "lt", "i\u0307"},
		{"upper title", "lt", "i" + strings.Repeat("\u0321", 29) + "\u0307\u0300"},

		// Both Unicode and CLDR prescribe an extra explicit dot above after a
		// Soft_Dotted character if there are other modifiers.
		// ucasemap_utf8ToUpper does not do this; ICU transforms do.
		// The issue with ucasemap_utf8ToUpper seems to be that it does not
		// consider the modifiers that are part of composition in the evaluation
		// of More_Above. For instance, according to the More_Above rule for lt,
		// a dotted capital I (U+0130) becomes i\u0307\u0307 (an small i with
		// two additional dots). This seems odd, but is correct. ICU is
		// definitely not correct as it produces different results for different
		// normal forms. For instance, for an İ:
		//    \u0130  (NFC) -> i\u0307         (incorrect)
		//    I\u0307 (NFD) -> i\u0307\u0307   (correct)
		// We could argue that we should not add a \u0307 if there already is
		// one, but this may be hard to get correct and is not conform the
		// standard.
		{"lower title", "lt", "\u0130"},
		{"lower title", "lt", "\u00cf"},

		// We are conform ICU ucasemap_utf8ToUpper if we remove support for
		// elUpper. However, this is clearly not conform the spec. Moreover, the
		// ICU transforms _do_ implement this transform and produces results
		// consistent with our implementation. Note that we still prefer to use
		// ucasemap_utf8ToUpper instead of transforms as the latter have
		// inconsistencies in the word breaking algorithm.
		{"upper", "el", "\u0386"}, // GREEK CAPITAL LETTER ALPHA WITH TONOS
		{"upper", "el", "\u0389"}, // GREEK CAPITAL LETTER ETA WITH TONOS
		{"upper", "el", "\u038A"}, // GREEK CAPITAL LETTER IOTA WITH TONOS

		{"upper", "el", "\u0391"}, // GREEK CAPITAL LETTER ALPHA
		{"upper", "el", "\u0397"}, // GREEK CAPITAL LETTER ETA
		{"upper", "el", "\u0399"}, // GREEK CAPITAL LETTER IOTA

		{"upper", "el", "\u03AC"}, // GREEK SMALL LETTER ALPHA WITH TONOS
		{"upper", "el", "\u03AE"}, // GREEK SMALL LETTER ALPHA WITH ETA
		{"upper", "el", "\u03AF"}, // GREEK SMALL LETTER ALPHA WITH IOTA

		{"upper", "el", "\u03B1"}, // GREEK SMALL LETTER ALPHA
		{"upper", "el", "\u03B7"}, // GREEK SMALL LETTER ETA
		{"upper", "el", "\u03B9"}, // GREEK SMALL LETTER IOTA
	}
	for _, x := range list {
		if x.cm != "" && strings.Index(x.cm, cm) == -1 {
			continue
		}
		if x.tags != "" && strings.Index(x.tags, tag) == -1 {
			continue
		}
		if strings.Index(s, x.pattern) != -1 {
			return true
		}
	}
	return false
}

func doGo(tag, caser, input string) string {
	var c Caser
	t := language.MustParse(tag)
	switch caser {
	case "lower":
		c = Lower(t)
	case "upper":
		c = Upper(t)
	case "title":
		c = Title(t)
	case "fold":
		c = Fold()
	}
	return c.String(input)
}
Add code and dependencies 2018-11-18 15:32:28 +01:00			`// Copyright 2016 The Go Authors. All rights reserved.`
			`// Use of this source code is governed by a BSD-style`
			`// license that can be found in the LICENSE file.`

			`// +build icu`

			`package cases`

			`import (`
			`"path"`
			`"strings"`
			`"testing"`

			`"golang.org/x/text/internal/testtext"`
			`"golang.org/x/text/language"`
			`"golang.org/x/text/unicode/norm"`
			`)`

			`func TestICUConformance(t *testing.T) {`
			`// Build test set.`
			`input := []string{`
			`"a.a a_a",`
			`"a\u05d0a",`
			`"\u05d0'a",`
			`"a\u03084a",`
			`"a\u0308a",`
			`"a3\u30a3a",`
			`"a\u303aa",`
			`"a_\u303a_a",`
			`"1_a..a",`
			`"1_a.a",`
			`"a..a.",`
			`"a--a-",`
			`"a-a-",`
			`"a\u200ba",`
			`"a\u200b\u200ba",`
			`"a\u00ad\u00ada", // Format`
			`"a\u00ada",`
			`"a''a", // SingleQuote`
			`"a'a",`
			`"a::a", // MidLetter`
			`"a:a",`
			`"a..a", // MidNumLet`
			`"a.a",`
			`"a;;a", // MidNum`
			`"a;a",`
			`"a__a", // ExtendNumlet`
			`"a_a",`
			`"ΟΣ''a",`
			`}`
			`add := func(x interface{}) {`
			`switch v := x.(type) {`
			`case string:`
			`input = append(input, v)`
			`case []string:`
			`for _, s := range v {`
			`input = append(input, s)`
			`}`
			`}`
			`}`
			`for _, tc := range testCases {`
			`add(tc.src)`
			`add(tc.lower)`
			`add(tc.upper)`
			`add(tc.title)`
			`}`
			`for _, tc := range bufferTests {`
			`add(tc.src)`
			`}`
			`for _, tc := range breakTest {`
			`add(strings.Replace(tc, "\|", "", -1))`
			`}`
			`for _, tc := range foldTestCases {`
			`add(tc)`
			`}`

			`// Compare ICU to Go.`
			`for _, c := range []string{"lower", "upper", "title", "fold"} {`
			`for _, tag := range []string{`
			`"und", "af", "az", "el", "lt", "nl", "tr",`
			`} {`
			`for _, s := range input {`
			`if exclude(c, tag, s) {`
			`continue`
			`}`
			`testtext.Run(t, path.Join(c, tag, s), func(t *testing.T) {`
			`want := doICU(tag, c, s)`
			`got := doGo(tag, c, s)`
			`if norm.NFC.String(got) != norm.NFC.String(want) {`
			`t.Errorf("\n in %[3]q (%+[3]q)\n got %[1]q (%+[1]q)\n want %[2]q (%+[2]q)", got, want, s)`
			`}`
			`})`
			`}`
			`}`
			`}`
			`}`

			`// exclude indicates if a string should be excluded from testing.`
			`func exclude(cm, tag, s string) bool {`
			`list := []struct{ cm, tags, pattern string }{`
			`// TODO: Go does not handle certain esoteric breaks correctly. This will be`
			`// fixed once we have a real word break iterator. Alternatively, it`
			`// seems like we're not too far off from making it work, so we could`
			`// fix these last steps. But first verify that using a separate word`
			`// breaker does not hurt performance.`
			`{"title", "af nl", "a''a"},`
			`{"", "", "א'a"},`

			`// All the exclusions below seem to be issues with the ICU`
			`// implementation (at version 57) and thus are not marked as TODO.`

			`// ICU does not handle leading apostrophe for Dutch and`
			`// Afrikaans correctly. See https://unicode.org/cldr/trac/ticket/7078.`
			`{"title", "af nl", "'n"},`
			`{"title", "af nl", "'N"},`

			`// Go terminates the final sigma check after a fixed number of`
			`// ignorables have been found. This ensures that the algorithm can make`
			`// progress in a streaming scenario.`
			`{"lower title", "", "\u039f\u03a3...............................a"},`
			`// This also applies to upper in Greek.`
			`// NOTE: we could fix the following two cases by adding state to elUpper`
			`// and aztrLower. However, considering a modifier to not belong to the`
			`// preceding letter after the maximum modifiers count is reached is`
			`// consistent with the behavior of unicode/norm.`
			`{"upper", "el", "\u03bf" + strings.Repeat("\u0321", 29) + "\u0313"},`
			`{"lower", "az tr lt", "I" + strings.Repeat("\u0321", 30) + "\u0307\u0300"},`
			`{"upper", "lt", "i" + strings.Repeat("\u0321", 30) + "\u0307\u0300"},`
			`{"lower", "lt", "I" + strings.Repeat("\u0321", 30) + "\u0300"},`

			`// ICU title case seems to erroneously removes \u0307 from an upper case`
			`// I unconditionally, instead of only when lowercasing. The ICU`
			`// transform algorithm transforms these cases consistently with our`
			`// implementation.`
			`{"title", "az tr", "\u0307"},`

			`// The spec says to remove \u0307 after Soft-Dotted characters. ICU`
			`// transforms conform but ucasemap_utf8ToUpper does not.`
			`{"upper title", "lt", "i\u0307"},`
			`{"upper title", "lt", "i" + strings.Repeat("\u0321", 29) + "\u0307\u0300"},`

			`// Both Unicode and CLDR prescribe an extra explicit dot above after a`
			`// Soft_Dotted character if there are other modifiers.`
			`// ucasemap_utf8ToUpper does not do this; ICU transforms do.`
			`// The issue with ucasemap_utf8ToUpper seems to be that it does not`
			`// consider the modifiers that are part of composition in the evaluation`
			`// of More_Above. For instance, according to the More_Above rule for lt,`
			`// a dotted capital I (U+0130) becomes i\u0307\u0307 (an small i with`
			`// two additional dots). This seems odd, but is correct. ICU is`
			`// definitely not correct as it produces different results for different`
			`// normal forms. For instance, for an İ:`
			`// \u0130 (NFC) -> i\u0307 (incorrect)`
			`// I\u0307 (NFD) -> i\u0307\u0307 (correct)`
			`// We could argue that we should not add a \u0307 if there already is`
			`// one, but this may be hard to get correct and is not conform the`
			`// standard.`
			`{"lower title", "lt", "\u0130"},`
			`{"lower title", "lt", "\u00cf"},`

			`// We are conform ICU ucasemap_utf8ToUpper if we remove support for`
			`// elUpper. However, this is clearly not conform the spec. Moreover, the`
			`// ICU transforms _do_ implement this transform and produces results`
			`// consistent with our implementation. Note that we still prefer to use`
			`// ucasemap_utf8ToUpper instead of transforms as the latter have`
			`// inconsistencies in the word breaking algorithm.`
			`{"upper", "el", "\u0386"}, // GREEK CAPITAL LETTER ALPHA WITH TONOS`
			`{"upper", "el", "\u0389"}, // GREEK CAPITAL LETTER ETA WITH TONOS`
			`{"upper", "el", "\u038A"}, // GREEK CAPITAL LETTER IOTA WITH TONOS`

			`{"upper", "el", "\u0391"}, // GREEK CAPITAL LETTER ALPHA`
			`{"upper", "el", "\u0397"}, // GREEK CAPITAL LETTER ETA`
			`{"upper", "el", "\u0399"}, // GREEK CAPITAL LETTER IOTA`

			`{"upper", "el", "\u03AC"}, // GREEK SMALL LETTER ALPHA WITH TONOS`
			`{"upper", "el", "\u03AE"}, // GREEK SMALL LETTER ALPHA WITH ETA`
			`{"upper", "el", "\u03AF"}, // GREEK SMALL LETTER ALPHA WITH IOTA`

			`{"upper", "el", "\u03B1"}, // GREEK SMALL LETTER ALPHA`
			`{"upper", "el", "\u03B7"}, // GREEK SMALL LETTER ETA`
			`{"upper", "el", "\u03B9"}, // GREEK SMALL LETTER IOTA`
			`}`
			`for _, x := range list {`
			`if x.cm != "" && strings.Index(x.cm, cm) == -1 {`
			`continue`
			`}`
			`if x.tags != "" && strings.Index(x.tags, tag) == -1 {`
			`continue`
			`}`
			`if strings.Index(s, x.pattern) != -1 {`
			`return true`
			`}`
			`}`
			`return false`
			`}`

			`func doGo(tag, caser, input string) string {`
			`var c Caser`
			`t := language.MustParse(tag)`
			`switch caser {`
			`case "lower":`
			`c = Lower(t)`
			`case "upper":`
			`c = Upper(t)`
			`case "title":`
			`c = Title(t)`
			`case "fold":`
			`c = Fold()`
			`}`
			`return c.String(input)`
			`}`