You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

951 lines
30 KiB

  1. // Copyright 2014 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package cases
  5. import (
  6. "bytes"
  7. "fmt"
  8. "path"
  9. "strings"
  10. "testing"
  11. "unicode/utf8"
  12. "golang.org/x/text/internal/testtext"
  13. "golang.org/x/text/language"
  14. "golang.org/x/text/transform"
  15. "golang.org/x/text/unicode/norm"
  16. )
  17. type testCase struct {
  18. lang string
  19. src interface{} // string, []string, or nil to skip test
  20. title interface{} // string, []string, or nil to skip test
  21. lower interface{} // string, []string, or nil to skip test
  22. upper interface{} // string, []string, or nil to skip test
  23. opts options
  24. }
// testCases is the table consumed by TestCaseMappings. Entries are indexed
// explicitly so that a failure message's index can be matched to a row.
// String-valued src/title/lower/upper fields are split on spaces and compared
// sample by sample; []string-valued fields are compared element by element.
var testCases = []testCase{
	// Root locale with final-sigma handling switched off.
	0: {
		lang:  "und",
		src:   "abc aBc ABC abC İsıI ΕΣΆΣ",
		title: "Abc Abc Abc Abc İsıi Εσάσ",
		lower: "abc abc abc abc i\u0307sıi εσάσ",
		upper: "ABC ABC ABC ABC İSII ΕΣΆΣ",
		opts:  getOpts(HandleFinalSigma(false)),
	},

	// Root locale with final-sigma handling switched on.
	1: {
		lang:  "und",
		src:   "abc aBc ABC abC İsıI ΕΣΆΣ Σ _Σ -Σ",
		title: "Abc Abc Abc Abc İsıi Εσάς Σ _Σ -Σ",
		lower: "abc abc abc abc i\u0307sıi εσάς σ _σ -σ",
		upper: "ABC ABC ABC ABC İSII ΕΣΆΣ Σ _Σ -Σ",
		opts:  getOpts(HandleFinalSigma(true)),
	},

	2: { // Title cased runes.
		lang:  supported,
		src:   "DžA",
		title: "Dža",
		lower: "dža",
		upper: "DŽA",
	},

	3: {
		// Title breaking.
		lang: supported,
		src: []string{
			"FOO CASE TEST",
			"DON'T DO THiS",
			"χωΡΊΣ χωΡΊΣ^a χωΡΊΣ:a χωΡΊΣ:^a χωΡΊΣ^ όμΩΣ Σ",
			"with-hyphens",
			"49ers 49ers",
			`"capitalize a^a -hyphen 0X _u a_u:a`,
			"MidNumLet a.b\u2018c\u2019d\u2024e\ufe52f\uff07f\uff0eg",
			"MidNum a,b;c\u037ed\u0589e\u060cf\u2044g\ufe50h",
			"\u0345 x\u3031x x\u05d0x \u05d0x a'.a a.a a4,a",
		},
		title: []string{
			"Foo Case Test",
			"Don't Do This",
			"Χωρίς Χωρίσ^A Χωρίσ:a Χωρίσ:^A Χωρίς^ Όμως Σ",
			"With-Hyphens",
			// Note that 49Ers is correct according to the spec.
			// TODO: provide some option to the user to treat different
			// characters as cased.
			"49Ers 49Ers",
			`"Capitalize A^A -Hyphen 0X _U A_u:a`,
			"Midnumlet A.b\u2018c\u2019d\u2024e\ufe52f\uff07f\uff0eg",
			"Midnum A,B;C\u037eD\u0589E\u060cF\u2044G\ufe50H",
			"\u0399 X\u3031X X\u05d0x \u05d0X A'.A A.a A4,A",
		},
	},

	// TODO: These are known deviations from the Unicode Word Breaking
	// Algorithm. The case below is disabled until they are resolved.
	// {
	// 	"und",
	// 	"x_\u3031_x a4,4a",
	// 	"X_\u3031_x A4,4a", // Currently is "X_\u3031_X A4,4A".
	// 	"x_\u3031_x a4,4a",
	// 	"X_\u3031_X A4,4A",
	// 	options{},
	// },

	4: {
		// Tests title options
		lang:  "und",
		src:   "abc aBc ABC abC İsıI o'Brien",
		title: "Abc ABc ABC AbC İsıI O'Brien",
		opts:  getOpts(NoLower),
	},

	5: {
		lang:  "el",
		src:   "aBc ΟΔΌΣ Οδός Σο ΣΟ Σ oΣ ΟΣ σ ἕξ \u03ac",
		title: "Abc Οδός Οδός Σο Σο Σ Oς Ος Σ Ἕξ \u0386",
		lower: "abc οδός οδός σο σο σ oς ος σ ἕξ \u03ac",
		upper: "ABC ΟΔΟΣ ΟΔΟΣ ΣΟ ΣΟ Σ OΣ ΟΣ Σ ΕΞ \u0391", // Uppercase removes accents
	},

	// lang lists two tags; the row is run once for each of them.
	6: {
		lang:  "tr az",
		src:   "Isiİ İsıI I\u0307sIiİ İsıI\u0307 I\u0300\u0307",
		title: "Isii İsıı I\u0307sıii İsıi I\u0300\u0307",
		lower: "ısii isıı isıii isıi \u0131\u0300\u0307",
		upper: "ISİİ İSII I\u0307SIİİ İSII\u0307 I\u0300\u0307",
	},

	7: {
		lang:  "lt",
		src:   "I Ï J J̈ Į Į̈ Ì Í Ĩ xi̇̈ xj̇̈ xį̇̈ xi̇̀ xi̇́ xi̇̃ XI XÏ XJ XJ̈ XĮ XĮ̈ XI̟̤",
		title: "I Ï J J̈ Į Į̈ Ì Í Ĩ Xi̇̈ Xj̇̈ Xį̇̈ Xi̇̀ Xi̇́ Xi̇̃ Xi Xi̇̈ Xj Xj̇̈ Xį Xį̇̈ Xi̟̤",
		lower: "i i̇̈ j j̇̈ į į̇̈ i̇̀ i̇́ i̇̃ xi̇̈ xj̇̈ xį̇̈ xi̇̀ xi̇́ xi̇̃ xi xi̇̈ xj xj̇̈ xį xį̇̈ xi̟̤",
		upper: "I Ï J J̈ Į Į̈ Ì Í Ĩ XÏ XJ̈ XĮ̈ XÌ XÍ XĨ XI XÏ XJ XJ̈ XĮ XĮ̈ XI̟̤",
	},

	8: {
		lang:  "lt",
		src:   "\u012e\u0300 \u00cc i\u0307\u0300 i\u0307\u0301 i\u0307\u0303 i\u0307\u0308 i\u0300\u0307",
		title: "\u012e\u0300 \u00cc \u00cc \u00cd \u0128 \u00cf I\u0300\u0307",
		lower: "\u012f\u0307\u0300 i\u0307\u0300 i\u0307\u0300 i\u0307\u0301 i\u0307\u0303 i\u0307\u0308 i\u0300\u0307",
		upper: "\u012e\u0300 \u00cc \u00cc \u00cd \u0128 \u00cf I\u0300\u0307",
	},

	// Only the title mapping is checked for this row; lower and upper are nil.
	9: {
		lang:  "nl",
		src:   "ijs IJs Ij Ijs İJ İJs aa aA 'ns 'S",
		title: "IJs IJs IJ IJs İj İjs Aa Aa 'ns 's",
	},

	// Note: this specification is not currently part of CLDR. The same holds
	// for the leading apostrophe handling for Dutch.
	// See https://unicode.org/cldr/trac/ticket/7078.
	10: {
		lang:  "af",
		src:   "wag 'n bietjie",
		title: "Wag 'n Bietjie",
		lower: "wag 'n bietjie",
		upper: "WAG 'N BIETJIE",
	},
}
  139. func TestCaseMappings(t *testing.T) {
  140. for i, tt := range testCases {
  141. src, ok := tt.src.([]string)
  142. if !ok {
  143. src = strings.Split(tt.src.(string), " ")
  144. }
  145. for _, lang := range strings.Split(tt.lang, " ") {
  146. tag := language.MustParse(lang)
  147. testEntry := func(name string, mk func(language.Tag, options) transform.SpanningTransformer, gold interface{}) {
  148. c := Caser{mk(tag, tt.opts)}
  149. if gold != nil {
  150. wants, ok := gold.([]string)
  151. if !ok {
  152. wants = strings.Split(gold.(string), " ")
  153. }
  154. for j, want := range wants {
  155. if got := c.String(src[j]); got != want {
  156. t.Errorf("%d:%s:\n%s.String(%+q):\ngot %+q;\nwant %+q", i, lang, name, src[j], got, want)
  157. }
  158. }
  159. }
  160. dst := make([]byte, 256) // big enough to hold any result
  161. src := []byte(strings.Join(src, " "))
  162. v := testtext.AllocsPerRun(20, func() {
  163. c.Transform(dst, src, true)
  164. })
  165. if v > 1.1 {
  166. t.Errorf("%d:%s:\n%s: number of allocs was %f; want 0", i, lang, name, v)
  167. }
  168. }
  169. testEntry("Upper", makeUpper, tt.upper)
  170. testEntry("Lower", makeLower, tt.lower)
  171. testEntry("Title", makeTitle, tt.title)
  172. }
  173. }
  174. }
  175. // TestAlloc tests that some mapping methods should not cause any allocation.
  176. func TestAlloc(t *testing.T) {
  177. dst := make([]byte, 256) // big enough to hold any result
  178. src := []byte(txtNonASCII)
  179. for i, f := range []func() Caser{
  180. func() Caser { return Upper(language.Und) },
  181. func() Caser { return Lower(language.Und) },
  182. func() Caser { return Lower(language.Und, HandleFinalSigma(false)) },
  183. // TODO: use a shared copy for these casers as well, in order of
  184. // importance, starting with the most important:
  185. // func() Caser { return Title(language.Und) },
  186. // func() Caser { return Title(language.Und, HandleFinalSigma(false)) },
  187. } {
  188. testtext.Run(t, "", func(t *testing.T) {
  189. var c Caser
  190. v := testtext.AllocsPerRun(10, func() {
  191. c = f()
  192. })
  193. if v > 0 {
  194. // TODO: Right now only Upper has 1 allocation. Special-case Lower
  195. // and Title as well to have less allocations for the root locale.
  196. t.Errorf("%d:init: number of allocs was %f; want 0", i, v)
  197. }
  198. v = testtext.AllocsPerRun(2, func() {
  199. c.Transform(dst, src, true)
  200. })
  201. if v > 0 {
  202. t.Errorf("%d:transform: number of allocs was %f; want 0", i, v)
  203. }
  204. })
  205. }
  206. }
  207. func testHandover(t *testing.T, c Caser, src string) {
  208. want := c.String(src)
  209. // Find the common prefix.
  210. pSrc := 0
  211. for ; pSrc < len(src) && pSrc < len(want) && want[pSrc] == src[pSrc]; pSrc++ {
  212. }
  213. // Test handover for each substring of the prefix.
  214. for i := 0; i < pSrc; i++ {
  215. testtext.Run(t, fmt.Sprint("interleave/", i), func(t *testing.T) {
  216. dst := make([]byte, 4*len(src))
  217. c.Reset()
  218. nSpan, _ := c.Span([]byte(src[:i]), false)
  219. copy(dst, src[:nSpan])
  220. nTransform, _, _ := c.Transform(dst[nSpan:], []byte(src[nSpan:]), true)
  221. got := string(dst[:nSpan+nTransform])
  222. if got != want {
  223. t.Errorf("full string: got %q; want %q", got, want)
  224. }
  225. })
  226. }
  227. }
  228. func TestHandover(t *testing.T) {
  229. testCases := []struct {
  230. desc string
  231. t Caser
  232. first, second string
  233. }{{
  234. "title/nosigma/single midword",
  235. Title(language.Und, HandleFinalSigma(false)),
  236. "A.", "a",
  237. }, {
  238. "title/nosigma/single midword",
  239. Title(language.Und, HandleFinalSigma(false)),
  240. "A", ".a",
  241. }, {
  242. "title/nosigma/double midword",
  243. Title(language.Und, HandleFinalSigma(false)),
  244. "A..", "a",
  245. }, {
  246. "title/nosigma/double midword",
  247. Title(language.Und, HandleFinalSigma(false)),
  248. "A.", ".a",
  249. }, {
  250. "title/nosigma/double midword",
  251. Title(language.Und, HandleFinalSigma(false)),
  252. "A", "..a",
  253. }, {
  254. "title/sigma/single midword",
  255. Title(language.Und),
  256. "ΟΣ.", "a",
  257. }, {
  258. "title/sigma/single midword",
  259. Title(language.Und),
  260. "ΟΣ", ".a",
  261. }, {
  262. "title/sigma/double midword",
  263. Title(language.Und),
  264. "ΟΣ..", "a",
  265. }, {
  266. "title/sigma/double midword",
  267. Title(language.Und),
  268. "ΟΣ.", ".a",
  269. }, {
  270. "title/sigma/double midword",
  271. Title(language.Und),
  272. "ΟΣ", "..a",
  273. }, {
  274. "title/af/leading apostrophe",
  275. Title(language.Afrikaans),
  276. "'", "n bietje",
  277. }}
  278. for _, tc := range testCases {
  279. testtext.Run(t, tc.desc, func(t *testing.T) {
  280. src := tc.first + tc.second
  281. want := tc.t.String(src)
  282. tc.t.Reset()
  283. n, _ := tc.t.Span([]byte(tc.first), false)
  284. dst := make([]byte, len(want))
  285. copy(dst, tc.first[:n])
  286. nDst, _, _ := tc.t.Transform(dst[n:], []byte(src[n:]), true)
  287. got := string(dst[:n+nDst])
  288. if got != want {
  289. t.Errorf("got %q; want %q", got, want)
  290. }
  291. })
  292. }
  293. }
// minBufSize is the buffer size with which the casing operations in this
// package are guaranteed to make progress. It is set to norm.MaxSegmentSize.
const minBufSize = norm.MaxSegmentSize
  297. type bufferTest struct {
  298. desc, src, want string
  299. firstErr error
  300. dstSize, srcSize int
  301. t transform.SpanningTransformer
  302. }
// bufferTests is the scenario table for TestShortBuffersAndOverflow. It is
// filled in by init below.
var bufferTests []bufferTest
// init populates bufferTests. Each entry names the transformer, the input,
// the expected output, the buffer sizes to drive Transform with, and the
// error expected from the first Transform call (nil when firstErr is unset).
// NOTE(review): presumably the table is assigned here rather than in the var
// declaration to avoid a package initialization-order problem — confirm.
func init() {
	bufferTests = []bufferTest{{
		desc:     "und/upper/short dst",
		src:      "abcdefg",
		want:     "ABCDEFG",
		firstErr: transform.ErrShortDst,
		dstSize:  3,
		srcSize:  minBufSize,
		t:        Upper(language.Und),
	}, {
		desc:     "und/upper/short src",
		src:      "123é56",
		want:     "123É56",
		firstErr: transform.ErrShortSrc,
		dstSize:  4,
		srcSize:  4,
		t:        Upper(language.Und),
	}, {
		desc:     "und/upper/no error on short",
		src:      "12",
		want:     "12",
		firstErr: nil,
		dstSize:  1,
		srcSize:  1,
		t:        Upper(language.Und),
	}, {
		desc:     "und/lower/short dst",
		src:      "ABCDEFG",
		want:     "abcdefg",
		firstErr: transform.ErrShortDst,
		dstSize:  3,
		srcSize:  minBufSize,
		t:        Lower(language.Und),
	}, {
		desc:     "und/lower/short src",
		src:      "123É56",
		want:     "123é56",
		firstErr: transform.ErrShortSrc,
		dstSize:  4,
		srcSize:  4,
		t:        Lower(language.Und),
	}, {
		desc:     "und/lower/no error on short",
		src:      "12",
		want:     "12",
		firstErr: nil,
		dstSize:  1,
		srcSize:  1,
		t:        Lower(language.Und),
	}, {
		desc:    "und/lower/simple (no final sigma)",
		src:     "ΟΣ ΟΣΣ",
		want:    "οσ οσσ",
		dstSize: minBufSize,
		srcSize: minBufSize,
		t:       Lower(language.Und, HandleFinalSigma(false)),
	}, {
		desc:    "und/title/simple (no final sigma)",
		src:     "ΟΣ ΟΣΣ",
		want:    "Οσ Οσσ",
		dstSize: minBufSize,
		srcSize: minBufSize,
		t:       Title(language.Und, HandleFinalSigma(false)),
	}, {
		desc:    "und/title/final sigma: no error",
		src:     "ΟΣ",
		want:    "Ος",
		dstSize: minBufSize,
		srcSize: minBufSize,
		t:       Title(language.Und),
	}, {
		desc:     "und/title/final sigma: short source",
		src:      "ΟΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣ",
		want:     "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσς",
		firstErr: transform.ErrShortSrc,
		dstSize:  minBufSize,
		srcSize:  10,
		t:        Title(language.Und),
	}, {
		desc:     "und/title/final sigma: short destination 1",
		src:      "ΟΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣ",
		want:     "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσς",
		firstErr: transform.ErrShortDst,
		dstSize:  10,
		srcSize:  minBufSize,
		t:        Title(language.Und),
	}, {
		desc:     "und/title/final sigma: short destination 2",
		src:      "ΟΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣ",
		want:     "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσς",
		firstErr: transform.ErrShortDst,
		dstSize:  9,
		srcSize:  minBufSize,
		t:        Title(language.Und),
	}, {
		desc:     "und/title/final sigma: short destination 3",
		src:      "ΟΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣ",
		want:     "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσς",
		firstErr: transform.ErrShortDst,
		dstSize:  8,
		srcSize:  minBufSize,
		t:        Title(language.Und),
	}, {
		desc:     "und/title/clipped UTF-8 rune",
		src:      "σσσσσσσσσσσ",
		want:     "Σσσσσσσσσσσ",
		firstErr: transform.ErrShortSrc,
		dstSize:  minBufSize,
		srcSize:  5,
		t:        Title(language.Und),
	}, {
		// The input ends in a lone 0xCF byte, i.e. an incomplete UTF-8
		// sequence, which is expected to be passed through unchanged.
		desc:    "und/title/clipped UTF-8 rune atEOF",
		src:     "σσσ" + string([]byte{0xCF}),
		want:    "Σσσ" + string([]byte{0xCF}),
		dstSize: minBufSize,
		srcSize: minBufSize,
		t:       Title(language.Und),
	}, {
		// Note: the choice to change the final sigma at the end in case of
		// too many case ignorables is arbitrary. The main reason for this
		// choice is that it results in simpler code.
		desc:    "und/title/final sigma: max ignorables",
		src:     "ΟΣ" + strings.Repeat(".", maxIgnorable) + "a",
		want:    "Οσ" + strings.Repeat(".", maxIgnorable) + "A",
		dstSize: minBufSize,
		srcSize: minBufSize,
		t:       Title(language.Und),
	}, {
		// srcSize is chosen so that the first Transform call sees everything
		// up to and including the run of maxIgnorable+1 periods, but not the
		// trailing "a".
		desc:    "und/title/long string",
		src:     "AA" + strings.Repeat(".", maxIgnorable+1) + "a",
		want:    "Aa" + strings.Repeat(".", maxIgnorable+1) + "A",
		dstSize: minBufSize,
		srcSize: len("AA" + strings.Repeat(".", maxIgnorable+1)),
		t:       Title(language.Und),
	}, {
		// Note: the choice to change the final sigma at the end in case of
		// too many case ignorables is arbitrary. The main reason for this
		// choice is that it results in simpler code.
		desc:    "und/title/final sigma: too many ignorables",
		src:     "ΟΣ" + strings.Repeat(".", maxIgnorable+1) + "a",
		want:    "Ος" + strings.Repeat(".", maxIgnorable+1) + "A",
		dstSize: minBufSize,
		srcSize: len("ΟΣ" + strings.Repeat(".", maxIgnorable+1)),
		t:       Title(language.Und),
	}, {
		desc:    "und/title/final sigma: apostrophe",
		src:     "ΟΣ''a",
		want:    "Οσ''A",
		dstSize: minBufSize,
		srcSize: minBufSize,
		t:       Title(language.Und),
	}, {
		desc:    "el/upper/max ignorables",
		src:     "ο" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0313",
		want:    "Ο" + strings.Repeat("\u0321", maxIgnorable-1),
		dstSize: minBufSize,
		srcSize: minBufSize,
		t:       Upper(language.Greek),
	}, {
		desc:    "el/upper/too many ignorables",
		src:     "ο" + strings.Repeat("\u0321", maxIgnorable) + "\u0313",
		want:    "Ο" + strings.Repeat("\u0321", maxIgnorable) + "\u0313",
		dstSize: minBufSize,
		srcSize: len("ο" + strings.Repeat("\u0321", maxIgnorable)),
		t:       Upper(language.Greek),
	}, {
		desc:     "el/upper/short dst",
		src:      "123ο",
		want:     "123Ο",
		firstErr: transform.ErrShortDst,
		dstSize:  3,
		srcSize:  minBufSize,
		t:        Upper(language.Greek),
	}, {
		desc:    "lt/lower/max ignorables",
		src:     "I" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0300",
		want:    "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300",
		dstSize: minBufSize,
		srcSize: minBufSize,
		t:       Lower(language.Lithuanian),
	}, {
		desc:    "lt/lower/too many ignorables",
		src:     "I" + strings.Repeat("\u0321", maxIgnorable) + "\u0300",
		want:    "i" + strings.Repeat("\u0321", maxIgnorable) + "\u0300",
		dstSize: minBufSize,
		srcSize: len("I" + strings.Repeat("\u0321", maxIgnorable)),
		t:       Lower(language.Lithuanian),
	}, {
		desc:     "lt/lower/decomposition with short dst buffer 1",
		src:      "aaaaa\u00cc", // U+00CC LATIN CAPITAL LETTER I GRAVE
		firstErr: transform.ErrShortDst,
		want:     "aaaaai\u0307\u0300",
		dstSize:  5,
		srcSize:  minBufSize,
		t:        Lower(language.Lithuanian),
	}, {
		desc:     "lt/lower/decomposition with short dst buffer 2",
		src:      "aaaa\u00cc", // U+00CC LATIN CAPITAL LETTER I GRAVE
		firstErr: transform.ErrShortDst,
		want:     "aaaai\u0307\u0300",
		dstSize:  5,
		srcSize:  minBufSize,
		t:        Lower(language.Lithuanian),
	}, {
		desc:    "lt/upper/max ignorables",
		src:     "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300",
		want:    "I" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0300",
		dstSize: minBufSize,
		srcSize: minBufSize,
		t:       Upper(language.Lithuanian),
	}, {
		desc:    "lt/upper/too many ignorables",
		src:     "i" + strings.Repeat("\u0321", maxIgnorable) + "\u0307\u0300",
		want:    "I" + strings.Repeat("\u0321", maxIgnorable) + "\u0307\u0300",
		dstSize: minBufSize,
		srcSize: len("i" + strings.Repeat("\u0321", maxIgnorable)),
		t:       Upper(language.Lithuanian),
	}, {
		desc:     "lt/upper/short dst",
		src:      "12i\u0307\u0300",
		want:     "12\u00cc",
		firstErr: transform.ErrShortDst,
		dstSize:  3,
		srcSize:  minBufSize,
		t:        Upper(language.Lithuanian),
	}, {
		desc:    "aztr/lower/max ignorables",
		src:     "I" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300",
		want:    "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0300",
		dstSize: minBufSize,
		srcSize: minBufSize,
		t:       Lower(language.Turkish),
	}, {
		desc:    "aztr/lower/too many ignorables",
		src:     "I" + strings.Repeat("\u0321", maxIgnorable) + "\u0307\u0300",
		want:    "\u0131" + strings.Repeat("\u0321", maxIgnorable) + "\u0307\u0300",
		dstSize: minBufSize,
		srcSize: len("I" + strings.Repeat("\u0321", maxIgnorable)),
		t:       Lower(language.Turkish),
	}, {
		desc:     "nl/title/pre-IJ cutoff",
		src:      " ij",
		want:     " IJ",
		firstErr: transform.ErrShortDst,
		dstSize:  2,
		srcSize:  minBufSize,
		t:        Title(language.Dutch),
	}, {
		desc:     "nl/title/mid-IJ cutoff",
		src:      " ij",
		want:     " IJ",
		firstErr: transform.ErrShortDst,
		dstSize:  3,
		srcSize:  minBufSize,
		t:        Title(language.Dutch),
	}, {
		desc:     "af/title/apostrophe",
		src:      "'n bietje",
		want:     "'n Bietje",
		firstErr: transform.ErrShortDst,
		dstSize:  3,
		srcSize:  minBufSize,
		t:        Title(language.Afrikaans),
	}}
}
  572. func TestShortBuffersAndOverflow(t *testing.T) {
  573. for i, tt := range bufferTests {
  574. testtext.Run(t, tt.desc, func(t *testing.T) {
  575. buf := make([]byte, tt.dstSize)
  576. got := []byte{}
  577. var nSrc, nDst int
  578. var err error
  579. for p := 0; p < len(tt.src); p += nSrc {
  580. q := p + tt.srcSize
  581. if q > len(tt.src) {
  582. q = len(tt.src)
  583. }
  584. nDst, nSrc, err = tt.t.Transform(buf, []byte(tt.src[p:q]), q == len(tt.src))
  585. got = append(got, buf[:nDst]...)
  586. if p == 0 && err != tt.firstErr {
  587. t.Errorf("%d:%s:\n error was %v; want %v", i, tt.desc, err, tt.firstErr)
  588. break
  589. }
  590. }
  591. if string(got) != tt.want {
  592. t.Errorf("%d:%s:\ngot %+q;\nwant %+q", i, tt.desc, got, tt.want)
  593. }
  594. testHandover(t, Caser{tt.t}, tt.src)
  595. })
  596. }
  597. }
  598. func TestSpan(t *testing.T) {
  599. for _, tt := range []struct {
  600. desc string
  601. src string
  602. want string
  603. atEOF bool
  604. err error
  605. t Caser
  606. }{{
  607. desc: "und/upper/basic",
  608. src: "abcdefg",
  609. want: "",
  610. atEOF: true,
  611. err: transform.ErrEndOfSpan,
  612. t: Upper(language.Und),
  613. }, {
  614. desc: "und/upper/short src",
  615. src: "123É"[:4],
  616. want: "123",
  617. atEOF: false,
  618. err: transform.ErrShortSrc,
  619. t: Upper(language.Und),
  620. }, {
  621. desc: "und/upper/no error on short",
  622. src: "12",
  623. want: "12",
  624. atEOF: false,
  625. t: Upper(language.Und),
  626. }, {
  627. desc: "und/lower/basic",
  628. src: "ABCDEFG",
  629. want: "",
  630. atEOF: true,
  631. err: transform.ErrEndOfSpan,
  632. t: Lower(language.Und),
  633. }, {
  634. desc: "und/lower/short src num",
  635. src: "123é"[:4],
  636. want: "123",
  637. atEOF: false,
  638. err: transform.ErrShortSrc,
  639. t: Lower(language.Und),
  640. }, {
  641. desc: "und/lower/short src greek",
  642. src: "αβγé"[:7],
  643. want: "αβγ",
  644. atEOF: false,
  645. err: transform.ErrShortSrc,
  646. t: Lower(language.Und),
  647. }, {
  648. desc: "und/lower/no error on short",
  649. src: "12",
  650. want: "12",
  651. atEOF: false,
  652. t: Lower(language.Und),
  653. }, {
  654. desc: "und/lower/simple (no final sigma)",
  655. src: "ος οσσ",
  656. want: "οσ οσσ",
  657. atEOF: true,
  658. t: Lower(language.Und, HandleFinalSigma(false)),
  659. }, {
  660. desc: "und/title/simple (no final sigma)",
  661. src: "Οσ Οσσ",
  662. want: "Οσ Οσσ",
  663. atEOF: true,
  664. t: Title(language.Und, HandleFinalSigma(false)),
  665. }, {
  666. desc: "und/lower/final sigma: no error",
  667. src: "οΣ", // Oς
  668. want: "ο", // Oς
  669. err: transform.ErrEndOfSpan,
  670. t: Lower(language.Und),
  671. }, {
  672. desc: "und/title/final sigma: no error",
  673. src: "ΟΣ", // Oς
  674. want: "Ο", // Oς
  675. err: transform.ErrEndOfSpan,
  676. t: Title(language.Und),
  677. }, {
  678. desc: "und/title/final sigma: no short source!",
  679. src: "ΟσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσΣ",
  680. want: "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσ",
  681. err: transform.ErrEndOfSpan,
  682. t: Title(language.Und),
  683. }, {
  684. desc: "und/title/clipped UTF-8 rune",
  685. src: "Σσ" + string([]byte{0xCF}),
  686. want: "Σσ",
  687. atEOF: false,
  688. err: transform.ErrShortSrc,
  689. t: Title(language.Und),
  690. }, {
  691. desc: "und/title/clipped UTF-8 rune atEOF",
  692. src: "Σσσ" + string([]byte{0xCF}),
  693. want: "Σσσ" + string([]byte{0xCF}),
  694. atEOF: true,
  695. t: Title(language.Und),
  696. }, {
  697. // Note: the choice to change the final sigma at the end in case of
  698. // too many case ignorables is arbitrary. The main reason for this
  699. // choice is that it results in simpler code.
  700. desc: "und/title/long string",
  701. src: "A" + strings.Repeat("a", maxIgnorable+5),
  702. want: "A" + strings.Repeat("a", maxIgnorable+5),
  703. t: Title(language.Und),
  704. }, {
  705. // Note: the choice to change the final sigma at the end in case of
  706. // too many case ignorables is arbitrary. The main reason for this
  707. // choice is that it results in simpler code.
  708. desc: "und/title/cyrillic",
  709. src: "При",
  710. want: "При",
  711. atEOF: true,
  712. t: Title(language.Und, HandleFinalSigma(false)),
  713. }, {
  714. // Note: the choice to change the final sigma at the end in case of
  715. // too many case ignorables is arbitrary. The main reason for this
  716. // choice is that it results in simpler code.
  717. desc: "und/title/final sigma: max ignorables",
  718. src: "Οσ" + strings.Repeat(".", maxIgnorable) + "A",
  719. want: "Οσ" + strings.Repeat(".", maxIgnorable) + "A",
  720. t: Title(language.Und),
  721. }, {
  722. desc: "el/upper/max ignorables - not implemented",
  723. src: "Ο" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0313",
  724. want: "",
  725. err: transform.ErrEndOfSpan,
  726. t: Upper(language.Greek),
  727. }, {
  728. desc: "el/upper/too many ignorables - not implemented",
  729. src: "Ο" + strings.Repeat("\u0321", maxIgnorable) + "\u0313",
  730. want: "",
  731. err: transform.ErrEndOfSpan,
  732. t: Upper(language.Greek),
  733. }, {
  734. desc: "el/upper/short dst",
  735. src: "123ο",
  736. want: "",
  737. err: transform.ErrEndOfSpan,
  738. t: Upper(language.Greek),
  739. }, {
  740. desc: "lt/lower/max ignorables",
  741. src: "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300",
  742. want: "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300",
  743. t: Lower(language.Lithuanian),
  744. }, {
  745. desc: "lt/lower/isLower",
  746. src: "I" + strings.Repeat("\u0321", maxIgnorable) + "\u0300",
  747. want: "",
  748. err: transform.ErrEndOfSpan,
  749. t: Lower(language.Lithuanian),
  750. }, {
  751. desc: "lt/lower/not identical",
  752. src: "aaaaa\u00cc", // U+00CC LATIN CAPITAL LETTER I GRAVE
  753. err: transform.ErrEndOfSpan,
  754. want: "aaaaa",
  755. t: Lower(language.Lithuanian),
  756. }, {
  757. desc: "lt/lower/identical",
  758. src: "aaaai\u0307\u0300", // U+00CC LATIN CAPITAL LETTER I GRAVE
  759. want: "aaaai\u0307\u0300",
  760. t: Lower(language.Lithuanian),
  761. }, {
  762. desc: "lt/upper/not implemented",
  763. src: "I" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0300",
  764. want: "",
  765. err: transform.ErrEndOfSpan,
  766. t: Upper(language.Lithuanian),
  767. }, {
  768. desc: "lt/upper/not implemented, ascii",
  769. src: "AB",
  770. want: "",
  771. err: transform.ErrEndOfSpan,
  772. t: Upper(language.Lithuanian),
  773. }, {
  774. desc: "nl/title/pre-IJ cutoff",
  775. src: " IJ",
  776. want: " IJ",
  777. t: Title(language.Dutch),
  778. }, {
  779. desc: "nl/title/mid-IJ cutoff",
  780. src: " Ia",
  781. want: " Ia",
  782. t: Title(language.Dutch),
  783. }, {
  784. desc: "af/title/apostrophe",
  785. src: "'n Bietje",
  786. want: "'n Bietje",
  787. t: Title(language.Afrikaans),
  788. }, {
  789. desc: "af/title/apostrophe-incorrect",
  790. src: "'N Bietje",
  791. // The Single_Quote (a MidWord), needs to be retained as unspanned so
  792. // that a successive call to Transform can detect that N should not be
  793. // capitalized.
  794. want: "",
  795. err: transform.ErrEndOfSpan,
  796. t: Title(language.Afrikaans),
  797. }} {
  798. testtext.Run(t, tt.desc, func(t *testing.T) {
  799. for p := 0; p < len(tt.want); p += utf8.RuneLen([]rune(tt.src[p:])[0]) {
  800. tt.t.Reset()
  801. n, err := tt.t.Span([]byte(tt.src[:p]), false)
  802. if err != nil && err != transform.ErrShortSrc {
  803. t.Errorf("early failure:Span(%+q): %v (%d < %d)", tt.src[:p], err, n, len(tt.want))
  804. break
  805. }
  806. }
  807. tt.t.Reset()
  808. n, err := tt.t.Span([]byte(tt.src), tt.atEOF)
  809. if n != len(tt.want) || err != tt.err {
  810. t.Errorf("Span(%+q, %v): got %d, %v; want %d, %v", tt.src, tt.atEOF, n, err, len(tt.want), tt.err)
  811. }
  812. testHandover(t, tt.t, tt.src)
  813. })
  814. }
  815. }
// txtASCII is an ASCII-only benchmark input: one English pangram sentence
// (with a trailing space) repeated 50 times.
var txtASCII = strings.Repeat("The quick brown fox jumps over the lazy dog. ", 50)
// txt_vn is a Vietnamese sample text.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/vn/
const txt_vn = `Với các điều kiện sau: Ghi nhận công của tác giả. Nếu bạn sử
dụng, chuyển đổi, hoặc xây dựng dự án từ nội dung được chia sẻ này, bạn phải áp
dụng giấy phép này hoặc một giấy phép khác có các điều khoản tương tự như giấy
phép này cho dự án của bạn. Hiểu rằng: Miễn — Bất kỳ các điều kiện nào trên đây
cũng có thể được miễn bỏ nếu bạn được sự cho phép của người sở hữu bản quyền.
Phạm vi công chúng — Khi tác phẩm hoặc bất kỳ chương nào của tác phẩm đã trong
vùng dành cho công chúng theo quy định của pháp luật thì tình trạng của nó không
bị ảnh hưởng bởi giấy phép trong bất kỳ trường hợp nào.`
// txt_cn is a Chinese sample text.
// Taken from http://creativecommons.org/licenses/by-sa/2.5/cn/
const txt_cn = `您可以自由: 复制、发行、展览、表演、放映、
广播或通过信息网络传播本作品 创作演绎作品
对本作品进行商业性使用 惟须遵守下列条件:
署名 — 您必须按照作者或者许可人指定的方式对作品进行署名。
相同方式共享 — 如果您改变、转换本作品或者以本作品为基础进行创作,
您只能采用与本协议相同的许可协议发布基于本作品的演绎作品。`
// txt_ru is a Russian sample text. Note that it contains one Latin word
// ("Attribution") and ends with a short Greek phrase.
// Taken from http://creativecommons.org/licenses/by-sa/1.0/deed.ru
const txt_ru = `При обязательном соблюдении следующих условий: Attribution — Вы
должны атрибутировать произведение (указывать автора и источник) в порядке,
предусмотренном автором или лицензиаром (но только так, чтобы никоим образом не
подразумевалось, что они поддерживают вас или использование вами данного
произведения). Υπό τις ακόλουθες προϋποθέσεις:`
// txt_gr is a Greek sample text.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/gr/
const txt_gr = `Αναφορά Δημιουργού — Θα πρέπει να κάνετε την αναφορά στο έργο με
τον τρόπο που έχει οριστεί από το δημιουργό ή το χορηγούντο την άδεια (χωρίς
όμως να εννοείται με οποιονδήποτε τρόπο ότι εγκρίνουν εσάς ή τη χρήση του έργου
από εσάς). Παρόμοια Διανομή — Εάν αλλοιώσετε, τροποποιήσετε ή δημιουργήσετε
περαιτέρω βασισμένοι στο έργο θα μπορείτε να διανέμετε το έργο που θα προκύψει
μόνο με την ίδια ή παρόμοια άδεια.`
// txtNonASCII is the non-ASCII input used by TestAlloc and BenchmarkCasers:
// the Vietnamese, Chinese, Russian and Greek samples concatenated in that
// order with no separator between them.
const txtNonASCII = txt_vn + txt_cn + txt_ru + txt_gr
  847. // TODO: Improve ASCII performance.
  848. func BenchmarkCasers(b *testing.B) {
  849. for _, s := range []struct{ name, text string }{
  850. {"ascii", txtASCII},
  851. {"nonASCII", txtNonASCII},
  852. {"short", "При"},
  853. } {
  854. src := []byte(s.text)
  855. // Measure case mappings in bytes package for comparison.
  856. for _, f := range []struct {
  857. name string
  858. fn func(b []byte) []byte
  859. }{
  860. {"lower", bytes.ToLower},
  861. {"title", bytes.ToTitle},
  862. {"upper", bytes.ToUpper},
  863. } {
  864. testtext.Bench(b, path.Join(s.name, "bytes", f.name), func(b *testing.B) {
  865. b.SetBytes(int64(len(src)))
  866. for i := 0; i < b.N; i++ {
  867. f.fn(src)
  868. }
  869. })
  870. }
  871. for _, t := range []struct {
  872. name string
  873. caser transform.SpanningTransformer
  874. }{
  875. {"fold/default", Fold()},
  876. {"upper/default", Upper(language.Und)},
  877. {"lower/sigma", Lower(language.Und)},
  878. {"lower/simple", Lower(language.Und, HandleFinalSigma(false))},
  879. {"title/sigma", Title(language.Und)},
  880. {"title/simple", Title(language.Und, HandleFinalSigma(false))},
  881. } {
  882. c := Caser{t.caser}
  883. dst := make([]byte, len(src))
  884. testtext.Bench(b, path.Join(s.name, t.name, "transform"), func(b *testing.B) {
  885. b.SetBytes(int64(len(src)))
  886. for i := 0; i < b.N; i++ {
  887. c.Reset()
  888. c.Transform(dst, src, true)
  889. }
  890. })
  891. // No need to check span for simple cases, as they will be the same
  892. // as sigma.
  893. if strings.HasSuffix(t.name, "/simple") {
  894. continue
  895. }
  896. spanSrc := c.Bytes(src)
  897. testtext.Bench(b, path.Join(s.name, t.name, "span"), func(b *testing.B) {
  898. c.Reset()
  899. if n, _ := c.Span(spanSrc, true); n < len(spanSrc) {
  900. b.Fatalf("spanner is not recognizing text %q as done (at %d)", spanSrc, n)
  901. }
  902. b.SetBytes(int64(len(spanSrc)))
  903. for i := 0; i < b.N; i++ {
  904. c.Reset()
  905. c.Span(spanSrc, true)
  906. }
  907. })
  908. }
  909. }
  910. }