You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

1288 lines
42 KiB

  1. // Copyright 2011 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package norm
  5. import (
  6. "bytes"
  7. "flag"
  8. "fmt"
  9. "io"
  10. "log"
  11. "strings"
  12. "testing"
  13. "unicode/utf8"
  14. "golang.org/x/text/internal/testtext"
  15. "golang.org/x/text/transform"
  16. )
  17. var (
  18. testn = flag.Int("testn", -1, "specific test number to run or -1 for all")
  19. )
  20. // pc replaces any rune r that is repeated n times, for n > 1, with r{n}.
  21. func pc(s string) []byte {
  22. b := bytes.NewBuffer(make([]byte, 0, len(s)))
  23. for i := 0; i < len(s); {
  24. r, sz := utf8.DecodeRuneInString(s[i:])
  25. n := 0
  26. if sz == 1 {
  27. // Special-case one-byte case to handle repetition for invalid UTF-8.
  28. for c := s[i]; i+n < len(s) && s[i+n] == c; n++ {
  29. }
  30. } else {
  31. for _, r2 := range s[i:] {
  32. if r2 != r {
  33. break
  34. }
  35. n++
  36. }
  37. }
  38. b.WriteString(s[i : i+sz])
  39. if n > 1 {
  40. fmt.Fprintf(b, "{%d}", n)
  41. }
  42. i += sz * n
  43. }
  44. return b.Bytes()
  45. }
  46. // pidx finds the index from which two strings start to differ, plus context.
  47. // It returns the index and ellipsis if the index is greater than 0.
  48. func pidx(a, b string) (i int, prefix string) {
  49. for ; i < len(a) && i < len(b) && a[i] == b[i]; i++ {
  50. }
  51. if i < 8 {
  52. return 0, ""
  53. }
  54. i -= 3 // ensure taking at least one full rune before the difference.
  55. for k := i - 7; i > k && !utf8.RuneStart(a[i]); i-- {
  56. }
  57. return i, "..."
  58. }
// PositionTest is one case for runPosTests: an input string together with
// the position and buffer contents the function under test should return.
// Tables in this file fill it positionally, so the field order matters.
type PositionTest struct {
	input  string // text handed to the function under test
	pos    int    // expected int result (a byte offset, or a 0/1 flag for the IsNormal tests)
	buffer string // expected contents of reorderBuffer, if applicable
}
// positionFunc adapts a function under test to runPosTests: given the shared
// reorderBuffer and an input string it returns the position and output bytes
// that runPosTests compares with PositionTest.pos and PositionTest.buffer.
type positionFunc func(rb *reorderBuffer, s string) (int, []byte)
  65. func runPosTests(t *testing.T, name string, f Form, fn positionFunc, tests []PositionTest) {
  66. rb := reorderBuffer{}
  67. rb.init(f, nil)
  68. for i, test := range tests {
  69. rb.reset()
  70. rb.src = inputString(test.input)
  71. rb.nsrc = len(test.input)
  72. pos, out := fn(&rb, test.input)
  73. if pos != test.pos {
  74. t.Errorf("%s:%d: position is %d; want %d", name, i, pos, test.pos)
  75. }
  76. if outs := string(out); outs != test.buffer {
  77. k, pfx := pidx(outs, test.buffer)
  78. t.Errorf("%s:%d: buffer \nwas %s%+q; \nwant %s%+q", name, i, pfx, pc(outs[k:]), pfx, pc(test.buffer[k:]))
  79. }
  80. }
  81. }
  82. func grave(n int) string {
  83. return rep(0x0300, n)
  84. }
  85. func rep(r rune, n int) string {
  86. return strings.Repeat(string(r), n)
  87. }
// segSize is a short local name for maxByteBufferSize. The NFKD append
// tests use it to build an ASCII run of that length ahead of an overflowing
// sequence of combining characters.
const segSize = maxByteBufferSize
// cgj is a short local name for GraphemeJoiner. The expected outputs in the
// tables below place it where a run of more than 30 non-starters is split.
var cgj = GraphemeJoiner
// decomposeSegmentTests are the cases for TestDecomposeSegment. Each entry is
// {input, pos, buffer}: pos is the value decomposeSegment is expected to
// return and buffer the expected contents of rb.out afterwards (see
// decomposeSegmentF).
var decomposeSegmentTests = []PositionTest{
	// illegal runes
	{"\xC2", 0, ""},
	{"\xC0", 1, "\xC0"},
	{"\u00E0\x80", 2, "\u0061\u0300"},
	// starter
	{"a", 1, "a"},
	{"ab", 1, "a"},
	// starter + composing
	{"a\u0300", 3, "a\u0300"},
	{"a\u0300b", 3, "a\u0300"},
	// with decomposition
	{"\u00C0", 2, "A\u0300"},
	{"\u00C0b", 2, "A\u0300"},
	// long
	{grave(31), 60, grave(30) + cgj},
	{"a" + grave(31), 61, "a" + grave(30) + cgj},
	// Stability tests: see https://www.unicode.org/review/pr-29.html.
	// U+0300 COMBINING GRAVE ACCENT;Mn;230;NSM;;;;;N;NON-SPACING GRAVE;;;;
	// U+0B47 ORIYA VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
	// U+0B3E ORIYA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
	// U+1100 HANGUL CHOSEONG KIYEOK;Lo;0;L;;;;;N;;;;;
	// U+1161 HANGUL JUNGSEONG A;Lo;0;L;;;;;N;;;;;
	{"\u0B47\u0300\u0B3E", 8, "\u0B47\u0300\u0B3E"},
	{"\u1100\u0300\u1161", 8, "\u1100\u0300\u1161"},
	{"\u0B47\u0B3E", 6, "\u0B47\u0B3E"},
	{"\u1100\u1161", 6, "\u1100\u1161"},
	// U+0D4A MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
	// Sequence of decomposing characters that are starters and modifiers.
	{"\u0d4a" + strings.Repeat("\u0d3e", 31), 90, "\u0d46" + strings.Repeat("\u0d3e", 30) + cgj},
	{grave(30), 60, grave(30)},
	// U+FF9E is a starter, but decomposes to U+3099, which is not.
	{grave(30) + "\uff9e", 60, grave(30) + cgj},
	// ends with incomplete UTF-8 encoding
	{"\xCC", 0, ""},
	{"\u0300\xCC", 2, "\u0300"},
}
  127. func decomposeSegmentF(rb *reorderBuffer, s string) (int, []byte) {
  128. rb.initString(NFD, s)
  129. rb.setFlusher(nil, appendFlush)
  130. p := decomposeSegment(rb, 0, true)
  131. return p, rb.out
  132. }
  133. func TestDecomposeSegment(t *testing.T) {
  134. runPosTests(t, "TestDecomposeSegment", NFC, decomposeSegmentF, decomposeSegmentTests)
  135. }
// firstBoundaryTests are the cases shared by FirstBoundary and
// FirstBoundaryInString (see TestFirstBoundary). pos is the expected return
// value. buffer is always empty because the adapters return a nil slice.
var firstBoundaryTests = []PositionTest{
	// no boundary
	{"", -1, ""},
	{"\u0300", -1, ""},
	{"\x80\x80", -1, ""},
	// illegal runes
	{"\xff", 0, ""},
	{"\u0300\xff", 2, ""},
	{"\u0300\xc0\x80\x80", 2, ""},
	// boundaries
	{"a", 0, ""},
	{"\u0300a", 2, ""},
	// Hangul
	{"\u1103\u1161", 0, ""},
	{"\u110B\u1173\u11B7", 0, ""},
	{"\u1161\u110B\u1173\u11B7", 3, ""},
	{"\u1173\u11B7\u1103\u1161", 6, ""},
	// too many combining characters.
	{grave(maxNonStarters - 1), -1, ""},
	{grave(maxNonStarters), 60, ""},
	{grave(maxNonStarters + 1), 60, ""},
}
  158. func firstBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
  159. return rb.f.form.FirstBoundary([]byte(s)), nil
  160. }
  161. func firstBoundaryStringF(rb *reorderBuffer, s string) (int, []byte) {
  162. return rb.f.form.FirstBoundaryInString(s), nil
  163. }
  164. func TestFirstBoundary(t *testing.T) {
  165. runPosTests(t, "TestFirstBoundary", NFC, firstBoundaryF, firstBoundaryTests)
  166. runPosTests(t, "TestFirstBoundaryInString", NFC, firstBoundaryStringF, firstBoundaryTests)
  167. }
  168. func TestNextBoundary(t *testing.T) {
  169. testCases := []struct {
  170. input string
  171. atEOF bool
  172. want int
  173. }{
  174. // no boundary
  175. {"", true, 0},
  176. {"", false, -1},
  177. {"\u0300", true, 2},
  178. {"\u0300", false, -1},
  179. {"\x80\x80", true, 1},
  180. {"\x80\x80", false, 1},
  181. // illegal runes
  182. {"\xff", false, 1},
  183. {"\u0300\xff", false, 2},
  184. {"\u0300\xc0\x80\x80", false, 2},
  185. {"\xc2\x80\x80", false, 2},
  186. {"\xc2", false, -1},
  187. {"\xc2", true, 1},
  188. {"a\u0300\xc2", false, -1},
  189. {"a\u0300\xc2", true, 3},
  190. // boundaries
  191. {"a", true, 1},
  192. {"a", false, -1},
  193. {"aa", false, 1},
  194. {"\u0300", true, 2},
  195. {"\u0300", false, -1},
  196. {"\u0300a", false, 2},
  197. // Hangul
  198. {"\u1103\u1161", true, 6},
  199. {"\u1103\u1161", false, -1},
  200. {"\u110B\u1173\u11B7", false, -1},
  201. {"\u110B\u1173\u11B7\u110B\u1173\u11B7", false, 9},
  202. {"\u1161\u110B\u1173\u11B7", false, 3},
  203. {"\u1173\u11B7\u1103\u1161", false, 6},
  204. // too many combining characters.
  205. {grave(maxNonStarters - 1), false, -1},
  206. {grave(maxNonStarters), false, 60},
  207. {grave(maxNonStarters + 1), false, 60},
  208. }
  209. for _, tc := range testCases {
  210. if got := NFC.NextBoundary([]byte(tc.input), tc.atEOF); got != tc.want {
  211. t.Errorf("NextBoundary(%+q, %v) = %d; want %d", tc.input, tc.atEOF, got, tc.want)
  212. }
  213. if got := NFC.NextBoundaryInString(tc.input, tc.atEOF); got != tc.want {
  214. t.Errorf("NextBoundaryInString(%+q, %v) = %d; want %d", tc.input, tc.atEOF, got, tc.want)
  215. }
  216. }
  217. }
// decomposeToLastTests are the cases for TestDecomposeToLastBoundary, which
// runs them under NFKC. For each entry the input is installed as the existing
// output and decomposeToLastBoundary is applied (see decomposeToLast). pos is
// the expected length of rb.out afterwards and buffer is what flushing the
// reorder buffer should yield.
var decomposeToLastTests = []PositionTest{
	// ends with inert character
	{"Hello!", 6, ""},
	{"\u0632", 2, ""},
	{"a\u0301\u0635", 5, ""},
	// ends with non-inert starter
	{"a", 0, "a"},
	{"a\u0301a", 3, "a"},
	{"a\u0301\u03B9", 3, "\u03B9"},
	{"a\u0327", 0, "a\u0327"},
	// illegal runes
	{"\xFF", 1, ""},
	{"aa\xFF", 3, ""},
	{"\xC0\x80\x80", 3, ""},
	{"\xCC\x80\x80", 3, ""},
	// ends with incomplete UTF-8 encoding
	{"a\xCC", 2, ""},
	// ends with combining characters
	{"\u0300\u0301", 0, "\u0300\u0301"},
	{"a\u0300\u0301", 0, "a\u0300\u0301"},
	{"a\u0301\u0308", 0, "a\u0301\u0308"},
	{"a\u0308\u0301", 0, "a\u0308\u0301"},
	{"aaaa\u0300\u0301", 3, "a\u0300\u0301"},
	{"\u0300a\u0300\u0301", 2, "a\u0300\u0301"},
	{"\u00C0", 0, "A\u0300"},
	{"a\u00C0", 1, "A\u0300"},
	// decomposing
	{"a\u0300\u00E0", 3, "a\u0300"},
	// multisegment decompositions (flushes leading segments)
	{"a\u0300\uFDC0", 7, "\u064A"},
	{"\uFDC0" + grave(29), 4, "\u064A" + grave(29)},
	{"\uFDC0" + grave(30), 4, "\u064A" + grave(30)},
	{"\uFDC0" + grave(31), 5, grave(30)},
	{"\uFDFA" + grave(14), 31, "\u0645" + grave(14)},
	// Overflow
	{"\u00E0" + grave(29), 0, "a" + grave(30)},
	{"\u00E0" + grave(30), 2, grave(30)},
	// Hangul
	{"a\u1103", 1, "\u1103"},
	{"a\u110B", 1, "\u110B"},
	{"a\u110B\u1173", 1, "\u110B\u1173"},
	// See comment in composition.go:compBoundaryAfter.
	{"a\u110B\u1173\u11B7", 1, "\u110B\u1173\u11B7"},
	{"a\uC73C", 1, "\u110B\u1173"},
	{"다음", 3, "\u110B\u1173\u11B7"},
	{"다", 0, "\u1103\u1161"},
	{"\u1103\u1161\u110B\u1173\u11B7", 6, "\u110B\u1173\u11B7"},
	{"\u110B\u1173\u11B7\u1103\u1161", 9, "\u1103\u1161"},
	{"다음음", 6, "\u110B\u1173\u11B7"},
	{"음다다", 6, "\u1103\u1161"},
	// maximized buffer
	{"a" + grave(30), 0, "a" + grave(30)},
	// Buffer overflow
	{"a" + grave(31), 3, grave(30)},
	// weird UTF-8
	{"a\u0300\u11B7", 0, "a\u0300\u11B7"},
}
  275. func decomposeToLast(rb *reorderBuffer, s string) (int, []byte) {
  276. rb.setFlusher([]byte(s), appendFlush)
  277. decomposeToLastBoundary(rb)
  278. buf := rb.flush(nil)
  279. return len(rb.out), buf
  280. }
  281. func TestDecomposeToLastBoundary(t *testing.T) {
  282. runPosTests(t, "TestDecomposeToLastBoundary", NFKC, decomposeToLast, decomposeToLastTests)
  283. }
// lastBoundaryTests are the cases for TestLastBoundary, run under NFC. pos is
// the expected return value of LastBoundary. buffer is always empty because
// lastBoundaryF returns a nil slice.
var lastBoundaryTests = []PositionTest{
	// ends with inert character
	{"Hello!", 6, ""},
	{"\u0632", 2, ""},
	// ends with non-inert starter
	{"a", 0, ""},
	// illegal runes
	{"\xff", 1, ""},
	{"aa\xff", 3, ""},
	{"a\xff\u0300", 1, ""}, // TODO: should probably be 2.
	{"\xc0\x80\x80", 3, ""},
	{"\xc0\x80\x80\u0300", 3, ""},
	// ends with incomplete UTF-8 encoding
	{"\xCC", -1, ""},
	{"\xE0\x80", -1, ""},
	{"\xF0\x80\x80", -1, ""},
	{"a\xCC", 0, ""},
	{"\x80\xCC", 1, ""},
	{"\xCC\xCC", 1, ""},
	// ends with combining characters
	{"a\u0300\u0301", 0, ""},
	{"aaaa\u0300\u0301", 3, ""},
	{"\u0300a\u0300\u0301", 2, ""},
	{"\u00C2", 0, ""},
	{"a\u00C2", 1, ""},
	// decomposition may recombine
	{"\u0226", 0, ""},
	// no boundary
	{"", -1, ""},
	{"\u0300\u0301", -1, ""},
	{"\u0300", -1, ""},
	{"\x80\x80", -1, ""},
	{"\x80\x80\u0301", -1, ""},
	// Hangul
	{"다음", 3, ""},
	{"다", 0, ""},
	{"\u1103\u1161\u110B\u1173\u11B7", 6, ""},
	{"\u110B\u1173\u11B7\u1103\u1161", 9, ""},
	// too many combining characters.
	{grave(maxNonStarters - 1), -1, ""},
	// May still be preceded with a non-starter.
	{grave(maxNonStarters), -1, ""},
	// May still need to insert a cgj after the last combiner.
	{grave(maxNonStarters + 1), 2, ""},
	{grave(maxNonStarters + 2), 4, ""},
	{"a" + grave(maxNonStarters-1), 0, ""},
	{"a" + grave(maxNonStarters), 0, ""},
	// May still need to insert a cgj after the last combiner.
	{"a" + grave(maxNonStarters+1), 3, ""},
	{"a" + grave(maxNonStarters+2), 5, ""},
}
  335. func lastBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
  336. return rb.f.form.LastBoundary([]byte(s)), nil
  337. }
  338. func TestLastBoundary(t *testing.T) {
  339. runPosTests(t, "TestLastBoundary", NFC, lastBoundaryF, lastBoundaryTests)
  340. }
// spanTest is one case for runSpanTests. Tables fill it positionally, so the
// field order matters.
type spanTest struct {
	input string // text passed to Span and SpanString
	atEOF bool   // atEOF argument passed to Span and SpanString
	n     int    // expected int result
	err   error  // expected error result, compared with ==
}
// quickSpanTests are Span cases whose expectations hold for both NFD and
// NFC; TestSpan runs them under each form.
var quickSpanTests = []spanTest{
	{"", true, 0, nil},
	// starters
	{"a", true, 1, nil},
	{"abc", true, 3, nil},
	{"\u043Eb", true, 3, nil},
	// incomplete last rune.
	{"\xCC", true, 1, nil},
	{"\xCC", false, 0, transform.ErrShortSrc},
	{"a\xCC", true, 2, nil},
	{"a\xCC", false, 0, transform.ErrShortSrc}, // TODO: could be 1 for NFD
	// incorrectly ordered combining characters
	{"\u0300\u0316", true, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316", false, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316cd", true, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316cd", false, 0, transform.ErrEndOfSpan},
	// have a maximum number of combining characters.
	{rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"a" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"Ɵ" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"aa" + rep(0x035D, 30) + "\u035B", true, 1, transform.ErrEndOfSpan},
	{rep(0x035D, 30) + cgj + "\u035B", true, 64, nil},
	{"a" + rep(0x035D, 30) + cgj + "\u035B", true, 65, nil},
	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil},
	{"aa" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil},
	{"a" + rep(0x035D, 30) + cgj + "\u035B", false, 61, transform.ErrShortSrc},
	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc},
	{"aa" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc},
}
// quickSpanNFDTests are Span cases whose expectations are specific to NFD
// (see TestSpan).
var quickSpanNFDTests = []spanTest{
	// needs decomposing
	{"\u00C0", true, 0, transform.ErrEndOfSpan},
	{"abc\u00C0", true, 3, transform.ErrEndOfSpan},
	// correctly ordered combining characters
	{"\u0300", true, 2, nil},
	{"ab\u0300", true, 4, nil},
	{"ab\u0300cd", true, 6, nil},
	{"\u0300cd", true, 4, nil},
	{"\u0316\u0300", true, 4, nil},
	{"ab\u0316\u0300", true, 6, nil},
	{"ab\u0316\u0300cd", true, 8, nil},
	{"ab\u0316\u0300\u00C0", true, 6, transform.ErrEndOfSpan},
	{"\u0316\u0300cd", true, 6, nil},
	{"\u043E\u0308b", true, 5, nil},
	// incorrectly ordered combining characters
	{"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan}, // TODO: we could skip 'b' as well.
	{"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan},
	// Hangul
	{"같은", true, 0, transform.ErrEndOfSpan},
}
// quickSpanNFCTests are Span cases whose expectations are specific to NFC
// (see TestSpan).
var quickSpanNFCTests = []spanTest{
	// okay composed
	{"\u00C0", true, 2, nil},
	{"abc\u00C0", true, 5, nil},
	// correctly ordered combining characters
	// TODO: b may combine with modifiers, which is why this fails. We could
	// make a more precise test that actually checks whether last
	// characters combines. Probably not worth it.
	{"ab\u0300", true, 1, transform.ErrEndOfSpan},
	{"ab\u0300cd", true, 1, transform.ErrEndOfSpan},
	{"ab\u0316\u0300", true, 1, transform.ErrEndOfSpan},
	{"ab\u0316\u0300cd", true, 1, transform.ErrEndOfSpan},
	{"\u00C0\u035D", true, 4, nil},
	// we do not special case leading combining characters
	{"\u0300cd", true, 0, transform.ErrEndOfSpan},
	{"\u0300", true, 0, transform.ErrEndOfSpan},
	{"\u0316\u0300", true, 0, transform.ErrEndOfSpan},
	{"\u0316\u0300cd", true, 0, transform.ErrEndOfSpan},
	// incorrectly ordered combining characters
	{"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan},
	{"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan},
	// Hangul
	{"같은", true, 6, nil},
	{"같은", false, 3, transform.ErrShortSrc},
	// We return the start of the violating segment in case of overflow.
	{grave(30) + "\uff9e", true, 0, transform.ErrEndOfSpan},
	{grave(30), true, 0, transform.ErrEndOfSpan},
}
  425. func runSpanTests(t *testing.T, name string, f Form, testCases []spanTest) {
  426. for i, tc := range testCases {
  427. s := fmt.Sprintf("Bytes/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF)
  428. ok := testtext.Run(t, s, func(t *testing.T) {
  429. n, err := f.Span([]byte(tc.input), tc.atEOF)
  430. if n != tc.n || err != tc.err {
  431. t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err)
  432. }
  433. })
  434. if !ok {
  435. continue // Don't do the String variant if the Bytes variant failed.
  436. }
  437. s = fmt.Sprintf("String/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF)
  438. testtext.Run(t, s, func(t *testing.T) {
  439. n, err := f.SpanString(tc.input, tc.atEOF)
  440. if n != tc.n || err != tc.err {
  441. t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err)
  442. }
  443. })
  444. }
  445. }
  446. func TestSpan(t *testing.T) {
  447. runSpanTests(t, "NFD", NFD, quickSpanTests)
  448. runSpanTests(t, "NFD", NFD, quickSpanNFDTests)
  449. runSpanTests(t, "NFC", NFC, quickSpanTests)
  450. runSpanTests(t, "NFC", NFC, quickSpanNFCTests)
  451. }
// isNormalTests are IsNormal cases run under all four forms (see
// TestIsNormal). pos is 1 when the input is expected to be reported as
// normalized and 0 otherwise. buffer is unused and always empty.
var isNormalTests = []PositionTest{
	{"", 1, ""},
	// illegal runes
	{"\xff", 1, ""},
	// starters
	{"a", 1, ""},
	{"abc", 1, ""},
	{"\u043Eb", 1, ""},
	// incorrectly ordered combining characters
	{"\u0300\u0316", 0, ""},
	{"ab\u0300\u0316", 0, ""},
	{"ab\u0300\u0316cd", 0, ""},
	{"\u0300\u0316cd", 0, ""},
}
// isNormalNFDTests are IsNormal cases run under the decomposing forms NFD and
// NFKD (see TestIsNormal). pos is 1 for "normalized" and 0 otherwise.
var isNormalNFDTests = []PositionTest{
	// needs decomposing
	{"\u00C0", 0, ""},
	{"abc\u00C0", 0, ""},
	// correctly ordered combining characters
	{"\u0300", 1, ""},
	{"ab\u0300", 1, ""},
	{"ab\u0300cd", 1, ""},
	{"\u0300cd", 1, ""},
	{"\u0316\u0300", 1, ""},
	{"ab\u0316\u0300", 1, ""},
	{"ab\u0316\u0300cd", 1, ""},
	{"\u0316\u0300cd", 1, ""},
	{"\u043E\u0308b", 1, ""},
	// Hangul
	{"같은", 0, ""},
}
// isNormalNFCTests are IsNormal cases run under the composing forms NFC and
// NFKC (see TestIsNormal). pos is 1 for "normalized" and 0 otherwise.
var isNormalNFCTests = []PositionTest{
	// okay composed
	{"\u00C0", 1, ""},
	{"abc\u00C0", 1, ""},
	// needs composing: 'a' followed by U+0300 composes to U+00E0, with or
	// without an intervening U+0316 (compare the append tests, which expect
	// "a\u0300b" to become "\u00E0b")
	{"a\u0300", 0, ""},
	{"a\u0300cd", 0, ""},
	{"a\u0316\u0300", 0, ""},
	{"a\u0316\u0300cd", 0, ""},
	// correctly ordered combining characters
	{"ab\u0300", 1, ""},
	{"ab\u0300cd", 1, ""},
	{"ab\u0316\u0300", 1, ""},
	{"ab\u0316\u0300cd", 1, ""},
	{"\u00C0\u035D", 1, ""},
	{"\u0300", 1, ""},
	{"\u0316\u0300cd", 1, ""},
	// Hangul
	{"같은", 1, ""},
}
// isNormalNFKXTests are IsNormal cases run only under the compatibility
// forms NFKD and NFKC (see TestIsNormal).
var isNormalNFKXTests = []PositionTest{
	// Special case.
	{"\u00BC", 0, ""},
}
  507. func isNormalF(rb *reorderBuffer, s string) (int, []byte) {
  508. if rb.f.form.IsNormal([]byte(s)) {
  509. return 1, nil
  510. }
  511. return 0, nil
  512. }
  513. func isNormalStringF(rb *reorderBuffer, s string) (int, []byte) {
  514. if rb.f.form.IsNormalString(s) {
  515. return 1, nil
  516. }
  517. return 0, nil
  518. }
  519. func TestIsNormal(t *testing.T) {
  520. runPosTests(t, "TestIsNormalNFD1", NFD, isNormalF, isNormalTests)
  521. runPosTests(t, "TestIsNormalNFD2", NFD, isNormalF, isNormalNFDTests)
  522. runPosTests(t, "TestIsNormalNFC1", NFC, isNormalF, isNormalTests)
  523. runPosTests(t, "TestIsNormalNFC2", NFC, isNormalF, isNormalNFCTests)
  524. runPosTests(t, "TestIsNormalNFKD1", NFKD, isNormalF, isNormalTests)
  525. runPosTests(t, "TestIsNormalNFKD2", NFKD, isNormalF, isNormalNFDTests)
  526. runPosTests(t, "TestIsNormalNFKD3", NFKD, isNormalF, isNormalNFKXTests)
  527. runPosTests(t, "TestIsNormalNFKC1", NFKC, isNormalF, isNormalTests)
  528. runPosTests(t, "TestIsNormalNFKC2", NFKC, isNormalF, isNormalNFCTests)
  529. runPosTests(t, "TestIsNormalNFKC3", NFKC, isNormalF, isNormalNFKXTests)
  530. }
  531. func TestIsNormalString(t *testing.T) {
  532. runPosTests(t, "TestIsNormalNFD1", NFD, isNormalStringF, isNormalTests)
  533. runPosTests(t, "TestIsNormalNFD2", NFD, isNormalStringF, isNormalNFDTests)
  534. runPosTests(t, "TestIsNormalNFC1", NFC, isNormalStringF, isNormalTests)
  535. runPosTests(t, "TestIsNormalNFC2", NFC, isNormalStringF, isNormalNFCTests)
  536. }
// AppendTest is one case for runAppendTests. Tables fill it positionally, so
// the field order matters.
type AppendTest struct {
	left  string // text already present in the output buffer
	right string // text to be appended to left
	out   string // expected result for the form the table belongs to
}
// appendFunc is the operation exercised by runAppendTests: it combines the
// existing output out with s under form f and returns the result.
type appendFunc func(f Form, out []byte, s string) []byte
// fstr holds the subtest names for each form; runAppendTests indexes it with
// a Form value, so the order must match the order of the Form constants.
var fstr = []string{"NFC", "NFD", "NFKC", "NFKD"}
  544. func runNormTests(t *testing.T, name string, fn appendFunc) {
  545. for f := NFC; f <= NFKD; f++ {
  546. runAppendTests(t, name, f, fn, normTests[f])
  547. }
  548. }
  549. func runAppendTests(t *testing.T, name string, f Form, fn appendFunc, tests []AppendTest) {
  550. for i, test := range tests {
  551. t.Run(fmt.Sprintf("%s/%d", fstr[f], i), func(t *testing.T) {
  552. id := pc(test.left + test.right)
  553. if *testn >= 0 && i != *testn {
  554. return
  555. }
  556. t.Run("fn", func(t *testing.T) {
  557. out := []byte(test.left)
  558. have := string(fn(f, out, test.right))
  559. if len(have) != len(test.out) {
  560. t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(test.out), pc(have), pc(test.out))
  561. }
  562. if have != test.out {
  563. k, pf := pidx(have, test.out)
  564. t.Errorf("%+q:\nwas %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(test.out[k:]))
  565. }
  566. })
  567. // Bootstrap by normalizing input. Ensures that the various variants
  568. // behave the same.
  569. for g := NFC; g <= NFKD; g++ {
  570. if f == g {
  571. continue
  572. }
  573. t.Run(fstr[g], func(t *testing.T) {
  574. want := g.String(test.left + test.right)
  575. have := string(fn(g, g.AppendString(nil, test.left), test.right))
  576. if len(have) != len(want) {
  577. t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(want), pc(have), pc(want))
  578. }
  579. if have != want {
  580. k, pf := pidx(have, want)
  581. t.Errorf("%+q:\nwas %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(want[k:]))
  582. }
  583. })
  584. }
  585. })
  586. }
  587. }
// normTests holds one append table per form. runNormTests indexes it with a
// Form value, so the order must match the order of the Form constants (and
// of fstr).
var normTests = [][]AppendTest{
	appendTestsNFC,
	appendTestsNFD,
	appendTestsNFKC,
	appendTestsNFKD,
}
// appendTestsNFC are the append cases whose expected output is in NFC. Each
// entry is {left, right, out}.
var appendTestsNFC = []AppendTest{
	{"", ascii, ascii},
	{"", txt_all, txt_all},
	{"\uff9e", grave(30), "\uff9e" + grave(29) + cgj + grave(1)},
	{grave(30), "\uff9e", grave(30) + cgj + "\uff9e"},
	// Tests designed for Iter.
	{ // ordering of non-composing combining characters
		"",
		"\u0305\u0316",
		"\u0316\u0305",
	},
	{ // segment overflow
		"",
		"a" + rep(0x0305, maxNonStarters+4) + "\u0316",
		"a" + rep(0x0305, maxNonStarters) + cgj + "\u0316" + rep(0x305, 4),
	},
	{ // Combine across non-blocking non-starters.
		// U+0327 COMBINING CEDILLA;Mn;202;NSM;;;;;N;NON-SPACING CEDILLA;;;;
		// U+0325 COMBINING RING BELOW;Mn;220;NSM;;;;;N;NON-SPACING RING BELOW;;;;
		"", "a\u0327\u0325", "\u1e01\u0327",
	},
	{ // Jamo V+T does not combine.
		"",
		"\u1161\u11a8",
		"\u1161\u11a8",
	},
	// Stability tests: see https://www.unicode.org/review/pr-29.html.
	{"", "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"},
	{"", "\u1100\u0300\u1161", "\u1100\u0300\u1161"},
	{"", "\u0b47\u0b3e", "\u0b4b"},
	{"", "\u1100\u1161", "\uac00"},
	// U+0D4A MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
	{ // 0d4a starts a new segment.
		"",
		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
	},
	{ // Split combining characters.
		// TODO: don't insert CGJ before starters.
		"",
		"\u0d46" + strings.Repeat("\u0d3e", 31),
		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
	},
	{ // Split combining characters.
		"",
		"\u0d4a" + strings.Repeat("\u0d3e", 30),
		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
	},
	{ // https://golang.org/issues/20079
		"",
		"\xeb\u0344",
		"\xeb\u0308\u0301",
	},
	{ // https://golang.org/issues/20079
		"",
		"\uac00" + strings.Repeat("\u0300", 30),
		"\uac00" + strings.Repeat("\u0300", 29) + "\u034f\u0300",
	},
	{ // https://golang.org/issues/20079
		"",
		"\xeb" + strings.Repeat("\u0300", 31),
		"\xeb" + strings.Repeat("\u0300", 30) + "\u034f\u0300",
	},
}
// appendTestsNFD is the NFD slot of normTests. It is currently empty, so NFD
// is only exercised through the cross-form subtests that runAppendTests
// adds for the other tables.
var appendTestsNFD = []AppendTest{
	// TODO: Move some of the tests here.
}
// appendTestsNFKC are the append cases whose expected output is in NFKC.
// Each entry is {left, right, out}; many split one segment or one UTF-8
// encoding across left and right.
var appendTestsNFKC = []AppendTest{
	// empty buffers
	{"", "", ""},
	{"a", "", "a"},
	{"", "a", "a"},
	{"", "\u0041\u0307\u0304", "\u01E0"},
	// segment split across buffers
	{"", "a\u0300b", "\u00E0b"},
	{"a", "\u0300b", "\u00E0b"},
	{"a", "\u0300\u0316", "\u00E0\u0316"},
	{"a", "\u0316\u0300", "\u00E0\u0316"},
	{"a", "\u0300a\u0300", "\u00E0\u00E0"},
	{"a", "\u0300a\u0300a\u0300", "\u00E0\u00E0\u00E0"},
	{"a", "\u0300aaa\u0300aaa\u0300", "\u00E0aa\u00E0aa\u00E0"},
	{"a\u0300", "\u0327", "\u00E0\u0327"},
	{"a\u0327", "\u0300", "\u00E0\u0327"},
	{"a\u0316", "\u0300", "\u00E0\u0316"},
	{"\u0041\u0307", "\u0304", "\u01E0"},
	// Hangul
	{"", "\u110B\u1173", "\uC73C"},
	{"", "\u1103\u1161", "\uB2E4"},
	{"", "\u110B\u1173\u11B7", "\uC74C"},
	{"", "\u320E", "\x28\uAC00\x29"},
	{"", "\x28\u1100\u1161\x29", "\x28\uAC00\x29"},
	{"\u1103", "\u1161", "\uB2E4"},
	{"\u110B", "\u1173\u11B7", "\uC74C"},
	{"\u110B\u1173", "\u11B7", "\uC74C"},
	{"\uC73C", "\u11B7", "\uC74C"},
	// UTF-8 encoding split across buffers
	{"a\xCC", "\x80", "\u00E0"},
	{"a\xCC", "\x80b", "\u00E0b"},
	{"a\xCC", "\x80a\u0300", "\u00E0\u00E0"},
	{"a\xCC", "\x80\x80", "\u00E0\x80"},
	{"a\xCC", "\x80\xCC", "\u00E0\xCC"},
	{"a\u0316\xCC", "\x80a\u0316\u0300", "\u00E0\u0316\u00E0\u0316"},
	// ending in incomplete UTF-8 encoding
	{"", "\xCC", "\xCC"},
	{"a", "\xCC", "a\xCC"},
	{"a", "b\xCC", "ab\xCC"},
	{"\u0226", "\xCC", "\u0226\xCC"},
	// illegal runes
	{"", "\x80", "\x80"},
	{"", "\x80\x80\x80", "\x80\x80\x80"},
	{"", "\xCC\x80\x80\x80", "\xCC\x80\x80\x80"},
	{"", "a\x80", "a\x80"},
	{"", "a\x80\x80\x80", "a\x80\x80\x80"},
	{"", "a\x80\x80\x80\x80\x80\x80", "a\x80\x80\x80\x80\x80\x80"},
	{"a", "\x80\x80\x80", "a\x80\x80\x80"},
	// overflow
	{"", strings.Repeat("\x80", 33), strings.Repeat("\x80", 33)},
	{strings.Repeat("\x80", 33), "", strings.Repeat("\x80", 33)},
	{strings.Repeat("\x80", 33), strings.Repeat("\x80", 33), strings.Repeat("\x80", 66)},
	// overflow of combining characters
	{"", grave(34), grave(30) + cgj + grave(4)},
	{"", grave(36), grave(30) + cgj + grave(6)},
	{grave(29), grave(5), grave(30) + cgj + grave(4)},
	{grave(30), grave(4), grave(30) + cgj + grave(4)},
	{grave(30), grave(3), grave(30) + cgj + grave(3)},
	{grave(30) + "\xCC", "\x80", grave(30) + cgj + grave(1)},
	{"", "\uFDFA" + grave(14), "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645" + grave(14)},
	{"", "\uFDFA" + grave(28) + "\u0316", "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645\u0316" + grave(28)},
	// - First rune has a trailing non-starter.
	{"\u00d5", grave(30), "\u00d5" + grave(29) + cgj + grave(1)},
	// - U+FF9E decomposes into a non-starter in compatibility mode. A CGJ must be
	// inserted even when FF9E starts a new segment.
	{"\uff9e", grave(30), "\u3099" + grave(29) + cgj + grave(1)},
	{grave(30), "\uff9e", grave(30) + cgj + "\u3099"},
	// - Many non-starter decompositions in a row causing overflow.
	{"", rep(0x340, 31), rep(0x300, 30) + cgj + "\u0300"},
	{"", rep(0xFF9E, 31), rep(0x3099, 30) + cgj + "\u3099"},
	{"", "\u0644\u0625" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + "\u0300\u0300"},
	{"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)},
	// NOTE(review): the next entry is identical to the previous one.
	{"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)},
	// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
	{"", "\u0f7f" + rep(0xf71, 29) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80"},
	{"", "\u0f7f" + rep(0xf71, 28) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + "\u0f80"},
	{"", "\u0f7f" + rep(0xf81, 16), "\u0f7f" + rep(0xf71, 15) + rep(0xf80, 15) + cgj + "\u0f71\u0f80"},
	// weird UTF-8
	{"\u00E0\xE1", "\x86", "\u00E0\xE1\x86"},
	{"a\u0300\u11B7", "\u0300", "\u00E0\u11B7\u0300"},
	{"a\u0300\u11B7\u0300", "\u0300", "\u00E0\u11B7\u0300\u0300"},
	{"\u0300", "\xF8\x80\x80\x80\x80\u0300", "\u0300\xF8\x80\x80\x80\x80\u0300"},
	{"\u0300", "\xFC\x80\x80\x80\x80\x80\u0300", "\u0300\xFC\x80\x80\x80\x80\x80\u0300"},
	{"\xF8\x80\x80\x80\x80\u0300", "\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
	{"\xFC\x80\x80\x80\x80\x80\u0300", "\u0300", "\xFC\x80\x80\x80\x80\x80\u0300\u0300"},
	{"\xF8\x80\x80\x80", "\x80\u0300\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
	{"", strings.Repeat("a\u0316\u0300", 6), strings.Repeat("\u00E0\u0316", 6)},
	// large input.
	{"", strings.Repeat("a\u0300\u0316", 31), strings.Repeat("\u00E0\u0316", 31)},
	{"", strings.Repeat("a\u0300\u0316", 4000), strings.Repeat("\u00E0\u0316", 4000)},
	{"", strings.Repeat("\x80\x80", 4000), strings.Repeat("\x80\x80", 4000)},
	{"", "\u0041\u0307\u0304", "\u01E0"},
}
// appendTestsNFKD are the append cases whose expected output is in NFKD.
// Each entry is {left, right, out}; left is empty throughout, and most cases
// exercise what happens when a run of combining characters exceeds 30.
var appendTestsNFKD = []AppendTest{
	{"", "a" + grave(64), "a" + grave(30) + cgj + grave(30) + cgj + grave(4)},
	{ // segment overflow on unchanged character
		"",
		"a" + grave(64) + "\u0316",
		"a" + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(4),
	},
	{ // segment overflow on unchanged character + start value
		"",
		"a" + grave(98) + "\u0316",
		"a" + grave(30) + cgj + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(8),
	},
	{ // segment overflow on decomposition. (U+0340 decomposes to U+0300.)
		"",
		"a" + grave(59) + "\u0340",
		"a" + grave(30) + cgj + grave(30),
	},
	{ // segment overflow on non-starter decomposition
		"",
		"a" + grave(33) + "\u0340" + grave(30) + "\u0320",
		"a" + grave(30) + cgj + grave(30) + cgj + "\u0320" + grave(4),
	},
	{ // start value after ASCII overflow
		"",
		rep('a', segSize) + grave(32) + "\u0320",
		rep('a', segSize) + grave(30) + cgj + "\u0320" + grave(2),
	},
	{ // Jamo overflow
		"",
		"\u1100\u1161" + grave(30) + "\u0320" + grave(2),
		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul
		"",
		"\uac00",
		"\u1100\u1161",
	},
	{ // Hangul overflow
		"",
		"\uac00" + grave(32) + "\u0320",
		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul overflow in Hangul mode.
		"",
		"\uac00\uac00" + grave(32) + "\u0320",
		"\u1100\u1161\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul overflow in Hangul mode.
		"",
		strings.Repeat("\uac00", 3) + grave(32) + "\u0320",
		strings.Repeat("\u1100\u1161", 3) + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // start value after cc=0
		"",
		"您您" + grave(34) + "\u0320",
		"您您" + grave(30) + cgj + "\u0320" + grave(4),
	},
	{ // start value after normalization
		"",
		"\u0300\u0320a" + grave(34) + "\u0320",
		"\u0320\u0300a" + grave(30) + cgj + "\u0320" + grave(4),
	},
	{
		// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
		"",
		"a\u0f7f" + rep(0xf71, 29) + "\u0f81",
		"a\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80",
	},
}
  823. func TestAppend(t *testing.T) {
  824. runNormTests(t, "Append", func(f Form, out []byte, s string) []byte {
  825. return f.Append(out, []byte(s)...)
  826. })
  827. }
  828. func TestAppendString(t *testing.T) {
  829. runNormTests(t, "AppendString", func(f Form, out []byte, s string) []byte {
  830. return f.AppendString(out, s)
  831. })
  832. }
  833. func TestBytes(t *testing.T) {
  834. runNormTests(t, "Bytes", func(f Form, out []byte, s string) []byte {
  835. buf := []byte{}
  836. buf = append(buf, out...)
  837. buf = append(buf, s...)
  838. return f.Bytes(buf)
  839. })
  840. }
  841. func TestString(t *testing.T) {
  842. runNormTests(t, "String", func(f Form, out []byte, s string) []byte {
  843. outs := string(out) + s
  844. return []byte(f.String(outs))
  845. })
  846. }
  847. func TestLinking(t *testing.T) {
  848. const prog = `
  849. package main
  850. import "fmt"
  851. import "golang.org/x/text/unicode/norm"
  852. func main() { fmt.Println(norm.%s) }
  853. `
  854. baseline, errB := testtext.CodeSize(fmt.Sprintf(prog, "MaxSegmentSize"))
  855. withTables, errT := testtext.CodeSize(fmt.Sprintf(prog, `NFC.String("")`))
  856. if errB != nil || errT != nil {
  857. t.Skipf("code size failed: %v and %v", errB, errT)
  858. }
  859. // Tables are at least 50K
  860. if d := withTables - baseline; d < 50*1024 {
  861. t.Errorf("tables appear not to be dropped: %d - %d = %d",
  862. withTables, baseline, d)
  863. }
  864. }
  865. func appendBench(f Form, in []byte) func() {
  866. buf := make([]byte, 0, 4*len(in))
  867. return func() {
  868. f.Append(buf, in...)
  869. }
  870. }
  871. func bytesBench(f Form, in []byte) func() {
  872. return func() {
  873. f.Bytes(in)
  874. }
  875. }
  876. func iterBench(f Form, in []byte) func() {
  877. iter := Iter{}
  878. return func() {
  879. iter.Init(f, in)
  880. for !iter.Done() {
  881. iter.Next()
  882. }
  883. }
  884. }
  885. func transformBench(f Form, in []byte) func() {
  886. buf := make([]byte, 4*len(in))
  887. return func() {
  888. if _, n, err := f.Transform(buf, in, true); err != nil || len(in) != n {
  889. log.Panic(n, len(in), err)
  890. }
  891. }
  892. }
  893. func readerBench(f Form, in []byte) func() {
  894. buf := make([]byte, 4*len(in))
  895. return func() {
  896. r := f.Reader(bytes.NewReader(in))
  897. var err error
  898. for err == nil {
  899. _, err = r.Read(buf)
  900. }
  901. if err != io.EOF {
  902. panic("")
  903. }
  904. }
  905. }
  906. func writerBench(f Form, in []byte) func() {
  907. buf := make([]byte, 0, 4*len(in))
  908. return func() {
  909. r := f.Writer(bytes.NewBuffer(buf))
  910. if _, err := r.Write(in); err != nil {
  911. panic("")
  912. }
  913. }
  914. }
  915. func appendBenchmarks(bm []func(), f Form, in []byte) []func() {
  916. bm = append(bm, appendBench(f, in))
  917. bm = append(bm, iterBench(f, in))
  918. bm = append(bm, transformBench(f, in))
  919. bm = append(bm, readerBench(f, in))
  920. bm = append(bm, writerBench(f, in))
  921. return bm
  922. }
  923. func doFormBenchmark(b *testing.B, inf, f Form, s string) {
  924. b.StopTimer()
  925. in := inf.Bytes([]byte(s))
  926. bm := appendBenchmarks(nil, f, in)
  927. b.SetBytes(int64(len(in) * len(bm)))
  928. b.StartTimer()
  929. for i := 0; i < b.N; i++ {
  930. for _, fn := range bm {
  931. fn()
  932. }
  933. }
  934. }
  935. func doSingle(b *testing.B, f func(Form, []byte) func(), s []byte) {
  936. b.StopTimer()
  937. fn := f(NFC, s)
  938. b.SetBytes(int64(len(s)))
  939. b.StartTimer()
  940. for i := 0; i < b.N; i++ {
  941. fn()
  942. }
  943. }
  944. var (
  945. smallNoChange = []byte("nörmalization")
  946. smallChange = []byte("No\u0308rmalization")
  947. ascii = strings.Repeat("There is nothing to change here! ", 500)
  948. )
  949. func lowerBench(f Form, in []byte) func() {
  950. // Use package strings instead of bytes as it doesn't allocate memory
  951. // if there aren't any changes.
  952. s := string(in)
  953. return func() {
  954. strings.ToLower(s)
  955. }
  956. }
  957. func BenchmarkLowerCaseNoChange(b *testing.B) {
  958. doSingle(b, lowerBench, smallNoChange)
  959. }
  960. func BenchmarkLowerCaseChange(b *testing.B) {
  961. doSingle(b, lowerBench, smallChange)
  962. }
  963. func quickSpanBench(f Form, in []byte) func() {
  964. return func() {
  965. f.QuickSpan(in)
  966. }
  967. }
  968. func BenchmarkQuickSpanChangeNFC(b *testing.B) {
  969. doSingle(b, quickSpanBench, smallNoChange)
  970. }
  971. func BenchmarkBytesNoChangeNFC(b *testing.B) {
  972. doSingle(b, bytesBench, smallNoChange)
  973. }
  974. func BenchmarkBytesChangeNFC(b *testing.B) {
  975. doSingle(b, bytesBench, smallChange)
  976. }
  977. func BenchmarkAppendNoChangeNFC(b *testing.B) {
  978. doSingle(b, appendBench, smallNoChange)
  979. }
  980. func BenchmarkAppendChangeNFC(b *testing.B) {
  981. doSingle(b, appendBench, smallChange)
  982. }
  983. func BenchmarkAppendLargeNFC(b *testing.B) {
  984. doSingle(b, appendBench, txt_all_bytes)
  985. }
  986. func BenchmarkIterNoChangeNFC(b *testing.B) {
  987. doSingle(b, iterBench, smallNoChange)
  988. }
  989. func BenchmarkIterChangeNFC(b *testing.B) {
  990. doSingle(b, iterBench, smallChange)
  991. }
  992. func BenchmarkIterLargeNFC(b *testing.B) {
  993. doSingle(b, iterBench, txt_all_bytes)
  994. }
  995. func BenchmarkTransformNoChangeNFC(b *testing.B) {
  996. doSingle(b, transformBench, smallNoChange)
  997. }
  998. func BenchmarkTransformChangeNFC(b *testing.B) {
  999. doSingle(b, transformBench, smallChange)
  1000. }
  1001. func BenchmarkTransformLargeNFC(b *testing.B) {
  1002. doSingle(b, transformBench, txt_all_bytes)
  1003. }
  1004. func BenchmarkNormalizeAsciiNFC(b *testing.B) {
  1005. doFormBenchmark(b, NFC, NFC, ascii)
  1006. }
  1007. func BenchmarkNormalizeAsciiNFD(b *testing.B) {
  1008. doFormBenchmark(b, NFC, NFD, ascii)
  1009. }
  1010. func BenchmarkNormalizeAsciiNFKC(b *testing.B) {
  1011. doFormBenchmark(b, NFC, NFKC, ascii)
  1012. }
  1013. func BenchmarkNormalizeAsciiNFKD(b *testing.B) {
  1014. doFormBenchmark(b, NFC, NFKD, ascii)
  1015. }
  1016. func BenchmarkNormalizeNFC2NFC(b *testing.B) {
  1017. doFormBenchmark(b, NFC, NFC, txt_all)
  1018. }
  1019. func BenchmarkNormalizeNFC2NFD(b *testing.B) {
  1020. doFormBenchmark(b, NFC, NFD, txt_all)
  1021. }
  1022. func BenchmarkNormalizeNFD2NFC(b *testing.B) {
  1023. doFormBenchmark(b, NFD, NFC, txt_all)
  1024. }
  1025. func BenchmarkNormalizeNFD2NFD(b *testing.B) {
  1026. doFormBenchmark(b, NFD, NFD, txt_all)
  1027. }
  1028. // Hangul is often special-cased, so we test it separately.
  1029. func BenchmarkNormalizeHangulNFC2NFC(b *testing.B) {
  1030. doFormBenchmark(b, NFC, NFC, txt_kr)
  1031. }
  1032. func BenchmarkNormalizeHangulNFC2NFD(b *testing.B) {
  1033. doFormBenchmark(b, NFC, NFD, txt_kr)
  1034. }
  1035. func BenchmarkNormalizeHangulNFD2NFC(b *testing.B) {
  1036. doFormBenchmark(b, NFD, NFC, txt_kr)
  1037. }
  1038. func BenchmarkNormalizeHangulNFD2NFD(b *testing.B) {
  1039. doFormBenchmark(b, NFD, NFD, txt_kr)
  1040. }
  1041. var forms = []Form{NFC, NFD, NFKC, NFKD}
  1042. func doTextBenchmark(b *testing.B, s string) {
  1043. b.StopTimer()
  1044. in := []byte(s)
  1045. bm := []func(){}
  1046. for _, f := range forms {
  1047. bm = appendBenchmarks(bm, f, in)
  1048. }
  1049. b.SetBytes(int64(len(s) * len(bm)))
  1050. b.StartTimer()
  1051. for i := 0; i < b.N; i++ {
  1052. for _, f := range bm {
  1053. f()
  1054. }
  1055. }
  1056. }
  1057. func BenchmarkCanonicalOrdering(b *testing.B) {
  1058. doTextBenchmark(b, txt_canon)
  1059. }
  1060. func BenchmarkExtendedLatin(b *testing.B) {
  1061. doTextBenchmark(b, txt_vn)
  1062. }
  1063. func BenchmarkMiscTwoByteUtf8(b *testing.B) {
  1064. doTextBenchmark(b, twoByteUtf8)
  1065. }
  1066. func BenchmarkMiscThreeByteUtf8(b *testing.B) {
  1067. doTextBenchmark(b, threeByteUtf8)
  1068. }
  1069. func BenchmarkHangul(b *testing.B) {
  1070. doTextBenchmark(b, txt_kr)
  1071. }
  1072. func BenchmarkJapanese(b *testing.B) {
  1073. doTextBenchmark(b, txt_jp)
  1074. }
  1075. func BenchmarkChinese(b *testing.B) {
  1076. doTextBenchmark(b, txt_cn)
  1077. }
  1078. func BenchmarkOverflow(b *testing.B) {
  1079. doTextBenchmark(b, overflow)
  1080. }
  1081. var overflow = string(bytes.Repeat([]byte("\u035D"), 4096)) + "\u035B"
  1082. // Tests sampled from the Canonical ordering tests (Part 2) of
  1083. // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
  1084. const txt_canon = `\u0061\u0315\u0300\u05AE\u0300\u0062 \u0061\u0300\u0315\u0300\u05AE\u0062
  1085. \u0061\u0302\u0315\u0300\u05AE\u0062 \u0061\u0307\u0315\u0300\u05AE\u0062
  1086. \u0061\u0315\u0300\u05AE\u030A\u0062 \u0061\u059A\u0316\u302A\u031C\u0062
  1087. \u0061\u032E\u059A\u0316\u302A\u0062 \u0061\u0338\u093C\u0334\u0062
  1088. \u0061\u059A\u0316\u302A\u0339 \u0061\u0341\u0315\u0300\u05AE\u0062
  1089. \u0061\u0348\u059A\u0316\u302A\u0062 \u0061\u0361\u0345\u035D\u035C\u0062
  1090. \u0061\u0366\u0315\u0300\u05AE\u0062 \u0061\u0315\u0300\u05AE\u0486\u0062
  1091. \u0061\u05A4\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0613\u0062
  1092. \u0061\u0315\u0300\u05AE\u0615\u0062 \u0061\u0617\u0315\u0300\u05AE\u0062
  1093. \u0061\u0619\u0618\u064D\u064E\u0062 \u0061\u0315\u0300\u05AE\u0654\u0062
  1094. \u0061\u0315\u0300\u05AE\u06DC\u0062 \u0061\u0733\u0315\u0300\u05AE\u0062
  1095. \u0061\u0744\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0745\u0062
  1096. \u0061\u09CD\u05B0\u094D\u3099\u0062 \u0061\u0E38\u0E48\u0E38\u0C56\u0062
  1097. \u0061\u0EB8\u0E48\u0E38\u0E49\u0062 \u0061\u0F72\u0F71\u0EC8\u0F71\u0062
  1098. \u0061\u1039\u05B0\u094D\u3099\u0062 \u0061\u05B0\u094D\u3099\u1A60\u0062
  1099. \u0061\u3099\u093C\u0334\u1BE6\u0062 \u0061\u3099\u093C\u0334\u1C37\u0062
  1100. \u0061\u1CD9\u059A\u0316\u302A\u0062 \u0061\u2DED\u0315\u0300\u05AE\u0062
  1101. \u0061\u2DEF\u0315\u0300\u05AE\u0062 \u0061\u302D\u302E\u059A\u0316\u0062`
// txt_vn is a Vietnamese text sample; it is the input of
// BenchmarkExtendedLatin and the first part of txt_all.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/vn/
const txt_vn = `Với các điều kiện sau: Ghi nhận công của tác giả.
Nếu bạn sử dụng, chuyển đổi, hoặc xây dựng dự án từ
nội dung được chia sẻ này, bạn phải áp dụng giấy phép này hoặc
một giấy phép khác có các điều khoản tương tự như giấy phép này
cho dự án của bạn. Hiểu rằng: Miễn — Bất kỳ các điều kiện nào
trên đây cũng có thể được miễn bỏ nếu bạn được sự cho phép của
người sở hữu bản quyền. Phạm vi công chúng — Khi tác phẩm hoặc
bất kỳ chương nào của tác phẩm đã trong vùng dành cho công
chúng theo quy định của pháp luật thì tình trạng của nó không
bị ảnh hưởng bởi giấy phép trong bất kỳ trường hợp nào.`
// txt_ru is a Russian text sample and the first part of twoByteUtf8.
// Taken from http://creativecommons.org/licenses/by-sa/1.0/deed.ru
//
// NOTE(review): the last line of this constant is Greek rather than Russian;
// it looks like it belongs at the start of txt_gr. Because twoByteUtf8
// concatenates txt_ru directly with txt_gr, the line still ends up next to the
// Greek text there.
const txt_ru = `При обязательном соблюдении следующих условий:
Attribution — Вы должны атрибутировать произведение (указывать
автора и источник) в порядке, предусмотренном автором или
лицензиаром (но только так, чтобы никоим образом не подразумевалось,
что они поддерживают вас или использование вами данного произведения).
Υπό τις ακόλουθες προϋποθέσεις:`
// txt_gr is a Greek text sample and the second part of twoByteUtf8.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/gr/
const txt_gr = `Αναφορά Δημιουργού — Θα πρέπει να κάνετε την αναφορά στο έργο με τον
τρόπο που έχει οριστεί από το δημιουργό ή το χορηγούντο την άδεια
(χωρίς όμως να εννοείται με οποιονδήποτε τρόπο ότι εγκρίνουν εσάς ή
τη χρήση του έργου από εσάς). Παρόμοια Διανομή — Εάν αλλοιώσετε,
τροποποιήσετε ή δημιουργήσετε περαιτέρω βασισμένοι στο έργο θα
μπορείτε να διανέμετε το έργο που θα προκύψει μόνο με την ίδια ή
παρόμοια άδεια.`
// txt_ar is an Arabic text sample and the third part of twoByteUtf8.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/deed.ar
const txt_ar = `بموجب الشروط التالية نسب المصنف — يجب عليك أن
تنسب العمل بالطريقة التي تحددها المؤلف أو المرخص (ولكن ليس بأي حال من
الأحوال أن توحي وتقترح بتحول أو استخدامك للعمل).
المشاركة على قدم المساواة — إذا كنت يعدل ، والتغيير ، أو الاستفادة
من هذا العمل ، قد ينتج عن توزيع العمل إلا في ظل تشابه او تطابق فى واحد
لهذا الترخيص.`
// txt_il is a Hebrew text sample and the last part of twoByteUtf8.
// Taken from http://creativecommons.org/licenses/by-sa/1.0/il/
const txt_il = `בכפוף לתנאים הבאים: ייחוס — עליך לייחס את היצירה (לתת קרדיט) באופן
המצויין על-ידי היוצר או מעניק הרישיון (אך לא בשום אופן המרמז על כך
שהם תומכים בך או בשימוש שלך ביצירה). שיתוף זהה — אם תחליט/י לשנות,
לעבד או ליצור יצירה נגזרת בהסתמך על יצירה זו, תוכל/י להפיץ את יצירתך
החדשה רק תחת אותו הרישיון או רישיון דומה לרישיון זה.`
  1141. const twoByteUtf8 = txt_ru + txt_gr + txt_ar + txt_il
// txt_kr is a Korean (Hangul) text sample; it is the input of BenchmarkHangul
// and of the BenchmarkNormalizeHangul* benchmarks, and part of txt_cjk.
// Taken from http://creativecommons.org/licenses/by-sa/2.0/kr/
const txt_kr = `다음과 같은 조건을 따라야 합니다: 저작자표시
(Attribution) — 저작자나 이용허락자가 정한 방법으로 저작물의
원저작자를 표시하여야 합니다(그러나 원저작자가 이용자나 이용자의
이용을 보증하거나 추천한다는 의미로 표시해서는 안됩니다).
동일조건변경허락 — 이 저작물을 이용하여 만든 이차적 저작물에는 본
라이선스와 동일한 라이선스를 적용해야 합니다.`
// txt_th is a Thai text sample; threeByteUtf8 is defined as exactly this
// constant.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/th/
const txt_th = `ภายใต้เงื่อนไข ดังต่อไปนี้ : แสดงที่มา — คุณต้องแสดงที่
มาของงานดังกล่าว ตามรูปแบบที่ผู้สร้างสรรค์หรือผู้อนุญาตกำหนด (แต่
ไม่ใช่ในลักษณะที่ว่า พวกเขาสนับสนุนคุณหรือสนับสนุนการที่
คุณนำงานไปใช้) อนุญาตแบบเดียวกัน — หากคุณดัดแปลง เปลี่ยนรูป หรื
อต่อเติมงานนี้ คุณต้องใช้สัญญาอนุญาตแบบเดียวกันหรือแบบที่เหมื
อนกับสัญญาอนุญาตที่ใช้กับงานนี้เท่านั้น`
  1156. const threeByteUtf8 = txt_th
// txt_jp is a Japanese text sample; it is the input of BenchmarkJapanese and
// part of txt_cjk.
// Taken from http://creativecommons.org/licenses/by-sa/2.0/jp/
const txt_jp = `あなたの従うべき条件は以下の通りです。
表示 — あなたは原著作者のクレジットを表示しなければなりません。
継承 — もしあなたがこの作品を改変、変形または加工した場合、
あなたはその結果生じた作品をこの作品と同一の許諾条件の下でのみ
頒布することができます。`
// txt_cn is a Chinese text sample; it is the input of BenchmarkChinese and
// part of txt_cjk.
// Taken from http://creativecommons.org/licenses/by-sa/2.5/cn/
const txt_cn = `您可以自由: 复制、发行、展览、表演、放映、
广播或通过信息网络传播本作品 创作演绎作品
对本作品进行商业性使用 惟须遵守下列条件:
署名 — 您必须按照作者或者许可人指定的方式对作品进行署名。
相同方式共享 — 如果您改变、转换本作品或者以本作品为基础进行创作,
您只能采用与本协议相同的许可协议发布基于本作品的演绎作品。`
  1170. const txt_cjk = txt_cn + txt_jp + txt_kr
  1171. const txt_all = txt_vn + twoByteUtf8 + threeByteUtf8 + txt_cjk
  1172. var txt_all_bytes = []byte(txt_all)