You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

610 lines
15 KiB

  1. // Copyright 2011 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Note: the file data_test.go that is generated should not be checked in.
  5. //go:generate go run maketables.go triegen.go
  6. //go:generate go test -tags test
  7. // Package norm contains types and functions for normalizing Unicode strings.
  8. package norm // import "golang.org/x/text/unicode/norm"
  9. import (
  10. "unicode/utf8"
  11. "golang.org/x/text/transform"
  12. )
  // A Form denotes a canonical representation of Unicode code points.
  // The Unicode-defined normalization and equivalence forms are:
  //
  //	NFC   Unicode Normalization Form C
  //	NFD   Unicode Normalization Form D
  //	NFKC  Unicode Normalization Form KC
  //	NFKD  Unicode Normalization Form KD
  //
  // For a Form f, this documentation uses the notation f(x) to mean
  // the bytes or string x converted to the given form.
  // A position n in x is called a boundary if conversion to the form can
  // proceed independently on both sides:
  //
  //	f(x) == append(f(x[0:n]), f(x[n:])...)
  //
  // References: https://unicode.org/reports/tr15/ and
  // https://unicode.org/notes/tn5/.
  type Form int

  // The normalization forms. The values are consecutive and start at zero;
  // the methods in this file use a Form value as the index into formTable.
  const (
  	NFC Form = iota
  	NFD
  	NFKC
  	NFKD
  )
  36. // Bytes returns f(b). May return b if f(b) = b.
  37. func (f Form) Bytes(b []byte) []byte {
  38. src := inputBytes(b)
  39. ft := formTable[f]
  40. n, ok := ft.quickSpan(src, 0, len(b), true)
  41. if ok {
  42. return b
  43. }
  44. out := make([]byte, n, len(b))
  45. copy(out, b[0:n])
  46. rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush}
  47. return doAppendInner(&rb, n)
  48. }
  49. // String returns f(s).
  50. func (f Form) String(s string) string {
  51. src := inputString(s)
  52. ft := formTable[f]
  53. n, ok := ft.quickSpan(src, 0, len(s), true)
  54. if ok {
  55. return s
  56. }
  57. out := make([]byte, n, len(s))
  58. copy(out, s[0:n])
  59. rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush}
  60. return string(doAppendInner(&rb, n))
  61. }
  62. // IsNormal returns true if b == f(b).
  63. func (f Form) IsNormal(b []byte) bool {
  64. src := inputBytes(b)
  65. ft := formTable[f]
  66. bp, ok := ft.quickSpan(src, 0, len(b), true)
  67. if ok {
  68. return true
  69. }
  70. rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
  71. rb.setFlusher(nil, cmpNormalBytes)
  72. for bp < len(b) {
  73. rb.out = b[bp:]
  74. if bp = decomposeSegment(&rb, bp, true); bp < 0 {
  75. return false
  76. }
  77. bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
  78. }
  79. return true
  80. }
  81. func cmpNormalBytes(rb *reorderBuffer) bool {
  82. b := rb.out
  83. for i := 0; i < rb.nrune; i++ {
  84. info := rb.rune[i]
  85. if int(info.size) > len(b) {
  86. return false
  87. }
  88. p := info.pos
  89. pe := p + info.size
  90. for ; p < pe; p++ {
  91. if b[0] != rb.byte[p] {
  92. return false
  93. }
  94. b = b[1:]
  95. }
  96. }
  97. return true
  98. }
  99. // IsNormalString returns true if s == f(s).
  100. func (f Form) IsNormalString(s string) bool {
  101. src := inputString(s)
  102. ft := formTable[f]
  103. bp, ok := ft.quickSpan(src, 0, len(s), true)
  104. if ok {
  105. return true
  106. }
  107. rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
  108. rb.setFlusher(nil, func(rb *reorderBuffer) bool {
  109. for i := 0; i < rb.nrune; i++ {
  110. info := rb.rune[i]
  111. if bp+int(info.size) > len(s) {
  112. return false
  113. }
  114. p := info.pos
  115. pe := p + info.size
  116. for ; p < pe; p++ {
  117. if s[bp] != rb.byte[p] {
  118. return false
  119. }
  120. bp++
  121. }
  122. }
  123. return true
  124. })
  125. for bp < len(s) {
  126. if bp = decomposeSegment(&rb, bp, true); bp < 0 {
  127. return false
  128. }
  129. bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
  130. }
  131. return true
  132. }
  133. // patchTail fixes a case where a rune may be incorrectly normalized
  134. // if it is followed by illegal continuation bytes. It returns the
  135. // patched buffer and whether the decomposition is still in progress.
  136. func patchTail(rb *reorderBuffer) bool {
  137. info, p := lastRuneStart(&rb.f, rb.out)
  138. if p == -1 || info.size == 0 {
  139. return true
  140. }
  141. end := p + int(info.size)
  142. extra := len(rb.out) - end
  143. if extra > 0 {
  144. // Potentially allocating memory. However, this only
  145. // happens with ill-formed UTF-8.
  146. x := make([]byte, 0)
  147. x = append(x, rb.out[len(rb.out)-extra:]...)
  148. rb.out = rb.out[:end]
  149. decomposeToLastBoundary(rb)
  150. rb.doFlush()
  151. rb.out = append(rb.out, x...)
  152. return false
  153. }
  154. buf := rb.out[p:]
  155. rb.out = rb.out[:p]
  156. decomposeToLastBoundary(rb)
  157. if s := rb.ss.next(info); s == ssStarter {
  158. rb.doFlush()
  159. rb.ss.first(info)
  160. } else if s == ssOverflow {
  161. rb.doFlush()
  162. rb.insertCGJ()
  163. rb.ss = 0
  164. }
  165. rb.insertUnsafe(inputBytes(buf), 0, info)
  166. return true
  167. }
  168. func appendQuick(rb *reorderBuffer, i int) int {
  169. if rb.nsrc == i {
  170. return i
  171. }
  172. end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
  173. rb.out = rb.src.appendSlice(rb.out, i, end)
  174. return end
  175. }
  176. // Append returns f(append(out, b...)).
  177. // The buffer out must be nil, empty, or equal to f(out).
  178. func (f Form) Append(out []byte, src ...byte) []byte {
  179. return f.doAppend(out, inputBytes(src), len(src))
  180. }
  181. func (f Form) doAppend(out []byte, src input, n int) []byte {
  182. if n == 0 {
  183. return out
  184. }
  185. ft := formTable[f]
  186. // Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
  187. if len(out) == 0 {
  188. p, _ := ft.quickSpan(src, 0, n, true)
  189. out = src.appendSlice(out, 0, p)
  190. if p == n {
  191. return out
  192. }
  193. rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
  194. return doAppendInner(&rb, p)
  195. }
  196. rb := reorderBuffer{f: *ft, src: src, nsrc: n}
  197. return doAppend(&rb, out, 0)
  198. }
  199. func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
  200. rb.setFlusher(out, appendFlush)
  201. src, n := rb.src, rb.nsrc
  202. doMerge := len(out) > 0
  203. if q := src.skipContinuationBytes(p); q > p {
  204. // Move leading non-starters to destination.
  205. rb.out = src.appendSlice(rb.out, p, q)
  206. p = q
  207. doMerge = patchTail(rb)
  208. }
  209. fd := &rb.f
  210. if doMerge {
  211. var info Properties
  212. if p < n {
  213. info = fd.info(src, p)
  214. if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
  215. if p == 0 {
  216. decomposeToLastBoundary(rb)
  217. }
  218. p = decomposeSegment(rb, p, true)
  219. }
  220. }
  221. if info.size == 0 {
  222. rb.doFlush()
  223. // Append incomplete UTF-8 encoding.
  224. return src.appendSlice(rb.out, p, n)
  225. }
  226. if rb.nrune > 0 {
  227. return doAppendInner(rb, p)
  228. }
  229. }
  230. p = appendQuick(rb, p)
  231. return doAppendInner(rb, p)
  232. }
  233. func doAppendInner(rb *reorderBuffer, p int) []byte {
  234. for n := rb.nsrc; p < n; {
  235. p = decomposeSegment(rb, p, true)
  236. p = appendQuick(rb, p)
  237. }
  238. return rb.out
  239. }
  240. // AppendString returns f(append(out, []byte(s))).
  241. // The buffer out must be nil, empty, or equal to f(out).
  242. func (f Form) AppendString(out []byte, src string) []byte {
  243. return f.doAppend(out, inputString(src), len(src))
  244. }
  245. // QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
  246. // It is not guaranteed to return the largest such n.
  247. func (f Form) QuickSpan(b []byte) int {
  248. n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
  249. return n
  250. }
  251. // Span implements transform.SpanningTransformer. It returns a boundary n such
  252. // that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n.
  253. func (f Form) Span(b []byte, atEOF bool) (n int, err error) {
  254. n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF)
  255. if n < len(b) {
  256. if !ok {
  257. err = transform.ErrEndOfSpan
  258. } else {
  259. err = transform.ErrShortSrc
  260. }
  261. }
  262. return n, err
  263. }
  264. // SpanString returns a boundary n such that s[0:n] == f(s[0:n]).
  265. // It is not guaranteed to return the largest such n.
  266. func (f Form) SpanString(s string, atEOF bool) (n int, err error) {
  267. n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF)
  268. if n < len(s) {
  269. if !ok {
  270. err = transform.ErrEndOfSpan
  271. } else {
  272. err = transform.ErrShortSrc
  273. }
  274. }
  275. return n, err
  276. }
  277. // quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
  278. // whether any non-normalized parts were found. If atEOF is false, n will
  279. // not point past the last segment if this segment might be become
  280. // non-normalized by appending other runes.
  281. func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
  282. var lastCC uint8
  283. ss := streamSafe(0)
  284. lastSegStart := i
  285. for n = end; i < n; {
  286. if j := src.skipASCII(i, n); i != j {
  287. i = j
  288. lastSegStart = i - 1
  289. lastCC = 0
  290. ss = 0
  291. continue
  292. }
  293. info := f.info(src, i)
  294. if info.size == 0 {
  295. if atEOF {
  296. // include incomplete runes
  297. return n, true
  298. }
  299. return lastSegStart, true
  300. }
  301. // This block needs to be before the next, because it is possible to
  302. // have an overflow for runes that are starters (e.g. with U+FF9E).
  303. switch ss.next(info) {
  304. case ssStarter:
  305. lastSegStart = i
  306. case ssOverflow:
  307. return lastSegStart, false
  308. case ssSuccess:
  309. if lastCC > info.ccc {
  310. return lastSegStart, false
  311. }
  312. }
  313. if f.composing {
  314. if !info.isYesC() {
  315. break
  316. }
  317. } else {
  318. if !info.isYesD() {
  319. break
  320. }
  321. }
  322. lastCC = info.ccc
  323. i += int(info.size)
  324. }
  325. if i == n {
  326. if !atEOF {
  327. n = lastSegStart
  328. }
  329. return n, true
  330. }
  331. return lastSegStart, false
  332. }
  333. // QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]).
  334. // It is not guaranteed to return the largest such n.
  335. func (f Form) QuickSpanString(s string) int {
  336. n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
  337. return n
  338. }
  339. // FirstBoundary returns the position i of the first boundary in b
  340. // or -1 if b contains no boundary.
  341. func (f Form) FirstBoundary(b []byte) int {
  342. return f.firstBoundary(inputBytes(b), len(b))
  343. }
  344. func (f Form) firstBoundary(src input, nsrc int) int {
  345. i := src.skipContinuationBytes(0)
  346. if i >= nsrc {
  347. return -1
  348. }
  349. fd := formTable[f]
  350. ss := streamSafe(0)
  351. // We should call ss.first here, but we can't as the first rune is
  352. // skipped already. This means FirstBoundary can't really determine
  353. // CGJ insertion points correctly. Luckily it doesn't have to.
  354. for {
  355. info := fd.info(src, i)
  356. if info.size == 0 {
  357. return -1
  358. }
  359. if s := ss.next(info); s != ssSuccess {
  360. return i
  361. }
  362. i += int(info.size)
  363. if i >= nsrc {
  364. if !info.BoundaryAfter() && !ss.isMax() {
  365. return -1
  366. }
  367. return nsrc
  368. }
  369. }
  370. }
  371. // FirstBoundaryInString returns the position i of the first boundary in s
  372. // or -1 if s contains no boundary.
  373. func (f Form) FirstBoundaryInString(s string) int {
  374. return f.firstBoundary(inputString(s), len(s))
  375. }
  376. // NextBoundary reports the index of the boundary between the first and next
  377. // segment in b or -1 if atEOF is false and there are not enough bytes to
  378. // determine this boundary.
  379. func (f Form) NextBoundary(b []byte, atEOF bool) int {
  380. return f.nextBoundary(inputBytes(b), len(b), atEOF)
  381. }
  382. // NextBoundaryInString reports the index of the boundary between the first and
  383. // next segment in b or -1 if atEOF is false and there are not enough bytes to
  384. // determine this boundary.
  385. func (f Form) NextBoundaryInString(s string, atEOF bool) int {
  386. return f.nextBoundary(inputString(s), len(s), atEOF)
  387. }
  388. func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int {
  389. if nsrc == 0 {
  390. if atEOF {
  391. return 0
  392. }
  393. return -1
  394. }
  395. fd := formTable[f]
  396. info := fd.info(src, 0)
  397. if info.size == 0 {
  398. if atEOF {
  399. return 1
  400. }
  401. return -1
  402. }
  403. ss := streamSafe(0)
  404. ss.first(info)
  405. for i := int(info.size); i < nsrc; i += int(info.size) {
  406. info = fd.info(src, i)
  407. if info.size == 0 {
  408. if atEOF {
  409. return i
  410. }
  411. return -1
  412. }
  413. // TODO: Using streamSafe to determine the boundary isn't the same as
  414. // using BoundaryBefore. Determine which should be used.
  415. if s := ss.next(info); s != ssSuccess {
  416. return i
  417. }
  418. }
  419. if !atEOF && !info.BoundaryAfter() && !ss.isMax() {
  420. return -1
  421. }
  422. return nsrc
  423. }
  424. // LastBoundary returns the position i of the last boundary in b
  425. // or -1 if b contains no boundary.
  426. func (f Form) LastBoundary(b []byte) int {
  427. return lastBoundary(formTable[f], b)
  428. }
  429. func lastBoundary(fd *formInfo, b []byte) int {
  430. i := len(b)
  431. info, p := lastRuneStart(fd, b)
  432. if p == -1 {
  433. return -1
  434. }
  435. if info.size == 0 { // ends with incomplete rune
  436. if p == 0 { // starts with incomplete rune
  437. return -1
  438. }
  439. i = p
  440. info, p = lastRuneStart(fd, b[:i])
  441. if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
  442. return i
  443. }
  444. }
  445. if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
  446. return i
  447. }
  448. if info.BoundaryAfter() {
  449. return i
  450. }
  451. ss := streamSafe(0)
  452. v := ss.backwards(info)
  453. for i = p; i >= 0 && v != ssStarter; i = p {
  454. info, p = lastRuneStart(fd, b[:i])
  455. if v = ss.backwards(info); v == ssOverflow {
  456. break
  457. }
  458. if p+int(info.size) != i {
  459. if p == -1 { // no boundary found
  460. return -1
  461. }
  462. return i // boundary after an illegal UTF-8 encoding
  463. }
  464. }
  465. return i
  466. }
  467. // decomposeSegment scans the first segment in src into rb. It inserts 0x034f
  468. // (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
  469. // and returns the number of bytes consumed from src or iShortDst or iShortSrc.
  470. func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
  471. // Force one character to be consumed.
  472. info := rb.f.info(rb.src, sp)
  473. if info.size == 0 {
  474. return 0
  475. }
  476. if s := rb.ss.next(info); s == ssStarter {
  477. // TODO: this could be removed if we don't support merging.
  478. if rb.nrune > 0 {
  479. goto end
  480. }
  481. } else if s == ssOverflow {
  482. rb.insertCGJ()
  483. goto end
  484. }
  485. if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
  486. return int(err)
  487. }
  488. for {
  489. sp += int(info.size)
  490. if sp >= rb.nsrc {
  491. if !atEOF && !info.BoundaryAfter() {
  492. return int(iShortSrc)
  493. }
  494. break
  495. }
  496. info = rb.f.info(rb.src, sp)
  497. if info.size == 0 {
  498. if !atEOF {
  499. return int(iShortSrc)
  500. }
  501. break
  502. }
  503. if s := rb.ss.next(info); s == ssStarter {
  504. break
  505. } else if s == ssOverflow {
  506. rb.insertCGJ()
  507. break
  508. }
  509. if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
  510. return int(err)
  511. }
  512. }
  513. end:
  514. if !rb.doFlush() {
  515. return int(iShortDst)
  516. }
  517. return sp
  518. }
  519. // lastRuneStart returns the runeInfo and position of the last
  520. // rune in buf or the zero runeInfo and -1 if no rune was found.
  521. func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
  522. p := len(buf) - 1
  523. for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
  524. }
  525. if p < 0 {
  526. return Properties{}, -1
  527. }
  528. return fd.info(inputBytes(buf), p), p
  529. }
  530. // decomposeToLastBoundary finds an open segment at the end of the buffer
  531. // and scans it into rb. Returns the buffer minus the last segment.
  532. func decomposeToLastBoundary(rb *reorderBuffer) {
  533. fd := &rb.f
  534. info, i := lastRuneStart(fd, rb.out)
  535. if int(info.size) != len(rb.out)-i {
  536. // illegal trailing continuation bytes
  537. return
  538. }
  539. if info.BoundaryAfter() {
  540. return
  541. }
  542. var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
  543. padd := 0
  544. ss := streamSafe(0)
  545. p := len(rb.out)
  546. for {
  547. add[padd] = info
  548. v := ss.backwards(info)
  549. if v == ssOverflow {
  550. // Note that if we have an overflow, it the string we are appending to
  551. // is not correctly normalized. In this case the behavior is undefined.
  552. break
  553. }
  554. padd++
  555. p -= int(info.size)
  556. if v == ssStarter || p < 0 {
  557. break
  558. }
  559. info, i = lastRuneStart(fd, rb.out[:p])
  560. if int(info.size) != p-i {
  561. break
  562. }
  563. }
  564. rb.ss = ss
  565. // Copy bytes for insertion as we may need to overwrite rb.out.
  566. var buf [maxBufferSize * utf8.UTFMax]byte
  567. cp := buf[:copy(buf[:], rb.out[p:])]
  568. rb.out = rb.out[:p]
  569. for padd--; padd >= 0; padd-- {
  570. info = add[padd]
  571. rb.insertUnsafe(inputBytes(cp), 0, info)
  572. cp = cp[info.size:]
  573. }
  574. }