You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

528 lines
13 KiB

  1. // Copyright 2011 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //go:generate go run maketables.go triegen.go
  5. //go:generate go run maketables.go triegen.go -test
  6. // Package norm contains types and functions for normalizing Unicode strings.
  7. package norm // import "golang.org/x/text/unicode/norm"
  8. import "unicode/utf8"
  9. // A Form denotes a canonical representation of Unicode code points.
  10. // The Unicode-defined normalization and equivalence forms are:
  11. //
  12. // NFC Unicode Normalization Form C
  13. // NFD Unicode Normalization Form D
  14. // NFKC Unicode Normalization Form KC
  15. // NFKD Unicode Normalization Form KD
  16. //
  17. // For a Form f, this documentation uses the notation f(x) to mean
  18. // the bytes or string x converted to the given form.
  19. // A position n in x is called a boundary if conversion to the form can
  20. // proceed independently on both sides:
  21. // f(x) == append(f(x[0:n]), f(x[n:])...)
  22. //
  23. // References: http://unicode.org/reports/tr15/ and
  24. // http://unicode.org/notes/tn5/.
  25. type Form int
  26. const (
  27. NFC Form = iota
  28. NFD
  29. NFKC
  30. NFKD
  31. )
  32. // Bytes returns f(b). May return b if f(b) = b.
  33. func (f Form) Bytes(b []byte) []byte {
  34. src := inputBytes(b)
  35. ft := formTable[f]
  36. n, ok := ft.quickSpan(src, 0, len(b), true)
  37. if ok {
  38. return b
  39. }
  40. out := make([]byte, n, len(b))
  41. copy(out, b[0:n])
  42. rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush}
  43. return doAppendInner(&rb, n)
  44. }
  45. // String returns f(s).
  46. func (f Form) String(s string) string {
  47. src := inputString(s)
  48. ft := formTable[f]
  49. n, ok := ft.quickSpan(src, 0, len(s), true)
  50. if ok {
  51. return s
  52. }
  53. out := make([]byte, n, len(s))
  54. copy(out, s[0:n])
  55. rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush}
  56. return string(doAppendInner(&rb, n))
  57. }
  58. // IsNormal returns true if b == f(b).
  59. func (f Form) IsNormal(b []byte) bool {
  60. src := inputBytes(b)
  61. ft := formTable[f]
  62. bp, ok := ft.quickSpan(src, 0, len(b), true)
  63. if ok {
  64. return true
  65. }
  66. rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
  67. rb.setFlusher(nil, cmpNormalBytes)
  68. for bp < len(b) {
  69. rb.out = b[bp:]
  70. if bp = decomposeSegment(&rb, bp, true); bp < 0 {
  71. return false
  72. }
  73. bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
  74. }
  75. return true
  76. }
  77. func cmpNormalBytes(rb *reorderBuffer) bool {
  78. b := rb.out
  79. for i := 0; i < rb.nrune; i++ {
  80. info := rb.rune[i]
  81. if int(info.size) > len(b) {
  82. return false
  83. }
  84. p := info.pos
  85. pe := p + info.size
  86. for ; p < pe; p++ {
  87. if b[0] != rb.byte[p] {
  88. return false
  89. }
  90. b = b[1:]
  91. }
  92. }
  93. return true
  94. }
  95. // IsNormalString returns true if s == f(s).
  96. func (f Form) IsNormalString(s string) bool {
  97. src := inputString(s)
  98. ft := formTable[f]
  99. bp, ok := ft.quickSpan(src, 0, len(s), true)
  100. if ok {
  101. return true
  102. }
  103. rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
  104. rb.setFlusher(nil, func(rb *reorderBuffer) bool {
  105. for i := 0; i < rb.nrune; i++ {
  106. info := rb.rune[i]
  107. if bp+int(info.size) > len(s) {
  108. return false
  109. }
  110. p := info.pos
  111. pe := p + info.size
  112. for ; p < pe; p++ {
  113. if s[bp] != rb.byte[p] {
  114. return false
  115. }
  116. bp++
  117. }
  118. }
  119. return true
  120. })
  121. for bp < len(s) {
  122. if bp = decomposeSegment(&rb, bp, true); bp < 0 {
  123. return false
  124. }
  125. bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
  126. }
  127. return true
  128. }
  129. // patchTail fixes a case where a rune may be incorrectly normalized
  130. // if it is followed by illegal continuation bytes. It returns the
  131. // patched buffer and whether the decomposition is still in progress.
  132. func patchTail(rb *reorderBuffer) bool {
  133. info, p := lastRuneStart(&rb.f, rb.out)
  134. if p == -1 || info.size == 0 {
  135. return true
  136. }
  137. end := p + int(info.size)
  138. extra := len(rb.out) - end
  139. if extra > 0 {
  140. // Potentially allocating memory. However, this only
  141. // happens with ill-formed UTF-8.
  142. x := make([]byte, 0)
  143. x = append(x, rb.out[len(rb.out)-extra:]...)
  144. rb.out = rb.out[:end]
  145. decomposeToLastBoundary(rb)
  146. rb.doFlush()
  147. rb.out = append(rb.out, x...)
  148. return false
  149. }
  150. buf := rb.out[p:]
  151. rb.out = rb.out[:p]
  152. decomposeToLastBoundary(rb)
  153. if s := rb.ss.next(info); s == ssStarter {
  154. rb.doFlush()
  155. rb.ss.first(info)
  156. } else if s == ssOverflow {
  157. rb.doFlush()
  158. rb.insertCGJ()
  159. rb.ss = 0
  160. }
  161. rb.insertUnsafe(inputBytes(buf), 0, info)
  162. return true
  163. }
  164. func appendQuick(rb *reorderBuffer, i int) int {
  165. if rb.nsrc == i {
  166. return i
  167. }
  168. end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
  169. rb.out = rb.src.appendSlice(rb.out, i, end)
  170. return end
  171. }
  172. // Append returns f(append(out, b...)).
  173. // The buffer out must be nil, empty, or equal to f(out).
  174. func (f Form) Append(out []byte, src ...byte) []byte {
  175. return f.doAppend(out, inputBytes(src), len(src))
  176. }
  177. func (f Form) doAppend(out []byte, src input, n int) []byte {
  178. if n == 0 {
  179. return out
  180. }
  181. ft := formTable[f]
  182. // Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
  183. if len(out) == 0 {
  184. p, _ := ft.quickSpan(src, 0, n, true)
  185. out = src.appendSlice(out, 0, p)
  186. if p == n {
  187. return out
  188. }
  189. rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
  190. return doAppendInner(&rb, p)
  191. }
  192. rb := reorderBuffer{f: *ft, src: src, nsrc: n}
  193. return doAppend(&rb, out, 0)
  194. }
// doAppend appends the normalized form of rb.src[p:rb.nsrc] to out and
// returns the resulting buffer. Unlike doAppendInner it first deals with
// the junction between the existing contents of out and the start of the
// new input, which may have to be merged into a single segment.
func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
	rb.setFlusher(out, appendFlush)
	src, n := rb.src, rb.nsrc
	// Merging with the tail of out is only considered when out has content.
	doMerge := len(out) > 0
	if q := src.skipContinuationBytes(p); q > p {
		// Move leading non-starters to destination.
		rb.out = src.appendSlice(rb.out, p, q)
		p = q
		// patchTail decides whether a merge is still required.
		doMerge = patchTail(rb)
	}
	fd := &rb.f
	if doMerge {
		// info stays zero (size 0) when there is no input left at p.
		var info Properties
		if p < n {
			info = fd.info(src, p)
			if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
				if p == 0 {
					// Nothing of src was copied yet: pull the open segment
					// at the end of rb.out back into the reorder buffer.
					decomposeToLastBoundary(rb)
				}
				p = decomposeSegment(rb, p, true)
			}
		}
		if info.size == 0 {
			rb.doFlush()
			// Append incomplete UTF-8 encoding.
			return src.appendSlice(rb.out, p, n)
		}
		if rb.nrune > 0 {
			// Runes are still buffered; skip the quick copy below.
			return doAppendInner(rb, p)
		}
	}
	p = appendQuick(rb, p)
	return doAppendInner(rb, p)
}
  229. func doAppendInner(rb *reorderBuffer, p int) []byte {
  230. for n := rb.nsrc; p < n; {
  231. p = decomposeSegment(rb, p, true)
  232. p = appendQuick(rb, p)
  233. }
  234. return rb.out
  235. }
  236. // AppendString returns f(append(out, []byte(s))).
  237. // The buffer out must be nil, empty, or equal to f(out).
  238. func (f Form) AppendString(out []byte, src string) []byte {
  239. return f.doAppend(out, inputString(src), len(src))
  240. }
  241. // QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
  242. // It is not guaranteed to return the largest such n.
  243. func (f Form) QuickSpan(b []byte) int {
  244. n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
  245. return n
  246. }
// quickSpan scans src from i up to end and returns a boundary n such that
// the scanned text before n is already in form f, together with ok, which
// is true when the scan reached end (or an incomplete rune) without
// finding any non-normalized parts. If atEOF is false, n will not point
// past the last segment if this segment might become non-normalized by
// appending other runes.
func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
	// lastCC is the ccc of the previously accepted rune; it is compared
	// with the ccc of the next rune within the same segment.
	var lastCC uint8
	ss := streamSafe(0)
	// lastSegStart is the position returned whenever the scan stops early.
	lastSegStart := i
	for n = end; i < n; {
		if j := src.skipASCII(i, n); i != j {
			// A run of ASCII was skipped: reset the per-segment state and
			// let the last skipped byte start the current segment.
			i = j
			lastSegStart = i - 1
			lastCC = 0
			ss = 0
			continue
		}
		info := f.info(src, i)
		if info.size == 0 {
			if atEOF {
				// include incomplete runes
				return n, true
			}
			return lastSegStart, true
		}
		// This block needs to be before the next, because it is possible to
		// have an overflow for runes that are starters (e.g. with U+FF9E).
		switch ss.next(info) {
		case ssStarter:
			ss.first(info)
			lastSegStart = i
		case ssOverflow:
			return lastSegStart, false
		case ssSuccess:
			// A ccc lower than its predecessor's stops the quick span.
			if lastCC > info.ccc {
				return lastSegStart, false
			}
		}
		// Leave the loop at the first rune that is not quick-check "yes"
		// for the kind of form (composing or decomposing) being checked.
		if f.composing {
			if !info.isYesC() {
				break
			}
		} else {
			if !info.isYesD() {
				break
			}
		}
		lastCC = info.ccc
		i += int(info.size)
	}
	if i == n {
		// Everything up to end passed the checks.
		if !atEOF {
			n = lastSegStart
		}
		return n, true
	}
	return lastSegStart, false
}
  304. // QuickSpanString returns a boundary n such that b[0:n] == f(s[0:n]).
  305. // It is not guaranteed to return the largest such n.
  306. func (f Form) QuickSpanString(s string) int {
  307. n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
  308. return n
  309. }
  310. // FirstBoundary returns the position i of the first boundary in b
  311. // or -1 if b contains no boundary.
  312. func (f Form) FirstBoundary(b []byte) int {
  313. return f.firstBoundary(inputBytes(b), len(b))
  314. }
  315. func (f Form) firstBoundary(src input, nsrc int) int {
  316. i := src.skipContinuationBytes(0)
  317. if i >= nsrc {
  318. return -1
  319. }
  320. fd := formTable[f]
  321. ss := streamSafe(0)
  322. // We should call ss.first here, but we can't as the first rune is
  323. // skipped already. This means FirstBoundary can't really determine
  324. // CGJ insertion points correctly. Luckily it doesn't have to.
  325. // TODO: consider adding NextBoundary
  326. for {
  327. info := fd.info(src, i)
  328. if info.size == 0 {
  329. return -1
  330. }
  331. if s := ss.next(info); s != ssSuccess {
  332. return i
  333. }
  334. i += int(info.size)
  335. if i >= nsrc {
  336. if !info.BoundaryAfter() && !ss.isMax() {
  337. return -1
  338. }
  339. return nsrc
  340. }
  341. }
  342. }
  343. // FirstBoundaryInString returns the position i of the first boundary in s
  344. // or -1 if s contains no boundary.
  345. func (f Form) FirstBoundaryInString(s string) int {
  346. return f.firstBoundary(inputString(s), len(s))
  347. }
  348. // LastBoundary returns the position i of the last boundary in b
  349. // or -1 if b contains no boundary.
  350. func (f Form) LastBoundary(b []byte) int {
  351. return lastBoundary(formTable[f], b)
  352. }
// lastBoundary returns the position of the last boundary in b for the form
// described by fd, or -1 if no boundary is found. It scans backwards from
// the end of b, one rune start at a time.
func lastBoundary(fd *formInfo, b []byte) int {
	i := len(b)
	info, p := lastRuneStart(fd, b)
	if p == -1 {
		// b holds no rune start at all.
		return -1
	}
	if info.size == 0 { // ends with incomplete rune
		if p == 0 { // starts with incomplete rune
			return -1
		}
		// Ignore the incomplete rune and look at the rune before it.
		i = p
		info, p = lastRuneStart(fd, b[:i])
		if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
			return i
		}
	}
	if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
		return i
	}
	if info.BoundaryAfter() {
		return i
	}
	// Walk backwards until the stream-safe tracker reports a starter or an
	// overflow, or until ill-formed input is encountered.
	ss := streamSafe(0)
	v := ss.backwards(info)
	for i = p; i >= 0 && v != ssStarter; i = p {
		info, p = lastRuneStart(fd, b[:i])
		if v = ss.backwards(info); v == ssOverflow {
			break
		}
		if p+int(info.size) != i {
			if p == -1 { // no boundary found
				return -1
			}
			return i // boundary after an illegal UTF-8 encoding
		}
	}
	return i
}
// decomposeSegment scans the first segment in src, starting at sp, into rb
// and flushes it. It inserts 0x034f (Grapheme Joiner) when the stream-safe
// tracker reports an overflow (per the original note: a sequence of more
// than 30 non-starters). It returns the position in src just past the
// consumed segment, 0 if the rune at sp has size 0, or one of the negative
// status codes iShortDst / iShortSrc (or the error from insertFlush)
// converted to int.
func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
	// Force one character to be consumed.
	info := rb.f.info(rb.src, sp)
	if info.size == 0 {
		return 0
	}
	if rb.nrune > 0 {
		// The buffer already holds runes: if the new rune does not continue
		// them, flush what is buffered without consuming anything from src.
		if s := rb.ss.next(info); s == ssStarter {
			goto end
		} else if s == ssOverflow {
			rb.insertCGJ()
			goto end
		}
	} else {
		rb.ss.first(info)
	}
	if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
		return int(err)
	}
	for {
		sp += int(info.size)
		if sp >= rb.nsrc {
			// Out of input: without atEOF the segment may still continue.
			if !atEOF && !info.BoundaryAfter() {
				return int(iShortSrc)
			}
			break
		}
		info = rb.f.info(rb.src, sp)
		if info.size == 0 {
			if !atEOF {
				return int(iShortSrc)
			}
			break
		}
		if s := rb.ss.next(info); s == ssStarter {
			// The next rune starts a new segment.
			break
		} else if s == ssOverflow {
			rb.insertCGJ()
			break
		}
		if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
			return int(err)
		}
	}
end:
	if !rb.doFlush() {
		return int(iShortDst)
	}
	return sp
}
  444. // lastRuneStart returns the runeInfo and position of the last
  445. // rune in buf or the zero runeInfo and -1 if no rune was found.
  446. func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
  447. p := len(buf) - 1
  448. for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
  449. }
  450. if p < 0 {
  451. return Properties{}, -1
  452. }
  453. return fd.info(inputBytes(buf), p), p
  454. }
// decomposeToLastBoundary finds an open segment at the end of rb.out
// and scans it into rb. It has no return value: rb.out is truncated in
// place so that it no longer contains that segment, and rb.ss is set to
// the stream-safe state computed while scanning backwards.
func decomposeToLastBoundary(rb *reorderBuffer) {
	fd := &rb.f
	info, i := lastRuneStart(fd, rb.out)
	if int(info.size) != len(rb.out)-i {
		// illegal trailing continuation bytes
		return
	}
	if info.BoundaryAfter() {
		// The buffer ends on a boundary, so there is no open segment.
		return
	}
	var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
	padd := 0
	ss := streamSafe(0)
	p := len(rb.out)
	// Walk backwards over rb.out, recording each rune until a starter,
	// an overflow, the start of the buffer, or ill-formed input is reached.
	for {
		add[padd] = info
		v := ss.backwards(info)
		if v == ssOverflow {
			// Note that if we have an overflow, then the string we are
			// appending to is not correctly normalized. In this case the
			// behavior is undefined.
			break
		}
		padd++
		p -= int(info.size)
		if v == ssStarter || p < 0 {
			break
		}
		info, i = lastRuneStart(fd, rb.out[:p])
		if int(info.size) != p-i {
			break
		}
	}
	rb.ss = ss
	// Copy bytes for insertion as we may need to overwrite rb.out.
	var buf [maxBufferSize * utf8.UTFMax]byte
	cp := buf[:copy(buf[:], rb.out[p:])]
	rb.out = rb.out[:p]
	// Re-insert the recorded runes in their original (forward) order.
	for padd--; padd >= 0; padd-- {
		info = add[padd]
		rb.insertUnsafe(inputBytes(cp), 0, info)
		cp = cp[info.size:]
	}
}