Nie możesz wybrać więcej, niż 25 tematów Tematy muszą się zaczynać od litery lub cyfry, mogą zawierać myślniki ('-') i mogą mieć do 35 znaków.
 
 
 

979 wiersze
24 KiB

  1. // Copyright 2011 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // +build ignore
  5. // Normalization table generator.
  6. // Data read from the web.
  7. // See forminfo.go for a description of the trie values associated with each rune.
  8. package main
  9. import (
  10. "bytes"
  11. "flag"
  12. "fmt"
  13. "io"
  14. "log"
  15. "sort"
  16. "strconv"
  17. "strings"
  18. "golang.org/x/text/internal/gen"
  19. "golang.org/x/text/internal/triegen"
  20. "golang.org/x/text/internal/ucd"
  21. )
  22. func main() {
  23. gen.Init()
  24. loadUnicodeData()
  25. compactCCC()
  26. loadCompositionExclusions()
  27. completeCharFields(FCanonical)
  28. completeCharFields(FCompatibility)
  29. computeNonStarterCounts()
  30. verifyComputed()
  31. printChars()
  32. if *test {
  33. testDerived()
  34. printTestdata()
  35. } else {
  36. makeTables()
  37. }
  38. }
  39. var (
  40. tablelist = flag.String("tables",
  41. "all",
  42. "comma-separated list of which tables to generate; "+
  43. "can be 'decomp', 'recomp', 'info' and 'all'")
  44. test = flag.Bool("test",
  45. false,
  46. "test existing tables against DerivedNormalizationProps and generate test data for regression testing")
  47. verbose = flag.Bool("verbose",
  48. false,
  49. "write data to stdout as it is parsed")
  50. )
  51. const MaxChar = 0x10FFFF // anything above this shouldn't exist
  52. // Quick Check properties of runes allow us to quickly
  53. // determine whether a rune may occur in a normal form.
  54. // For a given normal form, a rune may be guaranteed to occur
  55. // verbatim (QC=Yes), may or may not combine with another
  56. // rune (QC=Maybe), or may not occur (QC=No).
  57. type QCResult int
  58. const (
  59. QCUnknown QCResult = iota
  60. QCYes
  61. QCNo
  62. QCMaybe
  63. )
  64. func (r QCResult) String() string {
  65. switch r {
  66. case QCYes:
  67. return "Yes"
  68. case QCNo:
  69. return "No"
  70. case QCMaybe:
  71. return "Maybe"
  72. }
  73. return "***UNKNOWN***"
  74. }
  75. const (
  76. FCanonical = iota // NFC or NFD
  77. FCompatibility // NFKC or NFKD
  78. FNumberOfFormTypes
  79. )
  80. const (
  81. MComposed = iota // NFC or NFKC
  82. MDecomposed // NFD or NFKD
  83. MNumberOfModes
  84. )
  85. // This contains only the properties we're interested in.
  86. type Char struct {
  87. name string
  88. codePoint rune // if zero, this index is not a valid code point.
  89. ccc uint8 // canonical combining class
  90. origCCC uint8
  91. excludeInComp bool // from CompositionExclusions.txt
  92. compatDecomp bool // it has a compatibility expansion
  93. nTrailingNonStarters uint8
  94. nLeadingNonStarters uint8 // must be equal to trailing if non-zero
  95. forms [FNumberOfFormTypes]FormInfo // For FCanonical and FCompatibility
  96. state State
  97. }
  98. var chars = make([]Char, MaxChar+1)
  99. var cccMap = make(map[uint8]uint8)
  100. func (c Char) String() string {
  101. buf := new(bytes.Buffer)
  102. fmt.Fprintf(buf, "%U [%s]:\n", c.codePoint, c.name)
  103. fmt.Fprintf(buf, " ccc: %v\n", c.ccc)
  104. fmt.Fprintf(buf, " excludeInComp: %v\n", c.excludeInComp)
  105. fmt.Fprintf(buf, " compatDecomp: %v\n", c.compatDecomp)
  106. fmt.Fprintf(buf, " state: %v\n", c.state)
  107. fmt.Fprintf(buf, " NFC:\n")
  108. fmt.Fprint(buf, c.forms[FCanonical])
  109. fmt.Fprintf(buf, " NFKC:\n")
  110. fmt.Fprint(buf, c.forms[FCompatibility])
  111. return buf.String()
  112. }
  113. // In UnicodeData.txt, some ranges are marked like this:
  114. // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
  115. // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
  116. // parseCharacter keeps a state variable indicating the weirdness.
  117. type State int
  118. const (
  119. SNormal State = iota // known to be zero for the type
  120. SFirst
  121. SLast
  122. SMissing
  123. )
  124. var lastChar = rune('\u0000')
  125. func (c Char) isValid() bool {
  126. return c.codePoint != 0 && c.state != SMissing
  127. }
  128. type FormInfo struct {
  129. quickCheck [MNumberOfModes]QCResult // index: MComposed or MDecomposed
  130. verified [MNumberOfModes]bool // index: MComposed or MDecomposed
  131. combinesForward bool // May combine with rune on the right
  132. combinesBackward bool // May combine with rune on the left
  133. isOneWay bool // Never appears in result
  134. inDecomp bool // Some decompositions result in this char.
  135. decomp Decomposition
  136. expandedDecomp Decomposition
  137. }
  138. func (f FormInfo) String() string {
  139. buf := bytes.NewBuffer(make([]byte, 0))
  140. fmt.Fprintf(buf, " quickCheck[C]: %v\n", f.quickCheck[MComposed])
  141. fmt.Fprintf(buf, " quickCheck[D]: %v\n", f.quickCheck[MDecomposed])
  142. fmt.Fprintf(buf, " cmbForward: %v\n", f.combinesForward)
  143. fmt.Fprintf(buf, " cmbBackward: %v\n", f.combinesBackward)
  144. fmt.Fprintf(buf, " isOneWay: %v\n", f.isOneWay)
  145. fmt.Fprintf(buf, " inDecomp: %v\n", f.inDecomp)
  146. fmt.Fprintf(buf, " decomposition: %X\n", f.decomp)
  147. fmt.Fprintf(buf, " expandedDecomp: %X\n", f.expandedDecomp)
  148. return buf.String()
  149. }
  150. type Decomposition []rune
  151. func parseDecomposition(s string, skipfirst bool) (a []rune, err error) {
  152. decomp := strings.Split(s, " ")
  153. if len(decomp) > 0 && skipfirst {
  154. decomp = decomp[1:]
  155. }
  156. for _, d := range decomp {
  157. point, err := strconv.ParseUint(d, 16, 64)
  158. if err != nil {
  159. return a, err
  160. }
  161. a = append(a, rune(point))
  162. }
  163. return a, nil
  164. }
  165. func loadUnicodeData() {
  166. f := gen.OpenUCDFile("UnicodeData.txt")
  167. defer f.Close()
  168. p := ucd.New(f)
  169. for p.Next() {
  170. r := p.Rune(ucd.CodePoint)
  171. char := &chars[r]
  172. char.ccc = uint8(p.Uint(ucd.CanonicalCombiningClass))
  173. decmap := p.String(ucd.DecompMapping)
  174. exp, err := parseDecomposition(decmap, false)
  175. isCompat := false
  176. if err != nil {
  177. if len(decmap) > 0 {
  178. exp, err = parseDecomposition(decmap, true)
  179. if err != nil {
  180. log.Fatalf(`%U: bad decomp |%v|: "%s"`, r, decmap, err)
  181. }
  182. isCompat = true
  183. }
  184. }
  185. char.name = p.String(ucd.Name)
  186. char.codePoint = r
  187. char.forms[FCompatibility].decomp = exp
  188. if !isCompat {
  189. char.forms[FCanonical].decomp = exp
  190. } else {
  191. char.compatDecomp = true
  192. }
  193. if len(decmap) > 0 {
  194. char.forms[FCompatibility].decomp = exp
  195. }
  196. }
  197. if err := p.Err(); err != nil {
  198. log.Fatal(err)
  199. }
  200. }
  201. // compactCCC converts the sparse set of CCC values to a continguous one,
  202. // reducing the number of bits needed from 8 to 6.
  203. func compactCCC() {
  204. m := make(map[uint8]uint8)
  205. for i := range chars {
  206. c := &chars[i]
  207. m[c.ccc] = 0
  208. }
  209. cccs := []int{}
  210. for v, _ := range m {
  211. cccs = append(cccs, int(v))
  212. }
  213. sort.Ints(cccs)
  214. for i, c := range cccs {
  215. cccMap[uint8(i)] = uint8(c)
  216. m[uint8(c)] = uint8(i)
  217. }
  218. for i := range chars {
  219. c := &chars[i]
  220. c.origCCC = c.ccc
  221. c.ccc = m[c.ccc]
  222. }
  223. if len(m) >= 1<<6 {
  224. log.Fatalf("too many difference CCC values: %d >= 64", len(m))
  225. }
  226. }
  227. // CompositionExclusions.txt has form:
  228. // 0958 # ...
  229. // See http://unicode.org/reports/tr44/ for full explanation
  230. func loadCompositionExclusions() {
  231. f := gen.OpenUCDFile("CompositionExclusions.txt")
  232. defer f.Close()
  233. p := ucd.New(f)
  234. for p.Next() {
  235. c := &chars[p.Rune(0)]
  236. if c.excludeInComp {
  237. log.Fatalf("%U: Duplicate entry in exclusions.", c.codePoint)
  238. }
  239. c.excludeInComp = true
  240. }
  241. if e := p.Err(); e != nil {
  242. log.Fatal(e)
  243. }
  244. }
  245. // hasCompatDecomp returns true if any of the recursive
  246. // decompositions contains a compatibility expansion.
  247. // In this case, the character may not occur in NFK*.
  248. func hasCompatDecomp(r rune) bool {
  249. c := &chars[r]
  250. if c.compatDecomp {
  251. return true
  252. }
  253. for _, d := range c.forms[FCompatibility].decomp {
  254. if hasCompatDecomp(d) {
  255. return true
  256. }
  257. }
  258. return false
  259. }
  260. // Hangul related constants.
  261. const (
  262. HangulBase = 0xAC00
  263. HangulEnd = 0xD7A4 // hangulBase + Jamo combinations (19 * 21 * 28)
  264. JamoLBase = 0x1100
  265. JamoLEnd = 0x1113
  266. JamoVBase = 0x1161
  267. JamoVEnd = 0x1176
  268. JamoTBase = 0x11A8
  269. JamoTEnd = 0x11C3
  270. JamoLVTCount = 19 * 21 * 28
  271. JamoTCount = 28
  272. )
  273. func isHangul(r rune) bool {
  274. return HangulBase <= r && r < HangulEnd
  275. }
  276. func isHangulWithoutJamoT(r rune) bool {
  277. if !isHangul(r) {
  278. return false
  279. }
  280. r -= HangulBase
  281. return r < JamoLVTCount && r%JamoTCount == 0
  282. }
  283. func ccc(r rune) uint8 {
  284. return chars[r].ccc
  285. }
  286. // Insert a rune in a buffer, ordered by Canonical Combining Class.
  287. func insertOrdered(b Decomposition, r rune) Decomposition {
  288. n := len(b)
  289. b = append(b, 0)
  290. cc := ccc(r)
  291. if cc > 0 {
  292. // Use bubble sort.
  293. for ; n > 0; n-- {
  294. if ccc(b[n-1]) <= cc {
  295. break
  296. }
  297. b[n] = b[n-1]
  298. }
  299. }
  300. b[n] = r
  301. return b
  302. }
  303. // Recursively decompose.
  304. func decomposeRecursive(form int, r rune, d Decomposition) Decomposition {
  305. dcomp := chars[r].forms[form].decomp
  306. if len(dcomp) == 0 {
  307. return insertOrdered(d, r)
  308. }
  309. for _, c := range dcomp {
  310. d = decomposeRecursive(form, c, d)
  311. }
  312. return d
  313. }
  314. func completeCharFields(form int) {
  315. // Phase 0: pre-expand decomposition.
  316. for i := range chars {
  317. f := &chars[i].forms[form]
  318. if len(f.decomp) == 0 {
  319. continue
  320. }
  321. exp := make(Decomposition, 0)
  322. for _, c := range f.decomp {
  323. exp = decomposeRecursive(form, c, exp)
  324. }
  325. f.expandedDecomp = exp
  326. }
  327. // Phase 1: composition exclusion, mark decomposition.
  328. for i := range chars {
  329. c := &chars[i]
  330. f := &c.forms[form]
  331. // Marks script-specific exclusions and version restricted.
  332. f.isOneWay = c.excludeInComp
  333. // Singletons
  334. f.isOneWay = f.isOneWay || len(f.decomp) == 1
  335. // Non-starter decompositions
  336. if len(f.decomp) > 1 {
  337. chk := c.ccc != 0 || chars[f.decomp[0]].ccc != 0
  338. f.isOneWay = f.isOneWay || chk
  339. }
  340. // Runes that decompose into more than two runes.
  341. f.isOneWay = f.isOneWay || len(f.decomp) > 2
  342. if form == FCompatibility {
  343. f.isOneWay = f.isOneWay || hasCompatDecomp(c.codePoint)
  344. }
  345. for _, r := range f.decomp {
  346. chars[r].forms[form].inDecomp = true
  347. }
  348. }
  349. // Phase 2: forward and backward combining.
  350. for i := range chars {
  351. c := &chars[i]
  352. f := &c.forms[form]
  353. if !f.isOneWay && len(f.decomp) == 2 {
  354. f0 := &chars[f.decomp[0]].forms[form]
  355. f1 := &chars[f.decomp[1]].forms[form]
  356. if !f0.isOneWay {
  357. f0.combinesForward = true
  358. }
  359. if !f1.isOneWay {
  360. f1.combinesBackward = true
  361. }
  362. }
  363. if isHangulWithoutJamoT(rune(i)) {
  364. f.combinesForward = true
  365. }
  366. }
  367. // Phase 3: quick check values.
  368. for i := range chars {
  369. c := &chars[i]
  370. f := &c.forms[form]
  371. switch {
  372. case len(f.decomp) > 0:
  373. f.quickCheck[MDecomposed] = QCNo
  374. case isHangul(rune(i)):
  375. f.quickCheck[MDecomposed] = QCNo
  376. default:
  377. f.quickCheck[MDecomposed] = QCYes
  378. }
  379. switch {
  380. case f.isOneWay:
  381. f.quickCheck[MComposed] = QCNo
  382. case (i & 0xffff00) == JamoLBase:
  383. f.quickCheck[MComposed] = QCYes
  384. if JamoLBase <= i && i < JamoLEnd {
  385. f.combinesForward = true
  386. }
  387. if JamoVBase <= i && i < JamoVEnd {
  388. f.quickCheck[MComposed] = QCMaybe
  389. f.combinesBackward = true
  390. f.combinesForward = true
  391. }
  392. if JamoTBase <= i && i < JamoTEnd {
  393. f.quickCheck[MComposed] = QCMaybe
  394. f.combinesBackward = true
  395. }
  396. case !f.combinesBackward:
  397. f.quickCheck[MComposed] = QCYes
  398. default:
  399. f.quickCheck[MComposed] = QCMaybe
  400. }
  401. }
  402. }
  403. func computeNonStarterCounts() {
  404. // Phase 4: leading and trailing non-starter count
  405. for i := range chars {
  406. c := &chars[i]
  407. runes := []rune{rune(i)}
  408. // We always use FCompatibility so that the CGJ insertion points do not
  409. // change for repeated normalizations with different forms.
  410. if exp := c.forms[FCompatibility].expandedDecomp; len(exp) > 0 {
  411. runes = exp
  412. }
  413. // We consider runes that combine backwards to be non-starters for the
  414. // purpose of Stream-Safe Text Processing.
  415. for _, r := range runes {
  416. if cr := &chars[r]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward {
  417. break
  418. }
  419. c.nLeadingNonStarters++
  420. }
  421. for i := len(runes) - 1; i >= 0; i-- {
  422. if cr := &chars[runes[i]]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward {
  423. break
  424. }
  425. c.nTrailingNonStarters++
  426. }
  427. if c.nTrailingNonStarters > 3 {
  428. log.Fatalf("%U: Decomposition with more than 3 (%d) trailing modifiers (%U)", i, c.nTrailingNonStarters, runes)
  429. }
  430. if isHangul(rune(i)) {
  431. c.nTrailingNonStarters = 2
  432. if isHangulWithoutJamoT(rune(i)) {
  433. c.nTrailingNonStarters = 1
  434. }
  435. }
  436. if l, t := c.nLeadingNonStarters, c.nTrailingNonStarters; l > 0 && l != t {
  437. log.Fatalf("%U: number of leading and trailing non-starters should be equal (%d vs %d)", i, l, t)
  438. }
  439. if t := c.nTrailingNonStarters; t > 3 {
  440. log.Fatalf("%U: number of trailing non-starters is %d > 3", t)
  441. }
  442. }
  443. }
  444. func printBytes(w io.Writer, b []byte, name string) {
  445. fmt.Fprintf(w, "// %s: %d bytes\n", name, len(b))
  446. fmt.Fprintf(w, "var %s = [...]byte {", name)
  447. for i, c := range b {
  448. switch {
  449. case i%64 == 0:
  450. fmt.Fprintf(w, "\n// Bytes %x - %x\n", i, i+63)
  451. case i%8 == 0:
  452. fmt.Fprintf(w, "\n")
  453. }
  454. fmt.Fprintf(w, "0x%.2X, ", c)
  455. }
  456. fmt.Fprint(w, "\n}\n\n")
  457. }
  458. // See forminfo.go for format.
  459. func makeEntry(f *FormInfo, c *Char) uint16 {
  460. e := uint16(0)
  461. if r := c.codePoint; HangulBase <= r && r < HangulEnd {
  462. e |= 0x40
  463. }
  464. if f.combinesForward {
  465. e |= 0x20
  466. }
  467. if f.quickCheck[MDecomposed] == QCNo {
  468. e |= 0x4
  469. }
  470. switch f.quickCheck[MComposed] {
  471. case QCYes:
  472. case QCNo:
  473. e |= 0x10
  474. case QCMaybe:
  475. e |= 0x18
  476. default:
  477. log.Fatalf("Illegal quickcheck value %v.", f.quickCheck[MComposed])
  478. }
  479. e |= uint16(c.nTrailingNonStarters)
  480. return e
  481. }
  482. // decompSet keeps track of unique decompositions, grouped by whether
  483. // the decomposition is followed by a trailing and/or leading CCC.
  484. type decompSet [7]map[string]bool
  485. const (
  486. normalDecomp = iota
  487. firstMulti
  488. firstCCC
  489. endMulti
  490. firstLeadingCCC
  491. firstCCCZeroExcept
  492. firstStarterWithNLead
  493. lastDecomp
  494. )
  495. var cname = []string{"firstMulti", "firstCCC", "endMulti", "firstLeadingCCC", "firstCCCZeroExcept", "firstStarterWithNLead", "lastDecomp"}
  496. func makeDecompSet() decompSet {
  497. m := decompSet{}
  498. for i := range m {
  499. m[i] = make(map[string]bool)
  500. }
  501. return m
  502. }
  503. func (m *decompSet) insert(key int, s string) {
  504. m[key][s] = true
  505. }
  506. func printCharInfoTables(w io.Writer) int {
  507. mkstr := func(r rune, f *FormInfo) (int, string) {
  508. d := f.expandedDecomp
  509. s := string([]rune(d))
  510. if max := 1 << 6; len(s) >= max {
  511. const msg = "%U: too many bytes in decomposition: %d >= %d"
  512. log.Fatalf(msg, r, len(s), max)
  513. }
  514. head := uint8(len(s))
  515. if f.quickCheck[MComposed] != QCYes {
  516. head |= 0x40
  517. }
  518. if f.combinesForward {
  519. head |= 0x80
  520. }
  521. s = string([]byte{head}) + s
  522. lccc := ccc(d[0])
  523. tccc := ccc(d[len(d)-1])
  524. cc := ccc(r)
  525. if cc != 0 && lccc == 0 && tccc == 0 {
  526. log.Fatalf("%U: trailing and leading ccc are 0 for non-zero ccc %d", r, cc)
  527. }
  528. if tccc < lccc && lccc != 0 {
  529. const msg = "%U: lccc (%d) must be <= tcc (%d)"
  530. log.Fatalf(msg, r, lccc, tccc)
  531. }
  532. index := normalDecomp
  533. nTrail := chars[r].nTrailingNonStarters
  534. if tccc > 0 || lccc > 0 || nTrail > 0 {
  535. tccc <<= 2
  536. tccc |= nTrail
  537. s += string([]byte{tccc})
  538. index = endMulti
  539. for _, r := range d[1:] {
  540. if ccc(r) == 0 {
  541. index = firstCCC
  542. }
  543. }
  544. if lccc > 0 {
  545. s += string([]byte{lccc})
  546. if index == firstCCC {
  547. log.Fatalf("%U: multi-segment decomposition not supported for decompositions with leading CCC != 0", r)
  548. }
  549. index = firstLeadingCCC
  550. }
  551. if cc != lccc {
  552. if cc != 0 {
  553. log.Fatalf("%U: for lccc != ccc, expected ccc to be 0; was %d", r, cc)
  554. }
  555. index = firstCCCZeroExcept
  556. }
  557. } else if len(d) > 1 {
  558. index = firstMulti
  559. }
  560. return index, s
  561. }
  562. decompSet := makeDecompSet()
  563. const nLeadStr = "\x00\x01" // 0-byte length and tccc with nTrail.
  564. decompSet.insert(firstStarterWithNLead, nLeadStr)
  565. // Store the uniqued decompositions in a byte buffer,
  566. // preceded by their byte length.
  567. for _, c := range chars {
  568. for _, f := range c.forms {
  569. if len(f.expandedDecomp) == 0 {
  570. continue
  571. }
  572. if f.combinesBackward {
  573. log.Fatalf("%U: combinesBackward and decompose", c.codePoint)
  574. }
  575. index, s := mkstr(c.codePoint, &f)
  576. decompSet.insert(index, s)
  577. }
  578. }
  579. decompositions := bytes.NewBuffer(make([]byte, 0, 10000))
  580. size := 0
  581. positionMap := make(map[string]uint16)
  582. decompositions.WriteString("\000")
  583. fmt.Fprintln(w, "const (")
  584. for i, m := range decompSet {
  585. sa := []string{}
  586. for s := range m {
  587. sa = append(sa, s)
  588. }
  589. sort.Strings(sa)
  590. for _, s := range sa {
  591. p := decompositions.Len()
  592. decompositions.WriteString(s)
  593. positionMap[s] = uint16(p)
  594. }
  595. if cname[i] != "" {
  596. fmt.Fprintf(w, "%s = 0x%X\n", cname[i], decompositions.Len())
  597. }
  598. }
  599. fmt.Fprintln(w, "maxDecomp = 0x8000")
  600. fmt.Fprintln(w, ")")
  601. b := decompositions.Bytes()
  602. printBytes(w, b, "decomps")
  603. size += len(b)
  604. varnames := []string{"nfc", "nfkc"}
  605. for i := 0; i < FNumberOfFormTypes; i++ {
  606. trie := triegen.NewTrie(varnames[i])
  607. for r, c := range chars {
  608. f := c.forms[i]
  609. d := f.expandedDecomp
  610. if len(d) != 0 {
  611. _, key := mkstr(c.codePoint, &f)
  612. trie.Insert(rune(r), uint64(positionMap[key]))
  613. if c.ccc != ccc(d[0]) {
  614. // We assume the lead ccc of a decomposition !=0 in this case.
  615. if ccc(d[0]) == 0 {
  616. log.Fatalf("Expected leading CCC to be non-zero; ccc is %d", c.ccc)
  617. }
  618. }
  619. } else if c.nLeadingNonStarters > 0 && len(f.expandedDecomp) == 0 && c.ccc == 0 && !f.combinesBackward {
  620. // Handle cases where it can't be detected that the nLead should be equal
  621. // to nTrail.
  622. trie.Insert(c.codePoint, uint64(positionMap[nLeadStr]))
  623. } else if v := makeEntry(&f, &c)<<8 | uint16(c.ccc); v != 0 {
  624. trie.Insert(c.codePoint, uint64(0x8000|v))
  625. }
  626. }
  627. sz, err := trie.Gen(w, triegen.Compact(&normCompacter{name: varnames[i]}))
  628. if err != nil {
  629. log.Fatal(err)
  630. }
  631. size += sz
  632. }
  633. return size
  634. }
  635. func contains(sa []string, s string) bool {
  636. for _, a := range sa {
  637. if a == s {
  638. return true
  639. }
  640. }
  641. return false
  642. }
  643. func makeTables() {
  644. w := &bytes.Buffer{}
  645. size := 0
  646. if *tablelist == "" {
  647. return
  648. }
  649. list := strings.Split(*tablelist, ",")
  650. if *tablelist == "all" {
  651. list = []string{"recomp", "info"}
  652. }
  653. // Compute maximum decomposition size.
  654. max := 0
  655. for _, c := range chars {
  656. if n := len(string(c.forms[FCompatibility].expandedDecomp)); n > max {
  657. max = n
  658. }
  659. }
  660. fmt.Fprintln(w, "const (")
  661. fmt.Fprintln(w, "\t// Version is the Unicode edition from which the tables are derived.")
  662. fmt.Fprintf(w, "\tVersion = %q\n", gen.UnicodeVersion())
  663. fmt.Fprintln(w)
  664. fmt.Fprintln(w, "\t// MaxTransformChunkSize indicates the maximum number of bytes that Transform")
  665. fmt.Fprintln(w, "\t// may need to write atomically for any Form. Making a destination buffer at")
  666. fmt.Fprintln(w, "\t// least this size ensures that Transform can always make progress and that")
  667. fmt.Fprintln(w, "\t// the user does not need to grow the buffer on an ErrShortDst.")
  668. fmt.Fprintf(w, "\tMaxTransformChunkSize = %d+maxNonStarters*4\n", len(string(0x034F))+max)
  669. fmt.Fprintln(w, ")\n")
  670. // Print the CCC remap table.
  671. size += len(cccMap)
  672. fmt.Fprintf(w, "var ccc = [%d]uint8{", len(cccMap))
  673. for i := 0; i < len(cccMap); i++ {
  674. if i%8 == 0 {
  675. fmt.Fprintln(w)
  676. }
  677. fmt.Fprintf(w, "%3d, ", cccMap[uint8(i)])
  678. }
  679. fmt.Fprintln(w, "\n}\n")
  680. if contains(list, "info") {
  681. size += printCharInfoTables(w)
  682. }
  683. if contains(list, "recomp") {
  684. // Note that we use 32 bit keys, instead of 64 bit.
  685. // This clips the bits of three entries, but we know
  686. // this won't cause a collision. The compiler will catch
  687. // any changes made to UnicodeData.txt that introduces
  688. // a collision.
  689. // Note that the recomposition map for NFC and NFKC
  690. // are identical.
  691. // Recomposition map
  692. nrentries := 0
  693. for _, c := range chars {
  694. f := c.forms[FCanonical]
  695. if !f.isOneWay && len(f.decomp) > 0 {
  696. nrentries++
  697. }
  698. }
  699. sz := nrentries * 8
  700. size += sz
  701. fmt.Fprintf(w, "// recompMap: %d bytes (entries only)\n", sz)
  702. fmt.Fprintln(w, "var recompMap = map[uint32]rune{")
  703. for i, c := range chars {
  704. f := c.forms[FCanonical]
  705. d := f.decomp
  706. if !f.isOneWay && len(d) > 0 {
  707. key := uint32(uint16(d[0]))<<16 + uint32(uint16(d[1]))
  708. fmt.Fprintf(w, "0x%.8X: 0x%.4X,\n", key, i)
  709. }
  710. }
  711. fmt.Fprintf(w, "}\n\n")
  712. }
  713. fmt.Fprintf(w, "// Total size of tables: %dKB (%d bytes)\n", (size+512)/1024, size)
  714. gen.WriteGoFile("tables.go", "norm", w.Bytes())
  715. }
  716. func printChars() {
  717. if *verbose {
  718. for _, c := range chars {
  719. if !c.isValid() || c.state == SMissing {
  720. continue
  721. }
  722. fmt.Println(c)
  723. }
  724. }
  725. }
  726. // verifyComputed does various consistency tests.
  727. func verifyComputed() {
  728. for i, c := range chars {
  729. for _, f := range c.forms {
  730. isNo := (f.quickCheck[MDecomposed] == QCNo)
  731. if (len(f.decomp) > 0) != isNo && !isHangul(rune(i)) {
  732. log.Fatalf("%U: NF*D QC must be No if rune decomposes", i)
  733. }
  734. isMaybe := f.quickCheck[MComposed] == QCMaybe
  735. if f.combinesBackward != isMaybe {
  736. log.Fatalf("%U: NF*C QC must be Maybe if combinesBackward", i)
  737. }
  738. if len(f.decomp) > 0 && f.combinesForward && isMaybe {
  739. log.Fatalf("%U: NF*C QC must be Yes or No if combinesForward and decomposes", i)
  740. }
  741. if len(f.expandedDecomp) != 0 {
  742. continue
  743. }
  744. if a, b := c.nLeadingNonStarters > 0, (c.ccc > 0 || f.combinesBackward); a != b {
  745. // We accept these runes to be treated differently (it only affects
  746. // segment breaking in iteration, most likely on improper use), but
  747. // reconsider if more characters are added.
  748. // U+FF9E HALFWIDTH KATAKANA VOICED SOUND MARK;Lm;0;L;<narrow> 3099;;;;N;;;;;
  749. // U+FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK;Lm;0;L;<narrow> 309A;;;;N;;;;;
  750. // U+3133 HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<compat> 11AA;;;;N;HANGUL LETTER GIYEOG SIOS;;;;
  751. // U+318E HANGUL LETTER ARAEAE;Lo;0;L;<compat> 11A1;;;;N;HANGUL LETTER ALAE AE;;;;
  752. // U+FFA3 HALFWIDTH HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<narrow> 3133;;;;N;HALFWIDTH HANGUL LETTER GIYEOG SIOS;;;;
  753. // U+FFDC HALFWIDTH HANGUL LETTER I;Lo;0;L;<narrow> 3163;;;;N;;;;;
  754. if i != 0xFF9E && i != 0xFF9F && !(0x3133 <= i && i <= 0x318E) && !(0xFFA3 <= i && i <= 0xFFDC) {
  755. log.Fatalf("%U: nLead was %v; want %v", i, a, b)
  756. }
  757. }
  758. }
  759. nfc := c.forms[FCanonical]
  760. nfkc := c.forms[FCompatibility]
  761. if nfc.combinesBackward != nfkc.combinesBackward {
  762. log.Fatalf("%U: Cannot combine combinesBackward\n", c.codePoint)
  763. }
  764. }
  765. }
  766. // Use values in DerivedNormalizationProps.txt to compare against the
  767. // values we computed.
  768. // DerivedNormalizationProps.txt has form:
  769. // 00C0..00C5 ; NFD_QC; N # ...
  770. // 0374 ; NFD_QC; N # ...
  771. // See http://unicode.org/reports/tr44/ for full explanation
  772. func testDerived() {
  773. f := gen.OpenUCDFile("DerivedNormalizationProps.txt")
  774. defer f.Close()
  775. p := ucd.New(f)
  776. for p.Next() {
  777. r := p.Rune(0)
  778. c := &chars[r]
  779. var ftype, mode int
  780. qt := p.String(1)
  781. switch qt {
  782. case "NFC_QC":
  783. ftype, mode = FCanonical, MComposed
  784. case "NFD_QC":
  785. ftype, mode = FCanonical, MDecomposed
  786. case "NFKC_QC":
  787. ftype, mode = FCompatibility, MComposed
  788. case "NFKD_QC":
  789. ftype, mode = FCompatibility, MDecomposed
  790. default:
  791. continue
  792. }
  793. var qr QCResult
  794. switch p.String(2) {
  795. case "Y":
  796. qr = QCYes
  797. case "N":
  798. qr = QCNo
  799. case "M":
  800. qr = QCMaybe
  801. default:
  802. log.Fatalf(`Unexpected quick check value "%s"`, p.String(2))
  803. }
  804. if got := c.forms[ftype].quickCheck[mode]; got != qr {
  805. log.Printf("%U: FAILED %s (was %v need %v)\n", r, qt, got, qr)
  806. }
  807. c.forms[ftype].verified[mode] = true
  808. }
  809. if err := p.Err(); err != nil {
  810. log.Fatal(err)
  811. }
  812. // Any unspecified value must be QCYes. Verify this.
  813. for i, c := range chars {
  814. for j, fd := range c.forms {
  815. for k, qr := range fd.quickCheck {
  816. if !fd.verified[k] && qr != QCYes {
  817. m := "%U: FAIL F:%d M:%d (was %v need Yes) %s\n"
  818. log.Printf(m, i, j, k, qr, c.name)
  819. }
  820. }
  821. }
  822. }
  823. }
  824. var testHeader = `const (
  825. Yes = iota
  826. No
  827. Maybe
  828. )
  829. type formData struct {
  830. qc uint8
  831. combinesForward bool
  832. decomposition string
  833. }
  834. type runeData struct {
  835. r rune
  836. ccc uint8
  837. nLead uint8
  838. nTrail uint8
  839. f [2]formData // 0: canonical; 1: compatibility
  840. }
  841. func f(qc uint8, cf bool, dec string) [2]formData {
  842. return [2]formData{{qc, cf, dec}, {qc, cf, dec}}
  843. }
  844. func g(qc, qck uint8, cf, cfk bool, d, dk string) [2]formData {
  845. return [2]formData{{qc, cf, d}, {qck, cfk, dk}}
  846. }
  847. var testData = []runeData{
  848. `
  849. func printTestdata() {
  850. type lastInfo struct {
  851. ccc uint8
  852. nLead uint8
  853. nTrail uint8
  854. f string
  855. }
  856. last := lastInfo{}
  857. w := &bytes.Buffer{}
  858. fmt.Fprintf(w, testHeader)
  859. for r, c := range chars {
  860. f := c.forms[FCanonical]
  861. qc, cf, d := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp)
  862. f = c.forms[FCompatibility]
  863. qck, cfk, dk := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp)
  864. s := ""
  865. if d == dk && qc == qck && cf == cfk {
  866. s = fmt.Sprintf("f(%s, %v, %q)", qc, cf, d)
  867. } else {
  868. s = fmt.Sprintf("g(%s, %s, %v, %v, %q, %q)", qc, qck, cf, cfk, d, dk)
  869. }
  870. current := lastInfo{c.ccc, c.nLeadingNonStarters, c.nTrailingNonStarters, s}
  871. if last != current {
  872. fmt.Fprintf(w, "\t{0x%x, %d, %d, %d, %s},\n", r, c.origCCC, c.nLeadingNonStarters, c.nTrailingNonStarters, s)
  873. last = current
  874. }
  875. }
  876. fmt.Fprintln(w, "}")
  877. gen.WriteGoFile("data_test.go", "norm", w.Bytes())
  878. }