No puede seleccionar más de 25 temas Los temas deben comenzar con una letra o número, pueden incluir guiones ('-') y pueden tener hasta 35 caracteres de largo.
 
 
 

840 líneas
22 KiB

  1. // Copyright 2014 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // +build ignore
  5. // This program generates the trie for casing operations. The Unicode casing
  6. // algorithm requires the lookup of various properties and mappings for each
  7. // rune. The table generated by this generator combines several of the most
  8. // frequently used of these into a single trie so that they can be accessed
  9. // with a single lookup.
  10. package main
  11. import (
  12. "bytes"
  13. "fmt"
  14. "io"
  15. "io/ioutil"
  16. "log"
  17. "reflect"
  18. "strconv"
  19. "strings"
  20. "unicode"
  21. "golang.org/x/text/internal/gen"
  22. "golang.org/x/text/internal/triegen"
  23. "golang.org/x/text/internal/ucd"
  24. "golang.org/x/text/unicode/norm"
  25. )
  26. func main() {
  27. gen.Init()
  28. genTables()
  29. genTablesTest()
  30. gen.Repackage("gen_trieval.go", "trieval.go", "cases")
  31. }
  32. // runeInfo contains all information for a rune that we care about for casing
  33. // operations.
  34. type runeInfo struct {
  35. Rune rune
  36. entry info // trie value for this rune.
  37. CaseMode info
  38. // Simple case mappings.
  39. Simple [1 + maxCaseMode][]rune
  40. // Special casing
  41. HasSpecial bool
  42. Conditional bool
  43. Special [1 + maxCaseMode][]rune
  44. // Folding
  45. FoldSimple rune
  46. FoldSpecial rune
  47. FoldFull []rune
  48. // TODO: FC_NFKC, or equivalent data.
  49. // Properties
  50. SoftDotted bool
  51. CaseIgnorable bool
  52. Cased bool
  53. DecomposeGreek bool
  54. BreakType string
  55. BreakCat breakCategory
  56. // We care mostly about 0, Above, and IotaSubscript.
  57. CCC byte
  58. }
  // breakCategory is the category onto which a rune's word break property is
  // collapsed.
  type breakCategory int

  // breakBreak is the zero value, so a rune for which no category was recorded
  // is a breakBreak rune.
  const (
  	breakBreak  breakCategory = 0
  	breakLetter breakCategory = 1
  	breakMid    breakCategory = 2
  )
  65. // mapping returns the case mapping for the given case type.
  66. func (r *runeInfo) mapping(c info) string {
  67. if r.HasSpecial {
  68. return string(r.Special[c])
  69. }
  70. if len(r.Simple[c]) != 0 {
  71. return string(r.Simple[c])
  72. }
  73. return string(r.Rune)
  74. }
  75. func parse(file string, f func(p *ucd.Parser)) {
  76. ucd.Parse(gen.OpenUCDFile(file), f)
  77. }
  78. func parseUCD() []runeInfo {
  79. chars := make([]runeInfo, unicode.MaxRune)
  80. get := func(r rune) *runeInfo {
  81. c := &chars[r]
  82. c.Rune = r
  83. return c
  84. }
  85. parse("UnicodeData.txt", func(p *ucd.Parser) {
  86. ri := get(p.Rune(0))
  87. ri.CCC = byte(p.Int(ucd.CanonicalCombiningClass))
  88. ri.Simple[cLower] = p.Runes(ucd.SimpleLowercaseMapping)
  89. ri.Simple[cUpper] = p.Runes(ucd.SimpleUppercaseMapping)
  90. ri.Simple[cTitle] = p.Runes(ucd.SimpleTitlecaseMapping)
  91. if p.String(ucd.GeneralCategory) == "Lt" {
  92. ri.CaseMode = cTitle
  93. }
  94. })
  95. // <code>; <property>
  96. parse("PropList.txt", func(p *ucd.Parser) {
  97. if p.String(1) == "Soft_Dotted" {
  98. chars[p.Rune(0)].SoftDotted = true
  99. }
  100. })
  101. // <code>; <word break type>
  102. parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
  103. ri := get(p.Rune(0))
  104. switch p.String(1) {
  105. case "Case_Ignorable":
  106. ri.CaseIgnorable = true
  107. case "Cased":
  108. ri.Cased = true
  109. case "Lowercase":
  110. ri.CaseMode = cLower
  111. case "Uppercase":
  112. ri.CaseMode = cUpper
  113. }
  114. })
  115. // <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)?
  116. parse("SpecialCasing.txt", func(p *ucd.Parser) {
  117. // We drop all conditional special casing and deal with them manually in
  118. // the language-specific case mappers. Rune 0x03A3 is the only one with
  119. // a conditional formatting that is not language-specific. However,
  120. // dealing with this letter is tricky, especially in a streaming
  121. // context, so we deal with it in the Caser for Greek specifically.
  122. ri := get(p.Rune(0))
  123. if p.String(4) == "" {
  124. ri.HasSpecial = true
  125. ri.Special[cLower] = p.Runes(1)
  126. ri.Special[cTitle] = p.Runes(2)
  127. ri.Special[cUpper] = p.Runes(3)
  128. } else {
  129. ri.Conditional = true
  130. }
  131. })
  132. // TODO: Use text breaking according to UAX #29.
  133. // <code>; <word break type>
  134. parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
  135. ri := get(p.Rune(0))
  136. ri.BreakType = p.String(1)
  137. // We collapse the word breaking properties onto the categories we need.
  138. switch p.String(1) { // TODO: officially we need to canonicalize.
  139. case "MidLetter", "MidNumLet", "Single_Quote":
  140. ri.BreakCat = breakMid
  141. if !ri.CaseIgnorable {
  142. // finalSigma relies on the fact that all breakMid runes are
  143. // also a Case_Ignorable. Revisit this code when this changes.
  144. log.Fatalf("Rune %U, which has a break category mid, is not a case ignorable", ri)
  145. }
  146. case "ALetter", "Hebrew_Letter", "Numeric", "Extend", "ExtendNumLet", "Format", "ZWJ":
  147. ri.BreakCat = breakLetter
  148. }
  149. })
  150. // <code>; <type>; <mapping>
  151. parse("CaseFolding.txt", func(p *ucd.Parser) {
  152. ri := get(p.Rune(0))
  153. switch p.String(1) {
  154. case "C":
  155. ri.FoldSimple = p.Rune(2)
  156. ri.FoldFull = p.Runes(2)
  157. case "S":
  158. ri.FoldSimple = p.Rune(2)
  159. case "T":
  160. ri.FoldSpecial = p.Rune(2)
  161. case "F":
  162. ri.FoldFull = p.Runes(2)
  163. default:
  164. log.Fatalf("%U: unknown type: %s", p.Rune(0), p.String(1))
  165. }
  166. })
  167. return chars
  168. }
  169. func genTables() {
  170. chars := parseUCD()
  171. verifyProperties(chars)
  172. t := triegen.NewTrie("case")
  173. for i := range chars {
  174. c := &chars[i]
  175. makeEntry(c)
  176. t.Insert(rune(i), uint64(c.entry))
  177. }
  178. w := gen.NewCodeWriter()
  179. defer w.WriteVersionedGoFile("tables.go", "cases")
  180. gen.WriteUnicodeVersion(w)
  181. // TODO: write CLDR version after adding a mechanism to detect that the
  182. // tables on which the manually created locale-sensitive casing code is
  183. // based hasn't changed.
  184. w.WriteVar("xorData", string(xorData))
  185. w.WriteVar("exceptions", string(exceptionData))
  186. sz, err := t.Gen(w, triegen.Compact(&sparseCompacter{}))
  187. if err != nil {
  188. log.Fatal(err)
  189. }
  190. w.Size += sz
  191. }
  192. func makeEntry(ri *runeInfo) {
  193. if ri.CaseIgnorable {
  194. if ri.Cased {
  195. ri.entry = cIgnorableCased
  196. } else {
  197. ri.entry = cIgnorableUncased
  198. }
  199. } else {
  200. ri.entry = ri.CaseMode
  201. }
  202. // TODO: handle soft-dotted.
  203. ccc := cccOther
  204. switch ri.CCC {
  205. case 0: // Not_Reordered
  206. ccc = cccZero
  207. case above: // Above
  208. ccc = cccAbove
  209. }
  210. switch ri.BreakCat {
  211. case breakBreak:
  212. ccc = cccBreak
  213. case breakMid:
  214. ri.entry |= isMidBit
  215. }
  216. ri.entry |= ccc
  217. if ri.CaseMode == cUncased {
  218. return
  219. }
  220. // Need to do something special.
  221. if ri.CaseMode == cTitle || ri.HasSpecial || ri.mapping(cTitle) != ri.mapping(cUpper) {
  222. makeException(ri)
  223. return
  224. }
  225. if f := string(ri.FoldFull); len(f) > 0 && f != ri.mapping(cUpper) && f != ri.mapping(cLower) {
  226. makeException(ri)
  227. return
  228. }
  229. // Rune is either lowercase or uppercase.
  230. orig := string(ri.Rune)
  231. mapped := ""
  232. if ri.CaseMode == cUpper {
  233. mapped = ri.mapping(cLower)
  234. } else {
  235. mapped = ri.mapping(cUpper)
  236. }
  237. if len(orig) != len(mapped) {
  238. makeException(ri)
  239. return
  240. }
  241. if string(ri.FoldFull) == ri.mapping(cUpper) {
  242. ri.entry |= inverseFoldBit
  243. }
  244. n := len(orig)
  245. // Create per-byte XOR mask.
  246. var b []byte
  247. for i := 0; i < n; i++ {
  248. b = append(b, orig[i]^mapped[i])
  249. }
  250. // Remove leading 0 bytes, but keep at least one byte.
  251. for ; len(b) > 1 && b[0] == 0; b = b[1:] {
  252. }
  253. if len(b) == 1 && b[0]&0xc0 == 0 {
  254. ri.entry |= info(b[0]) << xorShift
  255. return
  256. }
  257. key := string(b)
  258. x, ok := xorCache[key]
  259. if !ok {
  260. xorData = append(xorData, 0) // for detecting start of sequence
  261. xorData = append(xorData, b...)
  262. x = len(xorData) - 1
  263. xorCache[key] = x
  264. }
  265. ri.entry |= info(x<<xorShift) | xorIndexBit
  266. }
  // Generator state filled in by makeEntry and makeException.
  var (
  	// xorCache maps an XOR byte pattern to the index in xorData at which it
  	// was stored, so that identical patterns are stored only once.
  	xorCache = make(map[string]int)

  	// xorData contains byte-wise XOR data for the least significant bytes of
  	// a UTF-8 encoded rune. An index points to the last byte of a pattern.
  	// Each pattern is preceded by a zero byte that marks its start.
  	xorData = make([]byte, 0)

  	// exceptionData holds the exception records. It starts out with a single
  	// zero byte, so the first record is written at offset 1. See the
  	// comments in gen_trieval.go re "the exceptions slice".
  	exceptionData = []byte{0}
  )
  274. // makeException encodes case mappings that cannot be expressed in a simple
  275. // XOR diff.
  276. func makeException(ri *runeInfo) {
  277. ccc := ri.entry & cccMask
  278. // Set exception bit and retain case type.
  279. ri.entry &= 0x0007
  280. ri.entry |= exceptionBit
  281. if len(exceptionData) >= 1<<numExceptionBits {
  282. log.Fatalf("%U:exceptionData too large %x > %d bits", ri.Rune, len(exceptionData), numExceptionBits)
  283. }
  284. // Set the offset in the exceptionData array.
  285. ri.entry |= info(len(exceptionData) << exceptionShift)
  286. orig := string(ri.Rune)
  287. tc := ri.mapping(cTitle)
  288. uc := ri.mapping(cUpper)
  289. lc := ri.mapping(cLower)
  290. ff := string(ri.FoldFull)
  291. // addString sets the length of a string and adds it to the expansions array.
  292. addString := func(s string, b *byte) {
  293. if len(s) == 0 {
  294. // Zero-length mappings exist, but only for conditional casing,
  295. // which we are representing outside of this table.
  296. log.Fatalf("%U: has zero-length mapping.", ri.Rune)
  297. }
  298. *b <<= 3
  299. if s != orig {
  300. n := len(s)
  301. if n > 7 {
  302. log.Fatalf("%U: mapping larger than 7 (%d)", ri.Rune, n)
  303. }
  304. *b |= byte(n)
  305. exceptionData = append(exceptionData, s...)
  306. }
  307. }
  308. // byte 0:
  309. exceptionData = append(exceptionData, byte(ccc)|byte(len(ff)))
  310. // byte 1:
  311. p := len(exceptionData)
  312. exceptionData = append(exceptionData, 0)
  313. if len(ff) > 7 { // May be zero-length.
  314. log.Fatalf("%U: fold string larger than 7 (%d)", ri.Rune, len(ff))
  315. }
  316. exceptionData = append(exceptionData, ff...)
  317. ct := ri.CaseMode
  318. if ct != cLower {
  319. addString(lc, &exceptionData[p])
  320. }
  321. if ct != cUpper {
  322. addString(uc, &exceptionData[p])
  323. }
  324. if ct != cTitle {
  325. // If title is the same as upper, we set it to the original string so
  326. // that it will be marked as not present. This implies title case is
  327. // the same as upper case.
  328. if tc == uc {
  329. tc = orig
  330. }
  331. addString(tc, &exceptionData[p])
  332. }
  333. }
  // sparseCompacter is a trie value block Compacter. There are many cases where
  // successive runes alternate between lower- and upper-case. This Compacter
  // exploits this by adding a special case type where the case value is obtained
  // from or-ing it with the least-significant bit of the rune, creating large
  // ranges of equal case values that compress well.
  type sparseCompacter struct {
  	// sparseBlocks holds, per stored block, the values returned by makeSparse.
  	sparseBlocks [][]uint16
  	// sparseOffsets holds, per stored block, the value of sparseCount at the
  	// time the block was stored. Print appends the final count as a sentinel.
  	sparseOffsets []uint16
  	// sparseCount is the running total of the counts returned by makeSparse
  	// for all stored blocks.
  	sparseCount int
  }
  344. // makeSparse returns the number of elements that compact block would contain
  345. // as well as the modified values.
  346. func makeSparse(vals []uint64) ([]uint16, int) {
  347. // Copy the values.
  348. values := make([]uint16, len(vals))
  349. for i, v := range vals {
  350. values[i] = uint16(v)
  351. }
  352. alt := func(i int, v uint16) uint16 {
  353. if cm := info(v & fullCasedMask); cm == cUpper || cm == cLower {
  354. // Convert cLower or cUpper to cXORCase value, which has the form 11x.
  355. xor := v
  356. xor &^= 1
  357. xor |= uint16(i&1) ^ (v & 1)
  358. xor |= 0x4
  359. return xor
  360. }
  361. return v
  362. }
  363. var count int
  364. var previous uint16
  365. for i, v := range values {
  366. if v != 0 {
  367. // Try if the unmodified value is equal to the previous.
  368. if v == previous {
  369. continue
  370. }
  371. // Try if the xor-ed value is equal to the previous value.
  372. a := alt(i, v)
  373. if a == previous {
  374. values[i] = a
  375. continue
  376. }
  377. // This is a new value.
  378. count++
  379. // Use the xor-ed value if it will be identical to the next value.
  380. if p := i + 1; p < len(values) && alt(p, values[p]) == a {
  381. values[i] = a
  382. v = a
  383. }
  384. }
  385. previous = v
  386. }
  387. return values, count
  388. }
  389. func (s *sparseCompacter) Size(v []uint64) (int, bool) {
  390. _, n := makeSparse(v)
  391. // We limit using this method to having 16 entries.
  392. if n > 16 {
  393. return 0, false
  394. }
  395. return 2 + int(reflect.TypeOf(valueRange{}).Size())*n, true
  396. }
  397. func (s *sparseCompacter) Store(v []uint64) uint32 {
  398. h := uint32(len(s.sparseOffsets))
  399. values, sz := makeSparse(v)
  400. s.sparseBlocks = append(s.sparseBlocks, values)
  401. s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
  402. s.sparseCount += sz
  403. return h
  404. }
  405. func (s *sparseCompacter) Handler() string {
  406. // The sparse global variable and its lookup method is defined in gen_trieval.go.
  407. return "sparse.lookup"
  408. }
  409. func (s *sparseCompacter) Print(w io.Writer) (retErr error) {
  410. p := func(format string, args ...interface{}) {
  411. _, err := fmt.Fprintf(w, format, args...)
  412. if retErr == nil && err != nil {
  413. retErr = err
  414. }
  415. }
  416. ls := len(s.sparseBlocks)
  417. if ls == len(s.sparseOffsets) {
  418. s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
  419. }
  420. p("// sparseOffsets: %d entries, %d bytes\n", ls+1, (ls+1)*2)
  421. p("var sparseOffsets = %#v\n\n", s.sparseOffsets)
  422. ns := s.sparseCount
  423. p("// sparseValues: %d entries, %d bytes\n", ns, ns*4)
  424. p("var sparseValues = [%d]valueRange {", ns)
  425. for i, values := range s.sparseBlocks {
  426. p("\n// Block %#x, offset %#x", i, s.sparseOffsets[i])
  427. var v uint16
  428. for i, nv := range values {
  429. if nv != v {
  430. if v != 0 {
  431. p(",hi:%#02x},", 0x80+i-1)
  432. }
  433. if nv != 0 {
  434. p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
  435. }
  436. }
  437. v = nv
  438. }
  439. if v != 0 {
  440. p(",hi:%#02x},", 0x80+len(values)-1)
  441. }
  442. }
  443. p("\n}\n\n")
  444. return
  445. }
  446. // verifyProperties that properties of the runes that are relied upon in the
  447. // implementation. Each property is marked with an identifier that is referred
  448. // to in the places where it is used.
  449. func verifyProperties(chars []runeInfo) {
  450. for i, c := range chars {
  451. r := rune(i)
  452. // Rune properties.
  453. // A.1: modifier never changes on lowercase. [ltLower]
  454. if c.CCC > 0 && unicode.ToLower(r) != r {
  455. log.Fatalf("%U: non-starter changes when lowercased", r)
  456. }
  457. // A.2: properties of decompositions starting with I or J. [ltLower]
  458. d := norm.NFD.PropertiesString(string(r)).Decomposition()
  459. if len(d) > 0 {
  460. if d[0] == 'I' || d[0] == 'J' {
  461. // A.2.1: we expect at least an ASCII character and a modifier.
  462. if len(d) < 3 {
  463. log.Fatalf("%U: length of decomposition was %d; want >= 3", r, len(d))
  464. }
  465. // All subsequent runes are modifiers and all have the same CCC.
  466. runes := []rune(string(d[1:]))
  467. ccc := chars[runes[0]].CCC
  468. for _, mr := range runes[1:] {
  469. mc := chars[mr]
  470. // A.2.2: all modifiers have a CCC of Above or less.
  471. if ccc == 0 || ccc > above {
  472. log.Fatalf("%U: CCC of successive rune (%U) was %d; want (0,230]", r, mr, ccc)
  473. }
  474. // A.2.3: a sequence of modifiers all have the same CCC.
  475. if mc.CCC != ccc {
  476. log.Fatalf("%U: CCC of follow-up modifier (%U) was %d; want %d", r, mr, mc.CCC, ccc)
  477. }
  478. // A.2.4: for each trailing r, r in [0x300, 0x311] <=> CCC == Above.
  479. if (ccc == above) != (0x300 <= mr && mr <= 0x311) {
  480. log.Fatalf("%U: modifier %U in [U+0300, U+0311] != ccc(%U) == 230", r, mr, mr)
  481. }
  482. if i += len(string(mr)); i >= len(d) {
  483. break
  484. }
  485. }
  486. }
  487. }
  488. // A.3: no U+0307 in decomposition of Soft-Dotted rune. [ltUpper]
  489. if unicode.Is(unicode.Soft_Dotted, r) && strings.Contains(string(d), "\u0307") {
  490. log.Fatalf("%U: decomposition of soft-dotted rune may not contain U+0307", r)
  491. }
  492. // A.4: only rune U+0345 may be of CCC Iota_Subscript. [elUpper]
  493. if c.CCC == iotaSubscript && r != 0x0345 {
  494. log.Fatalf("%U: only rune U+0345 may have CCC Iota_Subscript", r)
  495. }
  496. // A.5: soft-dotted runes do not have exceptions.
  497. if c.SoftDotted && c.entry&exceptionBit != 0 {
  498. log.Fatalf("%U: soft-dotted has exception", r)
  499. }
  500. // A.6: Greek decomposition. [elUpper]
  501. if unicode.Is(unicode.Greek, r) {
  502. if b := norm.NFD.PropertiesString(string(r)).Decomposition(); b != nil {
  503. runes := []rune(string(b))
  504. // A.6.1: If a Greek rune decomposes and the first rune of the
  505. // decomposition is greater than U+00FF, the rune is always
  506. // great and not a modifier.
  507. if f := runes[0]; unicode.IsMark(f) || f > 0xFF && !unicode.Is(unicode.Greek, f) {
  508. log.Fatalf("%U: expected first rune of Greek decomposition to be letter, found %U", r, f)
  509. }
  510. // A.6.2: Any follow-up rune in a Greek decomposition is a
  511. // modifier of which the first should be gobbled in
  512. // decomposition.
  513. for _, m := range runes[1:] {
  514. switch m {
  515. case 0x0313, 0x0314, 0x0301, 0x0300, 0x0306, 0x0342, 0x0308, 0x0304, 0x345:
  516. default:
  517. log.Fatalf("%U: modifier %U is outside of expected Greek modifier set", r, m)
  518. }
  519. }
  520. }
  521. }
  522. // Breaking properties.
  523. // B.1: all runes with CCC > 0 are of break type Extend.
  524. if c.CCC > 0 && c.BreakType != "Extend" {
  525. log.Fatalf("%U: CCC == %d, but got break type %s; want Extend", r, c.CCC, c.BreakType)
  526. }
  527. // B.2: all cased runes with c.CCC == 0 are of break type ALetter.
  528. if c.CCC == 0 && c.Cased && c.BreakType != "ALetter" {
  529. log.Fatalf("%U: cased, but got break type %s; want ALetter", r, c.BreakType)
  530. }
  531. // B.3: letter category.
  532. if c.CCC == 0 && c.BreakCat != breakBreak && !c.CaseIgnorable {
  533. if c.BreakCat != breakLetter {
  534. log.Fatalf("%U: check for letter break type gave %d; want %d", r, c.BreakCat, breakLetter)
  535. }
  536. }
  537. }
  538. }
  539. func genTablesTest() {
  540. w := &bytes.Buffer{}
  541. fmt.Fprintln(w, "var (")
  542. printProperties(w, "DerivedCoreProperties.txt", "Case_Ignorable", verifyIgnore)
  543. // We discard the output as we know we have perfect functions. We run them
  544. // just to verify the properties are correct.
  545. n := printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Cased", verifyCased)
  546. n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Lowercase", verifyLower)
  547. n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Uppercase", verifyUpper)
  548. if n > 0 {
  549. log.Fatalf("One of the discarded properties does not have a perfect filter.")
  550. }
  551. // <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)?
  552. fmt.Fprintln(w, "\tspecial = map[rune]struct{ toLower, toTitle, toUpper string }{")
  553. parse("SpecialCasing.txt", func(p *ucd.Parser) {
  554. // Skip conditional entries.
  555. if p.String(4) != "" {
  556. return
  557. }
  558. r := p.Rune(0)
  559. fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n",
  560. r, string(p.Runes(1)), string(p.Runes(2)), string(p.Runes(3)))
  561. })
  562. fmt.Fprint(w, "\t}\n\n")
  563. // <code>; <type>; <runes>
  564. table := map[rune]struct{ simple, full, special string }{}
  565. parse("CaseFolding.txt", func(p *ucd.Parser) {
  566. r := p.Rune(0)
  567. t := p.String(1)
  568. v := string(p.Runes(2))
  569. if t != "T" && v == string(unicode.ToLower(r)) {
  570. return
  571. }
  572. x := table[r]
  573. switch t {
  574. case "C":
  575. x.full = v
  576. x.simple = v
  577. case "S":
  578. x.simple = v
  579. case "F":
  580. x.full = v
  581. case "T":
  582. x.special = v
  583. }
  584. table[r] = x
  585. })
  586. fmt.Fprintln(w, "\tfoldMap = map[rune]struct{ simple, full, special string }{")
  587. for r := rune(0); r < 0x10FFFF; r++ {
  588. x, ok := table[r]
  589. if !ok {
  590. continue
  591. }
  592. fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n", r, x.simple, x.full, x.special)
  593. }
  594. fmt.Fprint(w, "\t}\n\n")
  595. // Break property
  596. notBreak := map[rune]bool{}
  597. parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
  598. switch p.String(1) {
  599. case "Extend", "Format", "MidLetter", "MidNumLet", "Single_Quote",
  600. "ALetter", "Hebrew_Letter", "Numeric", "ExtendNumLet", "ZWJ":
  601. notBreak[p.Rune(0)] = true
  602. }
  603. })
  604. fmt.Fprintln(w, "\tbreakProp = []struct{ lo, hi rune }{")
  605. inBreak := false
  606. for r := rune(0); r <= lastRuneForTesting; r++ {
  607. if isBreak := !notBreak[r]; isBreak != inBreak {
  608. if isBreak {
  609. fmt.Fprintf(w, "\t\t{0x%x, ", r)
  610. } else {
  611. fmt.Fprintf(w, "0x%x},\n", r-1)
  612. }
  613. inBreak = isBreak
  614. }
  615. }
  616. if inBreak {
  617. fmt.Fprintf(w, "0x%x},\n", lastRuneForTesting)
  618. }
  619. fmt.Fprint(w, "\t}\n\n")
  620. // Word break test
  621. // Filter out all samples that do not contain cased characters.
  622. cased := map[rune]bool{}
  623. parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
  624. if p.String(1) == "Cased" {
  625. cased[p.Rune(0)] = true
  626. }
  627. })
  628. fmt.Fprintln(w, "\tbreakTest = []string{")
  629. parse("auxiliary/WordBreakTest.txt", func(p *ucd.Parser) {
  630. c := strings.Split(p.String(0), " ")
  631. const sep = '|'
  632. numCased := 0
  633. test := ""
  634. for ; len(c) >= 2; c = c[2:] {
  635. if c[0] == "÷" && test != "" {
  636. test += string(sep)
  637. }
  638. i, err := strconv.ParseUint(c[1], 16, 32)
  639. r := rune(i)
  640. if err != nil {
  641. log.Fatalf("Invalid rune %q.", c[1])
  642. }
  643. if r == sep {
  644. log.Fatalf("Separator %q not allowed in test data. Pick another one.", sep)
  645. }
  646. if cased[r] {
  647. numCased++
  648. }
  649. test += string(r)
  650. }
  651. if numCased > 1 {
  652. fmt.Fprintf(w, "\t\t%q,\n", test)
  653. }
  654. })
  655. fmt.Fprintln(w, "\t}")
  656. fmt.Fprintln(w, ")")
  657. gen.WriteVersionedGoFile("tables_test.go", "cases", w.Bytes())
  658. }
  659. // These functions are just used for verification that their definition have not
  660. // changed in the Unicode Standard.
  661. func verifyCased(r rune) bool {
  662. return verifyLower(r) || verifyUpper(r) || unicode.IsTitle(r)
  663. }
  // verifyLower reports whether r is a lowercase letter or has the
  // Other_Lowercase property.
  func verifyLower(r rune) bool {
  	if unicode.IsLower(r) {
  		return true
  	}
  	return unicode.Is(unicode.Other_Lowercase, r)
  }
  // verifyUpper reports whether r is an uppercase letter or has the
  // Other_Uppercase property.
  func verifyUpper(r rune) bool {
  	if unicode.IsUpper(r) {
  		return true
  	}
  	return unicode.Is(unicode.Other_Uppercase, r)
  }
  // verifyIgnore is an approximation of the Case_Ignorable property using the
  // core unicode package: it reports whether r is in one of the categories Mn,
  // Me, Cf, Lm or Sk. It is used to reduce the size of the test data.
  func verifyIgnore(r rune) bool {
  	return unicode.In(r,
  		unicode.Mn,
  		unicode.Me,
  		unicode.Cf,
  		unicode.Lm,
  		unicode.Sk,
  	)
  }
  687. // printProperties prints tables of rune properties from the given UCD file.
  688. // A filter func f can be given to exclude certain values. A rune r will have
  689. // the indicated property if it is in the generated table or if f(r).
  690. func printProperties(w io.Writer, file, property string, f func(r rune) bool) int {
  691. verify := map[rune]bool{}
  692. n := 0
  693. varNameParts := strings.Split(property, "_")
  694. varNameParts[0] = strings.ToLower(varNameParts[0])
  695. fmt.Fprintf(w, "\t%s = map[rune]bool{\n", strings.Join(varNameParts, ""))
  696. parse(file, func(p *ucd.Parser) {
  697. if p.String(1) == property {
  698. r := p.Rune(0)
  699. verify[r] = true
  700. if !f(r) {
  701. n++
  702. fmt.Fprintf(w, "\t\t0x%.4x: true,\n", r)
  703. }
  704. }
  705. })
  706. fmt.Fprint(w, "\t}\n\n")
  707. // Verify that f is correct, that is, it represents a subset of the property.
  708. for r := rune(0); r <= lastRuneForTesting; r++ {
  709. if !verify[r] && f(r) {
  710. log.Fatalf("Incorrect filter func for property %q.", property)
  711. }
  712. }
  713. return n
  714. }
  715. // The newCaseTrie, sparseValues and sparseOffsets definitions below are
  716. // placeholders referred to by gen_trieval.go. The real definitions are
  717. // generated by this program and written to tables.go.
  718. func newCaseTrie(int) int { return 0 }
  719. var (
  720. sparseValues [0]valueRange
  721. sparseOffsets [0]uint16
  722. )