You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

277 lines
7.1 KiB

  1. // Copyright 2016 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // +build ignore
  5. // This program generates the trie for idna operations. IDNA processing
  6. // requires the lookup of various properties and mappings for each
  7. // rune. The table generated by this generator combines several of the most
  8. // frequently used of these into a single trie so that they can be accessed
  9. // with a single lookup.
  10. package main
  11. import (
  12. "fmt"
  13. "io"
  14. "log"
  15. "unicode"
  16. "unicode/utf8"
  17. "golang.org/x/text/internal/gen"
  18. "golang.org/x/text/internal/triegen"
  19. "golang.org/x/text/internal/ucd"
  20. "golang.org/x/text/unicode/bidi"
  21. )
  22. func main() {
  23. gen.Init()
  24. genTables()
  25. gen.Repackage("gen_trieval.go", "trieval.go", "idna")
  26. gen.Repackage("gen_common.go", "common_test.go", "idna")
  27. }
  28. var runes = map[rune]info{}
  29. func genTables() {
  30. t := triegen.NewTrie("idna")
  31. ucd.Parse(gen.OpenUCDFile("DerivedNormalizationProps.txt"), func(p *ucd.Parser) {
  32. r := p.Rune(0)
  33. if p.String(1) == "NFC_QC" { // p.String(2) is "N" or "M"
  34. runes[r] = mayNeedNorm
  35. }
  36. })
  37. ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
  38. r := p.Rune(0)
  39. const cccVirama = 9
  40. if p.Int(ucd.CanonicalCombiningClass) == cccVirama {
  41. runes[p.Rune(0)] = viramaModifier
  42. }
  43. switch {
  44. case unicode.In(r, unicode.Mark):
  45. runes[r] |= modifier | mayNeedNorm
  46. }
  47. // TODO: by using UnicodeData.txt we don't mark undefined codepoints
  48. // that are earmarked as RTL properly. However, an undefined cp will
  49. // always fail, so there is no need to store this info.
  50. switch p, _ := bidi.LookupRune(r); p.Class() {
  51. case bidi.R, bidi.AL, bidi.AN:
  52. if x := runes[r]; x != 0 && x != mayNeedNorm {
  53. log.Fatalf("%U: rune both modifier and RTL letter/number", r)
  54. }
  55. runes[r] = rtl
  56. }
  57. })
  58. ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
  59. switch v := p.String(1); v {
  60. case "L", "D", "T", "R":
  61. runes[p.Rune(0)] |= joinType[v] << joinShift
  62. }
  63. })
  64. ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) {
  65. r := p.Rune(0)
  66. // The mappings table explicitly defines surrogates as invalid.
  67. if !utf8.ValidRune(r) {
  68. return
  69. }
  70. cat := catFromEntry(p)
  71. isMapped := cat == mapped || cat == disallowedSTD3Mapped || cat == deviation
  72. if !isMapped {
  73. // Only include additional category information for non-mapped
  74. // runes. The additional information is only used after mapping and
  75. // the bits would clash with mapping information.
  76. // TODO: it would be possible to inline this data and avoid
  77. // additional lookups. This is quite tedious, though, so let's first
  78. // see if we need this.
  79. cat |= category(runes[r])
  80. }
  81. s := string(p.Runes(2))
  82. if s != "" && !isMapped {
  83. log.Fatalf("%U: Mapping with non-mapping category %d", r, cat)
  84. }
  85. t.Insert(r, uint64(makeEntry(r, s))+uint64(cat))
  86. })
  87. w := gen.NewCodeWriter()
  88. defer w.WriteVersionedGoFile("tables.go", "idna")
  89. gen.WriteUnicodeVersion(w)
  90. w.WriteVar("mappings", string(mappings))
  91. w.WriteVar("xorData", string(xorData))
  92. sz, err := t.Gen(w, triegen.Compact(&normCompacter{}))
  93. if err != nil {
  94. log.Fatal(err)
  95. }
  96. w.Size += sz
  97. }
  98. var (
  99. // mappings contains replacement strings for mapped runes, each prefixed
  100. // with a byte containing the length of the following string.
  101. mappings = []byte{}
  102. mapCache = map[string]int{}
  103. // xorData is like mappings, except that it contains XOR data.
  104. // We split these two tables so that we don't get an overflow.
  105. xorData = []byte{}
  106. xorCache = map[string]int{}
  107. )
  108. // makeEntry creates a trie entry.
  109. func makeEntry(r rune, mapped string) info {
  110. orig := string(r)
  111. if len(orig) != len(mapped) {
  112. // Store the mapped value as is in the mappings table.
  113. index := len(mappings)
  114. if x, ok := mapCache[mapped]; ok {
  115. index = x
  116. } else {
  117. mapCache[mapped] = index
  118. mappings = append(mappings, byte(len(mapped)))
  119. mappings = append(mappings, mapped...)
  120. }
  121. return info(index) << indexShift
  122. }
  123. // Create per-byte XOR mask.
  124. var b []byte
  125. for i := 0; i < len(orig); i++ {
  126. b = append(b, orig[i]^mapped[i])
  127. }
  128. // Remove leading 0 bytes, but keep at least one byte.
  129. for ; len(b) > 1 && b[0] == 0; b = b[1:] {
  130. }
  131. if len(b) == 1 {
  132. return xorBit | inlineXOR | info(b[0])<<indexShift
  133. }
  134. mapped = string(b)
  135. // Store the mapped value as is in the mappings table.
  136. index := len(xorData)
  137. if x, ok := xorCache[mapped]; ok {
  138. index = x
  139. } else {
  140. xorCache[mapped] = index
  141. xorData = append(xorData, byte(len(mapped)))
  142. xorData = append(xorData, mapped...)
  143. }
  144. return xorBit | info(index)<<indexShift
  145. }
  146. // The following code implements a triegen.Compacter that was originally
  147. // designed for normalization. The IDNA table has some similarities with the
  148. // norm table. Using this compacter, together with the XOR pattern approach,
  149. // reduces the table size by roughly 100K. It can probably be compressed further
  150. // by also including elements of the compacter used by cases, but for now it is
  151. // good enough.
  152. const maxSparseEntries = 16
  153. type normCompacter struct {
  154. sparseBlocks [][]uint64
  155. sparseOffset []uint16
  156. sparseCount int
  157. }
  158. func mostFrequentStride(a []uint64) int {
  159. counts := make(map[int]int)
  160. var v int
  161. for _, x := range a {
  162. if stride := int(x) - v; v != 0 && stride >= 0 {
  163. counts[stride]++
  164. }
  165. v = int(x)
  166. }
  167. var maxs, maxc int
  168. for stride, cnt := range counts {
  169. if cnt > maxc || (cnt == maxc && stride < maxs) {
  170. maxs, maxc = stride, cnt
  171. }
  172. }
  173. return maxs
  174. }
  175. func countSparseEntries(a []uint64) int {
  176. stride := mostFrequentStride(a)
  177. var v, count int
  178. for _, tv := range a {
  179. if int(tv)-v != stride {
  180. if tv != 0 {
  181. count++
  182. }
  183. }
  184. v = int(tv)
  185. }
  186. return count
  187. }
  188. func (c *normCompacter) Size(v []uint64) (sz int, ok bool) {
  189. if n := countSparseEntries(v); n <= maxSparseEntries {
  190. return (n+1)*4 + 2, true
  191. }
  192. return 0, false
  193. }
  194. func (c *normCompacter) Store(v []uint64) uint32 {
  195. h := uint32(len(c.sparseOffset))
  196. c.sparseBlocks = append(c.sparseBlocks, v)
  197. c.sparseOffset = append(c.sparseOffset, uint16(c.sparseCount))
  198. c.sparseCount += countSparseEntries(v) + 1
  199. return h
  200. }
  201. func (c *normCompacter) Handler() string {
  202. return "idnaSparse.lookup"
  203. }
  204. func (c *normCompacter) Print(w io.Writer) (retErr error) {
  205. p := func(f string, x ...interface{}) {
  206. if _, err := fmt.Fprintf(w, f, x...); retErr == nil && err != nil {
  207. retErr = err
  208. }
  209. }
  210. ls := len(c.sparseBlocks)
  211. p("// idnaSparseOffset: %d entries, %d bytes\n", ls, ls*2)
  212. p("var idnaSparseOffset = %#v\n\n", c.sparseOffset)
  213. ns := c.sparseCount
  214. p("// idnaSparseValues: %d entries, %d bytes\n", ns, ns*4)
  215. p("var idnaSparseValues = [%d]valueRange {", ns)
  216. for i, b := range c.sparseBlocks {
  217. p("\n// Block %#x, offset %#x", i, c.sparseOffset[i])
  218. var v int
  219. stride := mostFrequentStride(b)
  220. n := countSparseEntries(b)
  221. p("\n{value:%#04x,lo:%#02x},", stride, uint8(n))
  222. for i, nv := range b {
  223. if int(nv)-v != stride {
  224. if v != 0 {
  225. p(",hi:%#02x},", 0x80+i-1)
  226. }
  227. if nv != 0 {
  228. p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
  229. }
  230. }
  231. v = int(nv)
  232. }
  233. if v != 0 {
  234. p(",hi:%#02x},", 0x80+len(b)-1)
  235. }
  236. }
  237. p("\n}\n\n")
  238. return
  239. }