You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

252 lines
7.3 KiB

  1. // Copyright 2014 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package triegen
  5. import (
  6. "bytes"
  7. "fmt"
  8. "io"
  9. "strings"
  10. "text/template"
  11. )
  12. // print writes all the data structures as well as the code necessary to use the
  13. // trie to w.
  14. func (b *builder) print(w io.Writer) error {
  15. b.Stats.NValueEntries = len(b.ValueBlocks) * blockSize
  16. b.Stats.NValueBytes = len(b.ValueBlocks) * blockSize * b.ValueSize
  17. b.Stats.NIndexEntries = len(b.IndexBlocks) * blockSize
  18. b.Stats.NIndexBytes = len(b.IndexBlocks) * blockSize * b.IndexSize
  19. b.Stats.NHandleBytes = len(b.Trie) * 2 * b.IndexSize
  20. // If we only have one root trie, all starter blocks are at position 0 and
  21. // we can access the arrays directly.
  22. if len(b.Trie) == 1 {
  23. // At this point we cannot refer to the generated tables directly.
  24. b.ASCIIBlock = b.Name + "Values"
  25. b.StarterBlock = b.Name + "Index"
  26. } else {
  27. // Otherwise we need to have explicit starter indexes in the trie
  28. // structure.
  29. b.ASCIIBlock = "t.ascii"
  30. b.StarterBlock = "t.utf8Start"
  31. }
  32. b.SourceType = "[]byte"
  33. if err := lookupGen.Execute(w, b); err != nil {
  34. return err
  35. }
  36. b.SourceType = "string"
  37. if err := lookupGen.Execute(w, b); err != nil {
  38. return err
  39. }
  40. if err := trieGen.Execute(w, b); err != nil {
  41. return err
  42. }
  43. for _, c := range b.Compactions {
  44. if err := c.c.Print(w); err != nil {
  45. return err
  46. }
  47. }
  48. return nil
  49. }
  50. func printValues(n int, values []uint64) string {
  51. w := &bytes.Buffer{}
  52. boff := n * blockSize
  53. fmt.Fprintf(w, "\t// Block %#x, offset %#x", n, boff)
  54. var newline bool
  55. for i, v := range values {
  56. if i%6 == 0 {
  57. newline = true
  58. }
  59. if v != 0 {
  60. if newline {
  61. fmt.Fprintf(w, "\n")
  62. newline = false
  63. }
  64. fmt.Fprintf(w, "\t%#02x:%#04x, ", boff+i, v)
  65. }
  66. }
  67. return w.String()
  68. }
  69. func printIndex(b *builder, nr int, n *node) string {
  70. w := &bytes.Buffer{}
  71. boff := nr * blockSize
  72. fmt.Fprintf(w, "\t// Block %#x, offset %#x", nr, boff)
  73. var newline bool
  74. for i, c := range n.children {
  75. if i%8 == 0 {
  76. newline = true
  77. }
  78. if c != nil {
  79. v := b.Compactions[c.index.compaction].Offset + uint32(c.index.index)
  80. if v != 0 {
  81. if newline {
  82. fmt.Fprintf(w, "\n")
  83. newline = false
  84. }
  85. fmt.Fprintf(w, "\t%#02x:%#02x, ", boff+i, v)
  86. }
  87. }
  88. }
  89. return w.String()
  90. }
  91. var (
  92. trieGen = template.Must(template.New("trie").Funcs(template.FuncMap{
  93. "printValues": printValues,
  94. "printIndex": printIndex,
  95. "title": strings.Title,
  96. "dec": func(x int) int { return x - 1 },
  97. "psize": func(n int) string {
  98. return fmt.Sprintf("%d bytes (%.2f KiB)", n, float64(n)/1024)
  99. },
  100. }).Parse(trieTemplate))
  101. lookupGen = template.Must(template.New("lookup").Parse(lookupTemplate))
  102. )
  103. // TODO: consider the return type of lookup. It could be uint64, even if the
  104. // internal value type is smaller. We will have to verify this with the
  105. // performance of unicode/norm, which is very sensitive to such changes.
  106. const trieTemplate = `{{$b := .}}{{$multi := gt (len .Trie) 1}}
  107. // {{.Name}}Trie. Total size: {{psize .Size}}. Checksum: {{printf "%08x" .Checksum}}.
  108. type {{.Name}}Trie struct { {{if $multi}}
  109. ascii []{{.ValueType}} // index for ASCII bytes
  110. utf8Start []{{.IndexType}} // index for UTF-8 bytes >= 0xC0
  111. {{end}}}
  112. func new{{title .Name}}Trie(i int) *{{.Name}}Trie { {{if $multi}}
  113. h := {{.Name}}TrieHandles[i]
  114. return &{{.Name}}Trie{ {{.Name}}Values[uint32(h.ascii)<<6:], {{.Name}}Index[uint32(h.multi)<<6:] }
  115. }
  116. type {{.Name}}TrieHandle struct {
  117. ascii, multi {{.IndexType}}
  118. }
  119. // {{.Name}}TrieHandles: {{len .Trie}} handles, {{.Stats.NHandleBytes}} bytes
  120. var {{.Name}}TrieHandles = [{{len .Trie}}]{{.Name}}TrieHandle{
  121. {{range .Trie}} { {{.ASCIIIndex}}, {{.StarterIndex}} }, // {{printf "%08x" .Checksum}}: {{.Name}}
  122. {{end}}}{{else}}
  123. return &{{.Name}}Trie{}
  124. }
  125. {{end}}
  126. // lookupValue determines the type of block n and looks up the value for b.
  127. func (t *{{.Name}}Trie) lookupValue(n uint32, b byte) {{.ValueType}}{{$last := dec (len .Compactions)}} {
  128. switch { {{range $i, $c := .Compactions}}
  129. {{if eq $i $last}}default{{else}}case n < {{$c.Cutoff}}{{end}}:{{if ne $i 0}}
  130. n -= {{$c.Offset}}{{end}}
  131. return {{print $b.ValueType}}({{$c.Handler}}){{end}}
  132. }
  133. }
  134. // {{.Name}}Values: {{len .ValueBlocks}} blocks, {{.Stats.NValueEntries}} entries, {{.Stats.NValueBytes}} bytes
  135. // The third block is the zero block.
  136. var {{.Name}}Values = [{{.Stats.NValueEntries}}]{{.ValueType}} {
  137. {{range $i, $v := .ValueBlocks}}{{printValues $i $v}}
  138. {{end}}}
  139. // {{.Name}}Index: {{len .IndexBlocks}} blocks, {{.Stats.NIndexEntries}} entries, {{.Stats.NIndexBytes}} bytes
  140. // Block 0 is the zero block.
  141. var {{.Name}}Index = [{{.Stats.NIndexEntries}}]{{.IndexType}} {
  142. {{range $i, $v := .IndexBlocks}}{{printIndex $b $i $v}}
  143. {{end}}}
  144. `
  145. // TODO: consider allowing zero-length strings after evaluating performance with
  146. // unicode/norm.
  147. const lookupTemplate = `
  148. // lookup{{if eq .SourceType "string"}}String{{end}} returns the trie value for the first UTF-8 encoding in s and
  149. // the width in bytes of this encoding. The size will be 0 if s does not
  150. // hold enough bytes to complete the encoding. len(s) must be greater than 0.
  151. func (t *{{.Name}}Trie) lookup{{if eq .SourceType "string"}}String{{end}}(s {{.SourceType}}) (v {{.ValueType}}, sz int) {
  152. c0 := s[0]
  153. switch {
  154. case c0 < 0x80: // is ASCII
  155. return {{.ASCIIBlock}}[c0], 1
  156. case c0 < 0xC2:
  157. return 0, 1 // Illegal UTF-8: not a starter, not ASCII.
  158. case c0 < 0xE0: // 2-byte UTF-8
  159. if len(s) < 2 {
  160. return 0, 0
  161. }
  162. i := {{.StarterBlock}}[c0]
  163. c1 := s[1]
  164. if c1 < 0x80 || 0xC0 <= c1 {
  165. return 0, 1 // Illegal UTF-8: not a continuation byte.
  166. }
  167. return t.lookupValue(uint32(i), c1), 2
  168. case c0 < 0xF0: // 3-byte UTF-8
  169. if len(s) < 3 {
  170. return 0, 0
  171. }
  172. i := {{.StarterBlock}}[c0]
  173. c1 := s[1]
  174. if c1 < 0x80 || 0xC0 <= c1 {
  175. return 0, 1 // Illegal UTF-8: not a continuation byte.
  176. }
  177. o := uint32(i)<<6 + uint32(c1)
  178. i = {{.Name}}Index[o]
  179. c2 := s[2]
  180. if c2 < 0x80 || 0xC0 <= c2 {
  181. return 0, 2 // Illegal UTF-8: not a continuation byte.
  182. }
  183. return t.lookupValue(uint32(i), c2), 3
  184. case c0 < 0xF8: // 4-byte UTF-8
  185. if len(s) < 4 {
  186. return 0, 0
  187. }
  188. i := {{.StarterBlock}}[c0]
  189. c1 := s[1]
  190. if c1 < 0x80 || 0xC0 <= c1 {
  191. return 0, 1 // Illegal UTF-8: not a continuation byte.
  192. }
  193. o := uint32(i)<<6 + uint32(c1)
  194. i = {{.Name}}Index[o]
  195. c2 := s[2]
  196. if c2 < 0x80 || 0xC0 <= c2 {
  197. return 0, 2 // Illegal UTF-8: not a continuation byte.
  198. }
  199. o = uint32(i)<<6 + uint32(c2)
  200. i = {{.Name}}Index[o]
  201. c3 := s[3]
  202. if c3 < 0x80 || 0xC0 <= c3 {
  203. return 0, 3 // Illegal UTF-8: not a continuation byte.
  204. }
  205. return t.lookupValue(uint32(i), c3), 4
  206. }
  207. // Illegal rune
  208. return 0, 1
  209. }
  210. // lookup{{if eq .SourceType "string"}}String{{end}}Unsafe returns the trie value for the first UTF-8 encoding in s.
  211. // s must start with a full and valid UTF-8 encoded rune.
  212. func (t *{{.Name}}Trie) lookup{{if eq .SourceType "string"}}String{{end}}Unsafe(s {{.SourceType}}) {{.ValueType}} {
  213. c0 := s[0]
  214. if c0 < 0x80 { // is ASCII
  215. return {{.ASCIIBlock}}[c0]
  216. }
  217. i := {{.StarterBlock}}[c0]
  218. if c0 < 0xE0 { // 2-byte UTF-8
  219. return t.lookupValue(uint32(i), s[1])
  220. }
  221. i = {{.Name}}Index[uint32(i)<<6+uint32(s[1])]
  222. if c0 < 0xF0 { // 3-byte UTF-8
  223. return t.lookupValue(uint32(i), s[2])
  224. }
  225. i = {{.Name}}Index[uint32(i)<<6+uint32(s[2])]
  226. if c0 < 0xF8 { // 4-byte UTF-8
  227. return t.lookupValue(uint32(i), s[3])
  228. }
  229. return 0
  230. }
  231. `