You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

276 lines
7.1 KiB

  1. // Copyright 2012 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package colltab
  5. import (
  6. "unicode/utf8"
  7. "golang.org/x/text/unicode/norm"
  8. )
  9. // Table holds all collation data for a given collation ordering.
  10. type Table struct {
  11. Index Trie // main trie
  12. // expansion info
  13. ExpandElem []uint32
  14. // contraction info
  15. ContractTries ContractTrieSet
  16. ContractElem []uint32
  17. MaxContractLen int
  18. VariableTop uint32
  19. }
  20. func (t *Table) AppendNext(w []Elem, b []byte) (res []Elem, n int) {
  21. return t.appendNext(w, source{bytes: b})
  22. }
  23. func (t *Table) AppendNextString(w []Elem, s string) (res []Elem, n int) {
  24. return t.appendNext(w, source{str: s})
  25. }
  26. func (t *Table) Start(p int, b []byte) int {
  27. // TODO: implement
  28. panic("not implemented")
  29. }
  30. func (t *Table) StartString(p int, s string) int {
  31. // TODO: implement
  32. panic("not implemented")
  33. }
  34. func (t *Table) Domain() []string {
  35. // TODO: implement
  36. panic("not implemented")
  37. }
  38. func (t *Table) Top() uint32 {
  39. return t.VariableTop
  40. }
  41. type source struct {
  42. str string
  43. bytes []byte
  44. }
  45. func (src *source) lookup(t *Table) (ce Elem, sz int) {
  46. if src.bytes == nil {
  47. return t.Index.lookupString(src.str)
  48. }
  49. return t.Index.lookup(src.bytes)
  50. }
  51. func (src *source) tail(sz int) {
  52. if src.bytes == nil {
  53. src.str = src.str[sz:]
  54. } else {
  55. src.bytes = src.bytes[sz:]
  56. }
  57. }
  58. func (src *source) nfd(buf []byte, end int) []byte {
  59. if src.bytes == nil {
  60. return norm.NFD.AppendString(buf[:0], src.str[:end])
  61. }
  62. return norm.NFD.Append(buf[:0], src.bytes[:end]...)
  63. }
  64. func (src *source) rune() (r rune, sz int) {
  65. if src.bytes == nil {
  66. return utf8.DecodeRuneInString(src.str)
  67. }
  68. return utf8.DecodeRune(src.bytes)
  69. }
  70. func (src *source) properties(f norm.Form) norm.Properties {
  71. if src.bytes == nil {
  72. return f.PropertiesString(src.str)
  73. }
  74. return f.Properties(src.bytes)
  75. }
  76. // appendNext appends the weights corresponding to the next rune or
  77. // contraction in s. If a contraction is matched to a discontinuous
  78. // sequence of runes, the weights for the interstitial runes are
  79. // appended as well. It returns a new slice that includes the appended
  80. // weights and the number of bytes consumed from s.
  81. func (t *Table) appendNext(w []Elem, src source) (res []Elem, n int) {
  82. ce, sz := src.lookup(t)
  83. tp := ce.ctype()
  84. if tp == ceNormal {
  85. if ce == 0 {
  86. r, _ := src.rune()
  87. const (
  88. hangulSize = 3
  89. firstHangul = 0xAC00
  90. lastHangul = 0xD7A3
  91. )
  92. if r >= firstHangul && r <= lastHangul {
  93. // TODO: performance can be considerably improved here.
  94. n = sz
  95. var buf [16]byte // Used for decomposing Hangul.
  96. for b := src.nfd(buf[:0], hangulSize); len(b) > 0; b = b[sz:] {
  97. ce, sz = t.Index.lookup(b)
  98. w = append(w, ce)
  99. }
  100. return w, n
  101. }
  102. ce = makeImplicitCE(implicitPrimary(r))
  103. }
  104. w = append(w, ce)
  105. } else if tp == ceExpansionIndex {
  106. w = t.appendExpansion(w, ce)
  107. } else if tp == ceContractionIndex {
  108. n := 0
  109. src.tail(sz)
  110. if src.bytes == nil {
  111. w, n = t.matchContractionString(w, ce, src.str)
  112. } else {
  113. w, n = t.matchContraction(w, ce, src.bytes)
  114. }
  115. sz += n
  116. } else if tp == ceDecompose {
  117. // Decompose using NFKD and replace tertiary weights.
  118. t1, t2 := splitDecompose(ce)
  119. i := len(w)
  120. nfkd := src.properties(norm.NFKD).Decomposition()
  121. for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] {
  122. w, p = t.appendNext(w, source{bytes: nfkd})
  123. }
  124. w[i] = w[i].updateTertiary(t1)
  125. if i++; i < len(w) {
  126. w[i] = w[i].updateTertiary(t2)
  127. for i++; i < len(w); i++ {
  128. w[i] = w[i].updateTertiary(maxTertiary)
  129. }
  130. }
  131. }
  132. return w, sz
  133. }
  134. func (t *Table) appendExpansion(w []Elem, ce Elem) []Elem {
  135. i := splitExpandIndex(ce)
  136. n := int(t.ExpandElem[i])
  137. i++
  138. for _, ce := range t.ExpandElem[i : i+n] {
  139. w = append(w, Elem(ce))
  140. }
  141. return w
  142. }
  143. func (t *Table) matchContraction(w []Elem, ce Elem, suffix []byte) ([]Elem, int) {
  144. index, n, offset := splitContractIndex(ce)
  145. scan := t.ContractTries.scanner(index, n, suffix)
  146. buf := [norm.MaxSegmentSize]byte{}
  147. bufp := 0
  148. p := scan.scan(0)
  149. if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
  150. // By now we should have filtered most cases.
  151. p0 := p
  152. bufn := 0
  153. rune := norm.NFD.Properties(suffix[p:])
  154. p += rune.Size()
  155. if rune.LeadCCC() != 0 {
  156. prevCC := rune.TrailCCC()
  157. // A gap may only occur in the last normalization segment.
  158. // This also ensures that len(scan.s) < norm.MaxSegmentSize.
  159. if end := norm.NFD.FirstBoundary(suffix[p:]); end != -1 {
  160. scan.s = suffix[:p+end]
  161. }
  162. for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
  163. rune = norm.NFD.Properties(suffix[p:])
  164. if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
  165. break
  166. }
  167. prevCC = rune.TrailCCC()
  168. if pp := scan.scan(p); pp != p {
  169. // Copy the interstitial runes for later processing.
  170. bufn += copy(buf[bufn:], suffix[p0:p])
  171. if scan.pindex == pp {
  172. bufp = bufn
  173. }
  174. p, p0 = pp, pp
  175. } else {
  176. p += rune.Size()
  177. }
  178. }
  179. }
  180. }
  181. // Append weights for the matched contraction, which may be an expansion.
  182. i, n := scan.result()
  183. ce = Elem(t.ContractElem[i+offset])
  184. if ce.ctype() == ceNormal {
  185. w = append(w, ce)
  186. } else {
  187. w = t.appendExpansion(w, ce)
  188. }
  189. // Append weights for the runes in the segment not part of the contraction.
  190. for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
  191. w, p = t.appendNext(w, source{bytes: b})
  192. }
  193. return w, n
  194. }
  195. // TODO: unify the two implementations. This is best done after first simplifying
  196. // the algorithm taking into account the inclusion of both NFC and NFD forms
  197. // in the table.
  198. func (t *Table) matchContractionString(w []Elem, ce Elem, suffix string) ([]Elem, int) {
  199. index, n, offset := splitContractIndex(ce)
  200. scan := t.ContractTries.scannerString(index, n, suffix)
  201. buf := [norm.MaxSegmentSize]byte{}
  202. bufp := 0
  203. p := scan.scan(0)
  204. if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
  205. // By now we should have filtered most cases.
  206. p0 := p
  207. bufn := 0
  208. rune := norm.NFD.PropertiesString(suffix[p:])
  209. p += rune.Size()
  210. if rune.LeadCCC() != 0 {
  211. prevCC := rune.TrailCCC()
  212. // A gap may only occur in the last normalization segment.
  213. // This also ensures that len(scan.s) < norm.MaxSegmentSize.
  214. if end := norm.NFD.FirstBoundaryInString(suffix[p:]); end != -1 {
  215. scan.s = suffix[:p+end]
  216. }
  217. for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
  218. rune = norm.NFD.PropertiesString(suffix[p:])
  219. if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
  220. break
  221. }
  222. prevCC = rune.TrailCCC()
  223. if pp := scan.scan(p); pp != p {
  224. // Copy the interstitial runes for later processing.
  225. bufn += copy(buf[bufn:], suffix[p0:p])
  226. if scan.pindex == pp {
  227. bufp = bufn
  228. }
  229. p, p0 = pp, pp
  230. } else {
  231. p += rune.Size()
  232. }
  233. }
  234. }
  235. }
  236. // Append weights for the matched contraction, which may be an expansion.
  237. i, n := scan.result()
  238. ce = Elem(t.ContractElem[i+offset])
  239. if ce.ctype() == ceNormal {
  240. w = append(w, ce)
  241. } else {
  242. w = t.appendExpansion(w, ce)
  243. }
  244. // Append weights for the runes in the segment not part of the contraction.
  245. for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
  246. w, p = t.appendNext(w, source{bytes: b})
  247. }
  248. return w, n
  249. }