You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

146 lines
3.6 KiB

  1. // Copyright 2012 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package colltab
  5. import "unicode/utf8"
  // ContractTrieSet is a flat table of contraction trie entries. For a
  // description of ContractTrieSet, see text/collate/build/contract.go.
  //
  // The notes on the fields below describe how the scan methods in this file
  // interpret each entry.
  type ContractTrieSet []struct {
  	// L is the byte an entry matches exactly. For an entry whose N is final
  	// it is also the inclusive lower bound of a matched byte range.
  	L uint8
  	// H is the inclusive upper bound of the byte range when N is final.
  	// Otherwise it is added to the size of the current block to find the
  	// start of the next block of entries.
  	H uint8
  	// N is the number of entries in the next block, or final when the entry
  	// has no continuation.
  	N uint8
  	// I is the index recorded on a match. On an exact match the value
  	// noIndex means that no index is recorded; on a range match the
  	// recorded index is I plus the distance of the input byte from L.
  	I uint8
  }
  8. // ctScanner is used to match a trie to an input sequence.
  9. // A contraction may match a non-contiguous sequence of bytes in an input string.
  10. // For example, if there is a contraction for <a, combining_ring>, it should match
  11. // the sequence <a, combining_cedilla, combining_ring>, as combining_cedilla does
  12. // not block combining_ring.
  13. // ctScanner does not automatically skip over non-blocking non-starters, but rather
  14. // retains the state of the last match and leaves it up to the user to continue
  15. // the match at the appropriate points.
  16. type ctScanner struct {
  17. states ContractTrieSet
  18. s []byte
  19. n int
  20. index int
  21. pindex int
  22. done bool
  23. }
  24. type ctScannerString struct {
  25. states ContractTrieSet
  26. s string
  27. n int
  28. index int
  29. pindex int
  30. done bool
  31. }
  32. func (t ContractTrieSet) scanner(index, n int, b []byte) ctScanner {
  33. return ctScanner{s: b, states: t[index:], n: n}
  34. }
  35. func (t ContractTrieSet) scannerString(index, n int, str string) ctScannerString {
  36. return ctScannerString{s: str, states: t[index:], n: n}
  37. }
  38. // result returns the offset i and bytes consumed p so far. If no suffix
  39. // matched, i and p will be 0.
  40. func (s *ctScanner) result() (i, p int) {
  41. return s.index, s.pindex
  42. }
  43. func (s *ctScannerString) result() (i, p int) {
  44. return s.index, s.pindex
  45. }
  // final is the value of an entry's N field that marks the entry as having no
  // continuation.
  const final = 0

  // noIndex is the value of an entry's I field that tells scan not to record
  // an index for an exact byte match.
  const noIndex = 0xFF
  50. // scan matches the longest suffix at the current location in the input
  51. // and returns the number of bytes consumed.
  52. func (s *ctScanner) scan(p int) int {
  53. pr := p // the p at the rune start
  54. str := s.s
  55. states, n := s.states, s.n
  56. for i := 0; i < n && p < len(str); {
  57. e := states[i]
  58. c := str[p]
  59. // TODO: a significant number of contractions are of a form that
  60. // cannot match discontiguous UTF-8 in a normalized string. We could let
  61. // a negative value of e.n mean that we can set s.done = true and avoid
  62. // the need for additional matches.
  63. if c >= e.L {
  64. if e.L == c {
  65. p++
  66. if e.I != noIndex {
  67. s.index = int(e.I)
  68. s.pindex = p
  69. }
  70. if e.N != final {
  71. i, states, n = 0, states[int(e.H)+n:], int(e.N)
  72. if p >= len(str) || utf8.RuneStart(str[p]) {
  73. s.states, s.n, pr = states, n, p
  74. }
  75. } else {
  76. s.done = true
  77. return p
  78. }
  79. continue
  80. } else if e.N == final && c <= e.H {
  81. p++
  82. s.done = true
  83. s.index = int(c-e.L) + int(e.I)
  84. s.pindex = p
  85. return p
  86. }
  87. }
  88. i++
  89. }
  90. return pr
  91. }
  92. // scan is a verbatim copy of ctScanner.scan.
  93. func (s *ctScannerString) scan(p int) int {
  94. pr := p // the p at the rune start
  95. str := s.s
  96. states, n := s.states, s.n
  97. for i := 0; i < n && p < len(str); {
  98. e := states[i]
  99. c := str[p]
  100. // TODO: a significant number of contractions are of a form that
  101. // cannot match discontiguous UTF-8 in a normalized string. We could let
  102. // a negative value of e.n mean that we can set s.done = true and avoid
  103. // the need for additional matches.
  104. if c >= e.L {
  105. if e.L == c {
  106. p++
  107. if e.I != noIndex {
  108. s.index = int(e.I)
  109. s.pindex = p
  110. }
  111. if e.N != final {
  112. i, states, n = 0, states[int(e.H)+n:], int(e.N)
  113. if p >= len(str) || utf8.RuneStart(str[p]) {
  114. s.states, s.n, pr = states, n, p
  115. }
  116. } else {
  117. s.done = true
  118. return p
  119. }
  120. continue
  121. } else if e.N == final && c <= e.H {
  122. p++
  123. s.done = true
  124. s.index = int(c-e.L) + int(e.I)
  125. s.pindex = p
  126. return p
  127. }
  128. }
  129. i++
  130. }
  131. return pr
  132. }