You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

237 lines
7.0 KiB

  1. // Copyright 2014 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package colltab
  5. import (
  6. "unicode"
  7. "unicode/utf8"
  8. )
  9. // NewNumericWeighter wraps w to replace individual digits to sort based on their
  10. // numeric value.
  11. //
  12. // Weighter w must have a free primary weight after the primary weight for 9.
  13. // If this is not the case, numeric value will sort at the same primary level
  14. // as the first primary sorting after 9.
  15. func NewNumericWeighter(w Weighter) Weighter {
  16. getElem := func(s string) Elem {
  17. elems, _ := w.AppendNextString(nil, s)
  18. return elems[0]
  19. }
  20. nine := getElem("9")
  21. // Numbers should order before zero, but the DUCET has no room for this.
  22. // TODO: move before zero once we use fractional collation elements.
  23. ns, _ := MakeElem(nine.Primary()+1, nine.Secondary(), int(nine.Tertiary()), 0)
  24. return &numericWeighter{
  25. Weighter: w,
  26. // We assume that w sorts digits of different kinds in order of numeric
  27. // value and that the tertiary weight order is preserved.
  28. //
  29. // TODO: evaluate whether it is worth basing the ranges on the Elem
  30. // encoding itself once the move to fractional weights is complete.
  31. zero: getElem("0"),
  32. zeroSpecialLo: getElem("0"), // U+FF10 FULLWIDTH DIGIT ZERO
  33. zeroSpecialHi: getElem("₀"), // U+2080 SUBSCRIPT ZERO
  34. nine: nine,
  35. nineSpecialHi: getElem("₉"), // U+2089 SUBSCRIPT NINE
  36. numberStart: ns,
  37. }
  38. }
  39. // A numericWeighter translates a stream of digits into a stream of weights
  40. // representing the numeric value.
  41. type numericWeighter struct {
  42. Weighter
  43. // The Elems below all demarcate boundaries of specific ranges. With the
  44. // current element encoding digits are in two ranges: normal (default
  45. // tertiary value) and special. For most languages, digits have collation
  46. // elements in the normal range.
  47. //
  48. // Note: the range tests are very specific for the element encoding used by
  49. // this implementation. The tests in collate_test.go are designed to fail
  50. // if this code is not updated when an encoding has changed.
  51. zero Elem // normal digit zero
  52. zeroSpecialLo Elem // special digit zero, low tertiary value
  53. zeroSpecialHi Elem // special digit zero, high tertiary value
  54. nine Elem // normal digit nine
  55. nineSpecialHi Elem // special digit nine
  56. numberStart Elem
  57. }
  58. // AppendNext calls the namesake of the underlying weigher, but replaces single
  59. // digits with weights representing their value.
  60. func (nw *numericWeighter) AppendNext(buf []Elem, s []byte) (ce []Elem, n int) {
  61. ce, n = nw.Weighter.AppendNext(buf, s)
  62. nc := numberConverter{
  63. elems: buf,
  64. w: nw,
  65. b: s,
  66. }
  67. isZero, ok := nc.checkNextDigit(ce)
  68. if !ok {
  69. return ce, n
  70. }
  71. // ce might have been grown already, so take it instead of buf.
  72. nc.init(ce, len(buf), isZero)
  73. for n < len(s) {
  74. ce, sz := nw.Weighter.AppendNext(nc.elems, s[n:])
  75. nc.b = s
  76. n += sz
  77. if !nc.update(ce) {
  78. break
  79. }
  80. }
  81. return nc.result(), n
  82. }
  83. // AppendNextString calls the namesake of the underlying weigher, but replaces
  84. // single digits with weights representing their value.
  85. func (nw *numericWeighter) AppendNextString(buf []Elem, s string) (ce []Elem, n int) {
  86. ce, n = nw.Weighter.AppendNextString(buf, s)
  87. nc := numberConverter{
  88. elems: buf,
  89. w: nw,
  90. s: s,
  91. }
  92. isZero, ok := nc.checkNextDigit(ce)
  93. if !ok {
  94. return ce, n
  95. }
  96. nc.init(ce, len(buf), isZero)
  97. for n < len(s) {
  98. ce, sz := nw.Weighter.AppendNextString(nc.elems, s[n:])
  99. nc.s = s
  100. n += sz
  101. if !nc.update(ce) {
  102. break
  103. }
  104. }
  105. return nc.result(), n
  106. }
  107. type numberConverter struct {
  108. w *numericWeighter
  109. elems []Elem
  110. nDigits int
  111. lenIndex int
  112. s string // set if the input was of type string
  113. b []byte // set if the input was of type []byte
  114. }
  115. // init completes initialization of a numberConverter and prepares it for adding
  116. // more digits. elems is assumed to have a digit starting at oldLen.
  117. func (nc *numberConverter) init(elems []Elem, oldLen int, isZero bool) {
  118. // Insert a marker indicating the start of a number and a placeholder
  119. // for the number of digits.
  120. if isZero {
  121. elems = append(elems[:oldLen], nc.w.numberStart, 0)
  122. } else {
  123. elems = append(elems, 0, 0)
  124. copy(elems[oldLen+2:], elems[oldLen:])
  125. elems[oldLen] = nc.w.numberStart
  126. elems[oldLen+1] = 0
  127. nc.nDigits = 1
  128. }
  129. nc.elems = elems
  130. nc.lenIndex = oldLen + 1
  131. }
  132. // checkNextDigit reports whether bufNew adds a single digit relative to the old
  133. // buffer. If it does, it also reports whether this digit is zero.
  134. func (nc *numberConverter) checkNextDigit(bufNew []Elem) (isZero, ok bool) {
  135. if len(nc.elems) >= len(bufNew) {
  136. return false, false
  137. }
  138. e := bufNew[len(nc.elems)]
  139. if e < nc.w.zeroSpecialLo || nc.w.nine < e {
  140. // Not a number.
  141. return false, false
  142. }
  143. if e < nc.w.zero {
  144. if e > nc.w.nineSpecialHi {
  145. // Not a number.
  146. return false, false
  147. }
  148. if !nc.isDigit() {
  149. return false, false
  150. }
  151. isZero = e <= nc.w.zeroSpecialHi
  152. } else {
  153. // This is the common case if we encounter a digit.
  154. isZero = e == nc.w.zero
  155. }
  156. // Test the remaining added collation elements have a zero primary value.
  157. if n := len(bufNew) - len(nc.elems); n > 1 {
  158. for i := len(nc.elems) + 1; i < len(bufNew); i++ {
  159. if bufNew[i].Primary() != 0 {
  160. return false, false
  161. }
  162. }
  163. // In some rare cases, collation elements will encode runes in
  164. // unicode.No as a digit. For example Ethiopic digits (U+1369 - U+1371)
  165. // are not in Nd. Also some digits that clearly belong in unicode.No,
  166. // like U+0C78 TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR, have
  167. // collation elements indistinguishable from normal digits.
  168. // Unfortunately, this means we need to make this check for nearly all
  169. // non-Latin digits.
  170. //
  171. // TODO: check the performance impact and find something better if it is
  172. // an issue.
  173. if !nc.isDigit() {
  174. return false, false
  175. }
  176. }
  177. return isZero, true
  178. }
  179. func (nc *numberConverter) isDigit() bool {
  180. if nc.b != nil {
  181. r, _ := utf8.DecodeRune(nc.b)
  182. return unicode.In(r, unicode.Nd)
  183. }
  184. r, _ := utf8.DecodeRuneInString(nc.s)
  185. return unicode.In(r, unicode.Nd)
  186. }
  187. // We currently support a maximum of about 2M digits (the number of primary
  188. // values). Such numbers will compare correctly against small numbers, but their
  189. // comparison against other large numbers is undefined.
  190. //
  191. // TODO: define a proper fallback, such as comparing large numbers textually or
  192. // actually allowing numbers of unlimited length.
  193. //
  194. // TODO: cap this to a lower number (like 100) and maybe allow a larger number
  195. // in an option?
  196. const maxDigits = 1<<maxPrimaryBits - 1
  197. func (nc *numberConverter) update(elems []Elem) bool {
  198. isZero, ok := nc.checkNextDigit(elems)
  199. if nc.nDigits == 0 && isZero {
  200. return true
  201. }
  202. nc.elems = elems
  203. if !ok {
  204. return false
  205. }
  206. nc.nDigits++
  207. return nc.nDigits < maxDigits
  208. }
  209. // result fills in the length element for the digit sequence and returns the
  210. // completed collation elements.
  211. func (nc *numberConverter) result() []Elem {
  212. e, _ := MakeElem(nc.nDigits, defaultSecondary, defaultTertiary, 0)
  213. nc.elems[nc.lenIndex] = e
  214. return nc.elems
  215. }