You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

240 rivejä
6.3 KiB

  1. // Copyright 2014 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package collate
  5. import (
  6. "sort"
  7. "golang.org/x/text/internal/colltab"
  8. "golang.org/x/text/language"
  9. "golang.org/x/text/unicode/norm"
  10. )
  11. // newCollator creates a new collator with default options configured.
  12. func newCollator(t colltab.Weighter) *Collator {
  13. // Initialize a collator with default options.
  14. c := &Collator{
  15. options: options{
  16. ignore: [colltab.NumLevels]bool{
  17. colltab.Quaternary: true,
  18. colltab.Identity: true,
  19. },
  20. f: norm.NFD,
  21. t: t,
  22. },
  23. }
  24. // TODO: store vt in tags or remove.
  25. c.variableTop = t.Top()
  26. return c
  27. }
  28. // An Option is used to change the behavior of a Collator. Options override the
  29. // settings passed through the locale identifier.
  30. type Option struct {
  31. priority int
  32. f func(o *options)
  33. }
  34. type prioritizedOptions []Option
  35. func (p prioritizedOptions) Len() int {
  36. return len(p)
  37. }
  38. func (p prioritizedOptions) Swap(i, j int) {
  39. p[i], p[j] = p[j], p[i]
  40. }
  41. func (p prioritizedOptions) Less(i, j int) bool {
  42. return p[i].priority < p[j].priority
  43. }
  44. type options struct {
  45. // ignore specifies which levels to ignore.
  46. ignore [colltab.NumLevels]bool
  47. // caseLevel is true if there is an additional level of case matching
  48. // between the secondary and tertiary levels.
  49. caseLevel bool
  50. // backwards specifies the order of sorting at the secondary level.
  51. // This option exists predominantly to support reverse sorting of accents in French.
  52. backwards bool
  53. // numeric specifies whether any sequence of decimal digits (category is Nd)
  54. // is sorted at a primary level with its numeric value.
  55. // For example, "A-21" < "A-123".
  56. // This option is set by wrapping the main Weighter with NewNumericWeighter.
  57. numeric bool
  58. // alternate specifies an alternative handling of variables.
  59. alternate alternateHandling
  60. // variableTop is the largest primary value that is considered to be
  61. // variable.
  62. variableTop uint32
  63. t colltab.Weighter
  64. f norm.Form
  65. }
  66. func (o *options) setOptions(opts []Option) {
  67. sort.Sort(prioritizedOptions(opts))
  68. for _, x := range opts {
  69. x.f(o)
  70. }
  71. }
  72. // OptionsFromTag extracts the BCP47 collation options from the tag and
  73. // configures a collator accordingly. These options are set before any other
  74. // option.
  75. func OptionsFromTag(t language.Tag) Option {
  76. return Option{0, func(o *options) {
  77. o.setFromTag(t)
  78. }}
  79. }
  80. func (o *options) setFromTag(t language.Tag) {
  81. o.caseLevel = ldmlBool(t, o.caseLevel, "kc")
  82. o.backwards = ldmlBool(t, o.backwards, "kb")
  83. o.numeric = ldmlBool(t, o.numeric, "kn")
  84. // Extract settings from the BCP47 u extension.
  85. switch t.TypeForKey("ks") { // strength
  86. case "level1":
  87. o.ignore[colltab.Secondary] = true
  88. o.ignore[colltab.Tertiary] = true
  89. case "level2":
  90. o.ignore[colltab.Tertiary] = true
  91. case "level3", "":
  92. // The default.
  93. case "level4":
  94. o.ignore[colltab.Quaternary] = false
  95. case "identic":
  96. o.ignore[colltab.Quaternary] = false
  97. o.ignore[colltab.Identity] = false
  98. }
  99. switch t.TypeForKey("ka") {
  100. case "shifted":
  101. o.alternate = altShifted
  102. // The following two types are not official BCP47, but we support them to
  103. // give access to this otherwise hidden functionality. The name blanked is
  104. // derived from the LDML name blanked and posix reflects the main use of
  105. // the shift-trimmed option.
  106. case "blanked":
  107. o.alternate = altBlanked
  108. case "posix":
  109. o.alternate = altShiftTrimmed
  110. }
  111. // TODO: caseFirst ("kf"), reorder ("kr"), and maybe variableTop ("vt").
  112. // Not used:
  113. // - normalization ("kk", not necessary for this implementation)
  114. // - hiraganaQuatenary ("kh", obsolete)
  115. }
  116. func ldmlBool(t language.Tag, old bool, key string) bool {
  117. switch t.TypeForKey(key) {
  118. case "true":
  119. return true
  120. case "false":
  121. return false
  122. default:
  123. return old
  124. }
  125. }
  126. var (
  127. // IgnoreCase sets case-insensitive comparison.
  128. IgnoreCase Option = ignoreCase
  129. ignoreCase = Option{3, ignoreCaseF}
  130. // IgnoreDiacritics causes diacritical marks to be ignored. ("o" == "ö").
  131. IgnoreDiacritics Option = ignoreDiacritics
  132. ignoreDiacritics = Option{3, ignoreDiacriticsF}
  133. // IgnoreWidth causes full-width characters to match their half-width
  134. // equivalents.
  135. IgnoreWidth Option = ignoreWidth
  136. ignoreWidth = Option{2, ignoreWidthF}
  137. // Loose sets the collator to ignore diacritics, case and width.
  138. Loose Option = loose
  139. loose = Option{4, looseF}
  140. // Force ordering if strings are equivalent but not equal.
  141. Force Option = force
  142. force = Option{5, forceF}
  143. // Numeric specifies that numbers should sort numerically ("2" < "12").
  144. Numeric Option = numeric
  145. numeric = Option{5, numericF}
  146. )
  147. func ignoreWidthF(o *options) {
  148. o.ignore[colltab.Tertiary] = true
  149. o.caseLevel = true
  150. }
  151. func ignoreDiacriticsF(o *options) {
  152. o.ignore[colltab.Secondary] = true
  153. }
  154. func ignoreCaseF(o *options) {
  155. o.ignore[colltab.Tertiary] = true
  156. o.caseLevel = false
  157. }
  158. func looseF(o *options) {
  159. ignoreWidthF(o)
  160. ignoreDiacriticsF(o)
  161. ignoreCaseF(o)
  162. }
  163. func forceF(o *options) {
  164. o.ignore[colltab.Identity] = false
  165. }
  166. func numericF(o *options) { o.numeric = true }
  167. // Reorder overrides the pre-defined ordering of scripts and character sets.
  168. func Reorder(s ...string) Option {
  169. // TODO: need fractional weights to implement this.
  170. panic("TODO: implement")
  171. }
  172. // TODO: consider making these public again. These options cannot be fully
  173. // specified in BCP47, so an API interface seems warranted. Still a higher-level
  174. // interface would be nice (e.g. a POSIX option for enabling altShiftTrimmed)
  175. // alternateHandling identifies the various ways in which variables are handled.
  176. // A rune with a primary weight lower than the variable top is considered a
  177. // variable.
  178. // See https://www.unicode.org/reports/tr10/#Variable_Weighting for details.
  179. type alternateHandling int
  180. const (
  181. // altNonIgnorable turns off special handling of variables.
  182. altNonIgnorable alternateHandling = iota
  183. // altBlanked sets variables and all subsequent primary ignorables to be
  184. // ignorable at all levels. This is identical to removing all variables
  185. // and subsequent primary ignorables from the input.
  186. altBlanked
  187. // altShifted sets variables to be ignorable for levels one through three and
  188. // adds a fourth level based on the values of the ignored levels.
  189. altShifted
  190. // altShiftTrimmed is a slight variant of altShifted that is used to
  191. // emulate POSIX.
  192. altShiftTrimmed
  193. )