You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

841 lines
26 KiB

  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package language
  5. import "errors"
// Matcher is the interface that wraps the Match method.
//
// Match takes the desired tags t and returns the best match for any of them,
// along with a unique index associated with the returned tag and a confidence
// score c for the match.
type Matcher interface {
	Match(t ...Tag) (tag Tag, index int, c Confidence)
}
  14. // Comprehends reports the confidence score for a speaker of a given language
  15. // to being able to comprehend the written form of an alternative language.
  16. func Comprehends(speaker, alternative Tag) Confidence {
  17. _, _, c := NewMatcher([]Tag{alternative}).Match(speaker)
  18. return c
  19. }
// NewMatcher returns a Matcher that matches an ordered list of preferred tags
// against a list of supported tags based on written intelligibility, closeness
// of dialect, equivalence of subtags and various other rules. It is initialized
// with the list of supported tags. The first element is used as the default
// value in case no match is found.
//
// Its Match method matches the first of the given Tags to reach a certain
// confidence threshold. The tags passed to Match should therefore be specified
// in order of preference. Extensions are ignored for matching.
//
// The index returned by the Match method corresponds to the index of the
// matched tag in t, but is augmented with the Unicode extension ('u') of the
// corresponding preferred tag. This allows user locale options to be passed
// transparently.
func NewMatcher(t []Tag) Matcher {
	return newMatcher(t)
}
  37. func (m *matcher) Match(want ...Tag) (t Tag, index int, c Confidence) {
  38. match, w, c := m.getBest(want...)
  39. if match == nil {
  40. t = m.default_.tag
  41. } else {
  42. t, index = match.tag, match.index
  43. }
  44. // Copy options from the user-provided tag into the result tag. This is hard
  45. // to do after the fact, so we do it here.
  46. // TODO: consider also adding in variants that are compatible with the
  47. // matched language.
  48. // TODO: Add back region if it is non-ambiguous? Or create another tag to
  49. // preserve the region?
  50. if u, ok := w.Extension('u'); ok {
  51. t, _ = Raw.Compose(t, u)
  52. }
  53. return t, index, c
  54. }
  55. type scriptRegionFlags uint8
  56. const (
  57. isList = 1 << iota
  58. scriptInFrom
  59. regionInFrom
  60. )
  61. func (t *Tag) setUndefinedLang(id langID) {
  62. if t.lang == 0 {
  63. t.lang = id
  64. }
  65. }
  66. func (t *Tag) setUndefinedScript(id scriptID) {
  67. if t.script == 0 {
  68. t.script = id
  69. }
  70. }
  71. func (t *Tag) setUndefinedRegion(id regionID) {
  72. if t.region == 0 || t.region.contains(id) {
  73. t.region = id
  74. }
  75. }
// ErrMissingLikelyTagsData indicates no information was available
// to compute likely values of missing tags. It is the error returned by
// addTags when a tag cannot be expanded.
var ErrMissingLikelyTagsData = errors.New("missing likely tags data")
  79. // addLikelySubtags sets subtags to their most likely value, given the locale.
  80. // In most cases this means setting fields for unknown values, but in some
  81. // cases it may alter a value. It returns a ErrMissingLikelyTagsData error
  82. // if the given locale cannot be expanded.
  83. func (t Tag) addLikelySubtags() (Tag, error) {
  84. id, err := addTags(t)
  85. if err != nil {
  86. return t, err
  87. } else if id.equalTags(t) {
  88. return t, nil
  89. }
  90. id.remakeString()
  91. return id, nil
  92. }
  93. // specializeRegion attempts to specialize a group region.
  94. func specializeRegion(t *Tag) bool {
  95. if i := regionInclusion[t.region]; i < nRegionGroups {
  96. x := likelyRegionGroup[i]
  97. if langID(x.lang) == t.lang && scriptID(x.script) == t.script {
  98. t.region = regionID(x.region)
  99. }
  100. return true
  101. }
  102. return false
  103. }
// addTags fills in the undefined language, script and region of t from the
// likely-subtag tables and returns the result. Private use tags are returned
// unchanged. It returns ErrMissingLikelyTagsData when no rule applies and the
// language of t is not below langNoIndexOffset.
//
// The lookups run from most to least specific: und-script-region,
// lang-script / lang-region, und-script, und-region and finally lang alone.
func addTags(t Tag) (Tag, error) {
	// We leave private use identifiers alone.
	if t.private() {
		return t, nil
	}
	if t.script != 0 && t.region != 0 {
		if t.lang != 0 {
			// already fully specified
			specializeRegion(&t)
			return t, nil
		}
		// Search matches for und-script-region. Note that for these cases
		// region will never be a group so there is no need to check for this.
		list := likelyRegion[t.region : t.region+1]
		if x := list[0]; x.flags&isList != 0 {
			// For list entries, x.lang is used as the start offset and
			// x.script as the length of the range in likelyRegionList.
			list = likelyRegionList[x.lang : x.lang+uint16(x.script)]
		}
		for _, x := range list {
			// Deviating from the spec. See match_test.go for details.
			if scriptID(x.script) == t.script {
				t.setUndefinedLang(langID(x.lang))
				return t, nil
			}
		}
	}
	if t.lang != 0 {
		// Search matches for lang-script and lang-region, where lang != und.
		if t.lang < langNoIndexOffset {
			x := likelyLang[t.lang]
			if x.flags&isList != 0 {
				// For list entries, x.region is used as the start offset and
				// x.script as the length of the range in likelyLangList.
				list := likelyLangList[x.region : x.region+uint16(x.script)]
				if t.script != 0 {
					for _, x := range list {
						if scriptID(x.script) == t.script && x.flags&scriptInFrom != 0 {
							t.setUndefinedRegion(regionID(x.region))
							return t, nil
						}
					}
				} else if t.region != 0 {
					count := 0
					goodScript := true
					// tt accumulates the candidate result; t stays untouched
					// unless the script turns out to be unambiguous.
					tt := t
					for _, x := range list {
						// We visit all entries for which the script was not
						// defined, including the ones where the region was not
						// defined. This allows for proper disambiguation within
						// regions.
						if x.flags&scriptInFrom == 0 && t.region.contains(regionID(x.region)) {
							tt.region = regionID(x.region)
							tt.setUndefinedScript(scriptID(x.script))
							goodScript = goodScript && tt.script == scriptID(x.script)
							count++
						}
					}
					if count == 1 {
						return tt, nil
					}
					// Even if we fail to find a unique Region, we might have
					// an unambiguous script.
					if goodScript {
						t.script = tt.script
					}
				}
			}
		}
	} else {
		// Search matches for und-script.
		if t.script != 0 {
			x := likelyScript[t.script]
			if x.region != 0 {
				t.setUndefinedRegion(regionID(x.region))
				t.setUndefinedLang(langID(x.lang))
				return t, nil
			}
		}
		// Search matches for und-region. If und-script-region exists, it would
		// have been found earlier.
		if t.region != 0 {
			if i := regionInclusion[t.region]; i < nRegionGroups {
				// The region is a group: use the group's likely entry. Note
				// that this branch does not return, so the generic language
				// lookup below still runs.
				x := likelyRegionGroup[i]
				if x.region != 0 {
					t.setUndefinedLang(langID(x.lang))
					t.setUndefinedScript(scriptID(x.script))
					t.region = regionID(x.region)
				}
			} else {
				x := likelyRegion[t.region]
				if x.flags&isList != 0 {
					// Only the first entry of the list is considered here.
					x = likelyRegionList[x.lang]
				}
				if x.script != 0 && x.flags != scriptInFrom {
					t.setUndefinedLang(langID(x.lang))
					t.setUndefinedScript(scriptID(x.script))
					return t, nil
				}
			}
		}
	}
	// Search matches for lang.
	if t.lang < langNoIndexOffset {
		x := likelyLang[t.lang]
		if x.flags&isList != 0 {
			// Only the first entry of the list is considered here.
			x = likelyLangList[x.region]
		}
		if x.region != 0 {
			t.setUndefinedScript(scriptID(x.script))
			t.setUndefinedRegion(regionID(x.region))
		}
		specializeRegion(&t)
		if t.lang == 0 {
			t.lang = _en // default language
		}
		return t, nil
	}
	return t, ErrMissingLikelyTagsData
}
  220. func (t *Tag) setTagsFrom(id Tag) {
  221. t.lang = id.lang
  222. t.script = id.script
  223. t.region = id.region
  224. }
  225. // minimize removes the region or script subtags from t such that
  226. // t.addLikelySubtags() == t.minimize().addLikelySubtags().
  227. func (t Tag) minimize() (Tag, error) {
  228. t, err := minimizeTags(t)
  229. if err != nil {
  230. return t, err
  231. }
  232. t.remakeString()
  233. return t, nil
  234. }
  235. // minimizeTags mimics the behavior of the ICU 51 C implementation.
  236. func minimizeTags(t Tag) (Tag, error) {
  237. if t.equalTags(und) {
  238. return t, nil
  239. }
  240. max, err := addTags(t)
  241. if err != nil {
  242. return t, err
  243. }
  244. for _, id := range [...]Tag{
  245. {lang: t.lang},
  246. {lang: t.lang, region: t.region},
  247. {lang: t.lang, script: t.script},
  248. } {
  249. if x, err := addTags(id); err == nil && max.equalTags(x) {
  250. t.setTagsFrom(id)
  251. break
  252. }
  253. }
  254. return t, nil
  255. }
  256. // Tag Matching
  257. // CLDR defines an algorithm for finding the best match between two sets of language
  258. // tags. The basic algorithm defines how to score a possible match and then find
  259. // the match with the best score
  260. // (see http://www.unicode.org/reports/tr35/#LanguageMatching).
  261. // Using scoring has several disadvantages. The scoring obfuscates the importance of
  262. // the various factors considered, making the algorithm harder to understand. Using
  263. // scoring also requires the full score to be computed for each pair of tags.
  264. //
  265. // We will use a different algorithm which aims to have the following properties:
  266. // - clarity on the precedence of the various selection factors, and
  267. // - improved performance by allowing early termination of a comparison.
  268. //
  269. // Matching algorithm (overview)
  270. // Input:
  271. // - supported: a set of supported tags
  272. // - default: the default tag to return in case there is no match
  273. // - desired: list of desired tags, ordered by preference, starting with
  274. // the most-preferred.
  275. //
  276. // Algorithm:
  277. // 1) Set the best match to the lowest confidence level
  278. // 2) For each tag in "desired":
  279. // a) For each tag in "supported":
  280. // 1) compute the match between the two tags.
  281. // 2) if the match is better than the previous best match, replace it
  282. // with the new match. (see next section)
  283. // b) if the current best match is above a certain threshold, return this
  284. // match without proceeding to the next tag in "desired". [See Note 1]
  285. // 3) If the best match so far is below a certain threshold, return "default".
  286. //
  287. // Ranking:
  288. // We use two phases to determine whether one pair of tags are a better match
  289. // than another pair of tags. First, we determine a rough confidence level. If the
  290. // levels are different, the one with the highest confidence wins.
  291. // Second, if the rough confidence levels are identical, we use a set of tie-breaker
  292. // rules.
  293. //
  294. // The confidence level of matching a pair of tags is determined by finding the
  295. // lowest confidence level of any matches of the corresponding subtags (the
  296. // result is deemed as good as its weakest link).
  297. // We define the following levels:
  298. // Exact - An exact match of a subtag, before adding likely subtags.
  299. // MaxExact - An exact match of a subtag, after adding likely subtags.
  300. // [See Note 2].
  301. // High - High level of mutual intelligibility between different subtag
  302. // variants.
  303. // Low - Low level of mutual intelligibility between different subtag
  304. // variants.
  305. // No - No mutual intelligibility.
  306. //
  307. // The following levels can occur for each type of subtag:
  308. // Base: Exact, MaxExact, High, Low, No
  309. // Script: Exact, MaxExact [see Note 3], Low, No
  310. // Region: Exact, MaxExact, High
  311. // Variant: Exact, High
  312. // Private: Exact, No
  313. //
  314. // Any result with a confidence level of Low or higher is deemed a possible match.
  315. // Once a desired tag matches any of the supported tags with a level of MaxExact
  316. // or higher, the next desired tag is not considered (see Step 2.b).
  317. // Note that CLDR provides languageMatching data that defines close equivalence
  318. // classes for base languages, scripts and regions.
  319. //
  320. // Tie-breaking
  321. // If we get the same confidence level for two matches, we apply a sequence of
  322. // tie-breaking rules. The first that succeeds defines the result. The rules are
  323. // applied in the following order.
  324. // 1) Original language was defined and was identical.
  325. // 2) Original region was defined and was identical.
  326. // 3) Distance between two maximized regions was the smallest.
  327. // 4) Original script was defined and was identical.
// 5) Distance from want tag to have tag using the parent relation [see Note 5].
  329. // If there is still no winner after these rules are applied, the first match
  330. // found wins.
  331. //
  332. // Notes:
  333. // [1] Note that even if we may not have a perfect match, if a match is above a
  334. // certain threshold, it is considered a better match than any other match
  335. // to a tag later in the list of preferred language tags.
  336. // [2] In practice, as matching of Exact is done in a separate phase from
  337. // matching the other levels, we reuse the Exact level to mean MaxExact in
  338. // the second phase. As a consequence, we only need the levels defined by
  339. // the Confidence type. The MaxExact confidence level is mapped to High in
  340. // the public API.
  341. // [3] We do not differentiate between maximized script values that were derived
  342. // from suppressScript versus most likely tag data. We determined that in
  343. // ranking the two, one ranks just after the other. Moreover, the two cannot
  344. // occur concurrently. As a consequence, they are identical for practical
  345. // purposes.
  346. // [4] In case of deprecated, macro-equivalents and legacy mappings, we assign
  347. // the MaxExact level to allow iw vs he to still be a closer match than
  348. // en-AU vs en-US, for example.
  349. // [5] In CLDR a locale inherits fields that are unspecified for this locale
  350. // from its parent. Therefore, if a locale is a parent of another locale,
  351. // it is a strong measure for closeness, especially when no other tie
  352. // breaker rule applies. One could also argue it is inconsistent, for
  353. // example, when pt-AO matches pt (which CLDR equates with pt-BR), even
  354. // though its parent is pt-PT according to the inheritance rules.
  355. //
  356. // Implementation Details:
  357. // There are several performance considerations worth pointing out. Most notably,
  358. // we preprocess as much as possible (within reason) at the time of creation of a
  359. // matcher. This includes:
  360. // - creating a per-language map, which includes data for the raw base language
  361. // and its canonicalized variant (if applicable),
  362. // - expanding entries for the equivalence classes defined in CLDR's
  363. // languageMatch data.
  364. // The per-language map ensures that typically only a very small number of tags
  365. // need to be considered. The pre-expansion of canonicalized subtags and
  366. // equivalence classes reduces the amount of map lookups that need to be done at
  367. // runtime.
// matcher keeps a set of supported language tags, indexed by language.
type matcher struct {
	// default_ is the entry whose tag Match returns when no match is found.
	default_ *haveTag
	// index maps a language to the header holding its candidate tags.
	index map[langID]*matchHeader
	// NOTE(review): passSettings is not read or written anywhere in the
	// visible part of this file — confirm whether it is still needed.
	passSettings bool
}
// matchHeader has the lists of tags for exact matches and matches based on
// maximized and canonicalized tags for a given language.
type matchHeader struct {
	// exact is searched first by getBest, comparing everything but the language.
	exact []haveTag
	// max is searched by getBest when no exact entry matches. Entries with the
	// same maximized values are chained through haveTag.nextMax.
	max []haveTag
}
// haveTag holds a supported Tag and its maximized script and region. The maximized
// or canonicalized language is not stored as it is not needed during matching.
type haveTag struct {
	// tag is the supported tag as it was passed to the matcher.
	tag Tag

	// index of this tag in the original list of supported tags.
	index int

	// conf is the maximum confidence that can result from matching this haveTag.
	// When conf < Exact this means it was inserted after applying a CLDR equivalence rule.
	conf Confidence

	// Maximized region and script.
	maxRegion regionID
	maxScript scriptID

	// altScript may be checked as an alternative match to maxScript. If altScript
	// matches, the confidence level for this match is Low. Theoretically there
	// could be multiple alternative scripts. This does not occur in practice.
	altScript scriptID

	// nextMax is the index of the next haveTag with the same maximized tags.
	// A value of 0 marks the end of the chain.
	nextMax uint16
}
  399. func makeHaveTag(tag Tag, index int) (haveTag, langID) {
  400. max := tag
  401. if tag.lang != 0 {
  402. max, _ = max.canonicalize(All)
  403. max, _ = addTags(max)
  404. max.remakeString()
  405. }
  406. return haveTag{tag, index, Exact, max.region, max.script, altScript(max.lang, max.script), 0}, max.lang
  407. }
  408. // altScript returns an alternative script that may match the given script with
  409. // a low confidence. At the moment, the langMatch data allows for at most one
  410. // script to map to another and we rely on this to keep the code simple.
  411. func altScript(l langID, s scriptID) scriptID {
  412. for _, alt := range matchScript {
  413. if (alt.lang == 0 || langID(alt.lang) == l) && scriptID(alt.have) == s {
  414. return scriptID(alt.want)
  415. }
  416. }
  417. return 0
  418. }
  419. // addIfNew adds a haveTag to the list of tags only if it is a unique tag.
  420. // Tags that have the same maximized values are linked by index.
  421. func (h *matchHeader) addIfNew(n haveTag, exact bool) {
  422. // Don't add new exact matches.
  423. for _, v := range h.exact {
  424. if v.tag.equalsRest(n.tag) {
  425. return
  426. }
  427. }
  428. if exact {
  429. h.exact = append(h.exact, n)
  430. }
  431. // Allow duplicate maximized tags, but create a linked list to allow quickly
  432. // comparing the equivalents and bail out.
  433. for i, v := range h.max {
  434. if v.maxScript == n.maxScript &&
  435. v.maxRegion == n.maxRegion &&
  436. v.tag.variantOrPrivateTagStr() == n.tag.variantOrPrivateTagStr() {
  437. for h.max[i].nextMax != 0 {
  438. i = int(h.max[i].nextMax)
  439. }
  440. h.max[i].nextMax = uint16(len(h.max))
  441. break
  442. }
  443. }
  444. h.max = append(h.max, n)
  445. }
  446. // header returns the matchHeader for the given language. It creates one if
  447. // it doesn't already exist.
  448. func (m *matcher) header(l langID) *matchHeader {
  449. if h := m.index[l]; h != nil {
  450. return h
  451. }
  452. h := &matchHeader{}
  453. m.index[l] = h
  454. return h
  455. }
// newMatcher builds an index for the given supported tags and returns it as
// a matcher. It also expands the index by considering various equivalence classes
// for a given tag.
//
// With an empty supported list the matcher has an empty index and a zero
// haveTag as default. Otherwise the default is the first exact entry stored
// under the language of supported[0].
func newMatcher(supported []Tag) *matcher {
	m := &matcher{
		index: make(map[langID]*matchHeader),
	}
	if len(supported) == 0 {
		m.default_ = &haveTag{}
		return m
	}
	// Add supported languages to the index. Add exact matches first to give
	// them precedence.
	for i, tag := range supported {
		pair, _ := makeHaveTag(tag, i)
		m.header(tag.lang).addIfNew(pair, true)
	}
	m.default_ = &m.header(supported[0].lang).exact[0]
	// Second pass: also register each tag under its maximized language when
	// that differs from the language it was given with.
	for i, tag := range supported {
		pair, max := makeHaveTag(tag, i)
		if max != tag.lang {
			m.header(max).addIfNew(pair, false)
		}
	}

	// update is used to add indexes in the map for equivalent languages.
	// If force is true, the update will also apply to derived entries. To
	// avoid applying a "transitive closure", use false.
	update := func(want, have uint16, conf Confidence, force bool) {
		if hh := m.index[langID(have)]; hh != nil {
			if !force && len(hh.exact) == 0 {
				return
			}
			hw := m.header(langID(want))
			// v is a copy of each entry: the adjustments below only affect
			// the entry added to hw, not the one stored in hh.
			for _, v := range hh.max {
				if conf < v.conf {
					v.conf = conf
				}
				v.nextMax = 0 // this value needs to be recomputed
				if v.altScript != 0 {
					v.altScript = altScript(langID(want), v.maxScript)
				}
				hw.addIfNew(v, conf == Exact && len(hh.exact) > 0)
			}
		}
	}

	// Add entries for languages with mutual intelligibility as defined by CLDR's
	// languageMatch data.
	for _, ml := range matchLang {
		update(ml.want, ml.have, Confidence(ml.conf), false)
		if !ml.oneway {
			update(ml.have, ml.want, Confidence(ml.conf), false)
		}
	}

	// Add entries for possible canonicalizations. This is an optimization to
	// ensure that only one map lookup needs to be done at runtime per desired tag.
	// First we match deprecated equivalents. If they are perfect equivalents
	// (their canonicalization simply substitutes a different language code, but
	// nothing else), the match confidence is Exact, otherwise it is High.
	for i, lm := range langAliasMap {
		// Aliases whose source language is _sh are skipped entirely.
		if lm.from == _sh {
			continue
		}

		// If deprecated codes match and there is no fiddling with the script
		// or region, we consider it an exact match.
		conf := Exact
		if langAliasTypes[i] != langMacro {
			if !isExactEquivalent(langID(lm.from)) {
				conf = High
			}
			update(lm.to, lm.from, conf, true)
		}
		update(lm.from, lm.to, conf, true)
	}
	return m
}
// getBest gets the best matching tag in m for any of the given tags, taking into
// account the order of preference of the given tags.
//
// It returns the matched supported entry, the desired tag that produced the
// match and the confidence. When nothing matches, got is nil, c is No and
// orig is want[0] (or the zero Tag if want is empty).
func (m *matcher) getBest(want ...Tag) (got *haveTag, orig Tag, c Confidence) {
	best := bestMatch{}
	for _, w := range want {
		var max Tag
		// Check for exact match first.
		h := m.index[w.lang]
		if w.lang != 0 {
			// Base language is defined.
			if h == nil {
				continue
			}
			for i := range h.exact {
				have := &h.exact[i]
				if have.tag.equalsRest(w) {
					return have, w, Exact
				}
			}
			// Errors and secondary results of canonicalize and addTags are
			// ignored; max is used as returned.
			max, _ = w.canonicalize(Legacy | Deprecated)
			max, _ = addTags(max)
		} else {
			// Base language is not defined.
			if h != nil {
				for i := range h.exact {
					have := &h.exact[i]
					if have.tag.equalsRest(w) {
						return have, w, Exact
					}
				}
			}
			if w.script == 0 && w.region == 0 {
				// We skip all tags matching und for approximate matching, including
				// private tags.
				continue
			}
			max, _ = addTags(w)
			if h = m.index[max.lang]; h == nil {
				continue
			}
		}
		// Check for match based on maximized tag.
		for i := range h.max {
			have := &h.max[i]
			best.update(have, w, max.script, max.region)
			if best.conf == Exact {
				// Give the entries chained to this one a chance to win on
				// the tie-breaker rules before returning.
				for have.nextMax != 0 {
					have = &h.max[have.nextMax]
					best.update(have, w, max.script, max.region)
				}
				// A match on maximized tags is reported as High, not Exact.
				return best.have, best.want, High
			}
		}
	}
	if best.conf <= No {
		if len(want) != 0 {
			return nil, want[0], No
		}
		return nil, Tag{}, No
	}
	return best.have, best.want, best.conf
}
// bestMatch accumulates the best match so far.
type bestMatch struct {
	have *haveTag   // supported entry of the best match so far
	want Tag        // desired tag that produced the best match
	conf Confidence // confidence of the best match
	// Cached results from applying tie-breaking rules.
	origLang   bool  // pre-maximized languages were defined and identical
	origReg    bool  // pre-maximized regions were defined and identical
	regDist    uint8 // region distance as computed by regionDist
	origScript bool  // pre-maximized scripts were defined and identical
	parentDist uint8 // 255 if have is not an ancestor of want tag.
}
// update updates the existing best match if the new pair is considered to be a
// better match.
// To determine if the given pair is a better match, it first computes the rough
// confidence level. If this surpasses the current match, it will replace it and
// update the tie-breaker rule cache. If there is a tie, it proceeds with applying
// a series of tie-breaker rules. If there is no conclusive winner after applying
// the tie-breaker rules, it leaves the current match as the preferred match.
//
// have is the supported candidate, tag the desired tag, and maxScript and
// maxRegion the script and region of the maximized desired tag.
func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion regionID) {
	// Bail if the maximum attainable confidence is below that of the current best match.
	c := have.conf
	if c < m.conf {
		return
	}
	if have.maxScript != maxScript {
		// There is usually very little comprehension between different scripts.
		// In a few cases there may still be Low comprehension. This possibility is
		// pre-computed and stored in have.altScript.
		if Low < m.conf || have.altScript != maxScript {
			return
		}
		c = Low
	} else if have.maxRegion != maxRegion {
		// There is usually a small difference between languages across regions.
		// We use the region distance (below) to disambiguate between equal matches.
		if High < c {
			c = High
		}
	}

	// We store the results of the computations of the tie-breaker rules along
	// with the best match. There is no need to do the checks once we determine
	// we have a winner, but we do still need to do the tie-breaker computations.
	// We use "beaten" to keep track if we still need to do the checks.
	beaten := false // true if the new pair defeats the current one.
	if c != m.conf {
		if c < m.conf {
			return
		}
		beaten = true
	}

	// Tie-breaker rules:
	// We prefer if the pre-maximized language was specified and identical.
	origLang := have.tag.lang == tag.lang && tag.lang != 0
	if !beaten && m.origLang != origLang {
		if m.origLang {
			return
		}
		beaten = true
	}

	// We prefer if the pre-maximized region was specified and identical.
	origReg := have.tag.region == tag.region && tag.region != 0
	if !beaten && m.origReg != origReg {
		if m.origReg {
			return
		}
		beaten = true
	}

	// Next we prefer smaller distances between regions, as defined by regionDist.
	regDist := regionDist(have.maxRegion, maxRegion, tag.lang)
	if !beaten && m.regDist != regDist {
		if regDist > m.regDist {
			return
		}
		beaten = true
	}

	// Next we prefer if the pre-maximized script was specified and identical.
	origScript := have.tag.script == tag.script && tag.script != 0
	if !beaten && m.origScript != origScript {
		if m.origScript {
			return
		}
		beaten = true
	}

	// Finally we prefer tags which have a closer parent relationship.
	parentDist := parentDistance(have.tag.region, tag)
	if !beaten && m.parentDist != parentDist {
		if parentDist > m.parentDist {
			return
		}
		beaten = true
	}

	// Update m to the newly found best match. If every rule tied, beaten is
	// still false and the current match is kept.
	if beaten {
		m.have = have
		m.want = tag
		m.conf = c
		m.origLang = origLang
		m.origReg = origReg
		m.origScript = origScript
		m.regDist = regDist
		m.parentDist = parentDist
	}
}
  697. // parentDistance returns the number of times Parent must be called before the
  698. // regions match. It is assumed that it has already been checked that lang and
  699. // script are identical. If haveRegion does not occur in the ancestor chain of
  700. // tag, it returns 255.
  701. func parentDistance(haveRegion regionID, tag Tag) uint8 {
  702. p := tag.Parent()
  703. d := uint8(1)
  704. for haveRegion != p.region {
  705. if p.region == 0 {
  706. return 255
  707. }
  708. p = p.Parent()
  709. d++
  710. }
  711. return d
  712. }
  713. // regionDist wraps regionDistance with some exceptions to the algorithmic distance.
  714. func regionDist(a, b regionID, lang langID) uint8 {
  715. if lang == _en {
  716. // Two variants of non-US English are close to each other, regardless of distance.
  717. if a != _US && b != _US {
  718. return 2
  719. }
  720. }
  721. return uint8(regionDistance(a, b))
  722. }
  723. // regionDistance computes the distance between two regions based on the
  724. // distance in the graph of region containments as defined in CLDR. It iterates
  725. // over increasingly inclusive sets of groups, represented as bit vectors, until
  726. // the source bit vector has bits in common with the destination vector.
  727. func regionDistance(a, b regionID) int {
  728. if a == b {
  729. return 0
  730. }
  731. p, q := regionInclusion[a], regionInclusion[b]
  732. if p < nRegionGroups {
  733. p, q = q, p
  734. }
  735. set := regionInclusionBits
  736. if q < nRegionGroups && set[p]&(1<<q) != 0 {
  737. return 1
  738. }
  739. d := 2
  740. for goal := set[q]; set[p]&goal == 0; p = regionInclusionNext[p] {
  741. d++
  742. }
  743. return d
  744. }
  745. func (t Tag) variants() string {
  746. if t.pVariant == 0 {
  747. return ""
  748. }
  749. return t.str[t.pVariant:t.pExt]
  750. }
  751. // variantOrPrivateTagStr returns variants or private use tags.
  752. func (t Tag) variantOrPrivateTagStr() string {
  753. if t.pExt > 0 {
  754. return t.str[t.pVariant:t.pExt]
  755. }
  756. return t.str[t.pVariant:]
  757. }
  758. // equalsRest compares everything except the language.
  759. func (a Tag) equalsRest(b Tag) bool {
  760. // TODO: don't include extensions in this comparison. To do this efficiently,
  761. // though, we should handle private tags separately.
  762. return a.script == b.script && a.region == b.region && a.variantOrPrivateTagStr() == b.variantOrPrivateTagStr()
  763. }
  764. // isExactEquivalent returns true if canonicalizing the language will not alter
  765. // the script or region of a tag.
  766. func isExactEquivalent(l langID) bool {
  767. for _, o := range notEquivalent {
  768. if o == l {
  769. return false
  770. }
  771. }
  772. return true
  773. }
// notEquivalent lists the languages for which canonicalization may alter the
// script or region. It is filled in by init and consulted by isExactEquivalent.
var notEquivalent []langID
  775. func init() {
  776. // Create a list of all languages for which canonicalization may alter the
  777. // script or region.
  778. for _, lm := range langAliasMap {
  779. tag := Tag{lang: langID(lm.from)}
  780. if tag, _ = tag.canonicalize(All); tag.script != 0 || tag.region != 0 {
  781. notEquivalent = append(notEquivalent, langID(lm.from))
  782. }
  783. }
  784. }