You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

976 lines
30 KiB

  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //go:generate go run maketables.go gen_common.go -output tables.go
  5. //go:generate go run gen_index.go
  6. // Package language implements BCP 47 language tags and related functionality.
  7. //
  8. // The Tag type, which is used to represent languages, is agnostic to the
  9. // meaning of its subtags. Tags are not fully canonicalized to preserve
  10. // information that may be valuable in certain contexts. As a consequence, two
  11. // different tags may represent identical languages.
  12. //
  13. // Initializing language- or locale-specific components usually consists of
  14. // two steps. The first step is to select a display language based on the
  15. // preferred languages of the user and the languages supported by an application.
  16. // The second step is to create the language-specific services based on
  17. // this selection. Each is discussed in more detail below.
  18. //
  19. // Matching preferred against supported languages
  20. //
  21. // An application may support various languages. This list is typically limited
  22. // by the languages for which there exist translations of the user interface.
  23. // Similarly, a user may provide a list of preferred languages which is limited
  24. // by the languages understood by this user.
  25. // An application should use a Matcher to find the best supported language based
  26. // on the user's preferred list.
  27. // Matchers are aware of the intricacies of equivalence between languages.
  28. // The default Matcher implementation takes into account things such as
  29. // deprecated subtags, legacy tags, and mutual intelligibility between scripts
  30. // and languages.
  31. //
  32. // A Matcher for English, Australian English, Danish, and standard Mandarin can
  33. // be defined as follows:
  34. //
  35. // var matcher = language.NewMatcher([]language.Tag{
  36. // language.English, // The first language is used as fallback.
  37. // language.MustParse("en-AU"),
  38. // language.Danish,
  39. // language.Chinese,
  40. // })
  41. //
  42. // The following code selects the best match for someone speaking Spanish and
  43. // Norwegian:
  44. //
  45. // preferred := []language.Tag{ language.Spanish, language.Norwegian }
  46. // tag, _, _ := matcher.Match(preferred...)
  47. //
  48. // In this case, the best match is Danish, as Danish is sufficiently a match to
  49. // Norwegian to not have to fall back to the default.
  50. // See ParseAcceptLanguage on how to handle the Accept-Language HTTP header.
  51. //
  52. // Selecting language-specific services
  53. //
  54. // One should always use the Tag returned by the Matcher to create an instance
  55. // of any of the language-specific services provided by the text repository.
  56. // This prevents the mixing of languages, such as having a different language for
  57. // messages and display names, as well as improper casing or sorting order for
  58. // the selected language.
  59. // Using the returned Tag also allows user-defined settings, such as collation
  60. // order or numbering system to be transparently passed as options.
  61. //
  62. // If you have language-specific data in your application, however, it will in
  63. // most cases suffice to use the index returned by the matcher to identify
  64. // the user language.
  65. // The following loop provides an alternative in case this is not sufficient:
  66. //
  67. // supported := map[language.Tag]data{
  68. // language.English: enData,
  69. // language.MustParse("en-AU"): enAUData,
  70. // language.Danish: daData,
  71. // language.Chinese: zhData,
  72. // }
  73. // tag, _, _ := matcher.Match(preferred...)
  74. // for ; tag != language.Und; tag = tag.Parent() {
  75. // if v, ok := supported[tag]; ok {
  76. // return v
  77. // }
  78. // }
  79. // return enData // should not reach here
  80. //
  81. // Repeatedly taking the Parent of the tag returned by Match will eventually
  82. // match one of the tags used to initialize the Matcher.
  83. //
  84. // Canonicalization
  85. //
  86. // By default, only legacy and deprecated tags are converted into their
  87. // canonical equivalent. All other information is preserved. This approach makes
  88. // the confidence scores more accurate and allows matchers to distinguish
  89. // between variants that are otherwise lost.
  90. //
  91. // As a consequence, two tags that should be treated as identical according to
  92. // BCP 47 or CLDR, like "en-Latn" and "en", will be represented differently. The
  93. // Matchers will handle such distinctions, though, and are aware of the
  94. // equivalence relations. The CanonType type can be used to alter the
  95. // canonicalization form.
  96. //
  97. // References
  98. //
  99. // BCP 47 - Tags for Identifying Languages
  100. // http://tools.ietf.org/html/bcp47
  101. package language // import "golang.org/x/text/language"
  102. // TODO: Remove above NOTE after:
  103. // - verifying that tables are dropped correctly (most notably matcher tables).
  104. import (
  105. "errors"
  106. "fmt"
  107. "strings"
  108. )
const (
	// maxCoreSize is the maximum size of a BCP 47 tag without variants and
	// extensions. Equals max lang (3) + script (4) + max reg (3) + 2 dashes.
	// It sizes the stack buffer used by Tag.String.
	maxCoreSize = 12
	// max99thPercentileSize is a somewhat arbitrary buffer size that presumably
	// is large enough to hold at least 99% of the BCP 47 tags.
	// It sizes the scratch buffer used by remakeString.
	max99thPercentileSize = 32
	// maxSimpleUExtensionSize is the maximum size of a -u extension with one
	// key-type pair. Equals len("-u-") + key (2) + dash + max value (8).
	// Together with maxCoreSize it sizes the scratch buffer in SetTypeForKey.
	maxSimpleUExtensionSize = 14
)
// Tag represents a BCP 47 language tag. It is used to specify an instance of a
// specific language or locale. All language tag values are guaranteed to be
// well-formed.
//
// The base language, script and region are stored as compact IDs. A zero ID
// means the corresponding subtag was not specified.
type Tag struct {
	lang     langID   // base language ID; 0 if unspecified
	region   regionID // region ID; 0 if unspecified
	script   scriptID // script ID; 0 if unspecified
	pVariant byte     // offset in str, includes preceding '-'
	pExt     uint16   // offset of first extension, includes preceding '-'
	// str is the string representation of the Tag. It will only be used if the
	// tag has variants or extensions. A non-empty str combined with
	// pVariant == 0 marks a tag that consists solely of a private use tag
	// (see private).
	str string
}
// Make is a convenience wrapper for Parse that omits the error.
// In case of an error, a sensible default is returned.
//
// It delegates to Default.Make, so the Default canonicalization is applied.
func Make(s string) Tag {
	return Default.Make(s)
}
  138. // Make is a convenience wrapper for c.Parse that omits the error.
  139. // In case of an error, a sensible default is returned.
  140. func (c CanonType) Make(s string) Tag {
  141. t, _ := c.Parse(s)
  142. return t
  143. }
// Raw returns the raw base language, script and region, without making an
// attempt to infer their values.
//
// Each result simply wraps the ID stored in t, so an unspecified subtag is
// returned as the zero value of its type.
func (t Tag) Raw() (b Base, s Script, r Region) {
	return Base{t.lang}, Script{t.script}, Region{t.region}
}
  149. // equalTags compares language, script and region subtags only.
  150. func (t Tag) equalTags(a Tag) bool {
  151. return t.lang == a.lang && t.script == a.script && t.region == a.region
  152. }
  153. // IsRoot returns true if t is equal to language "und".
  154. func (t Tag) IsRoot() bool {
  155. if int(t.pVariant) < len(t.str) {
  156. return false
  157. }
  158. return t.equalTags(und)
  159. }
  160. // private reports whether the Tag consists solely of a private use tag.
  161. func (t Tag) private() bool {
  162. return t.str != "" && t.pVariant == 0
  163. }
// CanonType can be used to enable or disable various types of canonicalization.
// It is a bit set: the individual flags below may be combined with |.
type CanonType int

const (
	// Replace deprecated base languages with their preferred replacements.
	DeprecatedBase CanonType = 1 << iota
	// Replace deprecated scripts with their preferred replacements.
	DeprecatedScript
	// Replace deprecated regions with their preferred replacements.
	DeprecatedRegion
	// Remove redundant scripts.
	SuppressScript
	// Normalize legacy encodings. This includes legacy languages defined in
	// CLDR as well as bibliographic codes defined in ISO-639.
	Legacy
	// Map the dominant language of a macro language group to the macro language
	// subtag. For example cmn -> zh.
	Macro
	// The CLDR flag should be used if full compatibility with CLDR is required.
	// There are a few cases where language.Tag may differ from CLDR. To follow all
	// of CLDR's suggestions, use All|CLDR.
	CLDR
	// Raw can be used to Compose or Parse without Canonicalization.
	Raw CanonType = 0
	// Replace all deprecated tags with their preferred replacements.
	Deprecated = DeprecatedBase | DeprecatedScript | DeprecatedRegion
	// All canonicalizations recommended by BCP 47.
	BCP47 = Deprecated | SuppressScript
	// All canonicalizations.
	All = BCP47 | Legacy | Macro
	// Default is the canonicalization used by Parse, Make and Compose. To
	// preserve as much information as possible, canonicalizations that remove
	// potentially valuable information are not included. The Matcher is
	// designed to recognize similar tags that would be the same if
	// they were canonicalized using All.
	Default = Deprecated | Legacy
	// canonLang is the set of flags that can change the base language.
	// canonicalize only runs its language normalization loop if at least one
	// of them is set.
	canonLang = DeprecatedBase | Legacy | Macro
	// TODO: LikelyScript, LikelyRegion: suppress similar to ICU.
)
// canonicalize returns the canonicalized equivalent of the tag and
// whether there was any change.
//
// Only the lang, script and region fields of the returned Tag are updated;
// the string form is left untouched, so callers that need it must call
// remakeString when the reported change flag is true. The steps are applied
// in a fixed order: script suppression, base language normalization,
// deprecated script replacement and deprecated region replacement. Each step
// only runs if the corresponding flag is set in c.
func (t Tag) canonicalize(c CanonType) (Tag, bool) {
	// Raw has no flags set: nothing to do.
	if c == Raw {
		return t, false
	}
	changed := false
	if c&SuppressScript != 0 {
		// Drop the script if it equals the suppress-script entry for the
		// language. Languages at or above langNoIndexOffset have no entry.
		if t.lang < langNoIndexOffset && uint8(t.script) == suppressScript[t.lang] {
			t.script = 0
			changed = true
		}
	}
	if c&canonLang != 0 {
		// The loop only repeats after a deprecated-language replacement (see
		// the continue below); every other path falls through to the break.
		for {
			if l, aliasType := normLang(t.lang); l != t.lang {
				switch aliasType {
				case langLegacy:
					if c&Legacy != 0 {
						// When replacing "sh" without an explicit script,
						// record Latn as the script before the language
						// changes.
						if t.lang == _sh && t.script == 0 {
							t.script = _Latn
						}
						t.lang = l
						changed = true
					}
				case langMacro:
					if c&Macro != 0 {
						// We deviate here from CLDR. The mapping "nb" -> "no"
						// qualifies as a typical Macro language mapping. However,
						// for legacy reasons, CLDR maps "no", the macro language
						// code for Norwegian, to the dominant variant "nb". This
						// change is currently under consideration for CLDR as well.
						// See http://unicode.org/cldr/trac/ticket/2698 and also
						// http://unicode.org/cldr/trac/ticket/1790 for some of the
						// practical implications. TODO: this check could be removed
						// if CLDR adopts this change.
						if c&CLDR == 0 || t.lang != _nb {
							changed = true
							t.lang = l
						}
					}
				case langDeprecated:
					if c&DeprecatedBase != 0 {
						// When replacing "mo" without an explicit region,
						// record MD as the region before the language changes.
						if t.lang == _mo && t.region == 0 {
							t.region = _MD
						}
						t.lang = l
						changed = true
						// Other canonicalization types may still apply.
						continue
					}
				}
			} else if c&Legacy != 0 && t.lang == _no && c&CLDR != 0 {
				// CLDR compatibility: map "no" to "nb" (see the note above).
				t.lang = _nb
				changed = true
			}
			break
		}
	}
	if c&DeprecatedScript != 0 {
		// Qaai is the only deprecated script handled here; it maps to Zinh.
		if t.script == _Qaai {
			changed = true
			t.script = _Zinh
		}
	}
	if c&DeprecatedRegion != 0 {
		// normRegion returns 0 when there is no replacement for the region.
		if r := normRegion(t.region); r != 0 {
			changed = true
			t.region = r
		}
	}
	return t, changed
}
  275. // Canonicalize returns the canonicalized equivalent of the tag.
  276. func (c CanonType) Canonicalize(t Tag) (Tag, error) {
  277. t, changed := t.canonicalize(c)
  278. if changed {
  279. t.remakeString()
  280. }
  281. return t, nil
  282. }
  283. // Confidence indicates the level of certainty for a given return value.
  284. // For example, Serbian may be written in Cyrillic or Latin script.
  285. // The confidence level indicates whether a value was explicitly specified,
  286. // whether it is typically the only possible value, or whether there is
  287. // an ambiguity.
  288. type Confidence int
  289. const (
  290. No Confidence = iota // full confidence that there was no match
  291. Low // most likely value picked out of a set of alternatives
  292. High // value is generally assumed to be the correct match
  293. Exact // exact match or explicitly specified value
  294. )
  295. var confName = []string{"No", "Low", "High", "Exact"}
  296. func (c Confidence) String() string {
  297. return confName[c]
  298. }
// remakeString is used to update t.str in case lang, script or region changed.
// It is assumed that pExt and pVariant still point to the start of the
// respective parts.
//
// It is a no-op for tags without a string form. Otherwise the core part of
// the string is regenerated from the IDs, the variants and extensions are
// carried over verbatim, and pVariant and pExt are shifted by the change in
// length of the core part.
func (t *Tag) remakeString() {
	if t.str == "" {
		return
	}
	// extra is everything from the variants onwards, without the leading '-'.
	// For pVariant == 0 there is no leading '-' to strip.
	extra := t.str[t.pVariant:]
	if t.pVariant > 0 {
		extra = extra[1:]
	}
	// If the core is "und" and what remains is a private use sequence, the
	// string form is just that sequence and both offsets are reset to 0.
	if t.equalTags(und) && strings.HasPrefix(extra, "x-") {
		t.str = extra
		t.pVariant = 0
		t.pExt = 0
		return
	}
	var buf [max99thPercentileSize]byte // avoid extra memory allocation in most cases.
	b := buf[:t.genCoreBytes(buf[:])]
	if extra != "" {
		// diff is the difference between the new core length and the old
		// variant offset; both offsets move by the same amount.
		diff := len(b) - int(t.pVariant)
		b = append(b, '-')
		b = append(b, extra...)
		t.pVariant = uint8(int(t.pVariant) + diff)
		t.pExt = uint16(int(t.pExt) + diff)
	} else {
		// No variants or extensions: both offsets point at the end of the core.
		t.pVariant = uint8(len(b))
		t.pExt = uint16(len(b))
	}
	t.str = string(b)
}
  330. // genCoreBytes writes a string for the base languages, script and region tags
  331. // to the given buffer and returns the number of bytes written. It will never
  332. // write more than maxCoreSize bytes.
  333. func (t *Tag) genCoreBytes(buf []byte) int {
  334. n := t.lang.stringToBuf(buf[:])
  335. if t.script != 0 {
  336. n += copy(buf[n:], "-")
  337. n += copy(buf[n:], t.script.String())
  338. }
  339. if t.region != 0 {
  340. n += copy(buf[n:], "-")
  341. n += copy(buf[n:], t.region.String())
  342. }
  343. return n
  344. }
  345. // String returns the canonical string representation of the language tag.
  346. func (t Tag) String() string {
  347. if t.str != "" {
  348. return t.str
  349. }
  350. if t.script == 0 && t.region == 0 {
  351. return t.lang.String()
  352. }
  353. buf := [maxCoreSize]byte{}
  354. return string(buf[:t.genCoreBytes(buf[:])])
  355. }
  356. // Base returns the base language of the language tag. If the base language is
  357. // unspecified, an attempt will be made to infer it from the context.
  358. // It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
  359. func (t Tag) Base() (Base, Confidence) {
  360. if t.lang != 0 {
  361. return Base{t.lang}, Exact
  362. }
  363. c := High
  364. if t.script == 0 && !(Region{t.region}).IsCountry() {
  365. c = Low
  366. }
  367. if tag, err := addTags(t); err == nil && tag.lang != 0 {
  368. return Base{tag.lang}, c
  369. }
  370. return Base{0}, No
  371. }
// Script infers the script for the language tag. If it was not explicitly given, it will infer
// a most likely candidate.
// If more than one script is commonly used for a language, the most likely one
// is returned with a low confidence indication. For example, it returns (Cyrl, Low)
// for Serbian.
// If a script cannot be inferred (Zzzz, No) is returned. We do not use Zyyy (undetermined)
// as one would suspect from the IANA registry for BCP 47. In a Unicode context Zyyy marks
// common characters (like 1, 2, 3, '.', etc.) and is therefore more like multiple scripts.
// See http://www.unicode.org/reports/tr24/#Values for more details. Zzzz is also used for
// unknown value in CLDR. (Zzzz, Exact) is returned if Zzzz was explicitly specified.
// Note that an inferred script is never guaranteed to be the correct one. Latin is
// almost exclusively used for Afrikaans, but Arabic has been used for some texts
// in the past. Also, the script that is commonly used may change over time.
// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
func (t Tag) Script() (Script, Confidence) {
	// An explicitly specified script always wins.
	if t.script != 0 {
		return Script{t.script}, Exact
	}
	// Start from "unknown" and refine below.
	sc, c := scriptID(_Zzzz), No
	if t.lang < langNoIndexOffset {
		if scr := scriptID(suppressScript[t.lang]); scr != 0 {
			// Note: it is not always the case that a language with a suppress
			// script value is only written in one script (e.g. kk, ms, pa).
			if t.region == 0 {
				return Script{scriptID(scr)}, High
			}
			// With a region present the suppress script is only a candidate;
			// the likely-subtags result below may still override it.
			sc, c = scr, High
		}
	}
	if tag, err := addTags(t); err == nil {
		// A likely script that differs from the candidate replaces it, but
		// only with low confidence.
		if tag.script != sc {
			sc, c = tag.script, Low
		}
	} else {
		// Retry after replacing deprecated subtags and mapping to the macro
		// language.
		t, _ = (Deprecated | Macro).Canonicalize(t)
		if tag, err := addTags(t); err == nil && tag.script != sc {
			sc, c = tag.script, Low
		}
	}
	return Script{sc}, c
}
  413. // Region returns the region for the language tag. If it was not explicitly given, it will
  414. // infer a most likely candidate from the context.
  415. // It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
  416. func (t Tag) Region() (Region, Confidence) {
  417. if t.region != 0 {
  418. return Region{t.region}, Exact
  419. }
  420. if t, err := addTags(t); err == nil {
  421. return Region{t.region}, Low // TODO: differentiate between high and low.
  422. }
  423. t, _ = (Deprecated | Macro).Canonicalize(t)
  424. if tag, err := addTags(t); err == nil {
  425. return Region{tag.region}, Low
  426. }
  427. return Region{_ZZ}, No // TODO: return world instead of undetermined?
  428. }
  429. // Variant returns the variants specified explicitly for this language tag.
  430. // or nil if no variant was specified.
  431. func (t Tag) Variants() []Variant {
  432. v := []Variant{}
  433. if int(t.pVariant) < int(t.pExt) {
  434. for x, str := "", t.str[t.pVariant:t.pExt]; str != ""; {
  435. x, str = nextToken(str)
  436. v = append(v, Variant{x})
  437. }
  438. }
  439. return v
  440. }
// Parent returns the CLDR parent of t. In CLDR, missing fields in data for a
// specific language are substituted with fields from the parent language.
// The parent for a language may change for newer versions of CLDR.
//
// In outline: a tag with variants or extensions first loses those; a tag
// with a region is looked up in the parents table and otherwise loses its
// region; a language-script pair maps to the bare language if the script is
// the likely one for that language and to und otherwise; everything else
// maps to und.
func (t Tag) Parent() Tag {
	if t.str != "" {
		// Strip the variants and extensions.
		t, _ = Raw.Compose(t.Raw())
		// For a language-script pair, also drop the script if it is the
		// likely script for the language.
		if t.region == 0 && t.script != 0 && t.lang != 0 {
			base, _ := addTags(Tag{lang: t.lang})
			if base.script == t.script {
				return Tag{lang: t.lang}
			}
		}
		return t
	}
	if t.lang != 0 {
		if t.region != 0 {
			// maxScript is the explicit script or, failing that, the likely
			// script for the full tag.
			maxScript := t.script
			if maxScript == 0 {
				max, _ := addTags(t)
				maxScript = max.script
			}
			// Look for an explicit parent entry matching language, script
			// and region.
			for i := range parents {
				if langID(parents[i].lang) == t.lang && scriptID(parents[i].maxScript) == maxScript {
					for _, r := range parents[i].fromRegion {
						if regionID(r) == t.region {
							return Tag{
								lang:   t.lang,
								script: scriptID(parents[i].script),
								region: regionID(parents[i].toRegion),
							}
						}
					}
				}
			}
			// Strip the script if it is the default one.
			base, _ := addTags(Tag{lang: t.lang})
			if base.script != maxScript {
				return Tag{lang: t.lang, script: maxScript}
			}
			return Tag{lang: t.lang}
		} else if t.script != 0 {
			// The parent for a base-script pair with a non-default script is
			// "und" instead of the base language.
			base, _ := addTags(Tag{lang: t.lang})
			if base.script != t.script {
				return und
			}
			return Tag{lang: t.lang}
		}
	}
	return und
}
  494. // returns token t and the rest of the string.
  495. func nextToken(s string) (t, tail string) {
  496. p := strings.Index(s[1:], "-")
  497. if p == -1 {
  498. return s[1:], ""
  499. }
  500. p++
  501. return s[1:p], s[p:]
  502. }
// Extension is a single BCP 47 extension.
type Extension struct {
	// s is the extension in string form. Its first byte is the extension
	// type (see Type); the zero Extension has an empty s.
	s string
}

// String returns the string representation of the extension, including the
// type tag.
func (e Extension) String() string {
	return e.s
}
  512. // ParseExtension parses s as an extension and returns it on success.
  513. func ParseExtension(s string) (e Extension, err error) {
  514. scan := makeScannerString(s)
  515. var end int
  516. if n := len(scan.token); n != 1 {
  517. return Extension{}, errSyntax
  518. }
  519. scan.toLower(0, len(scan.b))
  520. end = parseExtension(&scan)
  521. if end != len(s) {
  522. return Extension{}, errSyntax
  523. }
  524. return Extension{string(scan.b)}, nil
  525. }
  526. // Type returns the one-byte extension type of e. It returns 0 for the zero
  527. // exception.
  528. func (e Extension) Type() byte {
  529. if e.s == "" {
  530. return 0
  531. }
  532. return e.s[0]
  533. }
// Tokens returns the list of tokens of e, obtained by splitting its string
// form on '-'. The first token is therefore the extension type tag.
func (e Extension) Tokens() []string {
	return strings.Split(e.s, "-")
}
  538. // Extension returns the extension of type x for tag t. It will return
  539. // false for ok if t does not have the requested extension. The returned
  540. // extension will be invalid in this case.
  541. func (t Tag) Extension(x byte) (ext Extension, ok bool) {
  542. for i := int(t.pExt); i < len(t.str)-1; {
  543. var ext string
  544. i, ext = getExtension(t.str, i)
  545. if ext[0] == x {
  546. return Extension{ext}, true
  547. }
  548. }
  549. return Extension{string(x)}, false
  550. }
  551. // Extensions returns all extensions of t.
  552. func (t Tag) Extensions() []Extension {
  553. e := []Extension{}
  554. for i := int(t.pExt); i < len(t.str)-1; {
  555. var ext string
  556. i, ext = getExtension(t.str, i)
  557. e = append(e, Extension{ext})
  558. }
  559. return e
  560. }
  561. // TypeForKey returns the type associated with the given key, where key and type
  562. // are of the allowed values defined for the Unicode locale extension ('u') in
  563. // http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
  564. // TypeForKey will traverse the inheritance chain to get the correct value.
  565. func (t Tag) TypeForKey(key string) string {
  566. if start, end, _ := t.findTypeForKey(key); end != start {
  567. return t.str[start:end]
  568. }
  569. return ""
  570. }
// Errors returned by SetTypeForKey.
var (
	// errPrivateUse is returned when the tag consists solely of a private
	// use tag.
	errPrivateUse = errors.New("cannot set a key on a private use tag")
	// errInvalidArguments is returned when the key is not two bytes long or
	// a non-empty value is not between three and eight bytes long.
	errInvalidArguments = errors.New("invalid key or type")
)
// SetTypeForKey returns a new Tag with the key set to type, where key and type
// are of the allowed values defined for the Unicode locale extension ('u') in
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// An empty value removes an existing pair with the same key.
//
// It returns errPrivateUse if t consists solely of a private use tag and
// errInvalidArguments if key is not two bytes long or a non-empty value is
// not between three and eight bytes long. If the generated key-type pair is
// rejected by the extension parser, the parser's error is returned. In all
// error cases the original tag is returned unchanged.
func (t Tag) SetTypeForKey(key, value string) (Tag, error) {
	if t.private() {
		return t, errPrivateUse
	}
	if len(key) != 2 {
		return t, errInvalidArguments
	}
	// Remove the setting if value is "".
	if value == "" {
		start, end, _ := t.findTypeForKey(key)
		// start == end means the key is not present: nothing to remove.
		if start != end {
			// Remove key tag and leading '-'.
			start -= 4
			// Remove a possible empty extension.
			// The pair is the last one in the -u extension if it is followed
			// by the end of the string or by a single-character token, and it
			// is the first one if it is preceded by a single-character token;
			// in that case the "-u" itself is dropped as well.
			if (end == len(t.str) || t.str[end+2] == '-') && t.str[start-2] == '-' {
				start -= 2
			}
			if start == int(t.pVariant) && end == len(t.str) {
				// Nothing but the core subtags remains: the string form is
				// no longer needed.
				t.str = ""
				t.pVariant, t.pExt = 0, 0
			} else {
				t.str = fmt.Sprintf("%s%s", t.str[:start], t.str[end:])
			}
		}
		return t, nil
	}
	if len(value) < 3 || len(value) > 8 {
		return t, errInvalidArguments
	}
	var (
		buf    [maxCoreSize + maxSimpleUExtensionSize]byte
		uStart int // start of the -u extension.
	)
	// Generate the tag string if needed.
	if t.str == "" {
		uStart = t.genCoreBytes(buf[:])
		buf[uStart] = '-'
		uStart++
	}
	// Create new key-type pair and parse it to verify.
	// b is laid out as "u-" + key + "-" + value.
	b := buf[uStart:]
	copy(b, "u-")
	copy(b[2:], key)
	b[4] = '-'
	b = b[:5+copy(b[5:], value)]
	scan := makeScanner(b)
	if parseExtensions(&scan); scan.err != nil {
		return t, scan.err
	}
	// Assemble the replacement string.
	if t.str == "" {
		// buf holds core + "-" + extension; both offsets point at that '-'.
		t.pVariant, t.pExt = byte(uStart-1), uint16(uStart-1)
		t.str = string(buf[:uStart+len(b)])
	} else {
		s := t.str
		start, end, hasExt := t.findTypeForKey(key)
		if start == end {
			// Key not present: insert the pair at the returned position. If
			// a -u extension already exists, the "u-" prefix is dropped.
			if hasExt {
				b = b[2:]
			}
			t.str = fmt.Sprintf("%s-%s%s", s[:start], b, s[end:])
		} else {
			// Key present: replace only its type.
			t.str = fmt.Sprintf("%s%s%s", s[:start], value, s[end:])
		}
	}
	return t, nil
}
// findTypeForKey returns the start and end position for the type corresponding
// to key or the point at which to insert the key-value pair if the type
// wasn't found. The hasExt return value reports whether an -u extension was present.
// Note: the extensions are typically very small and are likely to contain
// only one key-type pair.
//
// start == end signals that the key was not found. If key is not two bytes
// long or t has no extensions, the extension offset is returned for both
// positions.
func (t Tag) findTypeForKey(key string) (start, end int, hasExt bool) {
	p := int(t.pExt)
	// Bail out for an invalid key or a tag without extensions.
	if len(key) != 2 || p == len(t.str) || p == 0 {
		return p, p, false
	}
	s := t.str
	// Find the correct extension.
	for p++; s[p] != 'u'; p++ {
		// An extension type greater than 'u' means there is no -u extension;
		// report the position of its preceding '-' as the insertion point.
		if s[p] > 'u' {
			p--
			return p, p, false
		}
		if p = nextExtension(s, p); p == len(s) {
			return len(s), len(s), false
		}
	}
	// Proceed to the hyphen following the extension name.
	p++
	// curKey is the key currently being processed.
	curKey := ""
	// Iterate over keys until we get the end of a section.
	for {
		// p points to the hyphen preceding the current token.
		// A token that is followed by a hyphen after two characters is a key.
		if p3 := p + 3; s[p3] == '-' {
			// Found a key.
			// Check whether we just processed the key that was requested.
			if curKey == key {
				return start, p, true
			}
			// Set to the next key and continue scanning type tokens.
			curKey = s[p+1 : p3]
			// The comparison relies on keys appearing in sorted order: once
			// past the requested key, this is the insertion point.
			if curKey > key {
				return p, p, true
			}
			// Start of the type token sequence.
			start = p + 4
			// A type is at least 3 characters long.
			p += 7 // 4 + 3
		} else {
			// Attribute or type, which is at least 3 characters long.
			p += 4
		}
		// p points past the third character of a type or attribute.
		max := p + 5 // maximum length of token plus hyphen.
		if len(s) < max {
			max = len(s)
		}
		// Advance p to the next hyphen or to the end of the string.
		for ; p < max && s[p] != '-'; p++ {
		}
		// Bail if we have exhausted all tokens or if the next token starts
		// a new extension.
		if p == len(s) || s[p+2] == '-' {
			if curKey == key {
				return start, p, true
			}
			return p, p, true
		}
	}
}
// CompactIndex returns an index, where 0 <= index < NumCompactTags, for tags
// for which data exists in the text repository. The index will change over time
// and should not be stored in persistent storage. Extensions, except for the
// 'va' type of the 'u' extension, are ignored. It will return 0, false if no
// compact tag exists, where 0 is the index for the root language (Und).
func CompactIndex(t Tag) (index int, ok bool) {
	// TODO: perhaps give more frequent tags a lower index.
	// TODO: we could make the indexes stable. This will excluded some
	// possibilities for optimization, so don't do this quite yet.
	b, s, r := t.Raw()
	if len(t.str) > 0 {
		if strings.HasPrefix(t.str, "x-") {
			// We have no entries for user-defined tags.
			return 0, false
		}
		if uint16(t.pVariant) != t.pExt {
			// The tag has variants.
			// There are no tags with variants and an u-va type.
			if t.TypeForKey("va") != "" {
				return 0, false
			}
			// Rebuild the tag from its core subtags and variants only, which
			// drops all extensions.
			t, _ = Raw.Compose(b, s, r, t.Variants())
		} else if _, ok := t.Extension('u'); ok {
			// Strip all but the 'va' entry.
			variant := t.TypeForKey("va")
			t, _ = Raw.Compose(b, s, r)
			t, _ = t.SetTypeForKey("va", variant)
		}
		if len(t.str) > 0 {
			// We have some variants.
			// Such tags are looked up by equality in specialTags; their
			// index is offset by one.
			for i, s := range specialTags {
				if s == t {
					return i + 1, true
				}
			}
			return 0, false
		}
	}
	// No variants specified: just compare core components.
	// The key has the form lllssrrr, where l, s, and r are nibbles for
	// respectively the langID, scriptID, and regionID.
	key := uint32(b.langID) << (8 + 12)
	key |= uint32(s.scriptID) << 12
	key |= uint32(r.regionID)
	x, ok := coreTags[key]
	return int(x), ok
}
// Base is an ISO 639 language code, used for encoding the base language
// of a language tag. It wraps the internal langID; the zero Base holds
// langID 0.
type Base struct {
	langID
}
  761. // ParseBase parses a 2- or 3-letter ISO 639 code.
  762. // It returns a ValueError if s is a well-formed but unknown language identifier
  763. // or another error if another error occurred.
  764. func ParseBase(s string) (Base, error) {
  765. if n := len(s); n < 2 || 3 < n {
  766. return Base{}, errSyntax
  767. }
  768. var buf [3]byte
  769. l, err := getLangID(buf[:copy(buf[:], s)])
  770. return Base{l}, err
  771. }
// Script is a 4-letter ISO 15924 code for representing scripts.
// It is idiomatically represented in title case. It wraps the internal
// scriptID; the zero Script holds scriptID 0.
type Script struct {
	scriptID
}
  777. // ParseScript parses a 4-letter ISO 15924 code.
  778. // It returns a ValueError if s is a well-formed but unknown script identifier
  779. // or another error if another error occurred.
  780. func ParseScript(s string) (Script, error) {
  781. if len(s) != 4 {
  782. return Script{}, errSyntax
  783. }
  784. var buf [4]byte
  785. sc, err := getScriptID(script, buf[:copy(buf[:], s)])
  786. return Script{sc}, err
  787. }
// Region is an ISO 3166-1 or UN M.49 code for representing countries and regions.
// It wraps the internal regionID; the zero Region holds regionID 0.
type Region struct {
	regionID
}
// EncodeM49 returns the Region for the given UN M.49 code.
// It returns an error if r is not a valid code.
//
// The result of the lookup is wrapped as is, so the Region should only be
// used when the returned error is nil.
func EncodeM49(r int) (Region, error) {
	rid, err := getRegionM49(r)
	return Region{rid}, err
}
  798. // ParseRegion parses a 2- or 3-letter ISO 3166-1 or a UN M.49 code.
  799. // It returns a ValueError if s is a well-formed but unknown region identifier
  800. // or another error if another error occurred.
  801. func ParseRegion(s string) (Region, error) {
  802. if n := len(s); n < 2 || 3 < n {
  803. return Region{}, errSyntax
  804. }
  805. var buf [3]byte
  806. r, err := getRegionID(buf[:copy(buf[:], s)])
  807. return Region{r}, err
  808. }
  809. // IsCountry returns whether this region is a country or autonomous area. This
  810. // includes non-standard definitions from CLDR.
  811. func (r Region) IsCountry() bool {
  812. if r.regionID == 0 || r.IsGroup() || r.IsPrivateUse() && r.regionID != _XK {
  813. return false
  814. }
  815. return true
  816. }
  817. // IsGroup returns whether this region defines a collection of regions. This
  818. // includes non-standard definitions from CLDR.
  819. func (r Region) IsGroup() bool {
  820. if r.regionID == 0 {
  821. return false
  822. }
  823. return int(regionInclusion[r.regionID]) < len(regionContainment)
  824. }
// Contains returns whether Region c is contained by Region r. It returns true
// if c == r.
//
// It delegates to the containment check on the underlying region IDs.
func (r Region) Contains(c Region) bool {
	return r.regionID.contains(c.regionID)
}
  830. func (r regionID) contains(c regionID) bool {
  831. if r == c {
  832. return true
  833. }
  834. g := regionInclusion[r]
  835. if g >= nRegionGroups {
  836. return false
  837. }
  838. m := regionContainment[g]
  839. d := regionInclusion[c]
  840. b := regionInclusionBits[d]
  841. // A contained country may belong to multiple disjoint groups. Matching any
  842. // of these indicates containment. If the contained region is a group, it
  843. // must strictly be a subset.
  844. if d >= nRegionGroups {
  845. return b&m != 0
  846. }
  847. return b&^m == 0
  848. }
  849. var errNoTLD = errors.New("language: region is not a valid ccTLD")
  850. // TLD returns the country code top-level domain (ccTLD). UK is returned for GB.
  851. // In all other cases it returns either the region itself or an error.
  852. //
  853. // This method may return an error for a region for which there exists a
  854. // canonical form with a ccTLD. To get that ccTLD canonicalize r first. The
  855. // region will already be canonicalized it was obtained from a Tag that was
  856. // obtained using any of the default methods.
  857. func (r Region) TLD() (Region, error) {
  858. // See http://en.wikipedia.org/wiki/Country_code_top-level_domain for the
  859. // difference between ISO 3166-1 and IANA ccTLD.
  860. if r.regionID == _GB {
  861. r = Region{_UK}
  862. }
  863. if (r.typ() & ccTLD) == 0 {
  864. return Region{}, errNoTLD
  865. }
  866. return r, nil
  867. }
  868. // Canonicalize returns the region or a possible replacement if the region is
  869. // deprecated. It will not return a replacement for deprecated regions that
  870. // are split into multiple regions.
  871. func (r Region) Canonicalize() Region {
  872. if cr := normRegion(r.regionID); cr != 0 {
  873. return Region{cr}
  874. }
  875. return r
  876. }
// Variant represents a registered variant of a language as defined by BCP 47.
type Variant struct {
	// variant is the variant subtag in string form; ParseVariant stores it
	// lower-cased.
	variant string
}
  881. // ParseVariant parses and returns a Variant. An error is returned if s is not
  882. // a valid variant.
  883. func ParseVariant(s string) (Variant, error) {
  884. s = strings.ToLower(s)
  885. if _, ok := variantIndex[s]; ok {
  886. return Variant{s}, nil
  887. }
  888. return Variant{}, mkErrInvalid([]byte(s))
  889. }
// String returns the string representation of the variant. It is the empty
// string for the zero Variant.
func (v Variant) String() string {
	return v.variant
}