You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

600 lines
17 KiB

  1. // Copyright 2014 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package cases
  5. // This file contains the definitions of case mappings for all supported
  6. // languages. The rules for the language-specific tailorings were taken and
  7. // modified from the CLDR transform definitions in common/transforms.
  8. import (
  9. "strings"
  10. "unicode"
  11. "unicode/utf8"
  12. "golang.org/x/text/language"
  13. "golang.org/x/text/transform"
  14. "golang.org/x/text/unicode/norm"
  15. )
// A mapFunc takes a context positioned at the current rune and writes the
// mapped version of that rune to the same context. It may advance the context
// to the next rune. It reports whether a checkpoint is possible, that is,
// whether the bytes written to dst so far (up to pDst) will not need to change
// as more source bytes are seen.
type mapFunc func(*context) bool
  21. // maxIgnorable defines the maximum number of ignorables to consider for
  22. // lookahead operations.
  23. const maxIgnorable = 30
  24. // supported lists the language tags for which we have tailorings.
  25. const supported = "und af az el lt nl tr"
  26. func init() {
  27. tags := []language.Tag{}
  28. for _, s := range strings.Split(supported, " ") {
  29. tags = append(tags, language.MustParse(s))
  30. }
  31. matcher = language.NewMatcher(tags)
  32. Supported = language.NewCoverage(tags)
  33. }
  34. var (
  35. matcher language.Matcher
  36. Supported language.Coverage
  37. // We keep the following lists separate, instead of having a single per-
  38. // language struct, to give the compiler a chance to remove unused code.
  39. // Some uppercase mappers are stateless, so we can precompute the
  40. // Transformers and save a bit on runtime allocations.
  41. upperFunc = []mapFunc{
  42. nil, // und
  43. nil, // af
  44. aztrUpper(upper), // az
  45. elUpper, // el
  46. ltUpper(upper), // lt
  47. nil, // nl
  48. aztrUpper(upper), // tr
  49. }
  50. undUpper transform.Transformer = &undUpperCaser{}
  51. lowerFunc = []mapFunc{
  52. lower, // und
  53. lower, // af
  54. aztrLower, // az
  55. lower, // el
  56. ltLower, // lt
  57. lower, // nl
  58. aztrLower, // tr
  59. }
  60. titleInfos = []struct {
  61. title, lower mapFunc
  62. rewrite func(*context)
  63. }{
  64. {title, lower, nil}, // und
  65. {title, lower, afnlRewrite}, // af
  66. {aztrUpper(title), aztrLower, nil}, // az
  67. {title, lower, nil}, // el
  68. {ltUpper(title), ltLower, nil}, // lt
  69. {nlTitle, lower, afnlRewrite}, // nl
  70. {aztrUpper(title), aztrLower, nil}, // tr
  71. }
  72. )
  73. func makeUpper(t language.Tag, o options) transform.Transformer {
  74. _, i, _ := matcher.Match(t)
  75. f := upperFunc[i]
  76. if f == nil {
  77. return undUpper
  78. }
  79. return &simpleCaser{f: f}
  80. }
  81. func makeLower(t language.Tag, o options) transform.Transformer {
  82. _, i, _ := matcher.Match(t)
  83. f := lowerFunc[i]
  84. if o.noFinalSigma {
  85. return &simpleCaser{f: f}
  86. }
  87. return &lowerCaser{
  88. first: f,
  89. midWord: finalSigma(f),
  90. }
  91. }
  92. func makeTitle(t language.Tag, o options) transform.Transformer {
  93. _, i, _ := matcher.Match(t)
  94. x := &titleInfos[i]
  95. lower := x.lower
  96. if o.noLower {
  97. lower = (*context).copy
  98. } else if !o.noFinalSigma {
  99. lower = finalSigma(lower)
  100. }
  101. return &titleCaser{
  102. title: x.title,
  103. lower: lower,
  104. rewrite: x.rewrite,
  105. }
  106. }
  107. // TODO: consider a similar special case for the fast majority lower case. This
  108. // is a bit more involved so will require some more precise benchmarking to
  109. // justify it.
  110. type undUpperCaser struct{ transform.NopResetter }
  111. // undUpperCaser implements the Transformer interface for doing an upper case
  112. // mapping for the root locale (und). It eliminates the need for an allocation
  113. // as it prevents escaping by not using function pointers.
  114. func (t *undUpperCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  115. c := context{dst: dst, src: src, atEOF: atEOF}
  116. for c.next() {
  117. upper(&c)
  118. c.checkpoint()
  119. }
  120. return c.ret()
  121. }
  122. type simpleCaser struct {
  123. context
  124. f mapFunc
  125. }
  126. // simpleCaser implements the Transformer interface for doing a case operation
  127. // on a rune-by-rune basis.
  128. func (t *simpleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  129. t.context = context{dst: dst, src: src, atEOF: atEOF}
  130. c := &t.context
  131. for c.next() && t.f(c) {
  132. c.checkpoint()
  133. }
  134. return c.ret()
  135. }
  136. // lowerCaser implements the Transformer interface. The default Unicode lower
  137. // casing requires different treatment for the first and subsequent characters
  138. // of a word, most notably to handle the Greek final Sigma.
  139. type lowerCaser struct {
  140. context
  141. first, midWord mapFunc
  142. }
  143. func (t *lowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  144. t.context = context{dst: dst, src: src, atEOF: atEOF}
  145. c := &t.context
  146. for isInterWord := true; c.next(); {
  147. if isInterWord {
  148. if c.info.isCased() {
  149. if !t.first(c) {
  150. break
  151. }
  152. isInterWord = false
  153. } else if !c.copy() {
  154. break
  155. }
  156. } else {
  157. if c.info.isNotCasedAndNotCaseIgnorable() {
  158. if !c.copy() {
  159. break
  160. }
  161. isInterWord = true
  162. } else if !t.midWord(c) {
  163. break
  164. }
  165. }
  166. c.checkpoint()
  167. }
  168. return c.ret()
  169. }
  170. // titleCaser implements the Transformer interface. Title casing algorithms
  171. // distinguish between the first letter of a word and subsequent letters of the
  172. // same word. It uses state to avoid requiring a potentially infinite lookahead.
  173. type titleCaser struct {
  174. context
  175. // rune mappings used by the actual casing algorithms.
  176. title, lower mapFunc
  177. rewrite func(*context)
  178. }
  179. // Transform implements the standard Unicode title case algorithm as defined in
  180. // Chapter 3 of The Unicode Standard:
  181. // toTitlecase(X): Find the word boundaries in X according to Unicode Standard
  182. // Annex #29, "Unicode Text Segmentation." For each word boundary, find the
  183. // first cased character F following the word boundary. If F exists, map F to
  184. // Titlecase_Mapping(F); then map all characters C between F and the following
  185. // word boundary to Lowercase_Mapping(C).
  186. func (t *titleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  187. t.context = context{dst: dst, src: src, atEOF: atEOF, isMidWord: t.isMidWord}
  188. c := &t.context
  189. if !c.next() {
  190. return c.ret()
  191. }
  192. for {
  193. p := c.info
  194. if t.rewrite != nil {
  195. t.rewrite(c)
  196. }
  197. wasMid := p.isCaseIgnorableAndNonBreakStarter()
  198. // Break out of this loop on failure to ensure we do not modify the
  199. // state incorrectly.
  200. if p.isCased() && !p.isCaseIgnorableAndNotCased() {
  201. if !c.isMidWord {
  202. if !t.title(c) {
  203. break
  204. }
  205. c.isMidWord = true
  206. } else if !t.lower(c) {
  207. break
  208. }
  209. } else if !c.copy() {
  210. break
  211. }
  212. // TODO: make this an "else if" if we can prove that no rune that does
  213. // not match the first condition of the if statement can be a break.
  214. if p.isBreak() {
  215. c.isMidWord = false
  216. }
  217. // As we save the state of the transformer, it is safe to call
  218. // checkpoint after any successful write.
  219. c.checkpoint()
  220. if !c.next() {
  221. break
  222. }
  223. if wasMid && c.info.isCaseIgnorableAndNonBreakStarter() {
  224. c.isMidWord = false
  225. }
  226. }
  227. return c.ret()
  228. }
  229. // finalSigma adds Greek final Sigma handing to another casing function. It
  230. // determines whether a lowercased sigma should be σ or ς, by looking ahead for
  231. // case-ignorables and a cased letters.
  232. func finalSigma(f mapFunc) mapFunc {
  233. return func(c *context) bool {
  234. // ::NFD();
  235. // # 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
  236. // Σ } [:case-ignorable:]* [:cased:] → σ;
  237. // [:cased:] [:case-ignorable:]* { Σ → ς;
  238. // ::Any-Lower;
  239. // ::NFC();
  240. if !c.hasPrefix("Σ") {
  241. return f(c)
  242. }
  243. p := c.pDst
  244. c.writeString("ς")
  245. // We need to do one more iteration after maxIgnorable, as a cased
  246. // letter is not an ignorable and may modify the result.
  247. for i := 0; i < maxIgnorable+1; i++ {
  248. if !c.next() {
  249. return false
  250. }
  251. if !c.info.isCaseIgnorable() {
  252. if c.info.isCased() {
  253. // p+1 is guaranteed to be in bounds: if writing ς was
  254. // successful, p+1 will contain the second byte of ς. If not,
  255. // this function will have returned after c.next returned false.
  256. c.dst[p+1]++ // ς → σ
  257. }
  258. c.unreadRune()
  259. return true
  260. }
  261. // A case ignorable may also introduce a word break, so we may need
  262. // to continue searching even after detecting a break.
  263. c.isMidWord = c.isMidWord && !c.info.isBreak()
  264. c.copy()
  265. }
  266. return true
  267. }
  268. }
  269. // elUpper implements Greek upper casing, which entails removing a predefined
  270. // set of non-blocked modifiers. Note that these accents should not be removed
  271. // for title casing!
  272. // Example: "Οδός" -> "ΟΔΟΣ".
  273. func elUpper(c *context) bool {
  274. // From CLDR:
  275. // [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Above:]]*? { [\u0313\u0314\u0301\u0300\u0306\u0342\u0308\u0304] → ;
  276. // [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*? { \u0345 → ;
  277. r, _ := utf8.DecodeRune(c.src[c.pSrc:])
  278. oldPDst := c.pDst
  279. if !upper(c) {
  280. return false
  281. }
  282. if !unicode.Is(unicode.Greek, r) {
  283. return true
  284. }
  285. i := 0
  286. // Take the properties of the uppercased rune that is already written to the
  287. // destination. This saves us the trouble of having to uppercase the
  288. // decomposed rune again.
  289. if b := norm.NFD.Properties(c.dst[oldPDst:]).Decomposition(); b != nil {
  290. // Restore the destination position and process the decomposed rune.
  291. r, sz := utf8.DecodeRune(b)
  292. if r <= 0xFF { // See A.6.1
  293. return true
  294. }
  295. c.pDst = oldPDst
  296. // Insert the first rune and ignore the modifiers. See A.6.2.
  297. c.writeBytes(b[:sz])
  298. i = len(b[sz:]) / 2 // Greek modifiers are always of length 2.
  299. }
  300. for ; i < maxIgnorable && c.next(); i++ {
  301. switch r, _ := utf8.DecodeRune(c.src[c.pSrc:]); r {
  302. // Above and Iota Subscript
  303. case 0x0300, // U+0300 COMBINING GRAVE ACCENT
  304. 0x0301, // U+0301 COMBINING ACUTE ACCENT
  305. 0x0304, // U+0304 COMBINING MACRON
  306. 0x0306, // U+0306 COMBINING BREVE
  307. 0x0308, // U+0308 COMBINING DIAERESIS
  308. 0x0313, // U+0313 COMBINING COMMA ABOVE
  309. 0x0314, // U+0314 COMBINING REVERSED COMMA ABOVE
  310. 0x0342, // U+0342 COMBINING GREEK PERISPOMENI
  311. 0x0345: // U+0345 COMBINING GREEK YPOGEGRAMMENI
  312. // No-op. Gobble the modifier.
  313. default:
  314. switch v, _ := trie.lookup(c.src[c.pSrc:]); info(v).cccType() {
  315. case cccZero:
  316. c.unreadRune()
  317. return true
  318. // We don't need to test for IotaSubscript as the only rune that
  319. // qualifies (U+0345) was already excluded in the switch statement
  320. // above. See A.4.
  321. case cccAbove:
  322. return c.copy()
  323. default:
  324. // Some other modifier. We're still allowed to gobble Greek
  325. // modifiers after this.
  326. c.copy()
  327. }
  328. }
  329. }
  330. return i == maxIgnorable
  331. }
  332. func ltLower(c *context) bool {
  333. // From CLDR:
  334. // # Introduce an explicit dot above when lowercasing capital I's and J's
  335. // # whenever there are more accents above.
  336. // # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
  337. // # 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
  338. // # 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
  339. // # 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
  340. // # 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
  341. // # 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
  342. // # 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
  343. // ::NFD();
  344. // I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307;
  345. // J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307;
  346. // Į } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → į \u0307;
  347. // Ì → i \u0307 \u0300;
  348. // Í → i \u0307 \u0301;
  349. // Ĩ → i \u0307 \u0303;
  350. // ::Any-Lower();
  351. // ::NFC();
  352. i := 0
  353. if r := c.src[c.pSrc]; r < utf8.RuneSelf {
  354. lower(c)
  355. if r != 'I' && r != 'J' {
  356. return true
  357. }
  358. } else {
  359. p := norm.NFD.Properties(c.src[c.pSrc:])
  360. if d := p.Decomposition(); len(d) >= 3 && (d[0] == 'I' || d[0] == 'J') {
  361. // UTF-8 optimization: the decomposition will only have an above
  362. // modifier if the last rune of the decomposition is in [U+300-U+311].
  363. // In all other cases, a decomposition starting with I is always
  364. // an I followed by modifiers that are not cased themselves. See A.2.
  365. if d[1] == 0xCC && d[2] <= 0x91 { // A.2.4.
  366. if !c.writeBytes(d[:1]) {
  367. return false
  368. }
  369. c.dst[c.pDst-1] += 'a' - 'A' // lower
  370. // Assumption: modifier never changes on lowercase. See A.1.
  371. // Assumption: all modifiers added have CCC = Above. See A.2.3.
  372. return c.writeString("\u0307") && c.writeBytes(d[1:])
  373. }
  374. // In all other cases the additional modifiers will have a CCC
  375. // that is less than 230 (Above). We will insert the U+0307, if
  376. // needed, after these modifiers so that a string in FCD form
  377. // will remain so. See A.2.2.
  378. lower(c)
  379. i = 1
  380. } else {
  381. return lower(c)
  382. }
  383. }
  384. for ; i < maxIgnorable && c.next(); i++ {
  385. switch c.info.cccType() {
  386. case cccZero:
  387. c.unreadRune()
  388. return true
  389. case cccAbove:
  390. return c.writeString("\u0307") && c.copy() // See A.1.
  391. default:
  392. c.copy() // See A.1.
  393. }
  394. }
  395. return i == maxIgnorable
  396. }
  397. func ltUpper(f mapFunc) mapFunc {
  398. return func(c *context) bool {
  399. // From CLDR:
  400. // ::NFD();
  401. // [:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ;
  402. // ::Any-Upper();
  403. // ::NFC();
  404. // TODO: See A.5. A soft-dotted rune never has an exception. This would
  405. // allow us to overload the exception bit and encode this property in
  406. // info. Need to measure performance impact of this.
  407. r, _ := utf8.DecodeRune(c.src[c.pSrc:])
  408. oldPDst := c.pDst
  409. if !f(c) {
  410. return false
  411. }
  412. if !unicode.Is(unicode.Soft_Dotted, r) {
  413. return true
  414. }
  415. // We don't need to do an NFD normalization, as a soft-dotted rune never
  416. // contains U+0307. See A.3.
  417. i := 0
  418. for ; i < maxIgnorable && c.next(); i++ {
  419. switch c.info.cccType() {
  420. case cccZero:
  421. c.unreadRune()
  422. return true
  423. case cccAbove:
  424. if c.hasPrefix("\u0307") {
  425. // We don't do a full NFC, but rather combine runes for
  426. // some of the common cases. (Returning NFC or
  427. // preserving normal form is neither a requirement nor
  428. // a possibility anyway).
  429. if !c.next() {
  430. return false
  431. }
  432. if c.dst[oldPDst] == 'I' && c.pDst == oldPDst+1 && c.src[c.pSrc] == 0xcc {
  433. s := ""
  434. switch c.src[c.pSrc+1] {
  435. case 0x80: // U+0300 COMBINING GRAVE ACCENT
  436. s = "\u00cc" // U+00CC LATIN CAPITAL LETTER I WITH GRAVE
  437. case 0x81: // U+0301 COMBINING ACUTE ACCENT
  438. s = "\u00cd" // U+00CD LATIN CAPITAL LETTER I WITH ACUTE
  439. case 0x83: // U+0303 COMBINING TILDE
  440. s = "\u0128" // U+0128 LATIN CAPITAL LETTER I WITH TILDE
  441. case 0x88: // U+0308 COMBINING DIAERESIS
  442. s = "\u00cf" // U+00CF LATIN CAPITAL LETTER I WITH DIAERESIS
  443. default:
  444. }
  445. if s != "" {
  446. c.pDst = oldPDst
  447. return c.writeString(s)
  448. }
  449. }
  450. }
  451. return c.copy()
  452. default:
  453. c.copy()
  454. }
  455. }
  456. return i == maxIgnorable
  457. }
  458. }
  459. func aztrUpper(f mapFunc) mapFunc {
  460. return func(c *context) bool {
  461. // i→İ;
  462. if c.src[c.pSrc] == 'i' {
  463. return c.writeString("İ")
  464. }
  465. return f(c)
  466. }
  467. }
  468. func aztrLower(c *context) (done bool) {
  469. // From CLDR:
  470. // # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
  471. // # 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
  472. // İ→i;
  473. // # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
  474. // # This matches the behavior of the canonically equivalent I-dot_above
  475. // # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
  476. // # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
  477. // # 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
  478. // I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ;
  479. // I→ı ;
  480. // ::Any-Lower();
  481. if c.hasPrefix("\u0130") { // İ
  482. return c.writeString("i")
  483. }
  484. if c.src[c.pSrc] != 'I' {
  485. return lower(c)
  486. }
  487. // We ignore the lower-case I for now, but insert it later when we know
  488. // which form we need.
  489. start := c.pSrc + c.sz
  490. i := 0
  491. Loop:
  492. // We check for up to n ignorables before \u0307. As \u0307 is an
  493. // ignorable as well, n is maxIgnorable-1.
  494. for ; i < maxIgnorable && c.next(); i++ {
  495. switch c.info.cccType() {
  496. case cccAbove:
  497. if c.hasPrefix("\u0307") {
  498. return c.writeString("i") && c.writeBytes(c.src[start:c.pSrc]) // ignore U+0307
  499. }
  500. done = true
  501. break Loop
  502. case cccZero:
  503. c.unreadRune()
  504. done = true
  505. break Loop
  506. default:
  507. // We'll write this rune after we know which starter to use.
  508. }
  509. }
  510. if i == maxIgnorable {
  511. done = true
  512. }
  513. return c.writeString("ı") && c.writeBytes(c.src[start:c.pSrc+c.sz]) && done
  514. }
  515. func nlTitle(c *context) bool {
  516. // From CLDR:
  517. // # Special titlecasing for Dutch initial "ij".
  518. // ::Any-Title();
  519. // # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29)
  520. // [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ;
  521. if c.src[c.pSrc] != 'I' && c.src[c.pSrc] != 'i' {
  522. return title(c)
  523. }
  524. if !c.writeString("I") || !c.next() {
  525. return false
  526. }
  527. if c.src[c.pSrc] == 'j' || c.src[c.pSrc] == 'J' {
  528. return c.writeString("J")
  529. }
  530. c.unreadRune()
  531. return true
  532. }
  533. // Not part of CLDR, but see http://unicode.org/cldr/trac/ticket/7078.
  534. func afnlRewrite(c *context) {
  535. if c.hasPrefix("'") || c.hasPrefix("’") {
  536. c.isMidWord = true
  537. }
  538. }