You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

554 rivejä
13 KiB

  1. // Copyright 2012 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // +build ignore
  5. // Collation table generator.
  6. // Data read from the web.
  7. package main
  8. import (
  9. "archive/zip"
  10. "bufio"
  11. "bytes"
  12. "flag"
  13. "fmt"
  14. "io"
  15. "io/ioutil"
  16. "log"
  17. "os"
  18. "regexp"
  19. "sort"
  20. "strconv"
  21. "strings"
  22. "unicode/utf8"
  23. "golang.org/x/text/collate"
  24. "golang.org/x/text/collate/build"
  25. "golang.org/x/text/internal/colltab"
  26. "golang.org/x/text/internal/gen"
  27. "golang.org/x/text/language"
  28. "golang.org/x/text/unicode/cldr"
  29. )
  30. var (
  31. test = flag.Bool("test", false,
  32. "test existing tables; can be used to compare web data with package data.")
  33. short = flag.Bool("short", false, `Use "short" alternatives, when available.`)
  34. draft = flag.Bool("draft", false, `Use draft versions, when available.`)
  35. tags = flag.String("tags", "", "build tags to be included after +build directive")
  36. pkg = flag.String("package", "collate",
  37. "the name of the package in which the generated file is to be included")
  38. tables = flagStringSetAllowAll("tables", "collate", "collate,chars",
  39. "comma-spearated list of tables to generate.")
  40. exclude = flagStringSet("exclude", "zh2", "",
  41. "comma-separated list of languages to exclude.")
  42. include = flagStringSet("include", "", "",
  43. "comma-separated list of languages to include. Include trumps exclude.")
  44. // TODO: Not included: unihan gb2312han zhuyin big5han (for size reasons)
  45. // TODO: Not included: traditional (buggy for Bengali)
  46. types = flagStringSetAllowAll("types", "standard,phonebook,phonetic,reformed,pinyin,stroke", "",
  47. "comma-separated list of types that should be included.")
  48. )
  49. // stringSet implements an ordered set based on a list. It implements flag.Value
  50. // to allow a set to be specified as a comma-separated list.
  51. type stringSet struct {
  52. s []string
  53. allowed *stringSet
  54. dirty bool // needs compaction if true
  55. all bool
  56. allowAll bool
  57. }
  58. func flagStringSet(name, def, allowed, usage string) *stringSet {
  59. ss := &stringSet{}
  60. if allowed != "" {
  61. usage += fmt.Sprintf(" (allowed values: any of %s)", allowed)
  62. ss.allowed = &stringSet{}
  63. failOnError(ss.allowed.Set(allowed))
  64. }
  65. ss.Set(def)
  66. flag.Var(ss, name, usage)
  67. return ss
  68. }
  69. func flagStringSetAllowAll(name, def, allowed, usage string) *stringSet {
  70. ss := &stringSet{allowAll: true}
  71. if allowed == "" {
  72. flag.Var(ss, name, usage+fmt.Sprintf(` Use "all" to select all.`))
  73. } else {
  74. ss.allowed = &stringSet{}
  75. failOnError(ss.allowed.Set(allowed))
  76. flag.Var(ss, name, usage+fmt.Sprintf(` (allowed values: "all" or any of %s)`, allowed))
  77. }
  78. ss.Set(def)
  79. return ss
  80. }
  81. func (ss stringSet) Len() int {
  82. return len(ss.s)
  83. }
  84. func (ss stringSet) String() string {
  85. return strings.Join(ss.s, ",")
  86. }
  87. func (ss *stringSet) Set(s string) error {
  88. if ss.allowAll && s == "all" {
  89. ss.s = nil
  90. ss.all = true
  91. return nil
  92. }
  93. ss.s = ss.s[:0]
  94. for _, s := range strings.Split(s, ",") {
  95. if s := strings.TrimSpace(s); s != "" {
  96. if ss.allowed != nil && !ss.allowed.contains(s) {
  97. return fmt.Errorf("unsupported value %q; must be one of %s", s, ss.allowed)
  98. }
  99. ss.add(s)
  100. }
  101. }
  102. ss.compact()
  103. return nil
  104. }
  105. func (ss *stringSet) add(s string) {
  106. ss.s = append(ss.s, s)
  107. ss.dirty = true
  108. }
  109. func (ss *stringSet) values() []string {
  110. ss.compact()
  111. return ss.s
  112. }
  113. func (ss *stringSet) contains(s string) bool {
  114. if ss.all {
  115. return true
  116. }
  117. for _, v := range ss.s {
  118. if v == s {
  119. return true
  120. }
  121. }
  122. return false
  123. }
  124. func (ss *stringSet) compact() {
  125. if !ss.dirty {
  126. return
  127. }
  128. a := ss.s
  129. sort.Strings(a)
  130. k := 0
  131. for i := 1; i < len(a); i++ {
  132. if a[k] != a[i] {
  133. a[k+1] = a[i]
  134. k++
  135. }
  136. }
  137. ss.s = a[:k+1]
  138. ss.dirty = false
  139. }
  140. func skipLang(l string) bool {
  141. if include.Len() > 0 {
  142. return !include.contains(l)
  143. }
  144. return exclude.contains(l)
  145. }
  146. // altInclude returns a list of alternatives (for the LDML alt attribute)
  147. // in order of preference. An empty string in this list indicates the
  148. // default entry.
  149. func altInclude() []string {
  150. l := []string{}
  151. if *short {
  152. l = append(l, "short")
  153. }
  154. l = append(l, "")
  155. // TODO: handle draft using cldr.SetDraftLevel
  156. if *draft {
  157. l = append(l, "proposed")
  158. }
  159. return l
  160. }
  161. func failOnError(e error) {
  162. if e != nil {
  163. log.Panic(e)
  164. }
  165. }
  166. func openArchive() *zip.Reader {
  167. f := gen.OpenCLDRCoreZip()
  168. buffer, err := ioutil.ReadAll(f)
  169. f.Close()
  170. failOnError(err)
  171. archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
  172. failOnError(err)
  173. return archive
  174. }
  175. // parseUCA parses a Default Unicode Collation Element Table of the format
  176. // specified in https://www.unicode.org/reports/tr10/#File_Format.
  177. // It returns the variable top.
  178. func parseUCA(builder *build.Builder) {
  179. var r io.ReadCloser
  180. var err error
  181. for _, f := range openArchive().File {
  182. if strings.HasSuffix(f.Name, "allkeys_CLDR.txt") {
  183. r, err = f.Open()
  184. }
  185. }
  186. if r == nil {
  187. log.Fatal("File allkeys_CLDR.txt not found in archive.")
  188. }
  189. failOnError(err)
  190. defer r.Close()
  191. scanner := bufio.NewScanner(r)
  192. colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
  193. for i := 1; scanner.Scan(); i++ {
  194. line := scanner.Text()
  195. if len(line) == 0 || line[0] == '#' {
  196. continue
  197. }
  198. if line[0] == '@' {
  199. // parse properties
  200. switch {
  201. case strings.HasPrefix(line[1:], "version "):
  202. a := strings.Split(line[1:], " ")
  203. if a[1] != gen.UnicodeVersion() {
  204. log.Fatalf("incompatible version %s; want %s", a[1], gen.UnicodeVersion())
  205. }
  206. case strings.HasPrefix(line[1:], "backwards "):
  207. log.Fatalf("%d: unsupported option backwards", i)
  208. default:
  209. log.Printf("%d: unknown option %s", i, line[1:])
  210. }
  211. } else {
  212. // parse entries
  213. part := strings.Split(line, " ; ")
  214. if len(part) != 2 {
  215. log.Fatalf("%d: production rule without ';': %v", i, line)
  216. }
  217. lhs := []rune{}
  218. for _, v := range strings.Split(part[0], " ") {
  219. if v == "" {
  220. continue
  221. }
  222. lhs = append(lhs, rune(convHex(i, v)))
  223. }
  224. var n int
  225. var vars []int
  226. rhs := [][]int{}
  227. for i, m := range colelem.FindAllStringSubmatch(part[1], -1) {
  228. n += len(m[0])
  229. elem := []int{}
  230. for _, h := range strings.Split(m[2], ".") {
  231. elem = append(elem, convHex(i, h))
  232. }
  233. if m[1] == "*" {
  234. vars = append(vars, i)
  235. }
  236. rhs = append(rhs, elem)
  237. }
  238. if len(part[1]) < n+3 || part[1][n+1] != '#' {
  239. log.Fatalf("%d: expected comment; found %s", i, part[1][n:])
  240. }
  241. if *test {
  242. testInput.add(string(lhs))
  243. }
  244. failOnError(builder.Add(lhs, rhs, vars))
  245. }
  246. }
  247. if scanner.Err() != nil {
  248. log.Fatal(scanner.Err())
  249. }
  250. }
  251. func convHex(line int, s string) int {
  252. r, e := strconv.ParseInt(s, 16, 32)
  253. if e != nil {
  254. log.Fatalf("%d: %v", line, e)
  255. }
  256. return int(r)
  257. }
  258. var testInput = stringSet{}
  259. var charRe = regexp.MustCompile(`&#x([0-9A-F]*);`)
  260. var tagRe = regexp.MustCompile(`<([a-z_]*) */>`)
  261. var mainLocales = []string{}
  262. // charsets holds a list of exemplar characters per category.
  263. type charSets map[string][]string
  264. func (p charSets) fprint(w io.Writer) {
  265. fmt.Fprintln(w, "[exN]string{")
  266. for i, k := range []string{"", "contractions", "punctuation", "auxiliary", "currencySymbol", "index"} {
  267. if set := p[k]; len(set) != 0 {
  268. fmt.Fprintf(w, "\t\t%d: %q,\n", i, strings.Join(set, " "))
  269. }
  270. }
  271. fmt.Fprintln(w, "\t},")
  272. }
  273. var localeChars = make(map[string]charSets)
  274. const exemplarHeader = `
  275. type exemplarType int
  276. const (
  277. exCharacters exemplarType = iota
  278. exContractions
  279. exPunctuation
  280. exAuxiliary
  281. exCurrency
  282. exIndex
  283. exN
  284. )
  285. `
  286. func printExemplarCharacters(w io.Writer) {
  287. fmt.Fprintln(w, exemplarHeader)
  288. fmt.Fprintln(w, "var exemplarCharacters = map[string][exN]string{")
  289. for _, loc := range mainLocales {
  290. fmt.Fprintf(w, "\t%q: ", loc)
  291. localeChars[loc].fprint(w)
  292. }
  293. fmt.Fprintln(w, "}")
  294. }
  295. func decodeCLDR(d *cldr.Decoder) *cldr.CLDR {
  296. r := gen.OpenCLDRCoreZip()
  297. data, err := d.DecodeZip(r)
  298. failOnError(err)
  299. return data
  300. }
  301. // parseMain parses XML files in the main directory of the CLDR core.zip file.
  302. func parseMain() {
  303. d := &cldr.Decoder{}
  304. d.SetDirFilter("main")
  305. d.SetSectionFilter("characters")
  306. data := decodeCLDR(d)
  307. for _, loc := range data.Locales() {
  308. x := data.RawLDML(loc)
  309. if skipLang(x.Identity.Language.Type) {
  310. continue
  311. }
  312. if x.Characters != nil {
  313. x, _ = data.LDML(loc)
  314. loc = language.Make(loc).String()
  315. for _, ec := range x.Characters.ExemplarCharacters {
  316. if ec.Draft != "" {
  317. continue
  318. }
  319. if _, ok := localeChars[loc]; !ok {
  320. mainLocales = append(mainLocales, loc)
  321. localeChars[loc] = make(charSets)
  322. }
  323. localeChars[loc][ec.Type] = parseCharacters(ec.Data())
  324. }
  325. }
  326. }
  327. }
  328. func parseCharacters(chars string) []string {
  329. parseSingle := func(s string) (r rune, tail string, escaped bool) {
  330. if s[0] == '\\' {
  331. return rune(s[1]), s[2:], true
  332. }
  333. r, sz := utf8.DecodeRuneInString(s)
  334. return r, s[sz:], false
  335. }
  336. chars = strings.TrimSpace(chars)
  337. if n := len(chars) - 1; chars[n] == ']' && chars[0] == '[' {
  338. chars = chars[1:n]
  339. }
  340. list := []string{}
  341. var r, last, end rune
  342. for len(chars) > 0 {
  343. if chars[0] == '{' { // character sequence
  344. buf := []rune{}
  345. for chars = chars[1:]; len(chars) > 0; {
  346. r, chars, _ = parseSingle(chars)
  347. if r == '}' {
  348. break
  349. }
  350. if r == ' ' {
  351. log.Fatalf("space not supported in sequence %q", chars)
  352. }
  353. buf = append(buf, r)
  354. }
  355. list = append(list, string(buf))
  356. last = 0
  357. } else { // single character
  358. escaped := false
  359. r, chars, escaped = parseSingle(chars)
  360. if r != ' ' {
  361. if r == '-' && !escaped {
  362. if last == 0 {
  363. log.Fatal("'-' should be preceded by a character")
  364. }
  365. end, chars, _ = parseSingle(chars)
  366. for ; last <= end; last++ {
  367. list = append(list, string(last))
  368. }
  369. last = 0
  370. } else {
  371. list = append(list, string(r))
  372. last = r
  373. }
  374. }
  375. }
  376. }
  377. return list
  378. }
  379. var fileRe = regexp.MustCompile(`.*/collation/(.*)\.xml`)
  380. // typeMap translates legacy type keys to their BCP47 equivalent.
  381. var typeMap = map[string]string{
  382. "phonebook": "phonebk",
  383. "traditional": "trad",
  384. }
  385. // parseCollation parses XML files in the collation directory of the CLDR core.zip file.
  386. func parseCollation(b *build.Builder) {
  387. d := &cldr.Decoder{}
  388. d.SetDirFilter("collation")
  389. data := decodeCLDR(d)
  390. for _, loc := range data.Locales() {
  391. x, err := data.LDML(loc)
  392. failOnError(err)
  393. if skipLang(x.Identity.Language.Type) {
  394. continue
  395. }
  396. cs := x.Collations.Collation
  397. sl := cldr.MakeSlice(&cs)
  398. if len(types.s) == 0 {
  399. sl.SelectAnyOf("type", x.Collations.Default())
  400. } else if !types.all {
  401. sl.SelectAnyOf("type", types.s...)
  402. }
  403. sl.SelectOnePerGroup("alt", altInclude())
  404. for _, c := range cs {
  405. id, err := language.Parse(loc)
  406. if err != nil {
  407. fmt.Fprintf(os.Stderr, "invalid locale: %q", err)
  408. continue
  409. }
  410. // Support both old- and new-style defaults.
  411. d := c.Type
  412. if x.Collations.DefaultCollation == nil {
  413. d = x.Collations.Default()
  414. } else {
  415. d = x.Collations.DefaultCollation.Data()
  416. }
  417. // We assume tables are being built either for search or collation,
  418. // but not both. For search the default is always "search".
  419. if d != c.Type && c.Type != "search" {
  420. typ := c.Type
  421. if len(c.Type) > 8 {
  422. typ = typeMap[c.Type]
  423. }
  424. id, err = id.SetTypeForKey("co", typ)
  425. failOnError(err)
  426. }
  427. t := b.Tailoring(id)
  428. c.Process(processor{t})
  429. }
  430. }
  431. }
  432. type processor struct {
  433. t *build.Tailoring
  434. }
  435. func (p processor) Reset(anchor string, before int) (err error) {
  436. if before != 0 {
  437. err = p.t.SetAnchorBefore(anchor)
  438. } else {
  439. err = p.t.SetAnchor(anchor)
  440. }
  441. failOnError(err)
  442. return nil
  443. }
  444. func (p processor) Insert(level int, str, context, extend string) error {
  445. str = context + str
  446. if *test {
  447. testInput.add(str)
  448. }
  449. // TODO: mimic bug in old maketables: remove.
  450. err := p.t.Insert(colltab.Level(level-1), str, context+extend)
  451. failOnError(err)
  452. return nil
  453. }
  454. func (p processor) Index(id string) {
  455. }
  456. func testCollator(c *collate.Collator) {
  457. c0 := collate.New(language.Und)
  458. // iterator over all characters for all locales and check
  459. // whether Key is equal.
  460. buf := collate.Buffer{}
  461. // Add all common and not too uncommon runes to the test set.
  462. for i := rune(0); i < 0x30000; i++ {
  463. testInput.add(string(i))
  464. }
  465. for i := rune(0xE0000); i < 0xF0000; i++ {
  466. testInput.add(string(i))
  467. }
  468. for _, str := range testInput.values() {
  469. k0 := c0.KeyFromString(&buf, str)
  470. k := c.KeyFromString(&buf, str)
  471. if !bytes.Equal(k0, k) {
  472. failOnError(fmt.Errorf("test:%U: keys differ (%x vs %x)", []rune(str), k0, k))
  473. }
  474. buf.Reset()
  475. }
  476. fmt.Println("PASS")
  477. }
  478. func main() {
  479. gen.Init()
  480. b := build.NewBuilder()
  481. parseUCA(b)
  482. if tables.contains("chars") {
  483. parseMain()
  484. }
  485. parseCollation(b)
  486. c, err := b.Build()
  487. failOnError(err)
  488. if *test {
  489. testCollator(collate.NewFromTable(c))
  490. } else {
  491. w := &bytes.Buffer{}
  492. gen.WriteUnicodeVersion(w)
  493. gen.WriteCLDRVersion(w)
  494. if tables.contains("collate") {
  495. _, err = b.Print(w)
  496. failOnError(err)
  497. }
  498. if tables.contains("chars") {
  499. printExemplarCharacters(w)
  500. }
  501. gen.WriteGoFile("tables.go", *pkg, w.Bytes())
  502. }
  503. }