You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

603 lines
15 KiB

  1. // Copyright 2014 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // +build ignore
  5. // Generator for display name tables.
  6. package main
  7. import (
  8. "bytes"
  9. "flag"
  10. "fmt"
  11. "log"
  12. "reflect"
  13. "sort"
  14. "strings"
  15. "golang.org/x/text/internal/gen"
  16. "golang.org/x/text/language"
  17. "golang.org/x/text/unicode/cldr"
  18. )
// Command-line flags of the generator. The flag.Bool/flag.String values are
// pointers that are filled in when the flags are parsed; tags and dict are
// tagSet values registered through newTagSet.
var (
	// test: see the usage string. Not referenced in the visible part of this
	// file.
	test = flag.Bool("test", false,
		"test existing tables; can be used to compare web data with package data.")
	// outputFile is the name of the Go file written by main.
	outputFile = flag.String("output", "tables.go", "output file")
	// stats: see the usage string. Not referenced in the visible part of this
	// file.
	stats = flag.Bool("stats", false, "prints statistics to stderr")
	// short selects the "short" alternative of a name in filter; otherwise the
	// "stand-alone" alternative is selected.
	short = flag.Bool("short", false, `Use "short" alternatives, when available.`)
	// draft is the minimal draft level; it is parsed with cldr.ParseDraft in
	// filter.
	draft = flag.String("draft",
		"contributed",
		`Minimal draft requirements (approved, contributed, provisional, unconfirmed).`)
	// pkg is the name of the package of the generated file.
	pkg = flag.String("package",
		"display",
		"the name of the package in which the generated file is to be included")
	// tags restricts both the source locales and the language tags that are
	// processed. It starts out empty, which contains treats as "all tags".
	tags = newTagSet("tags",
		[]language.Tag{},
		"space-separated list of tags to include or empty for all")
	// dict is the set of tags for which Dictionary values are generated. It
	// defaults to the list returned by dictTags.
	dict = newTagSet("dict",
		dictTags(),
		"space-separated list or tags for which to include a Dictionary. "+
			`"" means the common list from go.text/language.`)
)
  39. func dictTags() (tag []language.Tag) {
  40. // TODO: replace with language.Common.Tags() once supported.
  41. const str = "af am ar ar-001 az bg bn ca cs da de el en en-US en-GB " +
  42. "es es-ES es-419 et fa fi fil fr fr-CA gu he hi hr hu hy id is it ja " +
  43. "ka kk km kn ko ky lo lt lv mk ml mn mr ms my ne nl no pa pl pt pt-BR " +
  44. "pt-PT ro ru si sk sl sq sr sr-Latn sv sw ta te th tr uk ur uz vi " +
  45. "zh zh-Hans zh-Hant zu"
  46. for _, s := range strings.Split(str, " ") {
  47. tag = append(tag, language.MustParse(s))
  48. }
  49. return tag
  50. }
  51. func main() {
  52. gen.Init()
  53. // Read the CLDR zip file.
  54. r := gen.OpenCLDRCoreZip()
  55. defer r.Close()
  56. d := &cldr.Decoder{}
  57. d.SetDirFilter("main", "supplemental")
  58. d.SetSectionFilter("localeDisplayNames")
  59. data, err := d.DecodeZip(r)
  60. if err != nil {
  61. log.Fatalf("DecodeZip: %v", err)
  62. }
  63. w := gen.NewCodeWriter()
  64. defer w.WriteGoFile(*outputFile, "display")
  65. gen.WriteCLDRVersion(w)
  66. b := builder{
  67. w: w,
  68. data: data,
  69. group: make(map[string]*group),
  70. }
  71. b.generate()
  72. }
// tagForm is the language form used for all tag parsing in this generator:
// the -tags/-dict flag values, the CLDR locale names and the CLDR language
// types.
const tagForm = language.All
// tagSet is used to parse command line flags of tags. It implements the
// flag.Value interface. Members map to true; an empty set is treated by
// contains as containing every tag.
type tagSet map[language.Tag]bool
  77. func newTagSet(name string, tags []language.Tag, usage string) tagSet {
  78. f := tagSet(make(map[language.Tag]bool))
  79. for _, t := range tags {
  80. f[t] = true
  81. }
  82. flag.Var(f, name, usage)
  83. return f
  84. }
  85. // String implements the String method of the flag.Value interface.
  86. func (f tagSet) String() string {
  87. tags := []string{}
  88. for t := range f {
  89. tags = append(tags, t.String())
  90. }
  91. sort.Strings(tags)
  92. return strings.Join(tags, " ")
  93. }
  94. // Set implements Set from the flag.Value interface.
  95. func (f tagSet) Set(s string) error {
  96. if s != "" {
  97. for _, s := range strings.Split(s, " ") {
  98. if s != "" {
  99. tag, err := tagForm.Parse(s)
  100. if err != nil {
  101. return err
  102. }
  103. f[tag] = true
  104. }
  105. }
  106. }
  107. return nil
  108. }
  109. func (f tagSet) contains(t language.Tag) bool {
  110. if len(f) == 0 {
  111. return true
  112. }
  113. return f[t]
  114. }
// builder is used to create all tables with display name information.
type builder struct {
	// w receives all generated code.
	w *gen.CodeWriter
	// data is the decoded CLDR data the tables are built from.
	data *cldr.CLDR

	// fromLocs is reset to nil by setData; not otherwise used in the visible
	// part of this file.
	fromLocs []string

	// destination tags for the current locale.
	// Both fields are reset at the start of every setData call.
	toTags     []string
	toTagIndex map[string]int

	// list of supported tags
	// Filled by makeSupported and replaced by generate before the "self"
	// group is written.
	supported []language.Tag

	// key-value pairs per group
	// Keyed by group name ("lang", "script", "region", "self").
	group map[string]*group

	// statistics
	sizeIndex int // total size of all indexes of headers
	sizeData  int // total size of all data of headers
	totalSize int
}
// group holds the collected names of one category (languages, scripts,
// regions or self names) together with the tables derived from them.
type group struct {
	// Maps from a given language to the Namer data for this language.
	lang map[language.Tag]keyValues
	// headers has one entry per supported tag; it is built by writeGroup.
	headers []header

	// toTags is the sorted, de-duplicated list of keys that occur in lang; it
	// is built by writeGroup.
	toTags []string
	// threeStart and fourPlusStart are not used in the visible part of this
	// file.
	threeStart    int
	fourPlusStart int
}
  140. // set sets the typ to the name for locale loc.
  141. func (g *group) set(t language.Tag, typ, name string) {
  142. kv := g.lang[t]
  143. if kv == nil {
  144. kv = make(keyValues)
  145. g.lang[t] = kv
  146. }
  147. if kv[typ] == "" {
  148. kv[typ] = name
  149. }
  150. }
// keyValues maps a key (the string form of a language tag, script or region)
// to its display name.
type keyValues map[string]string
// header holds the names of one group as seen from a single supported tag.
type header struct {
	// tag is the supported tag this header belongs to.
	tag language.Tag
	// data is the concatenation of all names, in the order of the group's
	// toTags.
	data string
	// index holds the start offset in data of each name, followed by the end
	// offset of the last one; trailing entries for empty names are trimmed.
	index []uint16
}
// versionInfo is the format string for the generated, deprecated Version
// constant; generate fills it in with cldr.Version.
var versionInfo = `// Version is deprecated. Use CLDRVersion.
const Version = %#v
`
// self is the tag under which generate stores the names of the "self" group,
// that is, the names of locales in their own language.
var self = language.MustParse("mul")
// generate builds and writes all tables.
//
// It filters the CLDR data, collects the "lang", "script" and "region" names
// per locale, writes the parents table, the three groups, the list of
// supported tags and the dictionaries, and finally collects and writes the
// "self" group. The order of the steps matters: the supported list is derived
// from the first three groups and is replaced before "self" is written.
func (b *builder) generate() {
	fmt.Fprintf(b.w, versionInfo, cldr.Version)

	b.filter()
	// Language names, keyed by the string form of the parsed language type.
	// Only languages selected by the -tags flag are kept.
	b.setData("lang", func(g *group, loc language.Tag, ldn *cldr.LocaleDisplayNames) {
		if ldn.Languages != nil {
			for _, v := range ldn.Languages.Language {
				lang := v.Type
				if lang == "root" {
					// We prefer the data from "und"
					// TODO: allow both the data for root and und somehow.
					continue
				}
				tag := tagForm.MustParse(lang)
				if tags.contains(tag) {
					g.set(loc, tag.String(), v.Data())
				}
			}
		}
	})
	// Script names, keyed by script code. Private use scripts are skipped.
	b.setData("script", func(g *group, loc language.Tag, ldn *cldr.LocaleDisplayNames) {
		if ldn.Scripts != nil {
			for _, v := range ldn.Scripts.Script {
				code := language.MustParseScript(v.Type)
				if code.IsPrivateUse() { // Qaaa..Qabx
					// TODO: data currently appears to be very meager.
					// Reconsider if we have data for English.
					if loc == language.English {
						log.Fatal("Consider including data for private use scripts.")
					}
					continue
				}
				g.set(loc, code.String(), v.Data())
			}
		}
	})
	// Region names, keyed by region code.
	b.setData("region", func(g *group, loc language.Tag, ldn *cldr.LocaleDisplayNames) {
		if ldn.Territories != nil {
			for _, v := range ldn.Territories.Territory {
				g.set(loc, language.MustParseRegion(v.Type).String(), v.Data())
			}
		}
	})

	b.makeSupported()

	b.writeParents()

	b.writeGroup("lang")
	b.writeGroup("script")
	b.writeGroup("region")

	// The supported tags are written as a single string in which every tag is
	// followed by "|".
	b.w.WriteConst("numSupported", len(b.supported))
	buf := bytes.Buffer{}
	for _, tag := range b.supported {
		fmt.Fprint(&buf, tag.String(), "|")
	}
	b.w.WriteConst("supported", buf.String())

	b.writeDictionaries()

	// The "self" group has a single header, stored under the tag self.
	b.supported = []language.Tag{self}

	// Compute the names of locales in their own language. Some of these names
	// may be specified in their parent locales. We make four passes: pass i
	// replaces every language key by its i-th parent (pass 0 uses the key
	// itself) to match successive parents of tags until a possible match is
	// found. As set never overwrites a value, names found in an earlier pass
	// take precedence.
	for i := 0; i < 4; i++ {
		b.setData("self", func(g *group, tag language.Tag, ldn *cldr.LocaleDisplayNames) {
			parent := tag
			// After the first pass, a locale with a script but no region is
			// also matched against its bare base language. Note that b
			// shadows the builder within this if statement.
			if b, s, r := tag.Raw(); i > 0 && (s != language.Script{} && r == language.Region{}) {
				parent, _ = language.Raw.Compose(b)
			}
			if ldn.Languages != nil {
				for _, v := range ldn.Languages.Language {
					key := tagForm.MustParse(v.Type)
					saved := key
					if key == parent {
						g.set(self, tag.String(), v.Data())
					}
					for k := 0; k < i; k++ {
						key = key.Parent()
					}
					if key == tag {
						g.set(self, saved.String(), v.Data()) // set does not overwrite a value.
					}
				}
			}
		})
	}

	b.writeGroup("self")
}
  246. func (b *builder) setData(name string, f func(*group, language.Tag, *cldr.LocaleDisplayNames)) {
  247. b.sizeIndex = 0
  248. b.sizeData = 0
  249. b.toTags = nil
  250. b.fromLocs = nil
  251. b.toTagIndex = make(map[string]int)
  252. g := b.group[name]
  253. if g == nil {
  254. g = &group{lang: make(map[language.Tag]keyValues)}
  255. b.group[name] = g
  256. }
  257. for _, loc := range b.data.Locales() {
  258. // We use RawLDML instead of LDML as we are managing our own inheritance
  259. // in this implementation.
  260. ldml := b.data.RawLDML(loc)
  261. // We do not support the POSIX variant (it is not a supported BCP 47
  262. // variant). This locale also doesn't happen to contain any data, so
  263. // we'll skip it by checking for this.
  264. tag, err := tagForm.Parse(loc)
  265. if err != nil {
  266. if ldml.LocaleDisplayNames != nil {
  267. log.Fatalf("setData: %v", err)
  268. }
  269. continue
  270. }
  271. if ldml.LocaleDisplayNames != nil && tags.contains(tag) {
  272. f(g, tag, ldml.LocaleDisplayNames)
  273. }
  274. }
  275. }
  276. func (b *builder) filter() {
  277. filter := func(s *cldr.Slice) {
  278. if *short {
  279. s.SelectOnePerGroup("alt", []string{"short", ""})
  280. } else {
  281. s.SelectOnePerGroup("alt", []string{"stand-alone", ""})
  282. }
  283. d, err := cldr.ParseDraft(*draft)
  284. if err != nil {
  285. log.Fatalf("filter: %v", err)
  286. }
  287. s.SelectDraft(d)
  288. }
  289. for _, loc := range b.data.Locales() {
  290. if ldn := b.data.RawLDML(loc).LocaleDisplayNames; ldn != nil {
  291. if ldn.Languages != nil {
  292. s := cldr.MakeSlice(&ldn.Languages.Language)
  293. if filter(&s); len(ldn.Languages.Language) == 0 {
  294. ldn.Languages = nil
  295. }
  296. }
  297. if ldn.Scripts != nil {
  298. s := cldr.MakeSlice(&ldn.Scripts.Script)
  299. if filter(&s); len(ldn.Scripts.Script) == 0 {
  300. ldn.Scripts = nil
  301. }
  302. }
  303. if ldn.Territories != nil {
  304. s := cldr.MakeSlice(&ldn.Territories.Territory)
  305. if filter(&s); len(ldn.Territories.Territory) == 0 {
  306. ldn.Territories = nil
  307. }
  308. }
  309. }
  310. }
  311. }
  312. // makeSupported creates a list of all supported locales.
  313. func (b *builder) makeSupported() {
  314. // tags across groups
  315. for _, g := range b.group {
  316. for t, _ := range g.lang {
  317. b.supported = append(b.supported, t)
  318. }
  319. }
  320. b.supported = b.supported[:unique(tagsSorter(b.supported))]
  321. }
  322. type tagsSorter []language.Tag
  323. func (a tagsSorter) Len() int { return len(a) }
  324. func (a tagsSorter) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
  325. func (a tagsSorter) Less(i, j int) bool { return a[i].String() < a[j].String() }
  326. func (b *builder) writeGroup(name string) {
  327. g := b.group[name]
  328. for _, kv := range g.lang {
  329. for t, _ := range kv {
  330. g.toTags = append(g.toTags, t)
  331. }
  332. }
  333. g.toTags = g.toTags[:unique(tagsBySize(g.toTags))]
  334. // Allocate header per supported value.
  335. g.headers = make([]header, len(b.supported))
  336. for i, sup := range b.supported {
  337. kv, ok := g.lang[sup]
  338. if !ok {
  339. g.headers[i].tag = sup
  340. continue
  341. }
  342. data := []byte{}
  343. index := make([]uint16, len(g.toTags), len(g.toTags)+1)
  344. for j, t := range g.toTags {
  345. index[j] = uint16(len(data))
  346. data = append(data, kv[t]...)
  347. }
  348. index = append(index, uint16(len(data)))
  349. // Trim the tail of the index.
  350. // TODO: indexes can be reduced in size quite a bit more.
  351. n := len(index)
  352. for ; n >= 2 && index[n-2] == index[n-1]; n-- {
  353. }
  354. index = index[:n]
  355. // Workaround for a bug in CLDR 26.
  356. // See https://unicode.org/cldr/trac/ticket/8042.
  357. if cldr.Version == "26" && sup.String() == "hsb" {
  358. data = bytes.Replace(data, []byte{'"'}, nil, 1)
  359. }
  360. g.headers[i] = header{sup, string(data), index}
  361. }
  362. g.writeTable(b.w, name)
  363. }
  364. type tagsBySize []string
  365. func (l tagsBySize) Len() int { return len(l) }
  366. func (l tagsBySize) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
  367. func (l tagsBySize) Less(i, j int) bool {
  368. a, b := l[i], l[j]
  369. // Sort single-tag entries based on size first. Otherwise alphabetic.
  370. if len(a) != len(b) && (len(a) <= 4 || len(b) <= 4) {
  371. return len(a) < len(b)
  372. }
  373. return a < b
  374. }
  375. // parentIndices returns slice a of len(tags) where tags[a[i]] is the parent
  376. // of tags[i].
  377. func parentIndices(tags []language.Tag) []int16 {
  378. index := make(map[language.Tag]int16)
  379. for i, t := range tags {
  380. index[t] = int16(i)
  381. }
  382. // Construct default parents.
  383. parents := make([]int16, len(tags))
  384. for i, t := range tags {
  385. parents[i] = -1
  386. for t = t.Parent(); t != language.Und; t = t.Parent() {
  387. if j, ok := index[t]; ok {
  388. parents[i] = j
  389. break
  390. }
  391. }
  392. }
  393. return parents
  394. }
  395. func (b *builder) writeParents() {
  396. parents := parentIndices(b.supported)
  397. fmt.Fprintf(b.w, "var parents = ")
  398. b.w.WriteArray(parents)
  399. }
  400. // writeKeys writes keys to a special index used by the display package.
  401. // tags are assumed to be sorted by length.
  402. func writeKeys(w *gen.CodeWriter, name string, keys []string) {
  403. w.Size += int(3 * reflect.TypeOf("").Size())
  404. w.WriteComment("Number of keys: %d", len(keys))
  405. fmt.Fprintf(w, "var (\n\t%sIndex = tagIndex{\n", name)
  406. for i := 2; i <= 4; i++ {
  407. sub := []string{}
  408. for _, t := range keys {
  409. if len(t) != i {
  410. break
  411. }
  412. sub = append(sub, t)
  413. }
  414. s := strings.Join(sub, "")
  415. w.WriteString(s)
  416. fmt.Fprintf(w, ",\n")
  417. keys = keys[len(sub):]
  418. }
  419. fmt.Fprintln(w, "\t}")
  420. if len(keys) > 0 {
  421. w.Size += int(reflect.TypeOf([]string{}).Size())
  422. fmt.Fprintf(w, "\t%sTagsLong = ", name)
  423. w.WriteSlice(keys)
  424. }
  425. fmt.Fprintln(w, ")\n")
  426. }
  427. // identifier creates an identifier from the given tag.
  428. func identifier(t language.Tag) string {
  429. return strings.Replace(t.String(), "-", "", -1)
  430. }
  431. func (h *header) writeEntry(w *gen.CodeWriter, name string) {
  432. if len(dict) > 0 && dict.contains(h.tag) {
  433. fmt.Fprintf(w, "\t{ // %s\n", h.tag)
  434. fmt.Fprintf(w, "\t\t%[1]s%[2]sStr,\n\t\t%[1]s%[2]sIdx,\n", identifier(h.tag), name)
  435. fmt.Fprintln(w, "\t},")
  436. } else if len(h.data) == 0 {
  437. fmt.Fprintln(w, "\t\t{}, //", h.tag)
  438. } else {
  439. fmt.Fprintf(w, "\t{ // %s\n", h.tag)
  440. w.WriteString(h.data)
  441. fmt.Fprintln(w, ",")
  442. w.WriteSlice(h.index)
  443. fmt.Fprintln(w, ",\n\t},")
  444. }
  445. }
  446. // write the data for the given header as single entries. The size for this data
  447. // was already accounted for in writeEntry.
  448. func (h *header) writeSingle(w *gen.CodeWriter, name string) {
  449. if len(dict) > 0 && dict.contains(h.tag) {
  450. tag := identifier(h.tag)
  451. w.WriteConst(tag+name+"Str", h.data)
  452. // Note that we create a slice instead of an array. If we use an array
  453. // we need to refer to it as a[:] in other tables, which will cause the
  454. // array to always be included by the linker. See Issue 7651.
  455. w.WriteVar(tag+name+"Idx", h.index)
  456. }
  457. }
  458. // WriteTable writes an entry for a single Namer.
  459. func (g *group) writeTable(w *gen.CodeWriter, name string) {
  460. start := w.Size
  461. writeKeys(w, name, g.toTags)
  462. w.Size += len(g.headers) * int(reflect.ValueOf(g.headers[0]).Type().Size())
  463. fmt.Fprintf(w, "var %sHeaders = [%d]header{\n", name, len(g.headers))
  464. title := strings.Title(name)
  465. for _, h := range g.headers {
  466. h.writeEntry(w, title)
  467. }
  468. fmt.Fprintln(w, "}\n")
  469. for _, h := range g.headers {
  470. h.writeSingle(w, title)
  471. }
  472. n := w.Size - start
  473. fmt.Fprintf(w, "// Total size for %s: %d bytes (%d KB)\n\n", name, n, n/1000)
  474. }
  475. func (b *builder) writeDictionaries() {
  476. fmt.Fprintln(b.w, "// Dictionary entries of frequent languages")
  477. fmt.Fprintln(b.w, "var (")
  478. parents := parentIndices(b.supported)
  479. for i, t := range b.supported {
  480. if dict.contains(t) {
  481. ident := identifier(t)
  482. fmt.Fprintf(b.w, "\t%s = Dictionary{ // %s\n", ident, t)
  483. if p := parents[i]; p == -1 {
  484. fmt.Fprintln(b.w, "\t\tnil,")
  485. } else {
  486. fmt.Fprintf(b.w, "\t\t&%s,\n", identifier(b.supported[p]))
  487. }
  488. fmt.Fprintf(b.w, "\t\theader{%[1]sLangStr, %[1]sLangIdx},\n", ident)
  489. fmt.Fprintf(b.w, "\t\theader{%[1]sScriptStr, %[1]sScriptIdx},\n", ident)
  490. fmt.Fprintf(b.w, "\t\theader{%[1]sRegionStr, %[1]sRegionIdx},\n", ident)
  491. fmt.Fprintln(b.w, "\t}")
  492. }
  493. }
  494. fmt.Fprintln(b.w, ")")
  495. var s string
  496. var a []uint16
  497. sz := reflect.TypeOf(s).Size()
  498. sz += reflect.TypeOf(a).Size()
  499. sz *= 3
  500. sz += reflect.TypeOf(&a).Size()
  501. n := int(sz) * len(dict)
  502. fmt.Fprintf(b.w, "// Total size for %d entries: %d bytes (%d KB)\n\n", len(dict), n, n/1000)
  503. b.w.Size += n
  504. }
  505. // unique sorts the given lists and removes duplicate entries by swapping them
  506. // past position k, where k is the number of unique values. It returns k.
  507. func unique(a sort.Interface) int {
  508. if a.Len() == 0 {
  509. return 0
  510. }
  511. sort.Sort(a)
  512. k := 1
  513. for i := 1; i < a.Len(); i++ {
  514. if a.Less(k-1, i) {
  515. if k != i {
  516. a.Swap(k, i)
  517. }
  518. k++
  519. }
  520. }
  521. return k
  522. }