Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.
 
 
 

1354 linhas
33 KiB

  1. // Copyright 2009 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // +build ignore
  5. // Unicode table generator.
  6. // Data read from the web.
  7. package main
  8. import (
  9. "bufio"
  10. "flag"
  11. "fmt"
  12. "io"
  13. "log"
  14. "net/http"
  15. "os"
  16. "os/exec"
  17. "path/filepath"
  18. "regexp"
  19. "sort"
  20. "strconv"
  21. "strings"
  22. "unicode"
  23. "golang.org/x/text/unicode/rangetable"
  24. )
  25. func main() {
  26. flag.Parse()
  27. setupOutput()
  28. loadChars() // always needed
  29. loadCasefold()
  30. printCategories()
  31. printScriptOrProperty(false)
  32. printScriptOrProperty(true)
  33. printCases()
  34. printLatinProperties()
  35. printCasefold()
  36. printSizes()
  37. flushOutput()
  38. }
  39. func defaultVersion() string {
  40. if v := os.Getenv("UNICODE_VERSION"); v != "" {
  41. return v
  42. }
  43. return unicode.Version
  44. }
  45. var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt")
  46. var casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt")
  47. var url = flag.String("url",
  48. "http://www.unicode.org/Public/"+defaultVersion()+"/ucd/",
  49. "URL of Unicode database directory")
  50. var tablelist = flag.String("tables",
  51. "all",
  52. "comma-separated list of which tables to generate; can be letter")
  53. var scriptlist = flag.String("scripts",
  54. "all",
  55. "comma-separated list of which script tables to generate")
  56. var proplist = flag.String("props",
  57. "all",
  58. "comma-separated list of which property tables to generate")
  59. var cases = flag.Bool("cases",
  60. true,
  61. "generate case tables")
  62. var test = flag.Bool("test",
  63. false,
  64. "test existing tables; can be used to compare web data with package data")
  65. var localFiles = flag.Bool("local",
  66. false,
  67. "data files have been copied to current directory; for debugging only")
  68. var outputFile = flag.String("output",
  69. "",
  70. "output file for generated tables; default stdout")
  71. var scriptRe = regexp.MustCompile(`^([0-9A-F]+)(\.\.[0-9A-F]+)? *; ([A-Za-z_]+)$`)
  72. var logger = log.New(os.Stderr, "", log.Lshortfile)
  73. var output *bufio.Writer // points to os.Stdout or to "gofmt > outputFile"
  74. func setupOutput() {
  75. output = bufio.NewWriter(startGofmt())
  76. }
  77. // startGofmt connects output to a gofmt process if -output is set.
  78. func startGofmt() io.Writer {
  79. if *outputFile == "" {
  80. return os.Stdout
  81. }
  82. stdout, err := os.Create(*outputFile)
  83. if err != nil {
  84. logger.Fatal(err)
  85. }
  86. // Pipe output to gofmt.
  87. gofmt := exec.Command("gofmt")
  88. fd, err := gofmt.StdinPipe()
  89. if err != nil {
  90. logger.Fatal(err)
  91. }
  92. gofmt.Stdout = stdout
  93. gofmt.Stderr = os.Stderr
  94. err = gofmt.Start()
  95. if err != nil {
  96. logger.Fatal(err)
  97. }
  98. return fd
  99. }
  100. func flushOutput() {
  101. err := output.Flush()
  102. if err != nil {
  103. logger.Fatal(err)
  104. }
  105. }
  106. func printf(format string, args ...interface{}) {
  107. fmt.Fprintf(output, format, args...)
  108. }
  109. func print(args ...interface{}) {
  110. fmt.Fprint(output, args...)
  111. }
  112. func println(args ...interface{}) {
  113. fmt.Fprintln(output, args...)
  114. }
  115. type reader struct {
  116. *bufio.Reader
  117. fd *os.File
  118. resp *http.Response
  119. }
  120. func open(url string) *reader {
  121. file := filepath.Base(url)
  122. if *localFiles {
  123. fd, err := os.Open(file)
  124. if err != nil {
  125. logger.Fatal(err)
  126. }
  127. return &reader{bufio.NewReader(fd), fd, nil}
  128. }
  129. resp, err := http.Get(url)
  130. if err != nil {
  131. logger.Fatal(err)
  132. }
  133. if resp.StatusCode != 200 {
  134. logger.Fatalf("bad GET status for %s: %d", file, resp.Status)
  135. }
  136. return &reader{bufio.NewReader(resp.Body), nil, resp}
  137. }
  138. func (r *reader) close() {
  139. if r.fd != nil {
  140. r.fd.Close()
  141. } else {
  142. r.resp.Body.Close()
  143. }
  144. }
  145. var category = map[string]bool{
  146. // Nd Lu etc.
  147. // We use one-character names to identify merged categories
  148. "L": true, // Lu Ll Lt Lm Lo
  149. "P": true, // Pc Pd Ps Pe Pu Pf Po
  150. "M": true, // Mn Mc Me
  151. "N": true, // Nd Nl No
  152. "S": true, // Sm Sc Sk So
  153. "Z": true, // Zs Zl Zp
  154. "C": true, // Cc Cf Cs Co Cn
  155. }
  156. // UnicodeData.txt has form:
  157. // 0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;;
  158. // 007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A
  159. // See https://www.unicode.org/reports/tr44/ for a full explanation
  160. // The fields:
  161. const (
  162. FCodePoint = iota
  163. FName
  164. FGeneralCategory
  165. FCanonicalCombiningClass
  166. FBidiClass
  167. FDecompositionTypeAndMapping
  168. FNumericType
  169. FNumericDigit // If a decimal digit.
  170. FNumericValue // Includes non-decimal, e.g. U+2155=1/5
  171. FBidiMirrored
  172. FUnicode1Name
  173. FISOComment
  174. FSimpleUppercaseMapping
  175. FSimpleLowercaseMapping
  176. FSimpleTitlecaseMapping
  177. NumField
  178. MaxChar = 0x10FFFF // anything above this shouldn't exist
  179. )
// fieldName maps each UnicodeData.txt field index to a printable name.
// It is used by Char.dump to label the raw fields.
var fieldName = []string{
	FCodePoint:                   "CodePoint",
	FName:                        "Name",
	FGeneralCategory:             "GeneralCategory",
	FCanonicalCombiningClass:     "CanonicalCombiningClass",
	FBidiClass:                   "BidiClass",
	FDecompositionTypeAndMapping: "DecompositionTypeAndMapping",
	FNumericType:                 "NumericType",
	FNumericDigit:                "NumericDigit",
	FNumericValue:                "NumericValue",
	FBidiMirrored:                "BidiMirrored",
	FUnicode1Name:                "Unicode1Name",
	FISOComment:                  "ISOComment",
	FSimpleUppercaseMapping:      "SimpleUppercaseMapping",
	FSimpleLowercaseMapping:      "SimpleLowercaseMapping",
	FSimpleTitlecaseMapping:      "SimpleTitlecaseMapping",
}
  197. // This contains only the properties we're interested in.
  198. type Char struct {
  199. field []string // debugging only; could be deleted if we take out char.dump()
  200. codePoint rune // if zero, this index is not a valid code point.
  201. category string
  202. upperCase rune
  203. lowerCase rune
  204. titleCase rune
  205. foldCase rune // simple case folding
  206. caseOrbit rune // next in simple case folding orbit
  207. }
  208. // Scripts.txt has form:
  209. // A673 ; Cyrillic # Po SLAVONIC ASTERISK
  210. // A67C..A67D ; Cyrillic # Mn [2] COMBINING CYRILLIC KAVYKA..COMBINING CYRILLIC PAYEROK
  211. // See https://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation
  212. type Script struct {
  213. lo, hi uint32 // range of code points
  214. script string
  215. }
  216. var chars = make([]Char, MaxChar+1)
  217. var scripts = make(map[string][]Script)
  218. var props = make(map[string][]Script) // a property looks like a script; can share the format
  219. var lastChar rune = 0
  220. // In UnicodeData.txt, some ranges are marked like this:
  221. // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
  222. // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
  223. // parseCategory returns a state variable indicating the weirdness.
  224. type State int
  225. const (
  226. SNormal State = iota // known to be zero for the type
  227. SFirst
  228. SLast
  229. SMissing
  230. )
  231. func parseCategory(line string) (state State) {
  232. field := strings.Split(line, ";")
  233. if len(field) != NumField {
  234. logger.Fatalf("%5s: %d fields (expected %d)\n", line, len(field), NumField)
  235. }
  236. point, err := strconv.ParseUint(field[FCodePoint], 16, 64)
  237. if err != nil {
  238. logger.Fatalf("%.5s...: %s", line, err)
  239. }
  240. lastChar = rune(point)
  241. if point > MaxChar {
  242. return
  243. }
  244. char := &chars[point]
  245. char.field = field
  246. if char.codePoint != 0 {
  247. logger.Fatalf("point %U reused", point)
  248. }
  249. char.codePoint = lastChar
  250. char.category = field[FGeneralCategory]
  251. category[char.category] = true
  252. switch char.category {
  253. case "Nd":
  254. // Decimal digit
  255. _, err := strconv.Atoi(field[FNumericValue])
  256. if err != nil {
  257. logger.Fatalf("%U: bad numeric field: %s", point, err)
  258. }
  259. case "Lu":
  260. char.letter(field[FCodePoint], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping])
  261. case "Ll":
  262. char.letter(field[FSimpleUppercaseMapping], field[FCodePoint], field[FSimpleTitlecaseMapping])
  263. case "Lt":
  264. char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FCodePoint])
  265. default:
  266. char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping])
  267. }
  268. switch {
  269. case strings.Index(field[FName], ", First>") > 0:
  270. state = SFirst
  271. case strings.Index(field[FName], ", Last>") > 0:
  272. state = SLast
  273. }
  274. return
  275. }
  276. func (char *Char) dump(s string) {
  277. print(s, " ")
  278. for i := 0; i < len(char.field); i++ {
  279. printf("%s:%q ", fieldName[i], char.field[i])
  280. }
  281. print("\n")
  282. }
  283. func (char *Char) letter(u, l, t string) {
  284. char.upperCase = char.letterValue(u, "U")
  285. char.lowerCase = char.letterValue(l, "L")
  286. char.titleCase = char.letterValue(t, "T")
  287. }
  288. func (char *Char) letterValue(s string, cas string) rune {
  289. if s == "" {
  290. return 0
  291. }
  292. v, err := strconv.ParseUint(s, 16, 64)
  293. if err != nil {
  294. char.dump(cas)
  295. logger.Fatalf("%U: bad letter(%s): %s", char.codePoint, s, err)
  296. }
  297. return rune(v)
  298. }
  299. func allCategories() []string {
  300. a := make([]string, 0, len(category))
  301. for k := range category {
  302. a = append(a, k)
  303. }
  304. sort.Strings(a)
  305. return a
  306. }
  307. func all(scripts map[string][]Script) []string {
  308. a := make([]string, 0, len(scripts))
  309. for k := range scripts {
  310. a = append(a, k)
  311. }
  312. sort.Strings(a)
  313. return a
  314. }
  315. func allCatFold(m map[string]map[rune]bool) []string {
  316. a := make([]string, 0, len(m))
  317. for k := range m {
  318. a = append(a, k)
  319. }
  320. sort.Strings(a)
  321. return a
  322. }
  323. // Extract the version number from the URL
  324. func version() string {
  325. // Break on slashes and look for the first numeric field
  326. fields := strings.Split(*url, "/")
  327. for _, f := range fields {
  328. if len(f) > 0 && '0' <= f[0] && f[0] <= '9' {
  329. return f
  330. }
  331. }
  332. logger.Fatal("unknown version")
  333. return "Unknown"
  334. }
  335. func categoryOp(code rune, class uint8) bool {
  336. category := chars[code].category
  337. return len(category) > 0 && category[0] == class
  338. }
  339. func loadChars() {
  340. if *dataURL == "" {
  341. flag.Set("data", *url+"UnicodeData.txt")
  342. }
  343. input := open(*dataURL)
  344. defer input.close()
  345. scanner := bufio.NewScanner(input)
  346. var first rune = 0
  347. for scanner.Scan() {
  348. switch parseCategory(scanner.Text()) {
  349. case SNormal:
  350. if first != 0 {
  351. logger.Fatalf("bad state normal at %U", lastChar)
  352. }
  353. case SFirst:
  354. if first != 0 {
  355. logger.Fatalf("bad state first at %U", lastChar)
  356. }
  357. first = lastChar
  358. case SLast:
  359. if first == 0 {
  360. logger.Fatalf("bad state last at %U", lastChar)
  361. }
  362. for i := first + 1; i <= lastChar; i++ {
  363. chars[i] = chars[first]
  364. chars[i].codePoint = i
  365. }
  366. first = 0
  367. }
  368. }
  369. if scanner.Err() != nil {
  370. logger.Fatal(scanner.Err())
  371. }
  372. }
  373. func loadCasefold() {
  374. if *casefoldingURL == "" {
  375. flag.Set("casefolding", *url+"CaseFolding.txt")
  376. }
  377. input := open(*casefoldingURL)
  378. defer input.close()
  379. scanner := bufio.NewScanner(input)
  380. for scanner.Scan() {
  381. line := scanner.Text()
  382. if len(line) == 0 || line[0] == '#' || len(strings.TrimSpace(line)) == 0 {
  383. continue
  384. }
  385. field := strings.Split(line, "; ")
  386. if len(field) != 4 {
  387. logger.Fatalf("CaseFolding.txt %.5s...: %d fields (expected %d)\n", line, len(field), 4)
  388. }
  389. kind := field[1]
  390. if kind != "C" && kind != "S" {
  391. // Only care about 'common' and 'simple' foldings.
  392. continue
  393. }
  394. p1, err := strconv.ParseUint(field[0], 16, 64)
  395. if err != nil {
  396. logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err)
  397. }
  398. p2, err := strconv.ParseUint(field[2], 16, 64)
  399. if err != nil {
  400. logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err)
  401. }
  402. chars[p1].foldCase = rune(p2)
  403. }
  404. if scanner.Err() != nil {
  405. logger.Fatal(scanner.Err())
  406. }
  407. }
// progHeader is the text written at the top of the generated file by
// printCategories: license comment, generated-code marker and package clause.
const progHeader = `// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by go generate; DO NOT EDIT.
package unicode
`
// categoryMapping gives the descriptive name of each two-letter general
// category. printCategories puts it in the comment of the generated variable.
var categoryMapping = map[string]string{
	"Lu": "Letter, uppercase",
	"Ll": "Letter, lowercase",
	"Lt": "Letter, titlecase",
	"Lm": "Letter, modifier",
	"Lo": "Letter, other",
	"Mn": "Mark, nonspacing",
	"Mc": "Mark, spacing combining",
	"Me": "Mark, enclosing",
	"Nd": "Number, decimal digit",
	"Nl": "Number, letter",
	"No": "Number, other",
	"Pc": "Punctuation, connector",
	"Pd": "Punctuation, dash",
	"Ps": "Punctuation, open",
	"Pe": "Punctuation, close",
	"Pi": "Punctuation, initial quote",
	"Pf": "Punctuation, final quote",
	"Po": "Punctuation, other",
	"Sm": "Symbol, math",
	"Sc": "Symbol, currency",
	"Sk": "Symbol, modifier",
	"So": "Symbol, other",
	"Zs": "Separator, space",
	"Zl": "Separator, line",
	"Zp": "Separator, paragraph",
	"Cc": "Other, control",
	"Cf": "Other, format",
	"Cs": "Other, surrogate",
	"Co": "Other, private use",
	"Cn": "Other, not assigned",
}
  446. func printCategories() {
  447. if *tablelist == "" {
  448. return
  449. }
  450. // Find out which categories to dump
  451. list := strings.Split(*tablelist, ",")
  452. if *tablelist == "all" {
  453. list = allCategories()
  454. }
  455. if *test {
  456. fullCategoryTest(list)
  457. return
  458. }
  459. printf(progHeader)
  460. println("// Version is the Unicode edition from which the tables are derived.")
  461. printf("const Version = %q\n\n", version())
  462. if *tablelist == "all" {
  463. println("// Categories is the set of Unicode category tables.")
  464. println("var Categories = map[string] *RangeTable {")
  465. for _, k := range allCategories() {
  466. printf("\t%q: %s,\n", k, k)
  467. }
  468. print("}\n\n")
  469. }
  470. decl := make(sort.StringSlice, len(list))
  471. ndecl := 0
  472. for _, name := range list {
  473. if _, ok := category[name]; !ok {
  474. logger.Fatal("unknown category", name)
  475. }
  476. // We generate an UpperCase name to serve as concise documentation and an _UnderScored
  477. // name to store the data. This stops godoc dumping all the tables but keeps them
  478. // available to clients.
  479. // Cases deserving special comments
  480. varDecl := ""
  481. switch name {
  482. case "C":
  483. varDecl = "\tOther = _C; // Other/C is the set of Unicode control and special characters, category C.\n"
  484. varDecl += "\tC = _C\n"
  485. case "L":
  486. varDecl = "\tLetter = _L; // Letter/L is the set of Unicode letters, category L.\n"
  487. varDecl += "\tL = _L\n"
  488. case "M":
  489. varDecl = "\tMark = _M; // Mark/M is the set of Unicode mark characters, category M.\n"
  490. varDecl += "\tM = _M\n"
  491. case "N":
  492. varDecl = "\tNumber = _N; // Number/N is the set of Unicode number characters, category N.\n"
  493. varDecl += "\tN = _N\n"
  494. case "P":
  495. varDecl = "\tPunct = _P; // Punct/P is the set of Unicode punctuation characters, category P.\n"
  496. varDecl += "\tP = _P\n"
  497. case "S":
  498. varDecl = "\tSymbol = _S; // Symbol/S is the set of Unicode symbol characters, category S.\n"
  499. varDecl += "\tS = _S\n"
  500. case "Z":
  501. varDecl = "\tSpace = _Z; // Space/Z is the set of Unicode space characters, category Z.\n"
  502. varDecl += "\tZ = _Z\n"
  503. case "Nd":
  504. varDecl = "\tDigit = _Nd; // Digit is the set of Unicode characters with the \"decimal digit\" property.\n"
  505. case "Lu":
  506. varDecl = "\tUpper = _Lu; // Upper is the set of Unicode upper case letters.\n"
  507. case "Ll":
  508. varDecl = "\tLower = _Ll; // Lower is the set of Unicode lower case letters.\n"
  509. case "Lt":
  510. varDecl = "\tTitle = _Lt; // Title is the set of Unicode title case letters.\n"
  511. }
  512. if len(name) > 1 {
  513. desc, ok := categoryMapping[name]
  514. if ok {
  515. varDecl += fmt.Sprintf(
  516. "\t%s = _%s; // %s is the set of Unicode characters in category %s (%s).\n",
  517. name, name, name, name, desc)
  518. } else {
  519. varDecl += fmt.Sprintf(
  520. "\t%s = _%s; // %s is the set of Unicode characters in category %s.\n",
  521. name, name, name, name)
  522. }
  523. }
  524. decl[ndecl] = varDecl
  525. ndecl++
  526. if len(name) == 1 { // unified categories
  527. decl := fmt.Sprintf("var _%s = &RangeTable{\n", name)
  528. dumpRange(
  529. decl,
  530. func(code rune) bool { return categoryOp(code, name[0]) })
  531. continue
  532. }
  533. dumpRange(
  534. fmt.Sprintf("var _%s = &RangeTable{\n", name),
  535. func(code rune) bool { return chars[code].category == name })
  536. }
  537. decl.Sort()
  538. println("// These variables have type *RangeTable.")
  539. println("var (")
  540. for _, d := range decl {
  541. print(d)
  542. }
  543. print(")\n\n")
  544. }
  545. type Op func(code rune) bool
  546. func dumpRange(header string, inCategory Op) {
  547. runes := []rune{}
  548. for i := range chars {
  549. r := rune(i)
  550. if inCategory(r) {
  551. runes = append(runes, r)
  552. }
  553. }
  554. printRangeTable(header, runes)
  555. }
  556. func printRangeTable(header string, runes []rune) {
  557. rt := rangetable.New(runes...)
  558. print(header)
  559. println("\tR16: []Range16{")
  560. for _, r := range rt.R16 {
  561. printf("\t\t{%#04x, %#04x, %d},\n", r.Lo, r.Hi, r.Stride)
  562. range16Count++
  563. }
  564. println("\t},")
  565. if len(rt.R32) > 0 {
  566. println("\tR32: []Range32{")
  567. for _, r := range rt.R32 {
  568. printf("\t\t{%#x, %#x, %d},\n", r.Lo, r.Hi, r.Stride)
  569. range32Count++
  570. }
  571. println("\t},")
  572. }
  573. if rt.LatinOffset > 0 {
  574. printf("\tLatinOffset: %d,\n", rt.LatinOffset)
  575. }
  576. printf("}\n\n")
  577. }
  578. func fullCategoryTest(list []string) {
  579. for _, name := range list {
  580. if _, ok := category[name]; !ok {
  581. logger.Fatal("unknown category", name)
  582. }
  583. r, ok := unicode.Categories[name]
  584. if !ok && len(name) > 1 {
  585. logger.Fatalf("unknown table %q", name)
  586. }
  587. if len(name) == 1 {
  588. verifyRange(name, func(code rune) bool { return categoryOp(code, name[0]) }, r)
  589. } else {
  590. verifyRange(
  591. name,
  592. func(code rune) bool { return chars[code].category == name },
  593. r)
  594. }
  595. }
  596. }
  597. func verifyRange(name string, inCategory Op, table *unicode.RangeTable) {
  598. count := 0
  599. for j := range chars {
  600. i := rune(j)
  601. web := inCategory(i)
  602. pkg := unicode.Is(table, i)
  603. if web != pkg {
  604. fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg)
  605. count++
  606. if count > 10 {
  607. break
  608. }
  609. }
  610. }
  611. }
  612. func parseScript(line string, scripts map[string][]Script) {
  613. comment := strings.Index(line, "#")
  614. if comment >= 0 {
  615. line = line[0:comment]
  616. }
  617. line = strings.TrimSpace(line)
  618. if len(line) == 0 {
  619. return
  620. }
  621. field := strings.Split(line, ";")
  622. if len(field) != 2 {
  623. logger.Fatalf("%s: %d fields (expected 2)\n", line, len(field))
  624. }
  625. matches := scriptRe.FindStringSubmatch(line)
  626. if len(matches) != 4 {
  627. logger.Fatalf("%s: %d matches (expected 3)\n", line, len(matches))
  628. }
  629. lo, err := strconv.ParseUint(matches[1], 16, 64)
  630. if err != nil {
  631. logger.Fatalf("%.5s...: %s", line, err)
  632. }
  633. hi := lo
  634. if len(matches[2]) > 2 { // ignore leading ..
  635. hi, err = strconv.ParseUint(matches[2][2:], 16, 64)
  636. if err != nil {
  637. logger.Fatalf("%.5s...: %s", line, err)
  638. }
  639. }
  640. name := matches[3]
  641. scripts[name] = append(scripts[name], Script{uint32(lo), uint32(hi), name})
  642. }
  643. func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]Script) {
  644. for _, name := range list {
  645. if _, ok := scripts[name]; !ok {
  646. logger.Fatal("unknown script", name)
  647. }
  648. _, ok := installed[name]
  649. if !ok {
  650. logger.Fatal("unknown table", name)
  651. }
  652. for _, script := range scripts[name] {
  653. for r := script.lo; r <= script.hi; r++ {
  654. if !unicode.Is(installed[name], rune(r)) {
  655. fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name)
  656. }
  657. }
  658. }
  659. }
  660. }
// deprecatedAliases maps a table name to an additional alias name.
// printScriptOrProperty emits the alias as a second map entry and a second
// variable referring to the same table.
var deprecatedAliases = map[string]string{
	"Sentence_Terminal": "STerm",
}
  664. // PropList.txt has the same format as Scripts.txt so we can share its parser.
  665. func printScriptOrProperty(doProps bool) {
  666. flaglist := *scriptlist
  667. file := "Scripts.txt"
  668. table := scripts
  669. installed := unicode.Scripts
  670. if doProps {
  671. flaglist = *proplist
  672. file = "PropList.txt"
  673. table = props
  674. installed = unicode.Properties
  675. }
  676. if flaglist == "" {
  677. return
  678. }
  679. input := open(*url + file)
  680. scanner := bufio.NewScanner(input)
  681. for scanner.Scan() {
  682. parseScript(scanner.Text(), table)
  683. }
  684. if scanner.Err() != nil {
  685. logger.Fatal(scanner.Err())
  686. }
  687. input.close()
  688. // Find out which scripts to dump
  689. list := strings.Split(flaglist, ",")
  690. if flaglist == "all" {
  691. list = all(table)
  692. }
  693. if *test {
  694. fullScriptTest(list, installed, table)
  695. return
  696. }
  697. if flaglist == "all" {
  698. if doProps {
  699. println("// Properties is the set of Unicode property tables.")
  700. println("var Properties = map[string] *RangeTable{")
  701. } else {
  702. println("// Scripts is the set of Unicode script tables.")
  703. println("var Scripts = map[string] *RangeTable{")
  704. }
  705. for _, k := range all(table) {
  706. printf("\t%q: %s,\n", k, k)
  707. if alias, ok := deprecatedAliases[k]; ok {
  708. printf("\t%q: %s,\n", alias, k)
  709. }
  710. }
  711. print("}\n\n")
  712. }
  713. decl := make(sort.StringSlice, len(list)+len(deprecatedAliases))
  714. ndecl := 0
  715. for _, name := range list {
  716. if doProps {
  717. decl[ndecl] = fmt.Sprintf(
  718. "\t%s = _%s;\t// %s is the set of Unicode characters with property %s.\n",
  719. name, name, name, name)
  720. } else {
  721. decl[ndecl] = fmt.Sprintf(
  722. "\t%s = _%s;\t// %s is the set of Unicode characters in script %s.\n",
  723. name, name, name, name)
  724. }
  725. ndecl++
  726. if alias, ok := deprecatedAliases[name]; ok {
  727. decl[ndecl] = fmt.Sprintf(
  728. "\t%[1]s = _%[2]s;\t// %[1]s is an alias for %[2]s.\n",
  729. alias, name)
  730. ndecl++
  731. }
  732. decl := fmt.Sprintf("var _%s = &RangeTable {\n", name)
  733. runes := []rune{}
  734. for _, scr := range table[name] {
  735. for r := scr.lo; r <= scr.hi; r++ {
  736. runes = append(runes, rune(r))
  737. }
  738. }
  739. printRangeTable(decl, runes)
  740. }
  741. decl.Sort()
  742. println("// These variables have type *RangeTable.")
  743. println("var (")
  744. for _, d := range decl {
  745. print(d)
  746. }
  747. print(")\n\n")
  748. }
  749. const (
  750. CaseUpper = 1 << iota
  751. CaseLower
  752. CaseTitle
  753. CaseNone = 0 // must be zero
  754. CaseMissing = -1 // character not present; not a valid case state
  755. )
  756. type caseState struct {
  757. point rune
  758. _case int
  759. deltaToUpper rune
  760. deltaToLower rune
  761. deltaToTitle rune
  762. }
  763. // Is d a continuation of the state of c?
  764. func (c *caseState) adjacent(d *caseState) bool {
  765. if d.point < c.point {
  766. c, d = d, c
  767. }
  768. switch {
  769. case d.point != c.point+1: // code points not adjacent (shouldn't happen)
  770. return false
  771. case d._case != c._case: // different cases
  772. return c.upperLowerAdjacent(d)
  773. case c._case == CaseNone:
  774. return false
  775. case c._case == CaseMissing:
  776. return false
  777. case d.deltaToUpper != c.deltaToUpper:
  778. return false
  779. case d.deltaToLower != c.deltaToLower:
  780. return false
  781. case d.deltaToTitle != c.deltaToTitle:
  782. return false
  783. }
  784. return true
  785. }
  786. // Is d the same as c, but opposite in upper/lower case? this would make it
  787. // an element of an UpperLower sequence.
  788. func (c *caseState) upperLowerAdjacent(d *caseState) bool {
  789. // check they're a matched case pair. we know they have adjacent values
  790. switch {
  791. case c._case == CaseUpper && d._case != CaseLower:
  792. return false
  793. case c._case == CaseLower && d._case != CaseUpper:
  794. return false
  795. }
  796. // matched pair (at least in upper/lower). make the order Upper Lower
  797. if c._case == CaseLower {
  798. c, d = d, c
  799. }
  800. // for an Upper Lower sequence the deltas have to be in order
  801. // c: 0 1 0
  802. // d: -1 0 -1
  803. switch {
  804. case c.deltaToUpper != 0:
  805. return false
  806. case c.deltaToLower != 1:
  807. return false
  808. case c.deltaToTitle != 0:
  809. return false
  810. case d.deltaToUpper != -1:
  811. return false
  812. case d.deltaToLower != 0:
  813. return false
  814. case d.deltaToTitle != -1:
  815. return false
  816. }
  817. return true
  818. }
  819. // Does this character start an UpperLower sequence?
  820. func (c *caseState) isUpperLower() bool {
  821. // for an Upper Lower sequence the deltas have to be in order
  822. // c: 0 1 0
  823. switch {
  824. case c.deltaToUpper != 0:
  825. return false
  826. case c.deltaToLower != 1:
  827. return false
  828. case c.deltaToTitle != 0:
  829. return false
  830. }
  831. return true
  832. }
  833. // Does this character start a LowerUpper sequence?
  834. func (c *caseState) isLowerUpper() bool {
  835. // for an Upper Lower sequence the deltas have to be in order
  836. // c: -1 0 -1
  837. switch {
  838. case c.deltaToUpper != -1:
  839. return false
  840. case c.deltaToLower != 0:
  841. return false
  842. case c.deltaToTitle != -1:
  843. return false
  844. }
  845. return true
  846. }
  847. func getCaseState(i rune) (c *caseState) {
  848. c = &caseState{point: i, _case: CaseNone}
  849. ch := &chars[i]
  850. switch ch.codePoint {
  851. case 0:
  852. c._case = CaseMissing // Will get NUL wrong but that doesn't matter
  853. return
  854. case ch.upperCase:
  855. c._case = CaseUpper
  856. case ch.lowerCase:
  857. c._case = CaseLower
  858. case ch.titleCase:
  859. c._case = CaseTitle
  860. }
  861. // Some things such as roman numeral U+2161 don't describe themselves
  862. // as upper case, but have a lower case. Second-guess them.
  863. if c._case == CaseNone && ch.lowerCase != 0 {
  864. c._case = CaseUpper
  865. }
  866. // Same in the other direction.
  867. if c._case == CaseNone && ch.upperCase != 0 {
  868. c._case = CaseLower
  869. }
  870. if ch.upperCase != 0 {
  871. c.deltaToUpper = ch.upperCase - i
  872. }
  873. if ch.lowerCase != 0 {
  874. c.deltaToLower = ch.lowerCase - i
  875. }
  876. if ch.titleCase != 0 {
  877. c.deltaToTitle = ch.titleCase - i
  878. }
  879. return
  880. }
  881. func printCases() {
  882. if !*cases {
  883. return
  884. }
  885. if *test {
  886. fullCaseTest()
  887. return
  888. }
  889. printf(
  890. "// CaseRanges is the table describing case mappings for all letters with\n" +
  891. "// non-self mappings.\n" +
  892. "var CaseRanges = _CaseRanges\n" +
  893. "var _CaseRanges = []CaseRange {\n")
  894. var startState *caseState // the start of a run; nil for not active
  895. var prevState = &caseState{} // the state of the previous character
  896. for i := range chars {
  897. state := getCaseState(rune(i))
  898. if state.adjacent(prevState) {
  899. prevState = state
  900. continue
  901. }
  902. // end of run (possibly)
  903. printCaseRange(startState, prevState)
  904. startState = nil
  905. if state._case != CaseMissing && state._case != CaseNone {
  906. startState = state
  907. }
  908. prevState = state
  909. }
  910. print("}\n")
  911. }
  912. func printCaseRange(lo, hi *caseState) {
  913. if lo == nil {
  914. return
  915. }
  916. if lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0 {
  917. // character represents itself in all cases - no need to mention it
  918. return
  919. }
  920. switch {
  921. case hi.point > lo.point && lo.isUpperLower():
  922. printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n",
  923. lo.point, hi.point)
  924. case hi.point > lo.point && lo.isLowerUpper():
  925. logger.Fatalf("LowerUpper sequence: should not happen: %U. If it's real, need to fix To()", lo.point)
  926. printf("\t{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n",
  927. lo.point, hi.point)
  928. default:
  929. printf("\t{0x%04X, 0x%04X, d{%d, %d, %d}},\n",
  930. lo.point, hi.point,
  931. lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle)
  932. }
  933. }
  934. // If the cased value in the Char is 0, it means use the rune itself.
  935. func caseIt(r, cased rune) rune {
  936. if cased == 0 {
  937. return r
  938. }
  939. return cased
  940. }
  941. func fullCaseTest() {
  942. for j, c := range chars {
  943. i := rune(j)
  944. lower := unicode.ToLower(i)
  945. want := caseIt(i, c.lowerCase)
  946. if lower != want {
  947. fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower)
  948. }
  949. upper := unicode.ToUpper(i)
  950. want = caseIt(i, c.upperCase)
  951. if upper != want {
  952. fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper)
  953. }
  954. title := unicode.ToTitle(i)
  955. want = caseIt(i, c.titleCase)
  956. if title != want {
  957. fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title)
  958. }
  959. }
  960. }
  961. func printLatinProperties() {
  962. if *test {
  963. return
  964. }
  965. println("var properties = [MaxLatin1+1]uint8{")
  966. for code := 0; code <= unicode.MaxLatin1; code++ {
  967. var property string
  968. switch chars[code].category {
  969. case "Cc", "": // NUL has no category.
  970. property = "pC"
  971. case "Cf": // soft hyphen, unique category, not printable.
  972. property = "0"
  973. case "Ll":
  974. property = "pLl | pp"
  975. case "Lo":
  976. property = "pLo | pp"
  977. case "Lu":
  978. property = "pLu | pp"
  979. case "Nd", "No":
  980. property = "pN | pp"
  981. case "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps":
  982. property = "pP | pp"
  983. case "Sc", "Sk", "Sm", "So":
  984. property = "pS | pp"
  985. case "Zs":
  986. property = "pZ"
  987. default:
  988. logger.Fatalf("%U has unknown category %q", code, chars[code].category)
  989. }
  990. // Special case
  991. if code == ' ' {
  992. property = "pZ | pp"
  993. }
  994. printf("\t0x%02X: %s, // %q\n", code, property, code)
  995. }
  996. printf("}\n\n")
  997. }
  998. func printCasefold() {
  999. // Build list of case-folding groups attached to each canonical folded char (typically lower case).
  1000. var caseOrbit = make([][]rune, MaxChar+1)
  1001. for j := range chars {
  1002. i := rune(j)
  1003. c := &chars[i]
  1004. if c.foldCase == 0 {
  1005. continue
  1006. }
  1007. orb := caseOrbit[c.foldCase]
  1008. if orb == nil {
  1009. orb = append(orb, c.foldCase)
  1010. }
  1011. caseOrbit[c.foldCase] = append(orb, i)
  1012. }
  1013. // Insert explicit 1-element groups when assuming [lower, upper] would be wrong.
  1014. for j := range chars {
  1015. i := rune(j)
  1016. c := &chars[i]
  1017. f := c.foldCase
  1018. if f == 0 {
  1019. f = i
  1020. }
  1021. orb := caseOrbit[f]
  1022. if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) {
  1023. // Default assumption of [upper, lower] is wrong.
  1024. caseOrbit[i] = []rune{i}
  1025. }
  1026. }
  1027. // Delete the groups for which assuming [lower, upper] or [upper, lower] is right.
  1028. for i, orb := range caseOrbit {
  1029. if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] {
  1030. caseOrbit[i] = nil
  1031. }
  1032. if len(orb) == 2 && chars[orb[1]].upperCase == orb[0] && chars[orb[0]].lowerCase == orb[1] {
  1033. caseOrbit[i] = nil
  1034. }
  1035. }
  1036. // Record orbit information in chars.
  1037. for _, orb := range caseOrbit {
  1038. if orb == nil {
  1039. continue
  1040. }
  1041. sort.Slice(orb, func(i, j int) bool {
  1042. return orb[i] < orb[j]
  1043. })
  1044. c := orb[len(orb)-1]
  1045. for _, d := range orb {
  1046. chars[c].caseOrbit = d
  1047. c = d
  1048. }
  1049. }
  1050. printAsciiFold()
  1051. printCaseOrbit()
  1052. // Tables of category and script folding exceptions: code points
  1053. // that must be added when interpreting a particular category/script
  1054. // in a case-folding context.
  1055. cat := make(map[string]map[rune]bool)
  1056. for name := range category {
  1057. if x := foldExceptions(inCategory(name)); len(x) > 0 {
  1058. cat[name] = x
  1059. }
  1060. }
  1061. scr := make(map[string]map[rune]bool)
  1062. for name := range scripts {
  1063. if x := foldExceptions(inScript(name)); len(x) > 0 {
  1064. scr[name] = x
  1065. }
  1066. }
  1067. printCatFold("FoldCategory", cat)
  1068. printCatFold("FoldScript", scr)
  1069. }
  1070. // inCategory returns a list of all the runes in the category.
  1071. func inCategory(name string) []rune {
  1072. var x []rune
  1073. for j := range chars {
  1074. i := rune(j)
  1075. c := &chars[i]
  1076. if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] {
  1077. x = append(x, i)
  1078. }
  1079. }
  1080. return x
  1081. }
  1082. // inScript returns a list of all the runes in the script.
  1083. func inScript(name string) []rune {
  1084. var x []rune
  1085. for _, s := range scripts[name] {
  1086. for c := s.lo; c <= s.hi; c++ {
  1087. x = append(x, rune(c))
  1088. }
  1089. }
  1090. return x
  1091. }
  1092. // foldExceptions returns a list of all the runes fold-equivalent
  1093. // to runes in class but not in class themselves.
  1094. func foldExceptions(class []rune) map[rune]bool {
  1095. // Create map containing class and all fold-equivalent chars.
  1096. m := make(map[rune]bool)
  1097. for _, r := range class {
  1098. c := &chars[r]
  1099. if c.caseOrbit == 0 {
  1100. // Just upper and lower.
  1101. if u := c.upperCase; u != 0 {
  1102. m[u] = true
  1103. }
  1104. if l := c.lowerCase; l != 0 {
  1105. m[l] = true
  1106. }
  1107. m[r] = true
  1108. continue
  1109. }
  1110. // Otherwise walk orbit.
  1111. r0 := r
  1112. for {
  1113. m[r] = true
  1114. r = chars[r].caseOrbit
  1115. if r == r0 {
  1116. break
  1117. }
  1118. }
  1119. }
  1120. // Remove class itself.
  1121. for _, r := range class {
  1122. delete(m, r)
  1123. }
  1124. // What's left is the exceptions.
  1125. return m
  1126. }
  1127. var comment = map[string]string{
  1128. "FoldCategory": "// FoldCategory maps a category name to a table of\n" +
  1129. "// code points outside the category that are equivalent under\n" +
  1130. "// simple case folding to code points inside the category.\n" +
  1131. "// If there is no entry for a category name, there are no such points.\n",
  1132. "FoldScript": "// FoldScript maps a script name to a table of\n" +
  1133. "// code points outside the script that are equivalent under\n" +
  1134. "// simple case folding to code points inside the script.\n" +
  1135. "// If there is no entry for a script name, there are no such points.\n",
  1136. }
  1137. func printAsciiFold() {
  1138. printf("var asciiFold = [MaxASCII + 1]uint16{\n")
  1139. for i := rune(0); i <= unicode.MaxASCII; i++ {
  1140. c := chars[i]
  1141. f := c.caseOrbit
  1142. if f == 0 {
  1143. if c.lowerCase != i && c.lowerCase != 0 {
  1144. f = c.lowerCase
  1145. } else if c.upperCase != i && c.upperCase != 0 {
  1146. f = c.upperCase
  1147. } else {
  1148. f = i
  1149. }
  1150. }
  1151. printf("\t0x%04X,\n", f)
  1152. }
  1153. printf("}\n\n")
  1154. }
  1155. func printCaseOrbit() {
  1156. if *test {
  1157. for j := range chars {
  1158. i := rune(j)
  1159. c := &chars[i]
  1160. f := c.caseOrbit
  1161. if f == 0 {
  1162. if c.lowerCase != i && c.lowerCase != 0 {
  1163. f = c.lowerCase
  1164. } else if c.upperCase != i && c.upperCase != 0 {
  1165. f = c.upperCase
  1166. } else {
  1167. f = i
  1168. }
  1169. }
  1170. if g := unicode.SimpleFold(i); g != f {
  1171. fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f)
  1172. }
  1173. }
  1174. return
  1175. }
  1176. printf("var caseOrbit = []foldPair{\n")
  1177. for i := range chars {
  1178. c := &chars[i]
  1179. if c.caseOrbit != 0 {
  1180. printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit)
  1181. foldPairCount++
  1182. }
  1183. }
  1184. printf("}\n\n")
  1185. }
  1186. func printCatFold(name string, m map[string]map[rune]bool) {
  1187. if *test {
  1188. var pkgMap map[string]*unicode.RangeTable
  1189. if name == "FoldCategory" {
  1190. pkgMap = unicode.FoldCategory
  1191. } else {
  1192. pkgMap = unicode.FoldScript
  1193. }
  1194. if len(pkgMap) != len(m) {
  1195. fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m))
  1196. return
  1197. }
  1198. for k, v := range m {
  1199. t, ok := pkgMap[k]
  1200. if !ok {
  1201. fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k)
  1202. continue
  1203. }
  1204. n := 0
  1205. for _, r := range t.R16 {
  1206. for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) {
  1207. if !v[c] {
  1208. fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
  1209. }
  1210. n++
  1211. }
  1212. }
  1213. for _, r := range t.R32 {
  1214. for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) {
  1215. if !v[c] {
  1216. fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
  1217. }
  1218. n++
  1219. }
  1220. }
  1221. if n != len(v) {
  1222. fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v))
  1223. }
  1224. }
  1225. return
  1226. }
  1227. print(comment[name])
  1228. printf("var %s = map[string]*RangeTable{\n", name)
  1229. for _, name := range allCatFold(m) {
  1230. printf("\t%q: fold%s,\n", name, name)
  1231. }
  1232. printf("}\n\n")
  1233. for _, name := range allCatFold(m) {
  1234. class := m[name]
  1235. dumpRange(
  1236. fmt.Sprintf("var fold%s = &RangeTable{\n", name),
  1237. func(code rune) bool { return class[code] })
  1238. }
  1239. }
  1240. var range16Count = 0 // Number of entries in the 16-bit range tables.
  1241. var range32Count = 0 // Number of entries in the 32-bit range tables.
  1242. var foldPairCount = 0 // Number of fold pairs in the exception tables.
  1243. func printSizes() {
  1244. if *test {
  1245. return
  1246. }
  1247. println()
  1248. printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count)
  1249. range16Bytes := range16Count * 3 * 2
  1250. range32Bytes := range32Count * 3 * 4
  1251. printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes)
  1252. println()
  1253. printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2)
  1254. }