You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

140 lines
3.6 KiB

  1. // Copyright 2016 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package precis
  5. import "errors"
  6. // This file contains tables and code related to context rules.
  // catBitmap is a set of flag bits used as the state of the context-rule
  // checks. Each constant below occupies exactly one bit.
  type catBitmap uint16

  // Sticky bits: once set, depending on the current value, they are never
  // unset.
  const (
  	bJapanese                 catBitmap = 1 << 0
  	bArabicIndicDigit         catBitmap = 1 << 1
  	bExtendedArabicIndicDigit catBitmap = 1 << 2
  )

  // Per-value bits: these are set on each iteration depending on the current
  // value.
  const (
  	bJoinStart   catBitmap = 1 << 3
  	bJoinMid     catBitmap = 1 << 4
  	bJoinEnd     catBitmap = 1 << 5
  	bVirama      catBitmap = 1 << 6
  	bLatinSmallL catBitmap = 1 << 7
  	bGreek       catBitmap = 1 << 8
  	bHebrew      catBitmap = 1 << 9
  )

  // Requirement bits: these indicate which of the permanent bits need to be set
  // at the end of the checks.
  const bMustHaveJapn catBitmap = 1 << 10

  // permanent is the union of all bits that must survive every transition.
  const permanent = bJapanese | bArabicIndicDigit | bExtendedArabicIndicDigit | bMustHaveJapn

  // finalShift is the distance between a requirement bit and the sticky bit it
  // refers to: bMustHaveJapn>>finalShift == bJapanese.
  // NOTE(review): the code that applies this shift is outside this section;
  // confirm the intended use against the caller.
  const finalShift = 10
  var (
  	// errContext is the single error value returned by every contextual
  	// rule that rejects its input.
  	errContext = errors.New("precis: contextual rule violated")
  )
  28. func init() {
  29. // Programmatically set these required bits as, manually setting them seems
  30. // too error prone.
  31. for i, ct := range categoryTransitions {
  32. categoryTransitions[i].keep |= permanent
  33. categoryTransitions[i].accept |= ct.term
  34. }
  35. }
  36. var categoryTransitions = []struct {
  37. keep catBitmap // mask selecting which bits to keep from the previous state
  38. set catBitmap // mask for which bits to set for this transition
  39. // These bitmaps are used for rules that require lookahead.
  40. // term&accept == term must be true, which is enforced programmatically.
  41. term catBitmap // bits accepted as termination condition
  42. accept catBitmap // bits that pass, but not sufficient as termination
  43. // The rule function cannot take a *context as an argument, as it would
  44. // cause the context to escape, adding significant overhead.
  45. rule func(beforeBits catBitmap) (doLookahead bool, err error)
  46. }{
  47. joiningL: {set: bJoinStart},
  48. joiningD: {set: bJoinStart | bJoinEnd},
  49. joiningT: {keep: bJoinStart, set: bJoinMid},
  50. joiningR: {set: bJoinEnd},
  51. viramaModifier: {set: bVirama},
  52. viramaJoinT: {set: bVirama | bJoinMid},
  53. latinSmallL: {set: bLatinSmallL},
  54. greek: {set: bGreek},
  55. greekJoinT: {set: bGreek | bJoinMid},
  56. hebrew: {set: bHebrew},
  57. hebrewJoinT: {set: bHebrew | bJoinMid},
  58. japanese: {set: bJapanese},
  59. katakanaMiddleDot: {set: bMustHaveJapn},
  60. zeroWidthNonJoiner: {
  61. term: bJoinEnd,
  62. accept: bJoinMid,
  63. rule: func(before catBitmap) (doLookAhead bool, err error) {
  64. if before&bVirama != 0 {
  65. return false, nil
  66. }
  67. if before&bJoinStart == 0 {
  68. return false, errContext
  69. }
  70. return true, nil
  71. },
  72. },
  73. zeroWidthJoiner: {
  74. rule: func(before catBitmap) (doLookAhead bool, err error) {
  75. if before&bVirama == 0 {
  76. err = errContext
  77. }
  78. return false, err
  79. },
  80. },
  81. middleDot: {
  82. term: bLatinSmallL,
  83. rule: func(before catBitmap) (doLookAhead bool, err error) {
  84. if before&bLatinSmallL == 0 {
  85. return false, errContext
  86. }
  87. return true, nil
  88. },
  89. },
  90. greekLowerNumeralSign: {
  91. set: bGreek,
  92. term: bGreek,
  93. rule: func(before catBitmap) (doLookAhead bool, err error) {
  94. return true, nil
  95. },
  96. },
  97. hebrewPreceding: {
  98. set: bHebrew,
  99. rule: func(before catBitmap) (doLookAhead bool, err error) {
  100. if before&bHebrew == 0 {
  101. err = errContext
  102. }
  103. return false, err
  104. },
  105. },
  106. arabicIndicDigit: {
  107. set: bArabicIndicDigit,
  108. rule: func(before catBitmap) (doLookAhead bool, err error) {
  109. if before&bExtendedArabicIndicDigit != 0 {
  110. err = errContext
  111. }
  112. return false, err
  113. },
  114. },
  115. extendedArabicIndicDigit: {
  116. set: bExtendedArabicIndicDigit,
  117. rule: func(before catBitmap) (doLookAhead bool, err error) {
  118. if before&bArabicIndicDigit != 0 {
  119. err = errContext
  120. }
  121. return false, err
  122. },
  123. },
  124. }