You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

207 line
5.7 KiB

  1. // Copyright 2016 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package bidi
  5. import "unicode/utf8"
  // Properties holds the packed BiDi property data of a single rune, as
  // returned by Lookup, LookupString and LookupRune.
  type Properties struct {
  	// entry is the value read from the lookup tables: its low four bits
  	// hold the class (see Class) and its high four bits are non-zero for
  	// brackets (see IsBracket).
  	//
  	// last is the final byte of the rune's UTF-8 encoding. The lookup
  	// functions only record it for three-byte encodings; Class uses it
  	// to tell the individual control classes apart.
  	entry, last uint8
  }
  // trie is the package-level lookup structure; Lookup and LookupString read
  // property values from it through its lookupValue method.
  // NOTE(review): the meaning of the 0 argument to newBidiTrie is not visible
  // in this file — confirm against the trie constructor.
  11. var trie = newBidiTrie(0)
  12. // TODO: using this for bidirule reduces the running time by about 5%. Consider
  13. // if this is worth exposing or if we can find a way to speed up the Class
  14. // method.
  15. //
  16. // // CompactClass is like Class, but maps all of the BiDi control classes
  17. // // (LRO, RLO, LRE, RLE, PDF, LRI, RLI, FSI, PDI) to the class Control.
  18. // func (p Properties) CompactClass() Class {
  19. // return Class(p.entry & 0x0F)
  20. // }
  21. // Class returns the Bidi class for p.
  22. func (p Properties) Class() Class {
  23. c := Class(p.entry & 0x0F)
  24. if c == Control {
  25. c = controlByteToClass[p.last&0xF]
  26. }
  27. return c
  28. }
  29. // IsBracket reports whether the rune is a bracket.
  30. func (p Properties) IsBracket() bool { return p.entry&0xF0 != 0 }
  31. // IsOpeningBracket reports whether the rune is an opening bracket.
  32. // IsBracket must return true.
  33. func (p Properties) IsOpeningBracket() bool { return p.entry&openMask != 0 }
  34. // TODO: find a better API and expose.
  35. func (p Properties) reverseBracket(r rune) rune {
  36. return xorMasks[p.entry>>xorMaskShift] ^ r
  37. }
  38. var controlByteToClass = [16]Class{
  39. 0xD: LRO, // U+202D LeftToRightOverride,
  40. 0xE: RLO, // U+202E RightToLeftOverride,
  41. 0xA: LRE, // U+202A LeftToRightEmbedding,
  42. 0xB: RLE, // U+202B RightToLeftEmbedding,
  43. 0xC: PDF, // U+202C PopDirectionalFormat,
  44. 0x6: LRI, // U+2066 LeftToRightIsolate,
  45. 0x7: RLI, // U+2067 RightToLeftIsolate,
  46. 0x8: FSI, // U+2068 FirstStrongIsolate,
  47. 0x9: PDI, // U+2069 PopDirectionalIsolate,
  48. }
  49. // LookupRune returns properties for r.
  50. func LookupRune(r rune) (p Properties, size int) {
  51. var buf [4]byte
  52. n := utf8.EncodeRune(buf[:], r)
  53. return Lookup(buf[:n])
  54. }
  55. // TODO: these lookup methods are based on the generated trie code. The returned
  56. // sizes have slightly different semantics from the generated code, in that it
  57. // always returns size==1 for an illegal UTF-8 byte (instead of the length
  58. // of the maximum invalid subsequence). Most Transformers, like unicode/norm,
  59. // leave invalid UTF-8 untouched, in which case it has performance benefits to
  60. // do so (without changing the semantics). Bidi requires the semantics used here
  61. // for the bidirule implementation to be compatible with the Go semantics.
  62. // They ultimately should perhaps be adopted by all trie implementations, for
  63. // convenience's sake.
  64. // This unrolled code also boosts performance of the secure/bidirule package by
  65. // about 30%.
  66. // So, to remove this code:
  67. // - add option to trie generator to define return type.
  68. // - always return 1 byte size for ill-formed UTF-8 runes.
  69. // Lookup returns properties for the first rune in s and the width in bytes of
  70. // its encoding. The size will be 0 if s does not hold enough bytes to complete
  71. // the encoding.
  72. func Lookup(s []byte) (p Properties, sz int) {
  73. c0 := s[0]
  74. switch {
  75. case c0 < 0x80: // is ASCII
  76. return Properties{entry: bidiValues[c0]}, 1
  77. case c0 < 0xC2:
  78. return Properties{}, 1
  79. case c0 < 0xE0: // 2-byte UTF-8
  80. if len(s) < 2 {
  81. return Properties{}, 0
  82. }
  83. i := bidiIndex[c0]
  84. c1 := s[1]
  85. if c1 < 0x80 || 0xC0 <= c1 {
  86. return Properties{}, 1
  87. }
  88. return Properties{entry: trie.lookupValue(uint32(i), c1)}, 2
  89. case c0 < 0xF0: // 3-byte UTF-8
  90. if len(s) < 3 {
  91. return Properties{}, 0
  92. }
  93. i := bidiIndex[c0]
  94. c1 := s[1]
  95. if c1 < 0x80 || 0xC0 <= c1 {
  96. return Properties{}, 1
  97. }
  98. o := uint32(i)<<6 + uint32(c1)
  99. i = bidiIndex[o]
  100. c2 := s[2]
  101. if c2 < 0x80 || 0xC0 <= c2 {
  102. return Properties{}, 1
  103. }
  104. return Properties{entry: trie.lookupValue(uint32(i), c2), last: c2}, 3
  105. case c0 < 0xF8: // 4-byte UTF-8
  106. if len(s) < 4 {
  107. return Properties{}, 0
  108. }
  109. i := bidiIndex[c0]
  110. c1 := s[1]
  111. if c1 < 0x80 || 0xC0 <= c1 {
  112. return Properties{}, 1
  113. }
  114. o := uint32(i)<<6 + uint32(c1)
  115. i = bidiIndex[o]
  116. c2 := s[2]
  117. if c2 < 0x80 || 0xC0 <= c2 {
  118. return Properties{}, 1
  119. }
  120. o = uint32(i)<<6 + uint32(c2)
  121. i = bidiIndex[o]
  122. c3 := s[3]
  123. if c3 < 0x80 || 0xC0 <= c3 {
  124. return Properties{}, 1
  125. }
  126. return Properties{entry: trie.lookupValue(uint32(i), c3)}, 4
  127. }
  128. // Illegal rune
  129. return Properties{}, 1
  130. }
  131. // LookupString returns properties for the first rune in s and the width in
  132. // bytes of its encoding. The size will be 0 if s does not hold enough bytes to
  133. // complete the encoding.
  134. func LookupString(s string) (p Properties, sz int) {
  135. c0 := s[0]
  136. switch {
  137. case c0 < 0x80: // is ASCII
  138. return Properties{entry: bidiValues[c0]}, 1
  139. case c0 < 0xC2:
  140. return Properties{}, 1
  141. case c0 < 0xE0: // 2-byte UTF-8
  142. if len(s) < 2 {
  143. return Properties{}, 0
  144. }
  145. i := bidiIndex[c0]
  146. c1 := s[1]
  147. if c1 < 0x80 || 0xC0 <= c1 {
  148. return Properties{}, 1
  149. }
  150. return Properties{entry: trie.lookupValue(uint32(i), c1)}, 2
  151. case c0 < 0xF0: // 3-byte UTF-8
  152. if len(s) < 3 {
  153. return Properties{}, 0
  154. }
  155. i := bidiIndex[c0]
  156. c1 := s[1]
  157. if c1 < 0x80 || 0xC0 <= c1 {
  158. return Properties{}, 1
  159. }
  160. o := uint32(i)<<6 + uint32(c1)
  161. i = bidiIndex[o]
  162. c2 := s[2]
  163. if c2 < 0x80 || 0xC0 <= c2 {
  164. return Properties{}, 1
  165. }
  166. return Properties{entry: trie.lookupValue(uint32(i), c2), last: c2}, 3
  167. case c0 < 0xF8: // 4-byte UTF-8
  168. if len(s) < 4 {
  169. return Properties{}, 0
  170. }
  171. i := bidiIndex[c0]
  172. c1 := s[1]
  173. if c1 < 0x80 || 0xC0 <= c1 {
  174. return Properties{}, 1
  175. }
  176. o := uint32(i)<<6 + uint32(c1)
  177. i = bidiIndex[o]
  178. c2 := s[2]
  179. if c2 < 0x80 || 0xC0 <= c2 {
  180. return Properties{}, 1
  181. }
  182. o = uint32(i)<<6 + uint32(c2)
  183. i = bidiIndex[o]
  184. c3 := s[3]
  185. if c3 < 0x80 || 0xC0 <= c3 {
  186. return Properties{}, 1
  187. }
  188. return Properties{entry: trie.lookupValue(uint32(i), c3)}, 4
  189. }
  190. // Illegal rune
  191. return Properties{}, 1
  192. }