Nie możesz wybrać więcej, niż 25 tematów Tematy muszą się zaczynać od litery lub cyfry, mogą zawierać myślniki ('-') i mogą mieć do 35 znaków.
 
 
 

279 wiersze
6.7 KiB

  1. // Copyright 2014 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Package runes provides transforms for UTF-8 encoded text.
  5. package runes // import "golang.org/x/text/runes"
  6. import (
  7. "unicode"
  8. "unicode/utf8"
  9. "golang.org/x/text/transform"
  10. )
  // Set describes a collection of runes that can be queried for membership.
  type Set interface {
  	// Contains reports whether the rune r is a member of the set.
  	Contains(r rune) bool
  }
  // setFunc adapts an ordinary predicate function so that it satisfies Set.
  type setFunc func(rune) bool

  // Contains reports the result of invoking the underlying predicate on r.
  func (fn setFunc) Contains(r rune) bool {
  	inSet := fn(r)
  	return inSet
  }
  20. // Note: using funcs here instead of wrapping types results in cleaner
  21. // documentation and a smaller API.
  22. // In creates a Set with a Contains method that returns true for all runes in
  23. // the given RangeTable.
  24. func In(rt *unicode.RangeTable) Set {
  25. return setFunc(func(r rune) bool { return unicode.Is(rt, r) })
  26. }
  27. // In creates a Set with a Contains method that returns true for all runes not
  28. // in the given RangeTable.
  29. func NotIn(rt *unicode.RangeTable) Set {
  30. return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })
  31. }
  32. // Predicate creates a Set with a Contains method that returns f(r).
  33. func Predicate(f func(rune) bool) Set {
  34. return setFunc(f)
  35. }
  36. // Transformer implements the transform.Transformer interface.
  37. type Transformer struct {
  38. transform.Transformer
  39. }
  40. // Bytes returns a new byte slice with the result of converting b using t. It
  41. // calls Reset on t. It returns nil if any error was found. This can only happen
  42. // if an error-producing Transformer is passed to If.
  43. func (t Transformer) Bytes(b []byte) []byte {
  44. b, _, err := transform.Bytes(t, b)
  45. if err != nil {
  46. return nil
  47. }
  48. return b
  49. }
  50. // String returns a string with the result of converting s using t. It calls
  51. // Reset on t. It returns the empty string if any error was found. This can only
  52. // happen if an error-producing Transformer is passed to If.
  53. func (t Transformer) String(s string) string {
  54. s, _, err := transform.String(t, s)
  55. if err != nil {
  56. return ""
  57. }
  58. return s
  59. }
  60. // TODO:
  61. // - Copy: copying strings and bytes in whole-rune units.
  62. // - Validation (maybe)
  63. // - Well-formed-ness (maybe)
  const (
  	// runeErrorString holds the UTF-8 encoding of utf8.RuneError (U+FFFD),
  	// which is three bytes long. The transformers index into it byte by byte
  	// when they substitute the replacement character for ill-formed input.
  	runeErrorString = string(utf8.RuneError)
  )
  65. // Remove returns a Transformer that removes runes r for which s.Contains(r).
  66. // Illegal input bytes are replaced by RuneError before being passed to f.
  67. func Remove(s Set) Transformer {
  68. if f, ok := s.(setFunc); ok {
  69. // This little trick cuts the running time of BenchmarkRemove for sets
  70. // created by Predicate roughly in half.
  71. // TODO: special-case RangeTables as well.
  72. return Transformer{remove(f)}
  73. }
  74. return Transformer{remove(s.Contains)}
  75. }
  76. // TODO: remove transform.RemoveFunc.
  // remove is the function type behind Remove: it returns true for every rune
  // that its Transform method should drop from the input.
  type remove func(rune) bool

  // Reset does nothing; remove keeps no state of its own between calls.
  func (t remove) Reset() {}
  79. // Transform implements transform.Transformer.
  80. func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  81. for r, size := rune(0), 0; nSrc < len(src); {
  82. if r = rune(src[nSrc]); r < utf8.RuneSelf {
  83. size = 1
  84. } else {
  85. r, size = utf8.DecodeRune(src[nSrc:])
  86. if size == 1 {
  87. // Invalid rune.
  88. if !atEOF && !utf8.FullRune(src[nSrc:]) {
  89. err = transform.ErrShortSrc
  90. break
  91. }
  92. // We replace illegal bytes with RuneError. Not doing so might
  93. // otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
  94. // The resulting byte sequence may subsequently contain runes
  95. // for which t(r) is true that were passed unnoticed.
  96. if !t(utf8.RuneError) {
  97. if nDst+3 > len(dst) {
  98. err = transform.ErrShortDst
  99. break
  100. }
  101. dst[nDst+0] = runeErrorString[0]
  102. dst[nDst+1] = runeErrorString[1]
  103. dst[nDst+2] = runeErrorString[2]
  104. nDst += 3
  105. }
  106. nSrc++
  107. continue
  108. }
  109. }
  110. if t(r) {
  111. nSrc += size
  112. continue
  113. }
  114. if nDst+size > len(dst) {
  115. err = transform.ErrShortDst
  116. break
  117. }
  118. for i := 0; i < size; i++ {
  119. dst[nDst] = src[nSrc]
  120. nDst++
  121. nSrc++
  122. }
  123. }
  124. return
  125. }
  126. // Map returns a Transformer that maps the runes in the input using the given
  127. // mapping. Illegal bytes in the input are converted to utf8.RuneError before
  128. // being passed to the mapping func.
  129. func Map(mapping func(rune) rune) Transformer {
  130. return Transformer{mapper(mapping)}
  131. }
  // mapper is the function type behind Map: it returns the replacement for a
  // given input rune.
  type mapper func(rune) rune

  // Reset does nothing; mapper keeps no state of its own between calls.
  func (t mapper) Reset() {}
  134. // Transform implements transform.Transformer.
  135. func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  136. var replacement rune
  137. var b [utf8.UTFMax]byte
  138. for r, size := rune(0), 0; nSrc < len(src); {
  139. if r = rune(src[nSrc]); r < utf8.RuneSelf {
  140. if replacement = t(r); replacement < utf8.RuneSelf {
  141. if nDst == len(dst) {
  142. err = transform.ErrShortDst
  143. break
  144. }
  145. dst[nDst] = byte(replacement)
  146. nDst++
  147. nSrc++
  148. continue
  149. }
  150. size = 1
  151. } else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
  152. // Invalid rune.
  153. if !atEOF && !utf8.FullRune(src[nSrc:]) {
  154. err = transform.ErrShortSrc
  155. break
  156. }
  157. if replacement = t(utf8.RuneError); replacement == utf8.RuneError {
  158. if nDst+3 > len(dst) {
  159. err = transform.ErrShortDst
  160. break
  161. }
  162. dst[nDst+0] = runeErrorString[0]
  163. dst[nDst+1] = runeErrorString[1]
  164. dst[nDst+2] = runeErrorString[2]
  165. nDst += 3
  166. nSrc++
  167. continue
  168. }
  169. } else if replacement = t(r); replacement == r {
  170. if nDst+size > len(dst) {
  171. err = transform.ErrShortDst
  172. break
  173. }
  174. for i := 0; i < size; i++ {
  175. dst[nDst] = src[nSrc]
  176. nDst++
  177. nSrc++
  178. }
  179. continue
  180. }
  181. n := utf8.EncodeRune(b[:], replacement)
  182. if nDst+n > len(dst) {
  183. err = transform.ErrShortDst
  184. break
  185. }
  186. for i := 0; i < n; i++ {
  187. dst[nDst] = b[i]
  188. nDst++
  189. }
  190. nSrc += size
  191. }
  192. return
  193. }
  194. // ReplaceIllFormed returns a transformer that replaces all input bytes that are
  195. // not part of a well-formed UTF-8 code sequence with utf8.RuneError.
  196. func ReplaceIllFormed() Transformer {
  197. return Transformer{&replaceIllFormed{}}
  198. }
  199. type replaceIllFormed struct{ transform.NopResetter }
  200. func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  201. for nSrc < len(src) {
  202. r, size := utf8.DecodeRune(src[nSrc:])
  203. // Look for an ASCII rune.
  204. if r < utf8.RuneSelf {
  205. if nDst == len(dst) {
  206. err = transform.ErrShortDst
  207. break
  208. }
  209. dst[nDst] = byte(r)
  210. nDst++
  211. nSrc++
  212. continue
  213. }
  214. // Look for a valid non-ASCII rune.
  215. if r != utf8.RuneError || size != 1 {
  216. if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
  217. err = transform.ErrShortDst
  218. break
  219. }
  220. nDst += size
  221. nSrc += size
  222. continue
  223. }
  224. // Look for short source data.
  225. if !atEOF && !utf8.FullRune(src[nSrc:]) {
  226. err = transform.ErrShortSrc
  227. break
  228. }
  229. // We have an invalid rune.
  230. if nDst+3 > len(dst) {
  231. err = transform.ErrShortDst
  232. break
  233. }
  234. dst[nDst+0] = runeErrorString[0]
  235. dst[nDst+1] = runeErrorString[1]
  236. dst[nDst+2] = runeErrorString[2]
  237. nDst += 3
  238. nSrc++
  239. }
  240. return nDst, nSrc, err
  241. }