You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

662 lines
20 KiB

  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Package transform provides reader and writer wrappers that transform the
  5. // bytes passing through as well as various transformations. Example
  6. // transformations provided by other packages include normalization and
  7. // conversion between character sets.
  8. package transform // import "golang.org/x/text/transform"
  9. import (
  10. "bytes"
  11. "errors"
  12. "io"
  13. "unicode/utf8"
  14. )
  15. var (
  16. // ErrShortDst means that the destination buffer was too short to
  17. // receive all of the transformed bytes.
  18. ErrShortDst = errors.New("transform: short destination buffer")
  19. // ErrShortSrc means that the source buffer has insufficient data to
  20. // complete the transformation.
  21. ErrShortSrc = errors.New("transform: short source buffer")
  22. // errInconsistentByteCount means that Transform returned success (nil
  23. // error) but also returned nSrc inconsistent with the src argument.
  24. errInconsistentByteCount = errors.New("transform: inconsistent byte count returned")
  25. // errShortInternal means that an internal buffer is not large enough
  26. // to make progress and the Transform operation must be aborted.
  27. errShortInternal = errors.New("transform: short internal buffer")
  28. )
  29. // Transformer transforms bytes.
  30. type Transformer interface {
  31. // Transform writes to dst the transformed bytes read from src, and
  32. // returns the number of dst bytes written and src bytes read. The
  33. // atEOF argument tells whether src represents the last bytes of the
  34. // input.
  35. //
  36. // Callers should always process the nDst bytes produced and account
  37. // for the nSrc bytes consumed before considering the error err.
  38. //
  39. // A nil error means that all of the transformed bytes (whether freshly
  40. // transformed from src or left over from previous Transform calls)
  41. // were written to dst. A nil error can be returned regardless of
  42. // whether atEOF is true. If err is nil then nSrc must equal len(src);
  43. // the converse is not necessarily true.
  44. //
  45. // ErrShortDst means that dst was too short to receive all of the
  46. // transformed bytes. ErrShortSrc means that src had insufficient data
  47. // to complete the transformation. If both conditions apply, then
  48. // either error may be returned. Other than the error conditions listed
  49. // here, implementations are free to report other errors that arise.
  50. Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error)
  51. // Reset resets the state and allows a Transformer to be reused.
  52. Reset()
  53. }
  54. // NopResetter can be embedded by implementations of Transformer to add a nop
  55. // Reset method.
  56. type NopResetter struct{}
  57. // Reset implements the Reset method of the Transformer interface.
  58. func (NopResetter) Reset() {}
  59. // Reader wraps another io.Reader by transforming the bytes read.
  60. type Reader struct {
  61. r io.Reader
  62. t Transformer
  63. err error
  64. // dst[dst0:dst1] contains bytes that have been transformed by t but
  65. // not yet copied out via Read.
  66. dst []byte
  67. dst0, dst1 int
  68. // src[src0:src1] contains bytes that have been read from r but not
  69. // yet transformed through t.
  70. src []byte
  71. src0, src1 int
  72. // transformComplete is whether the transformation is complete,
  73. // regardless of whether or not it was successful.
  74. transformComplete bool
  75. }
  76. const defaultBufSize = 4096
  77. // NewReader returns a new Reader that wraps r by transforming the bytes read
  78. // via t. It calls Reset on t.
  79. func NewReader(r io.Reader, t Transformer) *Reader {
  80. t.Reset()
  81. return &Reader{
  82. r: r,
  83. t: t,
  84. dst: make([]byte, defaultBufSize),
  85. src: make([]byte, defaultBufSize),
  86. }
  87. }
  88. // Read implements the io.Reader interface.
  89. func (r *Reader) Read(p []byte) (int, error) {
  90. n, err := 0, error(nil)
  91. for {
  92. // Copy out any transformed bytes and return the final error if we are done.
  93. if r.dst0 != r.dst1 {
  94. n = copy(p, r.dst[r.dst0:r.dst1])
  95. r.dst0 += n
  96. if r.dst0 == r.dst1 && r.transformComplete {
  97. return n, r.err
  98. }
  99. return n, nil
  100. } else if r.transformComplete {
  101. return 0, r.err
  102. }
  103. // Try to transform some source bytes, or to flush the transformer if we
  104. // are out of source bytes. We do this even if r.r.Read returned an error.
  105. // As the io.Reader documentation says, "process the n > 0 bytes returned
  106. // before considering the error".
  107. if r.src0 != r.src1 || r.err != nil {
  108. r.dst0 = 0
  109. r.dst1, n, err = r.t.Transform(r.dst, r.src[r.src0:r.src1], r.err == io.EOF)
  110. r.src0 += n
  111. switch {
  112. case err == nil:
  113. if r.src0 != r.src1 {
  114. r.err = errInconsistentByteCount
  115. }
  116. // The Transform call was successful; we are complete if we
  117. // cannot read more bytes into src.
  118. r.transformComplete = r.err != nil
  119. continue
  120. case err == ErrShortDst && (r.dst1 != 0 || n != 0):
  121. // Make room in dst by copying out, and try again.
  122. continue
  123. case err == ErrShortSrc && r.src1-r.src0 != len(r.src) && r.err == nil:
  124. // Read more bytes into src via the code below, and try again.
  125. default:
  126. r.transformComplete = true
  127. // The reader error (r.err) takes precedence over the
  128. // transformer error (err) unless r.err is nil or io.EOF.
  129. if r.err == nil || r.err == io.EOF {
  130. r.err = err
  131. }
  132. continue
  133. }
  134. }
  135. // Move any untransformed source bytes to the start of the buffer
  136. // and read more bytes.
  137. if r.src0 != 0 {
  138. r.src0, r.src1 = 0, copy(r.src, r.src[r.src0:r.src1])
  139. }
  140. n, r.err = r.r.Read(r.src[r.src1:])
  141. r.src1 += n
  142. }
  143. }
  144. // TODO: implement ReadByte (and ReadRune??).
  145. // Writer wraps another io.Writer by transforming the bytes read.
  146. // The user needs to call Close to flush unwritten bytes that may
  147. // be buffered.
  148. type Writer struct {
  149. w io.Writer
  150. t Transformer
  151. dst []byte
  152. // src[:n] contains bytes that have not yet passed through t.
  153. src []byte
  154. n int
  155. }
  156. // NewWriter returns a new Writer that wraps w by transforming the bytes written
  157. // via t. It calls Reset on t.
  158. func NewWriter(w io.Writer, t Transformer) *Writer {
  159. t.Reset()
  160. return &Writer{
  161. w: w,
  162. t: t,
  163. dst: make([]byte, defaultBufSize),
  164. src: make([]byte, defaultBufSize),
  165. }
  166. }
  167. // Write implements the io.Writer interface. If there are not enough
  168. // bytes available to complete a Transform, the bytes will be buffered
  169. // for the next write. Call Close to convert the remaining bytes.
  170. func (w *Writer) Write(data []byte) (n int, err error) {
  171. src := data
  172. if w.n > 0 {
  173. // Append bytes from data to the last remainder.
  174. // TODO: limit the amount copied on first try.
  175. n = copy(w.src[w.n:], data)
  176. w.n += n
  177. src = w.src[:w.n]
  178. }
  179. for {
  180. nDst, nSrc, err := w.t.Transform(w.dst, src, false)
  181. if _, werr := w.w.Write(w.dst[:nDst]); werr != nil {
  182. return n, werr
  183. }
  184. src = src[nSrc:]
  185. if w.n == 0 {
  186. n += nSrc
  187. } else if len(src) <= n {
  188. // Enough bytes from w.src have been consumed. We make src point
  189. // to data instead to reduce the copying.
  190. w.n = 0
  191. n -= len(src)
  192. src = data[n:]
  193. if n < len(data) && (err == nil || err == ErrShortSrc) {
  194. continue
  195. }
  196. }
  197. switch err {
  198. case ErrShortDst:
  199. // This error is okay as long as we are making progress.
  200. if nDst > 0 || nSrc > 0 {
  201. continue
  202. }
  203. case ErrShortSrc:
  204. if len(src) < len(w.src) {
  205. m := copy(w.src, src)
  206. // If w.n > 0, bytes from data were already copied to w.src and n
  207. // was already set to the number of bytes consumed.
  208. if w.n == 0 {
  209. n += m
  210. }
  211. w.n = m
  212. err = nil
  213. } else if nDst > 0 || nSrc > 0 {
  214. // Not enough buffer to store the remainder. Keep processing as
  215. // long as there is progress. Without this case, transforms that
  216. // require a lookahead larger than the buffer may result in an
  217. // error. This is not something one may expect to be common in
  218. // practice, but it may occur when buffers are set to small
  219. // sizes during testing.
  220. continue
  221. }
  222. case nil:
  223. if w.n > 0 {
  224. err = errInconsistentByteCount
  225. }
  226. }
  227. return n, err
  228. }
  229. }
  230. // Close implements the io.Closer interface.
  231. func (w *Writer) Close() error {
  232. src := w.src[:w.n]
  233. for {
  234. nDst, nSrc, err := w.t.Transform(w.dst, src, true)
  235. if _, werr := w.w.Write(w.dst[:nDst]); werr != nil {
  236. return werr
  237. }
  238. if err != ErrShortDst {
  239. return err
  240. }
  241. src = src[nSrc:]
  242. }
  243. }
  244. type nop struct{ NopResetter }
  245. func (nop) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  246. n := copy(dst, src)
  247. if n < len(src) {
  248. err = ErrShortDst
  249. }
  250. return n, n, err
  251. }
  252. type discard struct{ NopResetter }
  253. func (discard) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  254. return 0, len(src), nil
  255. }
  256. var (
  257. // Discard is a Transformer for which all Transform calls succeed
  258. // by consuming all bytes and writing nothing.
  259. Discard Transformer = discard{}
  260. // Nop is a Transformer that copies src to dst.
  261. Nop Transformer = nop{}
  262. )
  263. // chain is a sequence of links. A chain with N Transformers has N+1 links and
  264. // N+1 buffers. Of those N+1 buffers, the first and last are the src and dst
  265. // buffers given to chain.Transform and the middle N-1 buffers are intermediate
  266. // buffers owned by the chain. The i'th link transforms bytes from the i'th
  267. // buffer chain.link[i].b at read offset chain.link[i].p to the i+1'th buffer
  268. // chain.link[i+1].b at write offset chain.link[i+1].n, for i in [0, N).
  269. type chain struct {
  270. link []link
  271. err error
  272. // errStart is the index at which the error occurred plus 1. Processing
  273. // errStart at this level at the next call to Transform. As long as
  274. // errStart > 0, chain will not consume any more source bytes.
  275. errStart int
  276. }
  277. func (c *chain) fatalError(errIndex int, err error) {
  278. if i := errIndex + 1; i > c.errStart {
  279. c.errStart = i
  280. c.err = err
  281. }
  282. }
  283. type link struct {
  284. t Transformer
  285. // b[p:n] holds the bytes to be transformed by t.
  286. b []byte
  287. p int
  288. n int
  289. }
  290. func (l *link) src() []byte {
  291. return l.b[l.p:l.n]
  292. }
  293. func (l *link) dst() []byte {
  294. return l.b[l.n:]
  295. }
  296. // Chain returns a Transformer that applies t in sequence.
  297. func Chain(t ...Transformer) Transformer {
  298. if len(t) == 0 {
  299. return nop{}
  300. }
  301. c := &chain{link: make([]link, len(t)+1)}
  302. for i, tt := range t {
  303. c.link[i].t = tt
  304. }
  305. // Allocate intermediate buffers.
  306. b := make([][defaultBufSize]byte, len(t)-1)
  307. for i := range b {
  308. c.link[i+1].b = b[i][:]
  309. }
  310. return c
  311. }
  312. // Reset resets the state of Chain. It calls Reset on all the Transformers.
  313. func (c *chain) Reset() {
  314. for i, l := range c.link {
  315. if l.t != nil {
  316. l.t.Reset()
  317. }
  318. c.link[i].p, c.link[i].n = 0, 0
  319. }
  320. }
  321. // Transform applies the transformers of c in sequence.
  322. func (c *chain) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  323. // Set up src and dst in the chain.
  324. srcL := &c.link[0]
  325. dstL := &c.link[len(c.link)-1]
  326. srcL.b, srcL.p, srcL.n = src, 0, len(src)
  327. dstL.b, dstL.n = dst, 0
  328. var lastFull, needProgress bool // for detecting progress
  329. // i is the index of the next Transformer to apply, for i in [low, high].
  330. // low is the lowest index for which c.link[low] may still produce bytes.
  331. // high is the highest index for which c.link[high] has a Transformer.
  332. // The error returned by Transform determines whether to increase or
  333. // decrease i. We try to completely fill a buffer before converting it.
  334. for low, i, high := c.errStart, c.errStart, len(c.link)-2; low <= i && i <= high; {
  335. in, out := &c.link[i], &c.link[i+1]
  336. nDst, nSrc, err0 := in.t.Transform(out.dst(), in.src(), atEOF && low == i)
  337. out.n += nDst
  338. in.p += nSrc
  339. if i > 0 && in.p == in.n {
  340. in.p, in.n = 0, 0
  341. }
  342. needProgress, lastFull = lastFull, false
  343. switch err0 {
  344. case ErrShortDst:
  345. // Process the destination buffer next. Return if we are already
  346. // at the high index.
  347. if i == high {
  348. return dstL.n, srcL.p, ErrShortDst
  349. }
  350. if out.n != 0 {
  351. i++
  352. // If the Transformer at the next index is not able to process any
  353. // source bytes there is nothing that can be done to make progress
  354. // and the bytes will remain unprocessed. lastFull is used to
  355. // detect this and break out of the loop with a fatal error.
  356. lastFull = true
  357. continue
  358. }
  359. // The destination buffer was too small, but is completely empty.
  360. // Return a fatal error as this transformation can never complete.
  361. c.fatalError(i, errShortInternal)
  362. case ErrShortSrc:
  363. if i == 0 {
  364. // Save ErrShortSrc in err. All other errors take precedence.
  365. err = ErrShortSrc
  366. break
  367. }
  368. // Source bytes were depleted before filling up the destination buffer.
  369. // Verify we made some progress, move the remaining bytes to the errStart
  370. // and try to get more source bytes.
  371. if needProgress && nSrc == 0 || in.n-in.p == len(in.b) {
  372. // There were not enough source bytes to proceed while the source
  373. // buffer cannot hold any more bytes. Return a fatal error as this
  374. // transformation can never complete.
  375. c.fatalError(i, errShortInternal)
  376. break
  377. }
  378. // in.b is an internal buffer and we can make progress.
  379. in.p, in.n = 0, copy(in.b, in.src())
  380. fallthrough
  381. case nil:
  382. // if i == low, we have depleted the bytes at index i or any lower levels.
  383. // In that case we increase low and i. In all other cases we decrease i to
  384. // fetch more bytes before proceeding to the next index.
  385. if i > low {
  386. i--
  387. continue
  388. }
  389. default:
  390. c.fatalError(i, err0)
  391. }
  392. // Exhausted level low or fatal error: increase low and continue
  393. // to process the bytes accepted so far.
  394. i++
  395. low = i
  396. }
  397. // If c.errStart > 0, this means we found a fatal error. We will clear
  398. // all upstream buffers. At this point, no more progress can be made
  399. // downstream, as Transform would have bailed while handling ErrShortDst.
  400. if c.errStart > 0 {
  401. for i := 1; i < c.errStart; i++ {
  402. c.link[i].p, c.link[i].n = 0, 0
  403. }
  404. err, c.errStart, c.err = c.err, 0, nil
  405. }
  406. return dstL.n, srcL.p, err
  407. }
  408. // RemoveFunc returns a Transformer that removes from the input all runes r for
  409. // which f(r) is true. Illegal bytes in the input are replaced by RuneError.
  410. func RemoveFunc(f func(r rune) bool) Transformer {
  411. return removeF(f)
  412. }
  413. type removeF func(r rune) bool
  414. func (removeF) Reset() {}
  415. // Transform implements the Transformer interface.
  416. func (t removeF) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  417. for r, sz := rune(0), 0; len(src) > 0; src = src[sz:] {
  418. if r = rune(src[0]); r < utf8.RuneSelf {
  419. sz = 1
  420. } else {
  421. r, sz = utf8.DecodeRune(src)
  422. if sz == 1 {
  423. // Invalid rune.
  424. if !atEOF && !utf8.FullRune(src) {
  425. err = ErrShortSrc
  426. break
  427. }
  428. // We replace illegal bytes with RuneError. Not doing so might
  429. // otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
  430. // The resulting byte sequence may subsequently contain runes
  431. // for which t(r) is true that were passed unnoticed.
  432. if !t(r) {
  433. if nDst+3 > len(dst) {
  434. err = ErrShortDst
  435. break
  436. }
  437. nDst += copy(dst[nDst:], "\uFFFD")
  438. }
  439. nSrc++
  440. continue
  441. }
  442. }
  443. if !t(r) {
  444. if nDst+sz > len(dst) {
  445. err = ErrShortDst
  446. break
  447. }
  448. nDst += copy(dst[nDst:], src[:sz])
  449. }
  450. nSrc += sz
  451. }
  452. return
  453. }
  454. // grow returns a new []byte that is longer than b, and copies the first n bytes
  455. // of b to the start of the new slice.
  456. func grow(b []byte, n int) []byte {
  457. m := len(b)
  458. if m <= 32 {
  459. m = 64
  460. } else if m <= 256 {
  461. m *= 2
  462. } else {
  463. m += m >> 1
  464. }
  465. buf := make([]byte, m)
  466. copy(buf, b[:n])
  467. return buf
  468. }
  469. const initialBufSize = 128
  470. // String returns a string with the result of converting s[:n] using t, where
  471. // n <= len(s). If err == nil, n will be len(s). It calls Reset on t.
  472. func String(t Transformer, s string) (result string, n int, err error) {
  473. t.Reset()
  474. if s == "" {
  475. // Fast path for the common case for empty input. Results in about a
  476. // 86% reduction of running time for BenchmarkStringLowerEmpty.
  477. if _, _, err := t.Transform(nil, nil, true); err == nil {
  478. return "", 0, nil
  479. }
  480. }
  481. // Allocate only once. Note that both dst and src escape when passed to
  482. // Transform.
  483. buf := [2 * initialBufSize]byte{}
  484. dst := buf[:initialBufSize:initialBufSize]
  485. src := buf[initialBufSize : 2*initialBufSize]
  486. // The input string s is transformed in multiple chunks (starting with a
  487. // chunk size of initialBufSize). nDst and nSrc are per-chunk (or
  488. // per-Transform-call) indexes, pDst and pSrc are overall indexes.
  489. nDst, nSrc := 0, 0
  490. pDst, pSrc := 0, 0
  491. // pPrefix is the length of a common prefix: the first pPrefix bytes of the
  492. // result will equal the first pPrefix bytes of s. It is not guaranteed to
  493. // be the largest such value, but if pPrefix, len(result) and len(s) are
  494. // all equal after the final transform (i.e. calling Transform with atEOF
  495. // being true returned nil error) then we don't need to allocate a new
  496. // result string.
  497. pPrefix := 0
  498. for {
  499. // Invariant: pDst == pPrefix && pSrc == pPrefix.
  500. n := copy(src, s[pSrc:])
  501. nDst, nSrc, err = t.Transform(dst, src[:n], pSrc+n == len(s))
  502. pDst += nDst
  503. pSrc += nSrc
  504. // TODO: let transformers implement an optional Spanner interface, akin
  505. // to norm's QuickSpan. This would even allow us to avoid any allocation.
  506. if !bytes.Equal(dst[:nDst], src[:nSrc]) {
  507. break
  508. }
  509. pPrefix = pSrc
  510. if err == ErrShortDst {
  511. // A buffer can only be short if a transformer modifies its input.
  512. break
  513. } else if err == ErrShortSrc {
  514. if nSrc == 0 {
  515. // No progress was made.
  516. break
  517. }
  518. // Equal so far and !atEOF, so continue checking.
  519. } else if err != nil || pPrefix == len(s) {
  520. return string(s[:pPrefix]), pPrefix, err
  521. }
  522. }
  523. // Post-condition: pDst == pPrefix + nDst && pSrc == pPrefix + nSrc.
  524. // We have transformed the first pSrc bytes of the input s to become pDst
  525. // transformed bytes. Those transformed bytes are discontiguous: the first
  526. // pPrefix of them equal s[:pPrefix] and the last nDst of them equal
  527. // dst[:nDst]. We copy them around, into a new dst buffer if necessary, so
  528. // that they become one contiguous slice: dst[:pDst].
  529. if pPrefix != 0 {
  530. newDst := dst
  531. if pDst > len(newDst) {
  532. newDst = make([]byte, len(s)+nDst-nSrc)
  533. }
  534. copy(newDst[pPrefix:pDst], dst[:nDst])
  535. copy(newDst[:pPrefix], s[:pPrefix])
  536. dst = newDst
  537. }
  538. // Prevent duplicate Transform calls with atEOF being true at the end of
  539. // the input. Also return if we have an unrecoverable error.
  540. if (err == nil && pSrc == len(s)) ||
  541. (err != nil && err != ErrShortDst && err != ErrShortSrc) {
  542. return string(dst[:pDst]), pSrc, err
  543. }
  544. // Transform the remaining input, growing dst and src buffers as necessary.
  545. for {
  546. n := copy(src, s[pSrc:])
  547. nDst, nSrc, err := t.Transform(dst[pDst:], src[:n], pSrc+n == len(s))
  548. pDst += nDst
  549. pSrc += nSrc
  550. // If we got ErrShortDst or ErrShortSrc, do not grow as long as we can
  551. // make progress. This may avoid excessive allocations.
  552. if err == ErrShortDst {
  553. if nDst == 0 {
  554. dst = grow(dst, pDst)
  555. }
  556. } else if err == ErrShortSrc {
  557. if nSrc == 0 {
  558. src = grow(src, 0)
  559. }
  560. } else if err != nil || pSrc == len(s) {
  561. return string(dst[:pDst]), pSrc, err
  562. }
  563. }
  564. }
  565. // Bytes returns a new byte slice with the result of converting b[:n] using t,
  566. // where n <= len(b). If err == nil, n will be len(b). It calls Reset on t.
  567. func Bytes(t Transformer, b []byte) (result []byte, n int, err error) {
  568. return doAppend(t, 0, make([]byte, len(b)), b)
  569. }
  570. // Append appends the result of converting src[:n] using t to dst, where
  571. // n <= len(src), If err == nil, n will be len(src). It calls Reset on t.
  572. func Append(t Transformer, dst, src []byte) (result []byte, n int, err error) {
  573. if len(dst) == cap(dst) {
  574. n := len(src) + len(dst) // It is okay for this to be 0.
  575. b := make([]byte, n)
  576. dst = b[:copy(b, dst)]
  577. }
  578. return doAppend(t, len(dst), dst[:cap(dst)], src)
  579. }
  580. func doAppend(t Transformer, pDst int, dst, src []byte) (result []byte, n int, err error) {
  581. t.Reset()
  582. pSrc := 0
  583. for {
  584. nDst, nSrc, err := t.Transform(dst[pDst:], src[pSrc:], true)
  585. pDst += nDst
  586. pSrc += nSrc
  587. if err != ErrShortDst {
  588. return dst[:pDst], pSrc, err
  589. }
  590. // Grow the destination buffer, but do not grow as long as we can make
  591. // progress. This may avoid excessive allocations.
  592. if nDst == 0 {
  593. dst = grow(dst, pDst)
  594. }
  595. }
  596. }