Nie możesz wybrać więcej niż 25 tematów. Tematy muszą się zaczynać od litery lub cyfry, mogą zawierać myślniki ('-') i mogą mieć do 35 znaków.

686 wiersze
17 KiB

  1. // Copyright 2018 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package text
  5. import (
  6. "bytes"
  7. "fmt"
  8. "io"
  9. "strconv"
  10. "unicode/utf8"
  11. "google.golang.org/protobuf/internal/errors"
  12. )
  13. // Decoder is a token-based textproto decoder.
  14. type Decoder struct {
  15. // lastCall is last method called, either readCall or peekCall.
  16. // Initial value is readCall.
  17. lastCall call
  18. // lastToken contains the last read token.
  19. lastToken Token
  20. // lastErr contains the last read error.
  21. lastErr error
  22. // openStack is a stack containing the byte characters for MessageOpen and
  23. // ListOpen kinds. The top of stack represents the message or the list that
  24. // the current token is nested in. An empty stack means the current token is
  25. // at the top level message. The characters '{' and '<' both represent the
  26. // MessageOpen kind.
  27. openStack []byte
  28. // orig is used in reporting line and column.
  29. orig []byte
  30. // in contains the unconsumed input.
  31. in []byte
  32. }
  33. // NewDecoder returns a Decoder to read the given []byte.
  34. func NewDecoder(b []byte) *Decoder {
  35. return &Decoder{orig: b, in: b}
  36. }
  37. // ErrUnexpectedEOF means that EOF was encountered in the middle of the input.
  38. var ErrUnexpectedEOF = errors.New("%v", io.ErrUnexpectedEOF)
  // call identifies a Decoder method. It is used to remember whether Read or
  // Peek was invoked last.
  type call uint8

  const (
  	// readCall marks Read as the last method invoked. It is the zero value.
  	readCall call = 0
  	// peekCall marks Peek as the last method invoked.
  	peekCall call = 1
  )
  45. // Peek looks ahead and returns the next token and error without advancing a read.
  46. func (d *Decoder) Peek() (Token, error) {
  47. defer func() { d.lastCall = peekCall }()
  48. if d.lastCall == readCall {
  49. d.lastToken, d.lastErr = d.Read()
  50. }
  51. return d.lastToken, d.lastErr
  52. }
  53. // Read returns the next token.
  54. // It will return an error if there is no valid token.
  55. func (d *Decoder) Read() (Token, error) {
  56. defer func() { d.lastCall = readCall }()
  57. if d.lastCall == peekCall {
  58. return d.lastToken, d.lastErr
  59. }
  60. tok, err := d.parseNext(d.lastToken.kind)
  61. if err != nil {
  62. return Token{}, err
  63. }
  64. switch tok.kind {
  65. case comma, semicolon:
  66. tok, err = d.parseNext(tok.kind)
  67. if err != nil {
  68. return Token{}, err
  69. }
  70. }
  71. d.lastToken = tok
  72. return tok, nil
  73. }
  // mismatchedFmt is the syntax error format for a message closed with the
  // closing character of the other bracket style.
  const mismatchedFmt = "mismatched close character %q"

  // unexpectedFmt is the syntax error format for a character that is not
  // allowed at the current position.
  const unexpectedFmt = "unexpected character %q"
  78. // parseNext parses the next Token based on given last kind.
  79. func (d *Decoder) parseNext(lastKind Kind) (Token, error) {
  80. // Trim leading spaces.
  81. d.consume(0)
  82. isEOF := false
  83. if len(d.in) == 0 {
  84. isEOF = true
  85. }
  86. switch lastKind {
  87. case EOF:
  88. return d.consumeToken(EOF, 0, 0), nil
  89. case bof:
  90. // Start of top level message. Next token can be EOF or Name.
  91. if isEOF {
  92. return d.consumeToken(EOF, 0, 0), nil
  93. }
  94. return d.parseFieldName()
  95. case Name:
  96. // Next token can be MessageOpen, ListOpen or Scalar.
  97. if isEOF {
  98. return Token{}, ErrUnexpectedEOF
  99. }
  100. switch ch := d.in[0]; ch {
  101. case '{', '<':
  102. d.pushOpenStack(ch)
  103. return d.consumeToken(MessageOpen, 1, 0), nil
  104. case '[':
  105. d.pushOpenStack(ch)
  106. return d.consumeToken(ListOpen, 1, 0), nil
  107. default:
  108. return d.parseScalar()
  109. }
  110. case Scalar:
  111. openKind, closeCh := d.currentOpenKind()
  112. switch openKind {
  113. case bof:
  114. // Top level message.
  115. // Next token can be EOF, comma, semicolon or Name.
  116. if isEOF {
  117. return d.consumeToken(EOF, 0, 0), nil
  118. }
  119. switch d.in[0] {
  120. case ',':
  121. return d.consumeToken(comma, 1, 0), nil
  122. case ';':
  123. return d.consumeToken(semicolon, 1, 0), nil
  124. default:
  125. return d.parseFieldName()
  126. }
  127. case MessageOpen:
  128. // Next token can be MessageClose, comma, semicolon or Name.
  129. if isEOF {
  130. return Token{}, ErrUnexpectedEOF
  131. }
  132. switch ch := d.in[0]; ch {
  133. case closeCh:
  134. d.popOpenStack()
  135. return d.consumeToken(MessageClose, 1, 0), nil
  136. case otherCloseChar[closeCh]:
  137. return Token{}, d.newSyntaxError(mismatchedFmt, ch)
  138. case ',':
  139. return d.consumeToken(comma, 1, 0), nil
  140. case ';':
  141. return d.consumeToken(semicolon, 1, 0), nil
  142. default:
  143. return d.parseFieldName()
  144. }
  145. case ListOpen:
  146. // Next token can be ListClose or comma.
  147. if isEOF {
  148. return Token{}, ErrUnexpectedEOF
  149. }
  150. switch ch := d.in[0]; ch {
  151. case ']':
  152. d.popOpenStack()
  153. return d.consumeToken(ListClose, 1, 0), nil
  154. case ',':
  155. return d.consumeToken(comma, 1, 0), nil
  156. default:
  157. return Token{}, d.newSyntaxError(unexpectedFmt, ch)
  158. }
  159. }
  160. case MessageOpen:
  161. // Next token can be MessageClose or Name.
  162. if isEOF {
  163. return Token{}, ErrUnexpectedEOF
  164. }
  165. _, closeCh := d.currentOpenKind()
  166. switch ch := d.in[0]; ch {
  167. case closeCh:
  168. d.popOpenStack()
  169. return d.consumeToken(MessageClose, 1, 0), nil
  170. case otherCloseChar[closeCh]:
  171. return Token{}, d.newSyntaxError(mismatchedFmt, ch)
  172. default:
  173. return d.parseFieldName()
  174. }
  175. case MessageClose:
  176. openKind, closeCh := d.currentOpenKind()
  177. switch openKind {
  178. case bof:
  179. // Top level message.
  180. // Next token can be EOF, comma, semicolon or Name.
  181. if isEOF {
  182. return d.consumeToken(EOF, 0, 0), nil
  183. }
  184. switch ch := d.in[0]; ch {
  185. case ',':
  186. return d.consumeToken(comma, 1, 0), nil
  187. case ';':
  188. return d.consumeToken(semicolon, 1, 0), nil
  189. default:
  190. return d.parseFieldName()
  191. }
  192. case MessageOpen:
  193. // Next token can be MessageClose, comma, semicolon or Name.
  194. if isEOF {
  195. return Token{}, ErrUnexpectedEOF
  196. }
  197. switch ch := d.in[0]; ch {
  198. case closeCh:
  199. d.popOpenStack()
  200. return d.consumeToken(MessageClose, 1, 0), nil
  201. case otherCloseChar[closeCh]:
  202. return Token{}, d.newSyntaxError(mismatchedFmt, ch)
  203. case ',':
  204. return d.consumeToken(comma, 1, 0), nil
  205. case ';':
  206. return d.consumeToken(semicolon, 1, 0), nil
  207. default:
  208. return d.parseFieldName()
  209. }
  210. case ListOpen:
  211. // Next token can be ListClose or comma
  212. if isEOF {
  213. return Token{}, ErrUnexpectedEOF
  214. }
  215. switch ch := d.in[0]; ch {
  216. case closeCh:
  217. d.popOpenStack()
  218. return d.consumeToken(ListClose, 1, 0), nil
  219. case ',':
  220. return d.consumeToken(comma, 1, 0), nil
  221. default:
  222. return Token{}, d.newSyntaxError(unexpectedFmt, ch)
  223. }
  224. }
  225. case ListOpen:
  226. // Next token can be ListClose, MessageStart or Scalar.
  227. if isEOF {
  228. return Token{}, ErrUnexpectedEOF
  229. }
  230. switch ch := d.in[0]; ch {
  231. case ']':
  232. d.popOpenStack()
  233. return d.consumeToken(ListClose, 1, 0), nil
  234. case '{', '<':
  235. d.pushOpenStack(ch)
  236. return d.consumeToken(MessageOpen, 1, 0), nil
  237. default:
  238. return d.parseScalar()
  239. }
  240. case ListClose:
  241. openKind, closeCh := d.currentOpenKind()
  242. switch openKind {
  243. case bof:
  244. // Top level message.
  245. // Next token can be EOF, comma, semicolon or Name.
  246. if isEOF {
  247. return d.consumeToken(EOF, 0, 0), nil
  248. }
  249. switch ch := d.in[0]; ch {
  250. case ',':
  251. return d.consumeToken(comma, 1, 0), nil
  252. case ';':
  253. return d.consumeToken(semicolon, 1, 0), nil
  254. default:
  255. return d.parseFieldName()
  256. }
  257. case MessageOpen:
  258. // Next token can be MessageClose, comma, semicolon or Name.
  259. if isEOF {
  260. return Token{}, ErrUnexpectedEOF
  261. }
  262. switch ch := d.in[0]; ch {
  263. case closeCh:
  264. d.popOpenStack()
  265. return d.consumeToken(MessageClose, 1, 0), nil
  266. case otherCloseChar[closeCh]:
  267. return Token{}, d.newSyntaxError(mismatchedFmt, ch)
  268. case ',':
  269. return d.consumeToken(comma, 1, 0), nil
  270. case ';':
  271. return d.consumeToken(semicolon, 1, 0), nil
  272. default:
  273. return d.parseFieldName()
  274. }
  275. default:
  276. // It is not possible to have this case. Let it panic below.
  277. }
  278. case comma, semicolon:
  279. openKind, closeCh := d.currentOpenKind()
  280. switch openKind {
  281. case bof:
  282. // Top level message. Next token can be EOF or Name.
  283. if isEOF {
  284. return d.consumeToken(EOF, 0, 0), nil
  285. }
  286. return d.parseFieldName()
  287. case MessageOpen:
  288. // Next token can be MessageClose or Name.
  289. if isEOF {
  290. return Token{}, ErrUnexpectedEOF
  291. }
  292. switch ch := d.in[0]; ch {
  293. case closeCh:
  294. d.popOpenStack()
  295. return d.consumeToken(MessageClose, 1, 0), nil
  296. case otherCloseChar[closeCh]:
  297. return Token{}, d.newSyntaxError(mismatchedFmt, ch)
  298. default:
  299. return d.parseFieldName()
  300. }
  301. case ListOpen:
  302. if lastKind == semicolon {
  303. // It is not be possible to have this case as logic here
  304. // should not have produced a semicolon Token when inside a
  305. // list. Let it panic below.
  306. break
  307. }
  308. // Next token can be MessageOpen or Scalar.
  309. if isEOF {
  310. return Token{}, ErrUnexpectedEOF
  311. }
  312. switch ch := d.in[0]; ch {
  313. case '{', '<':
  314. d.pushOpenStack(ch)
  315. return d.consumeToken(MessageOpen, 1, 0), nil
  316. default:
  317. return d.parseScalar()
  318. }
  319. }
  320. }
  321. line, column := d.Position(len(d.orig) - len(d.in))
  322. panic(fmt.Sprintf("Decoder.parseNext: bug at handling line %d:%d with lastKind=%v", line, column, lastKind))
  323. }
  // otherCloseChar maps each message closing character to the closing
  // character of the other bracket style ('}' <-> '>'). It is used to detect
  // a message that is closed with the wrong kind of bracket.
  var otherCloseChar = map[byte]byte{'}': '>', '>': '}'}
  328. // currentOpenKind indicates whether current position is inside a message, list
  329. // or top-level message by returning MessageOpen, ListOpen or bof respectively.
  330. // If the returned kind is either a MessageOpen or ListOpen, it also returns the
  331. // corresponding closing character.
  332. func (d *Decoder) currentOpenKind() (Kind, byte) {
  333. if len(d.openStack) == 0 {
  334. return bof, 0
  335. }
  336. openCh := d.openStack[len(d.openStack)-1]
  337. switch openCh {
  338. case '{':
  339. return MessageOpen, '}'
  340. case '<':
  341. return MessageOpen, '>'
  342. case '[':
  343. return ListOpen, ']'
  344. }
  345. panic(fmt.Sprintf("Decoder: openStack contains invalid byte %c", openCh))
  346. }
  347. func (d *Decoder) pushOpenStack(ch byte) {
  348. d.openStack = append(d.openStack, ch)
  349. }
  350. func (d *Decoder) popOpenStack() {
  351. d.openStack = d.openStack[:len(d.openStack)-1]
  352. }
  353. // parseFieldName parses field name and separator.
  354. func (d *Decoder) parseFieldName() (tok Token, err error) {
  355. defer func() {
  356. if err == nil && d.tryConsumeChar(':') {
  357. tok.attrs |= hasSeparator
  358. }
  359. }()
  360. // Extension or Any type URL.
  361. if d.in[0] == '[' {
  362. return d.parseTypeName()
  363. }
  364. // Identifier.
  365. if size := parseIdent(d.in, false); size > 0 {
  366. return d.consumeToken(Name, size, uint8(IdentName)), nil
  367. }
  368. // Field number. Identify if input is a valid number that is not negative
  369. // and is decimal integer within 32-bit range.
  370. if num := parseNumber(d.in); num.size > 0 {
  371. if !num.neg && num.kind == numDec {
  372. if _, err := strconv.ParseInt(string(d.in[:num.size]), 10, 32); err == nil {
  373. return d.consumeToken(Name, num.size, uint8(FieldNumber)), nil
  374. }
  375. }
  376. return Token{}, d.newSyntaxError("invalid field number: %s", d.in[:num.size])
  377. }
  378. return Token{}, d.newSyntaxError("invalid field name: %s", errId(d.in))
  379. }
  380. // parseTypeName parses Any type URL or extension field name. The name is
  381. // enclosed in [ and ] characters. The C++ parser does not handle many legal URL
  382. // strings. This implementation is more liberal and allows for the pattern
  383. // ^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`). Whitespaces and comments are allowed
  384. // in between [ ], '.', '/' and the sub names.
  385. func (d *Decoder) parseTypeName() (Token, error) {
  386. startPos := len(d.orig) - len(d.in)
  387. // Use alias s to advance first in order to use d.in for error handling.
  388. // Caller already checks for [ as first character.
  389. s := consume(d.in[1:], 0)
  390. if len(s) == 0 {
  391. return Token{}, ErrUnexpectedEOF
  392. }
  393. var name []byte
  394. for len(s) > 0 && isTypeNameChar(s[0]) {
  395. name = append(name, s[0])
  396. s = s[1:]
  397. }
  398. s = consume(s, 0)
  399. var closed bool
  400. for len(s) > 0 && !closed {
  401. switch {
  402. case s[0] == ']':
  403. s = s[1:]
  404. closed = true
  405. case s[0] == '/', s[0] == '.':
  406. if len(name) > 0 && (name[len(name)-1] == '/' || name[len(name)-1] == '.') {
  407. return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
  408. d.orig[startPos:len(d.orig)-len(s)+1])
  409. }
  410. name = append(name, s[0])
  411. s = s[1:]
  412. s = consume(s, 0)
  413. for len(s) > 0 && isTypeNameChar(s[0]) {
  414. name = append(name, s[0])
  415. s = s[1:]
  416. }
  417. s = consume(s, 0)
  418. default:
  419. return Token{}, d.newSyntaxError(
  420. "invalid type URL/extension field name: %s", d.orig[startPos:len(d.orig)-len(s)+1])
  421. }
  422. }
  423. if !closed {
  424. return Token{}, ErrUnexpectedEOF
  425. }
  426. // First character cannot be '.'. Last character cannot be '.' or '/'.
  427. size := len(name)
  428. if size == 0 || name[0] == '.' || name[size-1] == '.' || name[size-1] == '/' {
  429. return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
  430. d.orig[startPos:len(d.orig)-len(s)])
  431. }
  432. d.in = s
  433. endPos := len(d.orig) - len(d.in)
  434. d.consume(0)
  435. return Token{
  436. kind: Name,
  437. attrs: uint8(TypeName),
  438. pos: startPos,
  439. raw: d.orig[startPos:endPos],
  440. str: string(name),
  441. }, nil
  442. }
  // isTypeNameChar reports whether b may appear in a sub name of a type URL or
  // extension field name: '-', '_', an ASCII digit or an ASCII letter.
  func isTypeNameChar(b byte) bool {
  	switch {
  	case b == '-', b == '_':
  		return true
  	case '0' <= b && b <= '9':
  		return true
  	case 'a' <= b && b <= 'z', 'A' <= b && b <= 'Z':
  		return true
  	}
  	return false
  }
  // isWhiteSpace reports whether b is a space, newline, carriage return or
  // tab.
  func isWhiteSpace(b byte) bool {
  	return b == ' ' || b == '\n' || b == '\r' || b == '\t'
  }
  457. // parseIdent parses an unquoted proto identifier and returns size.
  458. // If allowNeg is true, it allows '-' to be the first character in the
  459. // identifier. This is used when parsing literal values like -infinity, etc.
  460. // Regular expression matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*`
  461. func parseIdent(input []byte, allowNeg bool) int {
  462. var size int
  463. s := input
  464. if len(s) == 0 {
  465. return 0
  466. }
  467. if allowNeg && s[0] == '-' {
  468. s = s[1:]
  469. size++
  470. if len(s) == 0 {
  471. return 0
  472. }
  473. }
  474. switch {
  475. case s[0] == '_',
  476. 'a' <= s[0] && s[0] <= 'z',
  477. 'A' <= s[0] && s[0] <= 'Z':
  478. s = s[1:]
  479. size++
  480. default:
  481. return 0
  482. }
  483. for len(s) > 0 && (s[0] == '_' ||
  484. 'a' <= s[0] && s[0] <= 'z' ||
  485. 'A' <= s[0] && s[0] <= 'Z' ||
  486. '0' <= s[0] && s[0] <= '9') {
  487. s = s[1:]
  488. size++
  489. }
  490. if len(s) > 0 && !isDelim(s[0]) {
  491. return 0
  492. }
  493. return size
  494. }
  495. // parseScalar parses for a string, literal or number value.
  496. func (d *Decoder) parseScalar() (Token, error) {
  497. if d.in[0] == '"' || d.in[0] == '\'' {
  498. return d.parseStringValue()
  499. }
  500. if tok, ok := d.parseLiteralValue(); ok {
  501. return tok, nil
  502. }
  503. if tok, ok := d.parseNumberValue(); ok {
  504. return tok, nil
  505. }
  506. return Token{}, d.newSyntaxError("invalid scalar value: %s", errId(d.in))
  507. }
  508. // parseLiteralValue parses a literal value. A literal value is used for
  509. // bools, special floats and enums. This function simply identifies that the
  510. // field value is a literal.
  511. func (d *Decoder) parseLiteralValue() (Token, bool) {
  512. size := parseIdent(d.in, true)
  513. if size == 0 {
  514. return Token{}, false
  515. }
  516. return d.consumeToken(Scalar, size, literalValue), true
  517. }
  518. // consumeToken constructs a Token for given Kind from d.in and consumes given
  519. // size-length from it.
  520. func (d *Decoder) consumeToken(kind Kind, size int, attrs uint8) Token {
  521. // Important to compute raw and pos before consuming.
  522. tok := Token{
  523. kind: kind,
  524. attrs: attrs,
  525. pos: len(d.orig) - len(d.in),
  526. raw: d.in[:size],
  527. }
  528. d.consume(size)
  529. return tok
  530. }
  531. // newSyntaxError returns a syntax error with line and column information for
  532. // current position.
  533. func (d *Decoder) newSyntaxError(f string, x ...interface{}) error {
  534. e := errors.New(f, x...)
  535. line, column := d.Position(len(d.orig) - len(d.in))
  536. return errors.New("syntax error (line %d:%d): %v", line, column, e)
  537. }
  538. // Position returns line and column number of given index of the original input.
  539. // It will panic if index is out of range.
  540. func (d *Decoder) Position(idx int) (line int, column int) {
  541. b := d.orig[:idx]
  542. line = bytes.Count(b, []byte("\n")) + 1
  543. if i := bytes.LastIndexByte(b, '\n'); i >= 0 {
  544. b = b[i+1:]
  545. }
  546. column = utf8.RuneCount(b) + 1 // ignore multi-rune characters
  547. return line, column
  548. }
  549. func (d *Decoder) tryConsumeChar(c byte) bool {
  550. if len(d.in) > 0 && d.in[0] == c {
  551. d.consume(1)
  552. return true
  553. }
  554. return false
  555. }
  556. // consume consumes n bytes of input and any subsequent whitespace or comments.
  557. func (d *Decoder) consume(n int) {
  558. d.in = consume(d.in, n)
  559. return
  560. }
  // consume returns b with its first n bytes removed, along with any
  // whitespace and comments that follow them. A comment starts at '#' and
  // runs through the next newline. If a comment is not terminated by a
  // newline, the rest of the input is dropped and nil is returned.
  func consume(b []byte, n int) []byte {
  	rest := b[n:]
  	for len(rest) > 0 {
  		c := rest[0]
  		if c == ' ' || c == '\n' || c == '\r' || c == '\t' {
  			rest = rest[1:]
  			continue
  		}
  		if c != '#' {
  			break
  		}
  		eol := bytes.IndexByte(rest, '\n')
  		if eol < 0 {
  			return nil
  		}
  		rest = rest[eol+len("\n"):]
  	}
  	return rest
  }
  580. // errId extracts a byte sequence that looks like an invalid ID
  581. // (for the purposes of error reporting).
  582. func errId(seq []byte) []byte {
  583. const maxLen = 32
  584. for i := 0; i < len(seq); {
  585. if i > maxLen {
  586. return append(seq[:i:i], "…"...)
  587. }
  588. r, size := utf8.DecodeRune(seq[i:])
  589. if r > utf8.RuneSelf || (r != '/' && isDelim(byte(r))) {
  590. if i == 0 {
  591. // Either the first byte is invalid UTF-8 or a
  592. // delimiter, or the first rune is non-ASCII.
  593. // Return it as-is.
  594. i = size
  595. }
  596. return seq[:i:i]
  597. }
  598. i += size
  599. }
  600. // No delimiter found.
  601. return seq
  602. }
  // isDelim reports whether c is a delimiter character, i.e. anything other
  // than '-', '+', '.', '_', an ASCII letter or an ASCII digit.
  func isDelim(c byte) bool {
  	switch {
  	case c == '-', c == '+', c == '.', c == '_':
  		return false
  	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z':
  		return false
  	case '0' <= c && c <= '9':
  		return false
  	}
  	return true
  }