You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1220 lines
28 KiB

  1. package toml
  2. import (
  3. "fmt"
  4. "reflect"
  5. "runtime"
  6. "strings"
  7. "unicode"
  8. "unicode/utf8"
  9. )
// itemType identifies the kind of item (token) the lexer emits.
type itemType int

const (
	// itemError is emitted by the lexer's error helpers; the item's err
	// field is set.
	itemError itemType = iota

	// itemNIL is used in the parser to indicate no type.
	itemNIL

	// itemEOF is emitted by lexTop and lexTopEnd at the end of the input.
	itemEOF

	// itemText carries a bare name (lexBareName) or the text of a comment
	// (lexComment).
	itemText

	// String flavours, emitted by lexString, lexRawString,
	// lexMultilineString and lexMultilineRawString respectively.
	itemString
	itemRawString
	itemMultilineString
	itemRawMultilineString

	// Scalar values.
	itemBool
	itemInteger
	itemFloat
	itemDatetime

	// itemArray marks the start of an array; itemArrayEnd marks its end.
	itemArray
	itemArrayEnd

	// Delimiters of a '[table]' header.
	itemTableStart
	itemTableEnd

	// Delimiters of a '[[table]]' (array of tables) header.
	itemArrayTableStart
	itemArrayTableEnd

	// itemKeyStart is emitted before the parts of a key; itemKeyEnd is
	// emitted when the '=' separator is reached.
	itemKeyStart
	itemKeyEnd

	// itemCommentStart is emitted after a '#', before the comment text.
	itemCommentStart

	// Delimiters of an inline table ('{' and '}').
	itemInlineTableStart
	itemInlineTableEnd
)
// eof is the rune value next() returns once the end of the input is reached.
const eof = 0
// stateFn is one state of the lexer: it is called with the lexer and returns
// the state to run next. nextItem stores the returned value in lx.state; the
// error helpers return nil after queueing an error item.
type stateFn func(lx *lexer) stateFn
  38. func (p Position) String() string {
  39. return fmt.Sprintf("at line %d; start %d; length %d", p.Line, p.Start, p.Len)
  40. }
// lexer holds the state of the TOML lexer: the input, the span of the item
// currently being read, and the state machine that produces items.
type lexer struct {
	// input is the text being lexed.
	input string

	// start and pos delimit the pending item: current() is input[start:pos].
	start int
	pos   int

	// line is the current line number; lex() initialises it to 1 and next()
	// increments it for every '\n' it consumes.
	line int

	// state is the state function nextItem runs next.
	state stateFn

	// items queues lexed items until nextItem returns them.
	items chan item

	// Allow for backing up up to 4 runes. This is necessary because TOML
	// contains 3-rune tokens (""" and ''').
	prevWidths [4]int

	// nprev is how many of prevWidths are in use.
	nprev int

	// atEOF: if we emit an eof, we can still back up, but it is not OK to
	// call next again.
	atEOF bool

	// A stack of state functions used to maintain context.
	//
	// The idea is to reuse parts of the state machine in various places. For
	// example, values can appear at the top level or within arbitrarily nested
	// arrays. The last state on the stack is used after a value has been lexed.
	// Similarly for comments.
	stack []stateFn
}
// item is a single token produced by the lexer.
type item struct {
	typ itemType // kind of token
	val string   // lexed text; filled in by emit and emitTrim
	err error    // filled in by the error helpers for itemError items
	pos Position // position computed by the lexer when the item was queued
}
  67. func (lx *lexer) nextItem() item {
  68. for {
  69. select {
  70. case item := <-lx.items:
  71. return item
  72. default:
  73. lx.state = lx.state(lx)
  74. //fmt.Printf(" STATE %-24s current: %-10q stack: %s\n", lx.state, lx.current(), lx.stack)
  75. }
  76. }
  77. }
  78. func lex(input string) *lexer {
  79. lx := &lexer{
  80. input: input,
  81. state: lexTop,
  82. items: make(chan item, 10),
  83. stack: make([]stateFn, 0, 10),
  84. line: 1,
  85. }
  86. return lx
  87. }
  88. func (lx *lexer) push(state stateFn) {
  89. lx.stack = append(lx.stack, state)
  90. }
  91. func (lx *lexer) pop() stateFn {
  92. if len(lx.stack) == 0 {
  93. return lx.errorf("BUG in lexer: no states to pop")
  94. }
  95. last := lx.stack[len(lx.stack)-1]
  96. lx.stack = lx.stack[0 : len(lx.stack)-1]
  97. return last
  98. }
  99. func (lx *lexer) current() string {
  100. return lx.input[lx.start:lx.pos]
  101. }
  102. func (lx lexer) getPos() Position {
  103. p := Position{
  104. Line: lx.line,
  105. Start: lx.start,
  106. Len: lx.pos - lx.start,
  107. }
  108. if p.Len <= 0 {
  109. p.Len = 1
  110. }
  111. return p
  112. }
  113. func (lx *lexer) emit(typ itemType) {
  114. lx.items <- item{typ: typ, pos: lx.getPos(), val: lx.current()}
  115. lx.start = lx.pos
  116. }
  117. func (lx *lexer) emitTrim(typ itemType) {
  118. lx.items <- item{typ: typ, pos: lx.getPos(), val: strings.TrimSpace(lx.current())}
  119. lx.start = lx.pos
  120. }
  121. func (lx *lexer) next() (r rune) {
  122. if lx.atEOF {
  123. panic("BUG in lexer: next called after EOF")
  124. }
  125. if lx.pos >= len(lx.input) {
  126. lx.atEOF = true
  127. return eof
  128. }
  129. if lx.input[lx.pos] == '\n' {
  130. lx.line++
  131. }
  132. lx.prevWidths[3] = lx.prevWidths[2]
  133. lx.prevWidths[2] = lx.prevWidths[1]
  134. lx.prevWidths[1] = lx.prevWidths[0]
  135. if lx.nprev < 4 {
  136. lx.nprev++
  137. }
  138. r, w := utf8.DecodeRuneInString(lx.input[lx.pos:])
  139. if r == utf8.RuneError {
  140. lx.error(errLexUTF8{lx.input[lx.pos]})
  141. return utf8.RuneError
  142. }
  143. // Note: don't use peek() here, as this calls next().
  144. if isControl(r) || (r == '\r' && (len(lx.input)-1 == lx.pos || lx.input[lx.pos+1] != '\n')) {
  145. lx.errorControlChar(r)
  146. return utf8.RuneError
  147. }
  148. lx.prevWidths[0] = w
  149. lx.pos += w
  150. return r
  151. }
  152. // ignore skips over the pending input before this point.
  153. func (lx *lexer) ignore() {
  154. lx.start = lx.pos
  155. }
  156. // backup steps back one rune. Can be called 4 times between calls to next.
  157. func (lx *lexer) backup() {
  158. if lx.atEOF {
  159. lx.atEOF = false
  160. return
  161. }
  162. if lx.nprev < 1 {
  163. panic("BUG in lexer: backed up too far")
  164. }
  165. w := lx.prevWidths[0]
  166. lx.prevWidths[0] = lx.prevWidths[1]
  167. lx.prevWidths[1] = lx.prevWidths[2]
  168. lx.prevWidths[2] = lx.prevWidths[3]
  169. lx.nprev--
  170. lx.pos -= w
  171. if lx.pos < len(lx.input) && lx.input[lx.pos] == '\n' {
  172. lx.line--
  173. }
  174. }
  175. // accept consumes the next rune if it's equal to `valid`.
  176. func (lx *lexer) accept(valid rune) bool {
  177. if lx.next() == valid {
  178. return true
  179. }
  180. lx.backup()
  181. return false
  182. }
  183. // peek returns but does not consume the next rune in the input.
  184. func (lx *lexer) peek() rune {
  185. r := lx.next()
  186. lx.backup()
  187. return r
  188. }
  189. // skip ignores all input that matches the given predicate.
  190. func (lx *lexer) skip(pred func(rune) bool) {
  191. for {
  192. r := lx.next()
  193. if pred(r) {
  194. continue
  195. }
  196. lx.backup()
  197. lx.ignore()
  198. return
  199. }
  200. }
  201. // error stops all lexing by emitting an error and returning `nil`.
  202. //
  203. // Note that any value that is a character is escaped if it's a special
  204. // character (newlines, tabs, etc.).
  205. func (lx *lexer) error(err error) stateFn {
  206. if lx.atEOF {
  207. return lx.errorPrevLine(err)
  208. }
  209. lx.items <- item{typ: itemError, pos: lx.getPos(), err: err}
  210. return nil
  211. }
  212. // errorfPrevline is like error(), but sets the position to the last column of
  213. // the previous line.
  214. //
  215. // This is so that unexpected EOF or NL errors don't show on a new blank line.
  216. func (lx *lexer) errorPrevLine(err error) stateFn {
  217. pos := lx.getPos()
  218. pos.Line--
  219. pos.Len = 1
  220. pos.Start = lx.pos - 1
  221. lx.items <- item{typ: itemError, pos: pos, err: err}
  222. return nil
  223. }
  224. // errorPos is like error(), but allows explicitly setting the position.
  225. func (lx *lexer) errorPos(start, length int, err error) stateFn {
  226. pos := lx.getPos()
  227. pos.Start = start
  228. pos.Len = length
  229. lx.items <- item{typ: itemError, pos: pos, err: err}
  230. return nil
  231. }
  232. // errorf is like error, and creates a new error.
  233. func (lx *lexer) errorf(format string, values ...interface{}) stateFn {
  234. if lx.atEOF {
  235. pos := lx.getPos()
  236. pos.Line--
  237. pos.Len = 1
  238. pos.Start = lx.pos - 1
  239. lx.items <- item{typ: itemError, pos: pos, err: fmt.Errorf(format, values...)}
  240. return nil
  241. }
  242. lx.items <- item{typ: itemError, pos: lx.getPos(), err: fmt.Errorf(format, values...)}
  243. return nil
  244. }
  245. func (lx *lexer) errorControlChar(cc rune) stateFn {
  246. return lx.errorPos(lx.pos-1, 1, errLexControl{cc})
  247. }
  248. // lexTop consumes elements at the top level of TOML data.
  249. func lexTop(lx *lexer) stateFn {
  250. r := lx.next()
  251. if isWhitespace(r) || isNL(r) {
  252. return lexSkip(lx, lexTop)
  253. }
  254. switch r {
  255. case '#':
  256. lx.push(lexTop)
  257. return lexCommentStart
  258. case '[':
  259. return lexTableStart
  260. case eof:
  261. if lx.pos > lx.start {
  262. return lx.errorf("unexpected EOF")
  263. }
  264. lx.emit(itemEOF)
  265. return nil
  266. }
  267. // At this point, the only valid item can be a key, so we back up
  268. // and let the key lexer do the rest.
  269. lx.backup()
  270. lx.push(lexTopEnd)
  271. return lexKeyStart
  272. }
  273. // lexTopEnd is entered whenever a top-level item has been consumed. (A value
  274. // or a table.) It must see only whitespace, and will turn back to lexTop
  275. // upon a newline. If it sees EOF, it will quit the lexer successfully.
  276. func lexTopEnd(lx *lexer) stateFn {
  277. r := lx.next()
  278. switch {
  279. case r == '#':
  280. // a comment will read to a newline for us.
  281. lx.push(lexTop)
  282. return lexCommentStart
  283. case isWhitespace(r):
  284. return lexTopEnd
  285. case isNL(r):
  286. lx.ignore()
  287. return lexTop
  288. case r == eof:
  289. lx.emit(itemEOF)
  290. return nil
  291. }
  292. return lx.errorf(
  293. "expected a top-level item to end with a newline, comment, or EOF, but got %q instead",
  294. r)
  295. }
  296. // lexTable lexes the beginning of a table. Namely, it makes sure that
  297. // it starts with a character other than '.' and ']'.
  298. // It assumes that '[' has already been consumed.
  299. // It also handles the case that this is an item in an array of tables.
  300. // e.g., '[[name]]'.
  301. func lexTableStart(lx *lexer) stateFn {
  302. if lx.peek() == '[' {
  303. lx.next()
  304. lx.emit(itemArrayTableStart)
  305. lx.push(lexArrayTableEnd)
  306. } else {
  307. lx.emit(itemTableStart)
  308. lx.push(lexTableEnd)
  309. }
  310. return lexTableNameStart
  311. }
  312. func lexTableEnd(lx *lexer) stateFn {
  313. lx.emit(itemTableEnd)
  314. return lexTopEnd
  315. }
  316. func lexArrayTableEnd(lx *lexer) stateFn {
  317. if r := lx.next(); r != ']' {
  318. return lx.errorf("expected end of table array name delimiter ']', but got %q instead", r)
  319. }
  320. lx.emit(itemArrayTableEnd)
  321. return lexTopEnd
  322. }
  323. func lexTableNameStart(lx *lexer) stateFn {
  324. lx.skip(isWhitespace)
  325. switch r := lx.peek(); {
  326. case r == ']' || r == eof:
  327. return lx.errorf("unexpected end of table name (table names cannot be empty)")
  328. case r == '.':
  329. return lx.errorf("unexpected table separator (table names cannot be empty)")
  330. case r == '"' || r == '\'':
  331. lx.ignore()
  332. lx.push(lexTableNameEnd)
  333. return lexQuotedName
  334. default:
  335. lx.push(lexTableNameEnd)
  336. return lexBareName
  337. }
  338. }
  339. // lexTableNameEnd reads the end of a piece of a table name, optionally
  340. // consuming whitespace.
  341. func lexTableNameEnd(lx *lexer) stateFn {
  342. lx.skip(isWhitespace)
  343. switch r := lx.next(); {
  344. case isWhitespace(r):
  345. return lexTableNameEnd
  346. case r == '.':
  347. lx.ignore()
  348. return lexTableNameStart
  349. case r == ']':
  350. return lx.pop()
  351. default:
  352. return lx.errorf("expected '.' or ']' to end table name, but got %q instead", r)
  353. }
  354. }
  355. // lexBareName lexes one part of a key or table.
  356. //
  357. // It assumes that at least one valid character for the table has already been
  358. // read.
  359. //
  360. // Lexes only one part, e.g. only 'a' inside 'a.b'.
  361. func lexBareName(lx *lexer) stateFn {
  362. r := lx.next()
  363. if isBareKeyChar(r) {
  364. return lexBareName
  365. }
  366. lx.backup()
  367. lx.emit(itemText)
  368. return lx.pop()
  369. }
  370. // lexBareName lexes one part of a key or table.
  371. //
  372. // It assumes that at least one valid character for the table has already been
  373. // read.
  374. //
  375. // Lexes only one part, e.g. only '"a"' inside '"a".b'.
  376. func lexQuotedName(lx *lexer) stateFn {
  377. r := lx.next()
  378. switch {
  379. case isWhitespace(r):
  380. return lexSkip(lx, lexValue)
  381. case r == '"':
  382. lx.ignore() // ignore the '"'
  383. return lexString
  384. case r == '\'':
  385. lx.ignore() // ignore the "'"
  386. return lexRawString
  387. case r == eof:
  388. return lx.errorf("unexpected EOF; expected value")
  389. default:
  390. return lx.errorf("expected value but found %q instead", r)
  391. }
  392. }
  393. // lexKeyStart consumes all key parts until a '='.
  394. func lexKeyStart(lx *lexer) stateFn {
  395. lx.skip(isWhitespace)
  396. switch r := lx.peek(); {
  397. case r == '=' || r == eof:
  398. return lx.errorf("unexpected '=': key name appears blank")
  399. case r == '.':
  400. return lx.errorf("unexpected '.': keys cannot start with a '.'")
  401. case r == '"' || r == '\'':
  402. lx.ignore()
  403. fallthrough
  404. default: // Bare key
  405. lx.emit(itemKeyStart)
  406. return lexKeyNameStart
  407. }
  408. }
  409. func lexKeyNameStart(lx *lexer) stateFn {
  410. lx.skip(isWhitespace)
  411. switch r := lx.peek(); {
  412. case r == '=' || r == eof:
  413. return lx.errorf("unexpected '='")
  414. case r == '.':
  415. return lx.errorf("unexpected '.'")
  416. case r == '"' || r == '\'':
  417. lx.ignore()
  418. lx.push(lexKeyEnd)
  419. return lexQuotedName
  420. default:
  421. lx.push(lexKeyEnd)
  422. return lexBareName
  423. }
  424. }
  425. // lexKeyEnd consumes the end of a key and trims whitespace (up to the key
  426. // separator).
  427. func lexKeyEnd(lx *lexer) stateFn {
  428. lx.skip(isWhitespace)
  429. switch r := lx.next(); {
  430. case isWhitespace(r):
  431. return lexSkip(lx, lexKeyEnd)
  432. case r == eof:
  433. return lx.errorf("unexpected EOF; expected key separator '='")
  434. case r == '.':
  435. lx.ignore()
  436. return lexKeyNameStart
  437. case r == '=':
  438. lx.emit(itemKeyEnd)
  439. return lexSkip(lx, lexValue)
  440. default:
  441. return lx.errorf("expected '.' or '=', but got %q instead", r)
  442. }
  443. }
  444. // lexValue starts the consumption of a value anywhere a value is expected.
  445. // lexValue will ignore whitespace.
  446. // After a value is lexed, the last state on the next is popped and returned.
  447. func lexValue(lx *lexer) stateFn {
  448. // We allow whitespace to precede a value, but NOT newlines.
  449. // In array syntax, the array states are responsible for ignoring newlines.
  450. r := lx.next()
  451. switch {
  452. case isWhitespace(r):
  453. return lexSkip(lx, lexValue)
  454. case isDigit(r):
  455. lx.backup() // avoid an extra state and use the same as above
  456. return lexNumberOrDateStart
  457. }
  458. switch r {
  459. case '[':
  460. lx.ignore()
  461. lx.emit(itemArray)
  462. return lexArrayValue
  463. case '{':
  464. lx.ignore()
  465. lx.emit(itemInlineTableStart)
  466. return lexInlineTableValue
  467. case '"':
  468. if lx.accept('"') {
  469. if lx.accept('"') {
  470. lx.ignore() // Ignore """
  471. return lexMultilineString
  472. }
  473. lx.backup()
  474. }
  475. lx.ignore() // ignore the '"'
  476. return lexString
  477. case '\'':
  478. if lx.accept('\'') {
  479. if lx.accept('\'') {
  480. lx.ignore() // Ignore """
  481. return lexMultilineRawString
  482. }
  483. lx.backup()
  484. }
  485. lx.ignore() // ignore the "'"
  486. return lexRawString
  487. case '.': // special error case, be kind to users
  488. return lx.errorf("floats must start with a digit, not '.'")
  489. case 'i', 'n':
  490. if (lx.accept('n') && lx.accept('f')) || (lx.accept('a') && lx.accept('n')) {
  491. lx.emit(itemFloat)
  492. return lx.pop()
  493. }
  494. case '-', '+':
  495. return lexDecimalNumberStart
  496. }
  497. if unicode.IsLetter(r) {
  498. // Be permissive here; lexBool will give a nice error if the
  499. // user wrote something like
  500. // x = foo
  501. // (i.e. not 'true' or 'false' but is something else word-like.)
  502. lx.backup()
  503. return lexBool
  504. }
  505. if r == eof {
  506. return lx.errorf("unexpected EOF; expected value")
  507. }
  508. return lx.errorf("expected value but found %q instead", r)
  509. }
  510. // lexArrayValue consumes one value in an array. It assumes that '[' or ','
  511. // have already been consumed. All whitespace and newlines are ignored.
  512. func lexArrayValue(lx *lexer) stateFn {
  513. r := lx.next()
  514. switch {
  515. case isWhitespace(r) || isNL(r):
  516. return lexSkip(lx, lexArrayValue)
  517. case r == '#':
  518. lx.push(lexArrayValue)
  519. return lexCommentStart
  520. case r == ',':
  521. return lx.errorf("unexpected comma")
  522. case r == ']':
  523. return lexArrayEnd
  524. }
  525. lx.backup()
  526. lx.push(lexArrayValueEnd)
  527. return lexValue
  528. }
  529. // lexArrayValueEnd consumes everything between the end of an array value and
  530. // the next value (or the end of the array): it ignores whitespace and newlines
  531. // and expects either a ',' or a ']'.
  532. func lexArrayValueEnd(lx *lexer) stateFn {
  533. switch r := lx.next(); {
  534. case isWhitespace(r) || isNL(r):
  535. return lexSkip(lx, lexArrayValueEnd)
  536. case r == '#':
  537. lx.push(lexArrayValueEnd)
  538. return lexCommentStart
  539. case r == ',':
  540. lx.ignore()
  541. return lexArrayValue // move on to the next value
  542. case r == ']':
  543. return lexArrayEnd
  544. default:
  545. return lx.errorf("expected a comma (',') or array terminator (']'), but got %s", runeOrEOF(r))
  546. }
  547. }
  548. // lexArrayEnd finishes the lexing of an array.
  549. // It assumes that a ']' has just been consumed.
  550. func lexArrayEnd(lx *lexer) stateFn {
  551. lx.ignore()
  552. lx.emit(itemArrayEnd)
  553. return lx.pop()
  554. }
  555. // lexInlineTableValue consumes one key/value pair in an inline table.
  556. // It assumes that '{' or ',' have already been consumed. Whitespace is ignored.
  557. func lexInlineTableValue(lx *lexer) stateFn {
  558. r := lx.next()
  559. switch {
  560. case isWhitespace(r):
  561. return lexSkip(lx, lexInlineTableValue)
  562. case isNL(r):
  563. return lx.errorPrevLine(errLexInlineTableNL{})
  564. case r == '#':
  565. lx.push(lexInlineTableValue)
  566. return lexCommentStart
  567. case r == ',':
  568. return lx.errorf("unexpected comma")
  569. case r == '}':
  570. return lexInlineTableEnd
  571. }
  572. lx.backup()
  573. lx.push(lexInlineTableValueEnd)
  574. return lexKeyStart
  575. }
  576. // lexInlineTableValueEnd consumes everything between the end of an inline table
  577. // key/value pair and the next pair (or the end of the table):
  578. // it ignores whitespace and expects either a ',' or a '}'.
  579. func lexInlineTableValueEnd(lx *lexer) stateFn {
  580. switch r := lx.next(); {
  581. case isWhitespace(r):
  582. return lexSkip(lx, lexInlineTableValueEnd)
  583. case isNL(r):
  584. return lx.errorPrevLine(errLexInlineTableNL{})
  585. case r == '#':
  586. lx.push(lexInlineTableValueEnd)
  587. return lexCommentStart
  588. case r == ',':
  589. lx.ignore()
  590. lx.skip(isWhitespace)
  591. if lx.peek() == '}' {
  592. return lx.errorf("trailing comma not allowed in inline tables")
  593. }
  594. return lexInlineTableValue
  595. case r == '}':
  596. return lexInlineTableEnd
  597. default:
  598. return lx.errorf("expected a comma or an inline table terminator '}', but got %s instead", runeOrEOF(r))
  599. }
  600. }
  601. func runeOrEOF(r rune) string {
  602. if r == eof {
  603. return "end of file"
  604. }
  605. return "'" + string(r) + "'"
  606. }
  607. // lexInlineTableEnd finishes the lexing of an inline table.
  608. // It assumes that a '}' has just been consumed.
  609. func lexInlineTableEnd(lx *lexer) stateFn {
  610. lx.ignore()
  611. lx.emit(itemInlineTableEnd)
  612. return lx.pop()
  613. }
  614. // lexString consumes the inner contents of a string. It assumes that the
  615. // beginning '"' has already been consumed and ignored.
  616. func lexString(lx *lexer) stateFn {
  617. r := lx.next()
  618. switch {
  619. case r == eof:
  620. return lx.errorf(`unexpected EOF; expected '"'`)
  621. case isNL(r):
  622. return lx.errorPrevLine(errLexStringNL{})
  623. case r == '\\':
  624. lx.push(lexString)
  625. return lexStringEscape
  626. case r == '"':
  627. lx.backup()
  628. lx.emit(itemString)
  629. lx.next()
  630. lx.ignore()
  631. return lx.pop()
  632. }
  633. return lexString
  634. }
  635. // lexMultilineString consumes the inner contents of a string. It assumes that
  636. // the beginning '"""' has already been consumed and ignored.
  637. func lexMultilineString(lx *lexer) stateFn {
  638. r := lx.next()
  639. switch r {
  640. default:
  641. return lexMultilineString
  642. case eof:
  643. return lx.errorf(`unexpected EOF; expected '"""'`)
  644. case '\\':
  645. return lexMultilineStringEscape
  646. case '"':
  647. /// Found " → try to read two more "".
  648. if lx.accept('"') {
  649. if lx.accept('"') {
  650. /// Peek ahead: the string can contain " and "", including at the
  651. /// end: """str"""""
  652. /// 6 or more at the end, however, is an error.
  653. if lx.peek() == '"' {
  654. /// Check if we already lexed 5 's; if so we have 6 now, and
  655. /// that's just too many man!
  656. if strings.HasSuffix(lx.current(), `"""""`) {
  657. return lx.errorf(`unexpected '""""""'`)
  658. }
  659. lx.backup()
  660. lx.backup()
  661. return lexMultilineString
  662. }
  663. lx.backup() /// backup: don't include the """ in the item.
  664. lx.backup()
  665. lx.backup()
  666. lx.emit(itemMultilineString)
  667. lx.next() /// Read over ''' again and discard it.
  668. lx.next()
  669. lx.next()
  670. lx.ignore()
  671. return lx.pop()
  672. }
  673. lx.backup()
  674. }
  675. return lexMultilineString
  676. }
  677. }
  678. // lexRawString consumes a raw string. Nothing can be escaped in such a string.
  679. // It assumes that the beginning "'" has already been consumed and ignored.
  680. func lexRawString(lx *lexer) stateFn {
  681. r := lx.next()
  682. switch {
  683. default:
  684. return lexRawString
  685. case r == eof:
  686. return lx.errorf(`unexpected EOF; expected "'"`)
  687. case isNL(r):
  688. return lx.errorPrevLine(errLexStringNL{})
  689. case r == '\'':
  690. lx.backup()
  691. lx.emit(itemRawString)
  692. lx.next()
  693. lx.ignore()
  694. return lx.pop()
  695. }
  696. }
  697. // lexMultilineRawString consumes a raw string. Nothing can be escaped in such
  698. // a string. It assumes that the beginning "'''" has already been consumed and
  699. // ignored.
  700. func lexMultilineRawString(lx *lexer) stateFn {
  701. r := lx.next()
  702. switch r {
  703. default:
  704. return lexMultilineRawString
  705. case eof:
  706. return lx.errorf(`unexpected EOF; expected "'''"`)
  707. case '\'':
  708. /// Found ' → try to read two more ''.
  709. if lx.accept('\'') {
  710. if lx.accept('\'') {
  711. /// Peek ahead: the string can contain ' and '', including at the
  712. /// end: '''str'''''
  713. /// 6 or more at the end, however, is an error.
  714. if lx.peek() == '\'' {
  715. /// Check if we already lexed 5 's; if so we have 6 now, and
  716. /// that's just too many man!
  717. if strings.HasSuffix(lx.current(), "'''''") {
  718. return lx.errorf(`unexpected "''''''"`)
  719. }
  720. lx.backup()
  721. lx.backup()
  722. return lexMultilineRawString
  723. }
  724. lx.backup() /// backup: don't include the ''' in the item.
  725. lx.backup()
  726. lx.backup()
  727. lx.emit(itemRawMultilineString)
  728. lx.next() /// Read over ''' again and discard it.
  729. lx.next()
  730. lx.next()
  731. lx.ignore()
  732. return lx.pop()
  733. }
  734. lx.backup()
  735. }
  736. return lexMultilineRawString
  737. }
  738. }
  739. // lexMultilineStringEscape consumes an escaped character. It assumes that the
  740. // preceding '\\' has already been consumed.
  741. func lexMultilineStringEscape(lx *lexer) stateFn {
  742. // Handle the special case first:
  743. if isNL(lx.next()) {
  744. return lexMultilineString
  745. }
  746. lx.backup()
  747. lx.push(lexMultilineString)
  748. return lexStringEscape(lx)
  749. }
  750. func lexStringEscape(lx *lexer) stateFn {
  751. r := lx.next()
  752. switch r {
  753. case 'b':
  754. fallthrough
  755. case 't':
  756. fallthrough
  757. case 'n':
  758. fallthrough
  759. case 'f':
  760. fallthrough
  761. case 'r':
  762. fallthrough
  763. case '"':
  764. fallthrough
  765. case ' ', '\t':
  766. // Inside """ .. """ strings you can use \ to escape newlines, and any
  767. // amount of whitespace can be between the \ and \n.
  768. fallthrough
  769. case '\\':
  770. return lx.pop()
  771. case 'u':
  772. return lexShortUnicodeEscape
  773. case 'U':
  774. return lexLongUnicodeEscape
  775. }
  776. return lx.error(errLexEscape{r})
  777. }
  778. func lexShortUnicodeEscape(lx *lexer) stateFn {
  779. var r rune
  780. for i := 0; i < 4; i++ {
  781. r = lx.next()
  782. if !isHexadecimal(r) {
  783. return lx.errorf(
  784. `expected four hexadecimal digits after '\u', but got %q instead`,
  785. lx.current())
  786. }
  787. }
  788. return lx.pop()
  789. }
  790. func lexLongUnicodeEscape(lx *lexer) stateFn {
  791. var r rune
  792. for i := 0; i < 8; i++ {
  793. r = lx.next()
  794. if !isHexadecimal(r) {
  795. return lx.errorf(
  796. `expected eight hexadecimal digits after '\U', but got %q instead`,
  797. lx.current())
  798. }
  799. }
  800. return lx.pop()
  801. }
  802. // lexNumberOrDateStart processes the first character of a value which begins
  803. // with a digit. It exists to catch values starting with '0', so that
  804. // lexBaseNumberOrDate can differentiate base prefixed integers from other
  805. // types.
  806. func lexNumberOrDateStart(lx *lexer) stateFn {
  807. r := lx.next()
  808. switch r {
  809. case '0':
  810. return lexBaseNumberOrDate
  811. }
  812. if !isDigit(r) {
  813. // The only way to reach this state is if the value starts
  814. // with a digit, so specifically treat anything else as an
  815. // error.
  816. return lx.errorf("expected a digit but got %q", r)
  817. }
  818. return lexNumberOrDate
  819. }
  820. // lexNumberOrDate consumes either an integer, float or datetime.
  821. func lexNumberOrDate(lx *lexer) stateFn {
  822. r := lx.next()
  823. if isDigit(r) {
  824. return lexNumberOrDate
  825. }
  826. switch r {
  827. case '-', ':':
  828. return lexDatetime
  829. case '_':
  830. return lexDecimalNumber
  831. case '.', 'e', 'E':
  832. return lexFloat
  833. }
  834. lx.backup()
  835. lx.emit(itemInteger)
  836. return lx.pop()
  837. }
  838. // lexDatetime consumes a Datetime, to a first approximation.
  839. // The parser validates that it matches one of the accepted formats.
  840. func lexDatetime(lx *lexer) stateFn {
  841. r := lx.next()
  842. if isDigit(r) {
  843. return lexDatetime
  844. }
  845. switch r {
  846. case '-', ':', 'T', 't', ' ', '.', 'Z', 'z', '+':
  847. return lexDatetime
  848. }
  849. lx.backup()
  850. lx.emitTrim(itemDatetime)
  851. return lx.pop()
  852. }
  853. // lexHexInteger consumes a hexadecimal integer after seeing the '0x' prefix.
  854. func lexHexInteger(lx *lexer) stateFn {
  855. r := lx.next()
  856. if isHexadecimal(r) {
  857. return lexHexInteger
  858. }
  859. switch r {
  860. case '_':
  861. return lexHexInteger
  862. }
  863. lx.backup()
  864. lx.emit(itemInteger)
  865. return lx.pop()
  866. }
  867. // lexOctalInteger consumes an octal integer after seeing the '0o' prefix.
  868. func lexOctalInteger(lx *lexer) stateFn {
  869. r := lx.next()
  870. if isOctal(r) {
  871. return lexOctalInteger
  872. }
  873. switch r {
  874. case '_':
  875. return lexOctalInteger
  876. }
  877. lx.backup()
  878. lx.emit(itemInteger)
  879. return lx.pop()
  880. }
  881. // lexBinaryInteger consumes a binary integer after seeing the '0b' prefix.
  882. func lexBinaryInteger(lx *lexer) stateFn {
  883. r := lx.next()
  884. if isBinary(r) {
  885. return lexBinaryInteger
  886. }
  887. switch r {
  888. case '_':
  889. return lexBinaryInteger
  890. }
  891. lx.backup()
  892. lx.emit(itemInteger)
  893. return lx.pop()
  894. }
  895. // lexDecimalNumber consumes a decimal float or integer.
  896. func lexDecimalNumber(lx *lexer) stateFn {
  897. r := lx.next()
  898. if isDigit(r) {
  899. return lexDecimalNumber
  900. }
  901. switch r {
  902. case '.', 'e', 'E':
  903. return lexFloat
  904. case '_':
  905. return lexDecimalNumber
  906. }
  907. lx.backup()
  908. lx.emit(itemInteger)
  909. return lx.pop()
  910. }
  911. // lexDecimalNumber consumes the first digit of a number beginning with a sign.
  912. // It assumes the sign has already been consumed. Values which start with a sign
  913. // are only allowed to be decimal integers or floats.
  914. //
  915. // The special "nan" and "inf" values are also recognized.
  916. func lexDecimalNumberStart(lx *lexer) stateFn {
  917. r := lx.next()
  918. // Special error cases to give users better error messages
  919. switch r {
  920. case 'i':
  921. if !lx.accept('n') || !lx.accept('f') {
  922. return lx.errorf("invalid float: '%s'", lx.current())
  923. }
  924. lx.emit(itemFloat)
  925. return lx.pop()
  926. case 'n':
  927. if !lx.accept('a') || !lx.accept('n') {
  928. return lx.errorf("invalid float: '%s'", lx.current())
  929. }
  930. lx.emit(itemFloat)
  931. return lx.pop()
  932. case '0':
  933. p := lx.peek()
  934. switch p {
  935. case 'b', 'o', 'x':
  936. return lx.errorf("cannot use sign with non-decimal numbers: '%s%c'", lx.current(), p)
  937. }
  938. case '.':
  939. return lx.errorf("floats must start with a digit, not '.'")
  940. }
  941. if isDigit(r) {
  942. return lexDecimalNumber
  943. }
  944. return lx.errorf("expected a digit but got %q", r)
  945. }
  946. // lexBaseNumberOrDate differentiates between the possible values which
  947. // start with '0'. It assumes that before reaching this state, the initial '0'
  948. // has been consumed.
  949. func lexBaseNumberOrDate(lx *lexer) stateFn {
  950. r := lx.next()
  951. // Note: All datetimes start with at least two digits, so we don't
  952. // handle date characters (':', '-', etc.) here.
  953. if isDigit(r) {
  954. return lexNumberOrDate
  955. }
  956. switch r {
  957. case '_':
  958. // Can only be decimal, because there can't be an underscore
  959. // between the '0' and the base designator, and dates can't
  960. // contain underscores.
  961. return lexDecimalNumber
  962. case '.', 'e', 'E':
  963. return lexFloat
  964. case 'b':
  965. r = lx.peek()
  966. if !isBinary(r) {
  967. lx.errorf("not a binary number: '%s%c'", lx.current(), r)
  968. }
  969. return lexBinaryInteger
  970. case 'o':
  971. r = lx.peek()
  972. if !isOctal(r) {
  973. lx.errorf("not an octal number: '%s%c'", lx.current(), r)
  974. }
  975. return lexOctalInteger
  976. case 'x':
  977. r = lx.peek()
  978. if !isHexadecimal(r) {
  979. lx.errorf("not a hexidecimal number: '%s%c'", lx.current(), r)
  980. }
  981. return lexHexInteger
  982. }
  983. lx.backup()
  984. lx.emit(itemInteger)
  985. return lx.pop()
  986. }
  987. // lexFloat consumes the elements of a float. It allows any sequence of
  988. // float-like characters, so floats emitted by the lexer are only a first
  989. // approximation and must be validated by the parser.
  990. func lexFloat(lx *lexer) stateFn {
  991. r := lx.next()
  992. if isDigit(r) {
  993. return lexFloat
  994. }
  995. switch r {
  996. case '_', '.', '-', '+', 'e', 'E':
  997. return lexFloat
  998. }
  999. lx.backup()
  1000. lx.emit(itemFloat)
  1001. return lx.pop()
  1002. }
  1003. // lexBool consumes a bool string: 'true' or 'false.
  1004. func lexBool(lx *lexer) stateFn {
  1005. var rs []rune
  1006. for {
  1007. r := lx.next()
  1008. if !unicode.IsLetter(r) {
  1009. lx.backup()
  1010. break
  1011. }
  1012. rs = append(rs, r)
  1013. }
  1014. s := string(rs)
  1015. switch s {
  1016. case "true", "false":
  1017. lx.emit(itemBool)
  1018. return lx.pop()
  1019. }
  1020. return lx.errorf("expected value but found %q instead", s)
  1021. }
  1022. // lexCommentStart begins the lexing of a comment. It will emit
  1023. // itemCommentStart and consume no characters, passing control to lexComment.
  1024. func lexCommentStart(lx *lexer) stateFn {
  1025. lx.ignore()
  1026. lx.emit(itemCommentStart)
  1027. return lexComment
  1028. }
  1029. // lexComment lexes an entire comment. It assumes that '#' has been consumed.
  1030. // It will consume *up to* the first newline character, and pass control
  1031. // back to the last state on the stack.
  1032. func lexComment(lx *lexer) stateFn {
  1033. switch r := lx.next(); {
  1034. case isNL(r) || r == eof:
  1035. lx.backup()
  1036. lx.emit(itemText)
  1037. return lx.pop()
  1038. default:
  1039. return lexComment
  1040. }
  1041. }
  1042. // lexSkip ignores all slurped input and moves on to the next state.
  1043. func lexSkip(lx *lexer, nextState stateFn) stateFn {
  1044. lx.ignore()
  1045. return nextState
  1046. }
  1047. func (s stateFn) String() string {
  1048. name := runtime.FuncForPC(reflect.ValueOf(s).Pointer()).Name()
  1049. if i := strings.LastIndexByte(name, '.'); i > -1 {
  1050. name = name[i+1:]
  1051. }
  1052. if s == nil {
  1053. name = "<nil>"
  1054. }
  1055. return name + "()"
  1056. }
  1057. func (itype itemType) String() string {
  1058. switch itype {
  1059. case itemError:
  1060. return "Error"
  1061. case itemNIL:
  1062. return "NIL"
  1063. case itemEOF:
  1064. return "EOF"
  1065. case itemText:
  1066. return "Text"
  1067. case itemString, itemRawString, itemMultilineString, itemRawMultilineString:
  1068. return "String"
  1069. case itemBool:
  1070. return "Bool"
  1071. case itemInteger:
  1072. return "Integer"
  1073. case itemFloat:
  1074. return "Float"
  1075. case itemDatetime:
  1076. return "DateTime"
  1077. case itemTableStart:
  1078. return "TableStart"
  1079. case itemTableEnd:
  1080. return "TableEnd"
  1081. case itemKeyStart:
  1082. return "KeyStart"
  1083. case itemKeyEnd:
  1084. return "KeyEnd"
  1085. case itemArray:
  1086. return "Array"
  1087. case itemArrayEnd:
  1088. return "ArrayEnd"
  1089. case itemCommentStart:
  1090. return "CommentStart"
  1091. case itemInlineTableStart:
  1092. return "InlineTableStart"
  1093. case itemInlineTableEnd:
  1094. return "InlineTableEnd"
  1095. }
  1096. panic(fmt.Sprintf("BUG: Unknown type '%d'.", int(itype)))
  1097. }
  1098. func (item item) String() string {
  1099. return fmt.Sprintf("(%s, %s)", item.typ.String(), item.val)
  1100. }
  1101. func isWhitespace(r rune) bool { return r == '\t' || r == ' ' }
  1102. func isNL(r rune) bool { return r == '\n' || r == '\r' }
  1103. func isControl(r rune) bool { // Control characters except \t, \r, \n
  1104. switch r {
  1105. case '\t', '\r', '\n':
  1106. return false
  1107. default:
  1108. return (r >= 0x00 && r <= 0x1f) || r == 0x7f
  1109. }
  1110. }
  1111. func isDigit(r rune) bool { return r >= '0' && r <= '9' }
  1112. func isBinary(r rune) bool { return r == '0' || r == '1' }
  1113. func isOctal(r rune) bool { return r >= '0' && r <= '7' }
  1114. func isHexadecimal(r rune) bool {
  1115. return (r >= '0' && r <= '9') || (r >= 'a' && r <= 'f') || (r >= 'A' && r <= 'F')
  1116. }
  1117. func isBareKeyChar(r rune) bool {
  1118. return (r >= 'A' && r <= 'Z') ||
  1119. (r >= 'a' && r <= 'z') ||
  1120. (r >= '0' && r <= '9') ||
  1121. r == '_' || r == '-'
  1122. }