// TOML lexer.
//
// Written using the principles developed by Rob Pike in
// http://www.youtube.com/watch?v=HxaD_trXwRE

package toml

import (
	"fmt"
	"regexp"
	"strconv"
	"strings"
	"unicode/utf8"
)

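// dateRegexp matches RFC 3339 datetimes; it is compiled once in init below.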
var dateRegexp *regexp.Regexp

// tomlLexStateFn is a state function: it scans some input and returns
// the next state, or nil to stop the lexer.
type tomlLexStateFn func() tomlLexStateFn

// tomlLexer holds the state of the scanner.
type tomlLexer struct {
	input  string     // text being scanned
	start  int        // start position of the current token
	pos    int        // current read position in the input
	width  int        // width of the last rune read, for backup
	tokens chan token // channel on which tokens are emitted
	depth  int        // bracket nesting depth while lexing rvalues
	line   int        // current line number (1-based)
	col    int        // current column number (1-based)
}

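// run executes state functions until one returns nil, then closes the
// token channel.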
func (l *tomlLexer) run() {
	for state := l.lexVoid; state != nil; {
		state = state()
	}
	close(l.tokens)
}

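// nextStart advances the token start marker to the current position,
// updating the line/col counters for every rune skipped over.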
func (l *tomlLexer) nextStart() {
	// iterate by runes (utf8 characters)
	// search for newlines and advance line/col counts
	for i := l.start; i < l.pos; {
		r, width := utf8.DecodeRuneInString(l.input[i:])
		if r == '\n' {
			l.line++
			l.col = 1
		} else {
			l.col++
		}
		i += width
	}
	// advance start position to next token
	l.start = l.pos
}

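// emit sends the input between start and pos as a token of type t.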
func (l *tomlLexer) emit(t tokenType) {
	l.tokens <- token{
		Position: Position{l.line, l.col},
		typ:      t,
		val:      l.input[l.start:l.pos],
	}
	l.nextStart()
}

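// emitWithValue sends a token of type t carrying an explicit value
// instead of the raw input slice.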
func (l *tomlLexer) emitWithValue(t tokenType, value string) {
	l.tokens <- token{
		Position: Position{l.line, l.col},
		typ:      t,
		val:      value,
	}
	l.nextStart()
}

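// next consumes and returns the next rune, or eof at the end of input.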
func (l *tomlLexer) next() rune {
	if l.pos >= len(l.input) {
		l.width = 0
		return eof
	}
	var r rune
	r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
	l.pos += l.width
	return r
}

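// ignore discards the input read since the last token.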
func (l *tomlLexer) ignore() {
	l.nextStart()
}

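// backup steps back one rune; it may only be called once per call to next.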
func (l *tomlLexer) backup() {
	l.pos -= l.width
}

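// errorf emits an error token with a formatted message and halts the
// lexer by returning nil as the next state.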
func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn {
	l.tokens <- token{
		Position: Position{l.line, l.col},
		typ:      tokenError,
		val:      fmt.Sprintf(format, args...),
	}
	return nil
}

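// peek returns the next rune without consuming it.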
func (l *tomlLexer) peek() rune {
	r := l.next()
	l.backup()
	return r
}

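// accept consumes the next rune if it appears in valid, reporting whether it did.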
func (l *tomlLexer) accept(valid string) bool {
	if strings.ContainsRune(valid, l.next()) {
		return true
	}
	l.backup()
	return false
}

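// follow reports whether the remaining input starts with next.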
func (l *tomlLexer) follow(next string) bool {
	return strings.HasPrefix(l.input[l.pos:], next)
}

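// lexVoid is the top-level state: outside of any key, value or group.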
func (l *tomlLexer) lexVoid() tomlLexStateFn {
	for {
		next := l.peek()
		switch next {
		case '[':
			return l.lexKeyGroup
		case '#':
			return l.lexComment
		case '=':
			return l.lexEqual
		}

		if isSpace(next) {
			l.ignore()
		}

		if l.depth > 0 {
			return l.lexRvalue
		}

		if isKeyStartChar(next) {
			return l.lexKey
		}

		if l.next() == eof {
			break
		}
	}

	l.emit(tokenEOF)
	return nil
}

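// lexRvalue scans the value side of a key = value pair.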
func (l *tomlLexer) lexRvalue() tomlLexStateFn {
	for {
		next := l.peek()
		switch next {
		case '.':
			return l.errorf("cannot start float with a dot")
		case '=':
			return l.errorf("cannot have multiple equals for the same key")
		case '[':
			l.depth++
			return l.lexLeftBracket
		case ']':
			l.depth--
			return l.lexRightBracket
		case '#':
			return l.lexComment
		case '"':
			return l.lexString
		case '\'':
			return l.lexLiteralString
		case ',':
			return l.lexComma
		case '\n':
			l.ignore()
			l.pos++
			if l.depth == 0 {
				return l.lexVoid
			}
			return l.lexRvalue
		}

		if l.follow("true") {
			return l.lexTrue
		}

		if l.follow("false") {
			return l.lexFalse
		}

		if isAlphanumeric(next) {
			return l.lexKey
		}

		dateMatch := dateRegexp.FindString(l.input[l.pos:])
		if dateMatch != "" {
			l.ignore()
			l.pos += len(dateMatch)
			return l.lexDate
		}

		if next == '+' || next == '-' || isDigit(next) {
			return l.lexNumber
		}

		if isSpace(next) {
			l.ignore()
		}

		if l.next() == eof {
			break
		}
	}

	l.emit(tokenEOF)
	return nil
}

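// lexDate emits the date that lexRvalue has already scanned past.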
func (l *tomlLexer) lexDate() tomlLexStateFn {
	l.emit(tokenDate)
	return l.lexRvalue
}

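// lexTrue consumes the literal "true".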
func (l *tomlLexer) lexTrue() tomlLexStateFn {
	l.ignore()
	l.pos += 4
	l.emit(tokenTrue)
	return l.lexRvalue
}

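// lexFalse consumes the literal "false".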
func (l *tomlLexer) lexFalse() tomlLexStateFn {
	l.ignore()
	l.pos += 5
	l.emit(tokenFalse)
	return l.lexRvalue
}

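// lexEqual consumes '=' and switches to scanning the value.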
func (l *tomlLexer) lexEqual() tomlLexStateFn {
	l.ignore()
	l.accept("=")
	l.emit(tokenEqual)
	return l.lexRvalue
}

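// lexComma consumes a ',' separator inside an array.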
func (l *tomlLexer) lexComma() tomlLexStateFn {
	l.ignore()
	l.accept(",")
	l.emit(tokenComma)
	return l.lexRvalue
}

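// lexKey scans a key up to the first non-key character.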
func (l *tomlLexer) lexKey() tomlLexStateFn {
	l.ignore()
	for r := l.next(); isKeyChar(r); r = l.next() {
		if r == '#' {
			return l.errorf("keys cannot contain # character")
		}
	}
	l.backup()
	l.emit(tokenKey)
	return l.lexVoid
}

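// lexComment skips input up to the end of the line.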
func (l *tomlLexer) lexComment() tomlLexStateFn {
	for {
		next := l.next()
		if next == '\n' || next == eof {
			break
		}
	}
	l.ignore()
	return l.lexVoid
}

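// lexLeftBracket consumes a '[' opening an array value.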
func (l *tomlLexer) lexLeftBracket() tomlLexStateFn {
	l.ignore()
	l.pos++
	l.emit(tokenLeftBracket)
	return l.lexRvalue
}

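// lexLiteralString scans a string quoted with single quotes, in which no
// escape sequences are interpreted; ''' opens a multi-line literal string.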
func (l *tomlLexer) lexLiteralString() tomlLexStateFn {
	l.pos++
	l.ignore()
	growingString := ""

	// handle special case for triple-quote
	terminator := "'"
	if l.follow("''") {
		l.pos += 2
		l.ignore()
		terminator = "'''"

		// special case: discard leading newline
		if l.peek() == '\n' {
			l.pos++
			l.ignore()
		}
	}

	// find end of string
	for {
		if l.follow(terminator) {
			l.emitWithValue(tokenString, growingString)
			l.pos += len(terminator)
			l.ignore()
			return l.lexRvalue
		}

		growingString += string(l.peek())

		if l.next() == eof {
			break
		}
	}

	return l.errorf("unclosed string")
}

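// lexString scans a string quoted with double quotes, decoding escape
// sequences as it goes; """ opens a multi-line string.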
func (l *tomlLexer) lexString() tomlLexStateFn {
	l.pos++
	l.ignore()
	growingString := ""

	// handle special case for triple-quote
	terminator := "\""
	if l.follow("\"\"") {
		l.pos += 2
		l.ignore()
		terminator = "\"\"\""

		// special case: discard leading newline
		if l.peek() == '\n' {
			l.pos++
			l.ignore()
		}
	}

	for {
		if l.follow(terminator) {
			l.emitWithValue(tokenString, growingString)
			l.pos += len(terminator)
			l.ignore()
			return l.lexRvalue
		}

		if l.follow("\\") {
			l.pos++
			switch l.peek() {
			case '\r', '\n', '\t', ' ':
				// skip all whitespace chars following backslash
				l.pos++
				for strings.ContainsRune("\r\n\t ", l.peek()) {
					l.pos++
				}
				l.pos--
			case '"':
				growingString += "\""
			case 'n':
				growingString += "\n"
			case 'b':
				growingString += "\b"
			case 'f':
				growingString += "\f"
			case '/':
				growingString += "/"
			case 't':
				growingString += "\t"
			case 'r':
				growingString += "\r"
			case '\\':
				growingString += "\\"
			case 'u':
				l.pos++
				code := ""
				for i := 0; i < 4; i++ {
					c := l.peek()
					l.pos++
					if !isHexDigit(c) {
						return l.errorf("unfinished unicode escape")
					}
					code = code + string(c)
				}
				l.pos--
				intcode, err := strconv.ParseInt(code, 16, 32)
				if err != nil {
					return l.errorf("invalid unicode escape: \\u%s", code)
				}
				growingString += string(rune(intcode))
			default:
				return l.errorf("invalid escape sequence: \\%c", l.peek())
			}
		} else {
			growingString += string(l.peek())
		}

		if l.next() == eof {
			break
		}
	}

	return l.errorf("unclosed string")
}

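// lexKeyGroup handles a '[' at the start of a line: either a [group]
// header or an [[array of groups]] header.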
func (l *tomlLexer) lexKeyGroup() tomlLexStateFn {
	l.ignore()
	l.pos++

	if l.peek() == '[' {
		// token '[[' signifies an array of anonymous key groups
		l.pos++
		l.emit(tokenDoubleLeftBracket)
		return l.lexInsideKeyGroupArray
	}
	// vanilla key group
	l.emit(tokenLeftBracket)
	return l.lexInsideKeyGroup
}

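// lexInsideKeyGroupArray scans a group name up to the closing ']]'.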
func (l *tomlLexer) lexInsideKeyGroupArray() tomlLexStateFn {
	for {
		if l.peek() == ']' {
			if l.pos > l.start {
				l.emit(tokenKeyGroupArray)
			}
			l.ignore()
			l.pos++
			if l.peek() != ']' {
				break // a '[[' group must be closed by ']]'
			}
			l.pos++
			l.emit(tokenDoubleRightBracket)
			return l.lexVoid
		} else if l.peek() == '[' {
			return l.errorf("group name cannot contain '['")
		}

		if l.next() == eof {
			break
		}
	}
	return l.errorf("unclosed key group array")
}

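// lexInsideKeyGroup scans a group name up to the closing ']'.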
func (l *tomlLexer) lexInsideKeyGroup() tomlLexStateFn {
	for {
		if l.peek() == ']' {
			if l.pos > l.start {
				l.emit(tokenKeyGroup)
			}
			l.ignore()
			l.pos++
			l.emit(tokenRightBracket)
			return l.lexVoid
		} else if l.peek() == '[' {
			return l.errorf("group name cannot contain '['")
		}

		if l.next() == eof {
			break
		}
	}
	return l.errorf("unclosed key group")
}

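// lexRightBracket consumes a ']' closing an array value.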
func (l *tomlLexer) lexRightBracket() tomlLexStateFn {
	l.ignore()
	l.pos++
	l.emit(tokenRightBracket)
	return l.lexRvalue
}

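// lexNumber scans an integer or a float, including optional sign and exponent.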
func (l *tomlLexer) lexNumber() tomlLexStateFn {
	l.ignore()
	if !l.accept("+") {
		l.accept("-")
	}
	pointSeen := false
	expSeen := false
	digitSeen := false
	for {
		next := l.next()
		if next == '.' {
			if pointSeen {
				return l.errorf("cannot have two dots in one float")
			}
			if !isDigit(l.peek()) {
				return l.errorf("float cannot end with a dot")
			}
			pointSeen = true
		} else if next == 'e' || next == 'E' {
			expSeen = true
			if !l.accept("+") {
				l.accept("-")
			}
		} else if isDigit(next) {
			digitSeen = true
		} else {
			l.backup()
			break
		}
		if pointSeen && !digitSeen {
			return l.errorf("cannot start float with a dot")
		}
	}

	if !digitSeen {
		return l.errorf("no digit in that number")
	}
	if pointSeen || expSeen {
		l.emit(tokenFloat)
	} else {
		l.emit(tokenInteger)
	}
	return l.lexRvalue
}

func init() {
	dateRegexp = regexp.MustCompile(`^\d{1,4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,9})?(Z|[+-]\d{2}:\d{2})`)
}

// lexToml is the entry point: it starts the lexer in a background
// goroutine and returns the channel on which tokens will arrive.
func lexToml(input string) chan token {
	l := &tomlLexer{
		input:  input,
		tokens: make(chan token),
		line:   1,
		col:    1,
	}
	go l.run()
	return l.tokens
}
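
// A minimal usage sketch: the typ and val fields are the ones populated by
// emit and errorf above, and run closes the channel after tokenEOF is sent,
// so ranging over the channel terminates.
//
//	for tok := range lexToml(`key = "value"`) {
//		if tok.typ == tokenError {
//			// lexing failed; the message is in tok.val
//			break
//		}
//	}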