Initial commit
diff --git a/lex.go b/lex.go new file mode 100644 index 0000000..4b9b41a --- /dev/null +++ b/lex.go
@@ -0,0 +1,295 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package properties + +import ( + "fmt" + // "log" + "strings" + // "unicode" + "strconv" + "unicode/utf8" +) + +// item represents a token or text string returned from the scanner. +type item struct { + typ itemType // The type of this item. + pos Pos // The starting position, in bytes, of this item in the input string. + val string // The value of this item. +} + +func (i item) String() string { + switch { + case i.typ == itemEOF: + return "EOF" + case i.typ == itemError: + return i.val + case len(i.val) > 10: + return fmt.Sprintf("%.10q...", i.val) + } + return fmt.Sprintf("%q", i.val) +} + +type Pos int + +// itemType identifies the type of lex items. +type itemType int + +const ( + itemError itemType = iota // error occurred; value is text of error + itemEOF + itemDelim // a = or : delimiter char + itemKey // a key + itemValue // a value +) + +const eof = -1 + +// stateFn represents the state of the scanner as a function that returns the next state. +type stateFn func(*lexer) stateFn + +// lexer holds the state of the scanner. +type lexer struct { + input string // the string being scanned + state stateFn // the next lexing function to enter + pos Pos // current position in the input + start Pos // start position of this item + width Pos // width of last rune read from input + lastPos Pos // position of most recent item returned by nextItem + items chan item // channel of scanned items +} + +// next returns the next rune in the input. +func (l *lexer) next() rune { + if int(l.pos) >= len(l.input) { + l.width = 0 + return eof + } + r, w := utf8.DecodeRuneInString(l.input[l.pos:]) + l.width = Pos(w) + l.pos += l.width + return r +} + +// peek returns but does not consume the next rune in the input. 
+func (l *lexer) peek() rune { + r := l.next() + l.backup() + return r +} + +// backup steps back one rune. Can only be called once per call of next. +func (l *lexer) backup() { + l.pos -= l.width +} + +// emit passes an item back to the client. +func (l *lexer) emit(t itemType) { + l.emitWithValue(t, l.input[l.start:l.pos]) +} + +// emitWithValue passes an item with a specific value back to the client. +func (l *lexer) emitWithValue(t itemType, value string) { + item := item{t, l.start, value} + // log.Printf("lex.emit: %s", item) + l.items <- item + l.start = l.pos +} + +// ignore skips over the pending input before this point. +func (l *lexer) ignore() { + l.start = l.pos +} + +// accept consumes the next rune if it's from the valid set. +func (l *lexer) accept(valid string) bool { + if strings.IndexRune(valid, l.next()) >= 0 { + return true + } + l.backup() + return false +} + +// acceptRun consumes a run of runes from the valid set. +func (l *lexer) acceptRun(valid string) { + for strings.IndexRune(valid, l.next()) >= 0 { + } + l.backup() +} + +// accept until consumes runes until a termination rune. +func (l *lexer) acceptUntil(term rune) { + for r := l.next(); r != eof && r != term; { + } +} + +// hasText returns true if the current parsed text is not empty. +func (l *lexer) isNotEmpty() bool { + return l.pos > l.start +} + +// lineNumber reports which line we're on, based on the position of +// the previous item returned by nextItem. Doing it this way +// means we don't have to worry about peek double counting. +func (l *lexer) lineNumber() int { + return 1 + strings.Count(l.input[:l.lastPos], "\n") +} + +// errorf returns an error token and terminates the scan by passing +// back a nil pointer that will be the next state, terminating l.nextItem. +func (l *lexer) errorf(format string, args ...interface{}) stateFn { + l.items <- item{itemError, l.start, fmt.Sprintf(format, args...)} + return nil +} + +// nextItem returns the next item from the input. 
+func (l *lexer) nextItem() item { + item := <-l.items + l.lastPos = item.pos + return item +} + +// lex creates a new scanner for the input string. +func lex(input string) *lexer { + l := &lexer{ + input: input, + items: make(chan item), + } + go l.run() + return l +} + +// run runs the state machine for the lexer. +func (l *lexer) run() { + for l.state = lexKey(l); l.state != nil; { + l.state = l.state(l) + } +} + +// state functions +// TODO: handle comments +// TODO: handle multi-line values +// TODO: handle unicode literals + +// lexKey scans the key up to a delimiter +func lexKey(l *lexer) stateFn { + if l.peek() == eof { + l.emit(itemEOF) + return nil + } + + runes := make([]rune, 0, 32) + +Loop: + for { + switch r := l.next(); { + + case r == '\\': + switch r = l.next(); { + + // escaped key termination chars + case r == ' ' || r == ':' || r == '=': + runes = append(runes, r) + + // unicode literals + case r == 'u' || r == 'U': + r, err := scanUnicodeLiteral(l) + if err != nil { + return l.errorf(err.Error()) + } + runes = append(runes, r) + + // EOF + case r == eof: + return l.errorf("premature EOF") + + // everything else is an error + default: + return l.errorf("invalid escape sequence %s", string(r)) + } + + // terminate the key (same as escapes above) + case r == ' ' || r == ':' || r == '=': + l.backup() + break Loop + + case r == eof: + return l.errorf("premature EOF") + + default: + runes = append(runes, r) + } + } + + if len(runes) > 0 { + l.emitWithValue(itemKey, string(runes)) + } + + // ignore trailing spaces + l.acceptRun(" ") + l.ignore() + + return lexDelim +} + +// lexDelim scans the delimiter. We expect to be just before the delimiter +func lexDelim(l *lexer) stateFn { + if l.next() == eof { + return l.errorf("premature EOF") + } + l.emit(itemDelim) + return lexValue +} + +// lexValue scans text until the end of the line. 
We expect to be just after the delimiter +func lexValue(l *lexer) stateFn { + // ignore leading spaces + l.acceptRun(" ") + l.ignore() + + runes := make([]rune, 0, 128) + for { + switch r := l.next(); { + // TODO: handle multiline with indent on subsequent lines + // TODO: handle unicode literals \uXXXX and \Uxxxx + // TODO: handle escaped chars \n, \r, \t and \\ + case r == '\n': + l.emitWithValue(itemValue, string(runes)) + + // ignore the new line + l.ignore() + return lexKey + + case r == eof: + l.emitWithValue(itemValue, string(runes)) + l.emit(itemEOF) + return nil + + default: + runes = append(runes, r) + } + } +} + +// scans the digits of the unicode literal in \uXXXX form. +// We expect to be before the first digit +func scanUnicodeLiteral(l *lexer) (rune, error) { + d := make([]rune, 4) + for i := 0; i < 4; i++ { + d[i] = l.next() + if d[i] == eof { + return eof, nil + } + } + + u := string(d) + s, err := strconv.Unquote(fmt.Sprintf("'\\u%s'", u)) + if err != nil { + return 0, fmt.Errorf("invalid unicode literal %s", u) + } + + r, _ := utf8.DecodeRuneInString(s) + return r, nil +}
diff --git a/parser.go b/parser.go new file mode 100644 index 0000000..c2a6e35 --- /dev/null +++ b/parser.go
@@ -0,0 +1,80 @@ +package properties + +import ( + "fmt" + // "log" + "runtime" +) + +type parser struct { + lex *lexer +} + +func newParser() *parser { + return &parser{} +} + +func (p *parser) Parse(input string) (props *Properties, err error) { + // log.Printf("Parsing input '%s'", input) + + defer p.recover(&err) + p.lex = lex(input) + props = &Properties{m:make(map[string]string)} + + for { + token := p.expectOneOf(itemKey, itemEOF) + if token.typ == itemEOF { + break + } + key := token.val + p.expect(itemDelim) + token = p.expect(itemValue) + props.Set(key, token.val) + } + + return props, nil +} + +func (p *parser) errorf(format string, args ...interface{}) { + format = fmt.Sprintf("properties: Line %d: %s", p.lex.lineNumber(), format) + panic(fmt.Errorf(format, args...)) +} + +func (p *parser) expect(expected itemType) (token item) { + token = p.lex.nextItem() + if token.typ != expected { + p.unexpected(token) + } + return token +} + +func (p *parser) expectOneOf(expected1, expected2 itemType) (token item) { + token = p.lex.nextItem() + if token.typ != expected1 && token.typ != expected2 { + p.unexpected(token) + } + return token +} + +func (p *parser) unexpected(token item) { + p.errorf(token.String()) +} + +// recover is the handler that turns panics into returns from the top level of Parse. +func (p *parser) recover(errp *error) { + e := recover() + if e != nil { + if _, ok := e.(runtime.Error); ok { + panic(e) + } + // if p != nil { + // p.stopParse() + // } + *errp = e.(error) + } + return +} + +// func (p *parser) stopParse() { + +// }
diff --git a/properties.go b/properties.go new file mode 100644 index 0000000..2b2a5ef --- /dev/null +++ b/properties.go
@@ -0,0 +1,77 @@ +package properties + +import ( + "fmt" + "io" + "io/ioutil" + "unicode/utf8" +) + +type Properties struct { + m map[string]string +} + +// Reads bytes fully and parses them as ISO-8859-1. +func NewProperties(r io.Reader) (*Properties, error) { + buf, err := ioutil.ReadAll(r) + if err != nil { + return nil, err + } + + return NewPropertiesFromString(toUtf8(buf)) +} + +func NewPropertiesFromISO8859_1(buf []byte) (*Properties, error) { + return newParser().Parse(toUtf8(buf)) +} + +// Java properties spec says that .properties files must be ISO-8859-1 +// encoded. Therefore, we first convert them to UTF-8 and then parse them. +func NewPropertiesFromString(input string) (*Properties, error) { + if err := isISO8859_1(input); err != nil { + return nil, err + } + return newParser().Parse(input) +} + +// returns the value for the given key +func (p *Properties) Get(key string) (value string, ok bool) { + value, ok = p.m[key] + return value, ok +} + +// sets the property key = value and returns the previous value if exists or an empty string +func (p *Properties) Set(key, value string) (prevValue string) { + prevValue, ok := p.m[key] + if !ok { + prevValue = "" + } + + p.m[key] = value + return prevValue +} + +// returns the number of keys +func (p *Properties) Len() int { + return len(p.m) +} + +// taken from +// http://stackoverflow.com/questions/13510458/golang-convert-iso8859-1-to-utf8 +func toUtf8(iso8859_1_buf []byte) string { + buf := make([]rune, len(iso8859_1_buf)) + for i, b := range iso8859_1_buf { + buf[i] = rune(b) + } + return string(buf) +} + +func isISO8859_1(s string) error { + for i := 0; i < len(s); i++ { + r, w := utf8.DecodeRuneInString(s[i:]) + if w > 1 || r > 255 { + return fmt.Errorf("invalid ISO-8859-1 input. %s", s) + } + } + return nil +}
diff --git a/properties_test.go b/properties_test.go new file mode 100644 index 0000000..b8ce5dd --- /dev/null +++ b/properties_test.go
package properties

import (
	"fmt"
	"strings"
	"testing"

	. "github.scm.corp.ebay.com/ecg-marktplaats/cas-go/third_party/launchpad.net/gocheck"
)

// Test hooks the gocheck suites into the standard "go test" runner.
func Test(t *testing.T) { TestingT(t) }

// LoadSuite groups the parsing tests for the properties package.
type LoadSuite struct{}

var _ = Suite(&LoadSuite{})

func (l *LoadSuite) TestKeyWithEmptyValue(c *C) {
	testAllDelimiterCombinations(c, "key", "")
}

func (l *LoadSuite) TestOneKeyValue(c *C) {
	testAllDelimiterCombinations(c, "key", "value")
}

// Trailing spaces in a value are significant and must be preserved.
func (l *LoadSuite) TestValueWithTrailingSpaces(c *C) {
	testAllDelimiterCombinations(c, "key", "value ")
}

// Escaped ' ', ':' and '=' must become part of the key rather than
// terminating it.
func (l *LoadSuite) TestEscapedCharsInKey(c *C) {
	testKeyValue(c, "k\\ e\\:y\\= = value", "k e:y=", "value")
}

// Both \uXXXX and \UXXXX spellings decode to the same code point.
func (l *LoadSuite) TestUnicodeLiteralInKey(c *C) {
	testKeyValue(c, "key\\u2318 = value", "key⌘", "value")
	testKeyValue(c, "key\\U2318 = value", "key⌘", "value")
}

// func (l *LoadSuite) TestEscapedCharsInValue(c *C) {
// 	testKeyValue(c, "key = v\\ a\\:lu\\=e", "key", "v a:lu=e")
// }

// func (l *LoadSuite) TestMultilineValue(c *C) {
// 	input := "key = valueA,\\\n valueB"
// 	testKeyValue(c, input, "key", "valueA,valueB")
// }

// Input that ends before a delimiter is a parse error.
func (l *LoadSuite) TestFailWithPrematureEOF(c *C) {
	_, err := NewPropertiesFromString("key")
	c.Assert(err, NotNil)
	c.Assert(strings.Contains(err.Error(), "premature EOF"), Equals, true)
}

// Code points outside ISO-8859-1 (here U+20A1) are rejected up front.
func (l *LoadSuite) TestFailWithNonISO8859_1Input(c *C) {
	_, err := NewPropertiesFromString("key₡")
	c.Assert(err, NotNil)
	c.Assert(strings.Contains(err.Error(), "invalid ISO-8859-1 input"), Equals, true)
}

// Non-hex digits after \u are a parse error.
func (l *LoadSuite) TestFailWithInvalidUnicodeLiteralInKey(c *C) {
	_, err := NewPropertiesFromString("key\\ugh32 = value")
	c.Assert(err, NotNil)
	c.Assert(strings.Contains(err.Error(), "invalid unicode literal"), Equals, true)
}

// tests all combinations of delimiters plus leading and/or trailing spaces.
func testAllDelimiterCombinations(c *C, key, value string) {
	// '=' and ':' with every combination of surrounding spaces, each run
	// once without and once with a trailing newline.
	delimiters := []string{"=", " =", "= ", " = ", ":", " :", ": ", " : "}
	for _, delim := range delimiters {
		testKeyValue(c, fmt.Sprintf("%s%s%s", key, delim, value), key, value)
		testKeyValue(c, fmt.Sprintf("%s%s%s\n", key, delim, value), key, value)
	}
}

// tests a single key/value combination for a given input.
func testKeyValue(c *C, input, key, value string) {
	// fmt.Printf("Testing '%s'\n", input)
	p, err := NewPropertiesFromString(input)
	c.Assert(err, IsNil)
	c.Assert(p, NotNil)
	c.Assert(p.Len(), Equals, 1)
	assertKeyValue(c, p, key, value)
}

// asserts that key exists in p and maps to value.
func assertKeyValue(c *C, p *Properties, key, value string) {
	v, ok := p.Get(key)
	c.Assert(ok, Equals, true)
	c.Assert(v, Equals, value)
}