Update documentation for first public release. Add more tests for invalid unicode literals. Accept keys with empty values. Replace the Decoder struct with a Decode function since we need the whole buffer anyway. Hide the ISO-8859-1/UTF-8 encoding mess behind Decode and DecodeFromString. Add examples.
diff --git a/decoder.go b/decoder.go index 705873c..150c8b9 100644 --- a/decoder.go +++ b/decoder.go
@@ -1,52 +1,42 @@ +// Copyright 2013 Frank Schroeder. All rights reserved. MIT licensed. + package properties import ( "fmt" - "io" - "io/ioutil" ) -type Decoder struct { - r io.Reader -} - -type Encoding uint +type encoding uint const ( - UTF8 Encoding = 1 << iota - ISO_8859_1 + enc_utf8 encoding = 1 << iota + enc_iso_8859_1 ) -func NewDecoder(r io.Reader) *Decoder { - return &Decoder{r: r} +// Decodes an ISO-8859-1 encoded byte buffer into a Properties struct. +func Decode(buf []byte) (*Properties, error) { + return decodeWithEncoding(buf, enc_iso_8859_1) } -func (d *Decoder) Decode() (*Properties, error) { - return decode(d.r, ISO_8859_1) +// Decodes a UTF-8 encoded string into a Properties struct. +func DecodeFromString(input string) (*Properties, error) { + return decodeWithEncoding([]byte(input), enc_utf8) } -func (d *Decoder) DecodeWithEncoding(enc Encoding) (*Properties, error) { - return decode(d.r, enc) -} - -func decode(r io.Reader, enc Encoding) (*Properties, error) { - buf, err := ioutil.ReadAll(r) - if err != nil { - return nil, err - } - +// Decodes either an ISO-8859-1 or a UTF-8 encoded byte buffer into a Properties struct. +func decodeWithEncoding(buf []byte, enc encoding) (*Properties, error) { return newParser().Parse(convert(buf, enc)) } // The Java properties spec says that .properties files must be ISO-8859-1 // encoded. Since the first 256 unicode code points cover ISO-8859-1 we -// can convert each byte into a rune and use the resulting string +// can convert each byte straight into a rune and use the resulting string // as UTF-8 input for the parser. -func convert(buf []byte, enc Encoding) string { +func convert(buf []byte, enc encoding) string { switch enc { - case UTF8: + case enc_utf8: return string(buf) - case ISO_8859_1: + case enc_iso_8859_1: runes := make([]rune, len(buf)) for i, b := range buf { runes[i] = rune(b)
diff --git a/decoder_test.go b/decoder_test.go index 9be226d..1543c45 100644 --- a/decoder_test.go +++ b/decoder_test.go
@@ -31,17 +31,24 @@ {" \f\tkey=value", "key", "value"}, // mix prefix // multiple keys - {"key1=value1\nkey2=value2", "key1", "value1", "key2", "value2"}, + {"key1=value1\nkey2=value2\n", "key1", "value1", "key2", "value2"}, + {"key1=value1\rkey2=value2\r", "key1", "value1", "key2", "value2"}, + {"key1=value1\r\nkey2=value2\r\n", "key1", "value1", "key2", "value2"}, // blank lines - {"\n\nkey=value\n\n", "key", "value"}, // leading and trailing new lines + {"\nkey=value\n", "key", "value"}, + {"\rkey=value\r", "key", "value"}, + {"\r\nkey=value\r\n", "key", "value"}, // escaped chars {"k\\ e\\:y\\= = value", "k e:y=", "value"}, // escaped chars in key {"key = v\\ a\\:lu\\=e\\n\\r\\t", "key", "v a:lu=e\n\r\t"}, // escaped chars in value // unicode literals - {"key\\u2318 = value", "key⌘", "value"}, // unicode literal in key + {"key\\u2318 = value", "key⌘", "value"}, + {"k\\u2318ey = value", "k⌘ey", "value"}, + {"key = value\\u2318", "key", "value⌘"}, + {"key = valu\\u2318e", "key", "valu⌘e"}, // multiline values {"key = valueA,\\\n valueB", "key", "valueA,valueB"}, // SPACE indent @@ -56,8 +63,23 @@ // define error test cases in the form of // {"input", "expected error message"} var errorTests = [][]string{ - {"key", "premature EOF"}, - {"key\\ugh32 = value", "invalid unicode literal"}, + {"key\\u1 = value", "invalid unicode literal"}, + {"key\\u12 = value", "invalid unicode literal"}, + {"key\\u123 = value", "invalid unicode literal"}, + {"key\\u123g = value", "invalid unicode literal"}, + {"key\\u123", "invalid unicode literal"}, +} + +// Benchmarks the decoder by creating a property file with 1000 key/value pairs. +func BenchmarkDecoder(b *testing.B) { + input := "" + for i := 0; i < 1000; i++ { + input += fmt.Sprintf("key%d=value%d\n", i, i) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + Decode([]byte(input)) + } } // tests basic single key/value combinations with all possible whitespace, delimiter and newline permutations. 
@@ -67,45 +89,36 @@ testAllCombinations(c, "key", "value ") } +// tests more complex cases. func (l *TestSuite) TestComplex(c *C) { - for i, test := range complexTests { - printf("[C%02d] %q %q\n", i, test[0], test[1:]) + for _, test := range complexTests { testKeyValue(c, test[0], test[1:]...) } } +// tests error cases. func (l *TestSuite) TestErrors(c *C) { - for i, test := range errorTests { + for _, test := range errorTests { input, msg := test[0], test[1] - printf("[E%02d] %q %q\n", i, input, msg) testError(c, input, msg) } } -func BenchmarkDecoder(b *testing.B) { - input := "" - for i := 0; i < 1000; i++ { - input += fmt.Sprintf("key%d=value%d\n", i, i) - } - b.ResetTimer() - for i := 0; i < b.N; i++ { - d := NewDecoder(strings.NewReader(input)) - d.Decode() - } -} - -// tests all combinations of delimiters plus leading and/or trailing spaces. +// tests all combinations of delimiters, leading and/or trailing whitespace and newlines. func testAllCombinations(c *C, key, value string) { - whitespace := []string{" ", "\f", "\t"} - delimiters := []string{"", "=", ":"} - // newlines := []string{"", "\r", "\n", "\r\n"} - newlines := []string{"", "\n", "\r"} + whitespace := []string{"", " ", "\f", "\t"} + delimiters := []string{"", " ", "=", ":"} + newlines := []string{"", "\r", "\n", "\r\n"} for _, dl := range delimiters { for _, ws1 := range whitespace { for _, ws2 := range whitespace { for _, nl := range newlines { + // skip the one case where there is nothing between a key and a value + if ws1 == "" && dl == "" && ws2 == "" && value != "" { + continue + } + input := fmt.Sprintf("%s%s%s%s%s%s", key, ws1, dl, ws2, value, nl) - printf("%q\n", input) testKeyValue(c, input, key, value) } } @@ -113,13 +126,16 @@ } } -// tests key/value pairs for a given input. +// tests whether key/value pairs exist for a given input. +// keyvalues is expected to be an even number of strings of "key", "value", ... 
func testKeyValue(c *C, input string, keyvalues ...string) { - d := NewDecoder(strings.NewReader(input)) - p, err := d.Decode() + printf("%q\n", input) + + p, err := Decode([]byte(input)) c.Assert(err, IsNil) c.Assert(p, NotNil) c.Assert(p.Len(), Equals, len(keyvalues)/2, Commentf("Odd number of key/value pairs.")) + for i := 0; i < len(keyvalues)/2; i += 2 { key, value := keyvalues[i], keyvalues[i+1] v, ok := p.Get(key) @@ -128,14 +144,16 @@ } } -// tests whether a given input produces a given error message. +// tests whether some input produces a given error message. func testError(c *C, input, msg string) { - d := NewDecoder(strings.NewReader(input)) - _, err := d.Decode() + printf("%q\n", input) + + _, err := Decode([]byte(input)) c.Assert(err, NotNil) - c.Assert(strings.Contains(err.Error(), msg), Equals, true) + c.Assert(strings.Contains(err.Error(), msg), Equals, true, Commentf("Expected %q got %q", msg, err.Error())) } +// prints to stderr if the -verbose flag was given. func printf(format string, args ...interface{}) { if *verbose { fmt.Fprintf(os.Stderr, format, args...)
diff --git a/doc.go b/doc.go index 821f5bc..4faf16e 100644 --- a/doc.go +++ b/doc.go
@@ -4,18 +4,29 @@ // // Java properties files contain key/value pairs in one of the following form: // +// key value // key = value // key : value // -// Whitespace around the delimiter is ignored which means that the following expressions are equal +// The value is optional and ends with EOF or a new line which can either be '\n', '\r' or "\r\n". +// Therefore, the following expression is legal and results in a key with an empty value: +// +// key +// +// Whitespace before the key and around the delimiter is ignored. Whitespace at the end of the value is part of the value. +// Besides the space ' ' (U+0020) character, the TAB (U+0009) and FF (U+000C) characters are also treated as whitespace. +// Therefore, the following expressions are equal: // // key=value +// key=value // key= value // key =value // key = value // key = value +// key\f=\fvalue +// key\t=\tvalue // -// Blank lines and lines starting with '#' or '!' and are ignored until the end of the line. +// Blank lines and comment lines starting with '#' or '!' are ignored until the end of the line. // // # the next line is empty and will be ignored // @@ -24,8 +35,9 @@ // // If the delimiter characters '=' and ':' appear in either key or value then // they must be escaped with a backslash. Because of this the backslash must -// also be escaped. The characters '\n', '\r' or '\t' can be included in both -// key and value and will be replaced with their correpsonding character. +// also be escaped. The characters '\n', '\r' or '\t' can be part of both key +// and value and must be escaped. For all other characters the backslash is +// silently dropped. // // # key:1 = value=2 // key\:1 = value\=2 @@ -33,17 +45,21 @@ // # key = value with tabs // key = value\twith\ttabs // +// # key = value with silently dropped backslash +// key = v\alu\e with silently dropped backslash +// // Values can span multiple lines by using a backslash before the newline character. 
-// All subsequent whitespace on the following line is ignored. +// All subsequent whitespace on the following line is ignored. Comment lines cannot be +// extended like this. // // # key = value continued // key = value \ // continued // -// Java properties files must be ISO-8559-1 encoded and can have Unicode literals for -// characters outside the character set. Both \uXXXX and \UXXXX are accepted. +// Java properties files are ISO-8859-1 encoded and can have Unicode literals for +// characters outside the character set. Unicode literals are specified as \uXXXX. // // # key = value with € -// key = value with \U20AC +// key = value with \u20AC // package properties
diff --git a/example_test.go b/example_test.go new file mode 100644 index 0000000..b4072e2 --- /dev/null +++ b/example_test.go
@@ -0,0 +1,29 @@ +// Copyright 2013 Frank Schroeder. All rights reserved. MIT licensed. + +package properties + +import ( + "fmt" +) + +func ExampleDecode() { + buf := []byte("key = ISO-8859-1 value with unicode literal \\u2318 and umlaut ") + buf = append(buf, 0xE4) // 0xE4 == ä + p, _ := Decode(buf) + v, ok := p.Get("key") + fmt.Println(ok) + fmt.Println(v) + // Output: + // true + // ISO-8859-1 value with unicode literal ⌘ and umlaut ä +} + +func ExampleDecodeFromString() { + p, _ := DecodeFromString("key = UTF-8 value with unicode character ⌘ and umlaut ä") + v, ok := p.Get("key") + fmt.Println(ok) + fmt.Println(v) + // Output: + // true + // UTF-8 value with unicode character ⌘ and umlaut ä +}
diff --git a/lex.go b/lex.go index aa9b6f0..ae2bdea 100644 --- a/lex.go +++ b/lex.go
@@ -20,7 +20,7 @@ // item represents a token or text string returned from the scanner. type item struct { typ itemType // The type of this item. - pos Pos // The starting position, in bytes, of this item in the input string. + pos int // The starting position, in bytes, of this item in the input string. val string // The value of this item. } @@ -36,8 +36,6 @@ return fmt.Sprintf("%q", i.val) } -type Pos int - // itemType identifies the type of lex items. type itemType int @@ -59,10 +57,10 @@ type lexer struct { input string // the string being scanned state stateFn // the next lexing function to enter - pos Pos // current position in the input - start Pos // start position of this item - width Pos // width of last rune read from input - lastPos Pos // position of most recent item returned by nextItem + pos int // current position in the input + start int // start position of this item + width int // width of last rune read from input + lastPos int // position of most recent item returned by nextItem runes []rune // scanned runes for this item items chan item // channel of scanned items } @@ -74,7 +72,7 @@ return eof } r, w := utf8.DecodeRuneInString(l.input[l.pos:]) - l.width = Pos(w) + l.width = w l.pos += l.width return r } @@ -208,7 +206,7 @@ // lexComment scans a comment line. The comment character has already been scanned. func lexComment(l *lexer) stateFn { for { - switch r := l.next(); { + switch r := l.next(); { case isEOF(r): l.ignore() l.emit(itemEOF) @@ -224,9 +222,10 @@ func lexKey(l *lexer) stateFn { // fmt.Println("lexKey") + var r rune Loop: for { - switch r := l.next(); { + switch r = l.next(); { case isEscape(r): err := l.scanEscapeSequence() @@ -239,7 +238,7 @@ break Loop case isEOF(r): - return l.errorf("premature EOF") + break Loop default: l.appendRune(r) @@ -250,6 +249,11 @@ l.emit(itemKey) } + if isEOF(r) { + l.emit(itemEOF) + return nil + } + return lexBeforeValue } @@ -321,16 +325,13 @@ } } -// scans a unicode literal in \[uU]XXXX form. 
We expect to be after the \[uU]. +// scans a unicode literal in the form \uXXXX. We expect to be after the \u. func (l *lexer) scanUnicodeLiteral() error { // scan the digits d := make([]rune, 4) for i := 0; i < 4; i++ { d[i] = l.next() - if d[i] == eof { - return fmt.Errorf("premature EOF") - } - if !strings.ContainsRune("0123456789abcdefABCDEF", d[i]) { + if d[i] == eof || !strings.ContainsRune("0123456789abcdefABCDEF", d[i]) { return fmt.Errorf("invalid unicode literal") } } @@ -340,12 +341,12 @@ if err != nil { return err } + l.appendRune(rune(r)) return nil } -// decodeEscapedCharacter returns the rune the unescaped value for the -// escaped rune (minus escape character). +// decodeEscapedCharacter returns the unescaped rune. We expect to be after the escape character. func decodeEscapedCharacter(r rune) rune { switch r { case 'n':
diff --git a/parser.go b/parser.go index ba3dbed..ffd7233 100644 --- a/parser.go +++ b/parser.go
@@ -4,7 +4,6 @@ import ( "fmt" - // "log" "runtime" ) @@ -17,8 +16,6 @@ } func (p *parser) Parse(input string) (props *Properties, err error) { - // log.Printf("Parsing input '%s'", input) - defer p.recover(&err) p.lex = lex(input) props = &Properties{m:make(map[string]string)} @@ -29,6 +26,7 @@ break } key := token.val + token = p.expectOneOf(itemValue, itemEOF) if token.typ == itemEOF { props.Set(key, "") @@ -72,14 +70,7 @@ if _, ok := e.(runtime.Error); ok { panic(e) } - // if p != nil { - // p.stopParse() - // } *errp = e.(error) } return } - -// func (p *parser) stopParse() { - -// }
diff --git a/properties.go b/properties.go index 15e707c..65dcd41 100644 --- a/properties.go +++ b/properties.go
@@ -6,13 +6,13 @@ m map[string]string } -// returns the value for the given key +// Returns the value for the given key. func (p *Properties) Get(key string) (value string, ok bool) { value, ok = p.m[key] return value, ok } -// sets the property key = value and returns the previous value if exists or an empty string +// Sets the property key to the given value and returns the previous value if exists or an empty string. func (p *Properties) Set(key, value string) (prevValue string) { prevValue, ok := p.m[key] if !ok { @@ -23,7 +23,7 @@ return prevValue } -// returns the number of keys +// Returns the number of keys. func (p *Properties) Len() int { return len(p.m) }