|  | // Copyright 2012 The Go Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style | 
|  | // license that can be found in the LICENSE file. | 
|  |  | 
|  | // +build ignore | 
|  |  | 
|  | // Collation table generator. | 
|  | // Data read from the web. | 
|  |  | 
|  | package main | 
|  |  | 
|  | import ( | 
|  | "archive/zip" | 
|  | "bufio" | 
|  | "bytes" | 
|  | "flag" | 
|  | "fmt" | 
|  | "io" | 
|  | "io/ioutil" | 
|  | "log" | 
|  | "os" | 
|  | "regexp" | 
|  | "sort" | 
|  | "strconv" | 
|  | "strings" | 
|  | "unicode/utf8" | 
|  |  | 
|  | "golang.org/x/text/collate" | 
|  | "golang.org/x/text/collate/build" | 
|  | "golang.org/x/text/internal/colltab" | 
|  | "golang.org/x/text/internal/gen" | 
|  | "golang.org/x/text/language" | 
|  | "golang.org/x/text/unicode/cldr" | 
|  | ) | 
|  |  | 
|  | var ( | 
|  | test = flag.Bool("test", false, | 
|  | "test existing tables; can be used to compare web data with package data.") | 
|  | short = flag.Bool("short", false, `Use "short" alternatives, when available.`) | 
|  | draft = flag.Bool("draft", false, `Use draft versions, when available.`) | 
|  | tags  = flag.String("tags", "", "build tags to be included after +build directive") | 
|  | pkg   = flag.String("package", "collate", | 
|  | "the name of the package in which the generated file is to be included") | 
|  |  | 
|  | tables = flagStringSetAllowAll("tables", "collate", "collate,chars", | 
|  | "comma-spearated list of tables to generate.") | 
|  | exclude = flagStringSet("exclude", "zh2", "", | 
|  | "comma-separated list of languages to exclude.") | 
|  | include = flagStringSet("include", "", "", | 
|  | "comma-separated list of languages to include. Include trumps exclude.") | 
|  | // TODO: Not included: unihan gb2312han zhuyin big5han (for size reasons) | 
|  | // TODO: Not included: traditional (buggy for Bengali) | 
|  | types = flagStringSetAllowAll("types", "standard,phonebook,phonetic,reformed,pinyin,stroke", "", | 
|  | "comma-separated list of types that should be included.") | 
|  | ) | 
|  |  | 
// stringSet implements an ordered set based on a list.  It implements flag.Value
// to allow a set to be specified as a comma-separated list.
type stringSet struct {
	s        []string   // elements; sorted and free of duplicates unless dirty is set
	allowed  *stringSet // if non-nil, Set rejects any value it does not contain
	dirty    bool       // needs compaction if true
	all      bool       // set by Set("all"); contains then reports true for any value
	allowAll bool       // whether Set gives the value "all" its special meaning
}

// flagStringSet registers a command-line flag called name whose value is a
// comma-separated set of strings, and returns the set backing it. def is the
// initial value. If allowed is non-empty it is the comma-separated list of the
// only values the flag accepts, and it is appended to usage.
//
// It panics if allowed is malformed or def is not permitted by allowed, as
// either is a mistake in the program rather than in its input.
func flagStringSet(name, def, allowed, usage string) *stringSet {
	ss := &stringSet{}
	if allowed != "" {
		usage += fmt.Sprintf(" (allowed values: any of %s)", allowed)
		ss.allowed = &stringSet{}
		ss.allowed.mustSet(allowed)
	}
	ss.mustSet(def)
	flag.Var(ss, name, usage)
	return ss
}

// flagStringSetAllowAll is like flagStringSet, but the resulting flag also
// accepts the special value "all", after which the set contains every string.
func flagStringSetAllowAll(name, def, allowed, usage string) *stringSet {
	ss := &stringSet{allowAll: true}
	if allowed == "" {
		usage += ` Use "all" to select all.`
	} else {
		usage += fmt.Sprintf(` (allowed values: "all" or any of %s)`, allowed)
		ss.allowed = &stringSet{}
		ss.allowed.mustSet(allowed)
	}
	// The default must be in place before flag.Var is called: the flag
	// package records the value's String() at registration time as the
	// default it shows in the usage message.
	ss.mustSet(def)
	flag.Var(ss, name, usage)
	return ss
}

// Len returns the number of stored elements. Elements added with add are
// counted individually until the set has been compacted.
func (ss stringSet) Len() int {
	return len(ss.s)
}

// String returns the elements joined by commas. It is part of flag.Value.
func (ss stringSet) String() string {
	return strings.Join(ss.s, ",")
}

// Set replaces the contents of ss with the comma-separated values in s.
// Surrounding white space is trimmed and empty items are dropped. If ss was
// created to allow it, the value "all" makes the set contain everything.
// It is part of flag.Value.
//
// If a value is not permitted an error is returned and ss is left unchanged.
func (ss *stringSet) Set(s string) error {
	if ss.allowAll && s == "all" {
		ss.s = nil
		ss.all = true
		ss.dirty = false
		return nil
	}
	// Collect into a fresh slice so that a rejected value does not leave a
	// half-updated set behind.
	var vals []string
	for _, v := range strings.Split(s, ",") {
		v = strings.TrimSpace(v)
		if v == "" {
			continue
		}
		if ss.allowed != nil && !ss.allowed.contains(v) {
			return fmt.Errorf("unsupported value %q; must be one of %s", v, ss.allowed)
		}
		vals = append(vals, v)
	}
	ss.s = vals
	// An explicit list overrides an earlier "all"; the last setting wins.
	ss.all = false
	ss.dirty = true
	ss.compact()
	return nil
}

// mustSet is like Set but panics if s is rejected.
func (ss *stringSet) mustSet(s string) {
	if err := ss.Set(s); err != nil {
		log.Panic(err)
	}
}

// add appends s without checking for duplicates or against the allowed
// values; the set is compacted lazily by values.
func (ss *stringSet) add(s string) {
	ss.s = append(ss.s, s)
	ss.dirty = true
}

// values returns the sorted elements of the set without duplicates. The
// returned slice is the set's own storage and must not be modified.
func (ss *stringSet) values() []string {
	ss.compact()
	return ss.s
}

// contains reports whether s is in the set. It always reports true once the
// set has been set to "all".
func (ss *stringSet) contains(s string) bool {
	if ss.all {
		return true
	}
	for _, v := range ss.s {
		if v == s {
			return true
		}
	}
	return false
}

// compact sorts the elements and removes duplicates in place. It is a no-op
// if nothing was added since the last compaction.
func (ss *stringSet) compact() {
	if !ss.dirty {
		return
	}
	ss.dirty = false
	if len(ss.s) == 0 {
		// Nothing to sort; slicing [:k+1] below would otherwise panic on a
		// nil slice or bring back a stale element from spare capacity.
		return
	}
	a := ss.s
	sort.Strings(a)
	k := 0 // index of the last unique element kept so far
	for _, v := range a[1:] {
		if a[k] != v {
			k++
			a[k] = v
		}
	}
	ss.s = a[:k+1]
}
|  |  | 
|  | func skipLang(l string) bool { | 
|  | if include.Len() > 0 { | 
|  | return !include.contains(l) | 
|  | } | 
|  | return exclude.contains(l) | 
|  | } | 
|  |  | 
|  | // altInclude returns a list of alternatives (for the LDML alt attribute) | 
|  | // in order of preference.  An empty string in this list indicates the | 
|  | // default entry. | 
|  | func altInclude() []string { | 
|  | l := []string{} | 
|  | if *short { | 
|  | l = append(l, "short") | 
|  | } | 
|  | l = append(l, "") | 
|  | // TODO: handle draft using cldr.SetDraftLevel | 
|  | if *draft { | 
|  | l = append(l, "proposed") | 
|  | } | 
|  | return l | 
|  | } | 
|  |  | 
// failOnError logs e and panics if e is non-nil; it does nothing otherwise.
func failOnError(e error) {
	if e == nil {
		return
	}
	log.Panic(e)
}
|  |  | 
|  | func openArchive() *zip.Reader { | 
|  | f := gen.OpenCLDRCoreZip() | 
|  | buffer, err := ioutil.ReadAll(f) | 
|  | f.Close() | 
|  | failOnError(err) | 
|  | archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer))) | 
|  | failOnError(err) | 
|  | return archive | 
|  | } | 
|  |  | 
|  | // parseUCA parses a Default Unicode Collation Element Table of the format | 
|  | // specified in http://www.unicode.org/reports/tr10/#File_Format. | 
|  | // It returns the variable top. | 
|  | func parseUCA(builder *build.Builder) { | 
|  | var r io.ReadCloser | 
|  | var err error | 
|  | for _, f := range openArchive().File { | 
|  | if strings.HasSuffix(f.Name, "allkeys_CLDR.txt") { | 
|  | r, err = f.Open() | 
|  | } | 
|  | } | 
|  | if r == nil { | 
|  | log.Fatal("File allkeys_CLDR.txt not found in archive.") | 
|  | } | 
|  | failOnError(err) | 
|  | defer r.Close() | 
|  | scanner := bufio.NewScanner(r) | 
|  | colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`) | 
|  | for i := 1; scanner.Scan(); i++ { | 
|  | line := scanner.Text() | 
|  | if len(line) == 0 || line[0] == '#' { | 
|  | continue | 
|  | } | 
|  | if line[0] == '@' { | 
|  | // parse properties | 
|  | switch { | 
|  | case strings.HasPrefix(line[1:], "version "): | 
|  | a := strings.Split(line[1:], " ") | 
|  | if a[1] != gen.UnicodeVersion() { | 
|  | log.Fatalf("incompatible version %s; want %s", a[1], gen.UnicodeVersion()) | 
|  | } | 
|  | case strings.HasPrefix(line[1:], "backwards "): | 
|  | log.Fatalf("%d: unsupported option backwards", i) | 
|  | default: | 
|  | log.Printf("%d: unknown option %s", i, line[1:]) | 
|  | } | 
|  | } else { | 
|  | // parse entries | 
|  | part := strings.Split(line, " ; ") | 
|  | if len(part) != 2 { | 
|  | log.Fatalf("%d: production rule without ';': %v", i, line) | 
|  | } | 
|  | lhs := []rune{} | 
|  | for _, v := range strings.Split(part[0], " ") { | 
|  | if v == "" { | 
|  | continue | 
|  | } | 
|  | lhs = append(lhs, rune(convHex(i, v))) | 
|  | } | 
|  | var n int | 
|  | var vars []int | 
|  | rhs := [][]int{} | 
|  | for i, m := range colelem.FindAllStringSubmatch(part[1], -1) { | 
|  | n += len(m[0]) | 
|  | elem := []int{} | 
|  | for _, h := range strings.Split(m[2], ".") { | 
|  | elem = append(elem, convHex(i, h)) | 
|  | } | 
|  | if m[1] == "*" { | 
|  | vars = append(vars, i) | 
|  | } | 
|  | rhs = append(rhs, elem) | 
|  | } | 
|  | if len(part[1]) < n+3 || part[1][n+1] != '#' { | 
|  | log.Fatalf("%d: expected comment; found %s", i, part[1][n:]) | 
|  | } | 
|  | if *test { | 
|  | testInput.add(string(lhs)) | 
|  | } | 
|  | failOnError(builder.Add(lhs, rhs, vars)) | 
|  | } | 
|  | } | 
|  | if scanner.Err() != nil { | 
|  | log.Fatal(scanner.Err()) | 
|  | } | 
|  | } | 
|  |  | 
// convHex converts the hexadecimal number s, which must fit in 32 bits, to
// an int. If s is not valid the program is terminated with a message that is
// prefixed with line, the number of the input line being parsed.
func convHex(line int, s string) int {
	val, err := strconv.ParseInt(s, 16, 32)
	if err != nil {
		log.Fatalf("%d: %v", line, err)
	}
	return int(val)
}
|  |  | 
|  | var testInput = stringSet{} | 
|  |  | 
|  | var charRe = regexp.MustCompile(`&#x([0-9A-F]*);`) | 
|  | var tagRe = regexp.MustCompile(`<([a-z_]*)  */>`) | 
|  |  | 
|  | var mainLocales = []string{} | 
|  |  | 
// charSets holds a list of exemplar characters per category.
type charSets map[string][]string

// fprint writes p to w as a Go composite literal of type [exN]string,
// followed by a comma. Each non-empty category is written as one element,
// keyed by the category's position in the fixed list below, with its
// characters joined by single spaces. Categories not in that list are
// not written.
func (p charSets) fprint(w io.Writer) {
	categories := [...]string{"", "contractions", "punctuation", "auxiliary", "currencySymbol", "index"}

	fmt.Fprintln(w, "[exN]string{")
	for idx := 0; idx < len(categories); idx++ {
		chars := p[categories[idx]]
		if len(chars) == 0 {
			continue
		}
		fmt.Fprintf(w, "\t\t%d: %q,\n", idx, strings.Join(chars, " "))
	}
	fmt.Fprintln(w, "\t},")
}

// localeChars holds the exemplar characters found for each locale.
var localeChars = map[string]charSets{}
|  |  | 
// exemplarHeader is the Go source that printExemplarCharacters writes ahead
// of the exemplarCharacters table. The constants it declares follow the order
// of the categories written by charSets.fprint, with exN being their number.
const exemplarHeader = `
type exemplarType int
const (
exCharacters exemplarType = iota
exContractions
exPunctuation
exAuxiliary
exCurrency
exIndex
exN
)
`
|  |  | 
|  | func printExemplarCharacters(w io.Writer) { | 
|  | fmt.Fprintln(w, exemplarHeader) | 
|  | fmt.Fprintln(w, "var exemplarCharacters = map[string][exN]string{") | 
|  | for _, loc := range mainLocales { | 
|  | fmt.Fprintf(w, "\t%q: ", loc) | 
|  | localeChars[loc].fprint(w) | 
|  | } | 
|  | fmt.Fprintln(w, "}") | 
|  | } | 
|  |  | 
|  | func decodeCLDR(d *cldr.Decoder) *cldr.CLDR { | 
|  | r := gen.OpenCLDRCoreZip() | 
|  | data, err := d.DecodeZip(r) | 
|  | failOnError(err) | 
|  | return data | 
|  | } | 
|  |  | 
|  | // parseMain parses XML files in the main directory of the CLDR core.zip file. | 
|  | func parseMain() { | 
|  | d := &cldr.Decoder{} | 
|  | d.SetDirFilter("main") | 
|  | d.SetSectionFilter("characters") | 
|  | data := decodeCLDR(d) | 
|  | for _, loc := range data.Locales() { | 
|  | x := data.RawLDML(loc) | 
|  | if skipLang(x.Identity.Language.Type) { | 
|  | continue | 
|  | } | 
|  | if x.Characters != nil { | 
|  | x, _ = data.LDML(loc) | 
|  | loc = language.Make(loc).String() | 
|  | for _, ec := range x.Characters.ExemplarCharacters { | 
|  | if ec.Draft != "" { | 
|  | continue | 
|  | } | 
|  | if _, ok := localeChars[loc]; !ok { | 
|  | mainLocales = append(mainLocales, loc) | 
|  | localeChars[loc] = make(charSets) | 
|  | } | 
|  | localeChars[loc][ec.Type] = parseCharacters(ec.Data()) | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
// parseCharacters expands the exemplar character set chars into a list with
// one string per element. The following syntax is understood:
//
//	[ ... ]   optional brackets around the whole set
//	x         a single character
//	\x        the character x, stripped of any special meaning
//	x-y       all characters from x up to and including y
//	{xyz}     the multi-character sequence xyz
//
// Elements may be separated by spaces. Empty or all-space input yields an
// empty list. Malformed input terminates the program.
func parseCharacters(chars string) []string {
	// parseSingle decodes the first, possibly backslash-escaped, rune of s
	// and returns it together with the remainder of s.
	parseSingle := func(s string) (r rune, tail string, escaped bool) {
		if s == "" {
			log.Fatal("unexpected end of exemplar character set")
		}
		if s[0] == '\\' {
			if len(s) == 1 {
				log.Fatal("'\\' should be followed by a character")
			}
			// Decode the escaped character as a full rune below rather than
			// taking a single byte, which would split multi-byte characters.
			s, escaped = s[1:], true
		}
		r, sz := utf8.DecodeRuneInString(s)
		return r, s[sz:], escaped
	}

	chars = strings.TrimSpace(chars)
	if chars == "" {
		return nil
	}
	if n := len(chars) - 1; chars[n] == ']' && chars[0] == '[' {
		chars = chars[1:n]
	}

	var list []string
	var last rune // preceding single character, or 0 if it cannot start a range
	for len(chars) > 0 {
		if chars[0] == '{' { // character sequence
			var buf []rune
			for chars = chars[1:]; len(chars) > 0; {
				r, tail, esc := parseSingle(chars)
				chars = tail
				if r == '}' && !esc {
					break
				}
				if r == ' ' {
					log.Fatalf("space not supported in sequence %q", chars)
				}
				buf = append(buf, r)
			}
			list = append(list, string(buf))
			last = 0
			continue
		}

		// single character
		r, tail, escaped := parseSingle(chars)
		chars = tail
		switch {
		case r == ' ':
			// Separator; nothing to record.
		case r == '-' && !escaped:
			if last == 0 {
				log.Fatal("'-' should be preceded by a character")
			}
			if chars == "" {
				log.Fatal("'-' should be followed by a character")
			}
			var end rune
			end, chars, _ = parseSingle(chars)
			// The start of the range is already in the list, so continue
			// with the character after it.
			for c := last + 1; c <= end; c++ {
				list = append(list, string(c))
			}
			last = 0
		default:
			list = append(list, string(r))
			last = r
		}
	}
	return list
}
|  |  | 
var (
	// fileRe matches the path of a collation XML file and captures the file
	// name without its extension.
	fileRe = regexp.MustCompile(`.*/collation/(.*)\.xml`)

	// typeMap translates legacy type keys to their BCP47 equivalent.
	typeMap = map[string]string{
		"traditional": "trad",
		"phonebook":   "phonebk",
	}
)
|  |  | 
|  | // parseCollation parses XML files in the collation directory of the CLDR core.zip file. | 
|  | func parseCollation(b *build.Builder) { | 
|  | d := &cldr.Decoder{} | 
|  | d.SetDirFilter("collation") | 
|  | data := decodeCLDR(d) | 
|  | for _, loc := range data.Locales() { | 
|  | x, err := data.LDML(loc) | 
|  | failOnError(err) | 
|  | if skipLang(x.Identity.Language.Type) { | 
|  | continue | 
|  | } | 
|  | cs := x.Collations.Collation | 
|  | sl := cldr.MakeSlice(&cs) | 
|  | if len(types.s) == 0 { | 
|  | sl.SelectAnyOf("type", x.Collations.Default()) | 
|  | } else if !types.all { | 
|  | sl.SelectAnyOf("type", types.s...) | 
|  | } | 
|  | sl.SelectOnePerGroup("alt", altInclude()) | 
|  |  | 
|  | for _, c := range cs { | 
|  | id, err := language.Parse(loc) | 
|  | if err != nil { | 
|  | fmt.Fprintf(os.Stderr, "invalid locale: %q", err) | 
|  | continue | 
|  | } | 
|  | // Support both old- and new-style defaults. | 
|  | d := c.Type | 
|  | if x.Collations.DefaultCollation == nil { | 
|  | d = x.Collations.Default() | 
|  | } else { | 
|  | d = x.Collations.DefaultCollation.Data() | 
|  | } | 
|  | // We assume tables are being built either for search or collation, | 
|  | // but not both. For search the default is always "search". | 
|  | if d != c.Type && c.Type != "search" { | 
|  | typ := c.Type | 
|  | if len(c.Type) > 8 { | 
|  | typ = typeMap[c.Type] | 
|  | } | 
|  | id, err = id.SetTypeForKey("co", typ) | 
|  | failOnError(err) | 
|  | } | 
|  | t := b.Tailoring(id) | 
|  | c.Process(processor{t}) | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | type processor struct { | 
|  | t *build.Tailoring | 
|  | } | 
|  |  | 
|  | func (p processor) Reset(anchor string, before int) (err error) { | 
|  | if before != 0 { | 
|  | err = p.t.SetAnchorBefore(anchor) | 
|  | } else { | 
|  | err = p.t.SetAnchor(anchor) | 
|  | } | 
|  | failOnError(err) | 
|  | return nil | 
|  | } | 
|  |  | 
|  | func (p processor) Insert(level int, str, context, extend string) error { | 
|  | str = context + str | 
|  | if *test { | 
|  | testInput.add(str) | 
|  | } | 
|  | // TODO: mimic bug in old maketables: remove. | 
|  | err := p.t.Insert(colltab.Level(level-1), str, context+extend) | 
|  | failOnError(err) | 
|  | return nil | 
|  | } | 
|  |  | 
|  | func (p processor) Index(id string) { | 
|  | } | 
|  |  | 
|  | func testCollator(c *collate.Collator) { | 
|  | c0 := collate.New(language.Und) | 
|  |  | 
|  | // iterator over all characters for all locales and check | 
|  | // whether Key is equal. | 
|  | buf := collate.Buffer{} | 
|  |  | 
|  | // Add all common and not too uncommon runes to the test set. | 
|  | for i := rune(0); i < 0x30000; i++ { | 
|  | testInput.add(string(i)) | 
|  | } | 
|  | for i := rune(0xE0000); i < 0xF0000; i++ { | 
|  | testInput.add(string(i)) | 
|  | } | 
|  | for _, str := range testInput.values() { | 
|  | k0 := c0.KeyFromString(&buf, str) | 
|  | k := c.KeyFromString(&buf, str) | 
|  | if !bytes.Equal(k0, k) { | 
|  | failOnError(fmt.Errorf("test:%U: keys differ (%x vs %x)", []rune(str), k0, k)) | 
|  | } | 
|  | buf.Reset() | 
|  | } | 
|  | fmt.Println("PASS") | 
|  | } | 
|  |  | 
|  | func main() { | 
|  | gen.Init() | 
|  | b := build.NewBuilder() | 
|  | parseUCA(b) | 
|  | if tables.contains("chars") { | 
|  | parseMain() | 
|  | } | 
|  | parseCollation(b) | 
|  |  | 
|  | c, err := b.Build() | 
|  | failOnError(err) | 
|  |  | 
|  | if *test { | 
|  | testCollator(collate.NewFromTable(c)) | 
|  | } else { | 
|  | w := &bytes.Buffer{} | 
|  |  | 
|  | gen.WriteUnicodeVersion(w) | 
|  | gen.WriteCLDRVersion(w) | 
|  |  | 
|  | if tables.contains("collate") { | 
|  | _, err = b.Print(w) | 
|  | failOnError(err) | 
|  | } | 
|  | if tables.contains("chars") { | 
|  | printExemplarCharacters(w) | 
|  | } | 
|  | gen.WriteGoFile("tables.go", *pkg, w.Bytes()) | 
|  | } | 
|  | } |