123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213 |
- //go:build generate
- // This program generates a Go containing a slice of test cases based on the
- // Unicode Character Database auxiliary data files. The command line arguments
- // are as follows:
- //
- // 1. The name of the Unicode data file (just the filename, without extension).
- // 2. The name of the locally generated Go file.
- // 3. The name of the slice containing the test cases.
- // 4. The name of the generator, for logging purposes.
- //
- //go:generate go run gen_breaktest.go GraphemeBreakTest graphemebreak_test.go graphemeBreakTestCases graphemes
- //go:generate go run gen_breaktest.go WordBreakTest wordbreak_test.go wordBreakTestCases words
- //go:generate go run gen_breaktest.go SentenceBreakTest sentencebreak_test.go sentenceBreakTestCases sentences
- //go:generate go run gen_breaktest.go LineBreakTest linebreak_test.go lineBreakTestCases lines
- package main
- import (
- "bufio"
- "bytes"
- "errors"
- "fmt"
- "go/format"
- "io/ioutil"
- "log"
- "net/http"
- "os"
- "time"
- )
- // We want to test against a specific version rather than the latest. When the
- // package is upgraded to a new version, change these to generate new tests.
- const (
- testCaseURL = `https://www.unicode.org/Public/14.0.0/ucd/auxiliary/%s.txt`
- )
- func main() {
- if len(os.Args) < 5 {
- fmt.Println("Not enough arguments, see code for details")
- os.Exit(1)
- }
- log.SetPrefix("gen_breaktest (" + os.Args[4] + "): ")
- log.SetFlags(0)
- // Read text of testcases and parse into Go source code.
- src, err := parse(fmt.Sprintf(testCaseURL, os.Args[1]))
- if err != nil {
- log.Fatal(err)
- }
- // Format the Go code.
- formatted, err := format.Source(src)
- if err != nil {
- log.Fatalln("gofmt:", err)
- }
- // Write it out.
- log.Print("Writing to ", os.Args[2])
- if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil {
- log.Fatal(err)
- }
- }
- // parse reads a break text file, either from a local file or from a URL. It
- // parses the file data into Go source code representing the test cases.
- func parse(url string) ([]byte, error) {
- log.Printf("Parsing %s", url)
- res, err := http.Get(url)
- if err != nil {
- return nil, err
- }
- body := res.Body
- defer body.Close()
- buf := new(bytes.Buffer)
- buf.Grow(120 << 10)
- buf.WriteString(`package uniseg
- // Code generated via go generate from gen_breaktest.go. DO NOT EDIT.
- // ` + os.Args[3] + ` are Grapheme testcases taken from
- // ` + url + `
- // on ` + time.Now().Format("January 2, 2006") + `. See
- // https://www.unicode.org/license.html for the Unicode license agreement.
- var ` + os.Args[3] + ` = []testCase {
- `)
- sc := bufio.NewScanner(body)
- num := 1
- var line []byte
- original := make([]byte, 0, 64)
- expected := make([]byte, 0, 64)
- for sc.Scan() {
- num++
- line = sc.Bytes()
- if len(line) == 0 || line[0] == '#' {
- continue
- }
- var comment []byte
- if i := bytes.IndexByte(line, '#'); i >= 0 {
- comment = bytes.TrimSpace(line[i+1:])
- line = bytes.TrimSpace(line[:i])
- }
- original, expected, err := parseRuneSequence(line, original[:0], expected[:0])
- if err != nil {
- return nil, fmt.Errorf(`line %d: %v: %q`, num, err, line)
- }
- fmt.Fprintf(buf, "\t{original: \"%s\", expected: %s}, // %s\n", original, expected, comment)
- }
- if err := sc.Err(); err != nil {
- return nil, err
- }
- // Check for final "# EOF", useful check if we're streaming via HTTP
- if !bytes.Equal(line, []byte("# EOF")) {
- return nil, fmt.Errorf(`line %d: exected "# EOF" as final line, got %q`, num, line)
- }
- buf.WriteString("}\n")
- return buf.Bytes(), nil
- }
- // Used by parseRuneSequence to match input via bytes.HasPrefix.
- var (
- prefixBreak = []byte("÷ ")
- prefixDontBreak = []byte("× ")
- breakOk = []byte("÷")
- breakNo = []byte("×")
- )
- // parseRuneSequence parses a rune + breaking opportunity sequence from b
- // and appends the Go code for testcase.original to orig
- // and appends the Go code for testcase.expected to exp.
- // It retuns the new orig and exp slices.
- //
- // E.g. for the input b="÷ 0020 × 0308 ÷ 1F1E6 ÷"
- // it will append
- // "\u0020\u0308\U0001F1E6"
- // and "[][]rune{{0x0020,0x0308},{0x1F1E6},}"
- // to orig and exp respectively.
- //
- // The formatting of exp is expected to be cleaned up by gofmt or format.Source.
- // Note we explicitly require the sequence to start with ÷ and we implicitly
- // require it to end with ÷.
- func parseRuneSequence(b, orig, exp []byte) ([]byte, []byte, error) {
- // Check for and remove first ÷ or ×.
- if !bytes.HasPrefix(b, prefixBreak) && !bytes.HasPrefix(b, prefixDontBreak) {
- return nil, nil, errors.New("expected ÷ or × as first character")
- }
- if bytes.HasPrefix(b, prefixBreak) {
- b = b[len(prefixBreak):]
- } else {
- b = b[len(prefixDontBreak):]
- }
- boundary := true
- exp = append(exp, "[][]rune{"...)
- for len(b) > 0 {
- if boundary {
- exp = append(exp, '{')
- }
- exp = append(exp, "0x"...)
- // Find end of hex digits.
- var i int
- for i = 0; i < len(b) && b[i] != ' '; i++ {
- if d := b[i]; ('0' <= d || d <= '9') ||
- ('A' <= d || d <= 'F') ||
- ('a' <= d || d <= 'f') {
- continue
- }
- return nil, nil, errors.New("bad hex digit")
- }
- switch i {
- case 4:
- orig = append(orig, "\\u"...)
- case 5:
- orig = append(orig, "\\U000"...)
- default:
- return nil, nil, errors.New("unsupport code point hex length")
- }
- orig = append(orig, b[:i]...)
- exp = append(exp, b[:i]...)
- b = b[i:]
- // Check for space between hex and ÷ or ×.
- if len(b) < 1 || b[0] != ' ' {
- return nil, nil, errors.New("bad input")
- }
- b = b[1:]
- // Check for next boundary.
- switch {
- case bytes.HasPrefix(b, breakOk):
- boundary = true
- b = b[len(breakOk):]
- case bytes.HasPrefix(b, breakNo):
- boundary = false
- b = b[len(breakNo):]
- default:
- return nil, nil, errors.New("missing ÷ or ×")
- }
- if boundary {
- exp = append(exp, '}')
- }
- exp = append(exp, ',')
- if len(b) > 0 && b[0] == ' ' {
- b = b[1:]
- }
- }
- exp = append(exp, '}')
- return orig, exp, nil
- }
|