gen_breaktest.go 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
  1. //go:build generate
  2. // This program generates a Go file containing a slice of test cases based on the
  3. // Unicode Character Database auxiliary data files. The command line arguments
  4. // are as follows:
  5. //
  6. // 1. The name of the Unicode data file (just the filename, without extension).
  7. // 2. The name of the locally generated Go file.
  8. // 3. The name of the slice containing the test cases.
  9. // 4. The name of the generator, for logging purposes.
  10. //
  11. //go:generate go run gen_breaktest.go GraphemeBreakTest graphemebreak_test.go graphemeBreakTestCases graphemes
  12. //go:generate go run gen_breaktest.go WordBreakTest wordbreak_test.go wordBreakTestCases words
  13. //go:generate go run gen_breaktest.go SentenceBreakTest sentencebreak_test.go sentenceBreakTestCases sentences
  14. //go:generate go run gen_breaktest.go LineBreakTest linebreak_test.go lineBreakTestCases lines
  15. package main
  16. import (
  17. "bufio"
  18. "bytes"
  19. "errors"
  20. "fmt"
  21. "go/format"
  22. "io/ioutil"
  23. "log"
  24. "net/http"
  25. "os"
  26. "time"
  27. )
  // testCaseURL is the URL template of the Unicode test data files; the %s verb
  // is filled in with the name of the data file (without extension). The Unicode
  // version is pinned on purpose: tests are generated against a specific version
  // rather than the latest one. When the package is upgraded to a new Unicode
  // version, change the version in this URL to generate new tests.
  const testCaseURL = `https://www.unicode.org/Public/14.0.0/ucd/auxiliary/%s.txt`
  33. func main() {
  34. if len(os.Args) < 5 {
  35. fmt.Println("Not enough arguments, see code for details")
  36. os.Exit(1)
  37. }
  38. log.SetPrefix("gen_breaktest (" + os.Args[4] + "): ")
  39. log.SetFlags(0)
  40. // Read text of testcases and parse into Go source code.
  41. src, err := parse(fmt.Sprintf(testCaseURL, os.Args[1]))
  42. if err != nil {
  43. log.Fatal(err)
  44. }
  45. // Format the Go code.
  46. formatted, err := format.Source(src)
  47. if err != nil {
  48. log.Fatalln("gofmt:", err)
  49. }
  50. // Write it out.
  51. log.Print("Writing to ", os.Args[2])
  52. if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil {
  53. log.Fatal(err)
  54. }
  55. }
  56. // parse reads a break text file, either from a local file or from a URL. It
  57. // parses the file data into Go source code representing the test cases.
  58. func parse(url string) ([]byte, error) {
  59. log.Printf("Parsing %s", url)
  60. res, err := http.Get(url)
  61. if err != nil {
  62. return nil, err
  63. }
  64. body := res.Body
  65. defer body.Close()
  66. buf := new(bytes.Buffer)
  67. buf.Grow(120 << 10)
  68. buf.WriteString(`package uniseg
  69. // Code generated via go generate from gen_breaktest.go. DO NOT EDIT.
  70. // ` + os.Args[3] + ` are Grapheme testcases taken from
  71. // ` + url + `
  72. // on ` + time.Now().Format("January 2, 2006") + `. See
  73. // https://www.unicode.org/license.html for the Unicode license agreement.
  74. var ` + os.Args[3] + ` = []testCase {
  75. `)
  76. sc := bufio.NewScanner(body)
  77. num := 1
  78. var line []byte
  79. original := make([]byte, 0, 64)
  80. expected := make([]byte, 0, 64)
  81. for sc.Scan() {
  82. num++
  83. line = sc.Bytes()
  84. if len(line) == 0 || line[0] == '#' {
  85. continue
  86. }
  87. var comment []byte
  88. if i := bytes.IndexByte(line, '#'); i >= 0 {
  89. comment = bytes.TrimSpace(line[i+1:])
  90. line = bytes.TrimSpace(line[:i])
  91. }
  92. original, expected, err := parseRuneSequence(line, original[:0], expected[:0])
  93. if err != nil {
  94. return nil, fmt.Errorf(`line %d: %v: %q`, num, err, line)
  95. }
  96. fmt.Fprintf(buf, "\t{original: \"%s\", expected: %s}, // %s\n", original, expected, comment)
  97. }
  98. if err := sc.Err(); err != nil {
  99. return nil, err
  100. }
  101. // Check for final "# EOF", useful check if we're streaming via HTTP
  102. if !bytes.Equal(line, []byte("# EOF")) {
  103. return nil, fmt.Errorf(`line %d: exected "# EOF" as final line, got %q`, num, line)
  104. }
  105. buf.WriteString("}\n")
  106. return buf.Bytes(), nil
  107. }
  // Break markers used by parseRuneSequence. The prefix variants include the
  // space that follows the marker at the very start of a sequence.
  var (
  	prefixBreak     = []byte("÷ ")
  	prefixDontBreak = []byte("× ")
  	breakOk         = []byte("÷")
  	breakNo         = []byte("×")
  )

  // parseRuneSequence parses a rune + breaking opportunity sequence from b
  // and appends the Go code for testcase.original to orig
  // and appends the Go code for testcase.expected to exp.
  // It returns the new orig and exp slices.
  //
  // E.g. for the input b="÷ 0020 × 0308 ÷ 1F1E6 ÷"
  // it will append
  //
  //	"\u0020\u0308\U0001F1E6"
  //
  // and
  //
  //	"[][]rune{{0x0020,0x0308},{0x1F1E6},}"
  //
  // to orig and exp respectively.
  //
  // The formatting of exp is expected to be cleaned up by gofmt or format.Source.
  //
  // The sequence must start with "÷ " or "× " and must end with ÷. Every code
  // point must consist of 4, 5, or 6 hexadecimal digits and must be followed by
  // a space and a ÷ or × marker. If any of these requirements is violated, nil
  // slices and an error are returned.
  func parseRuneSequence(b, orig, exp []byte) ([]byte, []byte, error) {
  	// Check for and remove first ÷ or ×.
  	switch {
  	case bytes.HasPrefix(b, prefixBreak):
  		b = b[len(prefixBreak):]
  	case bytes.HasPrefix(b, prefixDontBreak):
  		b = b[len(prefixDontBreak):]
  	default:
  		return nil, nil, errors.New("expected ÷ or × as first character")
  	}

  	// boundary reports whether the most recent marker was ÷, i.e. whether
  	// the next code point starts a new inner slice.
  	boundary := true
  	exp = append(exp, "[][]rune{"...)
  	for len(b) > 0 {
  		if boundary {
  			exp = append(exp, '{')
  		}
  		exp = append(exp, "0x"...)

  		// Find end of hex digits.
  		i := 0
  		for ; i < len(b) && b[i] != ' '; i++ {
  			d := b[i]
  			isHex := ('0' <= d && d <= '9') ||
  				('A' <= d && d <= 'F') ||
  				('a' <= d && d <= 'f')
  			if !isHex {
  				return nil, nil, errors.New("bad hex digit")
  			}
  		}

  		// A \u escape takes exactly 4 hex digits and a \U escape exactly 8,
  		// so longer code points are padded with leading zeros.
  		switch i {
  		case 4:
  			orig = append(orig, `\u`...)
  		case 5:
  			orig = append(orig, `\U000`...)
  		case 6:
  			orig = append(orig, `\U00`...)
  		default:
  			return nil, nil, errors.New("unsupported code point hex length")
  		}
  		orig = append(orig, b[:i]...)
  		exp = append(exp, b[:i]...)
  		b = b[i:]

  		// Check for space between hex and ÷ or ×.
  		if len(b) < 1 || b[0] != ' ' {
  			return nil, nil, errors.New("bad input")
  		}
  		b = b[1:]

  		// Check for next boundary.
  		switch {
  		case bytes.HasPrefix(b, breakOk):
  			boundary = true
  			b = b[len(breakOk):]
  		case bytes.HasPrefix(b, breakNo):
  			boundary = false
  			b = b[len(breakNo):]
  		default:
  			return nil, nil, errors.New("missing ÷ or ×")
  		}
  		if boundary {
  			exp = append(exp, '}')
  		}
  		exp = append(exp, ',')

  		// Skip the space separating the marker from the next code point.
  		if len(b) > 0 && b[0] == ' ' {
  			b = b[1:]
  		}
  	}

  	// A sequence that ends with × leaves the last inner slice unclosed, which
  	// would produce invalid Go code. Report it here instead.
  	if !boundary {
  		return nil, nil, errors.New("expected ÷ as last character")
  	}
  	exp = append(exp, '}')
  	return orig, exp, nil
  }