gen_properties.go 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261
  1. //go:build generate
// This program generates a Go source file containing a property table from
// Unicode Character Database data files. The command line arguments are as follows:
  4. //
  5. // 1. The name of the Unicode data file (just the filename, without extension).
  6. // Can be "-" (to skip) if the emoji flag is included.
  7. // 2. The name of the locally generated Go file.
  8. // 3. The name of the slice mapping code points to properties.
  9. // 4. The name of the generator, for logging purposes.
  10. // 5. (Optional) Flags, comma-separated. The following flags are available:
  11. // - "emojis=<property>": include the specified emoji properties (e.g.
  12. // "Extended_Pictographic").
  13. // - "gencat": include general category properties.
  14. //
  15. //go:generate go run gen_properties.go auxiliary/GraphemeBreakProperty graphemeproperties.go graphemeCodePoints graphemes emojis=Extended_Pictographic
  16. //go:generate go run gen_properties.go auxiliary/WordBreakProperty wordproperties.go workBreakCodePoints words emojis=Extended_Pictographic
  17. //go:generate go run gen_properties.go auxiliary/SentenceBreakProperty sentenceproperties.go sentenceBreakCodePoints sentences
  18. //go:generate go run gen_properties.go LineBreak lineproperties.go lineBreakCodePoints lines gencat
  19. //go:generate go run gen_properties.go EastAsianWidth eastasianwidth.go eastAsianWidth eastasianwidth
  20. //go:generate go run gen_properties.go - emojipresentation.go emojiPresentation emojipresentation emojis=Emoji_Presentation
  21. package main
  22. import (
  23. "bufio"
  24. "bytes"
  25. "errors"
  26. "fmt"
  27. "go/format"
  28. "io/ioutil"
  29. "log"
  30. "net/http"
  31. "os"
  32. "regexp"
  33. "sort"
  34. "strconv"
  35. "strings"
  36. "time"
  37. )
// The data files are pinned to a specific Unicode version rather than the
// latest one. When the package is upgraded to a new Unicode version, change the
// version number in both URLs and regenerate the property files.
const (
	// propertyURL is a format string; its "%s" verb is replaced with the name
	// of the data file given as the first command line argument.
	propertyURL = `https://www.unicode.org/Public/15.0.0/ucd/%s.txt`
	// emojiURL is the location of the emoji data file, which is read when an
	// emoji property is requested.
	emojiURL = `https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt`
)

// The regular expression for a line containing a code point range property.
// Its capture groups are: (1) the first code point, (3) the optional last code
// point of a range, (4) the property name, and (5) the comment following "#".
// Code points must be written as 4 to 6 upper-case hexadecimal digits.
var propertyPattern = regexp.MustCompile(`^([0-9A-F]{4,6})(\.\.([0-9A-F]{4,6}))?\s*;\s*([A-Za-z0-9_]+)\s*#\s(.+)$`)
  46. func main() {
  47. if len(os.Args) < 5 {
  48. fmt.Println("Not enough arguments, see code for details")
  49. os.Exit(1)
  50. }
  51. log.SetPrefix("gen_properties (" + os.Args[4] + "): ")
  52. log.SetFlags(0)
  53. // Parse flags.
  54. flags := make(map[string]string)
  55. if len(os.Args) >= 6 {
  56. for _, flag := range strings.Split(os.Args[5], ",") {
  57. flagFields := strings.Split(flag, "=")
  58. if len(flagFields) == 1 {
  59. flags[flagFields[0]] = "yes"
  60. } else {
  61. flags[flagFields[0]] = flagFields[1]
  62. }
  63. }
  64. }
  65. // Parse the text file and generate Go source code from it.
  66. _, includeGeneralCategory := flags["gencat"]
  67. var mainURL string
  68. if os.Args[1] != "-" {
  69. mainURL = fmt.Sprintf(propertyURL, os.Args[1])
  70. }
  71. src, err := parse(mainURL, flags["emojis"], includeGeneralCategory)
  72. if err != nil {
  73. log.Fatal(err)
  74. }
  75. // Format the Go code.
  76. formatted, err := format.Source([]byte(src))
  77. if err != nil {
  78. log.Fatal("gofmt:", err)
  79. }
  80. // Save it to the (local) target file.
  81. log.Print("Writing to ", os.Args[2])
  82. if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil {
  83. log.Fatal(err)
  84. }
  85. }
  86. // parse parses the Unicode Properties text files located at the given URLs and
  87. // returns their equivalent Go source code to be used in the uniseg package. If
  88. // "emojiProperty" is not an empty string, emoji code points for that emoji
  89. // property (e.g. "Extended_Pictographic") will be included. In those cases, you
  90. // may pass an empty "propertyURL" to skip parsing the main properties file. If
  91. // "includeGeneralCategory" is true, the Unicode General Category property will
  92. // be extracted from the comments and included in the output.
  93. func parse(propertyURL, emojiProperty string, includeGeneralCategory bool) (string, error) {
  94. if propertyURL == "" && emojiProperty == "" {
  95. return "", errors.New("no properties to parse")
  96. }
  97. // Temporary buffer to hold properties.
  98. var properties [][4]string
  99. // Open the first URL.
  100. if propertyURL != "" {
  101. log.Printf("Parsing %s", propertyURL)
  102. res, err := http.Get(propertyURL)
  103. if err != nil {
  104. return "", err
  105. }
  106. in1 := res.Body
  107. defer in1.Close()
  108. // Parse it.
  109. scanner := bufio.NewScanner(in1)
  110. num := 0
  111. for scanner.Scan() {
  112. num++
  113. line := strings.TrimSpace(scanner.Text())
  114. // Skip comments and empty lines.
  115. if strings.HasPrefix(line, "#") || line == "" {
  116. continue
  117. }
  118. // Everything else must be a code point range, a property and a comment.
  119. from, to, property, comment, err := parseProperty(line)
  120. if err != nil {
  121. return "", fmt.Errorf("%s line %d: %v", os.Args[4], num, err)
  122. }
  123. properties = append(properties, [4]string{from, to, property, comment})
  124. }
  125. if err := scanner.Err(); err != nil {
  126. return "", err
  127. }
  128. }
  129. // Open the second URL.
  130. if emojiProperty != "" {
  131. log.Printf("Parsing %s", emojiURL)
  132. res, err := http.Get(emojiURL)
  133. if err != nil {
  134. return "", err
  135. }
  136. in2 := res.Body
  137. defer in2.Close()
  138. // Parse it.
  139. scanner := bufio.NewScanner(in2)
  140. num := 0
  141. for scanner.Scan() {
  142. num++
  143. line := scanner.Text()
  144. // Skip comments, empty lines, and everything not containing
  145. // "Extended_Pictographic".
  146. if strings.HasPrefix(line, "#") || line == "" || !strings.Contains(line, emojiProperty) {
  147. continue
  148. }
  149. // Everything else must be a code point range, a property and a comment.
  150. from, to, property, comment, err := parseProperty(line)
  151. if err != nil {
  152. return "", fmt.Errorf("emojis line %d: %v", num, err)
  153. }
  154. properties = append(properties, [4]string{from, to, property, comment})
  155. }
  156. if err := scanner.Err(); err != nil {
  157. return "", err
  158. }
  159. }
  160. // Avoid overflow during binary search.
  161. if len(properties) >= 1<<31 {
  162. return "", errors.New("too many properties")
  163. }
  164. // Sort properties.
  165. sort.Slice(properties, func(i, j int) bool {
  166. left, _ := strconv.ParseUint(properties[i][0], 16, 64)
  167. right, _ := strconv.ParseUint(properties[j][0], 16, 64)
  168. return left < right
  169. })
  170. // Header.
  171. var (
  172. buf bytes.Buffer
  173. emojiComment string
  174. )
  175. columns := 3
  176. if includeGeneralCategory {
  177. columns = 4
  178. }
  179. if emojiURL != "" {
  180. emojiComment = `
  181. // and
  182. // ` + emojiURL + `
  183. // ("Extended_Pictographic" only)`
  184. }
  185. buf.WriteString(`// Code generated via go generate from gen_properties.go. DO NOT EDIT.
  186. package uniseg
  187. // ` + os.Args[3] + ` are taken from
  188. // ` + propertyURL + emojiComment + `
  189. // on ` + time.Now().Format("January 2, 2006") + `. See https://www.unicode.org/license.html for the Unicode
  190. // license agreement.
  191. var ` + os.Args[3] + ` = [][` + strconv.Itoa(columns) + `]int{
  192. `)
  193. // Properties.
  194. for _, prop := range properties {
  195. if includeGeneralCategory {
  196. generalCategory := "gc" + prop[3][:2]
  197. if generalCategory == "gcL&" {
  198. generalCategory = "gcLC"
  199. }
  200. prop[3] = prop[3][3:]
  201. fmt.Fprintf(&buf, "{0x%s,0x%s,%s,%s}, // %s\n", prop[0], prop[1], translateProperty("pr", prop[2]), generalCategory, prop[3])
  202. } else {
  203. fmt.Fprintf(&buf, "{0x%s,0x%s,%s}, // %s\n", prop[0], prop[1], translateProperty("pr", prop[2]), prop[3])
  204. }
  205. }
  206. // Tail.
  207. buf.WriteString("}")
  208. return buf.String(), nil
  209. }
  210. // parseProperty parses a line of the Unicode properties text file containing a
  211. // property for a code point range and returns it along with its comment.
  212. func parseProperty(line string) (from, to, property, comment string, err error) {
  213. fields := propertyPattern.FindStringSubmatch(line)
  214. if fields == nil {
  215. err = errors.New("no property found")
  216. return
  217. }
  218. from = fields[1]
  219. to = fields[3]
  220. if to == "" {
  221. to = from
  222. }
  223. property = fields[4]
  224. comment = fields[5]
  225. return
  226. }
  227. // translateProperty translates a property name as used in the Unicode data file
  228. // to a variable used in the Go code.
  229. func translateProperty(prefix, property string) string {
  230. return prefix + strings.ReplaceAll(property, "_", "")
  231. }