properties.go 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208
  1. package uniseg
  // The Unicode properties as used in the various parsers. Only the ones needed
  // in the context of this package are included.
  //
  // The values are consecutive integers in declaration order, starting at 0.
  // prXX is 0, which is also what property() yields for a code point that is
  // not found in a table (propertySearch returns an all-zero entry). prAny is
  // 1, so prXX and prAny are distinct values.
  //
  // NOTE(review): earlier comments on this block described prAny as 0 and as
  // being identical to prXX. Because prAny was declared as the second constant
  // with "= iota", the compiler has always assigned it 1. The numeric values
  // are deliberately left unchanged here, since code elsewhere in the package
  // may depend on them; confirm which behavior is intended.
  const (
  	prXX      = iota // 0. Also the "not found" result of a table lookup.
  	prAny            // 1. Not equal to prXX (see the note above).
  	prPrepend        // Grapheme properties must come first, to reduce the number of bits stored in the state vector.
  	prCR
  	prLF
  	prControl
  	prExtend
  	prRegionalIndicator
  	prSpacingMark
  	prL
  	prV
  	prT
  	prLV
  	prLVT
  	prZWJ
  	prExtendedPictographic
  	prNewline
  	prWSegSpace
  	prDoubleQuote
  	prSingleQuote
  	prMidNumLet
  	prNumeric
  	prMidLetter
  	prMidNum
  	prExtendNumLet
  	prALetter
  	prFormat
  	prHebrewLetter
  	prKatakana
  	prSp
  	prSTerm
  	prClose
  	prSContinue
  	prATerm
  	prUpper
  	prLower
  	prSep
  	prOLetter
  	prCM
  	prBA
  	prBK
  	prSP
  	prEX
  	prQU
  	prAL
  	prPR
  	prPO
  	prOP
  	prCP
  	prIS
  	prHY
  	prSY
  	prNU
  	prCL
  	prNL
  	prGL
  	prAI
  	prBB
  	prHL
  	prSA
  	prJL
  	prJV
  	prJT
  	prNS
  	prZW
  	prB2
  	prIN
  	prWJ
  	prID
  	prEB
  	prCJ
  	prH2
  	prH3
  	prSG
  	prCB
  	prRI
  	prEM
  	prN
  	prNa
  	prA
  	prW
  	prH
  	prF
  	prEmojiPresentation
  )
  // Unicode General Categories. Only the ones needed in the context of this
  // package are included. The trailing descriptions are the standard Unicode
  // long names for each two-letter abbreviation.
  const (
  	gcNone = iota // gcNone must be 0.
  	gcCc          // Control
  	gcZs          // Space separator
  	gcPo          // Other punctuation
  	gcSc          // Currency symbol
  	gcPs          // Open punctuation
  	gcPe          // Close punctuation
  	gcSm          // Math symbol
  	gcPd          // Dash punctuation
  	gcNd          // Decimal number
  	gcLu          // Uppercase letter
  	gcSk          // Modifier symbol
  	gcPc          // Connector punctuation
  	gcLl          // Lowercase letter
  	gcSo          // Other symbol
  	gcLo          // Other letter
  	gcPi          // Initial punctuation
  	gcCf          // Format
  	gcNo          // Other number
  	gcPf          // Final punctuation
  	gcLC          // Cased letter
  	gcLm          // Modifier letter
  	gcMn          // Nonspacing mark
  	gcMe          // Enclosing mark
  	gcMc          // Spacing mark
  	gcNl          // Letter number
  	gcZl          // Line separator
  	gcZp          // Paragraph separator
  	gcCn          // Unassigned
  	gcCs          // Surrogate
  	gcCo          // Private use
  )
  // Code points that receive special treatment.
  const (
  	// vs15 is U+FE0E, Variation Selector-15 (text presentation).
  	vs15 = 0xFE0E
  	// vs16 is U+FE0F, Variation Selector-16 (emoji presentation).
  	vs16 = 0xFE0F
  )
  // propertySearch looks up r in dictionary with a binary search. Each entry
  // describes an inclusive code point range: element 0 is the first code point
  // and element 1 is the last. The entry whose range contains r is returned. If
  // no entry matches (including when dictionary is empty), the zero value of E
  // is returned, i.e. an array of 0's.
  //
  // The search assumes that dictionary is sorted in ascending order and that
  // its ranges do not overlap.
  func propertySearch[E interface{ [3]int | [4]int }](dictionary []E, r rune) (result E) {
  	codePoint := int(r)
  	low, high := 0, len(dictionary)
  	for low < high {
  		mid := (low + high) / 2
  		entry := dictionary[mid]
  		switch {
  		case codePoint < entry[0]:
  			// r lies before this range: keep the lower half.
  			high = mid
  		case codePoint > entry[1]:
  			// r lies after this range: keep the upper half.
  			low = mid + 1
  		default:
  			return entry
  		}
  	}
  	// Not found: result still holds the zero value.
  	return result
  }
  152. // property returns the Unicode property value (see constants above) of the
  153. // given code point.
  154. func property(dictionary [][3]int, r rune) int {
  155. return propertySearch(dictionary, r)[2]
  156. }
  157. // propertyLineBreak returns the Unicode property value and General Category
  158. // (see constants above) of the given code point, as listed in the line break
  159. // code points table, while fast tracking ASCII digits and letters.
  160. func propertyLineBreak(r rune) (property, generalCategory int) {
  161. if r >= 'a' && r <= 'z' {
  162. return prAL, gcLl
  163. }
  164. if r >= 'A' && r <= 'Z' {
  165. return prAL, gcLu
  166. }
  167. if r >= '0' && r <= '9' {
  168. return prNU, gcNd
  169. }
  170. entry := propertySearch(lineBreakCodePoints, r)
  171. return entry[2], entry[3]
  172. }
  173. // propertyGraphemes returns the Unicode grapheme cluster property value of the
  174. // given code point while fast tracking ASCII characters.
  175. func propertyGraphemes(r rune) int {
  176. if r >= 0x20 && r <= 0x7e {
  177. return prAny
  178. }
  179. if r == 0x0a {
  180. return prLF
  181. }
  182. if r == 0x0d {
  183. return prCR
  184. }
  185. if r >= 0 && r <= 0x1f || r == 0x7f {
  186. return prControl
  187. }
  188. return property(graphemeCodePoints, r)
  189. }
  190. // propertyEastAsianWidth returns the Unicode East Asian Width property value of
  191. // the given code point while fast tracking ASCII characters.
  192. func propertyEastAsianWidth(r rune) int {
  193. if r >= 0x20 && r <= 0x7e {
  194. return prNa
  195. }
  196. if r >= 0 && r <= 0x1f || r == 0x7f {
  197. return prN
  198. }
  199. return property(eastAsianWidth, r)
  200. }