wordrules.go 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282
  1. package uniseg
  2. import "unicode/utf8"
  3. // The states of the word break parser.
  4. const (
  5. wbAny = iota
  6. wbCR
  7. wbLF
  8. wbNewline
  9. wbWSegSpace
  10. wbHebrewLetter
  11. wbALetter
  12. wbWB7
  13. wbWB7c
  14. wbNumeric
  15. wbWB11
  16. wbKatakana
  17. wbExtendNumLet
  18. wbOddRI
  19. wbEvenRI
  20. wbZWJBit = 16 // This bit is set for any states followed by at least one zero-width joiner (see WB4 and WB3c).
  21. )
  22. // wbTransitions implements the word break parser's state transitions. It's
  23. // anologous to [grTransitions], see comments there for details.
  24. //
  25. // Unicode version 15.0.0.
  26. func wbTransitions(state, prop int) (newState int, wordBreak bool, rule int) {
  27. switch uint64(state) | uint64(prop)<<32 {
  28. // WB3b.
  29. case wbAny | prNewline<<32:
  30. return wbNewline, true, 32
  31. case wbAny | prCR<<32:
  32. return wbCR, true, 32
  33. case wbAny | prLF<<32:
  34. return wbLF, true, 32
  35. // WB3a.
  36. case wbNewline | prAny<<32:
  37. return wbAny, true, 31
  38. case wbCR | prAny<<32:
  39. return wbAny, true, 31
  40. case wbLF | prAny<<32:
  41. return wbAny, true, 31
  42. // WB3.
  43. case wbCR | prLF<<32:
  44. return wbLF, false, 30
  45. // WB3d.
  46. case wbAny | prWSegSpace<<32:
  47. return wbWSegSpace, true, 9990
  48. case wbWSegSpace | prWSegSpace<<32:
  49. return wbWSegSpace, false, 34
  50. // WB5.
  51. case wbAny | prALetter<<32:
  52. return wbALetter, true, 9990
  53. case wbAny | prHebrewLetter<<32:
  54. return wbHebrewLetter, true, 9990
  55. case wbALetter | prALetter<<32:
  56. return wbALetter, false, 50
  57. case wbALetter | prHebrewLetter<<32:
  58. return wbHebrewLetter, false, 50
  59. case wbHebrewLetter | prALetter<<32:
  60. return wbALetter, false, 50
  61. case wbHebrewLetter | prHebrewLetter<<32:
  62. return wbHebrewLetter, false, 50
  63. // WB7. Transitions to wbWB7 handled by transitionWordBreakState().
  64. case wbWB7 | prALetter<<32:
  65. return wbALetter, false, 70
  66. case wbWB7 | prHebrewLetter<<32:
  67. return wbHebrewLetter, false, 70
  68. // WB7a.
  69. case wbHebrewLetter | prSingleQuote<<32:
  70. return wbAny, false, 71
  71. // WB7c. Transitions to wbWB7c handled by transitionWordBreakState().
  72. case wbWB7c | prHebrewLetter<<32:
  73. return wbHebrewLetter, false, 73
  74. // WB8.
  75. case wbAny | prNumeric<<32:
  76. return wbNumeric, true, 9990
  77. case wbNumeric | prNumeric<<32:
  78. return wbNumeric, false, 80
  79. // WB9.
  80. case wbALetter | prNumeric<<32:
  81. return wbNumeric, false, 90
  82. case wbHebrewLetter | prNumeric<<32:
  83. return wbNumeric, false, 90
  84. // WB10.
  85. case wbNumeric | prALetter<<32:
  86. return wbALetter, false, 100
  87. case wbNumeric | prHebrewLetter<<32:
  88. return wbHebrewLetter, false, 100
  89. // WB11. Transitions to wbWB11 handled by transitionWordBreakState().
  90. case wbWB11 | prNumeric<<32:
  91. return wbNumeric, false, 110
  92. // WB13.
  93. case wbAny | prKatakana<<32:
  94. return wbKatakana, true, 9990
  95. case wbKatakana | prKatakana<<32:
  96. return wbKatakana, false, 130
  97. // WB13a.
  98. case wbAny | prExtendNumLet<<32:
  99. return wbExtendNumLet, true, 9990
  100. case wbALetter | prExtendNumLet<<32:
  101. return wbExtendNumLet, false, 131
  102. case wbHebrewLetter | prExtendNumLet<<32:
  103. return wbExtendNumLet, false, 131
  104. case wbNumeric | prExtendNumLet<<32:
  105. return wbExtendNumLet, false, 131
  106. case wbKatakana | prExtendNumLet<<32:
  107. return wbExtendNumLet, false, 131
  108. case wbExtendNumLet | prExtendNumLet<<32:
  109. return wbExtendNumLet, false, 131
  110. // WB13b.
  111. case wbExtendNumLet | prALetter<<32:
  112. return wbALetter, false, 132
  113. case wbExtendNumLet | prHebrewLetter<<32:
  114. return wbHebrewLetter, false, 132
  115. case wbExtendNumLet | prNumeric<<32:
  116. return wbNumeric, false, 132
  117. case wbExtendNumLet | prKatakana<<32:
  118. return wbKatakana, false, 132
  119. default:
  120. return -1, false, -1
  121. }
  122. }
  123. // transitionWordBreakState determines the new state of the word break parser
  124. // given the current state and the next code point. It also returns whether a
  125. // word boundary was detected. If more than one code point is needed to
  126. // determine the new state, the byte slice or the string starting after rune "r"
  127. // can be used (whichever is not nil or empty) for further lookups.
  128. func transitionWordBreakState(state int, r rune, b []byte, str string) (newState int, wordBreak bool) {
  129. // Determine the property of the next character.
  130. nextProperty := property(workBreakCodePoints, r)
  131. // "Replacing Ignore Rules".
  132. if nextProperty == prZWJ {
  133. // WB4 (for zero-width joiners).
  134. if state == wbNewline || state == wbCR || state == wbLF {
  135. return wbAny | wbZWJBit, true // Make sure we don't apply WB4 to WB3a.
  136. }
  137. if state < 0 {
  138. return wbAny | wbZWJBit, false
  139. }
  140. return state | wbZWJBit, false
  141. } else if nextProperty == prExtend || nextProperty == prFormat {
  142. // WB4 (for Extend and Format).
  143. if state == wbNewline || state == wbCR || state == wbLF {
  144. return wbAny, true // Make sure we don't apply WB4 to WB3a.
  145. }
  146. if state == wbWSegSpace || state == wbAny|wbZWJBit {
  147. return wbAny, false // We don't break but this is also not WB3d or WB3c.
  148. }
  149. if state < 0 {
  150. return wbAny, false
  151. }
  152. return state, false
  153. } else if nextProperty == prExtendedPictographic && state >= 0 && state&wbZWJBit != 0 {
  154. // WB3c.
  155. return wbAny, false
  156. }
  157. if state >= 0 {
  158. state = state &^ wbZWJBit
  159. }
  160. // Find the applicable transition in the table.
  161. var rule int
  162. newState, wordBreak, rule = wbTransitions(state, nextProperty)
  163. if newState < 0 {
  164. // No specific transition found. Try the less specific ones.
  165. anyPropState, anyPropWordBreak, anyPropRule := wbTransitions(state, prAny)
  166. anyStateState, anyStateWordBreak, anyStateRule := wbTransitions(wbAny, nextProperty)
  167. if anyPropState >= 0 && anyStateState >= 0 {
  168. // Both apply. We'll use a mix (see comments for grTransitions).
  169. newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule
  170. if anyPropRule < anyStateRule {
  171. wordBreak, rule = anyPropWordBreak, anyPropRule
  172. }
  173. } else if anyPropState >= 0 {
  174. // We only have a specific state.
  175. newState, wordBreak, rule = anyPropState, anyPropWordBreak, anyPropRule
  176. // This branch will probably never be reached because okAnyState will
  177. // always be true given the current transition map. But we keep it here
  178. // for future modifications to the transition map where this may not be
  179. // true anymore.
  180. } else if anyStateState >= 0 {
  181. // We only have a specific property.
  182. newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule
  183. } else {
  184. // No known transition. WB999: Any ÷ Any.
  185. newState, wordBreak, rule = wbAny, true, 9990
  186. }
  187. }
  188. // For those rules that need to look up runes further in the string, we
  189. // determine the property after nextProperty, skipping over Format, Extend,
  190. // and ZWJ (according to WB4). It's -1 if not needed, if such a rune cannot
  191. // be determined (because the text ends or the rune is faulty).
  192. farProperty := -1
  193. if rule > 60 &&
  194. (state == wbALetter || state == wbHebrewLetter || state == wbNumeric) &&
  195. (nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote || // WB6.
  196. nextProperty == prDoubleQuote || // WB7b.
  197. nextProperty == prMidNum) { // WB12.
  198. for {
  199. var (
  200. r rune
  201. length int
  202. )
  203. if b != nil { // Byte slice version.
  204. r, length = utf8.DecodeRune(b)
  205. b = b[length:]
  206. } else { // String version.
  207. r, length = utf8.DecodeRuneInString(str)
  208. str = str[length:]
  209. }
  210. if r == utf8.RuneError {
  211. break
  212. }
  213. prop := property(workBreakCodePoints, r)
  214. if prop == prExtend || prop == prFormat || prop == prZWJ {
  215. continue
  216. }
  217. farProperty = prop
  218. break
  219. }
  220. }
  221. // WB6.
  222. if rule > 60 &&
  223. (state == wbALetter || state == wbHebrewLetter) &&
  224. (nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
  225. (farProperty == prALetter || farProperty == prHebrewLetter) {
  226. return wbWB7, false
  227. }
  228. // WB7b.
  229. if rule > 72 &&
  230. state == wbHebrewLetter &&
  231. nextProperty == prDoubleQuote &&
  232. farProperty == prHebrewLetter {
  233. return wbWB7c, false
  234. }
  235. // WB12.
  236. if rule > 120 &&
  237. state == wbNumeric &&
  238. (nextProperty == prMidNum || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
  239. farProperty == prNumeric {
  240. return wbWB11, false
  241. }
  242. // WB15 and WB16.
  243. if newState == wbAny && nextProperty == prRegionalIndicator {
  244. if state != wbOddRI && state != wbEvenRI { // Includes state == -1.
  245. // Transition into the first RI.
  246. return wbOddRI, true
  247. }
  248. if state == wbOddRI {
  249. // Don't break pairs of Regional Indicators.
  250. return wbEvenRI, false
  251. }
  252. return wbOddRI, true // We can break after a pair.
  253. }
  254. return
  255. }