sentencerules.go 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276
  1. package uniseg
  2. import "unicode/utf8"
  // The states of the sentence break parser. A state summarizes the part of the
  // preceding text that the rules in sbTransitions still need to look back at.
  const (
  	// sbAny is the default state: nothing seen so far influences the next
  	// decision. transitionSentenceBreakState also uses it as the wildcard
  	// state when it retries a lookup by property alone.
  	sbAny = iota
  	// sbCR is entered when a CR is read in state sbAny (SB3).
  	sbCR
  	// sbParaSep means a paragraph separator was just consumed; any code point
  	// that follows starts a new sentence (SB4).
  	sbParaSep
  	// sbATerm is entered when an ATerm is read in state sbAny.
  	sbATerm
  	// sbUpper is entered when an Upper is read in state sbAny.
  	sbUpper
  	// sbLower is entered when a Lower is read in state sbAny.
  	sbLower
  	// sbSB7 is entered when an ATerm directly follows an Upper or a Lower,
  	// i.e. the left-hand side of SB7 has been matched.
  	sbSB7
  	// sbSB8Close is reached from sbATerm or sbSB7 by one or more Close.
  	sbSB8Close
  	// sbSB8Sp is reached from sbATerm, sbSB7, or sbSB8Close by one or more Sp.
  	sbSB8Sp
  	// sbSTerm is entered when an STerm is read in state sbAny.
  	sbSTerm
  	// sbSB8aClose is reached from sbSTerm by one or more Close.
  	sbSB8aClose
  	// sbSB8aSp is reached from sbSTerm or sbSB8aClose by one or more Sp.
  	sbSB8aSp
  )
  18. // sbTransitions implements the sentence break parser's state transitions. It's
  19. // anologous to [grTransitions], see comments there for details.
  20. //
  21. // Unicode version 15.0.0.
  22. func sbTransitions(state, prop int) (newState int, sentenceBreak bool, rule int) {
  23. switch uint64(state) | uint64(prop)<<32 {
  24. // SB3.
  25. case sbAny | prCR<<32:
  26. return sbCR, false, 9990
  27. case sbCR | prLF<<32:
  28. return sbParaSep, false, 30
  29. // SB4.
  30. case sbAny | prSep<<32:
  31. return sbParaSep, false, 9990
  32. case sbAny | prLF<<32:
  33. return sbParaSep, false, 9990
  34. case sbParaSep | prAny<<32:
  35. return sbAny, true, 40
  36. case sbCR | prAny<<32:
  37. return sbAny, true, 40
  38. // SB6.
  39. case sbAny | prATerm<<32:
  40. return sbATerm, false, 9990
  41. case sbATerm | prNumeric<<32:
  42. return sbAny, false, 60
  43. case sbSB7 | prNumeric<<32:
  44. return sbAny, false, 60 // Because ATerm also appears in SB7.
  45. // SB7.
  46. case sbAny | prUpper<<32:
  47. return sbUpper, false, 9990
  48. case sbAny | prLower<<32:
  49. return sbLower, false, 9990
  50. case sbUpper | prATerm<<32:
  51. return sbSB7, false, 70
  52. case sbLower | prATerm<<32:
  53. return sbSB7, false, 70
  54. case sbSB7 | prUpper<<32:
  55. return sbUpper, false, 70
  56. // SB8a.
  57. case sbAny | prSTerm<<32:
  58. return sbSTerm, false, 9990
  59. case sbATerm | prSContinue<<32:
  60. return sbAny, false, 81
  61. case sbATerm | prATerm<<32:
  62. return sbATerm, false, 81
  63. case sbATerm | prSTerm<<32:
  64. return sbSTerm, false, 81
  65. case sbSB7 | prSContinue<<32:
  66. return sbAny, false, 81
  67. case sbSB7 | prATerm<<32:
  68. return sbATerm, false, 81
  69. case sbSB7 | prSTerm<<32:
  70. return sbSTerm, false, 81
  71. case sbSB8Close | prSContinue<<32:
  72. return sbAny, false, 81
  73. case sbSB8Close | prATerm<<32:
  74. return sbATerm, false, 81
  75. case sbSB8Close | prSTerm<<32:
  76. return sbSTerm, false, 81
  77. case sbSB8Sp | prSContinue<<32:
  78. return sbAny, false, 81
  79. case sbSB8Sp | prATerm<<32:
  80. return sbATerm, false, 81
  81. case sbSB8Sp | prSTerm<<32:
  82. return sbSTerm, false, 81
  83. case sbSTerm | prSContinue<<32:
  84. return sbAny, false, 81
  85. case sbSTerm | prATerm<<32:
  86. return sbATerm, false, 81
  87. case sbSTerm | prSTerm<<32:
  88. return sbSTerm, false, 81
  89. case sbSB8aClose | prSContinue<<32:
  90. return sbAny, false, 81
  91. case sbSB8aClose | prATerm<<32:
  92. return sbATerm, false, 81
  93. case sbSB8aClose | prSTerm<<32:
  94. return sbSTerm, false, 81
  95. case sbSB8aSp | prSContinue<<32:
  96. return sbAny, false, 81
  97. case sbSB8aSp | prATerm<<32:
  98. return sbATerm, false, 81
  99. case sbSB8aSp | prSTerm<<32:
  100. return sbSTerm, false, 81
  101. // SB9.
  102. case sbATerm | prClose<<32:
  103. return sbSB8Close, false, 90
  104. case sbSB7 | prClose<<32:
  105. return sbSB8Close, false, 90
  106. case sbSB8Close | prClose<<32:
  107. return sbSB8Close, false, 90
  108. case sbATerm | prSp<<32:
  109. return sbSB8Sp, false, 90
  110. case sbSB7 | prSp<<32:
  111. return sbSB8Sp, false, 90
  112. case sbSB8Close | prSp<<32:
  113. return sbSB8Sp, false, 90
  114. case sbSTerm | prClose<<32:
  115. return sbSB8aClose, false, 90
  116. case sbSB8aClose | prClose<<32:
  117. return sbSB8aClose, false, 90
  118. case sbSTerm | prSp<<32:
  119. return sbSB8aSp, false, 90
  120. case sbSB8aClose | prSp<<32:
  121. return sbSB8aSp, false, 90
  122. case sbATerm | prSep<<32:
  123. return sbParaSep, false, 90
  124. case sbATerm | prCR<<32:
  125. return sbParaSep, false, 90
  126. case sbATerm | prLF<<32:
  127. return sbParaSep, false, 90
  128. case sbSB7 | prSep<<32:
  129. return sbParaSep, false, 90
  130. case sbSB7 | prCR<<32:
  131. return sbParaSep, false, 90
  132. case sbSB7 | prLF<<32:
  133. return sbParaSep, false, 90
  134. case sbSB8Close | prSep<<32:
  135. return sbParaSep, false, 90
  136. case sbSB8Close | prCR<<32:
  137. return sbParaSep, false, 90
  138. case sbSB8Close | prLF<<32:
  139. return sbParaSep, false, 90
  140. case sbSTerm | prSep<<32:
  141. return sbParaSep, false, 90
  142. case sbSTerm | prCR<<32:
  143. return sbParaSep, false, 90
  144. case sbSTerm | prLF<<32:
  145. return sbParaSep, false, 90
  146. case sbSB8aClose | prSep<<32:
  147. return sbParaSep, false, 90
  148. case sbSB8aClose | prCR<<32:
  149. return sbParaSep, false, 90
  150. case sbSB8aClose | prLF<<32:
  151. return sbParaSep, false, 90
  152. // SB10.
  153. case sbSB8Sp | prSp<<32:
  154. return sbSB8Sp, false, 100
  155. case sbSB8aSp | prSp<<32:
  156. return sbSB8aSp, false, 100
  157. case sbSB8Sp | prSep<<32:
  158. return sbParaSep, false, 100
  159. case sbSB8Sp | prCR<<32:
  160. return sbParaSep, false, 100
  161. case sbSB8Sp | prLF<<32:
  162. return sbParaSep, false, 100
  163. // SB11.
  164. case sbATerm | prAny<<32:
  165. return sbAny, true, 110
  166. case sbSB7 | prAny<<32:
  167. return sbAny, true, 110
  168. case sbSB8Close | prAny<<32:
  169. return sbAny, true, 110
  170. case sbSB8Sp | prAny<<32:
  171. return sbAny, true, 110
  172. case sbSTerm | prAny<<32:
  173. return sbAny, true, 110
  174. case sbSB8aClose | prAny<<32:
  175. return sbAny, true, 110
  176. case sbSB8aSp | prAny<<32:
  177. return sbAny, true, 110
  178. // We'll always break after ParaSep due to SB4.
  179. default:
  180. return -1, false, -1
  181. }
  182. }
  183. // transitionSentenceBreakState determines the new state of the sentence break
  184. // parser given the current state and the next code point. It also returns
  185. // whether a sentence boundary was detected. If more than one code point is
  186. // needed to determine the new state, the byte slice or the string starting
  187. // after rune "r" can be used (whichever is not nil or empty) for further
  188. // lookups.
  189. func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newState int, sentenceBreak bool) {
  190. // Determine the property of the next character.
  191. nextProperty := property(sentenceBreakCodePoints, r)
  192. // SB5 (Replacing Ignore Rules).
  193. if nextProperty == prExtend || nextProperty == prFormat {
  194. if state == sbParaSep || state == sbCR {
  195. return sbAny, true // Make sure we don't apply SB5 to SB3 or SB4.
  196. }
  197. if state < 0 {
  198. return sbAny, true // SB1.
  199. }
  200. return state, false
  201. }
  202. // Find the applicable transition in the table.
  203. var rule int
  204. newState, sentenceBreak, rule = sbTransitions(state, nextProperty)
  205. if newState < 0 {
  206. // No specific transition found. Try the less specific ones.
  207. anyPropState, anyPropProp, anyPropRule := sbTransitions(state, prAny)
  208. anyStateState, anyStateProp, anyStateRule := sbTransitions(sbAny, nextProperty)
  209. if anyPropState >= 0 && anyStateState >= 0 {
  210. // Both apply. We'll use a mix (see comments for grTransitions).
  211. newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
  212. if anyPropRule < anyStateRule {
  213. sentenceBreak, rule = anyPropProp, anyPropRule
  214. }
  215. } else if anyPropState >= 0 {
  216. // We only have a specific state.
  217. newState, sentenceBreak, rule = anyPropState, anyPropProp, anyPropRule
  218. // This branch will probably never be reached because okAnyState will
  219. // always be true given the current transition map. But we keep it here
  220. // for future modifications to the transition map where this may not be
  221. // true anymore.
  222. } else if anyStateState >= 0 {
  223. // We only have a specific property.
  224. newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
  225. } else {
  226. // No known transition. SB999: Any × Any.
  227. newState, sentenceBreak, rule = sbAny, false, 9990
  228. }
  229. }
  230. // SB8.
  231. if rule > 80 && (state == sbATerm || state == sbSB8Close || state == sbSB8Sp || state == sbSB7) {
  232. // Check the right side of the rule.
  233. var length int
  234. for nextProperty != prOLetter &&
  235. nextProperty != prUpper &&
  236. nextProperty != prLower &&
  237. nextProperty != prSep &&
  238. nextProperty != prCR &&
  239. nextProperty != prLF &&
  240. nextProperty != prATerm &&
  241. nextProperty != prSTerm {
  242. // Move on to the next rune.
  243. if b != nil { // Byte slice version.
  244. r, length = utf8.DecodeRune(b)
  245. b = b[length:]
  246. } else { // String version.
  247. r, length = utf8.DecodeRuneInString(str)
  248. str = str[length:]
  249. }
  250. if r == utf8.RuneError {
  251. break
  252. }
  253. nextProperty = property(sentenceBreakCodePoints, r)
  254. }
  255. if nextProperty == prLower {
  256. return sbLower, false
  257. }
  258. }
  259. return
  260. }