sentencerules.go 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
  1. package uniseg
  2. import "unicode/utf8"
  // Parser states for sentence boundary detection. A state records which part
  // of a sentence break rule has been matched by the code points seen so far.
  const (
  	// sbAny is the default state; it also serves as the wildcard state when
  	// looking up less specific transitions.
  	sbAny = iota
  	// sbCR is entered after a CR.
  	sbCR
  	// sbParaSep is entered after a Sep, an LF, or a CR LF pair.
  	sbParaSep
  	// sbATerm is entered after an ATerm.
  	sbATerm
  	// sbUpper is entered after an Upper.
  	sbUpper
  	// sbLower is entered after a Lower.
  	sbLower
  	// sbSB7 is entered after (Upper | Lower) ATerm.
  	sbSB7
  	// sbSB8Close is entered after ATerm Close+.
  	sbSB8Close
  	// sbSB8Sp is entered after ATerm Close* Sp+.
  	sbSB8Sp
  	// sbSTerm is entered after an STerm.
  	sbSTerm
  	// sbSB8aClose is entered after STerm Close+.
  	sbSB8aClose
  	// sbSB8aSp is entered after STerm Close* Sp+.
  	sbSB8aSp
  )
  // Breaking instructions stored in the second slot of every sbTransitions
  // value.
  const (
  	// sbDontBreak means no sentence boundary precedes the code point.
  	sbDontBreak = iota
  	// sbBreak means a sentence boundary precedes the code point.
  	sbBreak
  )
  23. // The sentence break parser's state transitions. It's anologous to
  24. // grTransitions, see comments there for details. Unicode version 14.0.0.
  25. var sbTransitions = map[[2]int][3]int{
  26. // SB3.
  27. {sbAny, prCR}: {sbCR, sbDontBreak, 9990},
  28. {sbCR, prLF}: {sbParaSep, sbDontBreak, 30},
  29. // SB4.
  30. {sbAny, prSep}: {sbParaSep, sbDontBreak, 9990},
  31. {sbAny, prLF}: {sbParaSep, sbDontBreak, 9990},
  32. {sbParaSep, prAny}: {sbAny, sbBreak, 40},
  33. {sbCR, prAny}: {sbAny, sbBreak, 40},
  34. // SB6.
  35. {sbAny, prATerm}: {sbATerm, sbDontBreak, 9990},
  36. {sbATerm, prNumeric}: {sbAny, sbDontBreak, 60},
  37. {sbSB7, prNumeric}: {sbAny, sbDontBreak, 60}, // Because ATerm also appears in SB7.
  38. // SB7.
  39. {sbAny, prUpper}: {sbUpper, sbDontBreak, 9990},
  40. {sbAny, prLower}: {sbLower, sbDontBreak, 9990},
  41. {sbUpper, prATerm}: {sbSB7, sbDontBreak, 70},
  42. {sbLower, prATerm}: {sbSB7, sbDontBreak, 70},
  43. {sbSB7, prUpper}: {sbUpper, sbDontBreak, 70},
  44. // SB8a.
  45. {sbAny, prSTerm}: {sbSTerm, sbDontBreak, 9990},
  46. {sbATerm, prSContinue}: {sbAny, sbDontBreak, 81},
  47. {sbATerm, prATerm}: {sbATerm, sbDontBreak, 81},
  48. {sbATerm, prSTerm}: {sbSTerm, sbDontBreak, 81},
  49. {sbSB7, prSContinue}: {sbAny, sbDontBreak, 81},
  50. {sbSB7, prATerm}: {sbATerm, sbDontBreak, 81},
  51. {sbSB7, prSTerm}: {sbSTerm, sbDontBreak, 81},
  52. {sbSB8Close, prSContinue}: {sbAny, sbDontBreak, 81},
  53. {sbSB8Close, prATerm}: {sbATerm, sbDontBreak, 81},
  54. {sbSB8Close, prSTerm}: {sbSTerm, sbDontBreak, 81},
  55. {sbSB8Sp, prSContinue}: {sbAny, sbDontBreak, 81},
  56. {sbSB8Sp, prATerm}: {sbATerm, sbDontBreak, 81},
  57. {sbSB8Sp, prSTerm}: {sbSTerm, sbDontBreak, 81},
  58. {sbSTerm, prSContinue}: {sbAny, sbDontBreak, 81},
  59. {sbSTerm, prATerm}: {sbATerm, sbDontBreak, 81},
  60. {sbSTerm, prSTerm}: {sbSTerm, sbDontBreak, 81},
  61. {sbSB8aClose, prSContinue}: {sbAny, sbDontBreak, 81},
  62. {sbSB8aClose, prATerm}: {sbATerm, sbDontBreak, 81},
  63. {sbSB8aClose, prSTerm}: {sbSTerm, sbDontBreak, 81},
  64. {sbSB8aSp, prSContinue}: {sbAny, sbDontBreak, 81},
  65. {sbSB8aSp, prATerm}: {sbATerm, sbDontBreak, 81},
  66. {sbSB8aSp, prSTerm}: {sbSTerm, sbDontBreak, 81},
  67. // SB9.
  68. {sbATerm, prClose}: {sbSB8Close, sbDontBreak, 90},
  69. {sbSB7, prClose}: {sbSB8Close, sbDontBreak, 90},
  70. {sbSB8Close, prClose}: {sbSB8Close, sbDontBreak, 90},
  71. {sbATerm, prSp}: {sbSB8Sp, sbDontBreak, 90},
  72. {sbSB7, prSp}: {sbSB8Sp, sbDontBreak, 90},
  73. {sbSB8Close, prSp}: {sbSB8Sp, sbDontBreak, 90},
  74. {sbSTerm, prClose}: {sbSB8aClose, sbDontBreak, 90},
  75. {sbSB8aClose, prClose}: {sbSB8aClose, sbDontBreak, 90},
  76. {sbSTerm, prSp}: {sbSB8aSp, sbDontBreak, 90},
  77. {sbSB8aClose, prSp}: {sbSB8aSp, sbDontBreak, 90},
  78. {sbATerm, prSep}: {sbParaSep, sbDontBreak, 90},
  79. {sbATerm, prCR}: {sbParaSep, sbDontBreak, 90},
  80. {sbATerm, prLF}: {sbParaSep, sbDontBreak, 90},
  81. {sbSB7, prSep}: {sbParaSep, sbDontBreak, 90},
  82. {sbSB7, prCR}: {sbParaSep, sbDontBreak, 90},
  83. {sbSB7, prLF}: {sbParaSep, sbDontBreak, 90},
  84. {sbSB8Close, prSep}: {sbParaSep, sbDontBreak, 90},
  85. {sbSB8Close, prCR}: {sbParaSep, sbDontBreak, 90},
  86. {sbSB8Close, prLF}: {sbParaSep, sbDontBreak, 90},
  87. {sbSTerm, prSep}: {sbParaSep, sbDontBreak, 90},
  88. {sbSTerm, prCR}: {sbParaSep, sbDontBreak, 90},
  89. {sbSTerm, prLF}: {sbParaSep, sbDontBreak, 90},
  90. {sbSB8aClose, prSep}: {sbParaSep, sbDontBreak, 90},
  91. {sbSB8aClose, prCR}: {sbParaSep, sbDontBreak, 90},
  92. {sbSB8aClose, prLF}: {sbParaSep, sbDontBreak, 90},
  93. // SB10.
  94. {sbSB8Sp, prSp}: {sbSB8Sp, sbDontBreak, 100},
  95. {sbSB8aSp, prSp}: {sbSB8aSp, sbDontBreak, 100},
  96. {sbSB8Sp, prSep}: {sbParaSep, sbDontBreak, 100},
  97. {sbSB8Sp, prCR}: {sbParaSep, sbDontBreak, 100},
  98. {sbSB8Sp, prLF}: {sbParaSep, sbDontBreak, 100},
  99. // SB11.
  100. {sbATerm, prAny}: {sbAny, sbBreak, 110},
  101. {sbSB7, prAny}: {sbAny, sbBreak, 110},
  102. {sbSB8Close, prAny}: {sbAny, sbBreak, 110},
  103. {sbSB8Sp, prAny}: {sbAny, sbBreak, 110},
  104. {sbSTerm, prAny}: {sbAny, sbBreak, 110},
  105. {sbSB8aClose, prAny}: {sbAny, sbBreak, 110},
  106. {sbSB8aSp, prAny}: {sbAny, sbBreak, 110},
  107. // We'll always break after ParaSep due to SB4.
  108. }
  109. // transitionSentenceBreakState determines the new state of the sentence break
  110. // parser given the current state and the next code point. It also returns
  111. // whether a sentence boundary was detected. If more than one code point is
  112. // needed to determine the new state, the byte slice or the string starting
  113. // after rune "r" can be used (whichever is not nil or empty) for further
  114. // lookups.
  115. func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newState int, sentenceBreak bool) {
  116. // Determine the property of the next character.
  117. nextProperty := property(sentenceBreakCodePoints, r)
  118. // SB5 (Replacing Ignore Rules).
  119. if nextProperty == prExtend || nextProperty == prFormat {
  120. if state == sbParaSep || state == sbCR {
  121. return sbAny, true // Make sure we don't apply SB5 to SB3 or SB4.
  122. }
  123. if state < 0 {
  124. return sbAny, true // SB1.
  125. }
  126. return state, false
  127. }
  128. // Find the applicable transition in the table.
  129. var rule int
  130. transition, ok := sbTransitions[[2]int{state, nextProperty}]
  131. if ok {
  132. // We have a specific transition. We'll use it.
  133. newState, sentenceBreak, rule = transition[0], transition[1] == sbBreak, transition[2]
  134. } else {
  135. // No specific transition found. Try the less specific ones.
  136. transAnyProp, okAnyProp := sbTransitions[[2]int{state, prAny}]
  137. transAnyState, okAnyState := sbTransitions[[2]int{sbAny, nextProperty}]
  138. if okAnyProp && okAnyState {
  139. // Both apply. We'll use a mix (see comments for grTransitions).
  140. newState, sentenceBreak, rule = transAnyState[0], transAnyState[1] == sbBreak, transAnyState[2]
  141. if transAnyProp[2] < transAnyState[2] {
  142. sentenceBreak, rule = transAnyProp[1] == sbBreak, transAnyProp[2]
  143. }
  144. } else if okAnyProp {
  145. // We only have a specific state.
  146. newState, sentenceBreak, rule = transAnyProp[0], transAnyProp[1] == sbBreak, transAnyProp[2]
  147. // This branch will probably never be reached because okAnyState will
  148. // always be true given the current transition map. But we keep it here
  149. // for future modifications to the transition map where this may not be
  150. // true anymore.
  151. } else if okAnyState {
  152. // We only have a specific property.
  153. newState, sentenceBreak, rule = transAnyState[0], transAnyState[1] == sbBreak, transAnyState[2]
  154. } else {
  155. // No known transition. SB999: Any × Any.
  156. newState, sentenceBreak, rule = sbAny, false, 9990
  157. }
  158. }
  159. // SB8.
  160. if rule > 80 && (state == sbATerm || state == sbSB8Close || state == sbSB8Sp || state == sbSB7) {
  161. // Check the right side of the rule.
  162. var length int
  163. for nextProperty != prOLetter &&
  164. nextProperty != prUpper &&
  165. nextProperty != prLower &&
  166. nextProperty != prSep &&
  167. nextProperty != prCR &&
  168. nextProperty != prLF &&
  169. nextProperty != prATerm &&
  170. nextProperty != prSTerm {
  171. // Move on to the next rune.
  172. if b != nil { // Byte slice version.
  173. r, length = utf8.DecodeRune(b)
  174. b = b[length:]
  175. } else { // String version.
  176. r, length = utf8.DecodeRuneInString(str)
  177. str = str[length:]
  178. }
  179. if r == utf8.RuneError {
  180. break
  181. }
  182. nextProperty = property(sentenceBreakCodePoints, r)
  183. }
  184. if nextProperty == prLower {
  185. return sbLower, false
  186. }
  187. }
  188. return
  189. }