123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276 |
- package uniseg
- import "unicode/utf8"
// The states of the sentence break parser. A state summarizes the part of the
// text preceding the current position that is relevant to the sentence
// boundary rules. The values are consecutive, starting at 0 for sbAny.
const (
	sbAny       = iota // No relevant context; also the state after a boundary.
	sbCR               // After a CR; a directly following LF belongs to it (SB3).
	sbParaSep          // After a paragraph separator; a boundary follows (SB4).
	sbATerm            // After an ATerm.
	sbUpper            // After an Upper.
	sbLower            // After a Lower.
	sbSB7              // After (Upper | Lower) ATerm, the left side of SB7.
	sbSB8Close         // After ATerm followed by one or more Close.
	sbSB8Sp            // After ATerm Close* followed by one or more Sp.
	sbSTerm            // After an STerm.
	sbSB8aClose        // After STerm followed by one or more Close.
	sbSB8aSp           // After STerm Close* followed by one or more Sp.
)
- // sbTransitions implements the sentence break parser's state transitions. It's
- // anologous to [grTransitions], see comments there for details.
- //
- // Unicode version 15.0.0.
- func sbTransitions(state, prop int) (newState int, sentenceBreak bool, rule int) {
- switch uint64(state) | uint64(prop)<<32 {
- // SB3.
- case sbAny | prCR<<32:
- return sbCR, false, 9990
- case sbCR | prLF<<32:
- return sbParaSep, false, 30
- // SB4.
- case sbAny | prSep<<32:
- return sbParaSep, false, 9990
- case sbAny | prLF<<32:
- return sbParaSep, false, 9990
- case sbParaSep | prAny<<32:
- return sbAny, true, 40
- case sbCR | prAny<<32:
- return sbAny, true, 40
- // SB6.
- case sbAny | prATerm<<32:
- return sbATerm, false, 9990
- case sbATerm | prNumeric<<32:
- return sbAny, false, 60
- case sbSB7 | prNumeric<<32:
- return sbAny, false, 60 // Because ATerm also appears in SB7.
- // SB7.
- case sbAny | prUpper<<32:
- return sbUpper, false, 9990
- case sbAny | prLower<<32:
- return sbLower, false, 9990
- case sbUpper | prATerm<<32:
- return sbSB7, false, 70
- case sbLower | prATerm<<32:
- return sbSB7, false, 70
- case sbSB7 | prUpper<<32:
- return sbUpper, false, 70
- // SB8a.
- case sbAny | prSTerm<<32:
- return sbSTerm, false, 9990
- case sbATerm | prSContinue<<32:
- return sbAny, false, 81
- case sbATerm | prATerm<<32:
- return sbATerm, false, 81
- case sbATerm | prSTerm<<32:
- return sbSTerm, false, 81
- case sbSB7 | prSContinue<<32:
- return sbAny, false, 81
- case sbSB7 | prATerm<<32:
- return sbATerm, false, 81
- case sbSB7 | prSTerm<<32:
- return sbSTerm, false, 81
- case sbSB8Close | prSContinue<<32:
- return sbAny, false, 81
- case sbSB8Close | prATerm<<32:
- return sbATerm, false, 81
- case sbSB8Close | prSTerm<<32:
- return sbSTerm, false, 81
- case sbSB8Sp | prSContinue<<32:
- return sbAny, false, 81
- case sbSB8Sp | prATerm<<32:
- return sbATerm, false, 81
- case sbSB8Sp | prSTerm<<32:
- return sbSTerm, false, 81
- case sbSTerm | prSContinue<<32:
- return sbAny, false, 81
- case sbSTerm | prATerm<<32:
- return sbATerm, false, 81
- case sbSTerm | prSTerm<<32:
- return sbSTerm, false, 81
- case sbSB8aClose | prSContinue<<32:
- return sbAny, false, 81
- case sbSB8aClose | prATerm<<32:
- return sbATerm, false, 81
- case sbSB8aClose | prSTerm<<32:
- return sbSTerm, false, 81
- case sbSB8aSp | prSContinue<<32:
- return sbAny, false, 81
- case sbSB8aSp | prATerm<<32:
- return sbATerm, false, 81
- case sbSB8aSp | prSTerm<<32:
- return sbSTerm, false, 81
- // SB9.
- case sbATerm | prClose<<32:
- return sbSB8Close, false, 90
- case sbSB7 | prClose<<32:
- return sbSB8Close, false, 90
- case sbSB8Close | prClose<<32:
- return sbSB8Close, false, 90
- case sbATerm | prSp<<32:
- return sbSB8Sp, false, 90
- case sbSB7 | prSp<<32:
- return sbSB8Sp, false, 90
- case sbSB8Close | prSp<<32:
- return sbSB8Sp, false, 90
- case sbSTerm | prClose<<32:
- return sbSB8aClose, false, 90
- case sbSB8aClose | prClose<<32:
- return sbSB8aClose, false, 90
- case sbSTerm | prSp<<32:
- return sbSB8aSp, false, 90
- case sbSB8aClose | prSp<<32:
- return sbSB8aSp, false, 90
- case sbATerm | prSep<<32:
- return sbParaSep, false, 90
- case sbATerm | prCR<<32:
- return sbParaSep, false, 90
- case sbATerm | prLF<<32:
- return sbParaSep, false, 90
- case sbSB7 | prSep<<32:
- return sbParaSep, false, 90
- case sbSB7 | prCR<<32:
- return sbParaSep, false, 90
- case sbSB7 | prLF<<32:
- return sbParaSep, false, 90
- case sbSB8Close | prSep<<32:
- return sbParaSep, false, 90
- case sbSB8Close | prCR<<32:
- return sbParaSep, false, 90
- case sbSB8Close | prLF<<32:
- return sbParaSep, false, 90
- case sbSTerm | prSep<<32:
- return sbParaSep, false, 90
- case sbSTerm | prCR<<32:
- return sbParaSep, false, 90
- case sbSTerm | prLF<<32:
- return sbParaSep, false, 90
- case sbSB8aClose | prSep<<32:
- return sbParaSep, false, 90
- case sbSB8aClose | prCR<<32:
- return sbParaSep, false, 90
- case sbSB8aClose | prLF<<32:
- return sbParaSep, false, 90
- // SB10.
- case sbSB8Sp | prSp<<32:
- return sbSB8Sp, false, 100
- case sbSB8aSp | prSp<<32:
- return sbSB8aSp, false, 100
- case sbSB8Sp | prSep<<32:
- return sbParaSep, false, 100
- case sbSB8Sp | prCR<<32:
- return sbParaSep, false, 100
- case sbSB8Sp | prLF<<32:
- return sbParaSep, false, 100
- // SB11.
- case sbATerm | prAny<<32:
- return sbAny, true, 110
- case sbSB7 | prAny<<32:
- return sbAny, true, 110
- case sbSB8Close | prAny<<32:
- return sbAny, true, 110
- case sbSB8Sp | prAny<<32:
- return sbAny, true, 110
- case sbSTerm | prAny<<32:
- return sbAny, true, 110
- case sbSB8aClose | prAny<<32:
- return sbAny, true, 110
- case sbSB8aSp | prAny<<32:
- return sbAny, true, 110
- // We'll always break after ParaSep due to SB4.
- default:
- return -1, false, -1
- }
- }
- // transitionSentenceBreakState determines the new state of the sentence break
- // parser given the current state and the next code point. It also returns
- // whether a sentence boundary was detected. If more than one code point is
- // needed to determine the new state, the byte slice or the string starting
- // after rune "r" can be used (whichever is not nil or empty) for further
- // lookups.
- func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newState int, sentenceBreak bool) {
- // Determine the property of the next character.
- nextProperty := property(sentenceBreakCodePoints, r)
- // SB5 (Replacing Ignore Rules).
- if nextProperty == prExtend || nextProperty == prFormat {
- if state == sbParaSep || state == sbCR {
- return sbAny, true // Make sure we don't apply SB5 to SB3 or SB4.
- }
- if state < 0 {
- return sbAny, true // SB1.
- }
- return state, false
- }
- // Find the applicable transition in the table.
- var rule int
- newState, sentenceBreak, rule = sbTransitions(state, nextProperty)
- if newState < 0 {
- // No specific transition found. Try the less specific ones.
- anyPropState, anyPropProp, anyPropRule := sbTransitions(state, prAny)
- anyStateState, anyStateProp, anyStateRule := sbTransitions(sbAny, nextProperty)
- if anyPropState >= 0 && anyStateState >= 0 {
- // Both apply. We'll use a mix (see comments for grTransitions).
- newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
- if anyPropRule < anyStateRule {
- sentenceBreak, rule = anyPropProp, anyPropRule
- }
- } else if anyPropState >= 0 {
- // We only have a specific state.
- newState, sentenceBreak, rule = anyPropState, anyPropProp, anyPropRule
- // This branch will probably never be reached because okAnyState will
- // always be true given the current transition map. But we keep it here
- // for future modifications to the transition map where this may not be
- // true anymore.
- } else if anyStateState >= 0 {
- // We only have a specific property.
- newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
- } else {
- // No known transition. SB999: Any × Any.
- newState, sentenceBreak, rule = sbAny, false, 9990
- }
- }
- // SB8.
- if rule > 80 && (state == sbATerm || state == sbSB8Close || state == sbSB8Sp || state == sbSB7) {
- // Check the right side of the rule.
- var length int
- for nextProperty != prOLetter &&
- nextProperty != prUpper &&
- nextProperty != prLower &&
- nextProperty != prSep &&
- nextProperty != prCR &&
- nextProperty != prLF &&
- nextProperty != prATerm &&
- nextProperty != prSTerm {
- // Move on to the next rune.
- if b != nil { // Byte slice version.
- r, length = utf8.DecodeRune(b)
- b = b[length:]
- } else { // String version.
- r, length = utf8.DecodeRuneInString(str)
- str = str[length:]
- }
- if r == utf8.RuneError {
- break
- }
- nextProperty = property(sentenceBreakCodePoints, r)
- }
- if nextProperty == prLower {
- return sbLower, false
- }
- }
- return
- }
|