123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205 |
- package uniseg
- import "unicode/utf8"
- // The states of the sentence break parser.
- const (
- sbAny = iota // Default state; also the wildcard state used for fallback lookups in sbTransitions.
- sbCR // After CR; a directly following LF is kept with it (SB3).
- sbParaSep // After a paragraph separator; a break follows (SB4).
- sbATerm // After ATerm.
- sbUpper // After Upper.
- sbLower // After Lower.
- sbSB7 // After (Upper | Lower) ATerm, i.e. the left side of SB7.
- sbSB8Close // After ATerm Close+.
- sbSB8Sp // After ATerm Close* Sp+.
- sbSTerm // After STerm.
- sbSB8aClose // After STerm Close+.
- sbSB8aSp // After STerm Close* Sp+.
- )
- // The sentence break parser's breaking instructions.
- const (
- sbDontBreak = iota // No sentence boundary before the next code point.
- sbBreak // A sentence boundary lies before the next code point.
- )
- // The sentence break parser's state transitions. It's anologous to
- // grTransitions, see comments there for details. Unicode version 14.0.0.
- var sbTransitions = map[[2]int][3]int{
- // SB3.
- {sbAny, prCR}: {sbCR, sbDontBreak, 9990},
- {sbCR, prLF}: {sbParaSep, sbDontBreak, 30},
- // SB4.
- {sbAny, prSep}: {sbParaSep, sbDontBreak, 9990},
- {sbAny, prLF}: {sbParaSep, sbDontBreak, 9990},
- {sbParaSep, prAny}: {sbAny, sbBreak, 40},
- {sbCR, prAny}: {sbAny, sbBreak, 40},
- // SB6.
- {sbAny, prATerm}: {sbATerm, sbDontBreak, 9990},
- {sbATerm, prNumeric}: {sbAny, sbDontBreak, 60},
- {sbSB7, prNumeric}: {sbAny, sbDontBreak, 60}, // Because ATerm also appears in SB7.
- // SB7.
- {sbAny, prUpper}: {sbUpper, sbDontBreak, 9990},
- {sbAny, prLower}: {sbLower, sbDontBreak, 9990},
- {sbUpper, prATerm}: {sbSB7, sbDontBreak, 70},
- {sbLower, prATerm}: {sbSB7, sbDontBreak, 70},
- {sbSB7, prUpper}: {sbUpper, sbDontBreak, 70},
- // SB8a.
- {sbAny, prSTerm}: {sbSTerm, sbDontBreak, 9990},
- {sbATerm, prSContinue}: {sbAny, sbDontBreak, 81},
- {sbATerm, prATerm}: {sbATerm, sbDontBreak, 81},
- {sbATerm, prSTerm}: {sbSTerm, sbDontBreak, 81},
- {sbSB7, prSContinue}: {sbAny, sbDontBreak, 81},
- {sbSB7, prATerm}: {sbATerm, sbDontBreak, 81},
- {sbSB7, prSTerm}: {sbSTerm, sbDontBreak, 81},
- {sbSB8Close, prSContinue}: {sbAny, sbDontBreak, 81},
- {sbSB8Close, prATerm}: {sbATerm, sbDontBreak, 81},
- {sbSB8Close, prSTerm}: {sbSTerm, sbDontBreak, 81},
- {sbSB8Sp, prSContinue}: {sbAny, sbDontBreak, 81},
- {sbSB8Sp, prATerm}: {sbATerm, sbDontBreak, 81},
- {sbSB8Sp, prSTerm}: {sbSTerm, sbDontBreak, 81},
- {sbSTerm, prSContinue}: {sbAny, sbDontBreak, 81},
- {sbSTerm, prATerm}: {sbATerm, sbDontBreak, 81},
- {sbSTerm, prSTerm}: {sbSTerm, sbDontBreak, 81},
- {sbSB8aClose, prSContinue}: {sbAny, sbDontBreak, 81},
- {sbSB8aClose, prATerm}: {sbATerm, sbDontBreak, 81},
- {sbSB8aClose, prSTerm}: {sbSTerm, sbDontBreak, 81},
- {sbSB8aSp, prSContinue}: {sbAny, sbDontBreak, 81},
- {sbSB8aSp, prATerm}: {sbATerm, sbDontBreak, 81},
- {sbSB8aSp, prSTerm}: {sbSTerm, sbDontBreak, 81},
- // SB9.
- {sbATerm, prClose}: {sbSB8Close, sbDontBreak, 90},
- {sbSB7, prClose}: {sbSB8Close, sbDontBreak, 90},
- {sbSB8Close, prClose}: {sbSB8Close, sbDontBreak, 90},
- {sbATerm, prSp}: {sbSB8Sp, sbDontBreak, 90},
- {sbSB7, prSp}: {sbSB8Sp, sbDontBreak, 90},
- {sbSB8Close, prSp}: {sbSB8Sp, sbDontBreak, 90},
- {sbSTerm, prClose}: {sbSB8aClose, sbDontBreak, 90},
- {sbSB8aClose, prClose}: {sbSB8aClose, sbDontBreak, 90},
- {sbSTerm, prSp}: {sbSB8aSp, sbDontBreak, 90},
- {sbSB8aClose, prSp}: {sbSB8aSp, sbDontBreak, 90},
- {sbATerm, prSep}: {sbParaSep, sbDontBreak, 90},
- {sbATerm, prCR}: {sbParaSep, sbDontBreak, 90},
- {sbATerm, prLF}: {sbParaSep, sbDontBreak, 90},
- {sbSB7, prSep}: {sbParaSep, sbDontBreak, 90},
- {sbSB7, prCR}: {sbParaSep, sbDontBreak, 90},
- {sbSB7, prLF}: {sbParaSep, sbDontBreak, 90},
- {sbSB8Close, prSep}: {sbParaSep, sbDontBreak, 90},
- {sbSB8Close, prCR}: {sbParaSep, sbDontBreak, 90},
- {sbSB8Close, prLF}: {sbParaSep, sbDontBreak, 90},
- {sbSTerm, prSep}: {sbParaSep, sbDontBreak, 90},
- {sbSTerm, prCR}: {sbParaSep, sbDontBreak, 90},
- {sbSTerm, prLF}: {sbParaSep, sbDontBreak, 90},
- {sbSB8aClose, prSep}: {sbParaSep, sbDontBreak, 90},
- {sbSB8aClose, prCR}: {sbParaSep, sbDontBreak, 90},
- {sbSB8aClose, prLF}: {sbParaSep, sbDontBreak, 90},
- // SB10.
- {sbSB8Sp, prSp}: {sbSB8Sp, sbDontBreak, 100},
- {sbSB8aSp, prSp}: {sbSB8aSp, sbDontBreak, 100},
- {sbSB8Sp, prSep}: {sbParaSep, sbDontBreak, 100},
- {sbSB8Sp, prCR}: {sbParaSep, sbDontBreak, 100},
- {sbSB8Sp, prLF}: {sbParaSep, sbDontBreak, 100},
- // SB11.
- {sbATerm, prAny}: {sbAny, sbBreak, 110},
- {sbSB7, prAny}: {sbAny, sbBreak, 110},
- {sbSB8Close, prAny}: {sbAny, sbBreak, 110},
- {sbSB8Sp, prAny}: {sbAny, sbBreak, 110},
- {sbSTerm, prAny}: {sbAny, sbBreak, 110},
- {sbSB8aClose, prAny}: {sbAny, sbBreak, 110},
- {sbSB8aSp, prAny}: {sbAny, sbBreak, 110},
- // We'll always break after ParaSep due to SB4.
- }
- // transitionSentenceBreakState determines the new state of the sentence break
- // parser given the current state and the next code point. It also returns
- // whether a sentence boundary was detected. If more than one code point is
- // needed to determine the new state, the byte slice or the string starting
- // after rune "r" can be used (whichever is not nil or empty) for further
- // lookups.
- func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newState int, sentenceBreak bool) {
- // Determine the property of the next character.
- nextProperty := property(sentenceBreakCodePoints, r)
- // SB5 (Replacing Ignore Rules).
- if nextProperty == prExtend || nextProperty == prFormat {
- if state == sbParaSep || state == sbCR {
- return sbAny, true // Make sure we don't apply SB5 to SB3 or SB4.
- }
- if state < 0 {
- return sbAny, true // SB1.
- }
- return state, false
- }
- // Find the applicable transition in the table.
- var rule int
- transition, ok := sbTransitions[[2]int{state, nextProperty}]
- if ok {
- // We have a specific transition. We'll use it.
- newState, sentenceBreak, rule = transition[0], transition[1] == sbBreak, transition[2]
- } else {
- // No specific transition found. Try the less specific ones.
- transAnyProp, okAnyProp := sbTransitions[[2]int{state, prAny}]
- transAnyState, okAnyState := sbTransitions[[2]int{sbAny, nextProperty}]
- if okAnyProp && okAnyState {
- // Both apply. We'll use a mix (see comments for grTransitions).
- newState, sentenceBreak, rule = transAnyState[0], transAnyState[1] == sbBreak, transAnyState[2]
- if transAnyProp[2] < transAnyState[2] {
- sentenceBreak, rule = transAnyProp[1] == sbBreak, transAnyProp[2]
- }
- } else if okAnyProp {
- // We only have a specific state.
- newState, sentenceBreak, rule = transAnyProp[0], transAnyProp[1] == sbBreak, transAnyProp[2]
- // This branch will probably never be reached because okAnyState will
- // always be true given the current transition map. But we keep it here
- // for future modifications to the transition map where this may not be
- // true anymore.
- } else if okAnyState {
- // We only have a specific property.
- newState, sentenceBreak, rule = transAnyState[0], transAnyState[1] == sbBreak, transAnyState[2]
- } else {
- // No known transition. SB999: Any × Any.
- newState, sentenceBreak, rule = sbAny, false, 9990
- }
- }
- // SB8.
- if rule > 80 && (state == sbATerm || state == sbSB8Close || state == sbSB8Sp || state == sbSB7) {
- // Check the right side of the rule.
- var length int
- for nextProperty != prOLetter &&
- nextProperty != prUpper &&
- nextProperty != prLower &&
- nextProperty != prSep &&
- nextProperty != prCR &&
- nextProperty != prLF &&
- nextProperty != prATerm &&
- nextProperty != prSTerm {
- // Move on to the next rune.
- if b != nil { // Byte slice version.
- r, length = utf8.DecodeRune(b)
- b = b[length:]
- } else { // String version.
- r, length = utf8.DecodeRuneInString(str)
- str = str[length:]
- }
- if r == utf8.RuneError {
- break
- }
- nextProperty = property(sentenceBreakCodePoints, r)
- }
- if nextProperty == prLower {
- return sbLower, false
- }
- }
- return
- }
|