123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246 |
- package uniseg
- import "unicode/utf8"
// Word break parser states. Each value identifies what the parser has most
// recently seen; wbTransitions and transitionWordBreakState() decide how the
// next code point moves the parser from one state to another. All values stay
// below wbZWJBit so that the flag can be OR-ed onto any of them.
const (
	wbAny          = iota // Default state; also the wildcard state in wbTransitions keys.
	wbCR                  // After a carriage return.
	wbLF                  // After a line feed.
	wbNewline             // After any other Newline code point.
	wbWSegSpace           // After a WSegSpace code point.
	wbHebrewLetter        // After a Hebrew_Letter code point.
	wbALetter             // After an ALetter code point.
	wbWB7                 // Entered by WB6 in transitionWordBreakState(); WB7 applies next.
	wbWB7c                // Entered by WB7b in transitionWordBreakState(); WB7c applies next.
	wbNumeric             // After a Numeric code point.
	wbWB11                // Entered by WB12 in transitionWordBreakState(); WB11 applies next.
	wbKatakana            // After a Katakana code point.
	wbExtendNumLet        // After an ExtendNumLet code point.
	wbOddRI               // After an odd number of Regional Indicators.
	wbEvenRI              // After an even number of Regional Indicators.
)

// wbZWJBit is a flag OR-ed onto a state when that state has been followed by
// at least one zero-width joiner (see WB4 and WB3c).
const wbZWJBit = 1 << 4
// Breaking instructions stored in the second slot of every wbTransitions
// value. They tell the word break parser whether a boundary precedes the code
// point that triggered the transition.
const (
	wbDontBreak = iota // No word boundary before the code point.
	wbBreak            // A word boundary before the code point.
)
- // The word break parser's state transitions. It's anologous to grTransitions,
- // see comments there for details. Unicode version 14.0.0.
- var wbTransitions = map[[2]int][3]int{
- // WB3b.
- {wbAny, prNewline}: {wbNewline, wbBreak, 32},
- {wbAny, prCR}: {wbCR, wbBreak, 32},
- {wbAny, prLF}: {wbLF, wbBreak, 32},
- // WB3a.
- {wbNewline, prAny}: {wbAny, wbBreak, 31},
- {wbCR, prAny}: {wbAny, wbBreak, 31},
- {wbLF, prAny}: {wbAny, wbBreak, 31},
- // WB3.
- {wbCR, prLF}: {wbLF, wbDontBreak, 30},
- // WB3d.
- {wbAny, prWSegSpace}: {wbWSegSpace, wbBreak, 9990},
- {wbWSegSpace, prWSegSpace}: {wbWSegSpace, wbDontBreak, 34},
- // WB5.
- {wbAny, prALetter}: {wbALetter, wbBreak, 9990},
- {wbAny, prHebrewLetter}: {wbHebrewLetter, wbBreak, 9990},
- {wbALetter, prALetter}: {wbALetter, wbDontBreak, 50},
- {wbALetter, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 50},
- {wbHebrewLetter, prALetter}: {wbALetter, wbDontBreak, 50},
- {wbHebrewLetter, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 50},
- // WB7. Transitions to wbWB7 handled by transitionWordBreakState().
- {wbWB7, prALetter}: {wbALetter, wbDontBreak, 70},
- {wbWB7, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 70},
- // WB7a.
- {wbHebrewLetter, prSingleQuote}: {wbAny, wbDontBreak, 71},
- // WB7c. Transitions to wbWB7c handled by transitionWordBreakState().
- {wbWB7c, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 73},
- // WB8.
- {wbAny, prNumeric}: {wbNumeric, wbBreak, 9990},
- {wbNumeric, prNumeric}: {wbNumeric, wbDontBreak, 80},
- // WB9.
- {wbALetter, prNumeric}: {wbNumeric, wbDontBreak, 90},
- {wbHebrewLetter, prNumeric}: {wbNumeric, wbDontBreak, 90},
- // WB10.
- {wbNumeric, prALetter}: {wbALetter, wbDontBreak, 100},
- {wbNumeric, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 100},
- // WB11. Transitions to wbWB11 handled by transitionWordBreakState().
- {wbWB11, prNumeric}: {wbNumeric, wbDontBreak, 110},
- // WB13.
- {wbAny, prKatakana}: {wbKatakana, wbBreak, 9990},
- {wbKatakana, prKatakana}: {wbKatakana, wbDontBreak, 130},
- // WB13a.
- {wbAny, prExtendNumLet}: {wbExtendNumLet, wbBreak, 9990},
- {wbALetter, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
- {wbHebrewLetter, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
- {wbNumeric, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
- {wbKatakana, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
- {wbExtendNumLet, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
- // WB13b.
- {wbExtendNumLet, prALetter}: {wbALetter, wbDontBreak, 132},
- {wbExtendNumLet, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 132},
- {wbExtendNumLet, prNumeric}: {wbNumeric, wbDontBreak, 132},
- {wbExtendNumLet, prKatakana}: {prKatakana, wbDontBreak, 132},
- }
- // transitionWordBreakState determines the new state of the word break parser
- // given the current state and the next code point. It also returns whether a
- // word boundary was detected. If more than one code point is needed to
- // determine the new state, the byte slice or the string starting after rune "r"
- // can be used (whichever is not nil or empty) for further lookups.
- func transitionWordBreakState(state int, r rune, b []byte, str string) (newState int, wordBreak bool) {
- // Determine the property of the next character.
- nextProperty := property(workBreakCodePoints, r)
- // "Replacing Ignore Rules".
- if nextProperty == prZWJ {
- // WB4 (for zero-width joiners).
- if state == wbNewline || state == wbCR || state == wbLF {
- return wbAny | wbZWJBit, true // Make sure we don't apply WB4 to WB3a.
- }
- if state < 0 {
- return wbAny | wbZWJBit, false
- }
- return state | wbZWJBit, false
- } else if nextProperty == prExtend || nextProperty == prFormat {
- // WB4 (for Extend and Format).
- if state == wbNewline || state == wbCR || state == wbLF {
- return wbAny, true // Make sure we don't apply WB4 to WB3a.
- }
- if state == wbWSegSpace || state == wbAny|wbZWJBit {
- return wbAny, false // We don't break but this is also not WB3d or WB3c.
- }
- if state < 0 {
- return wbAny, false
- }
- return state, false
- } else if nextProperty == prExtendedPictographic && state >= 0 && state&wbZWJBit != 0 {
- // WB3c.
- return wbAny, false
- }
- if state >= 0 {
- state = state &^ wbZWJBit
- }
- // Find the applicable transition in the table.
- var rule int
- transition, ok := wbTransitions[[2]int{state, nextProperty}]
- if ok {
- // We have a specific transition. We'll use it.
- newState, wordBreak, rule = transition[0], transition[1] == wbBreak, transition[2]
- } else {
- // No specific transition found. Try the less specific ones.
- transAnyProp, okAnyProp := wbTransitions[[2]int{state, prAny}]
- transAnyState, okAnyState := wbTransitions[[2]int{wbAny, nextProperty}]
- if okAnyProp && okAnyState {
- // Both apply. We'll use a mix (see comments for grTransitions).
- newState, wordBreak, rule = transAnyState[0], transAnyState[1] == wbBreak, transAnyState[2]
- if transAnyProp[2] < transAnyState[2] {
- wordBreak, rule = transAnyProp[1] == wbBreak, transAnyProp[2]
- }
- } else if okAnyProp {
- // We only have a specific state.
- newState, wordBreak, rule = transAnyProp[0], transAnyProp[1] == wbBreak, transAnyProp[2]
- // This branch will probably never be reached because okAnyState will
- // always be true given the current transition map. But we keep it here
- // for future modifications to the transition map where this may not be
- // true anymore.
- } else if okAnyState {
- // We only have a specific property.
- newState, wordBreak, rule = transAnyState[0], transAnyState[1] == wbBreak, transAnyState[2]
- } else {
- // No known transition. WB999: Any ÷ Any.
- newState, wordBreak, rule = wbAny, true, 9990
- }
- }
- // For those rules that need to look up runes further in the string, we
- // determine the property after nextProperty, skipping over Format, Extend,
- // and ZWJ (according to WB4). It's -1 if not needed, if such a rune cannot
- // be determined (because the text ends or the rune is faulty).
- farProperty := -1
- if rule > 60 &&
- (state == wbALetter || state == wbHebrewLetter || state == wbNumeric) &&
- (nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote || // WB6.
- nextProperty == prDoubleQuote || // WB7b.
- nextProperty == prMidNum) { // WB12.
- for {
- var (
- r rune
- length int
- )
- if b != nil { // Byte slice version.
- r, length = utf8.DecodeRune(b)
- b = b[length:]
- } else { // String version.
- r, length = utf8.DecodeRuneInString(str)
- str = str[length:]
- }
- if r == utf8.RuneError {
- break
- }
- prop := property(workBreakCodePoints, r)
- if prop == prExtend || prop == prFormat || prop == prZWJ {
- continue
- }
- farProperty = prop
- break
- }
- }
- // WB6.
- if rule > 60 &&
- (state == wbALetter || state == wbHebrewLetter) &&
- (nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
- (farProperty == prALetter || farProperty == prHebrewLetter) {
- return wbWB7, false
- }
- // WB7b.
- if rule > 72 &&
- state == wbHebrewLetter &&
- nextProperty == prDoubleQuote &&
- farProperty == prHebrewLetter {
- return wbWB7c, false
- }
- // WB12.
- if rule > 120 &&
- state == wbNumeric &&
- (nextProperty == prMidNum || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
- farProperty == prNumeric {
- return wbWB11, false
- }
- // WB15 and WB16.
- if newState == wbAny && nextProperty == prRegionalIndicator {
- if state != wbOddRI && state != wbEvenRI { // Includes state == -1.
- // Transition into the first RI.
- return wbOddRI, true
- }
- if state == wbOddRI {
- // Don't break pairs of Regional Indicators.
- return wbEvenRI, false
- }
- return wbOddRI, true // We can break after a pair.
- }
- return
- }
|