123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138 |
- package uniseg
- // The states of the grapheme cluster parser.
- const (
- grAny = iota
- grCR
- grControlLF
- grL
- grLVV
- grLVTT
- grPrepend
- grExtendedPictographic
- grExtendedPictographicZWJ
- grRIOdd
- grRIEven
- )
- // The grapheme cluster parser's breaking instructions.
- const (
- grNoBoundary = iota
- grBoundary
- )
- // The grapheme cluster parser's state transitions. Maps (state, property) to
- // (new state, breaking instruction, rule number). The breaking instruction
- // always refers to the boundary between the last and next code point.
- //
- // This map is queried as follows:
- //
- // 1. Find specific state + specific property. Stop if found.
- // 2. Find specific state + any property.
- // 3. Find any state + specific property.
- // 4. If only (2) or (3) (but not both) was found, stop.
- // 5. If both (2) and (3) were found, use state from (3) and breaking instruction
- // from the transition with the lower rule number, prefer (3) if rule numbers
- // are equal. Stop.
- // 6. Assume grAny and grBoundary.
- //
- // Unicode version 14.0.0.
- var grTransitions = map[[2]int][3]int{
- // GB5
- {grAny, prCR}: {grCR, grBoundary, 50},
- {grAny, prLF}: {grControlLF, grBoundary, 50},
- {grAny, prControl}: {grControlLF, grBoundary, 50},
- // GB4
- {grCR, prAny}: {grAny, grBoundary, 40},
- {grControlLF, prAny}: {grAny, grBoundary, 40},
- // GB3.
- {grCR, prLF}: {grControlLF, grNoBoundary, 30},
- // GB6.
- {grAny, prL}: {grL, grBoundary, 9990},
- {grL, prL}: {grL, grNoBoundary, 60},
- {grL, prV}: {grLVV, grNoBoundary, 60},
- {grL, prLV}: {grLVV, grNoBoundary, 60},
- {grL, prLVT}: {grLVTT, grNoBoundary, 60},
- // GB7.
- {grAny, prLV}: {grLVV, grBoundary, 9990},
- {grAny, prV}: {grLVV, grBoundary, 9990},
- {grLVV, prV}: {grLVV, grNoBoundary, 70},
- {grLVV, prT}: {grLVTT, grNoBoundary, 70},
- // GB8.
- {grAny, prLVT}: {grLVTT, grBoundary, 9990},
- {grAny, prT}: {grLVTT, grBoundary, 9990},
- {grLVTT, prT}: {grLVTT, grNoBoundary, 80},
- // GB9.
- {grAny, prExtend}: {grAny, grNoBoundary, 90},
- {grAny, prZWJ}: {grAny, grNoBoundary, 90},
- // GB9a.
- {grAny, prSpacingMark}: {grAny, grNoBoundary, 91},
- // GB9b.
- {grAny, prPrepend}: {grPrepend, grBoundary, 9990},
- {grPrepend, prAny}: {grAny, grNoBoundary, 92},
- // GB11.
- {grAny, prExtendedPictographic}: {grExtendedPictographic, grBoundary, 9990},
- {grExtendedPictographic, prExtend}: {grExtendedPictographic, grNoBoundary, 110},
- {grExtendedPictographic, prZWJ}: {grExtendedPictographicZWJ, grNoBoundary, 110},
- {grExtendedPictographicZWJ, prExtendedPictographic}: {grExtendedPictographic, grNoBoundary, 110},
- // GB12 / GB13.
- {grAny, prRegionalIndicator}: {grRIOdd, grBoundary, 9990},
- {grRIOdd, prRegionalIndicator}: {grRIEven, grNoBoundary, 120},
- {grRIEven, prRegionalIndicator}: {grRIOdd, grBoundary, 120},
- }
- // transitionGraphemeState determines the new state of the grapheme cluster
- // parser given the current state and the next code point. It also returns the
- // code point's grapheme property (the value mapped by the [graphemeCodePoints]
- // table) and whether a cluster boundary was detected.
- func transitionGraphemeState(state int, r rune) (newState, prop int, boundary bool) {
- // Determine the property of the next character.
- prop = property(graphemeCodePoints, r)
- // Find the applicable transition.
- transition, ok := grTransitions[[2]int{state, prop}]
- if ok {
- // We have a specific transition. We'll use it.
- return transition[0], prop, transition[1] == grBoundary
- }
- // No specific transition found. Try the less specific ones.
- transAnyProp, okAnyProp := grTransitions[[2]int{state, prAny}]
- transAnyState, okAnyState := grTransitions[[2]int{grAny, prop}]
- if okAnyProp && okAnyState {
- // Both apply. We'll use a mix (see comments for grTransitions).
- newState = transAnyState[0]
- boundary = transAnyState[1] == grBoundary
- if transAnyProp[2] < transAnyState[2] {
- boundary = transAnyProp[1] == grBoundary
- }
- return
- }
- if okAnyProp {
- // We only have a specific state.
- return transAnyProp[0], prop, transAnyProp[1] == grBoundary
- // This branch will probably never be reached because okAnyState will
- // always be true given the current transition map. But we keep it here
- // for future modifications to the transition map where this may not be
- // true anymore.
- }
- if okAnyState {
- // We only have a specific property.
- return transAnyState[0], prop, transAnyState[1] == grBoundary
- }
- // No known transition. GB999: Any ÷ Any.
- return grAny, prop, true
- }
|