123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334 |
- package uniseg
- import "unicode/utf8"
// Graphemes is an iterator over the grapheme clusters (user-perceived
// characters) of a string. For the cluster it currently points at, it also
// reports word boundaries, sentence boundaries, line break opportunities, and
// the monospace character width.
//
// Create an iterator for a string with [NewGraphemes], then call
// [Graphemes.Next] in a loop until it returns false. Inside the loop, the
// accessor methods describe the current grapheme cluster.
//
// The iterator is convenient but much slower than this package's [Step] and
// [StepString] functions or the specialized functions whose names start with
// "First".
type Graphemes struct {
	// The string the iterator was created for, and the part of it that has not
	// been parsed yet.
	original, remaining string

	// The grapheme cluster the iterator currently points at. It is empty before
	// the first call to Next and after the iterator has run past the end.
	cluster string

	// The byte offset of the current grapheme cluster, counted from the start
	// of the original string.
	offset int

	// The boundary information and the parser state most recently returned by
	// the [Step] parser. The state is -1 as long as Next has not been called
	// and -2 once Next has run out of input.
	boundaries, state int
}
- // NewGraphemes returns a new grapheme cluster iterator.
- func NewGraphemes(str string) *Graphemes {
- return &Graphemes{
- original: str,
- remaining: str,
- state: -1,
- }
- }
- // Next advances the iterator by one grapheme cluster and returns false if no
- // clusters are left. This function must be called before the first cluster is
- // accessed.
- func (g *Graphemes) Next() bool {
- if len(g.remaining) == 0 {
- // We're already past the end.
- g.state = -2
- g.cluster = ""
- return false
- }
- g.offset += len(g.cluster)
- g.cluster, g.remaining, g.boundaries, g.state = StepString(g.remaining, g.state)
- return true
- }
- // Runes returns a slice of runes (code points) which corresponds to the current
- // grapheme cluster. If the iterator is already past the end or [Graphemes.Next]
- // has not yet been called, nil is returned.
- func (g *Graphemes) Runes() []rune {
- if g.state < 0 {
- return nil
- }
- return []rune(g.cluster)
- }
- // Str returns a substring of the original string which corresponds to the
- // current grapheme cluster. If the iterator is already past the end or
- // [Graphemes.Next] has not yet been called, an empty string is returned.
- func (g *Graphemes) Str() string {
- return g.cluster
- }
- // Bytes returns a byte slice which corresponds to the current grapheme cluster.
- // If the iterator is already past the end or [Graphemes.Next] has not yet been
- // called, nil is returned.
- func (g *Graphemes) Bytes() []byte {
- if g.state < 0 {
- return nil
- }
- return []byte(g.cluster)
- }
- // Positions returns the interval of the current grapheme cluster as byte
- // positions into the original string. The first returned value "from" indexes
- // the first byte and the second returned value "to" indexes the first byte that
- // is not included anymore, i.e. str[from:to] is the current grapheme cluster of
- // the original string "str". If [Graphemes.Next] has not yet been called, both
- // values are 0. If the iterator is already past the end, both values are 1.
- func (g *Graphemes) Positions() (int, int) {
- if g.state == -1 {
- return 0, 0
- } else if g.state == -2 {
- return 1, 1
- }
- return g.offset, g.offset + len(g.cluster)
- }
- // IsWordBoundary returns true if a word ends after the current grapheme
- // cluster.
- func (g *Graphemes) IsWordBoundary() bool {
- if g.state < 0 {
- return true
- }
- return g.boundaries&MaskWord != 0
- }
- // IsSentenceBoundary returns true if a sentence ends after the current
- // grapheme cluster.
- func (g *Graphemes) IsSentenceBoundary() bool {
- if g.state < 0 {
- return true
- }
- return g.boundaries&MaskSentence != 0
- }
- // LineBreak returns whether the line can be broken after the current grapheme
- // cluster. A value of [LineDontBreak] means the line may not be broken, a value
- // of [LineMustBreak] means the line must be broken, and a value of
- // [LineCanBreak] means the line may or may not be broken.
- func (g *Graphemes) LineBreak() int {
- if g.state == -1 {
- return LineDontBreak
- }
- if g.state == -2 {
- return LineMustBreak
- }
- return g.boundaries & MaskLine
- }
- // Width returns the monospace width of the current grapheme cluster.
- func (g *Graphemes) Width() int {
- if g.state < 0 {
- return 0
- }
- return g.boundaries >> ShiftWidth
- }
- // Reset puts the iterator into its initial state such that the next call to
- // [Graphemes.Next] sets it to the first grapheme cluster again.
- func (g *Graphemes) Reset() {
- g.state = -1
- g.offset = 0
- g.cluster = ""
- g.remaining = g.original
- }
- // GraphemeClusterCount returns the number of user-perceived characters
- // (grapheme clusters) for the given string.
- func GraphemeClusterCount(s string) (n int) {
- state := -1
- for len(s) > 0 {
- _, s, _, state = FirstGraphemeClusterInString(s, state)
- n++
- }
- return
- }
- // ReverseString reverses the given string while observing grapheme cluster
- // boundaries.
- func ReverseString(s string) string {
- str := []byte(s)
- reversed := make([]byte, len(str))
- state := -1
- index := len(str)
- for len(str) > 0 {
- var cluster []byte
- cluster, str, _, state = FirstGraphemeCluster(str, state)
- index -= len(cluster)
- copy(reversed[index:], cluster)
- if index <= len(str)/2 {
- break
- }
- }
- return string(reversed)
- }
- // The number of bits by which a grapheme property is shifted to the left to
- // make room for the grapheme parser state in the low bits of a state value.
- const shiftGraphemePropState = 4
- // FirstGraphemeCluster returns the first grapheme cluster found in the given
- // byte slice according to the rules of [Unicode Standard Annex #29, Grapheme
- // Cluster Boundaries]. This function can be called continuously to extract all
- // grapheme clusters from a byte slice, as illustrated in the example below.
- //
- // If you don't know the current state, for example when calling the function
- // for the first time, you must pass -1. For consecutive calls, pass the state
- // and rest slice returned by the previous call.
- //
- // The "rest" slice is the sub-slice of the original byte slice "b" starting
- // after the last byte of the identified grapheme cluster. If the length of the
- // "rest" slice is 0, the entire byte slice "b" has been processed. The
- // "cluster" byte slice is the sub-slice of the input slice containing the
- // identified grapheme cluster.
- //
- // The returned width is the width of the grapheme cluster for most monospace
- // fonts where a value of 1 represents one character cell.
- //
- // Given an empty byte slice "b", the function returns nil values.
- //
- // While slightly less convenient than using the Graphemes class, this function
- // has much better performance and makes no allocations. It lends itself well to
- // large byte slices.
- //
- // [Unicode Standard Annex #29, Grapheme Cluster Boundaries]: http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
- func FirstGraphemeCluster(b []byte, state int) (cluster, rest []byte, width, newState int) {
- // An empty byte slice returns nothing.
- if len(b) == 0 {
- return
- }
- // Extract the first rune.
- r, length := utf8.DecodeRune(b)
- if len(b) <= length { // If we're already past the end, there is nothing else to parse.
- var prop int
- if state < 0 {
- prop = property(graphemeCodePoints, r)
- } else {
- prop = state >> shiftGraphemePropState
- }
- return b, nil, runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
- }
- // If we don't know the state, determine it now.
- var firstProp int
- if state < 0 {
- state, firstProp, _ = transitionGraphemeState(state, r)
- } else {
- firstProp = state >> shiftGraphemePropState
- }
- width += runeWidth(r, firstProp)
- // Transition until we find a boundary.
- for {
- var (
- prop int
- boundary bool
- )
- r, l := utf8.DecodeRune(b[length:])
- state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
- if boundary {
- return b[:length], b[length:], width, state | (prop << shiftGraphemePropState)
- }
- if r == vs16 {
- width = 2
- } else if firstProp != prExtendedPictographic && firstProp != prRegionalIndicator && firstProp != prL {
- width += runeWidth(r, prop)
- } else if firstProp == prExtendedPictographic {
- if r == vs15 {
- width = 1
- } else {
- width = 2
- }
- }
- length += l
- if len(b) <= length {
- return b, nil, width, grAny | (prop << shiftGraphemePropState)
- }
- }
- }
- // FirstGraphemeClusterInString is like [FirstGraphemeCluster] but its input and
- // outputs are strings.
- func FirstGraphemeClusterInString(str string, state int) (cluster, rest string, width, newState int) {
- // An empty string returns nothing.
- if len(str) == 0 {
- return
- }
- // Extract the first rune.
- r, length := utf8.DecodeRuneInString(str)
- if len(str) <= length { // If we're already past the end, there is nothing else to parse.
- var prop int
- if state < 0 {
- prop = property(graphemeCodePoints, r)
- } else {
- prop = state >> shiftGraphemePropState
- }
- return str, "", runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
- }
- // If we don't know the state, determine it now.
- var firstProp int
- if state < 0 {
- state, firstProp, _ = transitionGraphemeState(state, r)
- } else {
- firstProp = state >> shiftGraphemePropState
- }
- width += runeWidth(r, firstProp)
- // Transition until we find a boundary.
- for {
- var (
- prop int
- boundary bool
- )
- r, l := utf8.DecodeRuneInString(str[length:])
- state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
- if boundary {
- return str[:length], str[length:], width, state | (prop << shiftGraphemePropState)
- }
- if r == vs16 {
- width = 2
- } else if firstProp != prExtendedPictographic && firstProp != prRegionalIndicator && firstProp != prL {
- width += runeWidth(r, prop)
- } else if firstProp == prExtendedPictographic {
- if r == vs15 {
- width = 1
- } else {
- width = 2
- }
- }
- length += l
- if len(str) <= length {
- return str, "", width, grAny | (prop << shiftGraphemePropState)
- }
- }
- }
|