grapheme.go 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331
  1. package uniseg
  2. import "unicode/utf8"
  3. // Graphemes implements an iterator over Unicode grapheme clusters, or
  4. // user-perceived characters. While iterating, it also provides information
  5. // about word boundaries, sentence boundaries, line breaks, and monospace
  6. // character widths.
  7. //
  8. // After constructing the class via [NewGraphemes] for a given string "str",
  9. // [Graphemes.Next] is called for every grapheme cluster in a loop until it
  10. // returns false. Inside the loop, information about the grapheme cluster as
  11. // well as boundary information and character width is available via the various
  12. // methods (see examples below).
  13. //
  14. // This class basically wraps the [StepString] parser and provides a convenient
  15. // interface to it. If you are only interested in some parts of this package's
  16. // functionality, using the specialized functions starting with "First" is
  17. // almost always faster.
  18. type Graphemes struct {
  19. // The original string.
  20. original string
  21. // The remaining string to be parsed.
  22. remaining string
  23. // The current grapheme cluster.
  24. cluster string
  25. // The byte offset of the current grapheme cluster relative to the original
  26. // string.
  27. offset int
  28. // The current boundary information of the [Step] parser.
  29. boundaries int
  30. // The current state of the [Step] parser.
  31. state int
  32. }
  33. // NewGraphemes returns a new grapheme cluster iterator.
  34. func NewGraphemes(str string) *Graphemes {
  35. return &Graphemes{
  36. original: str,
  37. remaining: str,
  38. state: -1,
  39. }
  40. }
  41. // Next advances the iterator by one grapheme cluster and returns false if no
  42. // clusters are left. This function must be called before the first cluster is
  43. // accessed.
  44. func (g *Graphemes) Next() bool {
  45. if len(g.remaining) == 0 {
  46. // We're already past the end.
  47. g.state = -2
  48. g.cluster = ""
  49. return false
  50. }
  51. g.offset += len(g.cluster)
  52. g.cluster, g.remaining, g.boundaries, g.state = StepString(g.remaining, g.state)
  53. return true
  54. }
  55. // Runes returns a slice of runes (code points) which corresponds to the current
  56. // grapheme cluster. If the iterator is already past the end or [Graphemes.Next]
  57. // has not yet been called, nil is returned.
  58. func (g *Graphemes) Runes() []rune {
  59. if g.state < 0 {
  60. return nil
  61. }
  62. return []rune(g.cluster)
  63. }
  64. // Str returns a substring of the original string which corresponds to the
  65. // current grapheme cluster. If the iterator is already past the end or
  66. // [Graphemes.Next] has not yet been called, an empty string is returned.
  67. func (g *Graphemes) Str() string {
  68. return g.cluster
  69. }
  70. // Bytes returns a byte slice which corresponds to the current grapheme cluster.
  71. // If the iterator is already past the end or [Graphemes.Next] has not yet been
  72. // called, nil is returned.
  73. func (g *Graphemes) Bytes() []byte {
  74. if g.state < 0 {
  75. return nil
  76. }
  77. return []byte(g.cluster)
  78. }
  79. // Positions returns the interval of the current grapheme cluster as byte
  80. // positions into the original string. The first returned value "from" indexes
  81. // the first byte and the second returned value "to" indexes the first byte that
  82. // is not included anymore, i.e. str[from:to] is the current grapheme cluster of
  83. // the original string "str". If [Graphemes.Next] has not yet been called, both
  84. // values are 0. If the iterator is already past the end, both values are 1.
  85. func (g *Graphemes) Positions() (int, int) {
  86. if g.state == -1 {
  87. return 0, 0
  88. } else if g.state == -2 {
  89. return 1, 1
  90. }
  91. return g.offset, g.offset + len(g.cluster)
  92. }
  93. // IsWordBoundary returns true if a word ends after the current grapheme
  94. // cluster.
  95. func (g *Graphemes) IsWordBoundary() bool {
  96. if g.state < 0 {
  97. return true
  98. }
  99. return g.boundaries&MaskWord != 0
  100. }
  101. // IsSentenceBoundary returns true if a sentence ends after the current
  102. // grapheme cluster.
  103. func (g *Graphemes) IsSentenceBoundary() bool {
  104. if g.state < 0 {
  105. return true
  106. }
  107. return g.boundaries&MaskSentence != 0
  108. }
  109. // LineBreak returns whether the line can be broken after the current grapheme
  110. // cluster. A value of [LineDontBreak] means the line may not be broken, a value
  111. // of [LineMustBreak] means the line must be broken, and a value of
  112. // [LineCanBreak] means the line may or may not be broken.
  113. func (g *Graphemes) LineBreak() int {
  114. if g.state == -1 {
  115. return LineDontBreak
  116. }
  117. if g.state == -2 {
  118. return LineMustBreak
  119. }
  120. return g.boundaries & MaskLine
  121. }
  122. // Width returns the monospace width of the current grapheme cluster.
  123. func (g *Graphemes) Width() int {
  124. if g.state < 0 {
  125. return 0
  126. }
  127. return g.boundaries >> ShiftWidth
  128. }
  129. // Reset puts the iterator into its initial state such that the next call to
  130. // [Graphemes.Next] sets it to the first grapheme cluster again.
  131. func (g *Graphemes) Reset() {
  132. g.state = -1
  133. g.offset = 0
  134. g.cluster = ""
  135. g.remaining = g.original
  136. }
  137. // GraphemeClusterCount returns the number of user-perceived characters
  138. // (grapheme clusters) for the given string.
  139. func GraphemeClusterCount(s string) (n int) {
  140. state := -1
  141. for len(s) > 0 {
  142. _, s, _, state = FirstGraphemeClusterInString(s, state)
  143. n++
  144. }
  145. return
  146. }
  147. // ReverseString reverses the given string while observing grapheme cluster
  148. // boundaries.
  149. func ReverseString(s string) string {
  150. str := []byte(s)
  151. reversed := make([]byte, len(str))
  152. state := -1
  153. index := len(str)
  154. for len(str) > 0 {
  155. var cluster []byte
  156. cluster, str, _, state = FirstGraphemeCluster(str, state)
  157. index -= len(cluster)
  158. copy(reversed[index:], cluster)
  159. if index <= len(str)/2 {
  160. break
  161. }
  162. }
  163. return string(reversed)
  164. }
  165. // The number of bits the grapheme property must be shifted to make place for
  166. // grapheme states.
  167. const shiftGraphemePropState = 4
  168. // FirstGraphemeCluster returns the first grapheme cluster found in the given
  169. // byte slice according to the rules of [Unicode Standard Annex #29, Grapheme
  170. // Cluster Boundaries]. This function can be called continuously to extract all
  171. // grapheme clusters from a byte slice, as illustrated in the example below.
  172. //
  173. // If you don't know the current state, for example when calling the function
  174. // for the first time, you must pass -1. For consecutive calls, pass the state
  175. // and rest slice returned by the previous call.
  176. //
  177. // The "rest" slice is the sub-slice of the original byte slice "b" starting
  178. // after the last byte of the identified grapheme cluster. If the length of the
  179. // "rest" slice is 0, the entire byte slice "b" has been processed. The
  180. // "cluster" byte slice is the sub-slice of the input slice containing the
  181. // identified grapheme cluster.
  182. //
  183. // The returned width is the width of the grapheme cluster for most monospace
  184. // fonts where a value of 1 represents one character cell.
  185. //
  186. // Given an empty byte slice "b", the function returns nil values.
  187. //
  188. // While slightly less convenient than using the Graphemes class, this function
  189. // has much better performance and makes no allocations. It lends itself well to
  190. // large byte slices.
  191. //
  192. // [Unicode Standard Annex #29, Grapheme Cluster Boundaries]: http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
  193. func FirstGraphemeCluster(b []byte, state int) (cluster, rest []byte, width, newState int) {
  194. // An empty byte slice returns nothing.
  195. if len(b) == 0 {
  196. return
  197. }
  198. // Extract the first rune.
  199. r, length := utf8.DecodeRune(b)
  200. if len(b) <= length { // If we're already past the end, there is nothing else to parse.
  201. var prop int
  202. if state < 0 {
  203. prop = propertyGraphemes(r)
  204. } else {
  205. prop = state >> shiftGraphemePropState
  206. }
  207. return b, nil, runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
  208. }
  209. // If we don't know the state, determine it now.
  210. var firstProp int
  211. if state < 0 {
  212. state, firstProp, _ = transitionGraphemeState(state, r)
  213. } else {
  214. firstProp = state >> shiftGraphemePropState
  215. }
  216. width += runeWidth(r, firstProp)
  217. // Transition until we find a boundary.
  218. for {
  219. var (
  220. prop int
  221. boundary bool
  222. )
  223. r, l := utf8.DecodeRune(b[length:])
  224. state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
  225. if boundary {
  226. return b[:length], b[length:], width, state | (prop << shiftGraphemePropState)
  227. }
  228. if firstProp == prExtendedPictographic {
  229. if r == vs15 {
  230. width = 1
  231. } else if r == vs16 {
  232. width = 2
  233. }
  234. } else if firstProp != prRegionalIndicator && firstProp != prL {
  235. width += runeWidth(r, prop)
  236. }
  237. length += l
  238. if len(b) <= length {
  239. return b, nil, width, grAny | (prop << shiftGraphemePropState)
  240. }
  241. }
  242. }
  243. // FirstGraphemeClusterInString is like [FirstGraphemeCluster] but its input and
  244. // outputs are strings.
  245. func FirstGraphemeClusterInString(str string, state int) (cluster, rest string, width, newState int) {
  246. // An empty string returns nothing.
  247. if len(str) == 0 {
  248. return
  249. }
  250. // Extract the first rune.
  251. r, length := utf8.DecodeRuneInString(str)
  252. if len(str) <= length { // If we're already past the end, there is nothing else to parse.
  253. var prop int
  254. if state < 0 {
  255. prop = propertyGraphemes(r)
  256. } else {
  257. prop = state >> shiftGraphemePropState
  258. }
  259. return str, "", runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
  260. }
  261. // If we don't know the state, determine it now.
  262. var firstProp int
  263. if state < 0 {
  264. state, firstProp, _ = transitionGraphemeState(state, r)
  265. } else {
  266. firstProp = state >> shiftGraphemePropState
  267. }
  268. width += runeWidth(r, firstProp)
  269. // Transition until we find a boundary.
  270. for {
  271. var (
  272. prop int
  273. boundary bool
  274. )
  275. r, l := utf8.DecodeRuneInString(str[length:])
  276. state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
  277. if boundary {
  278. return str[:length], str[length:], width, state | (prop << shiftGraphemePropState)
  279. }
  280. if firstProp == prExtendedPictographic {
  281. if r == vs15 {
  282. width = 1
  283. } else if r == vs16 {
  284. width = 2
  285. }
  286. } else if firstProp != prRegionalIndicator && firstProp != prL {
  287. width += runeWidth(r, prop)
  288. }
  289. length += l
  290. if len(str) <= length {
  291. return str, "", width, grAny | (prop << shiftGraphemePropState)
  292. }
  293. }
  294. }