grapheme.go 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334
  1. package uniseg
  2. import "unicode/utf8"
  3. // Graphemes implements an iterator over Unicode grapheme clusters, or
  4. // user-perceived characters. While iterating, it also provides information
  5. // about word boundaries, sentence boundaries, line breaks, and monospace
  6. // character widths.
  7. //
  8. // After constructing the class via [NewGraphemes] for a given string "str",
  9. // [Graphemes.Next] is called for every grapheme cluster in a loop until it
  10. // returns false. Inside the loop, information about the grapheme cluster as
  11. // well as boundary information and character width is available via the various
  12. // methods (see examples below).
  13. //
  14. // Using this class to iterate over a string is convenient but it is much slower
  15. // than using this package's [Step] or [StepString] functions or any of the
  16. // other specialized functions starting with "First".
  17. type Graphemes struct {
  18. // The original string.
  19. original string
  20. // The remaining string to be parsed.
  21. remaining string
  22. // The current grapheme cluster.
  23. cluster string
  24. // The byte offset of the current grapheme cluster relative to the original
  25. // string.
  26. offset int
  27. // The current boundary information of the [Step] parser.
  28. boundaries int
  29. // The current state of the [Step] parser.
  30. state int
  31. }
  32. // NewGraphemes returns a new grapheme cluster iterator.
  33. func NewGraphemes(str string) *Graphemes {
  34. return &Graphemes{
  35. original: str,
  36. remaining: str,
  37. state: -1,
  38. }
  39. }
  40. // Next advances the iterator by one grapheme cluster and returns false if no
  41. // clusters are left. This function must be called before the first cluster is
  42. // accessed.
  43. func (g *Graphemes) Next() bool {
  44. if len(g.remaining) == 0 {
  45. // We're already past the end.
  46. g.state = -2
  47. g.cluster = ""
  48. return false
  49. }
  50. g.offset += len(g.cluster)
  51. g.cluster, g.remaining, g.boundaries, g.state = StepString(g.remaining, g.state)
  52. return true
  53. }
  54. // Runes returns a slice of runes (code points) which corresponds to the current
  55. // grapheme cluster. If the iterator is already past the end or [Graphemes.Next]
  56. // has not yet been called, nil is returned.
  57. func (g *Graphemes) Runes() []rune {
  58. if g.state < 0 {
  59. return nil
  60. }
  61. return []rune(g.cluster)
  62. }
  63. // Str returns a substring of the original string which corresponds to the
  64. // current grapheme cluster. If the iterator is already past the end or
  65. // [Graphemes.Next] has not yet been called, an empty string is returned.
  66. func (g *Graphemes) Str() string {
  67. return g.cluster
  68. }
  69. // Bytes returns a byte slice which corresponds to the current grapheme cluster.
  70. // If the iterator is already past the end or [Graphemes.Next] has not yet been
  71. // called, nil is returned.
  72. func (g *Graphemes) Bytes() []byte {
  73. if g.state < 0 {
  74. return nil
  75. }
  76. return []byte(g.cluster)
  77. }
  78. // Positions returns the interval of the current grapheme cluster as byte
  79. // positions into the original string. The first returned value "from" indexes
  80. // the first byte and the second returned value "to" indexes the first byte that
  81. // is not included anymore, i.e. str[from:to] is the current grapheme cluster of
  82. // the original string "str". If [Graphemes.Next] has not yet been called, both
  83. // values are 0. If the iterator is already past the end, both values are 1.
  84. func (g *Graphemes) Positions() (int, int) {
  85. if g.state == -1 {
  86. return 0, 0
  87. } else if g.state == -2 {
  88. return 1, 1
  89. }
  90. return g.offset, g.offset + len(g.cluster)
  91. }
  92. // IsWordBoundary returns true if a word ends after the current grapheme
  93. // cluster.
  94. func (g *Graphemes) IsWordBoundary() bool {
  95. if g.state < 0 {
  96. return true
  97. }
  98. return g.boundaries&MaskWord != 0
  99. }
  100. // IsSentenceBoundary returns true if a sentence ends after the current
  101. // grapheme cluster.
  102. func (g *Graphemes) IsSentenceBoundary() bool {
  103. if g.state < 0 {
  104. return true
  105. }
  106. return g.boundaries&MaskSentence != 0
  107. }
  108. // LineBreak returns whether the line can be broken after the current grapheme
  109. // cluster. A value of [LineDontBreak] means the line may not be broken, a value
  110. // of [LineMustBreak] means the line must be broken, and a value of
  111. // [LineCanBreak] means the line may or may not be broken.
  112. func (g *Graphemes) LineBreak() int {
  113. if g.state == -1 {
  114. return LineDontBreak
  115. }
  116. if g.state == -2 {
  117. return LineMustBreak
  118. }
  119. return g.boundaries & MaskLine
  120. }
  121. // Width returns the monospace width of the current grapheme cluster.
  122. func (g *Graphemes) Width() int {
  123. if g.state < 0 {
  124. return 0
  125. }
  126. return g.boundaries >> ShiftWidth
  127. }
  128. // Reset puts the iterator into its initial state such that the next call to
  129. // [Graphemes.Next] sets it to the first grapheme cluster again.
  130. func (g *Graphemes) Reset() {
  131. g.state = -1
  132. g.offset = 0
  133. g.cluster = ""
  134. g.remaining = g.original
  135. }
  136. // GraphemeClusterCount returns the number of user-perceived characters
  137. // (grapheme clusters) for the given string.
  138. func GraphemeClusterCount(s string) (n int) {
  139. state := -1
  140. for len(s) > 0 {
  141. _, s, _, state = FirstGraphemeClusterInString(s, state)
  142. n++
  143. }
  144. return
  145. }
  146. // ReverseString reverses the given string while observing grapheme cluster
  147. // boundaries.
  148. func ReverseString(s string) string {
  149. str := []byte(s)
  150. reversed := make([]byte, len(str))
  151. state := -1
  152. index := len(str)
  153. for len(str) > 0 {
  154. var cluster []byte
  155. cluster, str, _, state = FirstGraphemeCluster(str, state)
  156. index -= len(cluster)
  157. copy(reversed[index:], cluster)
  158. if index <= len(str)/2 {
  159. break
  160. }
  161. }
  162. return string(reversed)
  163. }
  164. // The number of bits the grapheme property must be shifted to make place for
  165. // grapheme states.
  166. const shiftGraphemePropState = 4
  167. // FirstGraphemeCluster returns the first grapheme cluster found in the given
  168. // byte slice according to the rules of [Unicode Standard Annex #29, Grapheme
  169. // Cluster Boundaries]. This function can be called continuously to extract all
  170. // grapheme clusters from a byte slice, as illustrated in the example below.
  171. //
  172. // If you don't know the current state, for example when calling the function
  173. // for the first time, you must pass -1. For consecutive calls, pass the state
  174. // and rest slice returned by the previous call.
  175. //
  176. // The "rest" slice is the sub-slice of the original byte slice "b" starting
  177. // after the last byte of the identified grapheme cluster. If the length of the
  178. // "rest" slice is 0, the entire byte slice "b" has been processed. The
  179. // "cluster" byte slice is the sub-slice of the input slice containing the
  180. // identified grapheme cluster.
  181. //
  182. // The returned width is the width of the grapheme cluster for most monospace
  183. // fonts where a value of 1 represents one character cell.
  184. //
  185. // Given an empty byte slice "b", the function returns nil values.
  186. //
  187. // While slightly less convenient than using the Graphemes class, this function
  188. // has much better performance and makes no allocations. It lends itself well to
  189. // large byte slices.
  190. //
  191. // [Unicode Standard Annex #29, Grapheme Cluster Boundaries]: http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
  192. func FirstGraphemeCluster(b []byte, state int) (cluster, rest []byte, width, newState int) {
  193. // An empty byte slice returns nothing.
  194. if len(b) == 0 {
  195. return
  196. }
  197. // Extract the first rune.
  198. r, length := utf8.DecodeRune(b)
  199. if len(b) <= length { // If we're already past the end, there is nothing else to parse.
  200. var prop int
  201. if state < 0 {
  202. prop = property(graphemeCodePoints, r)
  203. } else {
  204. prop = state >> shiftGraphemePropState
  205. }
  206. return b, nil, runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
  207. }
  208. // If we don't know the state, determine it now.
  209. var firstProp int
  210. if state < 0 {
  211. state, firstProp, _ = transitionGraphemeState(state, r)
  212. } else {
  213. firstProp = state >> shiftGraphemePropState
  214. }
  215. width += runeWidth(r, firstProp)
  216. // Transition until we find a boundary.
  217. for {
  218. var (
  219. prop int
  220. boundary bool
  221. )
  222. r, l := utf8.DecodeRune(b[length:])
  223. state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
  224. if boundary {
  225. return b[:length], b[length:], width, state | (prop << shiftGraphemePropState)
  226. }
  227. if r == vs16 {
  228. width = 2
  229. } else if firstProp != prExtendedPictographic && firstProp != prRegionalIndicator && firstProp != prL {
  230. width += runeWidth(r, prop)
  231. } else if firstProp == prExtendedPictographic {
  232. if r == vs15 {
  233. width = 1
  234. } else {
  235. width = 2
  236. }
  237. }
  238. length += l
  239. if len(b) <= length {
  240. return b, nil, width, grAny | (prop << shiftGraphemePropState)
  241. }
  242. }
  243. }
  244. // FirstGraphemeClusterInString is like [FirstGraphemeCluster] but its input and
  245. // outputs are strings.
  246. func FirstGraphemeClusterInString(str string, state int) (cluster, rest string, width, newState int) {
  247. // An empty string returns nothing.
  248. if len(str) == 0 {
  249. return
  250. }
  251. // Extract the first rune.
  252. r, length := utf8.DecodeRuneInString(str)
  253. if len(str) <= length { // If we're already past the end, there is nothing else to parse.
  254. var prop int
  255. if state < 0 {
  256. prop = property(graphemeCodePoints, r)
  257. } else {
  258. prop = state >> shiftGraphemePropState
  259. }
  260. return str, "", runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
  261. }
  262. // If we don't know the state, determine it now.
  263. var firstProp int
  264. if state < 0 {
  265. state, firstProp, _ = transitionGraphemeState(state, r)
  266. } else {
  267. firstProp = state >> shiftGraphemePropState
  268. }
  269. width += runeWidth(r, firstProp)
  270. // Transition until we find a boundary.
  271. for {
  272. var (
  273. prop int
  274. boundary bool
  275. )
  276. r, l := utf8.DecodeRuneInString(str[length:])
  277. state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
  278. if boundary {
  279. return str[:length], str[length:], width, state | (prop << shiftGraphemePropState)
  280. }
  281. if r == vs16 {
  282. width = 2
  283. } else if firstProp != prExtendedPictographic && firstProp != prRegionalIndicator && firstProp != prL {
  284. width += runeWidth(r, prop)
  285. } else if firstProp == prExtendedPictographic {
  286. if r == vs15 {
  287. width = 1
  288. } else {
  289. width = 2
  290. }
  291. }
  292. length += l
  293. if len(str) <= length {
  294. return str, "", width, grAny | (prop << shiftGraphemePropState)
  295. }
  296. }
  297. }