step.go 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246
  1. package uniseg
  2. import "unicode/utf8"
  3. // The bit masks used to extract boundary information returned by [Step].
  4. const (
  5. MaskLine = 3
  6. MaskWord = 4
  7. MaskSentence = 8
  8. )
  9. // The number of bits to shift the boundary information returned by [Step] to
  10. // obtain the monospace width of the grapheme cluster.
  11. const ShiftWidth = 4
  12. // The bit positions by which boundary flags are shifted by the [Step] function.
  13. // These must correspond to the Mask constants.
  14. const (
  15. shiftWord = 2
  16. shiftSentence = 3
  17. // shiftwWidth is ShiftWidth above. No mask as these are always the remaining bits.
  18. )
  19. // The bit positions by which states are shifted by the [Step] function. These
  20. // values must ensure state values defined for each of the boundary algorithms
  21. // don't overlap (and that they all still fit in a single int). These must
  22. // correspond to the Mask constants.
  23. const (
  24. shiftWordState = 4
  25. shiftSentenceState = 9
  26. shiftLineState = 13
  27. shiftPropState = 21 // No mask as these are always the remaining bits.
  28. )
  29. // The bit mask used to extract the state returned by the [Step] function, after
  30. // shifting. These values must correspond to the shift constants.
  31. const (
  32. maskGraphemeState = 0xf
  33. maskWordState = 0x1f
  34. maskSentenceState = 0xf
  35. maskLineState = 0xff
  36. )
  37. // Step returns the first grapheme cluster (user-perceived character) found in
  38. // the given byte slice. It also returns information about the boundary between
  39. // that grapheme cluster and the one following it as well as the monospace width
  40. // of the grapheme cluster. There are three types of boundary information: word
  41. // boundaries, sentence boundaries, and line breaks. This function is therefore
  42. // a combination of [FirstGraphemeCluster], [FirstWord], [FirstSentence], and
  43. // [FirstLineSegment].
  44. //
  45. // The "boundaries" return value can be evaluated as follows:
  46. //
  47. // - boundaries&MaskWord != 0: The boundary is a word boundary.
  48. // - boundaries&MaskWord == 0: The boundary is not a word boundary.
  49. // - boundaries&MaskSentence != 0: The boundary is a sentence boundary.
  50. // - boundaries&MaskSentence == 0: The boundary is not a sentence boundary.
  51. // - boundaries&MaskLine == LineDontBreak: You must not break the line at the
  52. // boundary.
  53. // - boundaries&MaskLine == LineMustBreak: You must break the line at the
  54. // boundary.
  55. // - boundaries&MaskLine == LineCanBreak: You may or may not break the line at
  56. // the boundary.
  57. // - boundaries >> ShiftWidth: The width of the grapheme cluster for most
  58. // monospace fonts where a value of 1 represents one character cell.
  59. //
  60. // This function can be called continuously to extract all grapheme clusters
  61. // from a byte slice, as illustrated in the examples below.
  62. //
  63. // If you don't know which state to pass, for example when calling the function
  64. // for the first time, you must pass -1. For consecutive calls, pass the state
  65. // and rest slice returned by the previous call.
  66. //
  67. // The "rest" slice is the sub-slice of the original byte slice "b" starting
  68. // after the last byte of the identified grapheme cluster. If the length of the
  69. // "rest" slice is 0, the entire byte slice "b" has been processed. The
  70. // "cluster" byte slice is the sub-slice of the input slice containing the
  71. // first identified grapheme cluster.
  72. //
  73. // Given an empty byte slice "b", the function returns nil values.
  74. //
  75. // While slightly less convenient than using the Graphemes class, this function
  76. // has much better performance and makes no allocations. It lends itself well to
  77. // large byte slices.
  78. //
  79. // Note that in accordance with [UAX #14 LB3], the final segment will end with
  80. // a mandatory line break (boundaries&MaskLine == LineMustBreak). You can choose
  81. // to ignore this by checking if the length of the "rest" slice is 0 and calling
  82. // [HasTrailingLineBreak] or [HasTrailingLineBreakInString] on the last rune.
  83. //
  84. // [UAX #14 LB3]: https://www.unicode.org/reports/tr14/#Algorithm
  85. func Step(b []byte, state int) (cluster, rest []byte, boundaries int, newState int) {
  86. // An empty byte slice returns nothing.
  87. if len(b) == 0 {
  88. return
  89. }
  90. // Extract the first rune.
  91. r, length := utf8.DecodeRune(b)
  92. if len(b) <= length { // If we're already past the end, there is nothing else to parse.
  93. var prop int
  94. if state < 0 {
  95. prop = property(graphemeCodePoints, r)
  96. } else {
  97. prop = state >> shiftPropState
  98. }
  99. return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (runeWidth(r, prop) << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState)
  100. }
  101. // If we don't know the state, determine it now.
  102. var graphemeState, wordState, sentenceState, lineState, firstProp int
  103. remainder := b[length:]
  104. if state < 0 {
  105. graphemeState, firstProp, _ = transitionGraphemeState(state, r)
  106. wordState, _ = transitionWordBreakState(state, r, remainder, "")
  107. sentenceState, _ = transitionSentenceBreakState(state, r, remainder, "")
  108. lineState, _ = transitionLineBreakState(state, r, remainder, "")
  109. } else {
  110. graphemeState = state & maskGraphemeState
  111. wordState = (state >> shiftWordState) & maskWordState
  112. sentenceState = (state >> shiftSentenceState) & maskSentenceState
  113. lineState = (state >> shiftLineState) & maskLineState
  114. firstProp = state >> shiftPropState
  115. }
  116. // Transition until we find a grapheme cluster boundary.
  117. width := runeWidth(r, firstProp)
  118. for {
  119. var (
  120. graphemeBoundary, wordBoundary, sentenceBoundary bool
  121. lineBreak, prop int
  122. )
  123. r, l := utf8.DecodeRune(remainder)
  124. remainder = b[length+l:]
  125. graphemeState, prop, graphemeBoundary = transitionGraphemeState(graphemeState, r)
  126. wordState, wordBoundary = transitionWordBreakState(wordState, r, remainder, "")
  127. sentenceState, sentenceBoundary = transitionSentenceBreakState(sentenceState, r, remainder, "")
  128. lineState, lineBreak = transitionLineBreakState(lineState, r, remainder, "")
  129. if graphemeBoundary {
  130. boundary := lineBreak | (width << ShiftWidth)
  131. if wordBoundary {
  132. boundary |= 1 << shiftWord
  133. }
  134. if sentenceBoundary {
  135. boundary |= 1 << shiftSentence
  136. }
  137. return b[:length], b[length:], boundary, graphemeState | (wordState << shiftWordState) | (sentenceState << shiftSentenceState) | (lineState << shiftLineState) | (prop << shiftPropState)
  138. }
  139. if r == vs16 {
  140. width = 2
  141. } else if firstProp != prExtendedPictographic && firstProp != prRegionalIndicator && firstProp != prL {
  142. width += runeWidth(r, prop)
  143. } else if firstProp == prExtendedPictographic {
  144. if r == vs15 {
  145. width = 1
  146. } else {
  147. width = 2
  148. }
  149. }
  150. length += l
  151. if len(b) <= length {
  152. return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (width << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState)
  153. }
  154. }
  155. }
  156. // StepString is like [Step] but its input and outputs are strings.
  157. func StepString(str string, state int) (cluster, rest string, boundaries int, newState int) {
  158. // An empty byte slice returns nothing.
  159. if len(str) == 0 {
  160. return
  161. }
  162. // Extract the first rune.
  163. r, length := utf8.DecodeRuneInString(str)
  164. if len(str) <= length { // If we're already past the end, there is nothing else to parse.
  165. prop := property(graphemeCodePoints, r)
  166. return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (runeWidth(r, prop) << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
  167. }
  168. // If we don't know the state, determine it now.
  169. var graphemeState, wordState, sentenceState, lineState, firstProp int
  170. remainder := str[length:]
  171. if state < 0 {
  172. graphemeState, firstProp, _ = transitionGraphemeState(state, r)
  173. wordState, _ = transitionWordBreakState(state, r, nil, remainder)
  174. sentenceState, _ = transitionSentenceBreakState(state, r, nil, remainder)
  175. lineState, _ = transitionLineBreakState(state, r, nil, remainder)
  176. } else {
  177. graphemeState = state & maskGraphemeState
  178. wordState = (state >> shiftWordState) & maskWordState
  179. sentenceState = (state >> shiftSentenceState) & maskSentenceState
  180. lineState = (state >> shiftLineState) & maskLineState
  181. firstProp = state >> shiftPropState
  182. }
  183. // Transition until we find a grapheme cluster boundary.
  184. width := runeWidth(r, firstProp)
  185. for {
  186. var (
  187. graphemeBoundary, wordBoundary, sentenceBoundary bool
  188. lineBreak, prop int
  189. )
  190. r, l := utf8.DecodeRuneInString(remainder)
  191. remainder = str[length+l:]
  192. graphemeState, prop, graphemeBoundary = transitionGraphemeState(graphemeState, r)
  193. wordState, wordBoundary = transitionWordBreakState(wordState, r, nil, remainder)
  194. sentenceState, sentenceBoundary = transitionSentenceBreakState(sentenceState, r, nil, remainder)
  195. lineState, lineBreak = transitionLineBreakState(lineState, r, nil, remainder)
  196. if graphemeBoundary {
  197. boundary := lineBreak | (width << ShiftWidth)
  198. if wordBoundary {
  199. boundary |= 1 << shiftWord
  200. }
  201. if sentenceBoundary {
  202. boundary |= 1 << shiftSentence
  203. }
  204. return str[:length], str[length:], boundary, graphemeState | (wordState << shiftWordState) | (sentenceState << shiftSentenceState) | (lineState << shiftLineState) | (prop << shiftPropState)
  205. }
  206. if r == vs16 {
  207. width = 2
  208. } else if firstProp != prExtendedPictographic && firstProp != prRegionalIndicator && firstProp != prL {
  209. width += runeWidth(r, prop)
  210. } else if firstProp == prExtendedPictographic {
  211. if r == vs15 {
  212. width = 1
  213. } else {
  214. width = 2
  215. }
  216. }
  217. length += l
  218. if len(str) <= length {
  219. return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (width << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState)
  220. }
  221. }
  222. }