package uniseg
  2. // The states of the grapheme cluster parser.
  3. const (
  4. grAny = iota
  5. grCR
  6. grControlLF
  7. grL
  8. grLVV
  9. grLVTT
  10. grPrepend
  11. grExtendedPictographic
  12. grExtendedPictographicZWJ
  13. grRIOdd
  14. grRIEven
  15. )
  16. // The grapheme cluster parser's breaking instructions.
  17. const (
  18. grNoBoundary = iota
  19. grBoundary
  20. )
  21. // grTransitions implements the grapheme cluster parser's state transitions.
  22. // Maps state and property to a new state, a breaking instruction, and rule
  23. // number. The breaking instruction always refers to the boundary between the
  24. // last and next code point. Returns negative values if no transition is found.
  25. //
  26. // This function is used as follows:
  27. //
  28. // 1. Find specific state + specific property. Stop if found.
  29. // 2. Find specific state + any property.
  30. // 3. Find any state + specific property.
  31. // 4. If only (2) or (3) (but not both) was found, stop.
  32. // 5. If both (2) and (3) were found, use state from (3) and breaking instruction
  33. // from the transition with the lower rule number, prefer (3) if rule numbers
  34. // are equal. Stop.
  35. // 6. Assume grAny and grBoundary.
  36. //
  37. // Unicode version 15.0.0.
  38. func grTransitions(state, prop int) (newState int, newProp int, boundary int) {
  39. // It turns out that using a big switch statement is much faster than using
  40. // a map.
  41. switch uint64(state) | uint64(prop)<<32 {
  42. // GB5
  43. case grAny | prCR<<32:
  44. return grCR, grBoundary, 50
  45. case grAny | prLF<<32:
  46. return grControlLF, grBoundary, 50
  47. case grAny | prControl<<32:
  48. return grControlLF, grBoundary, 50
  49. // GB4
  50. case grCR | prAny<<32:
  51. return grAny, grBoundary, 40
  52. case grControlLF | prAny<<32:
  53. return grAny, grBoundary, 40
  54. // GB3
  55. case grCR | prLF<<32:
  56. return grControlLF, grNoBoundary, 30
  57. // GB6
  58. case grAny | prL<<32:
  59. return grL, grBoundary, 9990
  60. case grL | prL<<32:
  61. return grL, grNoBoundary, 60
  62. case grL | prV<<32:
  63. return grLVV, grNoBoundary, 60
  64. case grL | prLV<<32:
  65. return grLVV, grNoBoundary, 60
  66. case grL | prLVT<<32:
  67. return grLVTT, grNoBoundary, 60
  68. // GB7
  69. case grAny | prLV<<32:
  70. return grLVV, grBoundary, 9990
  71. case grAny | prV<<32:
  72. return grLVV, grBoundary, 9990
  73. case grLVV | prV<<32:
  74. return grLVV, grNoBoundary, 70
  75. case grLVV | prT<<32:
  76. return grLVTT, grNoBoundary, 70
  77. // GB8
  78. case grAny | prLVT<<32:
  79. return grLVTT, grBoundary, 9990
  80. case grAny | prT<<32:
  81. return grLVTT, grBoundary, 9990
  82. case grLVTT | prT<<32:
  83. return grLVTT, grNoBoundary, 80
  84. // GB9
  85. case grAny | prExtend<<32:
  86. return grAny, grNoBoundary, 90
  87. case grAny | prZWJ<<32:
  88. return grAny, grNoBoundary, 90
  89. // GB9a
  90. case grAny | prSpacingMark<<32:
  91. return grAny, grNoBoundary, 91
  92. // GB9b
  93. case grAny | prPrepend<<32:
  94. return grPrepend, grBoundary, 9990
  95. case grPrepend | prAny<<32:
  96. return grAny, grNoBoundary, 92
  97. // GB11
  98. case grAny | prExtendedPictographic<<32:
  99. return grExtendedPictographic, grBoundary, 9990
  100. case grExtendedPictographic | prExtend<<32:
  101. return grExtendedPictographic, grNoBoundary, 110
  102. case grExtendedPictographic | prZWJ<<32:
  103. return grExtendedPictographicZWJ, grNoBoundary, 110
  104. case grExtendedPictographicZWJ | prExtendedPictographic<<32:
  105. return grExtendedPictographic, grNoBoundary, 110
  106. // GB12 / GB13
  107. case grAny | prRegionalIndicator<<32:
  108. return grRIOdd, grBoundary, 9990
  109. case grRIOdd | prRegionalIndicator<<32:
  110. return grRIEven, grNoBoundary, 120
  111. case grRIEven | prRegionalIndicator<<32:
  112. return grRIOdd, grBoundary, 120
  113. default:
  114. return -1, -1, -1
  115. }
  116. }
  117. // transitionGraphemeState determines the new state of the grapheme cluster
  118. // parser given the current state and the next code point. It also returns the
  119. // code point's grapheme property (the value mapped by the [graphemeCodePoints]
  120. // table) and whether a cluster boundary was detected.
  121. func transitionGraphemeState(state int, r rune) (newState, prop int, boundary bool) {
  122. // Determine the property of the next character.
  123. prop = propertyGraphemes(r)
  124. // Find the applicable transition.
  125. nextState, nextProp, _ := grTransitions(state, prop)
  126. if nextState >= 0 {
  127. // We have a specific transition. We'll use it.
  128. return nextState, prop, nextProp == grBoundary
  129. }
  130. // No specific transition found. Try the less specific ones.
  131. anyPropState, anyPropProp, anyPropRule := grTransitions(state, prAny)
  132. anyStateState, anyStateProp, anyStateRule := grTransitions(grAny, prop)
  133. if anyPropState >= 0 && anyStateState >= 0 {
  134. // Both apply. We'll use a mix (see comments for grTransitions).
  135. newState = anyStateState
  136. boundary = anyStateProp == grBoundary
  137. if anyPropRule < anyStateRule {
  138. boundary = anyPropProp == grBoundary
  139. }
  140. return
  141. }
  142. if anyPropState >= 0 {
  143. // We only have a specific state.
  144. return anyPropState, prop, anyPropProp == grBoundary
  145. // This branch will probably never be reached because okAnyState will
  146. // always be true given the current transition map. But we keep it here
  147. // for future modifications to the transition map where this may not be
  148. // true anymore.
  149. }
  150. if anyStateState >= 0 {
  151. // We only have a specific property.
  152. return anyStateState, prop, anyStateProp == grBoundary
  153. }
  154. // No known transition. GB999: Any ÷ Any.
  155. return grAny, prop, true
  156. }