// File: graphemerules.go

package uniseg
  2. // The states of the grapheme cluster parser.
  3. const (
  4. grAny = iota
  5. grCR
  6. grControlLF
  7. grL
  8. grLVV
  9. grLVTT
  10. grPrepend
  11. grExtendedPictographic
  12. grExtendedPictographicZWJ
  13. grRIOdd
  14. grRIEven
  15. )
  16. // The grapheme cluster parser's breaking instructions.
  17. const (
  18. grNoBoundary = iota
  19. grBoundary
  20. )
  21. // The grapheme cluster parser's state transitions. Maps (state, property) to
  22. // (new state, breaking instruction, rule number). The breaking instruction
  23. // always refers to the boundary between the last and next code point.
  24. //
  25. // This map is queried as follows:
  26. //
  27. // 1. Find specific state + specific property. Stop if found.
  28. // 2. Find specific state + any property.
  29. // 3. Find any state + specific property.
  30. // 4. If only (2) or (3) (but not both) was found, stop.
  31. // 5. If both (2) and (3) were found, use state from (3) and breaking instruction
  32. // from the transition with the lower rule number, prefer (3) if rule numbers
  33. // are equal. Stop.
  34. // 6. Assume grAny and grBoundary.
  35. //
  36. // Unicode version 14.0.0.
  37. var grTransitions = map[[2]int][3]int{
  38. // GB5
  39. {grAny, prCR}: {grCR, grBoundary, 50},
  40. {grAny, prLF}: {grControlLF, grBoundary, 50},
  41. {grAny, prControl}: {grControlLF, grBoundary, 50},
  42. // GB4
  43. {grCR, prAny}: {grAny, grBoundary, 40},
  44. {grControlLF, prAny}: {grAny, grBoundary, 40},
  45. // GB3.
  46. {grCR, prLF}: {grControlLF, grNoBoundary, 30},
  47. // GB6.
  48. {grAny, prL}: {grL, grBoundary, 9990},
  49. {grL, prL}: {grL, grNoBoundary, 60},
  50. {grL, prV}: {grLVV, grNoBoundary, 60},
  51. {grL, prLV}: {grLVV, grNoBoundary, 60},
  52. {grL, prLVT}: {grLVTT, grNoBoundary, 60},
  53. // GB7.
  54. {grAny, prLV}: {grLVV, grBoundary, 9990},
  55. {grAny, prV}: {grLVV, grBoundary, 9990},
  56. {grLVV, prV}: {grLVV, grNoBoundary, 70},
  57. {grLVV, prT}: {grLVTT, grNoBoundary, 70},
  58. // GB8.
  59. {grAny, prLVT}: {grLVTT, grBoundary, 9990},
  60. {grAny, prT}: {grLVTT, grBoundary, 9990},
  61. {grLVTT, prT}: {grLVTT, grNoBoundary, 80},
  62. // GB9.
  63. {grAny, prExtend}: {grAny, grNoBoundary, 90},
  64. {grAny, prZWJ}: {grAny, grNoBoundary, 90},
  65. // GB9a.
  66. {grAny, prSpacingMark}: {grAny, grNoBoundary, 91},
  67. // GB9b.
  68. {grAny, prPrepend}: {grPrepend, grBoundary, 9990},
  69. {grPrepend, prAny}: {grAny, grNoBoundary, 92},
  70. // GB11.
  71. {grAny, prExtendedPictographic}: {grExtendedPictographic, grBoundary, 9990},
  72. {grExtendedPictographic, prExtend}: {grExtendedPictographic, grNoBoundary, 110},
  73. {grExtendedPictographic, prZWJ}: {grExtendedPictographicZWJ, grNoBoundary, 110},
  74. {grExtendedPictographicZWJ, prExtendedPictographic}: {grExtendedPictographic, grNoBoundary, 110},
  75. // GB12 / GB13.
  76. {grAny, prRegionalIndicator}: {grRIOdd, grBoundary, 9990},
  77. {grRIOdd, prRegionalIndicator}: {grRIEven, grNoBoundary, 120},
  78. {grRIEven, prRegionalIndicator}: {grRIOdd, grBoundary, 120},
  79. }
  80. // transitionGraphemeState determines the new state of the grapheme cluster
  81. // parser given the current state and the next code point. It also returns the
  82. // code point's grapheme property (the value mapped by the [graphemeCodePoints]
  83. // table) and whether a cluster boundary was detected.
  84. func transitionGraphemeState(state int, r rune) (newState, prop int, boundary bool) {
  85. // Determine the property of the next character.
  86. prop = property(graphemeCodePoints, r)
  87. // Find the applicable transition.
  88. transition, ok := grTransitions[[2]int{state, prop}]
  89. if ok {
  90. // We have a specific transition. We'll use it.
  91. return transition[0], prop, transition[1] == grBoundary
  92. }
  93. // No specific transition found. Try the less specific ones.
  94. transAnyProp, okAnyProp := grTransitions[[2]int{state, prAny}]
  95. transAnyState, okAnyState := grTransitions[[2]int{grAny, prop}]
  96. if okAnyProp && okAnyState {
  97. // Both apply. We'll use a mix (see comments for grTransitions).
  98. newState = transAnyState[0]
  99. boundary = transAnyState[1] == grBoundary
  100. if transAnyProp[2] < transAnyState[2] {
  101. boundary = transAnyProp[1] == grBoundary
  102. }
  103. return
  104. }
  105. if okAnyProp {
  106. // We only have a specific state.
  107. return transAnyProp[0], prop, transAnyProp[1] == grBoundary
  108. // This branch will probably never be reached because okAnyState will
  109. // always be true given the current transition map. But we keep it here
  110. // for future modifications to the transition map where this may not be
  111. // true anymore.
  112. }
  113. if okAnyState {
  114. // We only have a specific property.
  115. return transAnyState[0], prop, transAnyState[1] == grBoundary
  116. }
  117. // No known transition. GB999: Any ÷ Any.
  118. return grAny, prop, true
  119. }