line.go 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134
  1. package uniseg
  2. import "unicode/utf8"
  3. // FirstLineSegment returns the prefix of the given byte slice after which a
  4. // decision to break the string over to the next line can or must be made,
  5. // according to the rules of [Unicode Standard Annex #14]. This is used to
  6. // implement line breaking.
  7. //
  8. // Line breaking, also known as word wrapping, is the process of breaking a
  9. // section of text into lines such that it will fit in the available width of a
  10. // page, window or other display area.
  11. //
  12. // The returned "segment" may not be broken into smaller parts, unless no other
  13. // breaking opportunities present themselves, in which case you may break by
  14. // grapheme clusters (using the [FirstGraphemeCluster] function to determine the
  15. // grapheme clusters).
  16. //
  17. // The "mustBreak" flag indicates whether you MUST break the line after the
  18. // given segment (true), for example after newline characters, or you MAY break
  19. // the line after the given segment (false).
  20. //
  21. // This function can be called continuously to extract all non-breaking sub-sets
  22. // from a byte slice, as illustrated in the example below.
  23. //
  24. // If you don't know the current state, for example when calling the function
  25. // for the first time, you must pass -1. For consecutive calls, pass the state
  26. // and rest slice returned by the previous call.
  27. //
  28. // The "rest" slice is the sub-slice of the original byte slice "b" starting
  29. // after the last byte of the identified line segment. If the length of the
  30. // "rest" slice is 0, the entire byte slice "b" has been processed. The
  31. // "segment" byte slice is the sub-slice of the input slice containing the
  32. // identified line segment.
  33. //
  34. // Given an empty byte slice "b", the function returns nil values.
  35. //
  36. // Note that in accordance with [UAX #14 LB3], the final segment will end with
  37. // "mustBreak" set to true. You can choose to ignore this by checking if the
  38. // length of the "rest" slice is 0 and calling [HasTrailingLineBreak] or
  39. // [HasTrailingLineBreakInString] on the last rune.
  40. //
  41. // Note also that this algorithm may break within grapheme clusters. This is
  42. // addressed in Section 8.2 Example 6 of UAX #14. To avoid this, you can use
  43. // the [Step] function instead.
  44. //
  45. // [Unicode Standard Annex #14]: https://www.unicode.org/reports/tr14/
  46. // [UAX #14 LB3]: https://www.unicode.org/reports/tr14/#Algorithm
  47. func FirstLineSegment(b []byte, state int) (segment, rest []byte, mustBreak bool, newState int) {
  48. // An empty byte slice returns nothing.
  49. if len(b) == 0 {
  50. return
  51. }
  52. // Extract the first rune.
  53. r, length := utf8.DecodeRune(b)
  54. if len(b) <= length { // If we're already past the end, there is nothing else to parse.
  55. return b, nil, true, lbAny // LB3.
  56. }
  57. // If we don't know the state, determine it now.
  58. if state < 0 {
  59. state, _ = transitionLineBreakState(state, r, b[length:], "")
  60. }
  61. // Transition until we find a boundary.
  62. var boundary int
  63. for {
  64. r, l := utf8.DecodeRune(b[length:])
  65. state, boundary = transitionLineBreakState(state, r, b[length+l:], "")
  66. if boundary != LineDontBreak {
  67. return b[:length], b[length:], boundary == LineMustBreak, state
  68. }
  69. length += l
  70. if len(b) <= length {
  71. return b, nil, true, lbAny // LB3
  72. }
  73. }
  74. }
  75. // FirstLineSegmentInString is like FirstLineSegment() but its input and outputs
  76. // are strings.
  77. func FirstLineSegmentInString(str string, state int) (segment, rest string, mustBreak bool, newState int) {
  78. // An empty byte slice returns nothing.
  79. if len(str) == 0 {
  80. return
  81. }
  82. // Extract the first rune.
  83. r, length := utf8.DecodeRuneInString(str)
  84. if len(str) <= length { // If we're already past the end, there is nothing else to parse.
  85. return str, "", true, lbAny // LB3.
  86. }
  87. // If we don't know the state, determine it now.
  88. if state < 0 {
  89. state, _ = transitionLineBreakState(state, r, nil, str[length:])
  90. }
  91. // Transition until we find a boundary.
  92. var boundary int
  93. for {
  94. r, l := utf8.DecodeRuneInString(str[length:])
  95. state, boundary = transitionLineBreakState(state, r, nil, str[length+l:])
  96. if boundary != LineDontBreak {
  97. return str[:length], str[length:], boundary == LineMustBreak, state
  98. }
  99. length += l
  100. if len(str) <= length {
  101. return str, "", true, lbAny // LB3.
  102. }
  103. }
  104. }
  105. // HasTrailingLineBreak returns true if the last rune in the given byte slice is
  106. // one of the hard line break code points defined in LB4 and LB5 of [UAX #14].
  107. //
  108. // [UAX #14]: https://www.unicode.org/reports/tr14/#Algorithm
  109. func HasTrailingLineBreak(b []byte) bool {
  110. r, _ := utf8.DecodeLastRune(b)
  111. property, _ := propertyWithGenCat(lineBreakCodePoints, r)
  112. return property == lbBK || property == lbCR || property == lbLF || property == lbNL
  113. }
  114. // HasTrailingLineBreakInString is like [HasTrailingLineBreak] but for a string.
  115. func HasTrailingLineBreakInString(str string) bool {
  116. r, _ := utf8.DecodeLastRuneInString(str)
  117. property, _ := propertyWithGenCat(lineBreakCodePoints, r)
  118. return property == lbBK || property == lbCR || property == lbLF || property == lbNL
  119. }