linerules.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470
  1. package uniseg
  2. import "unicode/utf8"
  // Line break parser states. A state summarizes what the parser has seen so
  // far. The values are consecutive, start at zero, and all stay below
  // lbZWJBit so that the flag bits declared further down can be OR-ed into a
  // state without clobbering it.
  const (
  	lbAny = iota // Wildcard / no specific context.
  	lbBK
  	lbCR
  	lbLF
  	lbNL
  	lbSP
  	lbZW
  	lbWJ
  	lbGL
  	lbBA
  	lbHY
  	lbCL
  	lbCP
  	lbEX
  	lbIS
  	lbSY
  	lbOP
  	lbQU
  	lbQUSP // QU followed by SP.
  	lbNS
  	lbCLCPSP // CL or CP (also after numbers) followed by SP.
  	lbB2
  	lbB2SP // B2 followed by SP.
  	lbCB
  	lbBB
  	lbLB21a // HL followed by HY or BA.
  	lbHL
  	lbAL
  	lbNU
  	lbPR
  	lbEB
  	lbIDEM // ID or EM.
  	lbNUNU // NU reached from within a numeric sequence.
  	lbNUSY // SY within a numeric sequence.
  	lbNUIS // IS within a numeric sequence.
  	lbNUCL // CL closing a numeric sequence.
  	lbNUCP // CP closing a numeric sequence.
  	lbPO
  	lbJL
  	lbJV
  	lbJT
  	lbH2
  	lbH3
  	lbOddRI    // An odd number of regional indicators has been seen.
  	lbEvenRI   // An even number of regional indicators has been seen.
  	lbExtPicCn // Extended_Pictographic code point with general category Cn.
  )

  // Flag bits that are combined with one of the states above.
  const (
  	lbZWJBit     = 64  // The previous code point was a zero-width joiner (LB8a).
  	lbCPeaFWHBit = 128 // State is CP/NUCP and the East Asian width is not F, W, or H (LB30).
  )
  // The constants below classify a position in the text with respect to line
  // breaking. An optional break (LineCanBreak) leaves the decision to the
  // caller, who may apply its own criteria, e.g. whether the text has reached
  // the available width.
  const (
  	// LineDontBreak means you may not break the line here.
  	LineDontBreak = iota
  	// LineCanBreak means you may or may not break the line here.
  	LineCanBreak
  	// LineMustBreak means you must break the line here.
  	LineMustBreak
  )
  63. // The line break parser's state transitions. It's anologous to grTransitions,
  64. // see comments there for details. Unicode version 14.0.0.
  65. var lbTransitions = map[[2]int][3]int{
  66. // LB4.
  67. {lbAny, prBK}: {lbBK, LineCanBreak, 310},
  68. {lbBK, prAny}: {lbAny, LineMustBreak, 40},
  69. // LB5.
  70. {lbAny, prCR}: {lbCR, LineCanBreak, 310},
  71. {lbAny, prLF}: {lbLF, LineCanBreak, 310},
  72. {lbAny, prNL}: {lbNL, LineCanBreak, 310},
  73. {lbCR, prLF}: {lbLF, LineDontBreak, 50},
  74. {lbCR, prAny}: {lbAny, LineMustBreak, 50},
  75. {lbLF, prAny}: {lbAny, LineMustBreak, 50},
  76. {lbNL, prAny}: {lbAny, LineMustBreak, 50},
  77. // LB6.
  78. {lbAny, prBK}: {lbBK, LineDontBreak, 60},
  79. {lbAny, prCR}: {lbCR, LineDontBreak, 60},
  80. {lbAny, prLF}: {lbLF, LineDontBreak, 60},
  81. {lbAny, prNL}: {lbNL, LineDontBreak, 60},
  82. // LB7.
  83. {lbAny, prSP}: {lbSP, LineDontBreak, 70},
  84. {lbAny, prZW}: {lbZW, LineDontBreak, 70},
  85. // LB8.
  86. {lbZW, prSP}: {lbZW, LineDontBreak, 70},
  87. {lbZW, prAny}: {lbAny, LineCanBreak, 80},
  88. // LB11.
  89. {lbAny, prWJ}: {lbWJ, LineDontBreak, 110},
  90. {lbWJ, prAny}: {lbAny, LineDontBreak, 110},
  91. // LB12.
  92. {lbAny, prGL}: {lbGL, LineCanBreak, 310},
  93. {lbGL, prAny}: {lbAny, LineDontBreak, 120},
  94. // LB13 (simple transitions).
  95. {lbAny, prCL}: {lbCL, LineCanBreak, 310},
  96. {lbAny, prCP}: {lbCP, LineCanBreak, 310},
  97. {lbAny, prEX}: {lbEX, LineDontBreak, 130},
  98. {lbAny, prIS}: {lbIS, LineCanBreak, 310},
  99. {lbAny, prSY}: {lbSY, LineCanBreak, 310},
  100. // LB14.
  101. {lbAny, prOP}: {lbOP, LineCanBreak, 310},
  102. {lbOP, prSP}: {lbOP, LineDontBreak, 70},
  103. {lbOP, prAny}: {lbAny, LineDontBreak, 140},
  104. // LB15.
  105. {lbQU, prSP}: {lbQUSP, LineDontBreak, 70},
  106. {lbQU, prOP}: {lbOP, LineDontBreak, 150},
  107. {lbQUSP, prOP}: {lbOP, LineDontBreak, 150},
  108. // LB16.
  109. {lbCL, prSP}: {lbCLCPSP, LineDontBreak, 70},
  110. {lbNUCL, prSP}: {lbCLCPSP, LineDontBreak, 70},
  111. {lbCP, prSP}: {lbCLCPSP, LineDontBreak, 70},
  112. {lbNUCP, prSP}: {lbCLCPSP, LineDontBreak, 70},
  113. {lbCL, prNS}: {lbNS, LineDontBreak, 160},
  114. {lbNUCL, prNS}: {lbNS, LineDontBreak, 160},
  115. {lbCP, prNS}: {lbNS, LineDontBreak, 160},
  116. {lbNUCP, prNS}: {lbNS, LineDontBreak, 160},
  117. {lbCLCPSP, prNS}: {lbNS, LineDontBreak, 160},
  118. // LB17.
  119. {lbAny, prB2}: {lbB2, LineCanBreak, 310},
  120. {lbB2, prSP}: {lbB2SP, LineDontBreak, 70},
  121. {lbB2, prB2}: {lbB2, LineDontBreak, 170},
  122. {lbB2SP, prB2}: {lbB2, LineDontBreak, 170},
  123. // LB18.
  124. {lbSP, prAny}: {lbAny, LineCanBreak, 180},
  125. {lbQUSP, prAny}: {lbAny, LineCanBreak, 180},
  126. {lbCLCPSP, prAny}: {lbAny, LineCanBreak, 180},
  127. {lbB2SP, prAny}: {lbAny, LineCanBreak, 180},
  128. // LB19.
  129. {lbAny, prQU}: {lbQU, LineDontBreak, 190},
  130. {lbQU, prAny}: {lbAny, LineDontBreak, 190},
  131. // LB20.
  132. {lbAny, prCB}: {lbCB, LineCanBreak, 200},
  133. {lbCB, prAny}: {lbAny, LineCanBreak, 200},
  134. // LB21.
  135. {lbAny, prBA}: {lbBA, LineDontBreak, 210},
  136. {lbAny, prHY}: {lbHY, LineDontBreak, 210},
  137. {lbAny, prNS}: {lbNS, LineDontBreak, 210},
  138. {lbAny, prBB}: {lbBB, LineCanBreak, 310},
  139. {lbBB, prAny}: {lbAny, LineDontBreak, 210},
  140. // LB21a.
  141. {lbAny, prHL}: {lbHL, LineCanBreak, 310},
  142. {lbHL, prHY}: {lbLB21a, LineDontBreak, 210},
  143. {lbHL, prBA}: {lbLB21a, LineDontBreak, 210},
  144. {lbLB21a, prAny}: {lbAny, LineDontBreak, 211},
  145. // LB21b.
  146. {lbSY, prHL}: {lbHL, LineDontBreak, 212},
  147. {lbNUSY, prHL}: {lbHL, LineDontBreak, 212},
  148. // LB22.
  149. {lbAny, prIN}: {lbAny, LineDontBreak, 220},
  150. // LB23.
  151. {lbAny, prAL}: {lbAL, LineCanBreak, 310},
  152. {lbAny, prNU}: {lbNU, LineCanBreak, 310},
  153. {lbAL, prNU}: {lbNU, LineDontBreak, 230},
  154. {lbHL, prNU}: {lbNU, LineDontBreak, 230},
  155. {lbNU, prAL}: {lbAL, LineDontBreak, 230},
  156. {lbNU, prHL}: {lbHL, LineDontBreak, 230},
  157. {lbNUNU, prAL}: {lbAL, LineDontBreak, 230},
  158. {lbNUNU, prHL}: {lbHL, LineDontBreak, 230},
  159. // LB23a.
  160. {lbAny, prPR}: {lbPR, LineCanBreak, 310},
  161. {lbAny, prID}: {lbIDEM, LineCanBreak, 310},
  162. {lbAny, prEB}: {lbEB, LineCanBreak, 310},
  163. {lbAny, prEM}: {lbIDEM, LineCanBreak, 310},
  164. {lbPR, prID}: {lbIDEM, LineDontBreak, 231},
  165. {lbPR, prEB}: {lbEB, LineDontBreak, 231},
  166. {lbPR, prEM}: {lbIDEM, LineDontBreak, 231},
  167. {lbIDEM, prPO}: {lbPO, LineDontBreak, 231},
  168. {lbEB, prPO}: {lbPO, LineDontBreak, 231},
  169. // LB24.
  170. {lbAny, prPO}: {lbPO, LineCanBreak, 310},
  171. {lbPR, prAL}: {lbAL, LineDontBreak, 240},
  172. {lbPR, prHL}: {lbHL, LineDontBreak, 240},
  173. {lbPO, prAL}: {lbAL, LineDontBreak, 240},
  174. {lbPO, prHL}: {lbHL, LineDontBreak, 240},
  175. {lbAL, prPR}: {lbPR, LineDontBreak, 240},
  176. {lbAL, prPO}: {lbPO, LineDontBreak, 240},
  177. {lbHL, prPR}: {lbPR, LineDontBreak, 240},
  178. {lbHL, prPO}: {lbPO, LineDontBreak, 240},
  179. // LB25 (simple transitions).
  180. {lbPR, prNU}: {lbNU, LineDontBreak, 250},
  181. {lbPO, prNU}: {lbNU, LineDontBreak, 250},
  182. {lbOP, prNU}: {lbNU, LineDontBreak, 250},
  183. {lbHY, prNU}: {lbNU, LineDontBreak, 250},
  184. {lbNU, prNU}: {lbNUNU, LineDontBreak, 250},
  185. {lbNU, prSY}: {lbNUSY, LineDontBreak, 250},
  186. {lbNU, prIS}: {lbNUIS, LineDontBreak, 250},
  187. {lbNUNU, prNU}: {lbNUNU, LineDontBreak, 250},
  188. {lbNUNU, prSY}: {lbNUSY, LineDontBreak, 250},
  189. {lbNUNU, prIS}: {lbNUIS, LineDontBreak, 250},
  190. {lbNUSY, prNU}: {lbNUNU, LineDontBreak, 250},
  191. {lbNUSY, prSY}: {lbNUSY, LineDontBreak, 250},
  192. {lbNUSY, prIS}: {lbNUIS, LineDontBreak, 250},
  193. {lbNUIS, prNU}: {lbNUNU, LineDontBreak, 250},
  194. {lbNUIS, prSY}: {lbNUSY, LineDontBreak, 250},
  195. {lbNUIS, prIS}: {lbNUIS, LineDontBreak, 250},
  196. {lbNU, prCL}: {lbNUCL, LineDontBreak, 250},
  197. {lbNU, prCP}: {lbNUCP, LineDontBreak, 250},
  198. {lbNUNU, prCL}: {lbNUCL, LineDontBreak, 250},
  199. {lbNUNU, prCP}: {lbNUCP, LineDontBreak, 250},
  200. {lbNUSY, prCL}: {lbNUCL, LineDontBreak, 250},
  201. {lbNUSY, prCP}: {lbNUCP, LineDontBreak, 250},
  202. {lbNUIS, prCL}: {lbNUCL, LineDontBreak, 250},
  203. {lbNUIS, prCP}: {lbNUCP, LineDontBreak, 250},
  204. {lbNU, prPO}: {lbPO, LineDontBreak, 250},
  205. {lbNUNU, prPO}: {lbPO, LineDontBreak, 250},
  206. {lbNUSY, prPO}: {lbPO, LineDontBreak, 250},
  207. {lbNUIS, prPO}: {lbPO, LineDontBreak, 250},
  208. {lbNUCL, prPO}: {lbPO, LineDontBreak, 250},
  209. {lbNUCP, prPO}: {lbPO, LineDontBreak, 250},
  210. {lbNU, prPR}: {lbPR, LineDontBreak, 250},
  211. {lbNUNU, prPR}: {lbPR, LineDontBreak, 250},
  212. {lbNUSY, prPR}: {lbPR, LineDontBreak, 250},
  213. {lbNUIS, prPR}: {lbPR, LineDontBreak, 250},
  214. {lbNUCL, prPR}: {lbPR, LineDontBreak, 250},
  215. {lbNUCP, prPR}: {lbPR, LineDontBreak, 250},
  216. // LB26.
  217. {lbAny, prJL}: {lbJL, LineCanBreak, 310},
  218. {lbAny, prJV}: {lbJV, LineCanBreak, 310},
  219. {lbAny, prJT}: {lbJT, LineCanBreak, 310},
  220. {lbAny, prH2}: {lbH2, LineCanBreak, 310},
  221. {lbAny, prH3}: {lbH3, LineCanBreak, 310},
  222. {lbJL, prJL}: {lbJL, LineDontBreak, 260},
  223. {lbJL, prJV}: {lbJV, LineDontBreak, 260},
  224. {lbJL, prH2}: {lbH2, LineDontBreak, 260},
  225. {lbJL, prH3}: {lbH3, LineDontBreak, 260},
  226. {lbJV, prJV}: {lbJV, LineDontBreak, 260},
  227. {lbJV, prJT}: {lbJT, LineDontBreak, 260},
  228. {lbH2, prJV}: {lbJV, LineDontBreak, 260},
  229. {lbH2, prJT}: {lbJT, LineDontBreak, 260},
  230. {lbJT, prJT}: {lbJT, LineDontBreak, 260},
  231. {lbH3, prJT}: {lbJT, LineDontBreak, 260},
  232. // LB27.
  233. {lbJL, prPO}: {lbPO, LineDontBreak, 270},
  234. {lbJV, prPO}: {lbPO, LineDontBreak, 270},
  235. {lbJT, prPO}: {lbPO, LineDontBreak, 270},
  236. {lbH2, prPO}: {lbPO, LineDontBreak, 270},
  237. {lbH3, prPO}: {lbPO, LineDontBreak, 270},
  238. {lbPR, prJL}: {lbJL, LineDontBreak, 270},
  239. {lbPR, prJV}: {lbJV, LineDontBreak, 270},
  240. {lbPR, prJT}: {lbJT, LineDontBreak, 270},
  241. {lbPR, prH2}: {lbH2, LineDontBreak, 270},
  242. {lbPR, prH3}: {lbH3, LineDontBreak, 270},
  243. // LB28.
  244. {lbAL, prAL}: {lbAL, LineDontBreak, 280},
  245. {lbAL, prHL}: {lbHL, LineDontBreak, 280},
  246. {lbHL, prAL}: {lbAL, LineDontBreak, 280},
  247. {lbHL, prHL}: {lbHL, LineDontBreak, 280},
  248. // LB29.
  249. {lbIS, prAL}: {lbAL, LineDontBreak, 290},
  250. {lbIS, prHL}: {lbHL, LineDontBreak, 290},
  251. {lbNUIS, prAL}: {lbAL, LineDontBreak, 290},
  252. {lbNUIS, prHL}: {lbHL, LineDontBreak, 290},
  253. }
  254. // transitionLineBreakState determines the new state of the line break parser
  255. // given the current state and the next code point. It also returns the type of
  256. // line break: LineDontBreak, LineCanBreak, or LineMustBreak. If more than one
  257. // code point is needed to determine the new state, the byte slice or the string
  258. // starting after rune "r" can be used (whichever is not nil or empty) for
  259. // further lookups.
  260. func transitionLineBreakState(state int, r rune, b []byte, str string) (newState int, lineBreak int) {
  261. // Determine the property of the next character.
  262. nextProperty, generalCategory := propertyWithGenCat(lineBreakCodePoints, r)
  263. // Prepare.
  264. var forceNoBreak, isCPeaFWH bool
  265. if state >= 0 && state&lbCPeaFWHBit != 0 {
  266. isCPeaFWH = true // LB30: CP but ea is not F, W, or H.
  267. state = state &^ lbCPeaFWHBit
  268. }
  269. if state >= 0 && state&lbZWJBit != 0 {
  270. state = state &^ lbZWJBit // Extract zero-width joiner bit.
  271. forceNoBreak = true // LB8a.
  272. }
  273. defer func() {
  274. // Transition into LB30.
  275. if newState == lbCP || newState == lbNUCP {
  276. ea := property(eastAsianWidth, r)
  277. if ea != prF && ea != prW && ea != prH {
  278. newState |= lbCPeaFWHBit
  279. }
  280. }
  281. // Override break.
  282. if forceNoBreak {
  283. lineBreak = LineDontBreak
  284. }
  285. }()
  286. // LB1.
  287. if nextProperty == prAI || nextProperty == prSG || nextProperty == prXX {
  288. nextProperty = prAL
  289. } else if nextProperty == prSA {
  290. if generalCategory == gcMn || generalCategory == gcMc {
  291. nextProperty = prCM
  292. } else {
  293. nextProperty = prAL
  294. }
  295. } else if nextProperty == prCJ {
  296. nextProperty = prNS
  297. }
  298. // Combining marks.
  299. if nextProperty == prZWJ || nextProperty == prCM {
  300. var bit int
  301. if nextProperty == prZWJ {
  302. bit = lbZWJBit
  303. }
  304. mustBreakState := state < 0 || state == lbBK || state == lbCR || state == lbLF || state == lbNL
  305. if !mustBreakState && state != lbSP && state != lbZW && state != lbQUSP && state != lbCLCPSP && state != lbB2SP {
  306. // LB9.
  307. return state | bit, LineDontBreak
  308. } else {
  309. // LB10.
  310. if mustBreakState {
  311. return lbAL | bit, LineMustBreak
  312. }
  313. return lbAL | bit, LineCanBreak
  314. }
  315. }
  316. // Find the applicable transition in the table.
  317. var rule int
  318. transition, ok := lbTransitions[[2]int{state, nextProperty}]
  319. if ok {
  320. // We have a specific transition. We'll use it.
  321. newState, lineBreak, rule = transition[0], transition[1], transition[2]
  322. } else {
  323. // No specific transition found. Try the less specific ones.
  324. transAnyProp, okAnyProp := lbTransitions[[2]int{state, prAny}]
  325. transAnyState, okAnyState := lbTransitions[[2]int{lbAny, nextProperty}]
  326. if okAnyProp && okAnyState {
  327. // Both apply. We'll use a mix (see comments for grTransitions).
  328. newState, lineBreak, rule = transAnyState[0], transAnyState[1], transAnyState[2]
  329. if transAnyProp[2] < transAnyState[2] {
  330. lineBreak, rule = transAnyProp[1], transAnyProp[2]
  331. }
  332. } else if okAnyProp {
  333. // We only have a specific state.
  334. newState, lineBreak, rule = transAnyProp[0], transAnyProp[1], transAnyProp[2]
  335. // This branch will probably never be reached because okAnyState will
  336. // always be true given the current transition map. But we keep it here
  337. // for future modifications to the transition map where this may not be
  338. // true anymore.
  339. } else if okAnyState {
  340. // We only have a specific property.
  341. newState, lineBreak, rule = transAnyState[0], transAnyState[1], transAnyState[2]
  342. } else {
  343. // No known transition. LB31: ALL ÷ ALL.
  344. newState, lineBreak, rule = lbAny, LineCanBreak, 310
  345. }
  346. }
  347. // LB12a.
  348. if rule > 121 &&
  349. nextProperty == prGL &&
  350. (state != lbSP && state != lbBA && state != lbHY && state != lbLB21a && state != lbQUSP && state != lbCLCPSP && state != lbB2SP) {
  351. return lbGL, LineDontBreak
  352. }
  353. // LB13.
  354. if rule > 130 && state != lbNU && state != lbNUNU {
  355. switch nextProperty {
  356. case prCL:
  357. return lbCL, LineDontBreak
  358. case prCP:
  359. return lbCP, LineDontBreak
  360. case prIS:
  361. return lbIS, LineDontBreak
  362. case prSY:
  363. return lbSY, LineDontBreak
  364. }
  365. }
  366. // LB25 (look ahead).
  367. if rule > 250 &&
  368. (state == lbPR || state == lbPO) &&
  369. nextProperty == prOP || nextProperty == prHY {
  370. var r rune
  371. if b != nil { // Byte slice version.
  372. r, _ = utf8.DecodeRune(b)
  373. } else { // String version.
  374. r, _ = utf8.DecodeRuneInString(str)
  375. }
  376. if r != utf8.RuneError {
  377. pr, _ := propertyWithGenCat(lineBreakCodePoints, r)
  378. if pr == prNU {
  379. return lbNU, LineDontBreak
  380. }
  381. }
  382. }
  383. // LB30 (part one).
  384. if rule > 300 {
  385. if (state == lbAL || state == lbHL || state == lbNU || state == lbNUNU) && nextProperty == prOP {
  386. ea := property(eastAsianWidth, r)
  387. if ea != prF && ea != prW && ea != prH {
  388. return lbOP, LineDontBreak
  389. }
  390. } else if isCPeaFWH {
  391. switch nextProperty {
  392. case prAL:
  393. return lbAL, LineDontBreak
  394. case prHL:
  395. return lbHL, LineDontBreak
  396. case prNU:
  397. return lbNU, LineDontBreak
  398. }
  399. }
  400. }
  401. // LB30a.
  402. if newState == lbAny && nextProperty == prRI {
  403. if state != lbOddRI && state != lbEvenRI { // Includes state == -1.
  404. // Transition into the first RI.
  405. return lbOddRI, lineBreak
  406. }
  407. if state == lbOddRI {
  408. // Don't break pairs of Regional Indicators.
  409. return lbEvenRI, LineDontBreak
  410. }
  411. return lbOddRI, lineBreak
  412. }
  413. // LB30b.
  414. if rule > 302 {
  415. if nextProperty == prEM {
  416. if state == lbEB || state == lbExtPicCn {
  417. return prAny, LineDontBreak
  418. }
  419. }
  420. graphemeProperty := property(graphemeCodePoints, r)
  421. if graphemeProperty == prExtendedPictographic && generalCategory == gcCn {
  422. return lbExtPicCn, LineCanBreak
  423. }
  424. }
  425. return
  426. }