// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// The asm code generally follows the pure Go code in encode_other.go, except
// where marked with a "!!!".

// ----------------------------------------------------------------------------

// func emitLiteral(dst, lit []byte) int
//
// All local variables fit into registers. The register allocation:
// - R3 len(lit)
// - R4 n
// - R6 return value
// - R8 &dst[i]
// - R10 &lit[0]
//
// The 32 bytes of stack space is to call runtime·memmove.
//
// The unusual register allocation of local variables, such as R10 for the
// source pointer, matches the allocation used at the call site in encodeBlock,
// which makes it easier to manually inline this function.
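//
// As a rough reading aid (a sketch in the spirit of emitLiteral in
// encode_other.go, not necessarily the exact upstream Go code), the tag
// encoding below is:
//
//	i, n := 0, uint32(len(lit)-1)
//	switch {
//	case n < 60: // 1-byte tag
//		dst[0] = uint8(n)<<2 | tagLiteral
//		i = 1
//	case n < 1<<8: // 2-byte tag
//		dst[0], dst[1] = 60<<2|tagLiteral, uint8(n) // 60<<2 == 0xf0
//		i = 2
//	default: // 3-byte tag (block sizes are capped, so larger tags never occur here)
//		dst[0], dst[1], dst[2] = 61<<2|tagLiteral, uint8(n), uint8(n>>8) // 61<<2 == 0xf4
//		i = 3
//	}
//	return i + copy(dst[i:], lit)
//
// where tagLiteral is the constant 0x00 from the format description.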
TEXT ·emitLiteral(SB), NOSPLIT, $32-56
	MOVD dst_base+0(FP), R8
	MOVD lit_base+24(FP), R10
	MOVD lit_len+32(FP), R3
	MOVD R3, R6
	MOVW R3, R4
	SUBW $1, R4, R4

	CMPW $60, R4
	BLT  oneByte
	CMPW $256, R4
	BLT  twoBytes

threeBytes:
	MOVD $0xf4, R2
	MOVB R2, 0(R8)
	MOVW R4, 1(R8)
	ADD  $3, R8, R8
	ADD  $3, R6, R6
	B    memmove

twoBytes:
	MOVD $0xf0, R2
	MOVB R2, 0(R8)
	MOVB R4, 1(R8)
	ADD  $2, R8, R8
	ADD  $2, R6, R6
	B    memmove

oneByte:
	LSLW $2, R4, R4
	MOVB R4, 0(R8)
	ADD  $1, R8, R8
	ADD  $1, R6, R6

memmove:
	MOVD R6, ret+48(FP)

	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// R8, R10 and R3 as arguments.
	MOVD R8, 8(RSP)
	MOVD R10, 16(RSP)
	MOVD R3, 24(RSP)
	CALL runtime·memmove(SB)
	RET
// ----------------------------------------------------------------------------

// func emitCopy(dst []byte, offset, length int) int
//
// All local variables fit into registers. The register allocation:
// - R3 length
// - R7 &dst[0]
// - R8 &dst[i]
// - R11 offset
//
// The unusual register allocation of local variables, such as R11 for the
// offset, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
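//
// As a rough reading aid (a sketch along the lines of emitCopy in
// encode_other.go, not necessarily the exact upstream Go code):
//
//	i := 0
//	for length >= 68 {
//		// Emit a length 64 copy, encoded as 3 bytes (tagCopy2).
//		dst[i+0] = 63<<2 | tagCopy2 // 0xfe
//		dst[i+1] = uint8(offset)
//		dst[i+2] = uint8(offset >> 8)
//		i, length = i+3, length-64
//	}
//	if length > 64 {
//		// Emit a length 60 copy, leaving at least 4 bytes for the last one.
//		dst[i+0] = 59<<2 | tagCopy2 // 0xee
//		dst[i+1] = uint8(offset)
//		dst[i+2] = uint8(offset >> 8)
//		i, length = i+3, length-60
//	}
//	if length >= 12 || offset >= 2048 {
//		// Emit the remaining copy, encoded as 3 bytes (tagCopy2).
//		dst[i+0] = uint8(length-1)<<2 | tagCopy2
//		dst[i+1] = uint8(offset)
//		dst[i+2] = uint8(offset >> 8)
//		return i + 3
//	}
//	// Emit the remaining copy, encoded as 2 bytes (tagCopy1).
//	dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
//	dst[i+1] = uint8(offset)
//	return i + 2
//
// where tagCopy1 and tagCopy2 are the constants 0x01 and 0x02.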
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	MOVD dst_base+0(FP), R8
	MOVD R8, R7
	MOVD offset+24(FP), R11
	MOVD length+32(FP), R3

loop0:
	// for length >= 68 { etc }
	CMPW $68, R3
	BLT  step1

	// Emit a length 64 copy, encoded as 3 bytes.
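	// (The tag byte 0xfe is (64-1)<<2 | tagCopy2, and 0xee below is
	// (60-1)<<2 | tagCopy2. The MOVW stores the offset's two significant
	// bytes little-endian; the two extra bytes it writes fall beyond the 3
	// bytes accounted for here and are either overwritten by later output or
	// ignored.)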
	MOVD $0xfe, R2
	MOVB R2, 0(R8)
	MOVW R11, 1(R8)
	ADD  $3, R8, R8
	SUB  $64, R3, R3
	B    loop0

step1:
	// if length > 64 { etc }
	CMP $64, R3
	BLE step2

	// Emit a length 60 copy, encoded as 3 bytes.
	MOVD $0xee, R2
	MOVB R2, 0(R8)
	MOVW R11, 1(R8)
	ADD  $3, R8, R8
	SUB  $60, R3, R3

step2:
	// if length >= 12 || offset >= 2048 { goto step3 }
	CMP  $12, R3
	BGE  step3
	CMPW $2048, R11
	BGE  step3

	// Emit the remaining copy, encoded as 2 bytes.
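	// (This is the tagCopy1 form: dst[1] holds the low 8 offset bits, and
	// dst[0] packs the high 3 offset bits, length-4 and the tag:
	//
	//	dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | 1
	//
	// which is what the shifts, masks and ORs below build up in R11.)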
	MOVB R11, 1(R8)
	LSRW $3, R11, R11
	AND  $0xe0, R11, R11
	SUB  $4, R3, R3
	AND  $0xff, R3, R3
	LSLW $2, R3, R3
	ORRW R3, R11, R11
	ORRW $1, R11, R11
	MOVB R11, 0(R8)
	ADD  $2, R8, R8

	// Return the number of bytes written.
	SUB  R7, R8, R8
	MOVD R8, ret+40(FP)
	RET

step3:
	// Emit the remaining copy, encoded as 3 bytes.
	SUB  $1, R3, R3
	AND  $0xff, R3, R3
	LSLW $2, R3, R3
	ORRW $2, R3, R3
	MOVB R3, 0(R8)
	MOVW R11, 1(R8)
	ADD  $3, R8, R8

	// Return the number of bytes written.
	SUB  R7, R8, R8
	MOVD R8, ret+40(FP)
	RET
// ----------------------------------------------------------------------------

// func extendMatch(src []byte, i, j int) int
//
// All local variables fit into registers. The register allocation:
// - R6 &src[0]
// - R7 &src[j]
// - R13 &src[len(src) - 8]
// - R14 &src[len(src)]
// - R15 &src[i]
//
// The unusual register allocation of local variables, such as R15 for a source
// pointer, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
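//
// As a rough reading aid, the pure Go extendMatch in encode_other.go is
// essentially:
//
//	for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
//	}
//	return j
//
// The asm version below compares 8 bytes at a time while at least 8 bytes
// remain before the end of src, then falls back to byte-at-a-time comparison
// in the tail.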
TEXT ·extendMatch(SB), NOSPLIT, $0-48
	MOVD src_base+0(FP), R6
	MOVD src_len+8(FP), R14
	MOVD i+24(FP), R15
	MOVD j+32(FP), R7
	ADD  R6, R14, R14
	ADD  R6, R15, R15
	ADD  R6, R7, R7
	MOVD R14, R13
	SUB  $8, R13, R13

cmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMP  R13, R7
	BHI  cmp1
	MOVD (R15), R3
	MOVD (R7), R4
	CMP  R4, R3
	BNE  bsf
	ADD  $8, R15, R15
	ADD  $8, R7, R7
	B    cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs.
	// RBIT reverses the bit order, then CLZ counts the leading zeros, the
	// combination of which finds the least significant bit which is set.
	// The arm64 architecture is little-endian, and the shift by 3 converts
	// a bit index to a byte index.
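	//
	// In Go terms, this is roughly (a sketch using math/bits, not the
	// upstream code):
	//
	//	x := load64(src, i) ^ load64(src, j) // x != 0 since the bytes differed
	//	return j + bits.TrailingZeros64(x)>>3
	//
	// with RBIT+CLZ playing the role of TrailingZeros64.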
	EOR  R3, R4, R4
	RBIT R4, R4
	CLZ  R4, R4
	ADD  R4>>3, R7, R7

	// Convert from &src[ret] to ret.
	SUB  R6, R7, R7
	MOVD R7, ret+40(FP)
	RET

cmp1:
	// In src's tail, compare 1 byte at a time.
	CMP  R7, R14
	BLS  extendMatchEnd
	MOVB (R15), R3
	MOVB (R7), R4
	CMP  R4, R3
	BNE  extendMatchEnd
	ADD  $1, R15, R15
	ADD  $1, R7, R7
	B    cmp1

extendMatchEnd:
	// Convert from &src[ret] to ret.
	SUB  R6, R7, R7
	MOVD R7, ret+40(FP)
	RET
// ----------------------------------------------------------------------------

// func encodeBlock(dst, src []byte) (d int)
//
// All local variables fit into registers, other than "var table". The register
// allocation:
// - R3 . .
// - R4 . .
// - R5 64 shift
// - R6 72 &src[0], tableSize
// - R7 80 &src[s]
// - R8 88 &dst[d]
// - R9 96 sLimit
// - R10 . &src[nextEmit]
// - R11 104 prevHash, currHash, nextHash, offset
// - R12 112 &src[base], skip
// - R13 . &src[nextS], &src[len(src) - 8]
// - R14 . len(src), bytesBetweenHashLookups, &src[len(src)], x
// - R15 120 candidate
// - R16 . hash constant, 0x1e35a7bd
// - R17 . &table
// - . 128 table
//
// The second column (64, 72, etc) is the stack offset to spill the registers
// when calling other functions. We could pack this slightly tighter, but it's
// simpler to have a dedicated spill map independent of the function called.
//
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
// extra 64 bytes, to call other functions, and an extra 64 bytes, to spill
// local variables (registers) during calls gives 32768 + 64 + 64 = 32896.
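//
// As a rough reading aid, the hash used throughout this function (cf.
// encode_other.go) is
//
//	func hash(u, shift uint32) uint32 {
//		return (u * 0x1e35a7bd) >> shift
//	}
//
// R16 holds the constant 0x1e35a7bd and R5 holds shift, so each hash below is
// a 4-byte load (MOVW), a MULW by R16 and a LSRW by R5.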
TEXT ·encodeBlock(SB), 0, $32896-56
	MOVD dst_base+0(FP), R8
	MOVD src_base+24(FP), R7
	MOVD src_len+32(FP), R14

	// shift, tableSize := uint32(32-8), 1<<8
	MOVD  $24, R5
	MOVD  $256, R6
	MOVW  $0xa7bd, R16
	MOVKW $(0x1e35<<16), R16

calcShift:
	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
	//	shift--
	// }
	MOVD $16384, R2
	CMP  R2, R6
	BGE  varTable
	CMP  R14, R6
	BGE  varTable
	SUB  $1, R5, R5
	LSL  $1, R6, R6
	B    calcShift

varTable:
	// var table [maxTableSize]uint16
	//
	// In the asm code, unlike the Go code, we can zero-initialize only the
	// first tableSize elements. Each uint16 element is 2 bytes and each
	// iteration writes 64 bytes, so we can do only tableSize/32 writes
	// instead of the 2048 writes that would zero-initialize all of table's
	// 32768 bytes. This clear could overrun the first tableSize elements, but
	// it won't overrun the allocated stack size.
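	// (Worked numbers: at the maximum tableSize of 16384 entries the loop
	// below runs 16384/32 = 512 times, each iteration clearing 32 entries
	// with four 16-byte STP stores, i.e. 512*64 = 32768 bytes in total.)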
	ADD  $128, RSP, R17
	MOVD R17, R4

	// !!! R6 = &table[tableSize]
	ADD R6<<1, R17, R6

memclr:
	STP.P (ZR, ZR), 64(R4)
	STP   (ZR, ZR), -48(R4)
	STP   (ZR, ZR), -32(R4)
	STP   (ZR, ZR), -16(R4)
	CMP   R4, R6
	BHI   memclr

	// !!! R6 = &src[0]
	MOVD R7, R6

	// sLimit := len(src) - inputMargin
	MOVD R14, R9
	SUB  $15, R9, R9
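	// (inputMargin is the constant 15 in encode.go, hence the literal 15
	// above.)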

	// !!! Pre-emptively spill R5, R6 and R9 to the stack. Their values don't
	// change for the rest of the function.
	MOVD R5, 64(RSP)
	MOVD R6, 72(RSP)
	MOVD R9, 96(RSP)

	// nextEmit := 0
	MOVD R6, R10

	// s := 1
	ADD $1, R7, R7

	// nextHash := hash(load32(src, s), shift)
	MOVW 0(R7), R11
	MULW R16, R11, R11
	LSRW R5, R11, R11

outer:
	// for { etc }

	// skip := 32
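	//
	// skip is effectively a fixed-point counter with 5 fractional bits: the
	// probe step bytesBetweenHashLookups is skip>>5, so the step stays at 1
	// for roughly the first 32 misses and then grows, trading compression
	// ratio for speed on incompressible input (see the heuristic match
	// skipping comment in encode_other.go).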
	MOVD $32, R12

	// nextS := s
	MOVD R7, R13

	// candidate := 0
	MOVD $0, R15

inner0:
	// for { etc }

	// s := nextS
	MOVD R13, R7

	// bytesBetweenHashLookups := skip >> 5
	MOVD R12, R14
	LSR  $5, R14, R14

	// nextS = s + bytesBetweenHashLookups
	ADD R14, R13, R13

	// skip += bytesBetweenHashLookups
	ADD R14, R12, R12

	// if nextS > sLimit { goto emitRemainder }
	MOVD R13, R3
	SUB  R6, R3, R3
	CMP  R9, R3
	BHI  emitRemainder

	// candidate = int(table[nextHash])
	MOVHU 0(R17)(R11<<1), R15

	// table[nextHash] = uint16(s)
	MOVD R7, R3
	SUB  R6, R3, R3

	MOVH R3, 0(R17)(R11<<1)

	// nextHash = hash(load32(src, nextS), shift)
	MOVW 0(R13), R11
	MULW R16, R11
	LSRW R5, R11, R11

	// if load32(src, s) != load32(src, candidate) { continue } break
	MOVW 0(R7), R3
	MOVW (R6)(R15), R4
	CMPW R4, R3
	BNE  inner0

fourByteMatch:
	// As per the encode_other.go code:
	//
	// A 4-byte match has been found. We'll later see etc.

	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
	// on inputMargin in encode.go.
	MOVD R7, R3
	SUB  R10, R3, R3
	CMP  $16, R3
	BLE  emitLiteralFastPath

	// ----------------------------------------
	// Begin inline of the emitLiteral call.
	//
	// d += emitLiteral(dst[d:], src[nextEmit:s])
	MOVW R3, R4
	SUBW $1, R4, R4

	MOVW $60, R2
	CMPW R2, R4
	BLT  inlineEmitLiteralOneByte
	MOVW $256, R2
	CMPW R2, R4
	BLT  inlineEmitLiteralTwoBytes

inlineEmitLiteralThreeBytes:
	MOVD $0xf4, R1
	MOVB R1, 0(R8)
	MOVW R4, 1(R8)
	ADD  $3, R8, R8
	B    inlineEmitLiteralMemmove

inlineEmitLiteralTwoBytes:
	MOVD $0xf0, R1
	MOVB R1, 0(R8)
	MOVB R4, 1(R8)
	ADD  $2, R8, R8
	B    inlineEmitLiteralMemmove

inlineEmitLiteralOneByte:
	LSLW $2, R4, R4
	MOVB R4, 0(R8)
	ADD  $1, R8, R8

inlineEmitLiteralMemmove:
	// Spill local variables (registers) onto the stack; call; unspill.
	//
	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// R8, R10 and R3 as arguments.
	MOVD R8, 8(RSP)
	MOVD R10, 16(RSP)
	MOVD R3, 24(RSP)

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	ADD R3, R8, R8

	MOVD  R7, 80(RSP)
	MOVD  R8, 88(RSP)
	MOVD  R15, 120(RSP)
	CALL  runtime·memmove(SB)
	MOVD  64(RSP), R5
	MOVD  72(RSP), R6
	MOVD  80(RSP), R7
	MOVD  88(RSP), R8
	MOVD  96(RSP), R9
	MOVD  120(RSP), R15
	ADD   $128, RSP, R17
	MOVW  $0xa7bd, R16
	MOVKW $(0x1e35<<16), R16
	B     inner1

inlineEmitLiteralEnd:
	// End inline of the emitLiteral call.
	// ----------------------------------------

emitLiteralFastPath:
	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
	MOVB R3, R4
	SUBW $1, R4, R4
	AND  $0xff, R4, R4
	LSLW $2, R4, R4
	MOVB R4, (R8)
	ADD  $1, R8, R8

	// !!! Implement the copy from lit to dst as a 16-byte load and store.
	// (Encode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
	// OK. Subsequent iterations will fix up the overrun.
	//
	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
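	//
	// In Go terms this fast path is roughly (a sketch):
	//
	//	copy(dst[d:d+16], src[nextEmit:nextEmit+16]) // may copy past len(lit)
	//	d += len(lit)                                // the overrun is overwritten later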
	LDP 0(R10), (R0, R1)
	STP (R0, R1), 0(R8)
	ADD R3, R8, R8

inner1:
	// for { etc }

	// base := s
	MOVD R7, R12

	// !!! offset := base - candidate
	MOVD R12, R11
	SUB  R15, R11, R11
	SUB  R6, R11, R11

	// ----------------------------------------
	// Begin inline of the extendMatch call.
	//
	// s = extendMatch(src, candidate+4, s+4)

	// !!! R14 = &src[len(src)]
	MOVD src_len+32(FP), R14
	ADD  R6, R14, R14

	// !!! R13 = &src[len(src) - 8]
	MOVD R14, R13
	SUB  $8, R13, R13

	// !!! R15 = &src[candidate + 4]
	ADD $4, R15, R15
	ADD R6, R15, R15

	// !!! s += 4
	ADD $4, R7, R7

inlineExtendMatchCmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMP  R13, R7
	BHI  inlineExtendMatchCmp1
	MOVD (R15), R3
	MOVD (R7), R4
	CMP  R4, R3
	BNE  inlineExtendMatchBSF
	ADD  $8, R15, R15
	ADD  $8, R7, R7
	B    inlineExtendMatchCmp8

inlineExtendMatchBSF:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs.
	// RBIT reverses the bit order, then CLZ counts the leading zeros, the
	// combination of which finds the least significant bit which is set.
	// The arm64 architecture is little-endian, and the shift by 3 converts
	// a bit index to a byte index.
	EOR  R3, R4, R4
	RBIT R4, R4
	CLZ  R4, R4
	ADD  R4>>3, R7, R7
	B    inlineExtendMatchEnd

inlineExtendMatchCmp1:
	// In src's tail, compare 1 byte at a time.
	CMP  R7, R14
	BLS  inlineExtendMatchEnd
	MOVB (R15), R3
	MOVB (R7), R4
	CMP  R4, R3
	BNE  inlineExtendMatchEnd
	ADD  $1, R15, R15
	ADD  $1, R7, R7
	B    inlineExtendMatchCmp1

inlineExtendMatchEnd:
	// End inline of the extendMatch call.
	// ----------------------------------------

	// ----------------------------------------
	// Begin inline of the emitCopy call.
	//
	// d += emitCopy(dst[d:], base-candidate, s-base)

	// !!! length := s - base
	MOVD R7, R3
	SUB  R12, R3, R3

inlineEmitCopyLoop0:
	// for length >= 68 { etc }
	MOVW $68, R2
	CMPW R2, R3
	BLT  inlineEmitCopyStep1

	// Emit a length 64 copy, encoded as 3 bytes.
	MOVD $0xfe, R1
	MOVB R1, 0(R8)
	MOVW R11, 1(R8)
	ADD  $3, R8, R8
	SUBW $64, R3, R3
	B    inlineEmitCopyLoop0

inlineEmitCopyStep1:
	// if length > 64 { etc }
	MOVW $64, R2
	CMPW R2, R3
	BLE  inlineEmitCopyStep2

	// Emit a length 60 copy, encoded as 3 bytes.
	MOVD $0xee, R1
	MOVB R1, 0(R8)
	MOVW R11, 1(R8)
	ADD  $3, R8, R8
	SUBW $60, R3, R3

inlineEmitCopyStep2:
	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
	MOVW $12, R2
	CMPW R2, R3
	BGE  inlineEmitCopyStep3
	MOVW $2048, R2
	CMPW R2, R11
	BGE  inlineEmitCopyStep3

	// Emit the remaining copy, encoded as 2 bytes.
	MOVB R11, 1(R8)
	LSRW $8, R11, R11
	LSLW $5, R11, R11
	SUBW $4, R3, R3
	AND  $0xff, R3, R3
	LSLW $2, R3, R3
	ORRW R3, R11, R11
	ORRW $1, R11, R11
	MOVB R11, 0(R8)
	ADD  $2, R8, R8
	B    inlineEmitCopyEnd

inlineEmitCopyStep3:
	// Emit the remaining copy, encoded as 3 bytes.
	SUBW $1, R3, R3
	LSLW $2, R3, R3
	ORRW $2, R3, R3
	MOVB R3, 0(R8)
	MOVW R11, 1(R8)
	ADD  $3, R8, R8

inlineEmitCopyEnd:
	// End inline of the emitCopy call.
	// ----------------------------------------

	// nextEmit = s
	MOVD R7, R10

	// if s >= sLimit { goto emitRemainder }
	MOVD R7, R3
	SUB  R6, R3, R3
	CMP  R3, R9
	BLS  emitRemainder

	// As per the encode_other.go code:
	//
	// We could immediately etc.
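	//
	// Roughly, the next batch of instructions corresponds to this Go (a
	// sketch; see encode_other.go for the exact code):
	//
	//	x := load64(src, s-1)
	//	prevHash := hash(uint32(x>>0), shift)
	//	table[prevHash] = uint16(s - 1)
	//	currHash := hash(uint32(x>>8), shift)
	//	candidate = int(table[currHash])
	//	table[currHash] = uint16(s)
	//	if uint32(x>>8) == load32(src, candidate) {
	//		continue // the inner1 loop
	//	}
	//	nextHash = hash(uint32(x>>16), shift)
	//	s++
	//	break // out of the inner1 loop, i.e. continue the outer loop
	//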
	// x := load64(src, s-1)
	MOVD -1(R7), R14

	// prevHash := hash(uint32(x>>0), shift)
	MOVW R14, R11
	MULW R16, R11, R11
	LSRW R5, R11, R11

	// table[prevHash] = uint16(s-1)
	MOVD R7, R3
	SUB  R6, R3, R3
	SUB  $1, R3, R3

	MOVHU R3, 0(R17)(R11<<1)

	// currHash := hash(uint32(x>>8), shift)
	LSR  $8, R14, R14
	MOVW R14, R11
	MULW R16, R11, R11
	LSRW R5, R11, R11

	// candidate = int(table[currHash])
	MOVHU 0(R17)(R11<<1), R15

	// table[currHash] = uint16(s)
	ADD   $1, R3, R3
	MOVHU R3, 0(R17)(R11<<1)

	// if uint32(x>>8) == load32(src, candidate) { continue }
	MOVW (R6)(R15), R4
	CMPW R4, R14
	BEQ  inner1

	// nextHash = hash(uint32(x>>16), shift)
	LSR  $8, R14, R14
	MOVW R14, R11
	MULW R16, R11, R11
	LSRW R5, R11, R11

	// s++
	ADD $1, R7, R7

	// break out of the inner1 for loop, i.e. continue the outer loop.
	B outer

emitRemainder:
	// if nextEmit < len(src) { etc }
	MOVD src_len+32(FP), R3
	ADD  R6, R3, R3
	CMP  R3, R10
	BEQ  encodeBlockEnd

	// d += emitLiteral(dst[d:], src[nextEmit:])
	//
	// Push args.
	MOVD R8, 8(RSP)
	MOVD $0, 16(RSP)  // Unnecessary, as the callee ignores it, but conservative.
	MOVD $0, 24(RSP)  // Unnecessary, as the callee ignores it, but conservative.
	MOVD R10, 32(RSP)
	SUB  R10, R3, R3
	MOVD R3, 40(RSP)
	MOVD R3, 48(RSP)  // Unnecessary, as the callee ignores it, but conservative.

	// Spill local variables (registers) onto the stack; call; unspill.
	MOVD R8, 88(RSP)

	CALL ·emitLiteral(SB)

	MOVD 88(RSP), R8

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	MOVD 56(RSP), R1
	ADD  R1, R8, R8

encodeBlockEnd:
	MOVD dst_base+0(FP), R3
	SUB  R3, R8, R8
	MOVD R8, d+48(FP)
	RET