- // Copyright 2020 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
-
- // +build !appengine
- // +build gc
- // +build !noasm
-
- #include "textflag.h"
- // The asm code generally follows the pure Go code in encode_other.go, except
- // where marked with a "!!!".
- // ----------------------------------------------------------------------------
- // func emitLiteral(dst, lit []byte) int
- //
- // All local variables fit into registers. The register allocation:
- // - R3 len(lit)
- // - R4 n
- // - R6 return value
- // - R8 &dst[i]
- // - R10 &lit[0]
- //
- // The 32 bytes of stack space is to call runtime·memmove.
- //
- // The unusual register allocation of local variables, such as R10 for the
- // source pointer, matches the allocation used at the call site in encodeBlock,
- // which makes it easier to manually inline this function.
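- //
- // For reference, this routine mirrors the following pure Go code (a
- // paraphrased sketch of emitLiteral in encode_other.go, not verbatim;
- // tagLiteral is that package's constant 0x00):
- //
- //	func emitLiteral(dst, lit []byte) int {
- //		i, n := 0, uint(len(lit)-1)
- //		switch {
- //		case n < 60:
- //			dst[0] = uint8(n)<<2 | tagLiteral
- //			i = 1
- //		case n < 1<<8:
- //			dst[0] = 60<<2 | tagLiteral // 0xf0
- //			dst[1] = uint8(n)
- //			i = 2
- //		default:
- //			dst[0] = 61<<2 | tagLiteral // 0xf4
- //			dst[1] = uint8(n)
- //			dst[2] = uint8(n >> 8)
- //			i = 3
- //		}
- //		return i + copy(dst[i:], lit)
- //	}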
- TEXT ·emitLiteral(SB), NOSPLIT, $32-56
- MOVD dst_base+0(FP), R8
- MOVD lit_base+24(FP), R10
- MOVD lit_len+32(FP), R3
- MOVD R3, R6
- MOVW R3, R4
- SUBW $1, R4, R4
- CMPW $60, R4
- BLT oneByte
- CMPW $256, R4
- BLT twoBytes
- threeBytes:
- MOVD $0xf4, R2
- MOVB R2, 0(R8)
- MOVW R4, 1(R8)
- ADD $3, R8, R8
- ADD $3, R6, R6
- B memmove
- twoBytes:
- MOVD $0xf0, R2
- MOVB R2, 0(R8)
- MOVB R4, 1(R8)
- ADD $2, R8, R8
- ADD $2, R6, R6
- B memmove
- oneByte:
- LSLW $2, R4, R4
- MOVB R4, 0(R8)
- ADD $1, R8, R8
- ADD $1, R6, R6
- memmove:
- MOVD R6, ret+48(FP)
- // copy(dst[i:], lit)
- //
- // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
- // R8, R10 and R3 as arguments.
- MOVD R8, 8(RSP)
- MOVD R10, 16(RSP)
- MOVD R3, 24(RSP)
- CALL runtime·memmove(SB)
- RET
- // ----------------------------------------------------------------------------
- // func emitCopy(dst []byte, offset, length int) int
- //
- // All local variables fit into registers. The register allocation:
- // - R3 length
- // - R7 &dst[0]
- // - R8 &dst[i]
- // - R11 offset
- //
- // The unusual register allocation of local variables, such as R11 for the
- // offset, matches the allocation used at the call site in encodeBlock, which
- // makes it easier to manually inline this function.
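- //
- // For reference, this routine mirrors the following pure Go code (a
- // paraphrased sketch of emitCopy in encode_other.go; tagCopy1 = 0x01 and
- // tagCopy2 = 0x02 are that package's constants):
- //
- //	func emitCopy(dst []byte, offset, length int) int {
- //		i := 0
- //		for length >= 68 {
- //			dst[i+0] = 63<<2 | tagCopy2 // 0xfe: a length 64 copy
- //			dst[i+1] = uint8(offset)
- //			dst[i+2] = uint8(offset >> 8)
- //			i, length = i+3, length-64
- //		}
- //		if length > 64 {
- //			dst[i+0] = 59<<2 | tagCopy2 // 0xee: a length 60 copy
- //			dst[i+1] = uint8(offset)
- //			dst[i+2] = uint8(offset >> 8)
- //			i, length = i+3, length-60
- //		}
- //		if length >= 12 || offset >= 2048 {
- //			dst[i+0] = uint8(length-1)<<2 | tagCopy2
- //			dst[i+1] = uint8(offset)
- //			dst[i+2] = uint8(offset >> 8)
- //			return i + 3
- //		}
- //		dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
- //		dst[i+1] = uint8(offset)
- //		return i + 2
- //	}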
- TEXT ·emitCopy(SB), NOSPLIT, $0-48
- MOVD dst_base+0(FP), R8
- MOVD R8, R7
- MOVD offset+24(FP), R11
- MOVD length+32(FP), R3
- loop0:
- // for length >= 68 { etc }
- CMPW $68, R3
- BLT step1
- // Emit a length 64 copy, encoded as 3 bytes.
- MOVD $0xfe, R2
- MOVB R2, 0(R8)
- MOVW R11, 1(R8)
- ADD $3, R8, R8
- SUB $64, R3, R3
- B loop0
- step1:
- // if length > 64 { etc }
- CMP $64, R3
- BLE step2
- // Emit a length 60 copy, encoded as 3 bytes.
- MOVD $0xee, R2
- MOVB R2, 0(R8)
- MOVW R11, 1(R8)
- ADD $3, R8, R8
- SUB $60, R3, R3
- step2:
- // if length >= 12 || offset >= 2048 { goto step3 }
- CMP $12, R3
- BGE step3
- CMPW $2048, R11
- BGE step3
- // Emit the remaining copy, encoded as 2 bytes.
- MOVB R11, 1(R8)
- LSRW $3, R11, R11
- AND $0xe0, R11, R11
- SUB $4, R3, R3
- AND $0xff, R3, R3
- LSLW $2, R3, R3
- ORRW R3, R11, R11
- ORRW $1, R11, R11
- MOVB R11, 0(R8)
- ADD $2, R8, R8
- // Return the number of bytes written.
- SUB R7, R8, R8
- MOVD R8, ret+40(FP)
- RET
- step3:
- // Emit the remaining copy, encoded as 3 bytes.
- SUB $1, R3, R3
- AND $0xff, R3, R3
- LSLW $2, R3, R3
- ORRW $2, R3, R3
- MOVB R3, 0(R8)
- MOVW R11, 1(R8)
- ADD $3, R8, R8
- // Return the number of bytes written.
- SUB R7, R8, R8
- MOVD R8, ret+40(FP)
- RET
- // ----------------------------------------------------------------------------
- // func extendMatch(src []byte, i, j int) int
- //
- // All local variables fit into registers. The register allocation:
- // - R6 &src[0]
- // - R7 &src[j]
- // - R13 &src[len(src) - 8]
- // - R14 &src[len(src)]
- // - R15 &src[i]
- //
- // The unusual register allocation of local variables, such as R15 for a source
- // pointer, matches the allocation used at the call site in encodeBlock, which
- // makes it easier to manually inline this function.
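- //
- // The pure Go equivalent is tiny (paraphrasing extendMatch in
- // encode_other.go); the asm version wins by comparing 8 bytes per
- // iteration instead of 1:
- //
- //	func extendMatch(src []byte, i, j int) int {
- //		for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
- //		}
- //		return j
- //	}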
- TEXT ·extendMatch(SB), NOSPLIT, $0-48
- MOVD src_base+0(FP), R6
- MOVD src_len+8(FP), R14
- MOVD i+24(FP), R15
- MOVD j+32(FP), R7
- ADD R6, R14, R14
- ADD R6, R15, R15
- ADD R6, R7, R7
- MOVD R14, R13
- SUB $8, R13, R13
- cmp8:
- // As long as we are 8 or more bytes before the end of src, we can load and
- // compare 8 bytes at a time. If those 8 bytes are equal, repeat.
- CMP R13, R7
- BHI cmp1
- MOVD (R15), R3
- MOVD (R7), R4
- CMP R4, R3
- BNE bsf
- ADD $8, R15, R15
- ADD $8, R7, R7
- B cmp8
- bsf:
- // If those 8 bytes were not equal, XOR the two 8 byte values, and return
- // the index of the first byte that differs.
- // RBIT reverses the bit order, then CLZ counts the leading zeros, the
- // combination of which finds the least significant bit which is set.
- // The arm64 architecture is little-endian, and the shift by 3 converts
- // a bit index to a byte index.
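- //
- // In Go terms, RBIT+CLZ is bits.TrailingZeros64, so this block computes
- // the following (a sketch using the standard math/bits and encoding/binary
- // packages):
- //
- //	x := binary.LittleEndian.Uint64(src[i:]) ^ binary.LittleEndian.Uint64(src[j:])
- //	j += bits.TrailingZeros64(x) >> 3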
- EOR R3, R4, R4
- RBIT R4, R4
- CLZ R4, R4
- ADD R4>>3, R7, R7
- // Convert from &src[ret] to ret.
- SUB R6, R7, R7
- MOVD R7, ret+40(FP)
- RET
- cmp1:
- // In src's tail, compare 1 byte at a time.
- CMP R7, R14
- BLS extendMatchEnd
- MOVB (R15), R3
- MOVB (R7), R4
- CMP R4, R3
- BNE extendMatchEnd
- ADD $1, R15, R15
- ADD $1, R7, R7
- B cmp1
- extendMatchEnd:
- // Convert from &src[ret] to ret.
- SUB R6, R7, R7
- MOVD R7, ret+40(FP)
- RET
- // ----------------------------------------------------------------------------
- // func encodeBlock(dst, src []byte) (d int)
- //
- // All local variables fit into registers, other than "var table". The register
- // allocation:
- // - R3 . .
- // - R4 . .
- // - R5 64 shift
- // - R6 72 &src[0], tableSize
- // - R7 80 &src[s]
- // - R8 88 &dst[d]
- // - R9 96 sLimit
- // - R10 . &src[nextEmit]
- // - R11 104 prevHash, currHash, nextHash, offset
- // - R12 112 &src[base], skip
- // - R13 . &src[nextS], &src[len(src) - 8]
- // - R14 . len(src), bytesBetweenHashLookups, &src[len(src)], x
- // - R15 120 candidate
- // - R16 . hash constant, 0x1e35a7bd
- // - R17 . &table
- // - . 128 table
- //
- // The second column (64, 72, etc) is the stack offset to spill the registers
- // when calling other functions. We could pack this slightly tighter, but it's
- // simpler to have a dedicated spill map independent of the function called.
- //
- // "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
- // extra 64 bytes, to call other functions, and an extra 64 bytes, to spill
- // local variables (registers) during calls gives 32768 + 64 + 64 = 32896.
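- //
- // As a roadmap, the inner match-finding loop that this routine implements
- // is, roughly (a paraphrased sketch of encodeBlock in encode_other.go;
- // hash, load32, tableMask and sLimit are that file's helpers and locals):
- //
- //	for {
- //		skip, nextS, candidate := 32, s, 0
- //		for {
- //			s = nextS
- //			bytesBetweenHashLookups := skip >> 5
- //			nextS = s + bytesBetweenHashLookups
- //			skip += bytesBetweenHashLookups
- //			if nextS > sLimit {
- //				goto emitRemainder
- //			}
- //			candidate = int(table[nextHash&tableMask])
- //			table[nextHash&tableMask] = uint16(s)
- //			nextHash = hash(load32(src, nextS), shift)
- //			if load32(src, s) == load32(src, candidate) {
- //				break
- //			}
- //		}
- //		d += emitLiteral(dst[d:], src[nextEmit:s])
- //		// ... extend the match, emit a copy, update the hash table ...
- //	}
- //
- // Note that the asm code needs no tableMask: calcShift below picks shift so
- // that every hash value is already less than tableSize.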
- TEXT ·encodeBlock(SB), 0, $32896-56
- MOVD dst_base+0(FP), R8
- MOVD src_base+24(FP), R7
- MOVD src_len+32(FP), R14
- // shift, tableSize := uint32(32-8), 1<<8
- MOVD $24, R5
- MOVD $256, R6
- MOVW $0xa7bd, R16
- MOVKW $(0x1e35<<16), R16
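- // R16 now holds the hash multiplier; hash in encode.go is
- //
- //	func hash(u, shift uint32) uint32 {
- //		return (u * 0x1e35a7bd) >> shift
- //	}
- //
- // so each MULW+LSRW pair below computes one hash.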
- calcShift:
- // for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
- // shift--
- // }
- MOVD $16384, R2
- CMP R2, R6
- BGE varTable
- CMP R14, R6
- BGE varTable
- SUB $1, R5, R5
- LSL $1, R6, R6
- B calcShift
- varTable:
- // var table [maxTableSize]uint16
- //
- // In the asm code, unlike the Go code, we can zero-initialize only the
- // first tableSize elements. Each uint16 element is 2 bytes and each
- // iteration of the memclr loop below writes 64 bytes, so we need only
- // tableSize/32 iterations instead of the 512 that would zero-initialize
- // all of table's 32768 bytes. This clear could overrun the first tableSize
- // elements, but it won't overrun the allocated stack size.
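- //
- // In Go, the equivalent partial clear would be simply (a sketch):
- //
- //	for i := range table[:tableSize] {
- //		table[i] = 0
- //	}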
- ADD $128, RSP, R17
- MOVD R17, R4
- // !!! R6 = &table[tableSize]
- ADD R6<<1, R17, R6
- memclr:
- STP.P (ZR, ZR), 64(R4)
- STP (ZR, ZR), -48(R4)
- STP (ZR, ZR), -32(R4)
- STP (ZR, ZR), -16(R4)
- CMP R4, R6
- BHI memclr
- // !!! R6 = &src[0]
- MOVD R7, R6
- // sLimit := len(src) - inputMargin
- MOVD R14, R9
- SUB $15, R9, R9
- // !!! Pre-emptively spill R5, R6 and R9 to the stack. Their values don't
- // change for the rest of the function.
- MOVD R5, 64(RSP)
- MOVD R6, 72(RSP)
- MOVD R9, 96(RSP)
- // nextEmit := 0
- MOVD R6, R10
- // s := 1
- ADD $1, R7, R7
- // nextHash := hash(load32(src, s), shift)
- MOVW 0(R7), R11
- MULW R16, R11, R11
- LSRW R5, R11, R11
- outer:
- // for { etc }
- // skip := 32
- MOVD $32, R12
- // nextS := s
- MOVD R7, R13
- // candidate := 0
- MOVD $0, R15
- inner0:
- // for { etc }
- // s := nextS
- MOVD R13, R7
- // bytesBetweenHashLookups := skip >> 5
- MOVD R12, R14
- LSR $5, R14, R14
- // nextS = s + bytesBetweenHashLookups
- ADD R14, R13, R13
- // skip += bytesBetweenHashLookups
- ADD R14, R12, R12
- // if nextS > sLimit { goto emitRemainder }
- MOVD R13, R3
- SUB R6, R3, R3
- CMP R9, R3
- BHI emitRemainder
- // candidate = int(table[nextHash])
- MOVHU 0(R17)(R11<<1), R15
- // table[nextHash] = uint16(s)
- MOVD R7, R3
- SUB R6, R3, R3
- MOVH R3, 0(R17)(R11<<1)
- // nextHash = hash(load32(src, nextS), shift)
- MOVW 0(R13), R11
- MULW R16, R11, R11
- LSRW R5, R11, R11
- // if load32(src, s) != load32(src, candidate) { continue } break
- MOVW 0(R7), R3
- MOVW (R6)(R15), R4
- CMPW R4, R3
- BNE inner0
- fourByteMatch:
- // As per the encode_other.go code:
- //
- // A 4-byte match has been found. We'll later see etc.
- // !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
- // on inputMargin in encode.go.
- MOVD R7, R3
- SUB R10, R3, R3
- CMP $16, R3
- BLE emitLiteralFastPath
- // ----------------------------------------
- // Begin inline of the emitLiteral call.
- //
- // d += emitLiteral(dst[d:], src[nextEmit:s])
- MOVW R3, R4
- SUBW $1, R4, R4
- MOVW $60, R2
- CMPW R2, R4
- BLT inlineEmitLiteralOneByte
- MOVW $256, R2
- CMPW R2, R4
- BLT inlineEmitLiteralTwoBytes
- inlineEmitLiteralThreeBytes:
- MOVD $0xf4, R1
- MOVB R1, 0(R8)
- MOVW R4, 1(R8)
- ADD $3, R8, R8
- B inlineEmitLiteralMemmove
- inlineEmitLiteralTwoBytes:
- MOVD $0xf0, R1
- MOVB R1, 0(R8)
- MOVB R4, 1(R8)
- ADD $2, R8, R8
- B inlineEmitLiteralMemmove
- inlineEmitLiteralOneByte:
- LSLW $2, R4, R4
- MOVB R4, 0(R8)
- ADD $1, R8, R8
- inlineEmitLiteralMemmove:
- // Spill local variables (registers) onto the stack; call; unspill.
- //
- // copy(dst[i:], lit)
- //
- // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
- // R8, R10 and R3 as arguments.
- MOVD R8, 8(RSP)
- MOVD R10, 16(RSP)
- MOVD R3, 24(RSP)
- // Finish the "d +=" part of "d += emitLiteral(etc)".
- ADD R3, R8, R8
- MOVD R7, 80(RSP)
- MOVD R8, 88(RSP)
- MOVD R15, 120(RSP)
- CALL runtime·memmove(SB)
- MOVD 64(RSP), R5
- MOVD 72(RSP), R6
- MOVD 80(RSP), R7
- MOVD 88(RSP), R8
- MOVD 96(RSP), R9
- MOVD 120(RSP), R15
- ADD $128, RSP, R17
- MOVW $0xa7bd, R16
- MOVKW $(0x1e35<<16), R16
- B inner1
- inlineEmitLiteralEnd:
- // End inline of the emitLiteral call.
- // ----------------------------------------
- emitLiteralFastPath:
- // !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
- MOVB R3, R4
- SUBW $1, R4, R4
- AND $0xff, R4, R4
- LSLW $2, R4, R4
- MOVB R4, (R8)
- ADD $1, R8, R8
- // !!! Implement the copy from lit to dst as a 16-byte load and store.
- // (Encode's documentation says that dst and src must not overlap.)
- //
- // This always copies 16 bytes, instead of only len(lit) bytes, but that's
- // OK. Subsequent iterations will fix up the overrun.
- //
- // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
- // 16-byte loads and stores. This technique probably wouldn't be as
- // effective on architectures that are fussier about alignment.
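- //
- // In Go terms, the fast path does roughly this (a sketch; the 16-byte load
- // is in bounds because nextEmit < s <= sLimit = len(src)-15, and dst has
- // room given its MaxEncodedLen sizing):
- //
- //	copy(dst[d:d+16], src[nextEmit:nextEmit+16])
- //	d += len(lit) // len(lit) = s - nextEmit <= 16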
- LDP 0(R10), (R0, R1)
- STP (R0, R1), 0(R8)
- ADD R3, R8, R8
- inner1:
- // for { etc }
- // base := s
- MOVD R7, R12
- // !!! offset := base - candidate
- MOVD R12, R11
- SUB R15, R11, R11
- SUB R6, R11, R11
- // ----------------------------------------
- // Begin inline of the extendMatch call.
- //
- // s = extendMatch(src, candidate+4, s+4)
- // !!! R14 = &src[len(src)]
- MOVD src_len+32(FP), R14
- ADD R6, R14, R14
- // !!! R13 = &src[len(src) - 8]
- MOVD R14, R13
- SUB $8, R13, R13
- // !!! R15 = &src[candidate + 4]
- ADD $4, R15, R15
- ADD R6, R15, R15
- // !!! s += 4
- ADD $4, R7, R7
- inlineExtendMatchCmp8:
- // As long as we are 8 or more bytes before the end of src, we can load and
- // compare 8 bytes at a time. If those 8 bytes are equal, repeat.
- CMP R13, R7
- BHI inlineExtendMatchCmp1
- MOVD (R15), R3
- MOVD (R7), R4
- CMP R4, R3
- BNE inlineExtendMatchBSF
- ADD $8, R15, R15
- ADD $8, R7, R7
- B inlineExtendMatchCmp8
- inlineExtendMatchBSF:
- // If those 8 bytes were not equal, XOR the two 8 byte values, and return
- // the index of the first byte that differs.
- // RBIT reverses the bit order, then CLZ counts the leading zeros, the
- // combination of which finds the least significant bit which is set.
- // The arm64 architecture is little-endian, and the shift by 3 converts
- // a bit index to a byte index.
- EOR R3, R4, R4
- RBIT R4, R4
- CLZ R4, R4
- ADD R4>>3, R7, R7
- B inlineExtendMatchEnd
- inlineExtendMatchCmp1:
- // In src's tail, compare 1 byte at a time.
- CMP R7, R14
- BLS inlineExtendMatchEnd
- MOVB (R15), R3
- MOVB (R7), R4
- CMP R4, R3
- BNE inlineExtendMatchEnd
- ADD $1, R15, R15
- ADD $1, R7, R7
- B inlineExtendMatchCmp1
- inlineExtendMatchEnd:
- // End inline of the extendMatch call.
- // ----------------------------------------
- // ----------------------------------------
- // Begin inline of the emitCopy call.
- //
- // d += emitCopy(dst[d:], base-candidate, s-base)
- // !!! length := s - base
- MOVD R7, R3
- SUB R12, R3, R3
- inlineEmitCopyLoop0:
- // for length >= 68 { etc }
- MOVW $68, R2
- CMPW R2, R3
- BLT inlineEmitCopyStep1
- // Emit a length 64 copy, encoded as 3 bytes.
- MOVD $0xfe, R1
- MOVB R1, 0(R8)
- MOVW R11, 1(R8)
- ADD $3, R8, R8
- SUBW $64, R3, R3
- B inlineEmitCopyLoop0
- inlineEmitCopyStep1:
- // if length > 64 { etc }
- MOVW $64, R2
- CMPW R2, R3
- BLE inlineEmitCopyStep2
- // Emit a length 60 copy, encoded as 3 bytes.
- MOVD $0xee, R1
- MOVB R1, 0(R8)
- MOVW R11, 1(R8)
- ADD $3, R8, R8
- SUBW $60, R3, R3
- inlineEmitCopyStep2:
- // if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
- MOVW $12, R2
- CMPW R2, R3
- BGE inlineEmitCopyStep3
- MOVW $2048, R2
- CMPW R2, R11
- BGE inlineEmitCopyStep3
- // Emit the remaining copy, encoded as 2 bytes.
- MOVB R11, 1(R8)
- LSRW $8, R11, R11
- LSLW $5, R11, R11
- SUBW $4, R3, R3
- AND $0xff, R3, R3
- LSLW $2, R3, R3
- ORRW R3, R11, R11
- ORRW $1, R11, R11
- MOVB R11, 0(R8)
- ADD $2, R8, R8
- B inlineEmitCopyEnd
- inlineEmitCopyStep3:
- // Emit the remaining copy, encoded as 3 bytes.
- SUBW $1, R3, R3
- AND $0xff, R3, R3
- LSLW $2, R3, R3
- ORRW $2, R3, R3
- MOVB R3, 0(R8)
- MOVW R11, 1(R8)
- ADD $3, R8, R8
- inlineEmitCopyEnd:
- // End inline of the emitCopy call.
- // ----------------------------------------
- // nextEmit = s
- MOVD R7, R10
- // if s >= sLimit { goto emitRemainder }
- MOVD R7, R3
- SUB R6, R3, R3
- CMP R3, R9
- BLS emitRemainder
- // As per the encode_other.go code:
- //
- // We could immediately etc.
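- //
- // Paraphrasing the elided Go logic (a sketch; tableMask as in
- // encode_other.go):
- //
- //	x := load64(src, s-1)
- //	prevHash := hash(uint32(x>>0), shift)
- //	table[prevHash&tableMask] = uint16(s - 1)
- //	currHash := hash(uint32(x>>8), shift)
- //	candidate = int(table[currHash&tableMask])
- //	table[currHash&tableMask] = uint16(s)
- //	if uint32(x>>8) != load32(src, candidate) {
- //		nextHash = hash(uint32(x>>16), shift)
- //		s++
- //		break // i.e. continue the outer loop
- //	}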
- // x := load64(src, s-1)
- MOVD -1(R7), R14
- // prevHash := hash(uint32(x>>0), shift)
- MOVW R14, R11
- MULW R16, R11, R11
- LSRW R5, R11, R11
- // table[prevHash] = uint16(s-1)
- MOVD R7, R3
- SUB R6, R3, R3
- SUB $1, R3, R3
- MOVH R3, 0(R17)(R11<<1)
- // currHash := hash(uint32(x>>8), shift)
- LSR $8, R14, R14
- MOVW R14, R11
- MULW R16, R11, R11
- LSRW R5, R11, R11
- // candidate = int(table[currHash])
- MOVHU 0(R17)(R11<<1), R15
- // table[currHash] = uint16(s)
- ADD $1, R3, R3
- MOVH R3, 0(R17)(R11<<1)
- // if uint32(x>>8) == load32(src, candidate) { continue }
- MOVW (R6)(R15), R4
- CMPW R4, R14
- BEQ inner1
- // nextHash = hash(uint32(x>>16), shift)
- LSR $8, R14, R14
- MOVW R14, R11
- MULW R16, R11, R11
- LSRW R5, R11, R11
- // s++
- ADD $1, R7, R7
- // break out of the inner1 for loop, i.e. continue the outer loop.
- B outer
- emitRemainder:
- // if nextEmit < len(src) { etc }
- MOVD src_len+32(FP), R3
- ADD R6, R3, R3
- CMP R3, R10
- BEQ encodeBlockEnd
- // d += emitLiteral(dst[d:], src[nextEmit:])
- //
- // Push args.
- MOVD R8, 8(RSP)
- MOVD $0, 16(RSP) // Unnecessary, as the callee ignores it, but conservative.
- MOVD $0, 24(RSP) // Unnecessary, as the callee ignores it, but conservative.
- MOVD R10, 32(RSP)
- SUB R10, R3, R3
- MOVD R3, 40(RSP)
- MOVD R3, 48(RSP) // Unnecessary, as the callee ignores it, but conservative.
- // Spill local variables (registers) onto the stack; call; unspill.
- MOVD R8, 88(RSP)
- CALL ·emitLiteral(SB)
- MOVD 88(RSP), R8
- // Finish the "d +=" part of "d += emitLiteral(etc)".
- MOVD 56(RSP), R1
- ADD R1, R8, R8
- encodeBlockEnd:
- MOVD dst_base+0(FP), R3
- SUB R3, R8, R8
- MOVD R8, d+48(FP)
- RET