m4.go 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297
  1. package matchfinder
  2. import (
  3. "encoding/binary"
  4. "math/bits"
  5. "runtime"
  6. )
  7. // M4 is an implementation of the MatchFinder
  8. // interface that uses a hash table to find matches,
  9. // optional match chains,
  10. // and the advanced parsing technique from
  11. // https://fastcompression.blogspot.com/2011/12/advanced-parsing-strategies.html.
  12. type M4 struct {
  13. // MaxDistance is the maximum distance (in bytes) to look back for
  14. // a match. The default is 65535.
  15. MaxDistance int
  16. // MinLength is the length of the shortest match to return.
  17. // The default is 4.
  18. MinLength int
  19. // HashLen is the number of bytes to use to calculate the hashes.
  20. // The maximum is 8 and the default is 6.
  21. HashLen int
  22. // TableBits is the number of bits in the hash table indexes.
  23. // The default is 17 (128K entries).
  24. TableBits int
  25. // ChainLength is how many entries to search on the "match chain" of older
  26. // locations with the same hash as the current location.
  27. ChainLength int
  28. // DistanceBitCost is used when comparing two matches to see
  29. // which is better. The comparison is primarily based on the length
  30. // of the matches, but it can also take the distance into account,
  31. // in terms of the number of bits needed to represent the distance.
  32. // One byte of length is given a score of 256, so 32 (256/8) would
  33. // be a reasonable first guess for the value of one bit.
  34. // (The default is 0, which bases the comparison solely on length.)
  35. DistanceBitCost int
  36. table []uint32
  37. chain []uint16
  38. history []byte
  39. }
  40. func (q *M4) Reset() {
  41. for i := range q.table {
  42. q.table[i] = 0
  43. }
  44. q.history = q.history[:0]
  45. q.chain = q.chain[:0]
  46. }
  47. func (q *M4) score(m absoluteMatch) int {
  48. return (m.End-m.Start)*256 + bits.LeadingZeros32(uint32(m.Start-m.Match))*q.DistanceBitCost
  49. }
  50. func (q *M4) FindMatches(dst []Match, src []byte) []Match {
  51. if q.MaxDistance == 0 {
  52. q.MaxDistance = 65535
  53. }
  54. if q.MinLength == 0 {
  55. q.MinLength = 4
  56. }
  57. if q.HashLen == 0 {
  58. q.HashLen = 6
  59. }
  60. if q.TableBits == 0 {
  61. q.TableBits = 17
  62. }
  63. if len(q.table) < 1<<q.TableBits {
  64. q.table = make([]uint32, 1<<q.TableBits)
  65. }
  66. e := matchEmitter{Dst: dst}
  67. if len(q.history) > q.MaxDistance*2 {
  68. // Trim down the history buffer.
  69. delta := len(q.history) - q.MaxDistance
  70. copy(q.history, q.history[delta:])
  71. q.history = q.history[:q.MaxDistance]
  72. if q.ChainLength > 0 {
  73. q.chain = q.chain[:q.MaxDistance]
  74. }
  75. for i, v := range q.table {
  76. newV := int(v) - delta
  77. if newV < 0 {
  78. newV = 0
  79. }
  80. q.table[i] = uint32(newV)
  81. }
  82. }
  83. // Append src to the history buffer.
  84. e.NextEmit = len(q.history)
  85. q.history = append(q.history, src...)
  86. if q.ChainLength > 0 {
  87. q.chain = append(q.chain, make([]uint16, len(src))...)
  88. }
  89. src = q.history
  90. // matches stores the matches that have been found but not emitted,
  91. // in reverse order. (matches[0] is the most recent one.)
  92. var matches [3]absoluteMatch
  93. for i := e.NextEmit; i < len(src)-7; i++ {
  94. if matches[0] != (absoluteMatch{}) && i >= matches[0].End {
  95. // We have found some matches, and we're far enough along that we probably
  96. // won't find overlapping matches, so we might as well emit them.
  97. if matches[1] != (absoluteMatch{}) {
  98. e.trim(matches[1], matches[0].Start, q.MinLength)
  99. }
  100. e.emit(matches[0])
  101. matches = [3]absoluteMatch{}
  102. }
  103. // Calculate and store the hash.
  104. h := ((binary.LittleEndian.Uint64(src[i:]) & (1<<(8*q.HashLen) - 1)) * hashMul64) >> (64 - q.TableBits)
  105. candidate := int(q.table[h])
  106. q.table[h] = uint32(i)
  107. if q.ChainLength > 0 && candidate != 0 {
  108. delta := i - candidate
  109. if delta < 1<<16 {
  110. q.chain[i] = uint16(delta)
  111. }
  112. }
  113. if i < matches[0].End && i != matches[0].End+2-q.HashLen {
  114. continue
  115. }
  116. if candidate == 0 || i-candidate > q.MaxDistance {
  117. continue
  118. }
  119. // Look for a match.
  120. var currentMatch absoluteMatch
  121. if i-candidate != matches[0].Start-matches[0].Match {
  122. if binary.LittleEndian.Uint32(src[candidate:]) == binary.LittleEndian.Uint32(src[i:]) {
  123. m := extendMatch2(src, i, candidate, e.NextEmit)
  124. if m.End-m.Start > q.MinLength {
  125. currentMatch = m
  126. }
  127. }
  128. }
  129. for j := 0; j < q.ChainLength; j++ {
  130. delta := q.chain[candidate]
  131. if delta == 0 {
  132. break
  133. }
  134. candidate -= int(delta)
  135. if candidate <= 0 || i-candidate > q.MaxDistance {
  136. break
  137. }
  138. if i-candidate != matches[0].Start-matches[0].Match {
  139. if binary.LittleEndian.Uint32(src[candidate:]) == binary.LittleEndian.Uint32(src[i:]) {
  140. m := extendMatch2(src, i, candidate, e.NextEmit)
  141. if m.End-m.Start > q.MinLength && q.score(m) > q.score(currentMatch) {
  142. currentMatch = m
  143. }
  144. }
  145. }
  146. }
  147. if currentMatch.End-currentMatch.Start < q.MinLength {
  148. continue
  149. }
  150. overlapPenalty := 0
  151. if matches[0] != (absoluteMatch{}) {
  152. overlapPenalty = 275
  153. if currentMatch.Start <= matches[1].End {
  154. // This match would completely replace the previous match,
  155. // so there is no penalty for overlap.
  156. overlapPenalty = 0
  157. }
  158. }
  159. if q.score(currentMatch) <= q.score(matches[0])+overlapPenalty {
  160. continue
  161. }
  162. matches = [3]absoluteMatch{
  163. currentMatch,
  164. matches[0],
  165. matches[1],
  166. }
  167. if matches[2] == (absoluteMatch{}) {
  168. continue
  169. }
  170. // We have three matches, so it's time to emit one and/or eliminate one.
  171. switch {
  172. case matches[0].Start < matches[2].End:
  173. // The first and third matches overlap; discard the one in between.
  174. matches = [3]absoluteMatch{
  175. matches[0],
  176. matches[2],
  177. absoluteMatch{},
  178. }
  179. case matches[0].Start < matches[2].End+q.MinLength:
  180. // The first and third matches don't overlap, but there's no room for
  181. // another match between them. Emit the first match and discard the second.
  182. e.emit(matches[2])
  183. matches = [3]absoluteMatch{
  184. matches[0],
  185. absoluteMatch{},
  186. absoluteMatch{},
  187. }
  188. default:
  189. // Emit the first match, shortening it if necessary to avoid overlap with the second.
  190. e.trim(matches[2], matches[1].Start, q.MinLength)
  191. matches[2] = absoluteMatch{}
  192. }
  193. }
  194. // We've found all the matches now; emit the remaining ones.
  195. if matches[1] != (absoluteMatch{}) {
  196. e.trim(matches[1], matches[0].Start, q.MinLength)
  197. }
  198. if matches[0] != (absoluteMatch{}) {
  199. e.emit(matches[0])
  200. }
  201. dst = e.Dst
  202. if e.NextEmit < len(src) {
  203. dst = append(dst, Match{
  204. Unmatched: len(src) - e.NextEmit,
  205. })
  206. }
  207. return dst
  208. }
  209. const hashMul64 = 0x1E35A7BD1E35A7BD
  210. // extendMatch returns the largest k such that k <= len(src) and that
  211. // src[i:i+k-j] and src[j:k] have the same contents.
  212. //
  213. // It assumes that:
  214. //
  215. // 0 <= i && i < j && j <= len(src)
  216. func extendMatch(src []byte, i, j int) int {
  217. switch runtime.GOARCH {
  218. case "amd64":
  219. // As long as we are 8 or more bytes before the end of src, we can load and
  220. // compare 8 bytes at a time. If those 8 bytes are equal, repeat.
  221. for j+8 < len(src) {
  222. iBytes := binary.LittleEndian.Uint64(src[i:])
  223. jBytes := binary.LittleEndian.Uint64(src[j:])
  224. if iBytes != jBytes {
  225. // If those 8 bytes were not equal, XOR the two 8 byte values, and return
  226. // the index of the first byte that differs. The BSF instruction finds the
  227. // least significant 1 bit, the amd64 architecture is little-endian, and
  228. // the shift by 3 converts a bit index to a byte index.
  229. return j + bits.TrailingZeros64(iBytes^jBytes)>>3
  230. }
  231. i, j = i+8, j+8
  232. }
  233. case "386":
  234. // On a 32-bit CPU, we do it 4 bytes at a time.
  235. for j+4 < len(src) {
  236. iBytes := binary.LittleEndian.Uint32(src[i:])
  237. jBytes := binary.LittleEndian.Uint32(src[j:])
  238. if iBytes != jBytes {
  239. return j + bits.TrailingZeros32(iBytes^jBytes)>>3
  240. }
  241. i, j = i+4, j+4
  242. }
  243. }
  244. for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
  245. }
  246. return j
  247. }
  248. // Given a 4-byte match at src[start] and src[candidate], extendMatch2 extends it
  249. // upward as far as possible, and downward no farther than to min.
  250. func extendMatch2(src []byte, start, candidate, min int) absoluteMatch {
  251. end := extendMatch(src, candidate+4, start+4)
  252. for start > min && candidate > 0 && src[start-1] == src[candidate-1] {
  253. start--
  254. candidate--
  255. }
  256. return absoluteMatch{
  257. Start: start,
  258. End: end,
  259. Match: candidate,
  260. }
  261. }