token.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379
  1. // Copyright 2009 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package flate
  5. import (
  6. "bytes"
  7. "encoding/binary"
  8. "fmt"
  9. "io"
  10. "math"
  11. )
  12. const (
  13. // bits 0-16 xoffset = offset - MIN_OFFSET_SIZE, or literal - 16 bits
  14. // bits 16-22 offsetcode - 5 bits
  15. // bits 22-30 xlength = length - MIN_MATCH_LENGTH - 8 bits
  16. // bits 30-32 type 0 = literal 1=EOF 2=Match 3=Unused - 2 bits
  17. lengthShift = 22
  18. offsetMask = 1<<lengthShift - 1
  19. typeMask = 3 << 30
  20. literalType = 0 << 30
  21. matchType = 1 << 30
  22. matchOffsetOnlyMask = 0xffff
  23. )
  24. // The length code for length X (MIN_MATCH_LENGTH <= X <= MAX_MATCH_LENGTH)
  25. // is lengthCodes[length - MIN_MATCH_LENGTH]
  26. var lengthCodes = [256]uint8{
  27. 0, 1, 2, 3, 4, 5, 6, 7, 8, 8,
  28. 9, 9, 10, 10, 11, 11, 12, 12, 12, 12,
  29. 13, 13, 13, 13, 14, 14, 14, 14, 15, 15,
  30. 15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
  31. 17, 17, 17, 17, 17, 17, 17, 17, 18, 18,
  32. 18, 18, 18, 18, 18, 18, 19, 19, 19, 19,
  33. 19, 19, 19, 19, 20, 20, 20, 20, 20, 20,
  34. 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
  35. 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
  36. 21, 21, 21, 21, 21, 21, 22, 22, 22, 22,
  37. 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
  38. 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
  39. 23, 23, 23, 23, 23, 23, 23, 23, 24, 24,
  40. 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
  41. 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
  42. 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
  43. 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
  44. 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
  45. 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
  46. 25, 25, 26, 26, 26, 26, 26, 26, 26, 26,
  47. 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
  48. 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
  49. 26, 26, 26, 26, 27, 27, 27, 27, 27, 27,
  50. 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
  51. 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
  52. 27, 27, 27, 27, 27, 28,
  53. }
  54. // lengthCodes1 is length codes, but starting at 1.
  55. var lengthCodes1 = [256]uint8{
  56. 1, 2, 3, 4, 5, 6, 7, 8, 9, 9,
  57. 10, 10, 11, 11, 12, 12, 13, 13, 13, 13,
  58. 14, 14, 14, 14, 15, 15, 15, 15, 16, 16,
  59. 16, 16, 17, 17, 17, 17, 17, 17, 17, 17,
  60. 18, 18, 18, 18, 18, 18, 18, 18, 19, 19,
  61. 19, 19, 19, 19, 19, 19, 20, 20, 20, 20,
  62. 20, 20, 20, 20, 21, 21, 21, 21, 21, 21,
  63. 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
  64. 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
  65. 22, 22, 22, 22, 22, 22, 23, 23, 23, 23,
  66. 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
  67. 23, 23, 24, 24, 24, 24, 24, 24, 24, 24,
  68. 24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
  69. 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
  70. 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
  71. 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
  72. 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
  73. 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
  74. 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
  75. 26, 26, 27, 27, 27, 27, 27, 27, 27, 27,
  76. 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
  77. 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
  78. 27, 27, 27, 27, 28, 28, 28, 28, 28, 28,
  79. 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
  80. 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
  81. 28, 28, 28, 28, 28, 29,
  82. }
  83. var offsetCodes = [256]uint32{
  84. 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
  85. 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
  86. 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
  87. 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
  88. 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
  89. 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
  90. 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
  91. 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
  92. 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
  93. 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
  94. 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
  95. 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
  96. 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
  97. 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
  98. 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
  99. 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
  100. }
  101. // offsetCodes14 are offsetCodes, but with 14 added.
  102. var offsetCodes14 = [256]uint32{
  103. 14, 15, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
  104. 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
  105. 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
  106. 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
  107. 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
  108. 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
  109. 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
  110. 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
  111. 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
  112. 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
  113. 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
  114. 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
  115. 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
  116. 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
  117. 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
  118. 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
  119. }
  120. type token uint32
  121. type tokens struct {
  122. extraHist [32]uint16 // codes 256->maxnumlit
  123. offHist [32]uint16 // offset codes
  124. litHist [256]uint16 // codes 0->255
  125. nFilled int
  126. n uint16 // Must be able to contain maxStoreBlockSize
  127. tokens [maxStoreBlockSize + 1]token
  128. }
  129. func (t *tokens) Reset() {
  130. if t.n == 0 {
  131. return
  132. }
  133. t.n = 0
  134. t.nFilled = 0
  135. for i := range t.litHist[:] {
  136. t.litHist[i] = 0
  137. }
  138. for i := range t.extraHist[:] {
  139. t.extraHist[i] = 0
  140. }
  141. for i := range t.offHist[:] {
  142. t.offHist[i] = 0
  143. }
  144. }
  145. func (t *tokens) Fill() {
  146. if t.n == 0 {
  147. return
  148. }
  149. for i, v := range t.litHist[:] {
  150. if v == 0 {
  151. t.litHist[i] = 1
  152. t.nFilled++
  153. }
  154. }
  155. for i, v := range t.extraHist[:literalCount-256] {
  156. if v == 0 {
  157. t.nFilled++
  158. t.extraHist[i] = 1
  159. }
  160. }
  161. for i, v := range t.offHist[:offsetCodeCount] {
  162. if v == 0 {
  163. t.offHist[i] = 1
  164. }
  165. }
  166. }
  167. func indexTokens(in []token) tokens {
  168. var t tokens
  169. t.indexTokens(in)
  170. return t
  171. }
  172. func (t *tokens) indexTokens(in []token) {
  173. t.Reset()
  174. for _, tok := range in {
  175. if tok < matchType {
  176. t.AddLiteral(tok.literal())
  177. continue
  178. }
  179. t.AddMatch(uint32(tok.length()), tok.offset()&matchOffsetOnlyMask)
  180. }
  181. }
  182. // emitLiteral writes a literal chunk and returns the number of bytes written.
  183. func emitLiteral(dst *tokens, lit []byte) {
  184. for _, v := range lit {
  185. dst.tokens[dst.n] = token(v)
  186. dst.litHist[v]++
  187. dst.n++
  188. }
  189. }
  190. func (t *tokens) AddLiteral(lit byte) {
  191. t.tokens[t.n] = token(lit)
  192. t.litHist[lit]++
  193. t.n++
  194. }
  195. // from https://stackoverflow.com/a/28730362
  196. func mFastLog2(val float32) float32 {
  197. ux := int32(math.Float32bits(val))
  198. log2 := (float32)(((ux >> 23) & 255) - 128)
  199. ux &= -0x7f800001
  200. ux += 127 << 23
  201. uval := math.Float32frombits(uint32(ux))
  202. log2 += ((-0.34484843)*uval+2.02466578)*uval - 0.67487759
  203. return log2
  204. }
  205. // EstimatedBits will return an minimum size estimated by an *optimal*
  206. // compression of the block.
  207. // The size of the block
  208. func (t *tokens) EstimatedBits() int {
  209. shannon := float32(0)
  210. bits := int(0)
  211. nMatches := 0
  212. total := int(t.n) + t.nFilled
  213. if total > 0 {
  214. invTotal := 1.0 / float32(total)
  215. for _, v := range t.litHist[:] {
  216. if v > 0 {
  217. n := float32(v)
  218. shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
  219. }
  220. }
  221. // Just add 15 for EOB
  222. shannon += 15
  223. for i, v := range t.extraHist[1 : literalCount-256] {
  224. if v > 0 {
  225. n := float32(v)
  226. shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
  227. bits += int(lengthExtraBits[i&31]) * int(v)
  228. nMatches += int(v)
  229. }
  230. }
  231. }
  232. if nMatches > 0 {
  233. invTotal := 1.0 / float32(nMatches)
  234. for i, v := range t.offHist[:offsetCodeCount] {
  235. if v > 0 {
  236. n := float32(v)
  237. shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
  238. bits += int(offsetExtraBits[i&31]) * int(v)
  239. }
  240. }
  241. }
  242. return int(shannon) + bits
  243. }
  244. // AddMatch adds a match to the tokens.
  245. // This function is very sensitive to inlining and right on the border.
  246. func (t *tokens) AddMatch(xlength uint32, xoffset uint32) {
  247. if debugDeflate {
  248. if xlength >= maxMatchLength+baseMatchLength {
  249. panic(fmt.Errorf("invalid length: %v", xlength))
  250. }
  251. if xoffset >= maxMatchOffset+baseMatchOffset {
  252. panic(fmt.Errorf("invalid offset: %v", xoffset))
  253. }
  254. }
  255. oCode := offsetCode(xoffset)
  256. xoffset |= oCode << 16
  257. t.extraHist[lengthCodes1[uint8(xlength)]]++
  258. t.offHist[oCode&31]++
  259. t.tokens[t.n] = token(matchType | xlength<<lengthShift | xoffset)
  260. t.n++
  261. }
  262. // AddMatchLong adds a match to the tokens, potentially longer than max match length.
  263. // Length should NOT have the base subtracted, only offset should.
  264. func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) {
  265. if debugDeflate {
  266. if xoffset >= maxMatchOffset+baseMatchOffset {
  267. panic(fmt.Errorf("invalid offset: %v", xoffset))
  268. }
  269. }
  270. oc := offsetCode(xoffset)
  271. xoffset |= oc << 16
  272. for xlength > 0 {
  273. xl := xlength
  274. if xl > 258 {
  275. // We need to have at least baseMatchLength left over for next loop.
  276. if xl > 258+baseMatchLength {
  277. xl = 258
  278. } else {
  279. xl = 258 - baseMatchLength
  280. }
  281. }
  282. xlength -= xl
  283. xl -= baseMatchLength
  284. t.extraHist[lengthCodes1[uint8(xl)]]++
  285. t.offHist[oc&31]++
  286. t.tokens[t.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
  287. t.n++
  288. }
  289. }
  290. func (t *tokens) AddEOB() {
  291. t.tokens[t.n] = token(endBlockMarker)
  292. t.extraHist[0]++
  293. t.n++
  294. }
  295. func (t *tokens) Slice() []token {
  296. return t.tokens[:t.n]
  297. }
  298. // VarInt returns the tokens as varint encoded bytes.
  299. func (t *tokens) VarInt() []byte {
  300. var b = make([]byte, binary.MaxVarintLen32*int(t.n))
  301. var off int
  302. for _, v := range t.tokens[:t.n] {
  303. off += binary.PutUvarint(b[off:], uint64(v))
  304. }
  305. return b[:off]
  306. }
  307. // FromVarInt restores t to the varint encoded tokens provided.
  308. // Any data in t is removed.
  309. func (t *tokens) FromVarInt(b []byte) error {
  310. var buf = bytes.NewReader(b)
  311. var toks []token
  312. for {
  313. r, err := binary.ReadUvarint(buf)
  314. if err == io.EOF {
  315. break
  316. }
  317. if err != nil {
  318. return err
  319. }
  320. toks = append(toks, token(r))
  321. }
  322. t.indexTokens(toks)
  323. return nil
  324. }
  325. // Returns the type of a token
  326. func (t token) typ() uint32 { return uint32(t) & typeMask }
  327. // Returns the literal of a literal token
  328. func (t token) literal() uint8 { return uint8(t) }
  329. // Returns the extra offset of a match token
  330. func (t token) offset() uint32 { return uint32(t) & offsetMask }
  331. func (t token) length() uint8 { return uint8(t >> lengthShift) }
  332. // Convert length to code.
  333. func lengthCode(len uint8) uint8 { return lengthCodes[len] }
  334. // Returns the offset code corresponding to a specific offset
  335. func offsetCode(off uint32) uint32 {
  336. if false {
  337. if off < uint32(len(offsetCodes)) {
  338. return offsetCodes[off&255]
  339. } else if off>>7 < uint32(len(offsetCodes)) {
  340. return offsetCodes[(off>>7)&255] + 14
  341. } else {
  342. return offsetCodes[(off>>14)&255] + 28
  343. }
  344. }
  345. if off < uint32(len(offsetCodes)) {
  346. return offsetCodes[uint8(off)]
  347. }
  348. return offsetCodes14[uint8(off>>7)]
  349. }