literal_cost.go 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182
  1. package brotli
  2. func utf8Position(last uint, c uint, clamp uint) uint {
  3. if c < 128 {
  4. return 0 /* Next one is the 'Byte 1' again. */
  5. } else if c >= 192 { /* Next one is the 'Byte 2' of utf-8 encoding. */
  6. return brotli_min_size_t(1, clamp)
  7. } else {
  8. /* Let's decide over the last byte if this ends the sequence. */
  9. if last < 0xE0 {
  10. return 0 /* Completed two or three byte coding. */ /* Next one is the 'Byte 3' of utf-8 encoding. */
  11. } else {
  12. return brotli_min_size_t(2, clamp)
  13. }
  14. }
  15. }
  16. func decideMultiByteStatsLevel(pos uint, len uint, mask uint, data []byte) uint {
  17. var counts = [3]uint{0} /* should be 2, but 1 compresses better. */
  18. var max_utf8 uint = 1
  19. var last_c uint = 0
  20. var i uint
  21. for i = 0; i < len; i++ {
  22. var c uint = uint(data[(pos+i)&mask])
  23. counts[utf8Position(last_c, c, 2)]++
  24. last_c = c
  25. }
  26. if counts[2] < 500 {
  27. max_utf8 = 1
  28. }
  29. if counts[1]+counts[2] < 25 {
  30. max_utf8 = 0
  31. }
  32. return max_utf8
  33. }
  34. func estimateBitCostsForLiteralsUTF8(pos uint, len uint, mask uint, data []byte, cost []float32) {
  35. var max_utf8 uint = decideMultiByteStatsLevel(pos, uint(len), mask, data)
  36. /* Bootstrap histograms. */
  37. var histogram = [3][256]uint{[256]uint{0}}
  38. var window_half uint = 495
  39. var in_window uint = brotli_min_size_t(window_half, uint(len))
  40. var in_window_utf8 = [3]uint{0}
  41. /* max_utf8 is 0 (normal ASCII single byte modeling),
  42. 1 (for 2-byte UTF-8 modeling), or 2 (for 3-byte UTF-8 modeling). */
  43. var i uint
  44. {
  45. var last_c uint = 0
  46. var utf8_pos uint = 0
  47. for i = 0; i < in_window; i++ {
  48. var c uint = uint(data[(pos+i)&mask])
  49. histogram[utf8_pos][c]++
  50. in_window_utf8[utf8_pos]++
  51. utf8_pos = utf8Position(last_c, c, max_utf8)
  52. last_c = c
  53. }
  54. }
  55. /* Compute bit costs with sliding window. */
  56. for i = 0; i < len; i++ {
  57. if i >= window_half {
  58. var c uint
  59. var last_c uint
  60. if i < window_half+1 {
  61. c = 0
  62. } else {
  63. c = uint(data[(pos+i-window_half-1)&mask])
  64. }
  65. if i < window_half+2 {
  66. last_c = 0
  67. } else {
  68. last_c = uint(data[(pos+i-window_half-2)&mask])
  69. }
  70. /* Remove a byte in the past. */
  71. var utf8_pos2 uint = utf8Position(last_c, c, max_utf8)
  72. histogram[utf8_pos2][data[(pos+i-window_half)&mask]]--
  73. in_window_utf8[utf8_pos2]--
  74. }
  75. if i+window_half < len {
  76. var c uint = uint(data[(pos+i+window_half-1)&mask])
  77. var last_c uint = uint(data[(pos+i+window_half-2)&mask])
  78. /* Add a byte in the future. */
  79. var utf8_pos2 uint = utf8Position(last_c, c, max_utf8)
  80. histogram[utf8_pos2][data[(pos+i+window_half)&mask]]++
  81. in_window_utf8[utf8_pos2]++
  82. }
  83. {
  84. var c uint
  85. var last_c uint
  86. if i < 1 {
  87. c = 0
  88. } else {
  89. c = uint(data[(pos+i-1)&mask])
  90. }
  91. if i < 2 {
  92. last_c = 0
  93. } else {
  94. last_c = uint(data[(pos+i-2)&mask])
  95. }
  96. var utf8_pos uint = utf8Position(last_c, c, max_utf8)
  97. var masked_pos uint = (pos + i) & mask
  98. var histo uint = histogram[utf8_pos][data[masked_pos]]
  99. var lit_cost float64
  100. if histo == 0 {
  101. histo = 1
  102. }
  103. lit_cost = fastLog2(in_window_utf8[utf8_pos]) - fastLog2(histo)
  104. lit_cost += 0.02905
  105. if lit_cost < 1.0 {
  106. lit_cost *= 0.5
  107. lit_cost += 0.5
  108. }
  109. /* Make the first bytes more expensive -- seems to help, not sure why.
  110. Perhaps because the entropy source is changing its properties
  111. rapidly in the beginning of the file, perhaps because the beginning
  112. of the data is a statistical "anomaly". */
  113. if i < 2000 {
  114. lit_cost += 0.7 - (float64(2000-i) / 2000.0 * 0.35)
  115. }
  116. cost[i] = float32(lit_cost)
  117. }
  118. }
  119. }
  120. func estimateBitCostsForLiterals(pos uint, len uint, mask uint, data []byte, cost []float32) {
  121. if isMostlyUTF8(data, pos, mask, uint(len), kMinUTF8Ratio) {
  122. estimateBitCostsForLiteralsUTF8(pos, uint(len), mask, data, cost)
  123. return
  124. } else {
  125. var histogram = [256]uint{0}
  126. var window_half uint = 2000
  127. var in_window uint = brotli_min_size_t(window_half, uint(len))
  128. var i uint
  129. /* Bootstrap histogram. */
  130. for i = 0; i < in_window; i++ {
  131. histogram[data[(pos+i)&mask]]++
  132. }
  133. /* Compute bit costs with sliding window. */
  134. for i = 0; i < len; i++ {
  135. var histo uint
  136. if i >= window_half {
  137. /* Remove a byte in the past. */
  138. histogram[data[(pos+i-window_half)&mask]]--
  139. in_window--
  140. }
  141. if i+window_half < len {
  142. /* Add a byte in the future. */
  143. histogram[data[(pos+i+window_half)&mask]]++
  144. in_window++
  145. }
  146. histo = histogram[data[(pos+i)&mask]]
  147. if histo == 0 {
  148. histo = 1
  149. }
  150. {
  151. var lit_cost float64 = fastLog2(in_window) - fastLog2(histo)
  152. lit_cost += 0.029
  153. if lit_cost < 1.0 {
  154. lit_cost *= 0.5
  155. lit_cost += 0.5
  156. }
  157. cost[i] = float32(lit_cost)
  158. }
  159. }
  160. }
  161. }