cluster_literal.go 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. package brotli
  2. import "math"
  3. /* Copyright 2013 Google Inc. All Rights Reserved.
  4. Distributed under MIT license.
  5. See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
  6. */
  7. /* Computes the bit cost reduction by combining out[idx1] and out[idx2] and if
  8. it is below a threshold, stores the pair (idx1, idx2) in the *pairs queue. */
  9. func compareAndPushToQueueLiteral(out []histogramLiteral, cluster_size []uint32, idx1 uint32, idx2 uint32, max_num_pairs uint, pairs []histogramPair, num_pairs *uint) {
  10. var is_good_pair bool = false
  11. var p histogramPair
  12. p.idx2 = 0
  13. p.idx1 = p.idx2
  14. p.cost_combo = 0
  15. p.cost_diff = p.cost_combo
  16. if idx1 == idx2 {
  17. return
  18. }
  19. if idx2 < idx1 {
  20. var t uint32 = idx2
  21. idx2 = idx1
  22. idx1 = t
  23. }
  24. p.idx1 = idx1
  25. p.idx2 = idx2
  26. p.cost_diff = 0.5 * clusterCostDiff(uint(cluster_size[idx1]), uint(cluster_size[idx2]))
  27. p.cost_diff -= out[idx1].bit_cost_
  28. p.cost_diff -= out[idx2].bit_cost_
  29. if out[idx1].total_count_ == 0 {
  30. p.cost_combo = out[idx2].bit_cost_
  31. is_good_pair = true
  32. } else if out[idx2].total_count_ == 0 {
  33. p.cost_combo = out[idx1].bit_cost_
  34. is_good_pair = true
  35. } else {
  36. var threshold float64
  37. if *num_pairs == 0 {
  38. threshold = 1e99
  39. } else {
  40. threshold = brotli_max_double(0.0, pairs[0].cost_diff)
  41. }
  42. var combo histogramLiteral = out[idx1]
  43. var cost_combo float64
  44. histogramAddHistogramLiteral(&combo, &out[idx2])
  45. cost_combo = populationCostLiteral(&combo)
  46. if cost_combo < threshold-p.cost_diff {
  47. p.cost_combo = cost_combo
  48. is_good_pair = true
  49. }
  50. }
  51. if is_good_pair {
  52. p.cost_diff += p.cost_combo
  53. if *num_pairs > 0 && histogramPairIsLess(&pairs[0], &p) {
  54. /* Replace the top of the queue if needed. */
  55. if *num_pairs < max_num_pairs {
  56. pairs[*num_pairs] = pairs[0]
  57. (*num_pairs)++
  58. }
  59. pairs[0] = p
  60. } else if *num_pairs < max_num_pairs {
  61. pairs[*num_pairs] = p
  62. (*num_pairs)++
  63. }
  64. }
  65. }
  66. func histogramCombineLiteral(out []histogramLiteral, cluster_size []uint32, symbols []uint32, clusters []uint32, pairs []histogramPair, num_clusters uint, symbols_size uint, max_clusters uint, max_num_pairs uint) uint {
  67. var cost_diff_threshold float64 = 0.0
  68. var min_cluster_size uint = 1
  69. var num_pairs uint = 0
  70. {
  71. /* We maintain a vector of histogram pairs, with the property that the pair
  72. with the maximum bit cost reduction is the first. */
  73. var idx1 uint
  74. for idx1 = 0; idx1 < num_clusters; idx1++ {
  75. var idx2 uint
  76. for idx2 = idx1 + 1; idx2 < num_clusters; idx2++ {
  77. compareAndPushToQueueLiteral(out, cluster_size, clusters[idx1], clusters[idx2], max_num_pairs, pairs[0:], &num_pairs)
  78. }
  79. }
  80. }
  81. for num_clusters > min_cluster_size {
  82. var best_idx1 uint32
  83. var best_idx2 uint32
  84. var i uint
  85. if pairs[0].cost_diff >= cost_diff_threshold {
  86. cost_diff_threshold = 1e99
  87. min_cluster_size = max_clusters
  88. continue
  89. }
  90. /* Take the best pair from the top of heap. */
  91. best_idx1 = pairs[0].idx1
  92. best_idx2 = pairs[0].idx2
  93. histogramAddHistogramLiteral(&out[best_idx1], &out[best_idx2])
  94. out[best_idx1].bit_cost_ = pairs[0].cost_combo
  95. cluster_size[best_idx1] += cluster_size[best_idx2]
  96. for i = 0; i < symbols_size; i++ {
  97. if symbols[i] == best_idx2 {
  98. symbols[i] = best_idx1
  99. }
  100. }
  101. for i = 0; i < num_clusters; i++ {
  102. if clusters[i] == best_idx2 {
  103. copy(clusters[i:], clusters[i+1:][:num_clusters-i-1])
  104. break
  105. }
  106. }
  107. num_clusters--
  108. {
  109. /* Remove pairs intersecting the just combined best pair. */
  110. var copy_to_idx uint = 0
  111. for i = 0; i < num_pairs; i++ {
  112. var p *histogramPair = &pairs[i]
  113. if p.idx1 == best_idx1 || p.idx2 == best_idx1 || p.idx1 == best_idx2 || p.idx2 == best_idx2 {
  114. /* Remove invalid pair from the queue. */
  115. continue
  116. }
  117. if histogramPairIsLess(&pairs[0], p) {
  118. /* Replace the top of the queue if needed. */
  119. var front histogramPair = pairs[0]
  120. pairs[0] = *p
  121. pairs[copy_to_idx] = front
  122. } else {
  123. pairs[copy_to_idx] = *p
  124. }
  125. copy_to_idx++
  126. }
  127. num_pairs = copy_to_idx
  128. }
  129. /* Push new pairs formed with the combined histogram to the heap. */
  130. for i = 0; i < num_clusters; i++ {
  131. compareAndPushToQueueLiteral(out, cluster_size, best_idx1, clusters[i], max_num_pairs, pairs[0:], &num_pairs)
  132. }
  133. }
  134. return num_clusters
  135. }
  136. /* What is the bit cost of moving histogram from cur_symbol to candidate. */
  137. func histogramBitCostDistanceLiteral(histogram *histogramLiteral, candidate *histogramLiteral) float64 {
  138. if histogram.total_count_ == 0 {
  139. return 0.0
  140. } else {
  141. var tmp histogramLiteral = *histogram
  142. histogramAddHistogramLiteral(&tmp, candidate)
  143. return populationCostLiteral(&tmp) - candidate.bit_cost_
  144. }
  145. }
  146. /* Find the best 'out' histogram for each of the 'in' histograms.
  147. When called, clusters[0..num_clusters) contains the unique values from
  148. symbols[0..in_size), but this property is not preserved in this function.
  149. Note: we assume that out[]->bit_cost_ is already up-to-date. */
  150. func histogramRemapLiteral(in []histogramLiteral, in_size uint, clusters []uint32, num_clusters uint, out []histogramLiteral, symbols []uint32) {
  151. var i uint
  152. for i = 0; i < in_size; i++ {
  153. var best_out uint32
  154. if i == 0 {
  155. best_out = symbols[0]
  156. } else {
  157. best_out = symbols[i-1]
  158. }
  159. var best_bits float64 = histogramBitCostDistanceLiteral(&in[i], &out[best_out])
  160. var j uint
  161. for j = 0; j < num_clusters; j++ {
  162. var cur_bits float64 = histogramBitCostDistanceLiteral(&in[i], &out[clusters[j]])
  163. if cur_bits < best_bits {
  164. best_bits = cur_bits
  165. best_out = clusters[j]
  166. }
  167. }
  168. symbols[i] = best_out
  169. }
  170. /* Recompute each out based on raw and symbols. */
  171. for i = 0; i < num_clusters; i++ {
  172. histogramClearLiteral(&out[clusters[i]])
  173. }
  174. for i = 0; i < in_size; i++ {
  175. histogramAddHistogramLiteral(&out[symbols[i]], &in[i])
  176. }
  177. }
  178. /* Reorders elements of the out[0..length) array and changes values in
  179. symbols[0..length) array in the following way:
  180. * when called, symbols[] contains indexes into out[], and has N unique
  181. values (possibly N < length)
  182. * on return, symbols'[i] = f(symbols[i]) and
  183. out'[symbols'[i]] = out[symbols[i]], for each 0 <= i < length,
  184. where f is a bijection between the range of symbols[] and [0..N), and
  185. the first occurrences of values in symbols'[i] come in consecutive
  186. increasing order.
  187. Returns N, the number of unique values in symbols[]. */
  188. var histogramReindexLiteral_kInvalidIndex uint32 = math.MaxUint32
  189. func histogramReindexLiteral(out []histogramLiteral, symbols []uint32, length uint) uint {
  190. var new_index []uint32 = make([]uint32, length)
  191. var next_index uint32
  192. var tmp []histogramLiteral
  193. var i uint
  194. for i = 0; i < length; i++ {
  195. new_index[i] = histogramReindexLiteral_kInvalidIndex
  196. }
  197. next_index = 0
  198. for i = 0; i < length; i++ {
  199. if new_index[symbols[i]] == histogramReindexLiteral_kInvalidIndex {
  200. new_index[symbols[i]] = next_index
  201. next_index++
  202. }
  203. }
  204. /* TODO: by using idea of "cycle-sort" we can avoid allocation of
  205. tmp and reduce the number of copying by the factor of 2. */
  206. tmp = make([]histogramLiteral, next_index)
  207. next_index = 0
  208. for i = 0; i < length; i++ {
  209. if new_index[symbols[i]] == next_index {
  210. tmp[next_index] = out[symbols[i]]
  211. next_index++
  212. }
  213. symbols[i] = new_index[symbols[i]]
  214. }
  215. new_index = nil
  216. for i = 0; uint32(i) < next_index; i++ {
  217. out[i] = tmp[i]
  218. }
  219. tmp = nil
  220. return uint(next_index)
  221. }
  222. func clusterHistogramsLiteral(in []histogramLiteral, in_size uint, max_histograms uint, out []histogramLiteral, out_size *uint, histogram_symbols []uint32) {
  223. var cluster_size []uint32 = make([]uint32, in_size)
  224. var clusters []uint32 = make([]uint32, in_size)
  225. var num_clusters uint = 0
  226. var max_input_histograms uint = 64
  227. var pairs_capacity uint = max_input_histograms * max_input_histograms / 2
  228. var pairs []histogramPair = make([]histogramPair, (pairs_capacity + 1))
  229. var i uint
  230. /* For the first pass of clustering, we allow all pairs. */
  231. for i = 0; i < in_size; i++ {
  232. cluster_size[i] = 1
  233. }
  234. for i = 0; i < in_size; i++ {
  235. out[i] = in[i]
  236. out[i].bit_cost_ = populationCostLiteral(&in[i])
  237. histogram_symbols[i] = uint32(i)
  238. }
  239. for i = 0; i < in_size; i += max_input_histograms {
  240. var num_to_combine uint = brotli_min_size_t(in_size-i, max_input_histograms)
  241. var num_new_clusters uint
  242. var j uint
  243. for j = 0; j < num_to_combine; j++ {
  244. clusters[num_clusters+j] = uint32(i + j)
  245. }
  246. num_new_clusters = histogramCombineLiteral(out, cluster_size, histogram_symbols[i:], clusters[num_clusters:], pairs, num_to_combine, num_to_combine, max_histograms, pairs_capacity)
  247. num_clusters += num_new_clusters
  248. }
  249. {
  250. /* For the second pass, we limit the total number of histogram pairs.
  251. After this limit is reached, we only keep searching for the best pair. */
  252. var max_num_pairs uint = brotli_min_size_t(64*num_clusters, (num_clusters/2)*num_clusters)
  253. if pairs_capacity < (max_num_pairs + 1) {
  254. var _new_size uint
  255. if pairs_capacity == 0 {
  256. _new_size = max_num_pairs + 1
  257. } else {
  258. _new_size = pairs_capacity
  259. }
  260. var new_array []histogramPair
  261. for _new_size < (max_num_pairs + 1) {
  262. _new_size *= 2
  263. }
  264. new_array = make([]histogramPair, _new_size)
  265. if pairs_capacity != 0 {
  266. copy(new_array, pairs[:pairs_capacity])
  267. }
  268. pairs = new_array
  269. pairs_capacity = _new_size
  270. }
  271. /* Collapse similar histograms. */
  272. num_clusters = histogramCombineLiteral(out, cluster_size, histogram_symbols, clusters, pairs, num_clusters, in_size, max_histograms, max_num_pairs)
  273. }
  274. pairs = nil
  275. cluster_size = nil
  276. /* Find the optimal map from original histograms to the final ones. */
  277. histogramRemapLiteral(in, in_size, clusters, num_clusters, out, histogram_symbols)
  278. clusters = nil
  279. /* Convert the context map to a canonical form. */
  280. *out_size = histogramReindexLiteral(out, histogram_symbols, in_size)
  281. }