utf8_util.go 1.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. package brotli
  2. /* Copyright 2013 Google Inc. All Rights Reserved.
  3. Distributed under MIT license.
  4. See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
  5. */
  6. /* Heuristics for deciding about the UTF8-ness of strings. */
  /* kMinUTF8Ratio is a float64 threshold of 0.75.
     NOTE(review): presumably the min_fraction handed to isMostlyUTF8 by
     callers elsewhere in the package; nothing in this file uses it, so confirm
     at the call sites. */
  const kMinUTF8Ratio = float64(0.75)
  /* parseAsUTF8 decodes the single UTF-8 sequence that starts at input[0].

     symbol receives the decoded code point when a well-formed sequence is
     found; otherwise it receives 0x110000 | input[0], a marker value that lies
     above the Unicode code space so callers can tell the byte was not UTF-8.

     size is the number of bytes the caller allows the sequence to span. It is
     additionally capped at len(input), so a lead byte sitting at the very end
     of the slice is reported as invalid instead of being read out of bounds.

     The return value is the number of bytes consumed: 1 to 4 for a valid
     sequence, and always 1 for an invalid byte. input must hold at least one
     byte. */
  func parseAsUTF8(symbol *int, input []byte, size uint) uint {
  	/* Never trust size beyond what the slice really holds. */
  	if avail := uint(len(input)); size > avail {
  		size = avail
  	}
  	lead := input[0]

  	/* The four lead-byte patterns are mutually exclusive, so at most one case
  	   can match; a case that matches but fails its range check simply leaves
  	   the switch and ends up in the invalid-byte path below. */
  	switch {
  	case lead&0x80 == 0:
  		/* ASCII. NUL (0x00) is not accepted here and is reported as invalid. */
  		if lead > 0 {
  			*symbol = int(lead)
  			return 1
  		}

  	case size > 1 && lead&0xE0 == 0xC0 && input[1]&0xC0 == 0x80:
  		/* 2-byte UTF8; values up to 0x7F would be overlong encodings. */
  		r := int(lead&0x1F)<<6 | int(input[1]&0x3F)
  		if r > 0x7F {
  			*symbol = r
  			return 2
  		}

  	case size > 2 && lead&0xF0 == 0xE0 && input[1]&0xC0 == 0x80 && input[2]&0xC0 == 0x80:
  		/* 3-byte UTF8; values up to 0x7FF would be overlong encodings. */
  		r := int(lead&0x0F)<<12 | int(input[1]&0x3F)<<6 | int(input[2]&0x3F)
  		if r > 0x7FF {
  			*symbol = r
  			return 3
  		}

  	case size > 3 && lead&0xF8 == 0xF0 && input[1]&0xC0 == 0x80 && input[2]&0xC0 == 0x80 && input[3]&0xC0 == 0x80:
  		/* 4-byte UTF8; must be above 0xFFFF and within the 0x10FFFF limit. */
  		r := int(lead&0x07)<<18 | int(input[1]&0x3F)<<12 | int(input[2]&0x3F)<<6 | int(input[3]&0x3F)
  		if r > 0xFFFF && r <= 0x10FFFF {
  			*symbol = r
  			return 4
  		}
  	}

  	/* Not UTF8, emit a special symbol above the UTF8-code space. */
  	*symbol = 0x110000 | int(lead)
  	return 1
  }
  44. /* Returns 1 if at least min_fraction of the data is UTF8-encoded.*/
  45. func isMostlyUTF8(data []byte, pos uint, mask uint, length uint, min_fraction float64) bool {
  46. var size_utf8 uint = 0
  47. var i uint = 0
  48. for i < length {
  49. var symbol int
  50. current_data := data[(pos+i)&mask:]
  51. var bytes_read uint = parseAsUTF8(&symbol, current_data, length-i)
  52. i += bytes_read
  53. if symbol < 0x110000 {
  54. size_utf8 += bytes_read
  55. }
  56. }
  57. return float64(size_utf8) > min_fraction*float64(length)
  58. }