1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package bzip2 6 7 import "sort" 8 9 // A huffmanTree is a binary tree which is navigated, bit-by-bit to reach a 10 // symbol. 11 type huffmanTree struct { 12 // nodes contains all the non-leaf nodes in the tree. nodes[0] is the 13 // root of the tree and nextNode contains the index of the next element 14 // of nodes to use when the tree is being constructed. 15 nodes []huffmanNode 16 nextNode int 17 } 18 19 // A huffmanNode is a node in the tree. left and right contain indexes into the 20 // nodes slice of the tree. If left or right is invalidNodeValue then the child 21 // is a left node and its value is in leftValue/rightValue. 22 // 23 // The symbols are uint16s because bzip2 encodes not only MTF indexes in the 24 // tree, but also two magic values for run-length encoding and an EOF symbol. 25 // Thus there are more than 256 possible symbols. 26 type huffmanNode struct { 27 left, right uint16 28 leftValue, rightValue uint16 29 } 30 31 // invalidNodeValue is an invalid index which marks a leaf node in the tree. 32 const invalidNodeValue = 0xffff 33 34 // Decode reads bits from the given bitReader and navigates the tree until a 35 // symbol is found. 36 func (t *huffmanTree) Decode(br *bitReader) (v uint16) { 37 nodeIndex := uint16(0) // node 0 is the root of the tree. 38 39 for { 40 node := &t.nodes[nodeIndex] 41 42 var bit uint16 43 if br.bits > 0 { 44 // Get next bit - fast path. 45 br.bits-- 46 bit = uint16(br.n>>(br.bits&63)) & 1 47 } else { 48 // Get next bit - slow path. 49 // Use ReadBits to retrieve a single bit 50 // from the underling io.ByteReader. 51 bit = uint16(br.ReadBits(1)) 52 } 53 54 // Trick a compiler into generating conditional move instead of branch, 55 // by making both loads unconditional. 56 l, r := node.left, node.right 57 58 if bit == 1 { 59 nodeIndex = l 60 } else { 61 nodeIndex = r 62 } 63 64 if nodeIndex == invalidNodeValue { 65 // We found a leaf. Use the value of bit to decide 66 // whether is a left or a right value. 67 l, r := node.leftValue, node.rightValue 68 if bit == 1 { 69 v = l 70 } else { 71 v = r 72 } 73 return 74 } 75 } 76 } 77 78 // newHuffmanTree builds a Huffman tree from a slice containing the code 79 // lengths of each symbol. The maximum code length is 32 bits. 80 func newHuffmanTree(lengths []uint8) (huffmanTree, error) { 81 // There are many possible trees that assign the same code length to 82 // each symbol (consider reflecting a tree down the middle, for 83 // example). Since the code length assignments determine the 84 // efficiency of the tree, each of these trees is equally good. In 85 // order to minimize the amount of information needed to build a tree 86 // bzip2 uses a canonical tree so that it can be reconstructed given 87 // only the code length assignments. 88 89 if len(lengths) < 2 { 90 panic("newHuffmanTree: too few symbols") 91 } 92 93 var t huffmanTree 94 95 // First we sort the code length assignments by ascending code length, 96 // using the symbol value to break ties. 97 pairs := make([]huffmanSymbolLengthPair, len(lengths)) 98 for i, length := range lengths { 99 pairs[i].value = uint16(i) 100 pairs[i].length = length 101 } 102 103 sort.Slice(pairs, func(i, j int) bool { 104 if pairs[i].length < pairs[j].length { 105 return true 106 } 107 if pairs[i].length > pairs[j].length { 108 return false 109 } 110 if pairs[i].value < pairs[j].value { 111 return true 112 } 113 return false 114 }) 115 116 // Now we assign codes to the symbols, starting with the longest code. 117 // We keep the codes packed into a uint32, at the most-significant end. 118 // So branches are taken from the MSB downwards. This makes it easy to 119 // sort them later. 120 code := uint32(0) 121 length := uint8(32) 122 123 codes := make([]huffmanCode, len(lengths)) 124 for i := len(pairs) - 1; i >= 0; i-- { 125 if length > pairs[i].length { 126 length = pairs[i].length 127 } 128 codes[i].code = code 129 codes[i].codeLen = length 130 codes[i].value = pairs[i].value 131 // We need to 'increment' the code, which means treating |code| 132 // like a |length| bit number. 133 code += 1 << (32 - length) 134 } 135 136 // Now we can sort by the code so that the left half of each branch are 137 // grouped together, recursively. 138 sort.Slice(codes, func(i, j int) bool { 139 return codes[i].code < codes[j].code 140 }) 141 142 t.nodes = make([]huffmanNode, len(codes)) 143 _, err := buildHuffmanNode(&t, codes, 0) 144 return t, err 145 } 146 147 // huffmanSymbolLengthPair contains a symbol and its code length. 148 type huffmanSymbolLengthPair struct { 149 value uint16 150 length uint8 151 } 152 153 // huffmanCode contains a symbol, its code and code length. 154 type huffmanCode struct { 155 code uint32 156 codeLen uint8 157 value uint16 158 } 159 160 // buildHuffmanNode takes a slice of sorted huffmanCodes and builds a node in 161 // the Huffman tree at the given level. It returns the index of the newly 162 // constructed node. 163 func buildHuffmanNode(t *huffmanTree, codes []huffmanCode, level uint32) (nodeIndex uint16, err error) { 164 test := uint32(1) << (31 - level) 165 166 // We have to search the list of codes to find the divide between the left and right sides. 167 firstRightIndex := len(codes) 168 for i, code := range codes { 169 if code.code&test != 0 { 170 firstRightIndex = i 171 break 172 } 173 } 174 175 left := codes[:firstRightIndex] 176 right := codes[firstRightIndex:] 177 178 if len(left) == 0 || len(right) == 0 { 179 // There is a superfluous level in the Huffman tree indicating 180 // a bug in the encoder. However, this bug has been observed in 181 // the wild so we handle it. 182 183 // If this function was called recursively then we know that 184 // len(codes) >= 2 because, otherwise, we would have hit the 185 // "leaf node" case, below, and not recursed. 186 // 187 // However, for the initial call it's possible that len(codes) 188 // is zero or one. Both cases are invalid because a zero length 189 // tree cannot encode anything and a length-1 tree can only 190 // encode EOF and so is superfluous. We reject both. 191 if len(codes) < 2 { 192 return 0, StructuralError("empty Huffman tree") 193 } 194 195 // In this case the recursion doesn't always reduce the length 196 // of codes so we need to ensure termination via another 197 // mechanism. 198 if level == 31 { 199 // Since len(codes) >= 2 the only way that the values 200 // can match at all 32 bits is if they are equal, which 201 // is invalid. This ensures that we never enter 202 // infinite recursion. 203 return 0, StructuralError("equal symbols in Huffman tree") 204 } 205 206 if len(left) == 0 { 207 return buildHuffmanNode(t, right, level+1) 208 } 209 return buildHuffmanNode(t, left, level+1) 210 } 211 212 nodeIndex = uint16(t.nextNode) 213 node := &t.nodes[t.nextNode] 214 t.nextNode++ 215 216 if len(left) == 1 { 217 // leaf node 218 node.left = invalidNodeValue 219 node.leftValue = left[0].value 220 } else { 221 node.left, err = buildHuffmanNode(t, left, level+1) 222 } 223 224 if err != nil { 225 return 226 } 227 228 if len(right) == 1 { 229 // leaf node 230 node.right = invalidNodeValue 231 node.rightValue = right[0].value 232 } else { 233 node.right, err = buildHuffmanNode(t, right, level+1) 234 } 235 236 return 237 } 238