bzip2.go

Documentation: compress/bzip2

		 1  // Copyright 2011 The Go Authors. All rights reserved.
		 2  // Use of this source code is governed by a BSD-style
		 3  // license that can be found in the LICENSE file.
		 4  
		 5  // Package bzip2 implements bzip2 decompression.
		 6  package bzip2
		 7  
		 8  import "io"
		 9  
		10  // There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
		11  // of guessing: https://en.wikipedia.org/wiki/Bzip2
		12  // The source code to pyflate was useful for debugging:
		13  // http://www.paul.sladen.org/projects/pyflate
		14  
		15  // A StructuralError is returned when the bzip2 data is found to be
		16  // syntactically invalid.
		17  type StructuralError string
		18  
		19  func (s StructuralError) Error() string {
		20  	return "bzip2 data invalid: " + string(s)
		21  }
		22  
		23  // A reader decompresses bzip2 compressed data.
		24  type reader struct {
		25  	br					 bitReader
		26  	fileCRC			uint32
		27  	blockCRC		 uint32
		28  	wantBlockCRC uint32
		29  	setupDone		bool // true if we have parsed the bzip2 header.
		30  	blockSize		int	// blockSize in bytes, i.e. 900 * 1000.
		31  	eof					bool
		32  	c						[256]uint // the ``C'' array for the inverse BWT.
		33  	tt					 []uint32	// mirrors the ``tt'' array in the bzip2 source and contains the P array in the upper 24 bits.
		34  	tPos				 uint32		// Index of the next output byte in tt.
		35  
		36  	preRLE			[]uint32 // contains the RLE data still to be processed.
		37  	preRLEUsed	int			// number of entries of preRLE used.
		38  	lastByte		int			// the last byte value seen.
		39  	byteRepeats uint		 // the number of repeats of lastByte seen.
		40  	repeats		 uint		 // the number of copies of lastByte to output.
		41  }
		42  
		43  // NewReader returns an io.Reader which decompresses bzip2 data from r.
		44  // If r does not also implement io.ByteReader,
		45  // the decompressor may read more data than necessary from r.
		46  func NewReader(r io.Reader) io.Reader {
		47  	bz2 := new(reader)
		48  	bz2.br = newBitReader(r)
		49  	return bz2
		50  }
		51  
		52  const bzip2FileMagic = 0x425a // "BZ"
		53  const bzip2BlockMagic = 0x314159265359
		54  const bzip2FinalMagic = 0x177245385090
		55  
		56  // setup parses the bzip2 header.
		57  func (bz2 *reader) setup(needMagic bool) error {
		58  	br := &bz2.br
		59  
		60  	if needMagic {
		61  		magic := br.ReadBits(16)
		62  		if magic != bzip2FileMagic {
		63  			return StructuralError("bad magic value")
		64  		}
		65  	}
		66  
		67  	t := br.ReadBits(8)
		68  	if t != 'h' {
		69  		return StructuralError("non-Huffman entropy encoding")
		70  	}
		71  
		72  	level := br.ReadBits(8)
		73  	if level < '1' || level > '9' {
		74  		return StructuralError("invalid compression level")
		75  	}
		76  
		77  	bz2.fileCRC = 0
		78  	bz2.blockSize = 100 * 1000 * (level - '0')
		79  	if bz2.blockSize > len(bz2.tt) {
		80  		bz2.tt = make([]uint32, bz2.blockSize)
		81  	}
		82  	return nil
		83  }
		84  
		85  func (bz2 *reader) Read(buf []byte) (n int, err error) {
		86  	if bz2.eof {
		87  		return 0, io.EOF
		88  	}
		89  
		90  	if !bz2.setupDone {
		91  		err = bz2.setup(true)
		92  		brErr := bz2.br.Err()
		93  		if brErr != nil {
		94  			err = brErr
		95  		}
		96  		if err != nil {
		97  			return 0, err
		98  		}
		99  		bz2.setupDone = true
	 100  	}
	 101  
	 102  	n, err = bz2.read(buf)
	 103  	brErr := bz2.br.Err()
	 104  	if brErr != nil {
	 105  		err = brErr
	 106  	}
	 107  	return
	 108  }
	 109  
	 110  func (bz2 *reader) readFromBlock(buf []byte) int {
	 111  	// bzip2 is a block based compressor, except that it has a run-length
	 112  	// preprocessing step. The block based nature means that we can
	 113  	// preallocate fixed-size buffers and reuse them. However, the RLE
	 114  	// preprocessing would require allocating huge buffers to store the
	 115  	// maximum expansion. Thus we process blocks all at once, except for
	 116  	// the RLE which we decompress as required.
	 117  	n := 0
	 118  	for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) {
	 119  		// We have RLE data pending.
	 120  
	 121  		// The run-length encoding works like this:
	 122  		// Any sequence of four equal bytes is followed by a length
	 123  		// byte which contains the number of repeats of that byte to
	 124  		// include. (The number of repeats can be zero.) Because we are
	 125  		// decompressing on-demand our state is kept in the reader
	 126  		// object.
	 127  
	 128  		if bz2.repeats > 0 {
	 129  			buf[n] = byte(bz2.lastByte)
	 130  			n++
	 131  			bz2.repeats--
	 132  			if bz2.repeats == 0 {
	 133  				bz2.lastByte = -1
	 134  			}
	 135  			continue
	 136  		}
	 137  
	 138  		bz2.tPos = bz2.preRLE[bz2.tPos]
	 139  		b := byte(bz2.tPos)
	 140  		bz2.tPos >>= 8
	 141  		bz2.preRLEUsed++
	 142  
	 143  		if bz2.byteRepeats == 3 {
	 144  			bz2.repeats = uint(b)
	 145  			bz2.byteRepeats = 0
	 146  			continue
	 147  		}
	 148  
	 149  		if bz2.lastByte == int(b) {
	 150  			bz2.byteRepeats++
	 151  		} else {
	 152  			bz2.byteRepeats = 0
	 153  		}
	 154  		bz2.lastByte = int(b)
	 155  
	 156  		buf[n] = b
	 157  		n++
	 158  	}
	 159  
	 160  	return n
	 161  }
	 162  
	 163  func (bz2 *reader) read(buf []byte) (int, error) {
	 164  	for {
	 165  		n := bz2.readFromBlock(buf)
	 166  		if n > 0 || len(buf) == 0 {
	 167  			bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n])
	 168  			return n, nil
	 169  		}
	 170  
	 171  		// End of block. Check CRC.
	 172  		if bz2.blockCRC != bz2.wantBlockCRC {
	 173  			bz2.br.err = StructuralError("block checksum mismatch")
	 174  			return 0, bz2.br.err
	 175  		}
	 176  
	 177  		// Find next block.
	 178  		br := &bz2.br
	 179  		switch br.ReadBits64(48) {
	 180  		default:
	 181  			return 0, StructuralError("bad magic value found")
	 182  
	 183  		case bzip2BlockMagic:
	 184  			// Start of block.
	 185  			err := bz2.readBlock()
	 186  			if err != nil {
	 187  				return 0, err
	 188  			}
	 189  
	 190  		case bzip2FinalMagic:
	 191  			// Check end-of-file CRC.
	 192  			wantFileCRC := uint32(br.ReadBits64(32))
	 193  			if br.err != nil {
	 194  				return 0, br.err
	 195  			}
	 196  			if bz2.fileCRC != wantFileCRC {
	 197  				br.err = StructuralError("file checksum mismatch")
	 198  				return 0, br.err
	 199  			}
	 200  
	 201  			// Skip ahead to byte boundary.
	 202  			// Is there a file concatenated to this one?
	 203  			// It would start with BZ.
	 204  			if br.bits%8 != 0 {
	 205  				br.ReadBits(br.bits % 8)
	 206  			}
	 207  			b, err := br.r.ReadByte()
	 208  			if err == io.EOF {
	 209  				br.err = io.EOF
	 210  				bz2.eof = true
	 211  				return 0, io.EOF
	 212  			}
	 213  			if err != nil {
	 214  				br.err = err
	 215  				return 0, err
	 216  			}
	 217  			z, err := br.r.ReadByte()
	 218  			if err != nil {
	 219  				if err == io.EOF {
	 220  					err = io.ErrUnexpectedEOF
	 221  				}
	 222  				br.err = err
	 223  				return 0, err
	 224  			}
	 225  			if b != 'B' || z != 'Z' {
	 226  				return 0, StructuralError("bad magic value in continuation file")
	 227  			}
	 228  			if err := bz2.setup(false); err != nil {
	 229  				return 0, err
	 230  			}
	 231  		}
	 232  	}
	 233  }
	 234  
	 235  // readBlock reads a bzip2 block. The magic number should already have been consumed.
	 236  func (bz2 *reader) readBlock() (err error) {
	 237  	br := &bz2.br
	 238  	bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is.
	 239  	bz2.blockCRC = 0
	 240  	bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC
	 241  	randomized := br.ReadBits(1)
	 242  	if randomized != 0 {
	 243  		return StructuralError("deprecated randomized files")
	 244  	}
	 245  	origPtr := uint(br.ReadBits(24))
	 246  
	 247  	// If not every byte value is used in the block (i.e., it's text) then
	 248  	// the symbol set is reduced. The symbols used are stored as a
	 249  	// two-level, 16x16 bitmap.
	 250  	symbolRangeUsedBitmap := br.ReadBits(16)
	 251  	symbolPresent := make([]bool, 256)
	 252  	numSymbols := 0
	 253  	for symRange := uint(0); symRange < 16; symRange++ {
	 254  		if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 {
	 255  			bits := br.ReadBits(16)
	 256  			for symbol := uint(0); symbol < 16; symbol++ {
	 257  				if bits&(1<<(15-symbol)) != 0 {
	 258  					symbolPresent[16*symRange+symbol] = true
	 259  					numSymbols++
	 260  				}
	 261  			}
	 262  		}
	 263  	}
	 264  
	 265  	if numSymbols == 0 {
	 266  		// There must be an EOF symbol.
	 267  		return StructuralError("no symbols in input")
	 268  	}
	 269  
	 270  	// A block uses between two and six different Huffman trees.
	 271  	numHuffmanTrees := br.ReadBits(3)
	 272  	if numHuffmanTrees < 2 || numHuffmanTrees > 6 {
	 273  		return StructuralError("invalid number of Huffman trees")
	 274  	}
	 275  
	 276  	// The Huffman tree can switch every 50 symbols so there's a list of
	 277  	// tree indexes telling us which tree to use for each 50 symbol block.
	 278  	numSelectors := br.ReadBits(15)
	 279  	treeIndexes := make([]uint8, numSelectors)
	 280  
	 281  	// The tree indexes are move-to-front transformed and stored as unary
	 282  	// numbers.
	 283  	mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees)
	 284  	for i := range treeIndexes {
	 285  		c := 0
	 286  		for {
	 287  			inc := br.ReadBits(1)
	 288  			if inc == 0 {
	 289  				break
	 290  			}
	 291  			c++
	 292  		}
	 293  		if c >= numHuffmanTrees {
	 294  			return StructuralError("tree index too large")
	 295  		}
	 296  		treeIndexes[i] = mtfTreeDecoder.Decode(c)
	 297  	}
	 298  
	 299  	// The list of symbols for the move-to-front transform is taken from
	 300  	// the previously decoded symbol bitmap.
	 301  	symbols := make([]byte, numSymbols)
	 302  	nextSymbol := 0
	 303  	for i := 0; i < 256; i++ {
	 304  		if symbolPresent[i] {
	 305  			symbols[nextSymbol] = byte(i)
	 306  			nextSymbol++
	 307  		}
	 308  	}
	 309  	mtf := newMTFDecoder(symbols)
	 310  
	 311  	numSymbols += 2 // to account for RUNA and RUNB symbols
	 312  	huffmanTrees := make([]huffmanTree, numHuffmanTrees)
	 313  
	 314  	// Now we decode the arrays of code-lengths for each tree.
	 315  	lengths := make([]uint8, numSymbols)
	 316  	for i := range huffmanTrees {
	 317  		// The code lengths are delta encoded from a 5-bit base value.
	 318  		length := br.ReadBits(5)
	 319  		for j := range lengths {
	 320  			for {
	 321  				if length < 1 || length > 20 {
	 322  					return StructuralError("Huffman length out of range")
	 323  				}
	 324  				if !br.ReadBit() {
	 325  					break
	 326  				}
	 327  				if br.ReadBit() {
	 328  					length--
	 329  				} else {
	 330  					length++
	 331  				}
	 332  			}
	 333  			lengths[j] = uint8(length)
	 334  		}
	 335  		huffmanTrees[i], err = newHuffmanTree(lengths)
	 336  		if err != nil {
	 337  			return err
	 338  		}
	 339  	}
	 340  
	 341  	selectorIndex := 1 // the next tree index to use
	 342  	if len(treeIndexes) == 0 {
	 343  		return StructuralError("no tree selectors given")
	 344  	}
	 345  	if int(treeIndexes[0]) >= len(huffmanTrees) {
	 346  		return StructuralError("tree selector out of range")
	 347  	}
	 348  	currentHuffmanTree := huffmanTrees[treeIndexes[0]]
	 349  	bufIndex := 0 // indexes bz2.buf, the output buffer.
	 350  	// The output of the move-to-front transform is run-length encoded and
	 351  	// we merge the decoding into the Huffman parsing loop. These two
	 352  	// variables accumulate the repeat count. See the Wikipedia page for
	 353  	// details.
	 354  	repeat := 0
	 355  	repeatPower := 0
	 356  
	 357  	// The `C' array (used by the inverse BWT) needs to be zero initialized.
	 358  	for i := range bz2.c {
	 359  		bz2.c[i] = 0
	 360  	}
	 361  
	 362  	decoded := 0 // counts the number of symbols decoded by the current tree.
	 363  	for {
	 364  		if decoded == 50 {
	 365  			if selectorIndex >= numSelectors {
	 366  				return StructuralError("insufficient selector indices for number of symbols")
	 367  			}
	 368  			if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) {
	 369  				return StructuralError("tree selector out of range")
	 370  			}
	 371  			currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]]
	 372  			selectorIndex++
	 373  			decoded = 0
	 374  		}
	 375  
	 376  		v := currentHuffmanTree.Decode(br)
	 377  		decoded++
	 378  
	 379  		if v < 2 {
	 380  			// This is either the RUNA or RUNB symbol.
	 381  			if repeat == 0 {
	 382  				repeatPower = 1
	 383  			}
	 384  			repeat += repeatPower << v
	 385  			repeatPower <<= 1
	 386  
	 387  			// This limit of 2 million comes from the bzip2 source
	 388  			// code. It prevents repeat from overflowing.
	 389  			if repeat > 2*1024*1024 {
	 390  				return StructuralError("repeat count too large")
	 391  			}
	 392  			continue
	 393  		}
	 394  
	 395  		if repeat > 0 {
	 396  			// We have decoded a complete run-length so we need to
	 397  			// replicate the last output symbol.
	 398  			if repeat > bz2.blockSize-bufIndex {
	 399  				return StructuralError("repeats past end of block")
	 400  			}
	 401  			for i := 0; i < repeat; i++ {
	 402  				b := mtf.First()
	 403  				bz2.tt[bufIndex] = uint32(b)
	 404  				bz2.c[b]++
	 405  				bufIndex++
	 406  			}
	 407  			repeat = 0
	 408  		}
	 409  
	 410  		if int(v) == numSymbols-1 {
	 411  			// This is the EOF symbol. Because it's always at the
	 412  			// end of the move-to-front list, and never gets moved
	 413  			// to the front, it has this unique value.
	 414  			break
	 415  		}
	 416  
	 417  		// Since two metasymbols (RUNA and RUNB) have values 0 and 1,
	 418  		// one would expect |v-2| to be passed to the MTF decoder.
	 419  		// However, the front of the MTF list is never referenced as 0,
	 420  		// it's always referenced with a run-length of 1. Thus 0
	 421  		// doesn't need to be encoded and we have |v-1| in the next
	 422  		// line.
	 423  		b := mtf.Decode(int(v - 1))
	 424  		if bufIndex >= bz2.blockSize {
	 425  			return StructuralError("data exceeds block size")
	 426  		}
	 427  		bz2.tt[bufIndex] = uint32(b)
	 428  		bz2.c[b]++
	 429  		bufIndex++
	 430  	}
	 431  
	 432  	if origPtr >= uint(bufIndex) {
	 433  		return StructuralError("origPtr out of bounds")
	 434  	}
	 435  
	 436  	// We have completed the entropy decoding. Now we can perform the
	 437  	// inverse BWT and setup the RLE buffer.
	 438  	bz2.preRLE = bz2.tt[:bufIndex]
	 439  	bz2.preRLEUsed = 0
	 440  	bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:])
	 441  	bz2.lastByte = -1
	 442  	bz2.byteRepeats = 0
	 443  	bz2.repeats = 0
	 444  
	 445  	return nil
	 446  }
	 447  
	 448  // inverseBWT implements the inverse Burrows-Wheeler transform as described in
	 449  // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
	 450  // In that document, origPtr is called ``I'' and c is the ``C'' array after the
	 451  // first pass over the data. It's an argument here because we merge the first
	 452  // pass with the Huffman decoding.
	 453  //
	 454  // This also implements the ``single array'' method from the bzip2 source code
	 455  // which leaves the output, still shuffled, in the bottom 8 bits of tt with the
	 456  // index of the next byte in the top 24-bits. The index of the first byte is
	 457  // returned.
	 458  func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
	 459  	sum := uint(0)
	 460  	for i := 0; i < 256; i++ {
	 461  		sum += c[i]
	 462  		c[i] = sum - c[i]
	 463  	}
	 464  
	 465  	for i := range tt {
	 466  		b := tt[i] & 0xff
	 467  		tt[c[b]] |= uint32(i) << 8
	 468  		c[b]++
	 469  	}
	 470  
	 471  	return tt[origPtr] >> 8
	 472  }
	 473  
	 474  // This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed,
	 475  // causing the bits in the input to be processed in the reverse of the usual order.
	 476  
	 477  var crctab [256]uint32
	 478  
	 479  func init() {
	 480  	const poly = 0x04C11DB7
	 481  	for i := range crctab {
	 482  		crc := uint32(i) << 24
	 483  		for j := 0; j < 8; j++ {
	 484  			if crc&0x80000000 != 0 {
	 485  				crc = (crc << 1) ^ poly
	 486  			} else {
	 487  				crc <<= 1
	 488  			}
	 489  		}
	 490  		crctab[i] = crc
	 491  	}
	 492  }
	 493  
	 494  // updateCRC updates the crc value to incorporate the data in b.
	 495  // The initial value is 0.
	 496  func updateCRC(val uint32, b []byte) uint32 {
	 497  	crc := ^val
	 498  	for _, v := range b {
	 499  		crc = crctab[byte(crc>>24)^v] ^ (crc << 8)
	 500  	}
	 501  	return ^crc
	 502  }
	 503
View as plain text