diff --git a/src/compress/flate/deflate.go b/src/compress/flate/deflate.go index 6697f3a7913cd5..3819f2e1eae81d 100644 --- a/src/compress/flate/deflate.go +++ b/src/compress/flate/deflate.go @@ -27,132 +27,121 @@ const ( // RFC 1951 compliant. That is, any valid DEFLATE decompressor will // continue to be able to decompress this output. HuffmanOnly = -2 -) -const ( - logWindowSize = 15 - windowSize = 1 << logWindowSize - windowMask = windowSize - 1 - - // The LZ77 step produces a sequence of literal tokens and - // pair tokens. The offset is also known as distance. The underlying wire - // format limits the range of lengths and offsets. For example, there are - // 256 legitimate lengths: those in the range [3, 258]. This package's - // compressor uses a higher minimum match length, enabling optimizations - // such as finding matches via 32-bit loads and compares. - baseMatchLength = 3 // The smallest match length per the RFC section 3.2.5 - minMatchLength = 4 // The smallest match length that the compressor actually emits - maxMatchLength = 258 // The largest match length - baseMatchOffset = 1 // The smallest match offset - maxMatchOffset = 1 << 15 // The largest match offset - - // The maximum number of tokens we put into a single flate block, just to - // stop things from getting too large. - maxFlateBlockTokens = 1 << 14 + logWindowSize = 15 + windowSize = 1 << logWindowSize + windowMask = windowSize - 1 + minMatchLength = 4 // The smallest match that the compressor looks for + maxMatchLength = 258 // The longest match for the compressor + minOffsetSize = 1 // The shortest offset that makes any sense + + // The maximum number of tokens we will encode at the time. + // Smaller sizes usually creates less optimal blocks. + // Bigger can make context switching slow. + // We use this for levels 7-9, so we make it big. + maxFlateBlockTokens = 1 << 15 maxStoreBlockSize = 65535 hashBits = 17 // After 17 performance degrades hashSize = 1 << hashBits hashMask = (1 << hashBits) - 1 - maxHashOffset = 1 << 24 + maxHashOffset = 1 << 28 skipNever = math.MaxInt32 ) type compressionLevel struct { - level, good, lazy, nice, chain, fastSkipHashing int + good, lazy, nice, chain, level int } var levels = []compressionLevel{ - {0, 0, 0, 0, 0, 0}, // NoCompression. - {1, 0, 0, 0, 0, 0}, // BestSpeed uses a custom algorithm; see deflatefast.go. - // For levels 2-3 we don't bother trying with lazy matches. - {2, 4, 0, 16, 8, 5}, - {3, 4, 0, 32, 32, 6}, - // Levels 4-9 use increasingly more lazy matching + {}, // 0 + // Level 1-6 uses specialized algorithm - values not used + {0, 0, 0, 0, 1}, + {0, 0, 0, 0, 2}, + {0, 0, 0, 0, 3}, + {0, 0, 0, 0, 4}, + {0, 0, 0, 0, 5}, + {0, 0, 0, 0, 6}, + // Levels 7-9 use increasingly more lazy matching // and increasingly stringent conditions for "good enough". - {4, 4, 4, 16, 16, skipNever}, - {5, 8, 16, 32, 32, skipNever}, - {6, 8, 16, 128, 128, skipNever}, - {7, 8, 32, 128, 256, skipNever}, - {8, 32, 128, 258, 1024, skipNever}, - {9, 32, 258, 258, 4096, skipNever}, + {8, 12, 16, 24, 7}, + {16, 30, 40, 64, 8}, + {32, 258, 258, 1024, 9}, } -type compressor struct { - compressionLevel +// advancedState contains state for the advanced levels, with bigger hash tables, etc. +type advancedState struct { + // deflate state + length int + offset int + maxInsertIndex int + chainHead int + hashOffset int - w *huffmanBitWriter - bulkHasher func([]byte, []uint32) + ii uint16 // position of last match, intended to overflow to reset. 
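+ // ii counts iterations since the last match; deflateLazy resets it on a
+ // match and, once it exceeds the chain length, emits additional literals
+ // without searching (more the longer the run) to speed through data that
+ // is not compressing.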
- // compression algorithm - fill func(*compressor, []byte) int // copy data to window - step func(*compressor) // process window - bestSpeed *deflateFast // Encoder for BestSpeed + // input window: unprocessed data is window[index:windowEnd] + index int + hashMatch [maxMatchLength + minMatchLength]uint32 // Input hash chains // hashHead[hashValue] contains the largest inputIndex with the specified hash value // If hashHead[hashValue] is within the current window, then // hashPrev[hashHead[hashValue] & windowMask] contains the previous index // with the same hash value. - chainHead int - hashHead [hashSize]uint32 - hashPrev [windowSize]uint32 - hashOffset int + hashHead [hashSize]uint32 + hashPrev [windowSize]uint32 +} - // input window: unprocessed data is window[index:windowEnd] - index int - window []byte - windowEnd int - blockStart int // window index where current tokens start - byteAvailable bool // if true, still need to process window[index-1]. +type compressor struct { + compressionLevel - sync bool // requesting flush + h *huffmanEncoder + w *huffmanBitWriter - // queued output tokens - tokens []token + // compression algorithm + fill func(*compressor, []byte) int // copy data to window + step func(*compressor) // process window - // deflate state - length int - offset int - maxInsertIndex int - err error + window []byte + windowEnd int + blockStart int // window index where current tokens start + err error + + // queued output tokens + tokens tokens + fast fastEnc + state *advancedState - // hashMatch must be able to contain hashes for the maximum match length. - hashMatch [maxMatchLength - 1]uint32 + sync bool // requesting flush + byteAvailable bool // if true, still need to process window[index-1]. } func (d *compressor) fillDeflate(b []byte) int { - if d.index >= 2*windowSize-(minMatchLength+maxMatchLength) { + s := d.state + if s.index >= 2*windowSize-(minMatchLength+maxMatchLength) { // shift the window by windowSize - copy(d.window, d.window[windowSize:2*windowSize]) - d.index -= windowSize + //copy(d.window[:], d.window[windowSize:2*windowSize]) + *(*[windowSize]byte)(d.window) = *(*[windowSize]byte)(d.window[windowSize:]) + s.index -= windowSize d.windowEnd -= windowSize if d.blockStart >= windowSize { d.blockStart -= windowSize } else { d.blockStart = math.MaxInt32 } - d.hashOffset += windowSize - if d.hashOffset > maxHashOffset { - delta := d.hashOffset - 1 - d.hashOffset -= delta - d.chainHead -= delta - + s.hashOffset += windowSize + if s.hashOffset > maxHashOffset { + delta := s.hashOffset - 1 + s.hashOffset -= delta + s.chainHead -= delta // Iterate over slices instead of arrays to avoid copying // the entire table onto the stack (Issue #18625). 
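+ // Entries that would rebase to zero or below are stored as 0; since live
+ // positions always carry hashOffset (>= 1), a 0 entry later resolves to a
+ // negative index and is treated as "no previous match".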
- for i, v := range d.hashPrev[:] { - if int(v) > delta { - d.hashPrev[i] = uint32(int(v) - delta) - } else { - d.hashPrev[i] = 0 - } + for i, v := range s.hashPrev[:] { + s.hashPrev[i] = uint32(max(int(v)-delta, 0)) } - for i, v := range d.hashHead[:] { - if int(v) > delta { - d.hashHead[i] = uint32(int(v) - delta) - } else { - d.hashHead[i] = 0 - } + for i, v := range s.hashHead[:] { + s.hashHead[i] = uint32(max(int(v)-delta, 0)) } } } @@ -161,14 +150,38 @@ func (d *compressor) fillDeflate(b []byte) int { return n } -func (d *compressor) writeBlock(tokens []token, index int) error { - if index > 0 { +func (d *compressor) writeBlock(tok *tokens, index int, eof bool) error { + if index > 0 || eof { var window []byte if d.blockStart <= index { window = d.window[d.blockStart:index] } d.blockStart = index - d.w.writeBlock(tokens, false, window) + d.w.writeBlockDynamic(tok, eof, window, d.sync) + return d.w.err + } + return nil +} + +// writeBlockSkip writes the current block and uses the number of tokens +// to determine if the block should be stored on no matches, or +// only huffman encoded. +func (d *compressor) writeBlockSkip(tok *tokens, index int, eof bool) error { + if index > 0 || eof { + if d.blockStart <= index { + window := d.window[d.blockStart:index] + // If we removed less than a 64th of all literals + // we huffman compress the block. + if int(tok.n) > len(window)-int(tok.n>>6) { + d.w.writeBlockHuff(eof, window, d.sync) + } else { + // Write a dynamic huffman block. + d.w.writeBlockDynamic(tok, eof, window, d.sync) + } + } else { + d.w.writeBlock(tok, eof, nil) + } + d.blockStart = index return d.w.err } return nil @@ -177,103 +190,139 @@ func (d *compressor) writeBlock(tokens []token, index int) error { // fillWindow will fill the current window with the supplied // dictionary and calculate all hashes. // This is much faster than doing a full encode. -// Should only be used after a reset. +// Should only be used after a start/reset. func (d *compressor) fillWindow(b []byte) { - // Do not fill window if we are in store-only mode. - if d.compressionLevel.level < 2 { + // Do not fill window if we are in store-only or huffman mode. + if d.level <= 0 { return } - if d.index != 0 || d.windowEnd != 0 { - panic("internal error: fillWindow called with stale data") + if d.fast != nil { + // encode the last data, but discard the result + if len(b) > maxMatchOffset { + b = b[len(b)-maxMatchOffset:] + } + d.fast.Encode(&d.tokens, b) + d.tokens.Reset() + return } - + s := d.state // If we are given too much, cut it. if len(b) > windowSize { b = b[len(b)-windowSize:] } // Add all to window. - n := copy(d.window, b) + n := copy(d.window[d.windowEnd:], b) // Calculate 256 hashes at the time (more L1 cache hits) loops := (n + 256 - minMatchLength) / 256 - for j := 0; j < loops; j++ { - index := j * 256 - end := index + 256 + minMatchLength - 1 - if end > n { - end = n - } - toCheck := d.window[index:end] - dstSize := len(toCheck) - minMatchLength + 1 + for j := range loops { + startindex := j * 256 + end := min(startindex+256+minMatchLength-1, n) + tocheck := d.window[startindex:end] + dstSize := len(tocheck) - minMatchLength + 1 if dstSize <= 0 { continue } - dst := d.hashMatch[:dstSize] - d.bulkHasher(toCheck, dst) + dst := s.hashMatch[:dstSize] + bulkHash4(tocheck, dst) + var newH uint32 for i, val := range dst { - di := i + index - hh := &d.hashHead[val&hashMask] + di := i + startindex + newH = val & hashMask // Get previous value with the same hash. 
// Our chain should point to the previous value. - d.hashPrev[di&windowMask] = *hh + s.hashPrev[di&windowMask] = s.hashHead[newH] // Set the head of the hash chain to us. - *hh = uint32(di + d.hashOffset) + s.hashHead[newH] = uint32(di + s.hashOffset) } } // Update window information. - d.windowEnd = n - d.index = n + d.windowEnd += n + s.index = n } // Try to find a match starting at index whose length is greater than prevSize. // We only look at chainCount possibilities before giving up. -func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) { - minMatchLook := maxMatchLength - if lookahead < minMatchLook { - minMatchLook = lookahead - } +func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, offset int, ok bool) { + minMatchLook := min(lookahead, maxMatchLength) win := d.window[0 : pos+minMatchLook] // We quit when we get a match that's at least nice long - nice := len(win) - pos - if d.nice < nice { - nice = d.nice - } + nice := min(d.nice, len(win)-pos) // If we've got a match that's good enough, only look in 1/4 the chain. tries := d.chain - length = prevLength - if length >= d.good { - tries >>= 2 - } + length = minMatchLength - 1 wEnd := win[pos+length] wPos := win[pos:] - minIndex := pos - windowSize + minIndex := max(pos-windowSize, 0) + offset = 0 + + if d.chain < 100 { + for i := prevHead; tries > 0; tries-- { + if wEnd == win[i+length] { + n := matchLen(win[i:i+minMatchLook], wPos) + if n > length { + length = n + offset = pos - i + ok = true + if n >= nice { + // The match is good enough that we don't try to find a better one. + break + } + wEnd = win[pos+n] + } + } + if i <= minIndex { + // hashPrev[i & windowMask] has already been overwritten, so stop now. + break + } + i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset + if i < minIndex { + break + } + } + return + } + + // Minimum gain to accept a match. + cGain := 4 + + // Some like it higher (CSV), some like it lower (JSON) + const baseCost = 3 + // Base is 4 bytes at with an additional cost. + // Matches must be better than this. for i := prevHead; tries > 0; tries-- { if wEnd == win[i+length] { - n := matchLen(win[i:], wPos, minMatchLook) - - if n > length && (n > minMatchLength || pos-i <= 4096) { - length = n - offset = pos - i - ok = true - if n >= nice { - // The match is good enough that we don't try to find a better one. - break + n := matchLen(win[i:i+minMatchLook], wPos) + if n > length { + // Calculate gain. Estimates the gains of the new match compared to emitting as literals. + newGain := d.h.bitLengthRaw(wPos[:n]) - int(offsetExtraBits[offsetCode(uint32(pos-i))]) - baseCost - int(lengthExtraBits[lengthCodes[(n-3)&255]]) + + if newGain > cGain { + length = n + offset = pos - i + cGain = newGain + ok = true + if n >= nice { + // The match is good enough that we don't try to find a better one. + break + } + wEnd = win[pos+n] } - wEnd = win[pos+n] } } - if i == minIndex { + if i <= minIndex { // hashPrev[i & windowMask] has already been overwritten, so stop now. break } - i = int(d.hashPrev[i&windowMask]) - d.hashOffset - if i < minIndex || i < 0 { + i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset + if i < minIndex { break } } @@ -288,235 +337,272 @@ func (d *compressor) writeStoredBlock(buf []byte) error { return d.w.err } -const hashmul = 0x1e35a7bd - // hash4 returns a hash representation of the first 4 bytes // of the supplied slice. // The caller must ensure that len(b) >= 4. 
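+// The hash itself is multiplicative (Fibonacci-style): the four bytes are
+// loaded as a little-endian uint32, multiplied by prime4bytes, and only the
+// top hashBits bits of the product are kept.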
func hash4(b []byte) uint32 { - return ((uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24) * hashmul) >> (32 - hashBits) + return hash4u(loadLE32(b, 0), hashBits) +} + +// hash4 returns the hash of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <32. +func hash4u(u uint32, h uint8) uint32 { + return (u * prime4bytes) >> (32 - h) } // bulkHash4 will compute hashes using the same -// algorithm as hash4. +// algorithm as hash4 func bulkHash4(b []byte, dst []uint32) { - if len(b) < minMatchLength { + if len(b) < 4 { return } - hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24 - dst[0] = (hb * hashmul) >> (32 - hashBits) - end := len(b) - minMatchLength + 1 - for i := 1; i < end; i++ { - hb = (hb << 8) | uint32(b[i+3]) - dst[i] = (hb * hashmul) >> (32 - hashBits) - } -} - -// matchLen returns the number of matching bytes in a and b -// up to length 'max'. Both slices must be at least 'max' -// bytes in size. -func matchLen(a, b []byte, max int) int { - a = a[:max] - b = b[:len(a)] - for i, av := range a { - if b[i] != av { - return i - } - } - return max -} - -// encSpeed will compress and store the currently added data, -// if enough has been accumulated or we at the end of the stream. -// Any error that occurred will be in d.err -func (d *compressor) encSpeed() { - // We only compress if we have maxStoreBlockSize. - if d.windowEnd < maxStoreBlockSize { - if !d.sync { - return - } - - // Handle small sizes. - if d.windowEnd < 128 { - switch { - case d.windowEnd == 0: - return - case d.windowEnd <= 16: - d.err = d.writeStoredBlock(d.window[:d.windowEnd]) - default: - d.w.writeBlockHuff(false, d.window[:d.windowEnd]) - d.err = d.w.err - } - d.windowEnd = 0 - d.bestSpeed.reset() - return - } - - } - // Encode the block. - d.tokens = d.bestSpeed.encode(d.tokens[:0], d.window[:d.windowEnd]) + hb := loadLE32(b, 0) - // If we removed less than 1/16th, Huffman compress the block. - if len(d.tokens) > d.windowEnd-(d.windowEnd>>4) { - d.w.writeBlockHuff(false, d.window[:d.windowEnd]) - } else { - d.w.writeBlockDynamic(d.tokens, false, d.window[:d.windowEnd]) + dst[0] = hash4u(hb, hashBits) + end := len(b) - 4 + 1 + for i := 1; i < end; i++ { + hb = (hb >> 8) | uint32(b[i+3])<<24 + dst[i] = hash4u(hb, hashBits) } - d.err = d.w.err - d.windowEnd = 0 } func (d *compressor) initDeflate() { d.window = make([]byte, 2*windowSize) - d.hashOffset = 1 - d.tokens = make([]token, 0, maxFlateBlockTokens+1) - d.length = minMatchLength - 1 - d.offset = 0 d.byteAvailable = false - d.index = 0 - d.chainHead = -1 - d.bulkHasher = bulkHash4 + d.err = nil + if d.state == nil { + return + } + s := d.state + s.index = 0 + s.hashOffset = 1 + s.length = minMatchLength - 1 + s.offset = 0 + s.chainHead = -1 } -func (d *compressor) deflate() { - if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync { +// deflateLazy does encoding with lazy matching. +func (d *compressor) deflateLazy() { + s := d.state + + if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync { return } + if d.windowEnd != s.index && d.chain > 100 { + // Get literal huffman coder. + // This is used to estimate the cost of emitting a literal. 
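+ // The byte histogram built below is turned into Huffman code lengths so
+ // that findMatch can estimate, via d.h.bitLengthRaw, how many bits a span
+ // would cost as plain literals and only accept a match whose saving beats
+ // the current best gain.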
+ if d.h == nil { + d.h = newHuffmanEncoder(maxFlateBlockTokens) + } + var tmp [256]uint16 + for _, v := range d.window[s.index:d.windowEnd] { + tmp[v]++ + } + d.h.generate(tmp[:], 15) + } - d.maxInsertIndex = d.windowEnd - (minMatchLength - 1) + s.maxInsertIndex = d.windowEnd - (minMatchLength - 1) -Loop: for { - if d.index > d.windowEnd { - panic("index > windowEnd") - } - lookahead := d.windowEnd - d.index + lookahead := d.windowEnd - s.index if lookahead < minMatchLength+maxMatchLength { if !d.sync { - break Loop - } - if d.index > d.windowEnd { - panic("index > windowEnd") + return } if lookahead == 0 { // Flush current output block if any. if d.byteAvailable { // There is still one pending token that needs to be flushed - d.tokens = append(d.tokens, literalToken(uint32(d.window[d.index-1]))) + d.tokens.AddLiteral(d.window[s.index-1]) d.byteAvailable = false } - if len(d.tokens) > 0 { - if d.err = d.writeBlock(d.tokens, d.index); d.err != nil { + if d.tokens.n > 0 { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { return } - d.tokens = d.tokens[:0] + d.tokens.Reset() } - break Loop + return } } - if d.index < d.maxInsertIndex { + if s.index < s.maxInsertIndex { // Update the hash - hash := hash4(d.window[d.index : d.index+minMatchLength]) - hh := &d.hashHead[hash&hashMask] - d.chainHead = int(*hh) - d.hashPrev[d.index&windowMask] = uint32(d.chainHead) - *hh = uint32(d.index + d.hashOffset) + hash := hash4(d.window[s.index:]) + ch := s.hashHead[hash] + s.chainHead = int(ch) + s.hashPrev[s.index&windowMask] = ch + s.hashHead[hash] = uint32(s.index + s.hashOffset) } - prevLength := d.length - prevOffset := d.offset - d.length = minMatchLength - 1 - d.offset = 0 - minIndex := d.index - windowSize - if minIndex < 0 { - minIndex = 0 + prevLength := s.length + prevOffset := s.offset + s.length = minMatchLength - 1 + s.offset = 0 + minIndex := max(s.index-windowSize, 0) + + if s.chainHead-s.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy { + if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, lookahead); ok { + s.length = newLength + s.offset = newOffset + } } - if d.chainHead-d.hashOffset >= minIndex && - (d.fastSkipHashing != skipNever && lookahead > minMatchLength-1 || - d.fastSkipHashing == skipNever && lookahead > prevLength && prevLength < d.lazy) { - if newLength, newOffset, ok := d.findMatch(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok { - d.length = newLength - d.offset = newOffset + if prevLength >= minMatchLength && s.length <= prevLength { + // No better match, but check for better match at end... + // + // Skip forward a number of bytes. + // Offset of 2 seems to yield the best results. 3 is sometimes better. + const checkOff = 2 + + // Check all, except full length + if prevLength < maxMatchLength-checkOff { + prevIndex := s.index - 1 + if prevIndex+prevLength < s.maxInsertIndex { + end := min(lookahead, maxMatchLength+checkOff) + end += prevIndex + + // Hash at match end. + h := hash4(d.window[prevIndex+prevLength:]) + ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength + if prevIndex-ch2 != prevOffset && ch2 > minIndex+checkOff { + length := matchLen(d.window[prevIndex+checkOff:end], d.window[ch2+checkOff:]) + // It seems like a pure length metric is best. + if length > prevLength { + prevLength = length + prevOffset = prevIndex - ch2 + + // Extend back... 
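+ // The candidate was matched starting checkOff bytes in, so the first
+ // checkOff bytes are still unverified: walk them backwards, growing the
+ // match by one byte per position that agrees and emitting the rest as
+ // literals ("tokens we owe") before taking the match.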
+ for i := checkOff - 1; i >= 0; i-- { + if prevLength >= maxMatchLength || d.window[prevIndex+i] != d.window[ch2+i] { + // Emit tokens we "owe" + for j := 0; j <= i; j++ { + d.tokens.AddLiteral(d.window[prevIndex+j]) + if d.tokens.n == maxFlateBlockTokens { + // The block includes the current character + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { + return + } + d.tokens.Reset() + } + s.index++ + if s.index < s.maxInsertIndex { + h := hash4(d.window[s.index:]) + ch := s.hashHead[h] + s.chainHead = int(ch) + s.hashPrev[s.index&windowMask] = ch + s.hashHead[h] = uint32(s.index + s.hashOffset) + } + } + break + } else { + prevLength++ + } + } + } + } + } } - } - if d.fastSkipHashing != skipNever && d.length >= minMatchLength || - d.fastSkipHashing == skipNever && prevLength >= minMatchLength && d.length <= prevLength { // There was a match at the previous step, and the current match is // not better. Output the previous match. - if d.fastSkipHashing != skipNever { - d.tokens = append(d.tokens, matchToken(uint32(d.length-baseMatchLength), uint32(d.offset-baseMatchOffset))) - } else { - d.tokens = append(d.tokens, matchToken(uint32(prevLength-baseMatchLength), uint32(prevOffset-baseMatchOffset))) - } + d.tokens.AddMatch(uint32(prevLength-3), uint32(prevOffset-minOffsetSize)) + // Insert in the hash table all strings up to the end of the match. // index and index-1 are already inserted. If there is not enough // lookahead, the last two strings are not inserted into the hash // table. - if d.length <= d.fastSkipHashing { - var newIndex int - if d.fastSkipHashing != skipNever { - newIndex = d.index + d.length - } else { - newIndex = d.index + prevLength - 1 - } - index := d.index - for index++; index < newIndex; index++ { - if index < d.maxInsertIndex { - hash := hash4(d.window[index : index+minMatchLength]) - // Get previous value with the same hash. - // Our chain should point to the previous value. - hh := &d.hashHead[hash&hashMask] - d.hashPrev[index&windowMask] = *hh - // Set the head of the hash chain to us. - *hh = uint32(index + d.hashOffset) - } + newIndex := s.index + prevLength - 1 + // Calculate missing hashes + end := min(newIndex, s.maxInsertIndex) + end += minMatchLength - 1 + startindex := min(s.index+1, s.maxInsertIndex) + tocheck := d.window[startindex:end] + dstSize := len(tocheck) - minMatchLength + 1 + if dstSize > 0 { + dst := s.hashMatch[:dstSize] + bulkHash4(tocheck, dst) + var newH uint32 + for i, val := range dst { + di := i + startindex + newH = val & hashMask + // Get previous value with the same hash. + // Our chain should point to the previous value. + s.hashPrev[di&windowMask] = s.hashHead[newH] + // Set the head of the hash chain to us. + s.hashHead[newH] = uint32(di + s.hashOffset) } - d.index = index - - if d.fastSkipHashing == skipNever { - d.byteAvailable = false - d.length = minMatchLength - 1 - } - } else { - // For matches this long, we don't bother inserting each individual - // item into the table. 
- d.index += d.length } - if len(d.tokens) == maxFlateBlockTokens { + + s.index = newIndex + d.byteAvailable = false + s.length = minMatchLength - 1 + if d.tokens.n == maxFlateBlockTokens { // The block includes the current character - if d.err = d.writeBlock(d.tokens, d.index); d.err != nil { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { return } - d.tokens = d.tokens[:0] + d.tokens.Reset() } + s.ii = 0 } else { - if d.fastSkipHashing != skipNever || d.byteAvailable { - i := d.index - 1 - if d.fastSkipHashing != skipNever { - i = d.index - } - d.tokens = append(d.tokens, literalToken(uint32(d.window[i]))) - if len(d.tokens) == maxFlateBlockTokens { - if d.err = d.writeBlock(d.tokens, i+1); d.err != nil { + // Reset, if we got a match this run. + if s.length >= minMatchLength { + s.ii = 0 + } + // We have a byte waiting. Emit it. + if d.byteAvailable { + s.ii++ + d.tokens.AddLiteral(d.window[s.index-1]) + if d.tokens.n == maxFlateBlockTokens { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { return } - d.tokens = d.tokens[:0] + d.tokens.Reset() } - } - d.index++ - if d.fastSkipHashing == skipNever { + s.index++ + + // If we have a long run of no matches, skip additional bytes + // Resets when s.ii overflows after 64KB. + if n := int(s.ii) - d.chain; n > 0 { + n = 1 + int(n>>6) + for j := 0; j < n; j++ { + if s.index >= d.windowEnd-1 { + break + } + d.tokens.AddLiteral(d.window[s.index-1]) + if d.tokens.n == maxFlateBlockTokens { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { + return + } + d.tokens.Reset() + } + // Index... + if s.index < s.maxInsertIndex { + h := hash4(d.window[s.index:]) + ch := s.hashHead[h] + s.chainHead = int(ch) + s.hashPrev[s.index&windowMask] = ch + s.hashHead[h] = uint32(s.index + s.hashOffset) + } + s.index++ + } + // Flush last byte + d.tokens.AddLiteral(d.window[s.index-1]) + d.byteAvailable = false + // s.length = minMatchLength - 1 // not needed, since s.ii is reset above, so it should never be > minMatchLength + if d.tokens.n == maxFlateBlockTokens { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { + return + } + d.tokens.Reset() + } + } + } else { + s.index++ d.byteAvailable = true } } } } -func (d *compressor) fillStore(b []byte) int { - n := copy(d.window[d.windowEnd:], b) - d.windowEnd += n - return n -} - func (d *compressor) store() { if d.windowEnd > 0 && (d.windowEnd == maxStoreBlockSize || d.sync) { d.err = d.writeStoredBlock(d.window[:d.windowEnd]) @@ -524,38 +610,93 @@ func (d *compressor) store() { } } -// storeHuff compresses and stores the currently added data -// when the d.window is full or we are at the end of the stream. +// fillWindow will fill the buffer with data for huffman-only compression. +// The number of bytes copied is returned. +func (d *compressor) fillBlock(b []byte) int { + n := copy(d.window[d.windowEnd:], b) + d.windowEnd += n + return n +} + +// storeHuff will compress and store the currently added data, +// if enough has been accumulated or we at the end of the stream. // Any error that occurred will be in d.err func (d *compressor) storeHuff() { if d.windowEnd < len(d.window) && !d.sync || d.windowEnd == 0 { return } - d.w.writeBlockHuff(false, d.window[:d.windowEnd]) + d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync) d.err = d.w.err d.windowEnd = 0 } +// storeFast will compress and store the currently added data, +// if enough has been accumulated or we at the end of the stream. 
+// Any error that occurred will be in d.err +func (d *compressor) storeFast() { + // We only compress if we have maxStoreBlockSize. + if d.windowEnd < len(d.window) { + if !d.sync { + return + } + // Handle extremely small sizes. + if d.windowEnd < 128 { + if d.windowEnd == 0 { + return + } + if d.windowEnd <= 32 { + d.err = d.writeStoredBlock(d.window[:d.windowEnd]) + } else { + d.w.writeBlockHuff(false, d.window[:d.windowEnd], true) + d.err = d.w.err + } + d.tokens.Reset() + d.windowEnd = 0 + d.fast.Reset() + return + } + } + + d.fast.Encode(&d.tokens, d.window[:d.windowEnd]) + // If we made zero matches, store the block as is. + if d.tokens.n == 0 { + d.err = d.writeStoredBlock(d.window[:d.windowEnd]) + // If we removed less than 1/16th, huffman compress the block. + } else if int(d.tokens.n) > d.windowEnd-(d.windowEnd>>4) { + d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync) + d.err = d.w.err + } else { + d.w.writeBlockDynamic(&d.tokens, false, d.window[:d.windowEnd], d.sync) + d.err = d.w.err + } + d.tokens.Reset() + d.windowEnd = 0 +} + +// write will add input byte to the stream. +// Unless an error occurs all bytes will be consumed. func (d *compressor) write(b []byte) (n int, err error) { if d.err != nil { return 0, d.err } n = len(b) for len(b) > 0 { - d.step(d) + if d.windowEnd == len(d.window) || d.sync { + d.step(d) + } b = b[d.fill(d, b):] if d.err != nil { return 0, d.err } } - return n, nil + return n, d.err } func (d *compressor) syncFlush() error { + d.sync = true if d.err != nil { return d.err } - d.sync = true d.step(d) if d.err == nil { d.w.writeStoredHeader(0, false) @@ -572,30 +713,33 @@ func (d *compressor) init(w io.Writer, level int) (err error) { switch { case level == NoCompression: d.window = make([]byte, maxStoreBlockSize) - d.fill = (*compressor).fillStore + d.fill = (*compressor).fillBlock d.step = (*compressor).store case level == HuffmanOnly: - d.window = make([]byte, maxStoreBlockSize) - d.fill = (*compressor).fillStore + d.w.logNewTablePenalty = 10 + d.window = make([]byte, 32<<10) + d.fill = (*compressor).fillBlock d.step = (*compressor).storeHuff - case level == BestSpeed: - d.compressionLevel = levels[level] - d.window = make([]byte, maxStoreBlockSize) - d.fill = (*compressor).fillStore - d.step = (*compressor).encSpeed - d.bestSpeed = newDeflateFast() - d.tokens = make([]token, maxStoreBlockSize) case level == DefaultCompression: level = 6 fallthrough - case 2 <= level && level <= 9: + case level >= 1 && level <= 6: + d.w.logNewTablePenalty = 7 + d.fast = newFastEnc(level) + d.window = make([]byte, maxStoreBlockSize) + d.fill = (*compressor).fillBlock + d.step = (*compressor).storeFast + case 7 <= level && level <= 9: + d.w.logNewTablePenalty = 8 + d.state = &advancedState{} d.compressionLevel = levels[level] d.initDeflate() d.fill = (*compressor).fillDeflate - d.step = (*compressor).deflate + d.step = (*compressor).deflateLazy default: return fmt.Errorf("flate: invalid compression level %d: want value in range [-2, 9]", level) } + d.level = level return nil } @@ -603,27 +747,39 @@ func (d *compressor) reset(w io.Writer) { d.w.reset(w) d.sync = false d.err = nil - switch d.compressionLevel.level { - case NoCompression: + // We only need to reset a few things for Snappy. + if d.fast != nil { + d.fast.Reset() d.windowEnd = 0 - case BestSpeed: + d.tokens.Reset() + return + } + switch d.compressionLevel.chain { + case 0: + // level was NoCompression or ConstantCompression. 
d.windowEnd = 0 - d.tokens = d.tokens[:0] - d.bestSpeed.reset() default: - d.chainHead = -1 - clear(d.hashHead[:]) - clear(d.hashPrev[:]) - d.hashOffset = 1 - d.index, d.windowEnd = 0, 0 + s := d.state + s.chainHead = -1 + for i := range s.hashHead { + s.hashHead[i] = 0 + } + for i := range s.hashPrev { + s.hashPrev[i] = 0 + } + s.hashOffset = 1 + s.index, d.windowEnd = 0, 0 d.blockStart, d.byteAvailable = 0, false - d.tokens = d.tokens[:0] - d.length = minMatchLength - 1 - d.offset = 0 - d.maxInsertIndex = 0 + d.tokens.Reset() + s.length = minMatchLength - 1 + s.offset = 0 + s.ii = 0 + s.maxInsertIndex = 0 } } +var errWriterClosed = errors.New("flate: closed writer") + func (d *compressor) close() error { if d.err == errWriterClosed { return nil @@ -644,6 +800,7 @@ func (d *compressor) close() error { return d.w.err } d.err = errWriterClosed + d.w.reset(nil) return nil } @@ -674,26 +831,15 @@ func NewWriter(w io.Writer, level int) (*Writer, error) { // can only be decompressed by a reader initialized with the // same dictionary (see [NewReaderDict]). func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error) { - dw := &dictWriter{w} - zw, err := NewWriter(dw, level) + zw, err := NewWriter(w, level) if err != nil { return nil, err } zw.d.fillWindow(dict) zw.dict = append(zw.dict, dict...) // duplicate dictionary for Reset method. - return zw, nil -} - -type dictWriter struct { - w io.Writer + return zw, err } -func (w *dictWriter) Write(b []byte) (n int, err error) { - return w.w.Write(b) -} - -var errWriterClosed = errors.New("flate: closed writer") - // A Writer takes data written to it and writes the compressed // form of that data to an underlying writer (see [NewWriter]). type Writer struct { @@ -728,16 +874,26 @@ func (w *Writer) Close() error { } // Reset discards the writer's state and makes it equivalent to -// the result of [NewWriter] or [NewWriterDict] called with dst +// the result of NewWriter or NewWriterDict called with dst // and w's level and dictionary. func (w *Writer) Reset(dst io.Writer) { - if dw, ok := w.d.w.writer.(*dictWriter); ok { + if len(w.dict) > 0 { // w was created with NewWriterDict - dw.w = dst - w.d.reset(dw) - w.d.fillWindow(w.dict) + w.d.reset(dst) + if dst != nil { + w.d.fillWindow(w.dict) + } } else { // w was created with NewWriter w.d.reset(dst) } } + +// ResetDict discards the writer's state and makes it equivalent to +// the result of NewWriter or NewWriterDict called with dst +// and w's level, but sets a specific dictionary. 
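+//
+// A minimal usage sketch (dst, dict and data stand in for the caller's
+// destination writer, preset dictionary and input):
+//
+//	zw, _ := NewWriter(nil, BestCompression)
+//	zw.ResetDict(dst, dict)
+//	zw.Write(data)
+//	zw.Close()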
+func (w *Writer) ResetDict(dst io.Writer, dict []byte) { + w.dict = dict + w.d.reset(dst) + w.d.fillWindow(w.dict) +} diff --git a/src/compress/flate/deflate_test.go b/src/compress/flate/deflate_test.go index 3610c7bf8763df..4bb89c61dcad0c 100644 --- a/src/compress/flate/deflate_test.go +++ b/src/compress/flate/deflate_test.go @@ -6,14 +6,11 @@ package flate import ( "bytes" - "errors" "fmt" - "internal/testenv" "io" - "math/rand" "os" "reflect" - "runtime/debug" + "strings" "sync" "testing" ) @@ -35,24 +32,24 @@ type reverseBitsTest struct { } var deflateTests = []*deflateTest{ - {[]byte{}, 0, []byte{1, 0, 0, 255, 255}}, - {[]byte{0x11}, -1, []byte{18, 4, 4, 0, 0, 255, 255}}, - {[]byte{0x11}, DefaultCompression, []byte{18, 4, 4, 0, 0, 255, 255}}, - {[]byte{0x11}, 4, []byte{18, 4, 4, 0, 0, 255, 255}}, - - {[]byte{0x11}, 0, []byte{0, 1, 0, 254, 255, 17, 1, 0, 0, 255, 255}}, - {[]byte{0x11, 0x12}, 0, []byte{0, 2, 0, 253, 255, 17, 18, 1, 0, 0, 255, 255}}, - {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 0, - []byte{0, 8, 0, 247, 255, 17, 17, 17, 17, 17, 17, 17, 17, 1, 0, 0, 255, 255}, + 0: {[]byte{}, 0, []byte{0x3, 0x0}}, + 1: {[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}}, + 2: {[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}}, + 3: {[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}}, + + 4: {[]byte{0x11}, 0, []byte{0x0, 0x1, 0x0, 0xfe, 0xff, 0x11, 0x3, 0x0}}, + 5: {[]byte{0x11, 0x12}, 0, []byte{0x0, 0x2, 0x0, 0xfd, 0xff, 0x11, 0x12, 0x3, 0x0}}, + 6: {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 0, + []byte{0x0, 0x8, 0x0, 0xf7, 0xff, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x3, 0x0}, }, - {[]byte{}, 2, []byte{1, 0, 0, 255, 255}}, - {[]byte{0x11}, 2, []byte{18, 4, 4, 0, 0, 255, 255}}, - {[]byte{0x11, 0x12}, 2, []byte{18, 20, 2, 4, 0, 0, 255, 255}}, - {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 2, []byte{18, 132, 2, 64, 0, 0, 0, 255, 255}}, - {[]byte{}, 9, []byte{1, 0, 0, 255, 255}}, - {[]byte{0x11}, 9, []byte{18, 4, 4, 0, 0, 255, 255}}, - {[]byte{0x11, 0x12}, 9, []byte{18, 20, 2, 4, 0, 0, 255, 255}}, - {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 9, []byte{18, 132, 2, 64, 0, 0, 0, 255, 255}}, + 7: {[]byte{}, 1, []byte{0x3, 0x0}}, + 8: {[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}}, + 9: {[]byte{0x11, 0x12}, BestCompression, []byte{0x12, 0x14, 0x2, 0xc, 0x0}}, + 10: {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, BestCompression, []byte{0x12, 0x84, 0x1, 0xc0, 0x0}}, + 11: {[]byte{}, 9, []byte{0x3, 0x0}}, + 12: {[]byte{0x11}, 9, []byte{0x12, 0x4, 0xc, 0x0}}, + 13: {[]byte{0x11, 0x12}, 9, []byte{0x12, 0x14, 0x2, 0xc, 0x0}}, + 14: {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 9, []byte{0x12, 0x84, 0x1, 0xc0, 0x0}}, } var deflateInflateTests = []*deflateInflateTest{ @@ -86,23 +83,24 @@ func largeDataChunk() []byte { func TestBulkHash4(t *testing.T) { for _, x := range deflateTests { y := x.out - if len(y) < minMatchLength { - continue - } - y = append(y, y...) - for j := 4; j < len(y); j++ { - y := y[:j] - dst := make([]uint32, len(y)-minMatchLength+1) - for i := range dst { - dst[i] = uint32(i + 100) - } - bulkHash4(y, dst) - for i, got := range dst { - want := hash4(y[i:]) - if got != want && got == uint32(i)+100 { - t.Errorf("Len:%d Index:%d, want 0x%08x but not modified", len(y), i, want) - } else if got != want { - t.Errorf("Len:%d Index:%d, got 0x%08x want:0x%08x", len(y), i, got, want) + if len(y) >= minMatchLength { + y = append(y, y...) 
+ for j := 4; j < len(y); j++ { + y := y[:j] + dst := make([]uint32, len(y)-minMatchLength+1) + for i := range dst { + dst[i] = uint32(i + 100) + } + bulkHash4(y, dst) + for i, got := range dst { + want := hash4(y[i:]) + if got != want && got == uint32(i)+100 { + t.Errorf("Len:%d Index:%d, expected 0x%08x but not modified", len(y), i, want) + } else if got != want { + t.Errorf("Len:%d Index:%d, got 0x%08x expected:0x%08x", len(y), i, got, want) + } else { + //t.Logf("Len:%d Index:%d OK (0x%08x)", len(y), i, got) + } } } } @@ -110,7 +108,7 @@ func TestBulkHash4(t *testing.T) { } func TestDeflate(t *testing.T) { - for _, h := range deflateTests { + for i, h := range deflateTests { var buf bytes.Buffer w, err := NewWriter(&buf, h.level) if err != nil { @@ -120,45 +118,11 @@ func TestDeflate(t *testing.T) { w.Write(h.in) w.Close() if !bytes.Equal(buf.Bytes(), h.out) { - t.Errorf("Deflate(%d, %x) = \n%#v, want \n%#v", h.level, h.in, buf.Bytes(), h.out) + t.Errorf("%d: Deflate(%d, %x) got \n%#v, want \n%#v", i, h.level, h.in, buf.Bytes(), h.out) } } } -func TestWriterClose(t *testing.T) { - b := new(bytes.Buffer) - zw, err := NewWriter(b, 6) - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - - if c, err := zw.Write([]byte("Test")); err != nil || c != 4 { - t.Fatalf("Write to not closed writer: %s, %d", err, c) - } - - if err := zw.Close(); err != nil { - t.Fatalf("Close: %v", err) - } - - afterClose := b.Len() - - if c, err := zw.Write([]byte("Test")); err == nil || c != 0 { - t.Fatalf("Write to closed writer: %v, %d", err, c) - } - - if err := zw.Flush(); err == nil { - t.Fatalf("Flush to closed writer: %s", err) - } - - if err := zw.Close(); err != nil { - t.Fatalf("Close: %v", err) - } - - if afterClose != b.Len() { - t.Fatalf("Writer wrote data after close. After close: %d. After writes on closed stream: %d", afterClose, b.Len()) - } -} - // A sparseReader returns a stream consisting of 0s followed by 1<<16 1s. // This tests missing hash references in a very large input. type sparseReader struct { @@ -191,7 +155,8 @@ func TestVeryLongSparseChunk(t *testing.T) { if testing.Short() { t.Skip("skipping sparse chunk during short test") } - w, err := NewWriter(io.Discard, 1) + var buf bytes.Buffer + w, err := NewWriter(&buf, 1) if err != nil { t.Errorf("NewWriter: %v", err) return @@ -200,6 +165,7 @@ func TestVeryLongSparseChunk(t *testing.T) { t.Errorf("Compress failed: %v", err) return } + t.Log("Length:", buf.Len()) } type syncBuffer struct { @@ -270,7 +236,7 @@ func testSync(t *testing.T, level int, input []byte, name string) { r := NewReader(buf) // Write half the input and read back. 
- for i := 0; i < 2; i++ { + for i := range 2 { var lo, hi int if i == 0 { lo, hi = 0, (len(input)+1)/2 @@ -348,13 +314,13 @@ func testToFromWithLevelAndLimit(t *testing.T, level int, input []byte, name str } w.Write(input) w.Close() + if limit > 0 { + t.Logf("level: %d - Size:%.2f%%, %d b\n", level, float64(buffer.Len()*100)/float64(limit), buffer.Len()) + } if limit > 0 && buffer.Len() > limit { t.Errorf("level: %d, len(compress(data)) = %d > limit = %d", level, buffer.Len(), limit) - return - } - if limit > 0 { - t.Logf("level: %d, size:%.2f%%, %d b\n", level, float64(buffer.Len()*100)/float64(limit), buffer.Len()) } + r := NewReader(&buffer) out, err := io.ReadAll(r) if err != nil { @@ -363,6 +329,8 @@ func testToFromWithLevelAndLimit(t *testing.T, level int, input []byte, name str } r.Close() if !bytes.Equal(input, out) { + os.WriteFile("testdata/fails/"+t.Name()+".got", out, os.ModePerm) + os.WriteFile("testdata/fails/"+t.Name()+".want", input, os.ModePerm) t.Errorf("decompress(compress(data)) != data: level=%d input=%s", level, name) return } @@ -370,19 +338,14 @@ func testToFromWithLevelAndLimit(t *testing.T, level int, input []byte, name str } func testToFromWithLimit(t *testing.T, input []byte, name string, limit [11]int) { - for i := 0; i < 10; i++ { + for i := range 10 { testToFromWithLevelAndLimit(t, i, input, name, limit[i]) } - // Test HuffmanCompression testToFromWithLevelAndLimit(t, -2, input, name, limit[10]) } func TestDeflateInflate(t *testing.T) { - t.Parallel() for i, h := range deflateInflateTests { - if testing.Short() && len(h.in) > 10000 { - continue - } testToFromWithLimit(t, h.in, fmt.Sprintf("#%d", i), [11]int{}) } } @@ -399,33 +362,38 @@ func TestReverseBits(t *testing.T) { type deflateInflateStringTest struct { filename string label string - limit [11]int + limit [11]int // Number 11 is ConstantCompression } var deflateInflateStringTests = []deflateInflateStringTest{ { "../testdata/e.txt", "2.718281828...", - [...]int{100018, 50650, 50960, 51150, 50930, 50790, 50790, 50790, 50790, 50790, 43683}, + [...]int{100018, 67900, 50960, 51150, 50930, 50790, 50790, 50790, 50790, 50790, 43683 + 100}, }, { "../../testdata/Isaac.Newton-Opticks.txt", "Isaac.Newton-Opticks", - [...]int{567248, 218338, 198211, 193152, 181100, 175427, 175427, 173597, 173422, 173422, 325240}, + [...]int{567248, 218338, 201354, 199101, 190627, 182587, 179765, 174982, 173422, 173422, 325240}, }, } func TestDeflateInflateString(t *testing.T) { - t.Parallel() - if testing.Short() && testenv.Builder() == "" { - t.Skip("skipping in short mode") - } for _, test := range deflateInflateStringTests { gold, err := os.ReadFile(test.filename) if err != nil { t.Error(err) } - testToFromWithLimit(t, gold, test.label, test.limit) + // Remove returns that may be present on Windows + neutral := strings.Map(func(r rune) rune { + if r != '\r' { + return r + } + return -1 + }, string(gold)) + + testToFromWithLimit(t, []byte(neutral), test.label, test.limit) + if testing.Short() { break } @@ -460,31 +428,36 @@ func TestReaderDict(t *testing.T) { func TestWriterDict(t *testing.T) { const ( - dict = "hello world" - text = "hello again world" + dict = "hello world Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua." 
+ text = "hello world Lorem ipsum dolor sit amet" ) - var b bytes.Buffer - w, err := NewWriter(&b, 5) - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - w.Write([]byte(dict)) - w.Flush() - b.Reset() - w.Write([]byte(text)) - w.Close() + // This test is sensitive to algorithm changes that skip + // data in favour of speed. Higher levels are less prone to this + // so we test level 4-9. + for l := 4; l < 9; l++ { + var b bytes.Buffer + w, err := NewWriter(&b, l) + if err != nil { + t.Fatalf("level %d, NewWriter: %v", l, err) + } + w.Write([]byte(dict)) + w.Flush() + b.Reset() + w.Write([]byte(text)) + w.Close() - var b1 bytes.Buffer - w, _ = NewWriterDict(&b1, 5, []byte(dict)) - w.Write([]byte(text)) - w.Close() + var b1 bytes.Buffer + w, _ = NewWriterDict(&b1, l, []byte(dict)) + w.Write([]byte(text)) + w.Close() - if !bytes.Equal(b1.Bytes(), b.Bytes()) { - t.Fatalf("writer wrote %q want %q", b1.Bytes(), b.Bytes()) + if !bytes.Equal(b1.Bytes(), b.Bytes()) { + t.Errorf("level %d, writer wrote\n%v\n want\n%v", l, b1.Bytes(), b.Bytes()) + } } } -// See https://golang.org/issue/2508 +// See http://code.google.com/p/go/issues/detail?id=2508 func TestRegression2508(t *testing.T) { if testing.Short() { t.Logf("test disabled with -short") @@ -495,7 +468,7 @@ func TestRegression2508(t *testing.T) { t.Fatalf("NewWriter: %v", err) } buf := make([]byte, 1024) - for i := 0; i < 131072; i++ { + for range 131072 { if _, err := w.Write(buf); err != nil { t.Fatalf("writer failed: %v", err) } @@ -504,8 +477,10 @@ func TestRegression2508(t *testing.T) { } func TestWriterReset(t *testing.T) { - t.Parallel() - for level := 0; level <= 9; level++ { + for level := -2; level <= 9; level++ { + if level == -1 { + level++ + } if testing.Short() && level > 1 { break } @@ -514,11 +489,7 @@ func TestWriterReset(t *testing.T) { t.Fatalf("NewWriter: %v", err) } buf := []byte("hello world") - n := 1024 - if testing.Short() { - n = 10 - } - for i := 0; i < n; i++ { + for range 1024 { w.Write(buf) } w.Reset(io.Discard) @@ -531,12 +502,12 @@ func TestWriterReset(t *testing.T) { // DeepEqual doesn't compare functions. w.d.fill, wref.d.fill = nil, nil w.d.step, wref.d.step = nil, nil - w.d.bulkHasher, wref.d.bulkHasher = nil, nil - w.d.bestSpeed, wref.d.bestSpeed = nil, nil + w.d.state, wref.d.state = nil, nil + w.d.fast, wref.d.fast = nil, nil + // hashMatch is always overwritten when used. - copy(w.d.hashMatch[:], wref.d.hashMatch[:]) - if len(w.d.tokens) != 0 { - t.Errorf("level %d Writer not reset after Reset. %d tokens were present", level, len(w.d.tokens)) + if w.d.tokens.n != 0 { + t.Errorf("level %d Writer not reset after Reset. %d tokens were present", level, w.d.tokens.n) } // As long as the length is 0, we don't care about the content. 
w.d.tokens = wref.d.tokens @@ -548,76 +519,64 @@ func TestWriterReset(t *testing.T) { } } - levels := []int{0, 1, 2, 5, 9} - for _, level := range levels { - t.Run(fmt.Sprint(level), func(t *testing.T) { - testResetOutput(t, level, nil) + for i := HuffmanOnly; i <= BestCompression; i++ { + testResetOutput(t, fmt.Sprint("level-", i), func(w io.Writer) (*Writer, error) { return NewWriter(w, i) }) + } + dict := []byte(strings.Repeat("we are the world - how are you?", 3)) + for i := HuffmanOnly; i <= BestCompression; i++ { + testResetOutput(t, fmt.Sprint("dict-level-", i), func(w io.Writer) (*Writer, error) { return NewWriterDict(w, i, dict) }) + } + for i := HuffmanOnly; i <= BestCompression; i++ { + testResetOutput(t, fmt.Sprint("dict-reset-level-", i), func(w io.Writer) (*Writer, error) { + w2, err := NewWriter(nil, i) + if err != nil { + return w2, err + } + w2.ResetDict(w, dict) + return w2, nil }) } - - t.Run("dict", func(t *testing.T) { - for _, level := range levels { - t.Run(fmt.Sprint(level), func(t *testing.T) { - testResetOutput(t, level, nil) - }) - } - }) } -func testResetOutput(t *testing.T, level int, dict []byte) { - writeData := func(w *Writer) { - msg := []byte("now is the time for all good gophers") - w.Write(msg) - w.Flush() - - hello := []byte("hello world") - for i := 0; i < 1024; i++ { - w.Write(hello) +func testResetOutput(t *testing.T, name string, newWriter func(w io.Writer) (*Writer, error)) { + t.Run(name, func(t *testing.T) { + buf := new(bytes.Buffer) + w, err := newWriter(buf) + if err != nil { + t.Fatalf("NewWriter: %v", err) } + b := []byte("hello world - how are you doing?") + for range 1024 { + w.Write(b) + } + w.Close() + out1 := buf.Bytes() - fill := bytes.Repeat([]byte("x"), 65000) - w.Write(fill) - } - - buf := new(bytes.Buffer) - var w *Writer - var err error - if dict == nil { - w, err = NewWriter(buf, level) - } else { - w, err = NewWriterDict(buf, level, dict) - } - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - - writeData(w) - w.Close() - out1 := buf.Bytes() - - buf2 := new(bytes.Buffer) - w.Reset(buf2) - writeData(w) - w.Close() - out2 := buf2.Bytes() + buf2 := new(bytes.Buffer) + w.Reset(buf2) + for range 1024 { + w.Write(b) + } + w.Close() + out2 := buf2.Bytes() - if len(out1) != len(out2) { - t.Errorf("got %d, expected %d bytes", len(out2), len(out1)) - return - } - if !bytes.Equal(out1, out2) { - mm := 0 - for i, b := range out1[:len(out2)] { - if b != out2[i] { - t.Errorf("mismatch index %d: %#02x, expected %#02x", i, out2[i], b) - } - mm++ - if mm == 10 { - t.Fatal("Stopping") + if len(out1) != len(out2) { + t.Errorf("got %d, expected %d bytes", len(out2), len(out1)) + } + if !bytes.Equal(out1, out2) { + mm := 0 + for i, b := range out1[:len(out2)] { + if b != out2[i] { + t.Errorf("mismatch index %d: %02x, expected %02x", i, out2[i], b) + } + mm++ + if mm == 10 { + t.Fatal("Stopping") + } } } - } - t.Logf("got %d bytes", len(out1)) + t.Logf("got %d bytes", len(out1)) + }) } // TestBestSpeed tests that round-tripping through deflate and then inflate @@ -625,7 +584,6 @@ func testResetOutput(t *testing.T, level int, dict []byte) { // compressor.encSpeed method (0, 16, 128), as well as near maxStoreBlockSize // (65535). 
func TestBestSpeed(t *testing.T) { - t.Parallel() abc := make([]byte, 128) for i := range abc { abc[i] = byte(i) @@ -653,8 +611,8 @@ func TestBestSpeed(t *testing.T) { } for i, tc := range testCases { - if i >= 3 && testing.Short() { - break + if testing.Short() && i > 5 { + t.Skip() } for _, firstN := range []int{1, 65534, 65535, 65536, 65537, 131072} { tc[0] = firstN @@ -703,368 +661,3 @@ func TestBestSpeed(t *testing.T) { } } } - -var errIO = errors.New("IO error") - -// failWriter fails with errIO exactly at the nth call to Write. -type failWriter struct{ n int } - -func (w *failWriter) Write(b []byte) (int, error) { - w.n-- - if w.n == -1 { - return 0, errIO - } - return len(b), nil -} - -func TestWriterPersistentWriteError(t *testing.T) { - t.Parallel() - d, err := os.ReadFile("../../testdata/Isaac.Newton-Opticks.txt") - if err != nil { - t.Fatalf("ReadFile: %v", err) - } - d = d[:10000] // Keep this test short - - zw, err := NewWriter(nil, DefaultCompression) - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - - // Sweep over the threshold at which an error is returned. - // The variable i makes it such that the ith call to failWriter.Write will - // return errIO. Since failWriter errors are not persistent, we must ensure - // that flate.Writer errors are persistent. - for i := 0; i < 1000; i++ { - fw := &failWriter{i} - zw.Reset(fw) - - _, werr := zw.Write(d) - cerr := zw.Close() - ferr := zw.Flush() - if werr != errIO && werr != nil { - t.Errorf("test %d, mismatching Write error: got %v, want %v", i, werr, errIO) - } - if cerr != errIO && fw.n < 0 { - t.Errorf("test %d, mismatching Close error: got %v, want %v", i, cerr, errIO) - } - if ferr != errIO && fw.n < 0 { - t.Errorf("test %d, mismatching Flush error: got %v, want %v", i, ferr, errIO) - } - if fw.n >= 0 { - // At this point, the failure threshold was sufficiently high enough - // that we wrote the whole stream without any errors. - return - } - } -} -func TestWriterPersistentFlushError(t *testing.T) { - zw, err := NewWriter(&failWriter{0}, DefaultCompression) - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - flushErr := zw.Flush() - closeErr := zw.Close() - _, writeErr := zw.Write([]byte("Test")) - checkErrors([]error{closeErr, flushErr, writeErr}, errIO, t) -} - -func TestWriterPersistentCloseError(t *testing.T) { - // If underlying writer return error on closing stream we should persistent this error across all writer calls. - zw, err := NewWriter(&failWriter{0}, DefaultCompression) - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - closeErr := zw.Close() - flushErr := zw.Flush() - _, writeErr := zw.Write([]byte("Test")) - checkErrors([]error{closeErr, flushErr, writeErr}, errIO, t) - - // After closing writer we should persistent "write after close" error across Flush and Write calls, but return nil - // on next Close calls. 
- var b bytes.Buffer - zw.Reset(&b) - err = zw.Close() - if err != nil { - t.Fatalf("First call to close returned error: %s", err) - } - err = zw.Close() - if err != nil { - t.Fatalf("Second call to close returned error: %s", err) - } - - flushErr = zw.Flush() - _, writeErr = zw.Write([]byte("Test")) - checkErrors([]error{flushErr, writeErr}, errWriterClosed, t) -} - -func checkErrors(got []error, want error, t *testing.T) { - t.Helper() - for _, err := range got { - if err != want { - t.Errorf("Error doesn't match\nWant: %s\nGot: %s", want, got) - } - } -} - -func TestBestSpeedMatch(t *testing.T) { - t.Parallel() - cases := []struct { - previous, current []byte - t, s, want int32 - }{{ - previous: []byte{0, 0, 0, 1, 2}, - current: []byte{3, 4, 5, 0, 1, 2, 3, 4, 5}, - t: -3, - s: 3, - want: 6, - }, { - previous: []byte{0, 0, 0, 1, 2}, - current: []byte{2, 4, 5, 0, 1, 2, 3, 4, 5}, - t: -3, - s: 3, - want: 3, - }, { - previous: []byte{0, 0, 0, 1, 1}, - current: []byte{3, 4, 5, 0, 1, 2, 3, 4, 5}, - t: -3, - s: 3, - want: 2, - }, { - previous: []byte{0, 0, 0, 1, 2}, - current: []byte{2, 2, 2, 2, 1, 2, 3, 4, 5}, - t: -1, - s: 0, - want: 4, - }, { - previous: []byte{0, 0, 0, 1, 2, 3, 4, 5, 2, 2}, - current: []byte{2, 2, 2, 2, 1, 2, 3, 4, 5}, - t: -7, - s: 4, - want: 5, - }, { - previous: []byte{9, 9, 9, 9, 9}, - current: []byte{2, 2, 2, 2, 1, 2, 3, 4, 5}, - t: -1, - s: 0, - want: 0, - }, { - previous: []byte{9, 9, 9, 9, 9}, - current: []byte{9, 2, 2, 2, 1, 2, 3, 4, 5}, - t: 0, - s: 1, - want: 0, - }, { - previous: []byte{}, - current: []byte{9, 2, 2, 2, 1, 2, 3, 4, 5}, - t: -5, - s: 1, - want: 0, - }, { - previous: []byte{}, - current: []byte{9, 2, 2, 2, 1, 2, 3, 4, 5}, - t: -1, - s: 1, - want: 0, - }, { - previous: []byte{}, - current: []byte{2, 2, 2, 2, 1, 2, 3, 4, 5}, - t: 0, - s: 1, - want: 3, - }, { - previous: []byte{3, 4, 5}, - current: []byte{3, 4, 5}, - t: -3, - s: 0, - want: 3, - }, { - previous: make([]byte, 1000), - current: make([]byte, 1000), - t: -1000, - s: 0, - want: maxMatchLength - 4, - }, { - previous: make([]byte, 200), - current: make([]byte, 500), - t: -200, - s: 0, - want: maxMatchLength - 4, - }, { - previous: make([]byte, 200), - current: make([]byte, 500), - t: 0, - s: 1, - want: maxMatchLength - 4, - }, { - previous: make([]byte, maxMatchLength-4), - current: make([]byte, 500), - t: -(maxMatchLength - 4), - s: 0, - want: maxMatchLength - 4, - }, { - previous: make([]byte, 200), - current: make([]byte, 500), - t: -200, - s: 400, - want: 100, - }, { - previous: make([]byte, 10), - current: make([]byte, 500), - t: 200, - s: 400, - want: 100, - }} - for i, c := range cases { - e := deflateFast{prev: c.previous} - got := e.matchLen(c.s, c.t, c.current) - if got != c.want { - t.Errorf("Test %d: match length, want %d, got %d", i, c.want, got) - } - } -} - -func TestBestSpeedMaxMatchOffset(t *testing.T) { - t.Parallel() - const abc, xyz = "abcdefgh", "stuvwxyz" - for _, matchBefore := range []bool{false, true} { - for _, extra := range []int{0, inputMargin - 1, inputMargin, inputMargin + 1, 2 * inputMargin} { - for offsetAdj := -5; offsetAdj <= +5; offsetAdj++ { - report := func(desc string, err error) { - t.Errorf("matchBefore=%t, extra=%d, offsetAdj=%d: %s%v", - matchBefore, extra, offsetAdj, desc, err) - } - - offset := maxMatchOffset + offsetAdj - - // Make src to be a []byte of the form - // "%s%s%s%s%s" % (abc, zeros0, xyzMaybe, abc, zeros1) - // where: - // zeros0 is approximately maxMatchOffset zeros. - // xyzMaybe is either xyz or the empty string. 
- // zeros1 is between 0 and 30 zeros. - // The difference between the two abc's will be offset, which - // is maxMatchOffset plus or minus a small adjustment. - src := make([]byte, offset+len(abc)+extra) - copy(src, abc) - if !matchBefore { - copy(src[offset-len(xyz):], xyz) - } - copy(src[offset:], abc) - - buf := new(bytes.Buffer) - w, err := NewWriter(buf, BestSpeed) - if err != nil { - report("NewWriter: ", err) - continue - } - if _, err := w.Write(src); err != nil { - report("Write: ", err) - continue - } - if err := w.Close(); err != nil { - report("Writer.Close: ", err) - continue - } - - r := NewReader(buf) - dst, err := io.ReadAll(r) - r.Close() - if err != nil { - report("ReadAll: ", err) - continue - } - - if !bytes.Equal(dst, src) { - report("", fmt.Errorf("bytes differ after round-tripping")) - continue - } - } - } - } -} - -func TestBestSpeedShiftOffsets(t *testing.T) { - // Test if shiftoffsets properly preserves matches and resets out-of-range matches - // seen in https://github.com/golang/go/issues/4142 - enc := newDeflateFast() - - // testData may not generate internal matches. - testData := make([]byte, 32) - rng := rand.New(rand.NewSource(0)) - for i := range testData { - testData[i] = byte(rng.Uint32()) - } - - // Encode the testdata with clean state. - // Second part should pick up matches from the first block. - wantFirstTokens := len(enc.encode(nil, testData)) - wantSecondTokens := len(enc.encode(nil, testData)) - - if wantFirstTokens <= wantSecondTokens { - t.Fatalf("test needs matches between inputs to be generated") - } - // Forward the current indicator to before wraparound. - enc.cur = bufferReset - int32(len(testData)) - - // Part 1 before wrap, should match clean state. - got := len(enc.encode(nil, testData)) - if wantFirstTokens != got { - t.Errorf("got %d, want %d tokens", got, wantFirstTokens) - } - - // Verify we are about to wrap. - if enc.cur != bufferReset { - t.Errorf("got %d, want e.cur to be at bufferReset (%d)", enc.cur, bufferReset) - } - - // Part 2 should match clean state as well even if wrapped. - got = len(enc.encode(nil, testData)) - if wantSecondTokens != got { - t.Errorf("got %d, want %d token", got, wantSecondTokens) - } - - // Verify that we wrapped. - if enc.cur >= bufferReset { - t.Errorf("want e.cur to be < bufferReset (%d), got %d", bufferReset, enc.cur) - } - - // Forward the current buffer, leaving the matches at the bottom. - enc.cur = bufferReset - enc.shiftOffsets() - - // Ensure that no matches were picked up. - got = len(enc.encode(nil, testData)) - if wantFirstTokens != got { - t.Errorf("got %d, want %d tokens", got, wantFirstTokens) - } -} - -func TestMaxStackSize(t *testing.T) { - // This test must not run in parallel with other tests as debug.SetMaxStack - // affects all goroutines. - n := debug.SetMaxStack(1 << 16) - defer debug.SetMaxStack(n) - - var wg sync.WaitGroup - defer wg.Wait() - - b := make([]byte, 1<<20) - for level := HuffmanOnly; level <= BestCompression; level++ { - // Run in separate goroutine to increase probability of stack regrowth. 
- wg.Add(1) - go func(level int) { - defer wg.Done() - zw, err := NewWriter(io.Discard, level) - if err != nil { - t.Errorf("level %d, NewWriter() = %v, want nil", level, err) - } - if n, err := zw.Write(b); n != len(b) || err != nil { - t.Errorf("level %d, Write() = (%d, %v), want (%d, nil)", level, n, err, len(b)) - } - if err := zw.Close(); err != nil { - t.Errorf("level %d, Close() = %v, want nil", level, err) - } - zw.Reset(io.Discard) - }(level) - } -} diff --git a/src/compress/flate/deflatefast.go b/src/compress/flate/deflatefast.go index e5554d6fb40842..eef1896b6f5c63 100644 --- a/src/compress/flate/deflatefast.go +++ b/src/compress/flate/deflatefast.go @@ -4,304 +4,165 @@ package flate -import "math" - -// This encoding algorithm, which prioritizes speed over output size, is -// based on Snappy's LZ77-style encoder: github.com/golang/snappy - -const ( - tableBits = 14 // Bits used in the table. - tableSize = 1 << tableBits // Size of the table. - tableMask = tableSize - 1 // Mask for table indices. Redundant, but can eliminate bounds checks. - tableShift = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32. - - // Reset the buffer offset when reaching this. - // Offsets are stored between blocks as int32 values. - // Since the offset we are checking against is at the beginning - // of the buffer, we need to subtract the current and input - // buffer to not risk overflowing the int32. - bufferReset = math.MaxInt32 - maxStoreBlockSize*2 +import ( + "math/bits" ) -func load32(b []byte, i int32) uint32 { - b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line. - return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 +type fastEnc interface { + Encode(dst *tokens, src []byte) + Reset() } -func load64(b []byte, i int32) uint64 { - b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line. - return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | - uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +func newFastEnc(level int) fastEnc { + switch level { + case 1: + return &fastEncL1{fastGen: fastGen{cur: maxStoreBlockSize}} + case 2: + return &fastEncL2{fastGen: fastGen{cur: maxStoreBlockSize}} + case 3: + return &fastEncL3{fastGen: fastGen{cur: maxStoreBlockSize}} + case 4: + return &fastEncL4{fastGen: fastGen{cur: maxStoreBlockSize}} + case 5: + return &fastEncL5{fastGen: fastGen{cur: maxStoreBlockSize}} + case 6: + return &fastEncL6{fastGen: fastGen{cur: maxStoreBlockSize}} + default: + panic("invalid level specified") + } } -func hash(u uint32) uint32 { - return (u * 0x1e35a7bd) >> tableShift -} +const ( + tableBits = 15 // Bits used in the table + tableSize = 1 << tableBits // Size of the table + hashLongBytes = 7 // Bytes used for long table hash + baseMatchOffset = 1 // The smallest match offset + baseMatchLength = 3 // The smallest match length per the RFC section 3.2.5 + maxMatchOffset = 1 << 15 // The largest match offset + + bTableBits = 17 // Bits used in the big tables + bTableSize = 1 << bTableBits // Size of the table + allocHistory = maxStoreBlockSize * 5 // Size to preallocate for history. + bufferReset = (1 << 31) - allocHistory - maxStoreBlockSize - 1 // Reset the buffer offset when reaching this. +) -// These constants are defined by the Snappy implementation so that its -// assembly implementation can fast-path some 16-bytes-at-a-time copies. 
They -// aren't necessary in the pure Go implementation, as we don't use those same -// optimizations, but using the same thresholds doesn't really hurt. const ( - inputMargin = 16 - 1 - minNonLiteralBlockSize = 1 + 1 + inputMargin + prime3bytes = 506832829 + prime4bytes = 2654435761 + prime5bytes = 889523592379 + prime6bytes = 227718039650203 + prime7bytes = 58295818150454627 + prime8bytes = 0xcf1bbcdcb7a56463 ) type tableEntry struct { - val uint32 // Value at destination offset int32 } -// deflateFast maintains the table for matches, -// and the previous byte block for cross block matching. -type deflateFast struct { - table [tableSize]tableEntry - prev []byte // Previous block, zero length if unknown. - cur int32 // Current match offset. -} - -func newDeflateFast() *deflateFast { - return &deflateFast{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)} +// fastGen maintains the table for matches, +// and the previous byte block for level 2. +// This is the generic implementation. +type fastGen struct { + hist []byte + cur int32 } -// encode encodes a block given in src and appends tokens -// to dst and returns the result. -func (e *deflateFast) encode(dst []token, src []byte) []token { - // Ensure that e.cur doesn't wrap. - if e.cur >= bufferReset { - e.shiftOffsets() - } - - // This check isn't in the Snappy implementation, but there, the caller - // instead of the callee handles this case. - if len(src) < minNonLiteralBlockSize { - e.cur += maxStoreBlockSize - e.prev = e.prev[:0] - return emitLiteral(dst, src) - } - - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := int32(len(src) - inputMargin) - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := int32(0) - s := int32(0) - cv := load32(src, s) - nextHash := hash(cv) - - for { - // Copied from the C++ snappy implementation: - // - // Heuristic match skipping: If 32 bytes are scanned with no matches - // found, start looking only at every other byte. If 32 more bytes are - // scanned (or skipped), look at every third byte, etc.. When a match - // is found, immediately go back to looking at every byte. This is a - // small loss (~5% performance, ~0.1% density) for compressible data - // due to more bookkeeping, but for non-compressible data (such as - // JPEG) it's a huge win since the compressor quickly "realizes" the - // data is incompressible and doesn't bother looking for matches - // everywhere. - // - // The "skip" variable keeps track of how many bytes there are since - // the last match; dividing it by 32 (ie. right-shifting by five) gives - // the number of bytes to move ahead for each iteration. - skip := int32(32) - - nextS := s - var candidate tableEntry - for { - s = nextS - bytesBetweenHashLookups := skip >> 5 - nextS = s + bytesBetweenHashLookups - skip += bytesBetweenHashLookups - if nextS > sLimit { - goto emitRemainder - } - candidate = e.table[nextHash&tableMask] - now := load32(src, nextS) - e.table[nextHash&tableMask] = tableEntry{offset: s + e.cur, val: cv} - nextHash = hash(now) - - offset := s - (candidate.offset - e.cur) - if offset > maxMatchOffset || cv != candidate.val { - // Out of range or not matched. - cv = now - continue - } - break - } - - // A 4-byte match has been found. We'll later see if more than 4 bytes - // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit - // them as literal bytes. 
- dst = emitLiteral(dst, src[nextEmit:s]) - - // Call emitCopy, and then see if another emitCopy could be our next - // move. Repeat until we find no match for the input immediately after - // what was consumed by the last emitCopy call. - // - // If we exit this loop normally then we need to call emitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can - // exit this loop via goto if we get close to exhausting the input. - for { - // Invariant: we have a 4-byte match at s, and no need to emit any - // literal bytes prior to s. - - // Extend the 4-byte match as long as possible. - // - s += 4 - t := candidate.offset - e.cur + 4 - l := e.matchLen(s, t, src) - - // matchToken is flate's equivalent of Snappy's emitCopy. (length,offset) - dst = append(dst, matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset))) - s += l - nextEmit = s - if s >= sLimit { - goto emitRemainder - } - - // We could immediately start working at s now, but to improve - // compression we first update the hash table at s-1 and at s. If - // another emitCopy is not our next move, also calculate nextHash - // at s+1. At least on GOARCH=amd64, these three hash calculations - // are faster as one load64 call (with some shifts) instead of - // three load32 calls. - x := load64(src, s-1) - prevHash := hash(uint32(x)) - e.table[prevHash&tableMask] = tableEntry{offset: e.cur + s - 1, val: uint32(x)} - x >>= 8 - currHash := hash(uint32(x)) - candidate = e.table[currHash&tableMask] - e.table[currHash&tableMask] = tableEntry{offset: e.cur + s, val: uint32(x)} - - offset := s - (candidate.offset - e.cur) - if offset > maxMatchOffset || uint32(x) != candidate.val { - cv = uint32(x >> 8) - nextHash = hash(cv) - s++ - break +func (e *fastGen) addBlock(src []byte) int32 { + // check if we have space already + if len(e.hist)+len(src) > cap(e.hist) { + if cap(e.hist) == 0 { + e.hist = make([]byte, 0, allocHistory) + } else { + if cap(e.hist) < maxMatchOffset*2 { + panic("unexpected buffer size") } + // Move down + offset := int32(len(e.hist)) - maxMatchOffset + // copy(e.hist[0:maxMatchOffset], e.hist[offset:]) + *(*[maxMatchOffset]byte)(e.hist) = *(*[maxMatchOffset]byte)(e.hist[offset:]) + e.cur += offset + e.hist = e.hist[:maxMatchOffset] } } - -emitRemainder: - if int(nextEmit) < len(src) { - dst = emitLiteral(dst, src[nextEmit:]) - } - e.cur += int32(len(src)) - e.prev = e.prev[:len(src)] - copy(e.prev, src) - return dst + s := int32(len(e.hist)) + e.hist = append(e.hist, src...) + return s } -func emitLiteral(dst []token, lit []byte) []token { - for _, v := range lit { - dst = append(dst, literalToken(uint32(v))) - } - return dst +type tableEntryPrev struct { + Cur tableEntry + Prev tableEntry } -// matchLen returns the match length between src[s:] and src[t:]. -// t can be negative to indicate the match is starting in e.prev. -// We assume that src[s-4:s] and src[t-4:t] already match. -func (e *deflateFast) matchLen(s, t int32, src []byte) int32 { - s1 := int(s) + maxMatchLength - 4 - if s1 > len(src) { - s1 = len(src) +// hashLen returns a hash of the lowest mls bytes of with length output bits. +// mls must be >=3 and <=8. Any other value will return hash for 4 bytes. +// length should always be < 32. +// Preferably, length and mls should be a constant for inlining. 
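addBlock above keeps at most allocHistory bytes of history: once the buffer is full it retains only the last maxMatchOffset bytes, slides them to the front, and credits the discarded prefix to e.cur, so absolute positions recorded elsewhere stay comparable. A toy sketch of that bookkeeping, with made-up constants and a plain copy standing in for the array-pointer conversion used in the patch:

package main

import "fmt"

// Toy stand-ins for maxMatchOffset and allocHistory.
const (
	windowSize = 8
	capHist    = 32
)

type gen struct {
	hist []byte
	cur  int32 // absolute position of hist[0]
}

func (g *gen) addBlock(src []byte) int32 {
	if len(g.hist)+len(src) > cap(g.hist) {
		if cap(g.hist) == 0 {
			g.hist = make([]byte, 0, capHist)
		} else {
			// Keep only the last windowSize bytes, slide them down and
			// credit the discarded prefix to cur.
			offset := int32(len(g.hist)) - windowSize
			copy(g.hist[:windowSize], g.hist[offset:])
			g.cur += offset
			g.hist = g.hist[:windowSize]
		}
	}
	s := int32(len(g.hist))
	g.hist = append(g.hist, src...)
	return s
}

func main() {
	var g gen
	for i := 0; i < 6; i++ {
		s := g.addBlock([]byte("abcdefgh"))
		// cur+s is the absolute position of the block's first byte; it keeps
		// growing monotonically even after the history has been slid down.
		fmt.Println("block", i, "hist index:", s, "absolute:", g.cur+s)
	}
}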
+func hashLen(u uint64, length, mls uint8) uint32 { + switch mls { + case 3: + return (uint32(u<<8) * prime3bytes) >> (32 - length) + case 5: + return uint32(((u << (64 - 40)) * prime5bytes) >> (64 - length)) + case 6: + return uint32(((u << (64 - 48)) * prime6bytes) >> (64 - length)) + case 7: + return uint32(((u << (64 - 56)) * prime7bytes) >> (64 - length)) + case 8: + return uint32((u * prime8bytes) >> (64 - length)) + default: + return (uint32(u) * prime4bytes) >> (32 - length) } +} - // If we are inside the current block - if t >= 0 { - b := src[t:] - a := src[s:s1] - b = b[:len(a)] - // Extend the match to be as long as possible. - for i := range a { - if a[i] != b[i] { - return int32(i) - } - } - return int32(len(a)) - } +// matchLenLimited will return the match length between offsets and t in src. +// The maximum length returned is maxMatchLength - 4. +// It is assumed that s > t, that t >=0 and s < len(src). +func (e *fastGen) matchLenLimited(s, t int, src []byte) int32 { + a := src[s:min(s+maxMatchLength-4, len(src))] + b := src[t:] + return int32(matchLen(a, b)) +} - // We found a match in the previous block. - tp := int32(len(e.prev)) + t - if tp < 0 { - return 0 - } +// matchlenLong will return the match length between offsets and t in src. +// It is assumed that s > t, that t >=0 and s < len(src). +func (e *fastGen) matchlenLong(s, t int, src []byte) int32 { + return int32(matchLen(src[s:], src[t:])) +} - // Extend the match to be as long as possible. - a := src[s:s1] - b := e.prev[tp:] - if len(b) > len(a) { - b = b[:len(a)] +// Reset the encoding table. +func (e *fastGen) Reset() { + if cap(e.hist) < allocHistory { + e.hist = make([]byte, 0, allocHistory) } - a = a[:len(b)] - for i := range b { - if a[i] != b[i] { - return int32(i) - } + // We offset current position so everything will be out of reach. + // If we are above the buffer reset it will be cleared anyway since len(hist) == 0. + if e.cur <= bufferReset { + e.cur += maxMatchOffset + int32(len(e.hist)) } + e.hist = e.hist[:0] +} - // If we reached our limit, we matched everything we are - // allowed to in the previous block and we return. - n := int32(len(b)) - if int(s+n) == s1 { - return n +// matchLen returns the maximum common prefix length of a and b. +// a must be the shortest of the two. +func matchLen(a, b []byte) (n int) { + left := len(a) + for left >= 8 { + diff := loadLE64(a, n) ^ loadLE64(b, n) + if diff != 0 { + return n + bits.TrailingZeros64(diff)>>3 + } + n += 8 + left -= 8 } - // Continue looking for more matches in the current block. - a = src[s+n : s1] - b = src[:len(a)] + a = a[n:] + b = b[n:] for i := range a { if a[i] != b[i] { - return int32(i) + n - } - } - return int32(len(a)) + n -} - -// Reset resets the encoding history. -// This ensures that no matches are made to the previous block. -func (e *deflateFast) reset() { - e.prev = e.prev[:0] - // Bump the offset, so all matches will fail distance check. - // Nothing should be >= e.cur in the table. - e.cur += maxMatchOffset - - // Protect against e.cur wraparound. - if e.cur >= bufferReset { - e.shiftOffsets() - } -} - -// shiftOffsets will shift down all match offset. -// This is only called in rare situations to prevent integer overflow. -// -// See https://golang.org/issue/18636 and https://github.com/golang/go/issues/34121. -func (e *deflateFast) shiftOffsets() { - if len(e.prev) == 0 { - // We have no history; just clear the table. 
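matchLen above replaces the old byte-at-a-time comparison with a 64-bit XOR plus bits.TrailingZeros64: the first differing byte shows up as the lowest non-zero byte of the XOR, so the trailing-zero count divided by eight is the number of equal leading bytes. A minimal sketch of the same trick, with encoding/binary standing in for the package's loadLE64 helper (assumed here to be a little-endian 64-bit load); a must be no longer than b:

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// prefixLen returns the length of the common prefix of a and b,
// comparing eight bytes per iteration where possible.
func prefixLen(a, b []byte) int {
	n := 0
	for len(a)-n >= 8 {
		diff := binary.LittleEndian.Uint64(a[n:]) ^ binary.LittleEndian.Uint64(b[n:])
		if diff != 0 {
			return n + (bits.TrailingZeros64(diff) >> 3)
		}
		n += 8
	}
	for n < len(a) && a[n] == b[n] {
		n++
	}
	return n
}

func main() {
	a := []byte("hello gopher, hello flate!")
	b := []byte("hello gopher, hallo flate!")
	fmt.Println(prefixLen(a, b)) // 15: the inputs first differ at index 15
}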
- clear(e.table[:]) - e.cur = maxMatchOffset + 1 - return - } - - // Shift down everything in the table that isn't already too far away. - for i := range e.table[:] { - v := e.table[i].offset - e.cur + maxMatchOffset + 1 - if v < 0 { - // We want to reset e.cur to maxMatchOffset + 1, so we need to shift - // all table entries down by (e.cur - (maxMatchOffset + 1)). - // Because we ignore matches > maxMatchOffset, we can cap - // any negative offsets at 0. - v = 0 + break } - e.table[i].offset = v + n++ } - e.cur = maxMatchOffset + 1 + return n } diff --git a/src/compress/flate/dict_decoder.go b/src/compress/flate/dict_decoder.go index d2c19040f54f53..cb855abc4ba1d7 100644 --- a/src/compress/flate/dict_decoder.go +++ b/src/compress/flate/dict_decoder.go @@ -104,10 +104,7 @@ func (dd *dictDecoder) writeCopy(dist, length int) int { dstBase := dd.wrPos dstPos := dstBase srcPos := dstPos - dist - endPos := dstPos + length - if endPos > len(dd.hist) { - endPos = len(dd.hist) - } + endPos := min(dstPos+length, len(dd.hist)) // Copy non-overlapping section after destination position. // @@ -160,8 +157,10 @@ func (dd *dictDecoder) tryWriteCopy(dist, length int) int { srcPos := dstPos - dist // Copy possibly overlapping section before destination position. - for dstPos < endPos { - dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:dstPos]) +loop: + dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:dstPos]) + if dstPos < endPos { + goto loop // Avoid for-loop so that this function can be inlined } dd.wrPos = dstPos diff --git a/src/compress/flate/example_test.go b/src/compress/flate/example_test.go index 578009248f5704..3af5c1d95de1d1 100644 --- a/src/compress/flate/example_test.go +++ b/src/compress/flate/example_test.go @@ -93,7 +93,7 @@ func Example_dictionary() { var b bytes.Buffer // Compress the data using the specially crafted dictionary. - zw, err := flate.NewWriterDict(&b, flate.DefaultCompression, []byte(dict)) + zw, err := flate.NewWriterDict(&b, flate.BestCompression, []byte(dict)) if err != nil { log.Fatal(err) } @@ -168,6 +168,7 @@ func Example_synchronization() { wg.Add(1) go func() { defer wg.Done() + defer wp.Close() zw, err := flate.NewWriter(wp, flate.BestSpeed) if err != nil { diff --git a/src/compress/flate/fuzz_test.go b/src/compress/flate/fuzz_test.go new file mode 100644 index 00000000000000..1ea8cc49e54672 --- /dev/null +++ b/src/compress/flate/fuzz_test.go @@ -0,0 +1,111 @@ +package flate + +import ( + "bytes" + "flag" + "io" + "os" + "strconv" + "testing" +) + +// Fuzzing tweaks: +var fuzzStartF = flag.Int("start", HuffmanOnly, "Start fuzzing at this level") +var fuzzEndF = flag.Int("end", BestCompression, "End fuzzing at this level (inclusive)") +var fuzzMaxF = flag.Int("max", 1<<20, "Maximum input size") + +func TestMain(m *testing.M) { + flag.Parse() + os.Exit(m.Run()) +} + +// FuzzEncoding tests the fuzzer by doing roundtrips. +// Every input is run through the fuzzer at every level. +// Note: When running the fuzzer, it may hit the 10-second timeout on slower CPUs. 
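The dict_decoder.go hunk above keeps the forward-copy loop that expands overlapping LZ77 copies; the goto form in tryWriteCopy only exists so the function stays small enough to be inlined. A standalone sketch of why the repeated copy works even when source and destination ranges overlap (each pass can move at most dist bytes, so the copied region roughly doubles until the requested length is reached):

package main

import "fmt"

// expandCopy copies length bytes starting dist bytes back from dstPos,
// the same forward-copy idiom used by writeCopy/tryWriteCopy.
func expandCopy(hist []byte, dstPos, dist, length int) []byte {
	srcPos := dstPos - dist
	endPos := dstPos + length
	for dstPos < endPos {
		dstPos += copy(hist[dstPos:endPos], hist[srcPos:dstPos])
	}
	return hist[:endPos]
}

func main() {
	hist := make([]byte, 32)
	copy(hist, "ab")
	// LZ77 copy: distance 2, length 8, starting right after "ab".
	fmt.Println(string(expandCopy(hist, 2, 2, 8))) // ababababab
}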
+func FuzzEncoding(f *testing.F) { + startFuzz := *fuzzStartF + endFuzz := *fuzzEndF + maxSize := *fuzzMaxF + + decoder := NewReader(nil) + buf, buf2 := new(bytes.Buffer), new(bytes.Buffer) + encs := make([]*Writer, endFuzz-startFuzz+1) + for i := range encs { + var err error + encs[i], err = NewWriter(nil, i+startFuzz) + if err != nil { + f.Fatal(err.Error()) + } + } + + f.Fuzz(func(t *testing.T, data []byte) { + if len(data) > maxSize { + return + } + for level := startFuzz; level <= endFuzz; level++ { + if level == DefaultCompression { + continue // Already covered. + } + msg := "level " + strconv.Itoa(level) + ":" + buf.Reset() + fw := encs[level-startFuzz] + fw.Reset(buf) + n, err := fw.Write(data) + if n != len(data) { + t.Fatal(msg + "short write") + } + if err != nil { + t.Fatal(msg + err.Error()) + } + err = fw.Close() + if err != nil { + t.Fatal(msg + err.Error()) + } + compressed := buf.Bytes() + err = decoder.(Resetter).Reset(buf, nil) + if err != nil { + t.Fatal(msg + err.Error()) + } + data2, err := io.ReadAll(decoder) + if err != nil { + t.Fatal(msg + err.Error()) + } + if !bytes.Equal(data, data2) { + t.Fatal(msg + "decompressed not equal") + } + + // Do it again... + msg = "level " + strconv.Itoa(level) + " (reset):" + buf2.Reset() + fw.Reset(buf2) + n, err = fw.Write(data) + if n != len(data) { + t.Fatal(msg + "short write") + } + if err != nil { + t.Fatal(msg + err.Error()) + } + err = fw.Close() + if err != nil { + t.Fatal(msg + err.Error()) + } + compressed2 := buf2.Bytes() + err = decoder.(Resetter).Reset(buf2, nil) + if err != nil { + t.Fatal(msg + err.Error()) + } + data2, err = io.ReadAll(decoder) + if err != nil { + t.Fatal(msg + err.Error()) + } + if !bytes.Equal(data, data2) { + t.Fatal(msg + "decompressed not equal") + } + // Determinism checks will usually not be reproducible, + // since it often relies on the internal state of the compressor. + if !bytes.Equal(compressed, compressed2) { + t.Fatal(msg + "non-deterministic output") + } + } + }) +} diff --git a/src/compress/flate/huffman_bit_writer.go b/src/compress/flate/huffman_bit_writer.go index d68c77fb32e32a..585a9b4cf19032 100644 --- a/src/compress/flate/huffman_bit_writer.go +++ b/src/compress/flate/huffman_bit_writer.go @@ -6,6 +6,7 @@ package flate import ( "io" + "math" ) const ( @@ -22,20 +23,22 @@ const ( codegenCodeCount = 19 badCode = 255 + // maxPredefinedTokens is the maximum number of tokens + // where we check if fixed size is smaller. + maxPredefinedTokens = 250 + // bufferFlushSize indicates the buffer size // after which bytes are flushed to the writer. // Should preferably be a multiple of 6, since // we accumulate 6 bytes between writes to the buffer. - bufferFlushSize = 240 - - // bufferSize is the actual output byte buffer size. - // It must have additional headroom for a flush - // which can contain up to 8 bytes. - bufferSize = bufferFlushSize + 8 + bufferFlushSize = 246 ) +// Minimum length code that emits bits. +const lengthExtraBitsMinCode = 8 + // The number of extra bits needed by length code X - LENGTH_CODES_START. -var lengthExtraBits = []int8{ +var lengthExtraBits = [32]uint8{ /* 257 */ 0, 0, 0, /* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, /* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, @@ -43,26 +46,47 @@ var lengthExtraBits = []int8{ } // The length indicated by length code X - LENGTH_CODES_START. 
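The lengthExtraBits table above, together with lengthBase just below, determines how a match length is split into a length code plus extra bits. A self-contained sketch of that mapping, restating the 29 standard entries (the patch pads the arrays to 32) and checking a few lengths against RFC 1951 section 3.2.5; lengthBase stores the length minus baseMatchLength (3):

package main

import "fmt"

var lengthExtraBits = [29]uint8{
	0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
	3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0,
}

var lengthBase = [29]uint8{
	0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28,
	32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 255,
}

// lengthCodeInfo resolves a match length (3..258) to its DEFLATE length
// code, the number of extra bits, and the extra-bit value.
func lengthCodeInfo(length int) (code int, extraBits uint8, extraVal int) {
	biased := length - 3
	i := len(lengthBase) - 1
	for int(lengthBase[i]) > biased {
		i--
	}
	return 257 + i, lengthExtraBits[i], biased - int(lengthBase[i])
}

func main() {
	for _, l := range []int{3, 10, 130, 257, 258} {
		code, eb, ev := lengthCodeInfo(l)
		fmt.Printf("length %3d -> code %d, %d extra bit(s), value %d\n", l, code, eb, ev)
	}
}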
-var lengthBase = []uint32{ +var lengthBase = [32]uint8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 255, } +// Minimum offset code that emits bits. +const offsetExtraBitsMinCode = 4 + // offset code word extra bits. -var offsetExtraBits = []int8{ +var offsetExtraBits = [32]int8{ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, + /* extended window */ + 14, 14, } -var offsetBase = []uint32{ - 0x000000, 0x000001, 0x000002, 0x000003, 0x000004, - 0x000006, 0x000008, 0x00000c, 0x000010, 0x000018, - 0x000020, 0x000030, 0x000040, 0x000060, 0x000080, - 0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300, - 0x000400, 0x000600, 0x000800, 0x000c00, 0x001000, - 0x001800, 0x002000, 0x003000, 0x004000, 0x006000, +var offsetCombined = [32]uint32{} + +func init() { + var offsetBase = [32]uint32{ + /* normal deflate */ + 0x000000, 0x000001, 0x000002, 0x000003, 0x000004, + 0x000006, 0x000008, 0x00000c, 0x000010, 0x000018, + 0x000020, 0x000030, 0x000040, 0x000060, 0x000080, + 0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300, + 0x000400, 0x000600, 0x000800, 0x000c00, 0x001000, + 0x001800, 0x002000, 0x003000, 0x004000, 0x006000, + + /* extended window */ + 0x008000, 0x00c000, + } + + for i := range offsetCombined[:] { + // Don't use extended window values... + if offsetExtraBits[i] == 0 || offsetBase[i] > 0x006000 { + continue + } + offsetCombined[i] = uint32(offsetExtraBits[i]) | (offsetBase[i] << 8) + } } // The odd order in which the codegen code sizes are written. @@ -75,29 +99,49 @@ type huffmanBitWriter struct { writer io.Writer // Data waiting to be written is bytes[0:nbytes] - // and then the low nbits of bits. Data is always written - // sequentially into the bytes array. - bits uint64 - nbits uint - bytes [bufferSize]byte - codegenFreq [codegenCodeCount]int32 - nbytes int - literalFreq []int32 - offsetFreq []int32 - codegen []uint8 - literalEncoding *huffmanEncoder - offsetEncoding *huffmanEncoder - codegenEncoding *huffmanEncoder - err error + // and then the low nbits of bits. + bits uint64 + nbits uint8 + nbytes uint8 + lastHuffMan bool + literalEncoding *huffmanEncoder + tmpLitEncoding *huffmanEncoder + offsetEncoding *huffmanEncoder + codegenEncoding *huffmanEncoder + err error + lastHeader int + logNewTablePenalty uint // Bigger values will reduce the penalty of a new table. + bytes [256 + 8]byte + literalFreq [lengthCodesStart + 32]uint16 + offsetFreq [32]uint16 + codegenFreq [codegenCodeCount]uint16 + + // codegen must have an extra space for the final symbol. + codegen [literalCount + offsetCodeCount + 1]uint8 } +// The huffmanBitWriter supports reusing huffman tables and will combine +// blocks, if compression is less than creating a new table. +// +// This is controlled by several variables: +// +// If 'lastHeader' is non-zero the Huffman table can be reused. +// It also indicates that an EOB has not yet been emitted, so if a new table +// is generated, an EOB with the previous table must be written. +// +// If 'lastHuffMan' is set, a table for outputting literals +// has been generated and offsets are invalid. +// +// An incoming block estimates the output size of a new table using a +// 'fresh' by calculating the optimal size and adding a penalty. +// A Huffman table is not optimal, which is why we add a penalty, +// and generating a new table is slower for both compression and decompression. 
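A rough sketch of the trade-off the comment above describes, with invented numbers: the cost of a fresh table is its estimated optimal size plus a penalty of newSize>>logNewTablePenalty, so reuse wins unless the new table is clearly better. (In writeBlockDynamic further down, the inputs are w.lastHeader, tokens.EstimatedBits(), the EOB code length and dynamicReuseSize.)

package main

import "fmt"

// shouldReuse reports whether the penalized cost of a new table fails to
// beat the cost of reusing the previous one. All numbers are bit counts.
func shouldReuse(lastHeaderBits, estTokenBits, eobBits, reuseBits int, logNewTablePenalty uint) bool {
	newSize := lastHeaderBits + estTokenBits + eobBits
	newSize += newSize >> logNewTablePenalty // penalty: a real table is never optimal, and emitting it costs time
	return newSize >= reuseBits
}

func main() {
	// A new table that looks ~4% smaller still loses to reuse with a
	// 1/16th penalty (logNewTablePenalty = 4)...
	fmt.Println(shouldReuse(500, 9100, 7, 10000, 4)) // true
	// ...while a clearly better table overcomes the penalty.
	fmt.Println(shouldReuse(500, 8000, 7, 10000, 4)) // false
}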
+ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter { return &huffmanBitWriter{ writer: w, - literalFreq: make([]int32, maxNumLit), - offsetFreq: make([]int32, offsetCodeCount), - codegen: make([]uint8, maxNumLit+offsetCodeCount+1), - literalEncoding: newHuffmanEncoder(maxNumLit), + literalEncoding: newHuffmanEncoder(literalCount), + tmpLitEncoding: newHuffmanEncoder(literalCount), codegenEncoding: newHuffmanEncoder(codegenCodeCount), offsetEncoding: newHuffmanEncoder(offsetCodeCount), } @@ -106,6 +150,37 @@ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter { func (w *huffmanBitWriter) reset(writer io.Writer) { w.writer = writer w.bits, w.nbits, w.nbytes, w.err = 0, 0, 0, nil + w.lastHeader = 0 + w.lastHuffMan = false +} + +func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) { + a := t.offHist[:offsetCodeCount] + b := w.offsetEncoding.codes + b = b[:len(a)] + for i, v := range a { + if v != 0 && b[i].zero() { + return false + } + } + + a = t.extraHist[:literalCount-256] + b = w.literalEncoding.codes[256:literalCount] + b = b[:len(a)] + for i, v := range a { + if v != 0 && b[i].zero() { + return false + } + } + + a = t.litHist[:256] + b = w.literalEncoding.codes[:len(a)] + for i, v := range a { + if v != 0 && b[i].zero() { + return false + } + } + return true } func (w *huffmanBitWriter) flush() { @@ -113,6 +188,11 @@ func (w *huffmanBitWriter) flush() { w.nbits = 0 return } + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } n := w.nbytes for w.nbits != 0 { w.bytes[n] = byte(w.bits) @@ -125,7 +205,9 @@ func (w *huffmanBitWriter) flush() { n++ } w.bits = 0 - w.write(w.bytes[:n]) + if n > 0 { + w.write(w.bytes[:n]) + } w.nbytes = 0 } @@ -136,30 +218,11 @@ func (w *huffmanBitWriter) write(b []byte) { _, w.err = w.writer.Write(b) } -func (w *huffmanBitWriter) writeBits(b int32, nb uint) { - if w.err != nil { - return - } - w.bits |= uint64(b) << w.nbits +func (w *huffmanBitWriter) writeBits(b int32, nb uint8) { + w.bits |= uint64(b) << (w.nbits & 63) w.nbits += nb if w.nbits >= 48 { - bits := w.bits - w.bits >>= 48 - w.nbits -= 48 - n := w.nbytes - bytes := w.bytes[n : n+6] - bytes[0] = byte(bits) - bytes[1] = byte(bits >> 8) - bytes[2] = byte(bits >> 16) - bytes[3] = byte(bits >> 24) - bytes[4] = byte(bits >> 32) - bytes[5] = byte(bits >> 40) - n += 6 - if n >= bufferFlushSize { - w.write(w.bytes[:n]) - n = 0 - } - w.nbytes = n + w.writeOutBits() } } @@ -198,21 +261,23 @@ func (w *huffmanBitWriter) writeBytes(bytes []byte) { // numOffsets The number of offsets in offsetEncoding // litenc, offenc The literal and offset encoder to use func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litEnc, offEnc *huffmanEncoder) { - clear(w.codegenFreq[:]) + for i := range w.codegenFreq { + w.codegenFreq[i] = 0 + } // Note that we are using codegen both as a temporary variable for holding // a copy of the frequencies, and as the place where we put the result. // This is fine because the output is always shorter than the input used // so far. - codegen := w.codegen // cache + codegen := w.codegen[:] // cache // Copy the concatenated code sizes to codegen. Put a marker at the end. 
cgnl := codegen[:numLiterals] for i := range cgnl { - cgnl[i] = uint8(litEnc.codes[i].len) + cgnl[i] = litEnc.codes[i].len() } cgnl = codegen[numLiterals : numLiterals+numOffsets] for i := range cgnl { - cgnl[i] = uint8(offEnc.codes[i].len) + cgnl[i] = offEnc.codes[i].len() } codegen[numLiterals+numOffsets] = badCode @@ -234,10 +299,7 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE w.codegenFreq[size]++ count-- for count >= 3 { - n := 6 - if n > count { - n = count - } + n := min(6, count) codegen[outIndex] = 16 outIndex++ codegen[outIndex] = uint8(n - 3) @@ -247,10 +309,7 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE } } else { for count >= 11 { - n := 138 - if n > count { - n = count - } + n := min(138, count) codegen[outIndex] = 18 outIndex++ codegen[outIndex] = uint8(n - 11) @@ -282,30 +341,61 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE codegen[outIndex] = badCode } -// dynamicSize returns the size of dynamically encoded data in bits. -func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) { +func (w *huffmanBitWriter) codegens() int { + numCodegens := len(w.codegenFreq) + for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { + numCodegens-- + } + return numCodegens +} + +func (w *huffmanBitWriter) headerSize() (size, numCodegens int) { numCodegens = len(w.codegenFreq) for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { numCodegens-- } - header := 3 + 5 + 5 + 4 + (3 * numCodegens) + + return 3 + 5 + 5 + 4 + (3 * numCodegens) + w.codegenEncoding.bitLength(w.codegenFreq[:]) + int(w.codegenFreq[16])*2 + int(w.codegenFreq[17])*3 + - int(w.codegenFreq[18])*7 + int(w.codegenFreq[18])*7, numCodegens +} + +// dynamicSize returns the size of dynamically encoded data in bits. +func (w *huffmanBitWriter) dynamicReuseSize(litEnc, offEnc *huffmanEncoder) (size int) { + size = litEnc.bitLength(w.literalFreq[:]) + + offEnc.bitLength(w.offsetFreq[:]) + return size +} + +// dynamicSize returns the size of dynamically encoded data in bits. +func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) { + header, numCodegens := w.headerSize() size = header + - litEnc.bitLength(w.literalFreq) + - offEnc.bitLength(w.offsetFreq) + + litEnc.bitLength(w.literalFreq[:]) + + offEnc.bitLength(w.offsetFreq[:]) + extraBits - return size, numCodegens } +// extraBitSize will return the number of bits that will be written +// as "extra" bits on matches. +func (w *huffmanBitWriter) extraBitSize() int { + total := 0 + for i, n := range w.literalFreq[257:literalCount] { + total += int(n) * int(lengthExtraBits[i&31]) + } + for i, n := range w.offsetFreq[:offsetCodeCount] { + total += int(n) * int(offsetExtraBits[i&31]) + } + return total +} + // fixedSize returns the size of dynamically encoded data in bits. func (w *huffmanBitWriter) fixedSize(extraBits int) int { return 3 + - fixedLiteralEncoding.bitLength(w.literalFreq) + - fixedOffsetEncoding.bitLength(w.offsetFreq) + + fixedLiteralEncoding.bitLength(w.literalFreq[:]) + + fixedOffsetEncoding.bitLength(w.offsetFreq[:]) + extraBits } @@ -322,31 +412,37 @@ func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) { return 0, false } +// writeCode writes 'c' to the stream. +// Inline manually when performance is critical. 
func (w *huffmanBitWriter) writeCode(c hcode) { - if w.err != nil { - return - } - w.bits |= uint64(c.code) << w.nbits - w.nbits += uint(c.len) + w.bits |= c.code64() << (w.nbits & reg8SizeMask64) + w.nbits += c.len() if w.nbits >= 48 { - bits := w.bits - w.bits >>= 48 - w.nbits -= 48 - n := w.nbytes - bytes := w.bytes[n : n+6] - bytes[0] = byte(bits) - bytes[1] = byte(bits >> 8) - bytes[2] = byte(bits >> 16) - bytes[3] = byte(bits >> 24) - bytes[4] = byte(bits >> 32) - bytes[5] = byte(bits >> 40) - n += 6 - if n >= bufferFlushSize { - w.write(w.bytes[:n]) + w.writeOutBits() + } +} + +// writeOutBits will write bits to the buffer. +func (w *huffmanBitWriter) writeOutBits() { + bits := w.bits + w.bits >>= 48 + w.nbits -= 48 + n := w.nbytes + + // We overwrite, but faster... + storeLE64(w.bytes[n:], bits) + n += 6 + + if n >= bufferFlushSize { + if w.err != nil { n = 0 + return } - w.nbytes = n + w.write(w.bytes[:n]) + n = 0 } + + w.nbytes = n } // Write the header of a dynamic Huffman block to the output stream. @@ -367,19 +463,19 @@ func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, n w.writeBits(int32(numOffsets-1), 5) w.writeBits(int32(numCodegens-4), 4) - for i := 0; i < numCodegens; i++ { - value := uint(w.codegenEncoding.codes[codegenOrder[i]].len) + for i := range numCodegens { + value := uint(w.codegenEncoding.codes[codegenOrder[i]].len()) w.writeBits(int32(value), 3) } i := 0 for { - var codeWord int = int(w.codegen[i]) + var codeWord = uint32(w.codegen[i]) i++ if codeWord == badCode { break } - w.writeCode(w.codegenEncoding.codes[uint32(codeWord)]) + w.writeCode(w.codegenEncoding.codes[codeWord]) switch codeWord { case 16: @@ -395,10 +491,28 @@ func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, n } } +// writeStoredHeader will write a stored header. +// If the stored block is only used for EOF, +// it is replaced with a fixed huffman block. func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) { if w.err != nil { return } + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + + // To write EOF, use a fixed encoding block. 10 bits instead of 5 bytes. + if length == 0 && isEof { + w.writeFixedHeader(isEof) + // EOB: 7 bits, value: 0 + w.writeBits(0, 7) + w.flush() + return + } + var flag int32 if isEof { flag = 1 @@ -413,6 +527,12 @@ func (w *huffmanBitWriter) writeFixedHeader(isEof bool) { if w.err != nil { return } + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + // Indicate that we are a fixed Huffman block var value int32 = 2 if isEof { @@ -426,36 +546,33 @@ func (w *huffmanBitWriter) writeFixedHeader(isEof bool) { // is larger than the original bytes, the data will be written as a // stored block. // If the input is nil, the tokens will always be Huffman encoded. 
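writeOutBits above always stores eight bytes but only advances the output by six, so the pending bits never need a partial store; that is also why the byte buffer is declared as [256 + 8]byte. A self-contained sketch of the accumulator (bitSink and its fields are illustrative names, not part of the patch), with encoding/binary standing in for storeLE64:

package main

import (
	"encoding/binary"
	"fmt"
)

// bitSink packs codes LSB-first into a uint64 and spills six bytes at a
// time once at least 48 bits are pending.
type bitSink struct {
	bits  uint64
	nbits uint8
	out   []byte
}

func (s *bitSink) writeBits(v uint64, n uint8) {
	s.bits |= v << (s.nbits & 63)
	s.nbits += n
	if s.nbits >= 48 {
		off := len(s.out)
		s.out = append(s.out, 0, 0, 0, 0, 0, 0, 0, 0) // 6 bytes of data + 2 of headroom
		binary.LittleEndian.PutUint64(s.out[off:], s.bits)
		s.out = s.out[:off+6] // the 8-byte store is cheap; only 6 bytes count as written
		s.bits >>= 48
		s.nbits -= 48
	}
}

func (s *bitSink) flush() []byte {
	for s.nbits > 0 {
		s.out = append(s.out, byte(s.bits))
		s.bits >>= 8
		if s.nbits >= 8 {
			s.nbits -= 8
		} else {
			s.nbits = 0
		}
	}
	return s.out
}

func main() {
	var s bitSink
	for i := 0; i < 20; i++ {
		s.writeBits(0b101, 3) // 60 bits in total
	}
	fmt.Printf("%x\n", s.flush()) // 8 bytes: the 3-bit pattern repeated, LSB-first
}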
-func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { +func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) { if w.err != nil { return } - tokens = append(tokens, endBlockMarker) + tokens.AddEOB() + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } numLiterals, numOffsets := w.indexTokens(tokens) - + w.generate() var extraBits int storedSize, storable := w.storedSize(input) if storable { - // We only bother calculating the costs of the extra bits required by - // the length of offset fields (which will be the same for both fixed - // and dynamic encoding), if we need to compare those two encodings - // against stored encoding. - for lengthCode := lengthCodesStart + 8; lengthCode < numLiterals; lengthCode++ { - // First eight length codes have extra size = 0. - extraBits += int(w.literalFreq[lengthCode]) * int(lengthExtraBits[lengthCode-lengthCodesStart]) - } - for offsetCode := 4; offsetCode < numOffsets; offsetCode++ { - // First four offset codes have extra size = 0. - extraBits += int(w.offsetFreq[offsetCode]) * int(offsetExtraBits[offsetCode]) - } + extraBits = w.extraBitSize() } // Figure out smallest code. // Fixed Huffman baseline. var literalEncoding = fixedLiteralEncoding var offsetEncoding = fixedOffsetEncoding - var size = w.fixedSize(extraBits) + var size = math.MaxInt32 + if tokens.n < maxPredefinedTokens { + size = w.fixedSize(extraBits) + } // Dynamic Huffman? var numCodegens int @@ -473,7 +590,7 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { } // Stored bytes? - if storable && storedSize < size { + if storable && storedSize <= size { w.writeStoredHeader(len(input), eof) w.writeBytes(input) return @@ -487,7 +604,7 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { } // Write the tokens. - w.writeTokens(tokens, literalEncoding.codes, offsetEncoding.codes) + w.writeTokens(tokens.Slice(), literalEncoding.codes, offsetEncoding.codes) } // writeBlockDynamic encodes a block using a dynamic Huffman table. @@ -495,53 +612,153 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { // histogram distribution. // If input is supplied and the compression savings are below 1/16th of the // input size the block is stored. -func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []byte) { +func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []byte, sync bool) { if w.err != nil { return } - tokens = append(tokens, endBlockMarker) + sync = sync || eof + if sync { + tokens.AddEOB() + } + + // We cannot reuse pure huffman table, and must mark as EOF. + if (w.lastHuffMan || eof) && w.lastHeader > 0 { + // We will not try to reuse. + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + w.lastHuffMan = false + } + + if w.lastHeader > 0 && !w.canReuse(tokens) { + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + numLiterals, numOffsets := w.indexTokens(tokens) + extraBits := 0 + ssize, storable := w.storedSize(input) - // Generate codegen and codegenFrequencies, which indicates how to encode - // the literalEncoding and the offsetEncoding. 
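writeBlock above picks the cheapest of three encodings: stored, fixed Huffman (only considered when the block has fewer than maxPredefinedTokens tokens; otherwise size starts at math.MaxInt32) and dynamic Huffman, with ties going to stored. A sketch of just that comparison, taking the three bit counts as plain parameters instead of deriving them from storedSize/fixedSize/dynamicSize:

package main

import (
	"fmt"
	"math"
)

// chooseBlock returns which block type the size comparison would select.
func chooseBlock(storedBits, fixedBits, dynamicBits int, storable bool) string {
	size := fixedBits
	if dynamicBits < size {
		size = dynamicBits
	}
	if storable && storedBits <= size {
		return "stored"
	}
	if size == fixedBits {
		return "fixed"
	}
	return "dynamic"
}

func main() {
	fmt.Println(chooseBlock(8000, 9000, 8500, true))           // stored
	fmt.Println(chooseBlock(8000, 7000, 7500, true))           // fixed
	fmt.Println(chooseBlock(math.MaxInt32, 9000, 7500, false)) // dynamic
}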
- w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) - w.codegenEncoding.generate(w.codegenFreq[:], 7) - size, numCodegens := w.dynamicSize(w.literalEncoding, w.offsetEncoding, 0) + if storable || w.lastHeader > 0 { + extraBits = w.extraBitSize() + } - // Store bytes, if we don't get a reasonable improvement. - if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { - w.writeStoredHeader(len(input), eof) - w.writeBytes(input) - return + var size int + + // Check if we should reuse. + if w.lastHeader > 0 { + // Estimate size for using a new table. + // Use the previous header size as the best estimate. + newSize := w.lastHeader + tokens.EstimatedBits() + + // The estimated size is calculated as an optimal table. + // We add a penalty to make it more realistic and re-use a bit more. + newSize += int(w.literalEncoding.codes[endBlockMarker].len()) + newSize>>w.logNewTablePenalty + + // Calculate the size for reusing the current table. + reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + extraBits + + // Check if a new table is better. + if newSize < reuseSize { + // Write the EOB we owe. + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + size = newSize + w.lastHeader = 0 + } else { + size = reuseSize + } + + // Small blocks can be more efficient with fixed encoding. + if tokens.n < maxPredefinedTokens { + if preSize := w.fixedSize(extraBits) + 7; preSize < size { + // Check if we get a reasonable size decrease. + if storable && ssize <= size { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + w.writeFixedHeader(eof) + if !sync { + tokens.AddEOB() + } + w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes) + return + } + } + + // Check if we get a reasonable size decrease. + if storable && ssize <= size { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } } - // Write Huffman table. - w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + // We want a new block/table + if w.lastHeader == 0 { + w.literalFreq[endBlockMarker] = 1 + + w.generate() + // Generate codegen and codegenFrequencies, which indicates how to encode + // the literalEncoding and the offsetEncoding. + w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) + w.codegenEncoding.generate(w.codegenFreq[:], 7) + + var numCodegens int + size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits) + + // Store predefined or raw, if we don't get a reasonable improvement. + if tokens.n < maxPredefinedTokens { + if preSize := w.fixedSize(extraBits); preSize <= size { + // Store bytes, if we don't get an improvement. + if storable && ssize <= preSize { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + w.writeFixedHeader(eof) + if !sync { + tokens.AddEOB() + } + w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes) + return + } + } + + if storable && ssize <= size { + // Store bytes, if we don't get an improvement. + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + + // Write Huffman table. + w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + if !sync { + w.lastHeader, _ = w.headerSize() + } + w.lastHuffMan = false + } + if sync { + w.lastHeader = 0 + } // Write the tokens. 
- w.writeTokens(tokens, w.literalEncoding.codes, w.offsetEncoding.codes) + w.writeTokens(tokens.Slice(), w.literalEncoding.codes, w.offsetEncoding.codes) } // indexTokens indexes a slice of tokens, and updates // literalFreq and offsetFreq, and generates literalEncoding // and offsetEncoding. // The number of literal and offset tokens is returned. -func (w *huffmanBitWriter) indexTokens(tokens []token) (numLiterals, numOffsets int) { - clear(w.literalFreq) - clear(w.offsetFreq) +func (w *huffmanBitWriter) indexTokens(t *tokens) (numLiterals, numOffsets int) { + *(*[256]uint16)(w.literalFreq[:]) = t.litHist + *(*[32]uint16)(w.literalFreq[256:]) = t.extraHist + w.offsetFreq = t.offHist - for _, t := range tokens { - if t < matchType { - w.literalFreq[t.literal()]++ - continue - } - length := t.length() - offset := t.offset() - w.literalFreq[lengthCodesStart+lengthCode(length)]++ - w.offsetFreq[offsetCode(offset)]++ + if t.n == 0 { + return } - // get the number of literals numLiterals = len(w.literalFreq) for w.literalFreq[numLiterals-1] == 0 { @@ -558,41 +775,153 @@ func (w *huffmanBitWriter) indexTokens(tokens []token) (numLiterals, numOffsets w.offsetFreq[0] = 1 numOffsets = 1 } - w.literalEncoding.generate(w.literalFreq, 15) - w.offsetEncoding.generate(w.offsetFreq, 15) return } +func (w *huffmanBitWriter) generate() { + w.literalEncoding.generate(w.literalFreq[:literalCount], 15) + w.offsetEncoding.generate(w.offsetFreq[:offsetCodeCount], 15) +} + // writeTokens writes a slice of tokens to the output. // codes for literal and offset encoding must be supplied. func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) { if w.err != nil { return } + if len(tokens) == 0 { + return + } + + // Only last token should be endBlockMarker. + var deferEOB bool + if tokens[len(tokens)-1] == endBlockMarker { + tokens = tokens[:len(tokens)-1] + deferEOB = true + } + + // Create slices up to the next power of two to avoid bounds checks. + lits := leCodes[:256] + offs := oeCodes[:32] + lengths := leCodes[lengthCodesStart:] + lengths = lengths[:32] + + // Go 1.16 LOVES having these on stack. 
+ bits, nbits, nbytes := w.bits, w.nbits, w.nbytes + for _, t := range tokens { - if t < matchType { - w.writeCode(leCodes[t.literal()]) + if t < 256 { + c := lits[t] + bits |= c.code64() << (nbits & 63) + nbits += c.len() + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } continue } + // Write the length length := t.length() - lengthCode := lengthCode(length) - w.writeCode(leCodes[lengthCode+lengthCodesStart]) - extraLengthBits := uint(lengthExtraBits[lengthCode]) - if extraLengthBits > 0 { - extraLength := int32(length - lengthBase[lengthCode]) - w.writeBits(extraLength, extraLengthBits) + lenCode := lengthCode(length) & 31 + // inlined 'w.writeCode(lengths[lengthCode])' + c := lengths[lenCode] + bits |= c.code64() << (nbits & 63) + nbits += c.len() + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } + + if lenCode >= lengthExtraBitsMinCode { + extraLengthBits := lengthExtraBits[lenCode] + //w.writeBits(extraLength, extraLengthBits) + extraLength := int32(length - lengthBase[lenCode]) + bits |= uint64(extraLength) << (nbits & 63) + nbits += extraLengthBits + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } } // Write the offset offset := t.offset() - offsetCode := offsetCode(offset) - w.writeCode(oeCodes[offsetCode]) - extraOffsetBits := uint(offsetExtraBits[offsetCode]) - if extraOffsetBits > 0 { - extraOffset := int32(offset - offsetBase[offsetCode]) - w.writeBits(extraOffset, extraOffsetBits) + offCode := (offset >> 16) & 31 + // inlined 'w.writeCode(offs[offCode])' + c = offs[offCode] + bits |= c.code64() << (nbits & 63) + nbits += c.len() + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } + + if offCode >= offsetExtraBitsMinCode { + offsetComb := offsetCombined[offCode] + bits |= uint64((offset-(offsetComb>>8))&matchOffsetOnlyMask) << (nbits & 63) + nbits += uint8(offsetComb) + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } } } + // Restore... + w.bits, w.nbits, w.nbytes = bits, nbits, nbytes + + if deferEOB { + w.writeCode(leCodes[endBlockMarker]) + } } // huffOffset is a static offset encoder used for huffman only encoding. 
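The offset path in the loop above reads a single offsetCombined entry instead of two separate tables; the packing is built in this file's init function, with the extra-bit count in the low byte and the base offset in the upper bits. A small sketch of that packing plus one RFC 1951 sanity check (offsets in the tables are stored minus baseMatchOffset, i.e. minus 1):

package main

import "fmt"

// packOffset/unpackOffset restate the offsetCombined layout:
// low 8 bits = number of extra bits, remaining bits = base offset.
func packOffset(extraBits uint8, base uint32) uint32 {
	return uint32(extraBits) | base<<8
}

func unpackOffset(combined uint32) (extraBits uint8, base uint32) {
	return uint8(combined), combined >> 8
}

func main() {
	// RFC 1951 distance code 10: base distance 33, 4 extra bits.
	// Stored biased by -1, that base is 0x20.
	c := packOffset(4, 0x20)
	eb, base := unpackOffset(c)
	fmt.Println(eb, base) // 4 32
}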
@@ -600,94 +929,168 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) var huffOffset *huffmanEncoder func init() { - offsetFreq := make([]int32, offsetCodeCount) - offsetFreq[0] = 1 + w := newHuffmanBitWriter(nil) + w.offsetFreq[0] = 1 huffOffset = newHuffmanEncoder(offsetCodeCount) - huffOffset.generate(offsetFreq, 15) + huffOffset.generate(w.offsetFreq[:offsetCodeCount], 15) } // writeBlockHuff encodes a block of bytes as either // Huffman encoded literals or uncompressed bytes if the // results only gains very little from compression. -func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) { +func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { if w.err != nil { return } // Clear histogram - clear(w.literalFreq) - - // Add everything as literals - histogram(input, w.literalFreq) - - w.literalFreq[endBlockMarker] = 1 + for i := range w.literalFreq[:] { + w.literalFreq[i] = 0 + } + if !w.lastHuffMan { + for i := range w.offsetFreq[:] { + w.offsetFreq[i] = 0 + } + } const numLiterals = endBlockMarker + 1 - w.offsetFreq[0] = 1 const numOffsets = 1 - w.literalEncoding.generate(w.literalFreq, 15) - - // Figure out smallest code. - // Always use dynamic Huffman or Store - var numCodegens int - - // Generate codegen and codegenFrequencies, which indicates how to encode - // the literalEncoding and the offsetEncoding. - w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset) - w.codegenEncoding.generate(w.codegenFreq[:], 7) - size, numCodegens := w.dynamicSize(w.literalEncoding, huffOffset, 0) + // Add everything as literals + // We have to estimate the header size. + // Assume header is around 70 bytes: + // https://stackoverflow.com/a/25454430 + const guessHeaderSizeBits = 70 * 8 + histogram(input, w.literalFreq[:numLiterals]) + ssize, storable := w.storedSize(input) + if storable && len(input) > 1024 { + // Quick check for incompressible content. + abs := float64(0) + avg := float64(len(input)) / 256 + max := float64(len(input) * 2) + for _, v := range w.literalFreq[:256] { + diff := float64(v) - avg + abs += diff * diff + if abs > max { + break + } + } + if abs < max { + // No chance we can compress this... + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + } + w.literalFreq[endBlockMarker] = 1 + w.tmpLitEncoding.generate(w.literalFreq[:numLiterals], 15) + estBits := w.tmpLitEncoding.canReuseBits(w.literalFreq[:numLiterals]) + if estBits < math.MaxInt32 { + estBits += w.lastHeader + if w.lastHeader == 0 { + estBits += guessHeaderSizeBits + } + estBits += estBits >> w.logNewTablePenalty + } // Store bytes, if we don't get a reasonable improvement. - if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { + if storable && ssize <= estBits { w.writeStoredHeader(len(input), eof) w.writeBytes(input) return } - // Huffman. 
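The quick check above skips Huffman coding for data that is clearly random: for storable inputs over 1 KiB it sums the squared deviations of the byte histogram from a flat distribution and stores the block if the sum stays under 2*len(input). A standalone sketch of the same test on random versus repetitive input:

package main

import (
	"bytes"
	"fmt"
	"math/rand"
)

// looksIncompressible reports whether the byte histogram of data is close
// enough to uniform that Huffman coding is unlikely to help.
func looksIncompressible(data []byte) bool {
	var hist [256]int
	for _, b := range data {
		hist[b]++
	}
	avg := float64(len(data)) / 256
	limit := float64(len(data) * 2)
	sum := 0.0
	for _, v := range hist {
		d := float64(v) - avg
		sum += d * d
		if sum > limit {
			return false
		}
	}
	return true
}

func main() {
	random := make([]byte, 4096)
	rand.New(rand.NewSource(1)).Read(random)
	text := bytes.Repeat([]byte("the quick brown fox "), 200)

	fmt.Println(looksIncompressible(random)) // true: nearly flat histogram
	fmt.Println(looksIncompressible(text))   // false: heavily skewed histogram
}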
- w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) - encoding := w.literalEncoding.codes[:257] - n := w.nbytes - for _, t := range input { - // Bitwriting inlined, ~30% speedup - c := encoding[t] - w.bits |= uint64(c.code) << w.nbits - w.nbits += uint(c.len) - if w.nbits < 48 { - continue + if w.lastHeader > 0 { + reuseSize := w.literalEncoding.canReuseBits(w.literalFreq[:256]) + + if estBits < reuseSize { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 } - // Store 6 bytes - bits := w.bits - w.bits >>= 48 - w.nbits -= 48 - bytes := w.bytes[n : n+6] - bytes[0] = byte(bits) - bytes[1] = byte(bits >> 8) - bytes[2] = byte(bits >> 16) - bytes[3] = byte(bits >> 24) - bytes[4] = byte(bits >> 32) - bytes[5] = byte(bits >> 40) - n += 6 - if n < bufferFlushSize { - continue + } + + if w.lastHeader == 0 { + // Use the temp encoding, so swap. + w.literalEncoding, w.tmpLitEncoding = w.tmpLitEncoding, w.literalEncoding + // Generate codegen and codegenFrequencies, which indicates how to encode + // the literalEncoding and the offsetEncoding. + w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset) + w.codegenEncoding.generate(w.codegenFreq[:], 7) + numCodegens := w.codegens() + + // Huffman. + w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + w.lastHuffMan = true + w.lastHeader, _ = w.headerSize() + } + + encoding := w.literalEncoding.codes[:256] + // Go 1.16 LOVES having these on stack. At least 1.5x the speed. + bits, nbits, nbytes := w.bits, w.nbits, w.nbytes + + // Unroll, write 3 codes/loop. + // Fastest number of unrolls. + for len(input) > 3 { + // We must have at least 48 bits free. + if nbits >= 8 { + n := nbits >> 3 + storeLE64(w.bytes[nbytes:], bits) + bits >>= (n * 8) & 63 + nbits -= n * 8 + nbytes += n } - w.write(w.bytes[:n]) - if w.err != nil { - return // Return early in the event of write failures + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 } - n = 0 + a, b := encoding[input[0]], encoding[input[1]] + bits |= a.code64() << (nbits & 63) + bits |= b.code64() << ((nbits + a.len()) & 63) + c := encoding[input[2]] + nbits += b.len() + a.len() + bits |= c.code64() << (nbits & 63) + nbits += c.len() + input = input[3:] } - w.nbytes = n - w.writeCode(encoding[endBlockMarker]) -} -// histogram accumulates a histogram of b in h. -// -// len(h) must be >= 256, and h's elements must be all zeroes. -func histogram(b []byte, h []int32) { - h = h[:256] - for _, t := range b { - h[t]++ + // Remaining... + for _, t := range input { + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } + // Bitwriting inlined, ~30% speedup + c := encoding[t] + bits |= c.code64() << (nbits & 63) + + nbits += c.len() + } + // Restore... + w.bits, w.nbits, w.nbytes = bits, nbits, nbytes + + // Flush if needed to have space. 
+ if w.nbits >= 48 { + w.writeOutBits() + } + + if eof || sync { + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + w.lastHuffMan = false } } diff --git a/src/compress/flate/huffman_bit_writer_test.go b/src/compress/flate/huffman_bit_writer_test.go index a57799cae02685..dfb93e326c0871 100644 --- a/src/compress/flate/huffman_bit_writer_test.go +++ b/src/compress/flate/huffman_bit_writer_test.go @@ -32,7 +32,9 @@ func TestBlockHuff(t *testing.T) { if strings.HasSuffix(in, ".in") { out = in[:len(in)-len(".in")] + ".golden" } - testBlockHuff(t, in, out) + t.Run(in, func(t *testing.T) { + testBlockHuff(t, in, out) + }) } } @@ -44,7 +46,8 @@ func testBlockHuff(t *testing.T, in, out string) { } var buf bytes.Buffer bw := newHuffmanBitWriter(&buf) - bw.writeBlockHuff(false, all) + bw.logNewTablePenalty = 8 + bw.writeBlockHuff(false, all, false) bw.flush() got := buf.Bytes() @@ -79,7 +82,7 @@ func testBlockHuff(t *testing.T, in, out string) { // Test if the writer produces the same output after reset. buf.Reset() bw.reset(&buf) - bw.writeBlockHuff(false, all) + bw.writeBlockHuff(false, all, false) bw.flush() got = buf.Bytes() if !bytes.Equal(got, want) { @@ -175,13 +178,23 @@ func TestWriteBlockDynamic(t *testing.T) { } } +// TestWriteBlockDynamic tests if the writeBlockDynamic encoding has changed. +// To update the reference files use the "-update" flag on the test. +func TestWriteBlockDynamicSync(t *testing.T) { + for _, test := range writeBlockTests { + testBlock(t, test, "sync") + } +} + // testBlock tests a block against its references, // or regenerate the references, if "-update" flag is set. func testBlock(t *testing.T, test huffTest, ttype string) { if test.want != "" { test.want = fmt.Sprintf(test.want, ttype) } + const gotSuffix = ".got" test.wantNoInput = fmt.Sprintf(test.wantNoInput, ttype) + tokens := indexTokens(test.tokens) if *update { if test.input != "" { t.Logf("Updating %q", test.want) @@ -198,7 +211,7 @@ func testBlock(t *testing.T, test huffTest, ttype string) { } defer f.Close() bw := newHuffmanBitWriter(f) - writeToType(t, ttype, bw, test.tokens, input) + writeToType(t, ttype, bw, tokens, input) } t.Logf("Updating %q", test.wantNoInput) @@ -209,7 +222,7 @@ func testBlock(t *testing.T, test huffTest, ttype string) { } defer f.Close() bw := newHuffmanBitWriter(f) - writeToType(t, ttype, bw, test.tokens, nil) + writeToType(t, ttype, bw, tokens, nil) return } @@ -227,12 +240,12 @@ func testBlock(t *testing.T, test huffTest, ttype string) { } var buf bytes.Buffer bw := newHuffmanBitWriter(&buf) - writeToType(t, ttype, bw, test.tokens, input) + writeToType(t, ttype, bw, tokens, input) got := buf.Bytes() if !bytes.Equal(got, want) { - t.Errorf("writeBlock did not yield expected result for file %q with input. See %q", test.want, test.want+".got") - if err := os.WriteFile(test.want+".got", got, 0666); err != nil { + t.Errorf("writeBlock did not yield expected result for file %q with input. See %q", test.want, test.want+gotSuffix) + if err := os.WriteFile(test.want+gotSuffix, got, 0666); err != nil { t.Error(err) } } @@ -241,12 +254,12 @@ func testBlock(t *testing.T, test huffTest, ttype string) { // Test if the writer produces the same output after reset. buf.Reset() bw.reset(&buf) - writeToType(t, ttype, bw, test.tokens, input) + writeToType(t, ttype, bw, tokens, input) bw.flush() got = buf.Bytes() if !bytes.Equal(got, want) { - t.Errorf("reset: writeBlock did not yield expected result for file %q with input. 
See %q", test.want, test.want+".reset.got") - if err := os.WriteFile(test.want+".reset.got", got, 0666); err != nil { + t.Errorf("reset: writeBlock did not yield expected result for file %q with input. See %q", test.want, test.want+".reset"+gotSuffix) + if err := os.WriteFile(test.want+".reset"+gotSuffix, got, 0666); err != nil { t.Error(err) } return @@ -262,12 +275,12 @@ func testBlock(t *testing.T, test huffTest, ttype string) { } var buf bytes.Buffer bw := newHuffmanBitWriter(&buf) - writeToType(t, ttype, bw, test.tokens, nil) + writeToType(t, ttype, bw, tokens, nil) got := buf.Bytes() if !bytes.Equal(got, wantNI) { - t.Errorf("writeBlock did not yield expected result for file %q with input. See %q", test.wantNoInput, test.wantNoInput+".got") - if err := os.WriteFile(test.want+".got", got, 0666); err != nil { + t.Errorf("writeBlock did not yield expected result for file %q with input. See %q", test.wantNoInput, test.wantNoInput+gotSuffix) + if err := os.WriteFile(test.wantNoInput+gotSuffix, got, 0666); err != nil { t.Error(err) } } else if got[0]&1 == 1 { @@ -280,12 +293,12 @@ func testBlock(t *testing.T, test huffTest, ttype string) { // Test if the writer produces the same output after reset. buf.Reset() bw.reset(&buf) - writeToType(t, ttype, bw, test.tokens, nil) + writeToType(t, ttype, bw, tokens, nil) bw.flush() got = buf.Bytes() if !bytes.Equal(got, wantNI) { - t.Errorf("reset: writeBlock did not yield expected result for file %q without input. See %q", test.want, test.want+".reset.got") - if err := os.WriteFile(test.want+".reset.got", got, 0666); err != nil { + t.Errorf("reset: writeBlock did not yield expected result for file %q without input. See %q", test.wantNoInput, test.wantNoInput+".reset"+gotSuffix) + if err := os.WriteFile(test.wantNoInput+".reset"+gotSuffix, got, 0666); err != nil { t.Error(err) } return @@ -294,12 +307,14 @@ func testBlock(t *testing.T, test huffTest, ttype string) { testWriterEOF(t, "wb", test, false) } -func writeToType(t *testing.T, ttype string, bw *huffmanBitWriter, tok []token, input []byte) { +func writeToType(t *testing.T, ttype string, bw *huffmanBitWriter, tok tokens, input []byte) { switch ttype { case "wb": - bw.writeBlock(tok, false, input) + bw.writeBlock(&tok, false, input) case "dyn": - bw.writeBlockDynamic(tok, false, input) + bw.writeBlockDynamic(&tok, false, input, false) + case "sync": + bw.writeBlockDynamic(&tok, false, input, true) default: panic("unknown test type") } @@ -332,13 +347,14 @@ func testWriterEOF(t *testing.T, ttype string, test huffTest, useInput bool) { } var buf bytes.Buffer bw := newHuffmanBitWriter(&buf) + tokens := indexTokens(test.tokens) switch ttype { case "wb": - bw.writeBlock(test.tokens, true, input) + bw.writeBlock(&tokens, true, input) case "dyn": - bw.writeBlockDynamic(test.tokens, true, input) + bw.writeBlockDynamic(&tokens, true, input, true) case "huff": - bw.writeBlockHuff(true, input) + bw.writeBlockHuff(true, input, true) default: panic("unknown test type") } diff --git a/src/compress/flate/huffman_code.go b/src/compress/flate/huffman_code.go index 6f69cabfd060d4..f3e202430736d3 100644 --- a/src/compress/flate/huffman_code.go +++ b/src/compress/flate/huffman_code.go @@ -7,25 +7,42 @@ package flate import ( "math" "math/bits" - "sort" +) + +const ( + maxBitsLimit = 16 + // number of valid literals + literalCount = 286 ) // hcode is a huffman code with a bit code and bit length. 
-type hcode struct { - code, len uint16 +type hcode uint32 + +func (h hcode) len() uint8 { + return uint8(h) +} + +func (h hcode) code64() uint64 { + return uint64(h >> 8) +} + +func (h hcode) zero() bool { + return h == 0 } type huffmanEncoder struct { - codes []hcode - freqcache []literalNode - bitCount [17]int32 - lns byLiteral // stored to avoid repeated allocation in generate - lfs byFreq // stored to avoid repeated allocation in generate + codes []hcode + bitCount [17]int32 + + // Allocate a reusable buffer with the longest possible frequency table. + // Possible lengths are codegenCodeCount, offsetCodeCount and literalCount. + // The largest of these is literalCount, so we allocate for that case. + freqcache [literalCount + 1]literalNode } type literalNode struct { literal uint16 - freq int32 + freq uint16 } // A levelInfo describes the state of the constructed tree for a given depth. @@ -49,25 +66,34 @@ type levelInfo struct { } // set sets the code and length of an hcode. -func (h *hcode) set(code uint16, length uint16) { - h.len = length - h.code = code +func (h *hcode) set(code uint16, length uint8) { + *h = hcode(length) | (hcode(code) << 8) +} + +func newhcode(code uint16, length uint8) hcode { + return hcode(length) | (hcode(code) << 8) +} + +func reverseBits(number uint16, bitLength byte) uint16 { + return bits.Reverse16(number << ((16 - bitLength) & 15)) } -func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxInt32} } +func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxUint16} } func newHuffmanEncoder(size int) *huffmanEncoder { - return &huffmanEncoder{codes: make([]hcode, size)} + // Make capacity to next power of two. + c := uint(bits.Len32(uint32(size - 1))) + return &huffmanEncoder{codes: make([]hcode, size, 1<= 3. +// canReuseBits returns the number of bits or math.MaxInt32 if the encoder cannot be reused. +func (h *huffmanEncoder) canReuseBits(freq []uint16) int { + var total int + for i, f := range freq { + if f != 0 { + code := h.codes[i] + if code.zero() { + return math.MaxInt32 + } + total += int(f) * int(code.len()) + } + } + return total +} + +// Return the number of literals assigned to each bit size in the Huffman encoding +// +// This method is only called when list.length >= 3 // The cases of 0, 1, and 2 literals are handled by special case code. // -// list is an array of the literals with non-zero frequencies -// and their associated frequencies. The array is in order of increasing -// frequency and has as its last element a special element with frequency -// MaxInt32. +// list An array of the literals with non-zero frequencies +// +// and their associated frequencies. The array is in order of increasing +// frequency, and has as its last element a special element with frequency +// MaxInt32 +// +// maxBits The maximum number of bits that should be used to encode any literal. +// +// Must be less than 16. // -// maxBits is the maximum number of bits that should be used to encode any literal. -// It must be less than 16. +// return An integer array in which array[i] indicates the number of literals // -// bitCounts returns an integer slice in which slice[i] indicates the number of literals -// that should be encoded in i bits. +// that should be encoded in i bits. 
func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { if maxBits >= maxBitsLimit { panic("flate: maxBits too large") @@ -154,14 +205,19 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { // of the level j ancestor. var leafCounts [maxBitsLimit][maxBitsLimit]int32 + // Descending to only have 1 bounds check. + l2f := int32(list[2].freq) + l1f := int32(list[1].freq) + l0f := int32(list[0].freq) + int32(list[1].freq) + for level := int32(1); level <= maxBits; level++ { // For every level, the first two items are the first two characters. // We initialize the levels as if we had already figured this out. levels[level] = levelInfo{ level: level, - lastFreq: list[1].freq, - nextCharFreq: list[2].freq, - nextPairFreq: list[0].freq + list[1].freq, + lastFreq: l1f, + nextCharFreq: l2f, + nextPairFreq: l0f, } leafCounts[level][level] = 2 if level == 1 { @@ -172,11 +228,11 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { // We need a total of 2*n - 2 items at top level and have already generated 2. levels[maxBits].needed = 2*n - 4 - level := maxBits - for { + level := uint32(maxBits) + for level < 16 { l := &levels[level] if l.nextPairFreq == math.MaxInt32 && l.nextCharFreq == math.MaxInt32 { - // We've run out of both leaves and pairs. + // We've run out of both leafs and pairs. // End all calculations for this level. // To make sure we never come back to this level or any lower level, // set nextPairFreq impossibly large. @@ -193,14 +249,21 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { l.lastFreq = l.nextCharFreq // Lower leafCounts are the same of the previous node. leafCounts[level][level] = n - l.nextCharFreq = list[n].freq + e := list[n] + if e.literal < math.MaxUint16 { + l.nextCharFreq = int32(e.freq) + } else { + l.nextCharFreq = math.MaxInt32 + } } else { // The next item on this row is a pair from the previous row. // nextPairFreq isn't valid until we generate two // more values in the level below l.lastFreq = l.nextPairFreq // Take leaf counts from the lower level, except counts[level] remains the same. - copy(leafCounts[level][:level], leafCounts[level-1][:level]) + save := leafCounts[level][level] + leafCounts[level] = leafCounts[level-1] + leafCounts[level][level] = save levels[l.level-1].needed = 2 } @@ -256,9 +319,9 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN // assigned in literal order (not frequency order). chunk := list[len(list)-int(bits):] - h.lns.sort(chunk) + sortByLiteral(chunk) for _, node := range chunk { - h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint16(n)} + h.codes[node.literal] = newhcode(reverseBits(code, uint8(n)), uint8(n)) code++ } list = list[0 : len(list)-int(bits)] @@ -268,15 +331,10 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN // Update this Huffman Code object to be the minimum code for the specified frequency count. // // freq is an array of frequencies, in which freq[i] gives the frequency of literal i. -// maxBits The maximum number of bits to use for any literal. -func (h *huffmanEncoder) generate(freq []int32, maxBits int32) { - if h.freqcache == nil { - // Allocate a reusable buffer with the longest possible frequency table. - // Possible lengths are codegenCodeCount, offsetCodeCount and maxNumLit. - // The largest of these is maxNumLit, so we allocate for that case. 
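assignEncodingAndSize above hands out consecutive codes within each bit length, visiting symbols in literal order, which is the canonical construction of RFC 1951 section 3.2.2. The same result, stated directly from per-symbol code lengths as a standalone sketch rather than via the package's bit-count machinery:

package main

import "fmt"

// canonicalCodes assigns canonical Huffman codes from per-symbol bit
// lengths (0 = symbol unused), following RFC 1951 section 3.2.2.
func canonicalCodes(lengths []uint8) []uint16 {
	const maxBits = 15
	var blCount [maxBits + 1]uint16
	for _, l := range lengths {
		blCount[l]++
	}
	blCount[0] = 0

	// Smallest code for each bit length.
	var nextCode [maxBits + 1]uint16
	code := uint16(0)
	for b := 1; b <= maxBits; b++ {
		code = (code + blCount[b-1]) << 1
		nextCode[b] = code
	}

	codes := make([]uint16, len(lengths))
	for sym, l := range lengths {
		if l != 0 {
			codes[sym] = nextCode[l]
			nextCode[l]++
		}
	}
	return codes
}

func main() {
	// The worked example from RFC 1951: lengths 3,3,3,3,3,2,4,4 yield
	// codes 010,011,100,101,110,00,1110,1111.
	fmt.Println(canonicalCodes([]uint8{3, 3, 3, 3, 3, 2, 4, 4})) // [2 3 4 5 6 0 14 15]
}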
- h.freqcache = make([]literalNode, maxNumLit+1) - } +// maxBits is the maximum number of bits to use for any literal. +func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) { list := h.freqcache[:len(freq)+1] + codes := h.codes[:len(freq)] // Number of non-zero literals count := 0 // Set list to be the set of all non-zero literals and their frequencies @@ -285,9 +343,10 @@ func (h *huffmanEncoder) generate(freq []int32, maxBits int32) { list[count] = literalNode{uint16(i), f} count++ } else { - h.codes[i].len = 0 + codes[i] = 0 } } + list[count] = literalNode{} list = list[:count] if count <= 2 { @@ -299,7 +358,7 @@ func (h *huffmanEncoder) generate(freq []int32, maxBits int32) { } return } - h.lfs.sort(list) + sortByFreq(list) // Get the number of literals for each bit count bitCount := h.bitCounts(list, maxBits) @@ -307,39 +366,43 @@ func (h *huffmanEncoder) generate(freq []int32, maxBits int32) { h.assignEncodingAndSize(bitCount, list) } -type byLiteral []literalNode - -func (s *byLiteral) sort(a []literalNode) { - *s = byLiteral(a) - sort.Sort(s) +// atLeastOne clamps the result between 1 and 15. +func atLeastOne(v float32) float32 { + return min(15, max(1, v)) } -func (s byLiteral) Len() int { return len(s) } - -func (s byLiteral) Less(i, j int) bool { - return s[i].literal < s[j].literal -} - -func (s byLiteral) Swap(i, j int) { s[i], s[j] = s[j], s[i] } - -type byFreq []literalNode - -func (s *byFreq) sort(a []literalNode) { - *s = byFreq(a) - sort.Sort(s) -} - -func (s byFreq) Len() int { return len(s) } - -func (s byFreq) Less(i, j int) bool { - if s[i].freq == s[j].freq { - return s[i].literal < s[j].literal +func histogram(b []byte, h []uint16) { + if len(b) >= 8<<10 { + // Split for bigger inputs + histogramSplit(b, h) + } else { + h = h[:256] + for _, t := range b { + h[t]++ + } } - return s[i].freq < s[j].freq } -func (s byFreq) Swap(i, j int) { s[i], s[j] = s[j], s[i] } - -func reverseBits(number uint16, bitLength byte) uint16 { - return bits.Reverse16(number << (16 - bitLength)) +func histogramSplit(b []byte, h []uint16) { + // Tested, and slightly faster than 2-way. + // Writing to separate arrays and combining is also slightly slower. + h = h[:256] + // Make size divisible by 4 + for len(b)&3 != 0 { + h[b[0]]++ + b = b[1:] + } + n := len(b) / 4 + x, y, z, w := b[:n], b[n:], b[n+n:], b[n+n+n:] + y, z, w = y[:len(x)], z[:len(x)], w[:len(x)] + for i, t := range x { + v0 := &h[t] + v1 := &h[y[i]] + v3 := &h[w[i]] + v2 := &h[z[i]] + *v0++ + *v1++ + *v2++ + *v3++ + } } diff --git a/src/compress/flate/huffman_sortByFreq.go b/src/compress/flate/huffman_sortByFreq.go new file mode 100644 index 00000000000000..6c05ba8c1c2e2a --- /dev/null +++ b/src/compress/flate/huffman_sortByFreq.go @@ -0,0 +1,159 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Sort sorts data. +// It makes one call to data.Len to determine n, and O(n*log(n)) calls to +// data.Less and data.Swap. The sort is not guaranteed to be stable. +func sortByFreq(data []literalNode) { + n := len(data) + quickSortByFreq(data, 0, n, maxDepth(n)) +} + +func quickSortByFreq(data []literalNode, a, b, maxDepth int) { + for b-a > 12 { // Use ShellSort for slices <= 12 elements + if maxDepth == 0 { + heapSort(data, a, b) + return + } + maxDepth-- + mlo, mhi := doPivotByFreq(data, a, b) + // Avoiding recursion on the larger subproblem guarantees + // a stack depth of at most lg(b-a). 
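histogramSplit above counts four interleaved quarters of the input per iteration, so successive increments rarely land on the same counter back to back. A self-contained variant of the same idea (uint32 counters here; the patch uses uint16 counters and explicit pointer increments):

package main

import (
	"bytes"
	"fmt"
)

// histogram4 counts byte frequencies four elements at a time, using four
// independent index streams to reduce serial dependencies on the counters.
func histogram4(b []byte, h *[256]uint32) {
	for len(b)&3 != 0 { // peel until the length is a multiple of 4
		h[b[0]]++
		b = b[1:]
	}
	n := len(b) / 4
	x, y, z, w := b[:n], b[n:2*n], b[2*n:3*n], b[3*n:]
	for i := range x {
		h[x[i]]++
		h[y[i]]++
		h[z[i]]++
		h[w[i]]++
	}
}

func main() {
	data := bytes.Repeat([]byte("abcabcabd"), 1000)
	var fast, ref [256]uint32
	histogram4(data, &fast)
	for _, c := range data {
		ref[c]++
	}
	fmt.Println("equal:", fast == ref) // equal: true
}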
+ if mlo-a < b-mhi { + quickSortByFreq(data, a, mlo, maxDepth) + a = mhi // i.e., quickSortByFreq(data, mhi, b) + } else { + quickSortByFreq(data, mhi, b, maxDepth) + b = mlo // i.e., quickSortByFreq(data, a, mlo) + } + } + if b-a > 1 { + // Do ShellSort pass with gap 6 + // It could be written in this simplified form cause b-a <= 12 + for i := a + 6; i < b; i++ { + if data[i].freq == data[i-6].freq && data[i].literal < data[i-6].literal || data[i].freq < data[i-6].freq { + data[i], data[i-6] = data[i-6], data[i] + } + } + insertionSortByFreq(data, a, b) + } +} + +func doPivotByFreq(data []literalNode, lo, hi int) (midlo, midhi int) { + m := int(uint(lo+hi) >> 1) // Written like this to avoid integer overflow. + if hi-lo > 40 { + // Tukey's ``Ninther,'' median of three medians of three. + s := (hi - lo) / 8 + medianOfThreeSortByFreq(data, lo, lo+s, lo+2*s) + medianOfThreeSortByFreq(data, m, m-s, m+s) + medianOfThreeSortByFreq(data, hi-1, hi-1-s, hi-1-2*s) + } + medianOfThreeSortByFreq(data, lo, m, hi-1) + + // Invariants are: + // data[lo] = pivot (set up by ChoosePivot) + // data[lo < i < a] < pivot + // data[a <= i < b] <= pivot + // data[b <= i < c] unexamined + // data[c <= i < hi-1] > pivot + // data[hi-1] >= pivot + pivot := lo + a, c := lo+1, hi-1 + + for ; a < c && (data[a].freq == data[pivot].freq && data[a].literal < data[pivot].literal || data[a].freq < data[pivot].freq); a++ { + } + b := a + for { + for ; b < c && (data[pivot].freq == data[b].freq && data[pivot].literal > data[b].literal || data[pivot].freq > data[b].freq); b++ { // data[b] <= pivot + } + for ; b < c && (data[pivot].freq == data[c-1].freq && data[pivot].literal < data[c-1].literal || data[pivot].freq < data[c-1].freq); c-- { // data[c-1] > pivot + } + if b >= c { + break + } + // data[b] > pivot; data[c-1] <= pivot + data[b], data[c-1] = data[c-1], data[b] + b++ + c-- + } + // If hi-c<3 then there are duplicates (by property of median of nine). + // Let's be a bit more conservative, and set border to 5. 
+ protect := hi-c < 5 + if !protect && hi-c < (hi-lo)/4 { + // Lets test some points for equality to pivot + dups := 0 + if data[pivot].freq == data[hi-1].freq && data[pivot].literal > data[hi-1].literal || data[pivot].freq > data[hi-1].freq { // data[hi-1] = pivot + data[c], data[hi-1] = data[hi-1], data[c] + c++ + dups++ + } + if data[b-1].freq == data[pivot].freq && data[b-1].literal > data[pivot].literal || data[b-1].freq > data[pivot].freq { // data[b-1] = pivot + b-- + dups++ + } + // m-lo = (hi-lo)/2 > 6 + // b-lo > (hi-lo)*3/4-1 > 8 + // ==> m < b ==> data[m] <= pivot + if data[m].freq == data[pivot].freq && data[m].literal > data[pivot].literal || data[m].freq > data[pivot].freq { // data[m] = pivot + data[m], data[b-1] = data[b-1], data[m] + b-- + dups++ + } + // if at least 2 points are equal to pivot, assume skewed distribution + protect = dups > 1 + } + if protect { + // Protect against a lot of duplicates + // Add invariant: + // data[a <= i < b] unexamined + // data[b <= i < c] = pivot + for { + for ; a < b && (data[b-1].freq == data[pivot].freq && data[b-1].literal > data[pivot].literal || data[b-1].freq > data[pivot].freq); b-- { // data[b] == pivot + } + for ; a < b && (data[a].freq == data[pivot].freq && data[a].literal < data[pivot].literal || data[a].freq < data[pivot].freq); a++ { // data[a] < pivot + } + if a >= b { + break + } + // data[a] == pivot; data[b-1] < pivot + data[a], data[b-1] = data[b-1], data[a] + a++ + b-- + } + } + // Swap pivot into middle + data[pivot], data[b-1] = data[b-1], data[pivot] + return b - 1, c +} + +// Insertion sort +func insertionSortByFreq(data []literalNode, a, b int) { + for i := a + 1; i < b; i++ { + for j := i; j > a && (data[j].freq == data[j-1].freq && data[j].literal < data[j-1].literal || data[j].freq < data[j-1].freq); j-- { + data[j], data[j-1] = data[j-1], data[j] + } + } +} + +// quickSortByFreq, loosely following Bentley and McIlroy, +// ``Engineering a Sort Function,'' SP&E November 1993. + +// medianOfThreeSortByFreq moves the median of the three values data[m0], data[m1], data[m2] into data[m1]. +func medianOfThreeSortByFreq(data []literalNode, m1, m0, m2 int) { + // sort 3 elements + if data[m1].freq == data[m0].freq && data[m1].literal < data[m0].literal || data[m1].freq < data[m0].freq { + data[m1], data[m0] = data[m0], data[m1] + } + // data[m0] <= data[m1] + if data[m2].freq == data[m1].freq && data[m2].literal < data[m1].literal || data[m2].freq < data[m1].freq { + data[m2], data[m1] = data[m1], data[m2] + // data[m0] <= data[m2] && data[m1] < data[m2] + if data[m1].freq == data[m0].freq && data[m1].literal < data[m0].literal || data[m1].freq < data[m0].freq { + data[m1], data[m0] = data[m0], data[m1] + } + } + // now data[m0] <= data[m1] <= data[m2] +} diff --git a/src/compress/flate/huffman_sortByLiteral.go b/src/compress/flate/huffman_sortByLiteral.go new file mode 100644 index 00000000000000..93f1aea109e123 --- /dev/null +++ b/src/compress/flate/huffman_sortByLiteral.go @@ -0,0 +1,201 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Sort sorts data. +// It makes one call to data.Len to determine n, and O(n*log(n)) calls to +// data.Less and data.Swap. The sort is not guaranteed to be stable. 
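The specialized quicksort above exists so the encoder can sort literal nodes without the allocations and interface calls of sort.Sort. The ordering it implements, ascending frequency with the literal value as tiebreak, is the same as this sort.Slice call, shown only to make the predicate explicit:

package main

import (
	"fmt"
	"sort"
)

type literalNode struct {
	literal uint16
	freq    uint16
}

// sortByFreqSimple expresses the same ordering as sortByFreq above:
// ascending frequency, literal value as the tiebreak.
func sortByFreqSimple(data []literalNode) {
	sort.Slice(data, func(i, j int) bool {
		if data[i].freq == data[j].freq {
			return data[i].literal < data[j].literal
		}
		return data[i].freq < data[j].freq
	})
}

func main() {
	nodes := []literalNode{{literal: 'b', freq: 3}, {literal: 'a', freq: 3}, {literal: 'z', freq: 1}}
	sortByFreqSimple(nodes)
	fmt.Println(nodes) // [{122 1} {97 3} {98 3}]
}

The long freq-equal-and-literal-less-or-freq-less comparisons inside doPivotByFreq and insertionSortByFreq are this predicate inlined by hand.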
+func sortByLiteral(data []literalNode) { + n := len(data) + quickSort(data, 0, n, maxDepth(n)) +} + +func quickSort(data []literalNode, a, b, maxDepth int) { + for b-a > 12 { // Use ShellSort for slices <= 12 elements + if maxDepth == 0 { + heapSort(data, a, b) + return + } + maxDepth-- + mlo, mhi := doPivot(data, a, b) + // Avoiding recursion on the larger subproblem guarantees + // a stack depth of at most lg(b-a). + if mlo-a < b-mhi { + quickSort(data, a, mlo, maxDepth) + a = mhi // i.e., quickSort(data, mhi, b) + } else { + quickSort(data, mhi, b, maxDepth) + b = mlo // i.e., quickSort(data, a, mlo) + } + } + if b-a > 1 { + // Do ShellSort pass with gap 6 + // It could be written in this simplified form cause b-a <= 12 + for i := a + 6; i < b; i++ { + if data[i].literal < data[i-6].literal { + data[i], data[i-6] = data[i-6], data[i] + } + } + insertionSort(data, a, b) + } +} +func heapSort(data []literalNode, a, b int) { + first := a + lo := 0 + hi := b - a + + // Build heap with greatest element at top. + for i := (hi - 1) / 2; i >= 0; i-- { + siftDown(data, i, hi, first) + } + + // Pop elements, largest first, into end of data. + for i := hi - 1; i >= 0; i-- { + data[first], data[first+i] = data[first+i], data[first] + siftDown(data, lo, i, first) + } +} + +// siftDown implements the heap property on data[lo, hi). +// first is an offset into the array where the root of the heap lies. +func siftDown(data []literalNode, lo, hi, first int) { + root := lo + for { + child := 2*root + 1 + if child >= hi { + break + } + if child+1 < hi && data[first+child].literal < data[first+child+1].literal { + child++ + } + if data[first+root].literal > data[first+child].literal { + return + } + data[first+root], data[first+child] = data[first+child], data[first+root] + root = child + } +} +func doPivot(data []literalNode, lo, hi int) (midlo, midhi int) { + m := int(uint(lo+hi) >> 1) // Written like this to avoid integer overflow. + if hi-lo > 40 { + // Tukey's ``Ninther,'' median of three medians of three. + s := (hi - lo) / 8 + medianOfThree(data, lo, lo+s, lo+2*s) + medianOfThree(data, m, m-s, m+s) + medianOfThree(data, hi-1, hi-1-s, hi-1-2*s) + } + medianOfThree(data, lo, m, hi-1) + + // Invariants are: + // data[lo] = pivot (set up by ChoosePivot) + // data[lo < i < a] < pivot + // data[a <= i < b] <= pivot + // data[b <= i < c] unexamined + // data[c <= i < hi-1] > pivot + // data[hi-1] >= pivot + pivot := lo + a, c := lo+1, hi-1 + + for ; a < c && data[a].literal < data[pivot].literal; a++ { + } + b := a + for { + for ; b < c && data[pivot].literal > data[b].literal; b++ { // data[b] <= pivot + } + for ; b < c && data[pivot].literal < data[c-1].literal; c-- { // data[c-1] > pivot + } + if b >= c { + break + } + // data[b] > pivot; data[c-1] <= pivot + data[b], data[c-1] = data[c-1], data[b] + b++ + c-- + } + // If hi-c<3 then there are duplicates (by property of median of nine). + // Let's be a bit more conservative, and set border to 5. 
+ protect := hi-c < 5 + if !protect && hi-c < (hi-lo)/4 { + // Lets test some points for equality to pivot + dups := 0 + if data[pivot].literal > data[hi-1].literal { // data[hi-1] = pivot + data[c], data[hi-1] = data[hi-1], data[c] + c++ + dups++ + } + if data[b-1].literal > data[pivot].literal { // data[b-1] = pivot + b-- + dups++ + } + // m-lo = (hi-lo)/2 > 6 + // b-lo > (hi-lo)*3/4-1 > 8 + // ==> m < b ==> data[m] <= pivot + if data[m].literal > data[pivot].literal { // data[m] = pivot + data[m], data[b-1] = data[b-1], data[m] + b-- + dups++ + } + // if at least 2 points are equal to pivot, assume skewed distribution + protect = dups > 1 + } + if protect { + // Protect against a lot of duplicates + // Add invariant: + // data[a <= i < b] unexamined + // data[b <= i < c] = pivot + for { + for ; a < b && data[b-1].literal > data[pivot].literal; b-- { // data[b] == pivot + } + for ; a < b && data[a].literal < data[pivot].literal; a++ { // data[a] < pivot + } + if a >= b { + break + } + // data[a] == pivot; data[b-1] < pivot + data[a], data[b-1] = data[b-1], data[a] + a++ + b-- + } + } + // Swap pivot into middle + data[pivot], data[b-1] = data[b-1], data[pivot] + return b - 1, c +} + +// Insertion sort +func insertionSort(data []literalNode, a, b int) { + for i := a + 1; i < b; i++ { + for j := i; j > a && data[j].literal < data[j-1].literal; j-- { + data[j], data[j-1] = data[j-1], data[j] + } + } +} + +// maxDepth returns a threshold at which quicksort should switch +// to heapsort. It returns 2*ceil(lg(n+1)). +func maxDepth(n int) int { + var depth int + for i := n; i > 0; i >>= 1 { + depth++ + } + return depth * 2 +} + +// medianOfThree moves the median of the three values data[m0], data[m1], data[m2] into data[m1]. +func medianOfThree(data []literalNode, m1, m0, m2 int) { + // sort 3 elements + if data[m1].literal < data[m0].literal { + data[m1], data[m0] = data[m0], data[m1] + } + // data[m0] <= data[m1] + if data[m2].literal < data[m1].literal { + data[m2], data[m1] = data[m1], data[m2] + // data[m0] <= data[m2] && data[m1] < data[m2] + if data[m1].literal < data[m0].literal { + data[m1], data[m0] = data[m0], data[m1] + } + } + // now data[m0] <= data[m1] <= data[m2] +} diff --git a/src/compress/flate/level1.go b/src/compress/flate/level1.go new file mode 100644 index 00000000000000..2195df4fa38f93 --- /dev/null +++ b/src/compress/flate/level1.go @@ -0,0 +1,197 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 1 uses a single small table with 5 byte hashes. +type fastEncL1 struct { + fastGen + table [tableSize]tableEntry +} + +func (e *fastEncL1) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + hashBytes = 5 + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. 
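Each fast level opens with the same guard seen above: once e.cur nears bufferReset, every stored table offset is rebased so that entries older than the window collapse to zero (meaning "no candidate") while recent entries keep their relative distance. The rebase rule in isolation, a sketch that assumes only the 32 KiB window constant (bufferReset itself is defined elsewhere in the patch):

package main

import "fmt"

const maxMatchOffset = 1 << 15 // window size; matches farther back are invalid

// rebase shifts a stored absolute offset so that, after the cursor is
// reset to maxMatchOffset, entries outside the window become zero and
// entries inside it keep pointing at the same history position.
func rebase(offset, cur, histLen int32) int32 {
	minOff := cur + histLen - maxMatchOffset
	if offset <= minOff {
		return 0
	}
	return offset - cur + maxMatchOffset
}

func main() {
	cur, histLen := int32(1<<30), int32(4096)
	fmt.Println(rebase(cur+100, cur, histLen))            // recent entry survives: 32868
	fmt.Println(rebase(cur-maxMatchOffset, cur, histLen)) // stale entry cleared: 0
}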
+ dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + + // nextEmit is where in src the next emitLiterals should start from. + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + cv := loadLE64(src, s) + + for { + const skipLog = 5 + const doEvery = 2 + + nextS := s + var candidate tableEntry + var t int32 + for { + nextHash := hashLen(cv, tableBits, hashBytes) + candidate = e.table[nextHash] + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + + now := loadLE64(src, nextS) + e.table[nextHash] = tableEntry{offset: s + e.cur} + nextHash = hashLen(now, tableBits, hashBytes) + t = candidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + e.table[nextHash] = tableEntry{offset: nextS + e.cur} + break + } + + // Do one right away... + cv = now + s = nextS + nextS++ + candidate = e.table[nextHash] + now >>= 8 + e.table[nextHash] = tableEntry{offset: s + e.cur} + + t = candidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + e.table[nextHash] = tableEntry{offset: nextS + e.cur} + break + } + cv = now + s = nextS + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + + // Extend the 4-byte match as long as possible. + l := e.matchlenLong(int(s+4), int(t+4), src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && loadLE8(src, t-1) == loadLE8(src, s-1) { + s-- + t-- + l++ + } + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + // Save the match found. Same as 'dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))' + xOffset := uint32(s - t - baseMatchOffset) + xLength := l + oc := offsetCode(xOffset) + xOffset |= oc << 16 + for xLength > 0 { + xl := xLength + if xl > 258 { + if xl > 258+baseMatchLength { + xl = 258 + } else { + xl = 258 - baseMatchLength + } + } + xLength -= xl + xl -= baseMatchLength + dst.extraHist[lengthCodes1[uint8(xl)]]++ + dst.offHist[oc]++ + dst.tokens[dst.n] = token(matchType | uint32(xl)<= s { + s = nextS + 1 + } + if s >= sLimit { + // Index first pair after match end. + if int(s+l+8) < len(src) { + cv := loadLE64(src, s) + e.table[hashLen(cv, tableBits, hashBytes)] = tableEntry{offset: s + e.cur} + } + goto emitRemainder + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 and at s. If + // another emitCopy is not our next move, also calculate nextHash + // at s+1. At least on GOARCH=amd64, these three hash calculations + // are faster as one load64 call (with some shifts) instead of + // three load32 calls. 
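The search loops above advance by nextS = s + doEvery + (s-nextEmit)>>skipLog, so the probe step grows the longer the encoder goes without emitting a match. A small illustration with the level 1 constants (doEvery = 2, skipLog = 5):

package main

import "fmt"

// step reproduces the advance rule of the level 1 search loop: the further
// s has moved past the last emit without a match, the larger the jump.
func step(s, nextEmit int32) int32 {
	const doEvery = 2
	const skipLog = 5
	return doEvery + (s-nextEmit)>>skipLog
}

func main() {
	for _, gap := range []int32{0, 32, 256, 1024} {
		fmt.Printf("bytes since last emit %4d -> advance by %d\n", gap, step(gap, 0))
	}
}

Incompressible data is therefore skimmed in growing strides instead of being probed byte by byte.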
+ x := loadLE64(src, s-2) + o := e.cur + s - 2 + prevHash := hashLen(x, tableBits, hashBytes) + e.table[prevHash] = tableEntry{offset: o} + x >>= 16 + currHash := hashLen(x, tableBits, hashBytes) + candidate = e.table[currHash] + e.table[currHash] = tableEntry{offset: o + 2} + + t = candidate.offset - e.cur + if s-t > maxMatchOffset || uint32(x) != loadLE32(src, t) { + cv = x >> 8 + s++ + break + } + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/level2.go b/src/compress/flate/level2.go new file mode 100644 index 00000000000000..7a2fdf7abe6ddb --- /dev/null +++ b/src/compress/flate/level2.go @@ -0,0 +1,187 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 2 uses a similar algorithm to level 1, but with a larger table. +type fastEncL2 struct { + fastGen + table [bTableSize]tableEntry +} + +func (e *fastEncL2) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + hashBytes = 5 + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + + // nextEmit is where in src the next emitLiterals should start from. + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + cv := loadLE64(src, s) + for { + // When should we start skipping if we haven't found matches in a long while. + const skipLog = 5 + const doEvery = 2 + + nextS := s + var candidate tableEntry + for { + nextHash := hashLen(cv, bTableBits, hashBytes) + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + candidate = e.table[nextHash] + now := loadLE64(src, nextS) + e.table[nextHash] = tableEntry{offset: s + e.cur} + nextHash = hashLen(now, bTableBits, hashBytes) + + offset := s - (candidate.offset - e.cur) + if offset < maxMatchOffset && uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + e.table[nextHash] = tableEntry{offset: nextS + e.cur} + break + } + + // Do one right away... + cv = now + s = nextS + nextS++ + candidate = e.table[nextHash] + now >>= 8 + e.table[nextHash] = tableEntry{offset: s + e.cur} + + offset = s - (candidate.offset - e.cur) + if offset < maxMatchOffset && uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + break + } + cv = now + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes match. + for { + // Extend the 4-byte match as long as possible. 
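The block above refreshes the hash table at s-2 and s from one 8-byte load, deriving the second key by shifting rather than issuing another load. A sketch of the same trick; loadLE64 mirrors the patch's helper, while hash5 is only a stand-in for hashLen, whose exact mixing constant is not shown in this hunk:

package main

import (
	"encoding/binary"
	"fmt"
)

const tableBits = 15 // illustrative table size exponent

// loadLE64 reads 8 little-endian bytes starting at i.
func loadLE64(b []byte, i int) uint64 { return binary.LittleEndian.Uint64(b[i:]) }

// hash5 is a stand-in for hashLen with hashBytes=5: keep only the low
// 5 bytes and mix them with a multiplicative hash.
func hash5(u uint64) uint32 {
	const prime = 889523592379 // any odd mixing constant works for the sketch
	return uint32(((u << 24) * prime) >> (64 - tableBits))
}

func main() {
	src := []byte("abcdefghijklmnop")
	x := loadLE64(src, 2) // covers src[2:10]
	h0 := hash5(x)        // hash of the 5 bytes at position 2
	h1 := hash5(x >> 16)  // hash of the 5 bytes at position 4, no second load
	fmt.Println(h0, h1)
}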
+ t := candidate.offset - e.cur + l := e.matchlenLong(int(s+4), int(t+4), src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + // Index first pair after match end. + if int(s+l+8) < len(src) { + cv := loadLE64(src, s) + e.table[hashLen(cv, bTableBits, hashBytes)] = tableEntry{offset: s + e.cur} + } + goto emitRemainder + } + + // Store every second hash in-between, but offset by 1. + for i := s - l + 2; i < s-5; i += 7 { + x := loadLE64(src, i) + nextHash := hashLen(x, bTableBits, hashBytes) + e.table[nextHash] = tableEntry{offset: e.cur + i} + // Skip one + x >>= 16 + nextHash = hashLen(x, bTableBits, hashBytes) + e.table[nextHash] = tableEntry{offset: e.cur + i + 2} + // Skip one + x >>= 16 + nextHash = hashLen(x, bTableBits, hashBytes) + e.table[nextHash] = tableEntry{offset: e.cur + i + 4} + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 to s. If + // another emitCopy is not our next move, also calculate nextHash + // at s+1. + x := loadLE64(src, s-2) + o := e.cur + s - 2 + prevHash := hashLen(x, bTableBits, hashBytes) + prevHash2 := hashLen(x>>8, bTableBits, hashBytes) + e.table[prevHash] = tableEntry{offset: o} + e.table[prevHash2] = tableEntry{offset: o + 1} + currHash := hashLen(x>>16, bTableBits, hashBytes) + candidate = e.table[currHash] + e.table[currHash] = tableEntry{offset: o + 2} + + offset := s - (candidate.offset - e.cur) + if offset > maxMatchOffset || uint32(x>>16) != loadLE32(src, candidate.offset-e.cur) { + cv = x >> 24 + s++ + break + } + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/level3.go b/src/compress/flate/level3.go new file mode 100644 index 00000000000000..adda8714879c8d --- /dev/null +++ b/src/compress/flate/level3.go @@ -0,0 +1,226 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 3 uses a similar algorithm to level 2, with a smaller table, +// but will check up two candidates for each iteration with more +// entries added to the table. +type fastEncL3 struct { + fastGen + table [1 << 16]tableEntryPrev +} + +func (e *fastEncL3) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + tableBits = 16 + hashBytes = 5 + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntryPrev{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. 
+ minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i] + if v.Cur.offset <= minOff { + v.Cur.offset = 0 + } else { + v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset + } + if v.Prev.offset <= minOff { + v.Prev.offset = 0 + } else { + v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset + } + e.table[i] = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // Skip if too small. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiterals should start from. + cv := loadLE64(src, s) + for { + const skipLog = 7 + nextS := s + var candidate tableEntry + for { + nextHash := hashLen(cv, tableBits, hashBytes) + s = nextS + nextS = s + 1 + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + candidates := e.table[nextHash] + now := loadLE64(src, nextS) + + // Safe offset distance until s + 4... + minOffset := e.cur + s - (maxMatchOffset - 4) + e.table[nextHash] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur}} + + // Check both candidates + candidate = candidates.Cur + if candidate.offset < minOffset { + cv = now + // Previous will also be invalid, we have nothing. + continue + } + + if uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + if candidates.Prev.offset < minOffset || uint32(cv) != loadLE32(src, candidates.Prev.offset-e.cur) { + break + } + // Both match and are valid, pick longest. + offset := s - (candidate.offset - e.cur) + o2 := s - (candidates.Prev.offset - e.cur) + l1, l2 := matchLen(src[s+4:], src[s-offset+4:]), matchLen(src[s+4:], src[s-o2+4:]) + if l2 > l1 { + candidate = candidates.Prev + } + break + } else { + // We only check if value mismatches. + // Offset will always be invalid in other cases. + candidate = candidates.Prev + if candidate.offset > minOffset && uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + break + } + } + cv = now + } + + for { + // Extend the 4-byte match as long as possible. + // + t := candidate.offset - e.cur + l := e.matchlenLong(int(s+4), int(t+4), src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + // Emit literals. + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + // Emit match. + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + t += l + // Index first pair after match end. + if int(t+8) < len(src) && t > 0 { + cv = loadLE64(src, t) + nextHash := hashLen(cv, tableBits, hashBytes) + e.table[nextHash] = tableEntryPrev{ + Prev: e.table[nextHash].Cur, + Cur: tableEntry{offset: e.cur + t}, + } + } + goto emitRemainder + } + + // Store every 5th hash in-between. + for i := s - l + 2; i < s-5; i += 6 { + nextHash := hashLen(loadLE64(src, i), tableBits, hashBytes) + e.table[nextHash] = tableEntryPrev{ + Prev: e.table[nextHash].Cur, + Cur: tableEntry{offset: e.cur + i}} + } + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 to s. 
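Level 3 above keys its table on tableEntryPrev buckets, so each lookup can test both the current and the previous position that hashed to the slot. The bucket discipline in isolation:

package main

import "fmt"

// tableEntry and tableEntryPrev mirror the level 3 bucket shape above:
// each bucket remembers the current and the previous position that hashed
// there, giving two candidates per lookup.
type tableEntry struct{ offset int32 }

type tableEntryPrev struct {
	Cur  tableEntry
	Prev tableEntry
}

// insert pushes a new position into the bucket, demoting Cur to Prev.
func insert(b *tableEntryPrev, pos int32) {
	b.Prev = b.Cur
	b.Cur = tableEntry{offset: pos}
}

func main() {
	var bucket tableEntryPrev
	insert(&bucket, 100)
	insert(&bucket, 250)
	// A lookup now checks offset 250 first and falls back to 100.
	fmt.Printf("Cur=%d Prev=%d\n", bucket.Cur.offset, bucket.Prev.offset)
}

When both candidates are valid, the encoder keeps whichever yields the longer match, as in the l1/l2 comparison above.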
+ x := loadLE64(src, s-2) + prevHash := hashLen(x, tableBits, hashBytes) + + e.table[prevHash] = tableEntryPrev{ + Prev: e.table[prevHash].Cur, + Cur: tableEntry{offset: e.cur + s - 2}, + } + x >>= 8 + prevHash = hashLen(x, tableBits, hashBytes) + + e.table[prevHash] = tableEntryPrev{ + Prev: e.table[prevHash].Cur, + Cur: tableEntry{offset: e.cur + s - 1}, + } + x >>= 8 + currHash := hashLen(x, tableBits, hashBytes) + candidates := e.table[currHash] + cv = x + e.table[currHash] = tableEntryPrev{ + Prev: candidates.Cur, + Cur: tableEntry{offset: s + e.cur}, + } + + // Check both candidates + candidate = candidates.Cur + minOffset := e.cur + s - (maxMatchOffset - 4) + + if candidate.offset > minOffset { + if uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + // Found a match... + continue + } + candidate = candidates.Prev + if candidate.offset > minOffset && uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + // Match at prev... + continue + } + } + cv = x >> 8 + s++ + break + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/level4.go b/src/compress/flate/level4.go new file mode 100644 index 00000000000000..ceb899793e3148 --- /dev/null +++ b/src/compress/flate/level4.go @@ -0,0 +1,204 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 4 uses two tables, one for short (4 bytes) and one for long (7 bytes) matches. +type fastEncL4 struct { + fastGen + table [tableSize]tableEntry + bTable [tableSize]tableEntry +} + +func (e *fastEncL4) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + hashShortBytes = 4 + ) + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.bTable[:] { + e.bTable[i] = tableEntry{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + for i := range e.bTable[:] { + v := e.bTable[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.bTable[i].offset = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiterals should start from. 
+ cv := loadLE64(src, s) + for { + const skipLog = 6 + const doEvery = 1 + + nextS := s + var t int32 + for { + nextHashS := hashLen(cv, tableBits, hashShortBytes) + nextHashL := hashLen(cv, tableBits, hashLongBytes) + + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + // Fetch a short+long candidate + sCandidate := e.table[nextHashS] + lCandidate := e.bTable[nextHashL] + next := loadLE64(src, nextS) + entry := tableEntry{offset: s + e.cur} + e.table[nextHashS] = entry + e.bTable[nextHashL] = entry + + t = lCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // We got a long match. Use that. + break + } + + t = sCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // Found a 4 match... + lCandidate = e.bTable[hashLen(next, tableBits, hashLongBytes)] + + // If the next long is a candidate, check if we should use that instead... + lOff := lCandidate.offset - e.cur + if nextS-lOff < maxMatchOffset && loadLE32(src, lOff) == uint32(next) { + l1, l2 := matchLen(src[s+4:], src[t+4:]), matchLen(src[nextS+4:], src[nextS-lOff+4:]) + if l2 > l1 { + s = nextS + t = lCandidate.offset - e.cur + } + } + break + } + cv = next + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + // Extend the 4-byte match as long as possible. + l := e.matchlenLong(int(s+4), int(t+4), src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + // Index first pair after match end. + if int(s+8) < len(src) { + cv := loadLE64(src, s) + e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: s + e.cur} + e.bTable[hashLen(cv, tableBits, hashLongBytes)] = tableEntry{offset: s + e.cur} + } + goto emitRemainder + } + + // Store every 3rd hash in-between + i := nextS + if i < s-1 { + cv := loadLE64(src, i) + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} + e.bTable[hashLen(cv, tableBits, hashLongBytes)] = t + e.bTable[hashLen(cv>>8, tableBits, hashLongBytes)] = t2 + e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 + + i += 3 + for ; i < s-1; i += 3 { + cv := loadLE64(src, i) + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} + e.bTable[hashLen(cv, tableBits, hashLongBytes)] = t + e.bTable[hashLen(cv>>8, tableBits, hashLongBytes)] = t2 + e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 + } + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-1 and at s. + x := loadLE64(src, s-1) + o := e.cur + s - 1 + prevHashS := hashLen(x, tableBits, hashShortBytes) + prevHashL := hashLen(x, tableBits, hashLongBytes) + e.table[prevHashS] = tableEntry{offset: o} + e.bTable[prevHashL] = tableEntry{offset: o} + cv = x >> 8 + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. 
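Level 4 above maintains two tables, one keyed on a 4-byte hash and one on a longer hash, and prefers the long-hash candidate because a hit there implies more matching context. A schematic of that lookup order, using maps and a stand-in hash purely for illustration (the patch uses fixed-size arrays and its own hashLen):

package main

import (
	"encoding/binary"
	"fmt"
)

const tableBits = 15

// hashN hashes the low n bytes of u; a stand-in, not the patch's helper.
func hashN(u uint64, n uint) uint32 {
	u <<= 8 * (8 - n) // keep only the low n bytes
	return uint32((u * 0x9E3779B185EBCA87) >> (64 - tableBits))
}

func main() {
	src := []byte("the quick brown fox jumps over the lazy dog")
	cv := binary.LittleEndian.Uint64(src)

	short := map[uint32]int{} // 4-byte hash -> position
	long := map[uint32]int{}  // 7-byte hash -> position
	long[hashN(cv, 7)] = 0
	short[hashN(cv, 4)] = 0

	if p, ok := long[hashN(cv, 7)]; ok {
		fmt.Println("long candidate at", p) // preferred: longer context, fewer false hits
	} else if p, ok := short[hashN(cv, 4)]; ok {
		fmt.Println("short candidate at", p)
	}
}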
+ if dst.n == 0 { + return + } + + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/level5.go b/src/compress/flate/level5.go new file mode 100644 index 00000000000000..29f1df27413b82 --- /dev/null +++ b/src/compress/flate/level5.go @@ -0,0 +1,291 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 5 is similar to level 4, but for long matches two candidates are tested. +// Once a match is found, when it stops it will attempt to find a match that extends further. +type fastEncL5 struct { + fastGen + table [tableSize]tableEntry + bTable [tableSize]tableEntryPrev +} + +func (e *fastEncL5) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + hashShortBytes = 4 + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.bTable[:] { + e.bTable[i] = tableEntryPrev{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + for i := range e.bTable[:] { + v := e.bTable[i] + if v.Cur.offset <= minOff { + v.Cur.offset = 0 + v.Prev.offset = 0 + } else { + v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset + if v.Prev.offset <= minOff { + v.Prev.offset = 0 + } else { + v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset + } + } + e.bTable[i] = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + + // nextEmit is where in src the next emitLiterals should start from. + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. 
+ sLimit := int32(len(src) - inputMargin) + + cv := loadLE64(src, s) + for { + const skipLog = 6 + const doEvery = 1 + + nextS := s + var l int32 + var t int32 + for { + nextHashS := hashLen(cv, tableBits, hashShortBytes) + nextHashL := hashLen(cv, tableBits, hashLongBytes) + + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + // Fetch a short+long candidate + sCandidate := e.table[nextHashS] + lCandidate := e.bTable[nextHashL] + next := loadLE64(src, nextS) + entry := tableEntry{offset: s + e.cur} + e.table[nextHashS] = entry + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = entry, eLong.Cur + + nextHashS = hashLen(next, tableBits, hashShortBytes) + nextHashL = hashLen(next, tableBits, hashLongBytes) + + t = lCandidate.Cur.offset - e.cur + if s-t < maxMatchOffset { + if uint32(cv) == loadLE32(src, t) { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + + t2 := lCandidate.Prev.offset - e.cur + if s-t2 < maxMatchOffset && uint32(cv) == loadLE32(src, t2) { + l = e.matchLenLimited(int(s+4), int(t+4), src) + 4 + ml1 := e.matchLenLimited(int(s+4), int(t2+4), src) + 4 + if ml1 > l { + t = t2 + l = ml1 + break + } + } + break + } + t = lCandidate.Prev.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + break + } + } + + t = sCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // Found a 4 match... + l = e.matchLenLimited(int(s+4), int(t+4), src) + 4 + lCandidate = e.bTable[nextHashL] + // Store the next match + + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + + // If the next long is a candidate, use that... + t2 := lCandidate.Cur.offset - e.cur + if nextS-t2 < maxMatchOffset { + if loadLE32(src, t2) == uint32(next) { + ml := e.matchLenLimited(int(nextS+4), int(t2+4), src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + // If the previous long is a candidate, use that... + t2 = lCandidate.Prev.offset - e.cur + if nextS-t2 < maxMatchOffset && loadLE32(src, t2) == uint32(next) { + ml := e.matchLenLimited(int(nextS+4), int(t2+4), src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + } + break + } + cv = next + } + + if l == 0 { + // Extend the 4-byte match as long as possible. + l = e.matchlenLong(int(s+4), int(t+4), src) + 4 + } else if l == maxMatchLength { + l += e.matchlenLong(int(s+l), int(t+l), src) + } + + // Try to locate a better match by checking the end of best match... + if sAt := s + l; l < 30 && sAt < sLimit { + // Allow some bytes at the beginning to mismatch. + // Sweet spot is 2/3 bytes depending on input. + // 3 is only a little better when it is but sometimes a lot worse. + // The skipped bytes are tested in Extend backwards, + // and still picked up as part of the match if they do. 
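After a candidate is confirmed, every level extends it forward (matchlenLong above) and then walks backwards while the bytes just before s and t also agree, as in the repeated "Extend backwards" loops. Both steps in a self-contained sketch:

package main

import "fmt"

// matchLen returns how many leading bytes of a and b are equal.
func matchLen(a, b []byte) int32 {
	var n int32
	for n < int32(len(a)) && n < int32(len(b)) && a[n] == b[n] {
		n++
	}
	return n
}

// extend grows a match of length l found at src[s:] against src[t:]
// (t < s): forward while bytes agree, then backward while the bytes
// just before s and t also agree.
func extend(src []byte, s, t, l int32) (newS, newT, newL int32) {
	l += matchLen(src[s+l:], src[t+l:])
	for t > 0 && s > 0 && src[t-1] == src[s-1] {
		s--
		t--
		l++
	}
	return s, t, l
}

func main() {
	src := []byte("xabcdefg....xabcdefgh")
	// Suppose the hash probe confirmed a 4-byte match of src[14:18] vs src[2:6].
	s, t, l := extend(src, 14, 2, 4)
	fmt.Println(s, t, l) // 12 0 8
}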
+ const skipBeginning = 2 + eLong := e.bTable[hashLen(loadLE64(src, sAt), tableBits, hashLongBytes)].Cur.offset + t2 := eLong - e.cur - l + skipBeginning + s2 := s + skipBeginning + off := s2 - t2 + if t2 >= 0 && off < maxMatchOffset && off > 0 { + if l2 := e.matchlenLong(int(s2), int(t2), src); l2 > l { + t = t2 + l = l2 + s = s2 + } + } + } + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + goto emitRemainder + } + + // Store every 3rd hash in-between. + const hashEvery = 3 + i := s - l + 1 + if i < s-1 { + cv := loadLE64(src, i) + t := tableEntry{offset: i + e.cur} + e.table[hashLen(cv, tableBits, hashShortBytes)] = t + eLong := &e.bTable[hashLen(cv, tableBits, hashLongBytes)] + eLong.Cur, eLong.Prev = t, eLong.Cur + + // Do an long at i+1 + cv >>= 8 + t = tableEntry{offset: t.offset + 1} + eLong = &e.bTable[hashLen(cv, tableBits, hashLongBytes)] + eLong.Cur, eLong.Prev = t, eLong.Cur + + // We only have enough bits for a short entry at i+2 + cv >>= 8 + t = tableEntry{offset: t.offset + 1} + e.table[hashLen(cv, tableBits, hashShortBytes)] = t + + // Skip one - otherwise we risk hitting 's' + i += 4 + for ; i < s-1; i += hashEvery { + cv := loadLE64(src, i) + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} + eLong := &e.bTable[hashLen(cv, tableBits, hashLongBytes)] + eLong.Cur, eLong.Prev = t, eLong.Cur + e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 + } + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-1 and at s. + x := loadLE64(src, s-1) + o := e.cur + s - 1 + prevHashS := hashLen(x, tableBits, hashShortBytes) + prevHashL := hashLen(x, tableBits, hashLongBytes) + e.table[prevHashS] = tableEntry{offset: o} + eLong := &e.bTable[prevHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: o}, eLong.Cur + cv = x >> 8 + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/level6.go b/src/compress/flate/level6.go new file mode 100644 index 00000000000000..d709f31e21fc42 --- /dev/null +++ b/src/compress/flate/level6.go @@ -0,0 +1,301 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 6 extends level 5, but does "repeat offset" check, +// as well as adding more hash entries to the tables. +type fastEncL6 struct { + fastGen + table [tableSize]tableEntry + bTable [tableSize]tableEntryPrev +} + +func (e *fastEncL6) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + hashShortBytes = 4 + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.bTable[:] { + e.bTable[i] = tableEntryPrev{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. 
+ minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + for i := range e.bTable[:] { + v := e.bTable[i] + if v.Cur.offset <= minOff { + v.Cur.offset = 0 + v.Prev.offset = 0 + } else { + v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset + if v.Prev.offset <= minOff { + v.Prev.offset = 0 + } else { + v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset + } + } + e.bTable[i] = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + + // nextEmit is where in src the next emitLiterals should start from. + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + cv := loadLE64(src, s) + // Repeat MUST be > 1 and within range + repeat := int32(1) + for { + const skipLog = 7 + const doEvery = 1 + + nextS := s + var l int32 + var t int32 + for { + nextHashS := hashLen(cv, tableBits, hashShortBytes) + nextHashL := hashLen(cv, tableBits, hashLongBytes) + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + // Fetch a short+long candidate + sCandidate := e.table[nextHashS] + lCandidate := e.bTable[nextHashL] + next := loadLE64(src, nextS) + entry := tableEntry{offset: s + e.cur} + e.table[nextHashS] = entry + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = entry, eLong.Cur + + // Calculate hashes of 'next' + nextHashS = hashLen(next, tableBits, hashShortBytes) + nextHashL = hashLen(next, tableBits, hashLongBytes) + + t = lCandidate.Cur.offset - e.cur + if s-t < maxMatchOffset { + if uint32(cv) == loadLE32(src, t) { + // Long candidate matches at least 4 bytes. + + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + + // Check the previous long candidate as well. + t2 := lCandidate.Prev.offset - e.cur + if s-t2 < maxMatchOffset && uint32(cv) == loadLE32(src, t2) { + l = e.matchLenLimited(int(s+4), int(t+4), src) + 4 + ml1 := e.matchLenLimited(int(s+4), int(t2+4), src) + 4 + if ml1 > l { + t = t2 + l = ml1 + break + } + } + break + } + // Current value did not match, but check if previous long value does. + t = lCandidate.Prev.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + break + } + } + + t = sCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // Found a 4 match... 
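In the level 6 hunk that follows, the encoder also remembers the offset of the previous match and, after a short candidate hits, probes that same offset one byte ahead, keeping it when it gives a longer match. A simplified sketch of that repeat-offset check (the real code compares 4-byte prefixes and uses matchLenLimited; here full lengths are compared directly):

package main

import "fmt"

// matchLen returns how many leading bytes of a and b are equal.
func matchLen(a, b []byte) int32 {
	var n int32
	for n < int32(len(a)) && n < int32(len(b)) && a[n] == b[n] {
		n++
	}
	return n
}

// tryRepeat checks whether re-using the previous match offset at s+1
// beats the candidate match of length l just found at s, which is the
// essence of the level 6 repeat-offset branch below.
func tryRepeat(src []byte, s, repeat, l int32) (int32, bool) {
	t2 := s + 1 - repeat
	if t2 < 0 {
		return l, false
	}
	if ml := matchLen(src[s+1:], src[t2:]); ml > l {
		return ml, true
	}
	return l, false
}

func main() {
	src := []byte("abcabcabcabcabc")
	// Previous match used offset 3; a weak 4-byte candidate was just found at s=9.
	l, used := tryRepeat(src, 9, 3, 4)
	fmt.Println(l, used) // 5 true
}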
+ l = e.matchLenLimited(int(s+4), int(t+4), src) + 4 + + // Look up next long candidate (at nextS) + lCandidate = e.bTable[nextHashL] + + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + + // Check repeat at s + repOff + const repOff = 1 + t2 := s - repeat + repOff + if loadLE32(src, t2) == uint32(cv>>(8*repOff)) { + ml := e.matchLenLimited(int(s+4+repOff), int(t2+4), src) + 4 + if ml > l { + t = t2 + l = ml + s += repOff + // Not worth checking more. + break + } + } + + // If the next long is a candidate, use that... + t2 = lCandidate.Cur.offset - e.cur + if nextS-t2 < maxMatchOffset { + if loadLE32(src, t2) == uint32(next) { + ml := e.matchLenLimited(int(nextS+4), int(t2+4), src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + // This is ok, but check previous as well. + } + } + // If the previous long is a candidate, use that... + t2 = lCandidate.Prev.offset - e.cur + if nextS-t2 < maxMatchOffset && loadLE32(src, t2) == uint32(next) { + ml := e.matchLenLimited(int(nextS+4), int(t2+4), src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + } + break + } + cv = next + } + + // Extend the 4-byte match as long as possible. + if l == 0 { + l = e.matchlenLong(int(s+4), int(t+4), src) + 4 + } else if l == maxMatchLength { + l += e.matchlenLong(int(s+l), int(t+l), src) + } + + // Try to locate a better match by checking the end-of-match... + if sAt := s + l; sAt < sLimit { + // Allow some bytes at the beginning to mismatch. + // Sweet spot is 2/3 bytes depending on input. + // 3 is only a little better when it is but sometimes a lot worse. + // The skipped bytes are tested in extend backwards, + // and still picked up as part of the match if they do. + const skipBeginning = 2 + eLong := &e.bTable[hashLen(loadLE64(src, sAt), tableBits, hashLongBytes)] + // Test current + t2 := eLong.Cur.offset - e.cur - l + skipBeginning + s2 := s + skipBeginning + off := s2 - t2 + if off < maxMatchOffset { + if off > 0 && t2 >= 0 { + if l2 := e.matchlenLong(int(s2), int(t2), src); l2 > l { + t = t2 + l = l2 + s = s2 + } + } + // Test previous entry: + t2 = eLong.Prev.offset - e.cur - l + skipBeginning + off := s2 - t2 + if off > 0 && off < maxMatchOffset && t2 >= 0 { + if l2 := e.matchlenLong(int(s2), int(t2), src); l2 > l { + t = t2 + l = l2 + s = s2 + } + } + } + } + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + repeat = s - t + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + // Index after match end. + for i := nextS + 1; i < int32(len(src))-8; i += 2 { + cv := loadLE64(src, i) + e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: i + e.cur} + eLong := &e.bTable[hashLen(cv, tableBits, hashLongBytes)] + eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur}, eLong.Cur + } + goto emitRemainder + } + + // Store every long hash in-between and every second short. 
+ for i := nextS + 1; i < s-1; i += 2 { + cv := loadLE64(src, i) + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} + eLong := &e.bTable[hashLen(cv, tableBits, hashLongBytes)] + eLong2 := &e.bTable[hashLen(cv>>8, tableBits, hashLongBytes)] + e.table[hashLen(cv, tableBits, hashShortBytes)] = t + eLong.Cur, eLong.Prev = t, eLong.Cur + eLong2.Cur, eLong2.Prev = t2, eLong2.Cur + } + cv = loadLE64(src, s) + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/regmask_amd64.go b/src/compress/flate/regmask_amd64.go new file mode 100644 index 00000000000000..cd1469a909173d --- /dev/null +++ b/src/compress/flate/regmask_amd64.go @@ -0,0 +1,14 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +const ( + // Masks for shifts with register sizes of the shift value. + // This can be used to work around the x86 design of shifting by mod register size. + // It can be used when a variable shift is always smaller than the register size. + + // reg8SizeMask64 - shift value is 8 bits on 64 bit register. + reg8SizeMask64 = 63 +) diff --git a/src/compress/flate/regmask_other.go b/src/compress/flate/regmask_other.go new file mode 100644 index 00000000000000..e25fc87af1b0d2 --- /dev/null +++ b/src/compress/flate/regmask_other.go @@ -0,0 +1,18 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 +// +build !amd64 + +package flate + +const ( + // Masks for shifts with register sizes of the shift value. + // This can be used to work around the x86 design of shifting by mod register size. + // On other platforms the mask is ineffective so the AND can be removed by the compiler. + // It can be used when a variable shift is always smaller than the register size. + + // reg8SizeMask64 - shift value is 8 bits on 64 bit register. 
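+ // Illustrative use (hypothetical, not taken from this change): for a uint8
+ // shift count c, writing v >> (c & reg8SizeMask64) lets the amd64 build (mask
+ // 63) prove the count is below 64 and emit a bare shift, while here the 0xff
+ // mask is a no-op on the 8-bit value and the compiler drops the AND.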
+ reg8SizeMask64 = 0xff +) diff --git a/src/compress/flate/testdata/huffman-null-max.sync.expect b/src/compress/flate/testdata/huffman-null-max.sync.expect new file mode 100644 index 00000000000000..c08165143f2c57 Binary files /dev/null and b/src/compress/flate/testdata/huffman-null-max.sync.expect differ diff --git a/src/compress/flate/testdata/huffman-null-max.sync.expect-noinput b/src/compress/flate/testdata/huffman-null-max.sync.expect-noinput new file mode 100644 index 00000000000000..c08165143f2c57 Binary files /dev/null and b/src/compress/flate/testdata/huffman-null-max.sync.expect-noinput differ diff --git a/src/compress/flate/testdata/huffman-pi.sync.expect b/src/compress/flate/testdata/huffman-pi.sync.expect new file mode 100644 index 00000000000000..e4396ac6fe5e34 Binary files /dev/null and b/src/compress/flate/testdata/huffman-pi.sync.expect differ diff --git a/src/compress/flate/testdata/huffman-pi.sync.expect-noinput b/src/compress/flate/testdata/huffman-pi.sync.expect-noinput new file mode 100644 index 00000000000000..e4396ac6fe5e34 Binary files /dev/null and b/src/compress/flate/testdata/huffman-pi.sync.expect-noinput differ diff --git a/src/compress/flate/testdata/huffman-rand-1k.dyn.expect-noinput b/src/compress/flate/testdata/huffman-rand-1k.dyn.expect-noinput index 0c24742fde2487..016db5595c47c5 100644 Binary files a/src/compress/flate/testdata/huffman-rand-1k.dyn.expect-noinput and b/src/compress/flate/testdata/huffman-rand-1k.dyn.expect-noinput differ diff --git a/src/compress/flate/testdata/huffman-rand-1k.sync.expect b/src/compress/flate/testdata/huffman-rand-1k.sync.expect new file mode 100644 index 00000000000000..09dc798ee37df8 Binary files /dev/null and b/src/compress/flate/testdata/huffman-rand-1k.sync.expect differ diff --git a/src/compress/flate/testdata/huffman-rand-1k.sync.expect-noinput b/src/compress/flate/testdata/huffman-rand-1k.sync.expect-noinput new file mode 100644 index 00000000000000..0c24742fde2487 Binary files /dev/null and b/src/compress/flate/testdata/huffman-rand-1k.sync.expect-noinput differ diff --git a/src/compress/flate/testdata/huffman-rand-limit.dyn.expect b/src/compress/flate/testdata/huffman-rand-limit.dyn.expect index 2d6527934e9830..881e59c9ab9bb3 100644 Binary files a/src/compress/flate/testdata/huffman-rand-limit.dyn.expect and b/src/compress/flate/testdata/huffman-rand-limit.dyn.expect differ diff --git a/src/compress/flate/testdata/huffman-rand-limit.dyn.expect-noinput b/src/compress/flate/testdata/huffman-rand-limit.dyn.expect-noinput index 2d6527934e9830..881e59c9ab9bb3 100644 Binary files a/src/compress/flate/testdata/huffman-rand-limit.dyn.expect-noinput and b/src/compress/flate/testdata/huffman-rand-limit.dyn.expect-noinput differ diff --git a/src/compress/flate/testdata/huffman-rand-limit.golden b/src/compress/flate/testdata/huffman-rand-limit.golden index 57e59322e9884e..9ca0eb1ce22ff2 100644 Binary files a/src/compress/flate/testdata/huffman-rand-limit.golden and b/src/compress/flate/testdata/huffman-rand-limit.golden differ diff --git a/src/compress/flate/testdata/huffman-rand-limit.sync.expect b/src/compress/flate/testdata/huffman-rand-limit.sync.expect new file mode 100644 index 00000000000000..881e59c9ab9bb3 Binary files /dev/null and b/src/compress/flate/testdata/huffman-rand-limit.sync.expect differ diff --git a/src/compress/flate/testdata/huffman-rand-limit.sync.expect-noinput b/src/compress/flate/testdata/huffman-rand-limit.sync.expect-noinput new file mode 100644 index 00000000000000..881e59c9ab9bb3 Binary files 
/dev/null and b/src/compress/flate/testdata/huffman-rand-limit.sync.expect-noinput differ diff --git a/src/compress/flate/testdata/huffman-shifts.sync.expect b/src/compress/flate/testdata/huffman-shifts.sync.expect new file mode 100644 index 00000000000000..7812c1c62da3cb Binary files /dev/null and b/src/compress/flate/testdata/huffman-shifts.sync.expect differ diff --git a/src/compress/flate/testdata/huffman-shifts.sync.expect-noinput b/src/compress/flate/testdata/huffman-shifts.sync.expect-noinput new file mode 100644 index 00000000000000..7812c1c62da3cb Binary files /dev/null and b/src/compress/flate/testdata/huffman-shifts.sync.expect-noinput differ diff --git a/src/compress/flate/testdata/huffman-text-shift.sync.expect b/src/compress/flate/testdata/huffman-text-shift.sync.expect new file mode 100644 index 00000000000000..71ce3aeb75a86e Binary files /dev/null and b/src/compress/flate/testdata/huffman-text-shift.sync.expect differ diff --git a/src/compress/flate/testdata/huffman-text-shift.sync.expect-noinput b/src/compress/flate/testdata/huffman-text-shift.sync.expect-noinput new file mode 100644 index 00000000000000..71ce3aeb75a86e Binary files /dev/null and b/src/compress/flate/testdata/huffman-text-shift.sync.expect-noinput differ diff --git a/src/compress/flate/testdata/huffman-text.sync.expect b/src/compress/flate/testdata/huffman-text.sync.expect new file mode 100644 index 00000000000000..d448727c323caf --- /dev/null +++ b/src/compress/flate/testdata/huffman-text.sync.expect @@ -0,0 +1 @@ +_K0`K0Aasě)^HIɟb߻_>4 a=-^ 1`_ 1 ő:Y-F66!A`aC;ANyr4ߜU!GKС#r:B[G3.L׶bFRuM]^⇳(#Z ivBBH2S]u/ֽWTGnr \ No newline at end of file diff --git a/src/compress/flate/testdata/huffman-text.sync.expect-noinput b/src/compress/flate/testdata/huffman-text.sync.expect-noinput new file mode 100644 index 00000000000000..d448727c323caf --- /dev/null +++ b/src/compress/flate/testdata/huffman-text.sync.expect-noinput @@ -0,0 +1 @@ +_K0`K0Aasě)^HIɟb߻_>4 a=-^ 1`_ 1 ő:Y-F66!A`aC;ANyr4ߜU!GKС#r:B[G3.L׶bFRuM]^⇳(#Z ivBBH2S]u/ֽWTGnr \ No newline at end of file diff --git a/src/compress/flate/testdata/huffman-zero.dyn.expect b/src/compress/flate/testdata/huffman-zero.dyn.expect index 830348a79ad9ab..dbe401c54c4b6f 100644 Binary files a/src/compress/flate/testdata/huffman-zero.dyn.expect and b/src/compress/flate/testdata/huffman-zero.dyn.expect differ diff --git a/src/compress/flate/testdata/huffman-zero.dyn.expect-noinput b/src/compress/flate/testdata/huffman-zero.dyn.expect-noinput index 830348a79ad9ab..dbe401c54c4b6f 100644 Binary files a/src/compress/flate/testdata/huffman-zero.dyn.expect-noinput and b/src/compress/flate/testdata/huffman-zero.dyn.expect-noinput differ diff --git a/src/compress/flate/testdata/huffman-zero.sync.expect b/src/compress/flate/testdata/huffman-zero.sync.expect new file mode 100644 index 00000000000000..dbe401c54c4b6f Binary files /dev/null and b/src/compress/flate/testdata/huffman-zero.sync.expect differ diff --git a/src/compress/flate/testdata/huffman-zero.sync.expect-noinput b/src/compress/flate/testdata/huffman-zero.sync.expect-noinput new file mode 100644 index 00000000000000..dbe401c54c4b6f Binary files /dev/null and b/src/compress/flate/testdata/huffman-zero.sync.expect-noinput differ diff --git a/src/compress/flate/testdata/null-long-match.sync.expect-noinput b/src/compress/flate/testdata/null-long-match.sync.expect-noinput new file mode 100644 index 00000000000000..8b92d9fc20f1ee Binary files /dev/null and b/src/compress/flate/testdata/null-long-match.sync.expect-noinput differ diff 
--git a/src/compress/flate/token.go b/src/compress/flate/token.go index fc0e4941e7bcd2..3f0d1c358077b8 100644 --- a/src/compress/flate/token.go +++ b/src/compress/flate/token.go @@ -4,20 +4,26 @@ package flate +import ( + "math" +) + const ( - // 2 bits: type 0 = literal 1=EOF 2=Match 3=Unused - // 8 bits: xlength = length - MIN_MATCH_LENGTH - // 22 bits xoffset = offset - MIN_OFFSET_SIZE, or literal - lengthShift = 22 - offsetMask = 1<maxnumlit + offHist [32]uint16 // offset codes + litHist [256]uint16 // codes 0->255 + nFilled int + n uint16 // Must be able to contain maxStoreBlockSize + tokens [65536]token +} + +func (t *tokens) Reset() { + if t.n == 0 { + return + } + t.n = 0 + t.nFilled = 0 + clear(t.litHist[:]) + clear(t.extraHist[:]) + clear(t.offHist[:]) +} + +func indexTokens(in []token) tokens { + var t tokens + t.indexTokens(in) + return t +} + +func (t *tokens) indexTokens(in []token) { + t.Reset() + for _, tok := range in { + if tok < matchType { + t.AddLiteral(tok.literal()) + continue + } + t.AddMatch(uint32(tok.length()), tok.offset()&matchOffsetOnlyMask) + } +} + +// emitLiterals writes a literal chunk and returns the number of bytes written. +func emitLiterals(dst *tokens, lit []byte) { + for _, v := range lit { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } +} + +func (t *tokens) AddLiteral(lit byte) { + t.tokens[t.n] = token(lit) + t.litHist[lit]++ + t.n++ +} + +// from https://stackoverflow.com/a/28730362 +func mFastLog2(val float32) float32 { + ux := int32(math.Float32bits(val)) + log2 := (float32)(((ux >> 23) & 255) - 128) + ux &= -0x7f800001 + ux += 127 << 23 + uval := math.Float32frombits(uint32(ux)) + log2 += ((-0.34484843)*uval+2.02466578)*uval - 0.67487759 + return log2 +} -// Convert a < xlength, xoffset > pair into a match token. -func matchToken(xlength uint32, xoffset uint32) token { - return token(matchType + xlength< 0 { + invTotal := 1.0 / float32(total) + for _, v := range t.litHist[:] { + if v > 0 { + n := float32(v) + shannon += atLeastOne(-mFastLog2(n*invTotal)) * n + } + } + // Just add 15 for EOB + shannon += 15 + for i, v := range t.extraHist[1 : literalCount-256] { + if v > 0 { + n := float32(v) + shannon += atLeastOne(-mFastLog2(n*invTotal)) * n + bits += int(lengthExtraBits[i&31]) * int(v) + nMatches += int(v) + } + } + } + if nMatches > 0 { + invTotal := 1.0 / float32(nMatches) + for i, v := range t.offHist[:offsetCodeCount] { + if v > 0 { + n := float32(v) + shannon += atLeastOne(-mFastLog2(n*invTotal)) * n + bits += int(offsetExtraBits[i&31]) * int(v) + } + } + } + return int(shannon) + bits } -// Returns the literal of a literal token. -func (t token) literal() uint32 { return uint32(t - literalType) } +// AddMatch adds a match to the tokens. +// This function is very sensitive to inlining and right on the border. +func (t *tokens) AddMatch(xlength uint32, xoffset uint32) { + oCode := offsetCode(xoffset) + xoffset |= oCode << 16 -// Returns the extra offset of a match token. + t.extraHist[lengthCodes1[uint8(xlength)]]++ + t.offHist[oCode&31]++ + t.tokens[t.n] = token(matchType | xlength< 0 { + xl := xlength + if xl > 258 { + // We need to have at least baseMatchLength left over for next loop. 
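+ // A single DEFLATE match token carries at most maxMatchLength (258) bytes, so a
+ // longer match is emitted as several tokens at the same offset. If taking the
+ // full 258 would strand a tail shorter than baseMatchLength (3, the DEFLATE
+ // minimum), take 258-baseMatchLength instead so the final chunk stays encodable.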
+ if xl > 258+baseMatchLength { + xl = 258 + } else { + xl = 258 - baseMatchLength + } + } + xlength -= xl + xl -= baseMatchLength + t.extraHist[lengthCodes1[uint8(xl)]]++ + t.offHist[oc&31]++ + t.tokens[t.n] = token(matchType | uint32(xl)<> lengthShift) } +func (t token) length() uint8 { return uint8(t >> lengthShift) } -func lengthCode(len uint32) uint32 { return lengthCodes[len] } +// Convert length to code. +func lengthCode(len uint8) uint8 { return lengthCodes[len] } -// Returns the offset code corresponding to a specific offset. +// Returns the offset code corresponding to a specific offset func offsetCode(off uint32) uint32 { if off < uint32(len(offsetCodes)) { - return offsetCodes[off] - } - if off>>7 < uint32(len(offsetCodes)) { - return offsetCodes[off>>7] + 14 + return offsetCodes[uint8(off)] } - return offsetCodes[off>>14] + 28 + return offsetCodes14[uint8(off>>7)] } diff --git a/src/compress/flate/unsafe_disabled.go b/src/compress/flate/unsafe_disabled.go new file mode 100644 index 00000000000000..c4ecd0fd0a9bb1 --- /dev/null +++ b/src/compress/flate/unsafe_disabled.go @@ -0,0 +1,40 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +type indexer interface { + int | int8 | int16 | int32 | int64 | uint | uint8 | uint16 | uint32 | uint64 +} + +// loadLE8 will load from b at index i. +func loadLE8[I indexer](b []byte, i I) byte { + return b[i] +} + +// loadLE32 will load from b at index i. +func loadLE32[I indexer](b []byte, i I) uint32 { + b = b[i : i+4] + return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 +} + +// loadLE64 will load from b at index i. +func loadLE64[I indexer](b []byte, i I) uint64 { + b = b[i : i+8] + return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +} + +// storeLE64 will store v at start of b. +func storeLE64(b []byte, v uint64) { + _ = b[7] // early bounds check to guarantee safety of writes below + b[0] = byte(v) + b[1] = byte(v >> 8) + b[2] = byte(v >> 16) + b[3] = byte(v >> 24) + b[4] = byte(v >> 32) + b[5] = byte(v >> 40) + b[6] = byte(v >> 48) + b[7] = byte(v >> 56) +} diff --git a/src/compress/flate/writer_test.go b/src/compress/flate/writer_test.go index c413735cd2c9f3..43815b2e4787fd 100644 --- a/src/compress/flate/writer_test.go +++ b/src/compress/flate/writer_test.go @@ -8,6 +8,7 @@ import ( "bytes" "fmt" "io" + "math" "math/rand" "runtime" "testing" @@ -40,6 +41,34 @@ func BenchmarkEncode(b *testing.B) { }) } +func TestWriterMemUsage(t *testing.T) { + testMem := func(t *testing.T, fn func()) { + var before, after runtime.MemStats + runtime.GC() + runtime.ReadMemStats(&before) + fn() + runtime.GC() + runtime.ReadMemStats(&after) + t.Logf("%s: Memory Used: %dKB, %d allocs", t.Name(), (after.HeapInuse-before.HeapInuse)/1024, after.HeapObjects-before.HeapObjects) + } + data := make([]byte, 100000) + + for level := HuffmanOnly; level <= BestCompression; level++ { + t.Run(fmt.Sprint("level-", level), func(t *testing.T) { + var zr *Writer + var err error + testMem(t, func() { + zr, err = NewWriter(io.Discard, level) + if err != nil { + t.Fatal(err) + } + zr.Write(data) + }) + zr.Close() + }) + } +} + // errorWriter is a writer that fails after N writes. 
type errorWriter struct { N int @@ -67,7 +96,7 @@ func TestWriteError(t *testing.T) { in := buf.Bytes() // We create our own buffer to control number of writes. copyBuffer := make([]byte, 128) - for l := 0; l < 10; l++ { + for l := range 10 { for fail := 1; fail <= 256; fail *= 2 { // Fail after 'fail' writes ew := &errorWriter{N: fail} @@ -110,6 +139,75 @@ func TestWriteError(t *testing.T) { } } +// Test if errors from the underlying writer is passed upwards. +func TestWriter_Reset(t *testing.T) { + buf := new(bytes.Buffer) + n := 65536 + if !testing.Short() { + n *= 4 + } + for i := 0; i < n; i++ { + fmt.Fprintf(buf, "asdasfasf%d%dfghfgujyut%dyutyu\n", i, i, i) + } + in := buf.Bytes() + for l := range 10 { + l := l + if testing.Short() && l > 1 { + continue + } + t.Run(fmt.Sprintf("level-%d", l), func(t *testing.T) { + t.Parallel() + offset := 1 + if testing.Short() { + offset = 256 + } + for ; offset <= 256; offset *= 2 { + // Fail after 'fail' writes + w, err := NewWriter(io.Discard, l) + if err != nil { + t.Fatalf("NewWriter: level %d: %v", l, err) + } + if w.d.fast == nil { + t.Skip("Not Fast...") + return + } + for i := 0; i < (bufferReset-len(in)-offset-maxMatchOffset)/maxMatchOffset; i++ { + // skip ahead to where we are close to wrap around... + w.d.fast.Reset() + } + w.d.fast.Reset() + _, err = w.Write(in) + if err != nil { + t.Fatal(err) + } + for range 50 { + // skip ahead again... This should wrap around... + w.d.fast.Reset() + } + w.d.fast.Reset() + + _, err = w.Write(in) + if err != nil { + t.Fatal(err) + } + for range (math.MaxUint32 - bufferReset) / maxMatchOffset { + // skip ahead to where we are close to wrap around... + w.d.fast.Reset() + } + + _, err = w.Write(in) + if err != nil { + t.Fatal(err) + } + err = w.Close() + if err != nil { + t.Fatal(err) + } + } + }) + } +} + // Test if two runs produce identical results // even when writing different sizes to the Writer. func TestDeterministic(t *testing.T) { @@ -171,6 +269,24 @@ func testDeterministic(i int, t *testing.T) { if !bytes.Equal(b1b, b2b) { t.Errorf("level %d did not produce deterministic result, result mismatch, len(a) = %d, len(b) = %d", i, len(b1b), len(b2b)) } + + // Test using io.WriterTo interface. 
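+ // bytes.Buffer.WriteTo hands all remaining input to the Writer in a single
+ // Write call, so this checks that one large write yields the same stream as
+ // the size-varied writes compared above.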
+ var b3 bytes.Buffer + br = bytes.NewBuffer(t1) + w, err = NewWriter(&b3, i) + if err != nil { + t.Fatal(err) + } + _, err = br.WriteTo(w) + if err != nil { + t.Fatal(err) + } + w.Close() + + b3b := b3.Bytes() + if !bytes.Equal(b1b, b3b) { + t.Errorf("level %d (io.WriterTo) did not produce deterministic result, result mismatch, len(a) = %d, len(b) = %d", i, len(b1b), len(b3b)) + } } // TestDeflateFast_Reset will test that encoding is consistent diff --git a/src/compress/zlib/example_test.go b/src/compress/zlib/example_test.go index 70408895ffd5a0..7052973355eb92 100644 --- a/src/compress/zlib/example_test.go +++ b/src/compress/zlib/example_test.go @@ -19,7 +19,7 @@ func ExampleNewWriter() { w.Write([]byte("hello, world\n")) w.Close() fmt.Println(b.Bytes()) - // Output: [120 156 202 72 205 201 201 215 81 40 207 47 202 73 225 2 4 0 0 255 255 33 231 4 147] + // Output: [120 156 0 13 0 242 255 104 101 108 108 111 44 32 119 111 114 108 100 10 3 0 33 231 4 147] } func ExampleNewReader() { diff --git a/src/debug/elf/file_test.go b/src/debug/elf/file_test.go index 0c1a7cf18aeb6e..733daae57772c6 100644 --- a/src/debug/elf/file_test.go +++ b/src/debug/elf/file_test.go @@ -7,7 +7,6 @@ package elf import ( "bytes" "compress/gzip" - "compress/zlib" "debug/dwarf" "encoding/binary" "errors" @@ -1560,18 +1559,9 @@ func TestIssue59208(t *testing.T) { zoffset := sec.Offset + uint64(sec.compressionOffset) copy(dn, data[:zoffset]) - ozd, err := sec.Data() - if err != nil { - t.Fatal(err) - } - buf := bytes.NewBuffer(nil) - wr := zlib.NewWriter(buf) // corrupt origin data same as COMPRESS_ZLIB - copy(ozd, []byte{1, 0, 0, 0}) - wr.Write(ozd) - wr.Close() - - copy(dn[zoffset:], buf.Bytes()) + // Insert zlib compressed sec.Data() block with `[]byte{1, 0, 0, 0}` as the first 4 bytes + copy(dn[zoffset:], []byte{0x78, 0x9c, 0x5c, 0x4d, 0xb9, 0xd, 0x80, 0x30, 0xc, 0x3c, 0x7, 0x27, 0xdc, 0xe, 0xc, 0x46, 0x4b, 0x8b, 0x14, 0x51, 0x20, 0x16, 0xa1, 0x67, 0x8b, 0x2c, 0x88, 0xec, 0x44, 0xc2, 0xe2, 0x8a, 0xdc, 0x1b, 0x59, 0x0, 0x28, 0xc, 0x34, 0x9, 0x7f, 0x22, 0x96, 0xa0, 0x13, 0x67, 0x27, 0xa1, 0x53, 0xea, 0x4e, 0x47, 0x58, 0x7a, 0x98, 0x8d, 0x26, 0xcd, 0xfb, 0x71, 0x21, 0x31, 0x87, 0x7f, 0xca, 0xf3, 0x1b, 0x7a, 0x21, 0xfa, 0x3f, 0x23, 0x4f, 0x3, 0x50, 0x7a, 0xb9, 0xda, 0xfc, 0xae, 0xc3, 0x35, 0x77, 0x1b, 0x94, 0xd5, 0x82, 0x37, 0x0, 0x0, 0xff, 0xff, 0x65, 0xfb, 0x7, 0x6e}) copy(dn[sec.Offset+sec.FileSize:], data[sec.Offset+sec.FileSize:]) nf, err := NewFile(bytes.NewReader(dn))