diff --git a/src/compress/flate/deflate.go b/src/compress/flate/deflate.go index 6697f3a7913cd5..3819f2e1eae81d 100644 --- a/src/compress/flate/deflate.go +++ b/src/compress/flate/deflate.go @@ -27,132 +27,121 @@ const ( // RFC 1951 compliant. That is, any valid DEFLATE decompressor will // continue to be able to decompress this output. HuffmanOnly = -2 -) -const ( - logWindowSize = 15 - windowSize = 1 << logWindowSize - windowMask = windowSize - 1 - - // The LZ77 step produces a sequence of literal tokens and - // pair tokens. The offset is also known as distance. The underlying wire - // format limits the range of lengths and offsets. For example, there are - // 256 legitimate lengths: those in the range [3, 258]. This package's - // compressor uses a higher minimum match length, enabling optimizations - // such as finding matches via 32-bit loads and compares. - baseMatchLength = 3 // The smallest match length per the RFC section 3.2.5 - minMatchLength = 4 // The smallest match length that the compressor actually emits - maxMatchLength = 258 // The largest match length - baseMatchOffset = 1 // The smallest match offset - maxMatchOffset = 1 << 15 // The largest match offset - - // The maximum number of tokens we put into a single flate block, just to - // stop things from getting too large. - maxFlateBlockTokens = 1 << 14 + logWindowSize = 15 + windowSize = 1 << logWindowSize + windowMask = windowSize - 1 + minMatchLength = 4 // The smallest match that the compressor looks for + maxMatchLength = 258 // The longest match for the compressor + minOffsetSize = 1 // The shortest offset that makes any sense + + // The maximum number of tokens we will encode at the time. + // Smaller sizes usually creates less optimal blocks. + // Bigger can make context switching slow. + // We use this for levels 7-9, so we make it big. + maxFlateBlockTokens = 1 << 15 maxStoreBlockSize = 65535 hashBits = 17 // After 17 performance degrades hashSize = 1 << hashBits hashMask = (1 << hashBits) - 1 - maxHashOffset = 1 << 24 + maxHashOffset = 1 << 28 skipNever = math.MaxInt32 ) type compressionLevel struct { - level, good, lazy, nice, chain, fastSkipHashing int + good, lazy, nice, chain, level int } var levels = []compressionLevel{ - {0, 0, 0, 0, 0, 0}, // NoCompression. - {1, 0, 0, 0, 0, 0}, // BestSpeed uses a custom algorithm; see deflatefast.go. - // For levels 2-3 we don't bother trying with lazy matches. - {2, 4, 0, 16, 8, 5}, - {3, 4, 0, 32, 32, 6}, - // Levels 4-9 use increasingly more lazy matching + {}, // 0 + // Level 1-6 uses specialized algorithm - values not used + {0, 0, 0, 0, 1}, + {0, 0, 0, 0, 2}, + {0, 0, 0, 0, 3}, + {0, 0, 0, 0, 4}, + {0, 0, 0, 0, 5}, + {0, 0, 0, 0, 6}, + // Levels 7-9 use increasingly more lazy matching // and increasingly stringent conditions for "good enough". - {4, 4, 4, 16, 16, skipNever}, - {5, 8, 16, 32, 32, skipNever}, - {6, 8, 16, 128, 128, skipNever}, - {7, 8, 32, 128, 256, skipNever}, - {8, 32, 128, 258, 1024, skipNever}, - {9, 32, 258, 258, 4096, skipNever}, + {8, 12, 16, 24, 7}, + {16, 30, 40, 64, 8}, + {32, 258, 258, 1024, 9}, } -type compressor struct { - compressionLevel +// advancedState contains state for the advanced levels, with bigger hash tables, etc. +type advancedState struct { + // deflate state + length int + offset int + maxInsertIndex int + chainHead int + hashOffset int - w *huffmanBitWriter - bulkHasher func([]byte, []uint32) + ii uint16 // position of last match, intended to overflow to reset. 
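+ // ii counts iterations since the last match; deflateLazy resets it on a
+ // match and, once it exceeds the chain length, emits additional literals
+ // without searching (more the longer the run) to speed through data that
+ // is not compressing.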
- // compression algorithm - fill func(*compressor, []byte) int // copy data to window - step func(*compressor) // process window - bestSpeed *deflateFast // Encoder for BestSpeed + // input window: unprocessed data is window[index:windowEnd] + index int + hashMatch [maxMatchLength + minMatchLength]uint32 // Input hash chains // hashHead[hashValue] contains the largest inputIndex with the specified hash value // If hashHead[hashValue] is within the current window, then // hashPrev[hashHead[hashValue] & windowMask] contains the previous index // with the same hash value. - chainHead int - hashHead [hashSize]uint32 - hashPrev [windowSize]uint32 - hashOffset int + hashHead [hashSize]uint32 + hashPrev [windowSize]uint32 +} - // input window: unprocessed data is window[index:windowEnd] - index int - window []byte - windowEnd int - blockStart int // window index where current tokens start - byteAvailable bool // if true, still need to process window[index-1]. +type compressor struct { + compressionLevel - sync bool // requesting flush + h *huffmanEncoder + w *huffmanBitWriter - // queued output tokens - tokens []token + // compression algorithm + fill func(*compressor, []byte) int // copy data to window + step func(*compressor) // process window - // deflate state - length int - offset int - maxInsertIndex int - err error + window []byte + windowEnd int + blockStart int // window index where current tokens start + err error + + // queued output tokens + tokens tokens + fast fastEnc + state *advancedState - // hashMatch must be able to contain hashes for the maximum match length. - hashMatch [maxMatchLength - 1]uint32 + sync bool // requesting flush + byteAvailable bool // if true, still need to process window[index-1]. } func (d *compressor) fillDeflate(b []byte) int { - if d.index >= 2*windowSize-(minMatchLength+maxMatchLength) { + s := d.state + if s.index >= 2*windowSize-(minMatchLength+maxMatchLength) { // shift the window by windowSize - copy(d.window, d.window[windowSize:2*windowSize]) - d.index -= windowSize + //copy(d.window[:], d.window[windowSize:2*windowSize]) + *(*[windowSize]byte)(d.window) = *(*[windowSize]byte)(d.window[windowSize:]) + s.index -= windowSize d.windowEnd -= windowSize if d.blockStart >= windowSize { d.blockStart -= windowSize } else { d.blockStart = math.MaxInt32 } - d.hashOffset += windowSize - if d.hashOffset > maxHashOffset { - delta := d.hashOffset - 1 - d.hashOffset -= delta - d.chainHead -= delta - + s.hashOffset += windowSize + if s.hashOffset > maxHashOffset { + delta := s.hashOffset - 1 + s.hashOffset -= delta + s.chainHead -= delta // Iterate over slices instead of arrays to avoid copying // the entire table onto the stack (Issue #18625). 
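+ // Entries that would rebase to zero or below are stored as 0; since live
+ // positions always carry hashOffset (>= 1), a 0 entry later resolves to a
+ // negative index and is treated as "no previous match".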
- for i, v := range d.hashPrev[:] { - if int(v) > delta { - d.hashPrev[i] = uint32(int(v) - delta) - } else { - d.hashPrev[i] = 0 - } + for i, v := range s.hashPrev[:] { + s.hashPrev[i] = uint32(max(int(v)-delta, 0)) } - for i, v := range d.hashHead[:] { - if int(v) > delta { - d.hashHead[i] = uint32(int(v) - delta) - } else { - d.hashHead[i] = 0 - } + for i, v := range s.hashHead[:] { + s.hashHead[i] = uint32(max(int(v)-delta, 0)) } } } @@ -161,14 +150,38 @@ func (d *compressor) fillDeflate(b []byte) int { return n } -func (d *compressor) writeBlock(tokens []token, index int) error { - if index > 0 { +func (d *compressor) writeBlock(tok *tokens, index int, eof bool) error { + if index > 0 || eof { var window []byte if d.blockStart <= index { window = d.window[d.blockStart:index] } d.blockStart = index - d.w.writeBlock(tokens, false, window) + d.w.writeBlockDynamic(tok, eof, window, d.sync) + return d.w.err + } + return nil +} + +// writeBlockSkip writes the current block and uses the number of tokens +// to determine if the block should be stored on no matches, or +// only huffman encoded. +func (d *compressor) writeBlockSkip(tok *tokens, index int, eof bool) error { + if index > 0 || eof { + if d.blockStart <= index { + window := d.window[d.blockStart:index] + // If we removed less than a 64th of all literals + // we huffman compress the block. + if int(tok.n) > len(window)-int(tok.n>>6) { + d.w.writeBlockHuff(eof, window, d.sync) + } else { + // Write a dynamic huffman block. + d.w.writeBlockDynamic(tok, eof, window, d.sync) + } + } else { + d.w.writeBlock(tok, eof, nil) + } + d.blockStart = index return d.w.err } return nil @@ -177,103 +190,139 @@ func (d *compressor) writeBlock(tokens []token, index int) error { // fillWindow will fill the current window with the supplied // dictionary and calculate all hashes. // This is much faster than doing a full encode. -// Should only be used after a reset. +// Should only be used after a start/reset. func (d *compressor) fillWindow(b []byte) { - // Do not fill window if we are in store-only mode. - if d.compressionLevel.level < 2 { + // Do not fill window if we are in store-only or huffman mode. + if d.level <= 0 { return } - if d.index != 0 || d.windowEnd != 0 { - panic("internal error: fillWindow called with stale data") + if d.fast != nil { + // encode the last data, but discard the result + if len(b) > maxMatchOffset { + b = b[len(b)-maxMatchOffset:] + } + d.fast.Encode(&d.tokens, b) + d.tokens.Reset() + return } - + s := d.state // If we are given too much, cut it. if len(b) > windowSize { b = b[len(b)-windowSize:] } // Add all to window. - n := copy(d.window, b) + n := copy(d.window[d.windowEnd:], b) // Calculate 256 hashes at the time (more L1 cache hits) loops := (n + 256 - minMatchLength) / 256 - for j := 0; j < loops; j++ { - index := j * 256 - end := index + 256 + minMatchLength - 1 - if end > n { - end = n - } - toCheck := d.window[index:end] - dstSize := len(toCheck) - minMatchLength + 1 + for j := range loops { + startindex := j * 256 + end := min(startindex+256+minMatchLength-1, n) + tocheck := d.window[startindex:end] + dstSize := len(tocheck) - minMatchLength + 1 if dstSize <= 0 { continue } - dst := d.hashMatch[:dstSize] - d.bulkHasher(toCheck, dst) + dst := s.hashMatch[:dstSize] + bulkHash4(tocheck, dst) + var newH uint32 for i, val := range dst { - di := i + index - hh := &d.hashHead[val&hashMask] + di := i + startindex + newH = val & hashMask // Get previous value with the same hash. 
// Our chain should point to the previous value. - d.hashPrev[di&windowMask] = *hh + s.hashPrev[di&windowMask] = s.hashHead[newH] // Set the head of the hash chain to us. - *hh = uint32(di + d.hashOffset) + s.hashHead[newH] = uint32(di + s.hashOffset) } } // Update window information. - d.windowEnd = n - d.index = n + d.windowEnd += n + s.index = n } // Try to find a match starting at index whose length is greater than prevSize. // We only look at chainCount possibilities before giving up. -func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) { - minMatchLook := maxMatchLength - if lookahead < minMatchLook { - minMatchLook = lookahead - } +func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, offset int, ok bool) { + minMatchLook := min(lookahead, maxMatchLength) win := d.window[0 : pos+minMatchLook] // We quit when we get a match that's at least nice long - nice := len(win) - pos - if d.nice < nice { - nice = d.nice - } + nice := min(d.nice, len(win)-pos) // If we've got a match that's good enough, only look in 1/4 the chain. tries := d.chain - length = prevLength - if length >= d.good { - tries >>= 2 - } + length = minMatchLength - 1 wEnd := win[pos+length] wPos := win[pos:] - minIndex := pos - windowSize + minIndex := max(pos-windowSize, 0) + offset = 0 + + if d.chain < 100 { + for i := prevHead; tries > 0; tries-- { + if wEnd == win[i+length] { + n := matchLen(win[i:i+minMatchLook], wPos) + if n > length { + length = n + offset = pos - i + ok = true + if n >= nice { + // The match is good enough that we don't try to find a better one. + break + } + wEnd = win[pos+n] + } + } + if i <= minIndex { + // hashPrev[i & windowMask] has already been overwritten, so stop now. + break + } + i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset + if i < minIndex { + break + } + } + return + } + + // Minimum gain to accept a match. + cGain := 4 + + // Some like it higher (CSV), some like it lower (JSON) + const baseCost = 3 + // Base is 4 bytes at with an additional cost. + // Matches must be better than this. for i := prevHead; tries > 0; tries-- { if wEnd == win[i+length] { - n := matchLen(win[i:], wPos, minMatchLook) - - if n > length && (n > minMatchLength || pos-i <= 4096) { - length = n - offset = pos - i - ok = true - if n >= nice { - // The match is good enough that we don't try to find a better one. - break + n := matchLen(win[i:i+minMatchLook], wPos) + if n > length { + // Calculate gain. Estimates the gains of the new match compared to emitting as literals. + newGain := d.h.bitLengthRaw(wPos[:n]) - int(offsetExtraBits[offsetCode(uint32(pos-i))]) - baseCost - int(lengthExtraBits[lengthCodes[(n-3)&255]]) + + if newGain > cGain { + length = n + offset = pos - i + cGain = newGain + ok = true + if n >= nice { + // The match is good enough that we don't try to find a better one. + break + } + wEnd = win[pos+n] } - wEnd = win[pos+n] } } - if i == minIndex { + if i <= minIndex { // hashPrev[i & windowMask] has already been overwritten, so stop now. break } - i = int(d.hashPrev[i&windowMask]) - d.hashOffset - if i < minIndex || i < 0 { + i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset + if i < minIndex { break } } @@ -288,235 +337,272 @@ func (d *compressor) writeStoredBlock(buf []byte) error { return d.w.err } -const hashmul = 0x1e35a7bd - // hash4 returns a hash representation of the first 4 bytes // of the supplied slice. // The caller must ensure that len(b) >= 4. 
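+// The hash itself is multiplicative (Fibonacci-style): the four bytes are
+// loaded as a little-endian uint32, multiplied by prime4bytes, and only the
+// top hashBits bits of the product are kept.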
func hash4(b []byte) uint32 { - return ((uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24) * hashmul) >> (32 - hashBits) + return hash4u(loadLE32(b, 0), hashBits) +} + +// hash4 returns the hash of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <32. +func hash4u(u uint32, h uint8) uint32 { + return (u * prime4bytes) >> (32 - h) } // bulkHash4 will compute hashes using the same -// algorithm as hash4. +// algorithm as hash4 func bulkHash4(b []byte, dst []uint32) { - if len(b) < minMatchLength { + if len(b) < 4 { return } - hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24 - dst[0] = (hb * hashmul) >> (32 - hashBits) - end := len(b) - minMatchLength + 1 - for i := 1; i < end; i++ { - hb = (hb << 8) | uint32(b[i+3]) - dst[i] = (hb * hashmul) >> (32 - hashBits) - } -} - -// matchLen returns the number of matching bytes in a and b -// up to length 'max'. Both slices must be at least 'max' -// bytes in size. -func matchLen(a, b []byte, max int) int { - a = a[:max] - b = b[:len(a)] - for i, av := range a { - if b[i] != av { - return i - } - } - return max -} - -// encSpeed will compress and store the currently added data, -// if enough has been accumulated or we at the end of the stream. -// Any error that occurred will be in d.err -func (d *compressor) encSpeed() { - // We only compress if we have maxStoreBlockSize. - if d.windowEnd < maxStoreBlockSize { - if !d.sync { - return - } - - // Handle small sizes. - if d.windowEnd < 128 { - switch { - case d.windowEnd == 0: - return - case d.windowEnd <= 16: - d.err = d.writeStoredBlock(d.window[:d.windowEnd]) - default: - d.w.writeBlockHuff(false, d.window[:d.windowEnd]) - d.err = d.w.err - } - d.windowEnd = 0 - d.bestSpeed.reset() - return - } - - } - // Encode the block. - d.tokens = d.bestSpeed.encode(d.tokens[:0], d.window[:d.windowEnd]) + hb := loadLE32(b, 0) - // If we removed less than 1/16th, Huffman compress the block. - if len(d.tokens) > d.windowEnd-(d.windowEnd>>4) { - d.w.writeBlockHuff(false, d.window[:d.windowEnd]) - } else { - d.w.writeBlockDynamic(d.tokens, false, d.window[:d.windowEnd]) + dst[0] = hash4u(hb, hashBits) + end := len(b) - 4 + 1 + for i := 1; i < end; i++ { + hb = (hb >> 8) | uint32(b[i+3])<<24 + dst[i] = hash4u(hb, hashBits) } - d.err = d.w.err - d.windowEnd = 0 } func (d *compressor) initDeflate() { d.window = make([]byte, 2*windowSize) - d.hashOffset = 1 - d.tokens = make([]token, 0, maxFlateBlockTokens+1) - d.length = minMatchLength - 1 - d.offset = 0 d.byteAvailable = false - d.index = 0 - d.chainHead = -1 - d.bulkHasher = bulkHash4 + d.err = nil + if d.state == nil { + return + } + s := d.state + s.index = 0 + s.hashOffset = 1 + s.length = minMatchLength - 1 + s.offset = 0 + s.chainHead = -1 } -func (d *compressor) deflate() { - if d.windowEnd-d.index < minMatchLength+maxMatchLength && !d.sync { +// deflateLazy does encoding with lazy matching. +func (d *compressor) deflateLazy() { + s := d.state + + if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync { return } + if d.windowEnd != s.index && d.chain > 100 { + // Get literal huffman coder. + // This is used to estimate the cost of emitting a literal. 
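+ // The byte histogram built below is turned into Huffman code lengths so
+ // that findMatch can estimate, via d.h.bitLengthRaw, how many bits a span
+ // would cost as plain literals and only accept a match whose saving beats
+ // the current best gain.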
+ if d.h == nil { + d.h = newHuffmanEncoder(maxFlateBlockTokens) + } + var tmp [256]uint16 + for _, v := range d.window[s.index:d.windowEnd] { + tmp[v]++ + } + d.h.generate(tmp[:], 15) + } - d.maxInsertIndex = d.windowEnd - (minMatchLength - 1) + s.maxInsertIndex = d.windowEnd - (minMatchLength - 1) -Loop: for { - if d.index > d.windowEnd { - panic("index > windowEnd") - } - lookahead := d.windowEnd - d.index + lookahead := d.windowEnd - s.index if lookahead < minMatchLength+maxMatchLength { if !d.sync { - break Loop - } - if d.index > d.windowEnd { - panic("index > windowEnd") + return } if lookahead == 0 { // Flush current output block if any. if d.byteAvailable { // There is still one pending token that needs to be flushed - d.tokens = append(d.tokens, literalToken(uint32(d.window[d.index-1]))) + d.tokens.AddLiteral(d.window[s.index-1]) d.byteAvailable = false } - if len(d.tokens) > 0 { - if d.err = d.writeBlock(d.tokens, d.index); d.err != nil { + if d.tokens.n > 0 { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { return } - d.tokens = d.tokens[:0] + d.tokens.Reset() } - break Loop + return } } - if d.index < d.maxInsertIndex { + if s.index < s.maxInsertIndex { // Update the hash - hash := hash4(d.window[d.index : d.index+minMatchLength]) - hh := &d.hashHead[hash&hashMask] - d.chainHead = int(*hh) - d.hashPrev[d.index&windowMask] = uint32(d.chainHead) - *hh = uint32(d.index + d.hashOffset) + hash := hash4(d.window[s.index:]) + ch := s.hashHead[hash] + s.chainHead = int(ch) + s.hashPrev[s.index&windowMask] = ch + s.hashHead[hash] = uint32(s.index + s.hashOffset) } - prevLength := d.length - prevOffset := d.offset - d.length = minMatchLength - 1 - d.offset = 0 - minIndex := d.index - windowSize - if minIndex < 0 { - minIndex = 0 + prevLength := s.length + prevOffset := s.offset + s.length = minMatchLength - 1 + s.offset = 0 + minIndex := max(s.index-windowSize, 0) + + if s.chainHead-s.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy { + if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, lookahead); ok { + s.length = newLength + s.offset = newOffset + } } - if d.chainHead-d.hashOffset >= minIndex && - (d.fastSkipHashing != skipNever && lookahead > minMatchLength-1 || - d.fastSkipHashing == skipNever && lookahead > prevLength && prevLength < d.lazy) { - if newLength, newOffset, ok := d.findMatch(d.index, d.chainHead-d.hashOffset, minMatchLength-1, lookahead); ok { - d.length = newLength - d.offset = newOffset + if prevLength >= minMatchLength && s.length <= prevLength { + // No better match, but check for better match at end... + // + // Skip forward a number of bytes. + // Offset of 2 seems to yield the best results. 3 is sometimes better. + const checkOff = 2 + + // Check all, except full length + if prevLength < maxMatchLength-checkOff { + prevIndex := s.index - 1 + if prevIndex+prevLength < s.maxInsertIndex { + end := min(lookahead, maxMatchLength+checkOff) + end += prevIndex + + // Hash at match end. + h := hash4(d.window[prevIndex+prevLength:]) + ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength + if prevIndex-ch2 != prevOffset && ch2 > minIndex+checkOff { + length := matchLen(d.window[prevIndex+checkOff:end], d.window[ch2+checkOff:]) + // It seems like a pure length metric is best. + if length > prevLength { + prevLength = length + prevOffset = prevIndex - ch2 + + // Extend back... 
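+ // The candidate was matched starting checkOff bytes in, so the first
+ // checkOff bytes are still unverified: walk them backwards, growing the
+ // match by one byte per position that agrees and emitting the rest as
+ // literals ("tokens we owe") before taking the match.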
+ for i := checkOff - 1; i >= 0; i-- { + if prevLength >= maxMatchLength || d.window[prevIndex+i] != d.window[ch2+i] { + // Emit tokens we "owe" + for j := 0; j <= i; j++ { + d.tokens.AddLiteral(d.window[prevIndex+j]) + if d.tokens.n == maxFlateBlockTokens { + // The block includes the current character + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { + return + } + d.tokens.Reset() + } + s.index++ + if s.index < s.maxInsertIndex { + h := hash4(d.window[s.index:]) + ch := s.hashHead[h] + s.chainHead = int(ch) + s.hashPrev[s.index&windowMask] = ch + s.hashHead[h] = uint32(s.index + s.hashOffset) + } + } + break + } else { + prevLength++ + } + } + } + } + } } - } - if d.fastSkipHashing != skipNever && d.length >= minMatchLength || - d.fastSkipHashing == skipNever && prevLength >= minMatchLength && d.length <= prevLength { // There was a match at the previous step, and the current match is // not better. Output the previous match. - if d.fastSkipHashing != skipNever { - d.tokens = append(d.tokens, matchToken(uint32(d.length-baseMatchLength), uint32(d.offset-baseMatchOffset))) - } else { - d.tokens = append(d.tokens, matchToken(uint32(prevLength-baseMatchLength), uint32(prevOffset-baseMatchOffset))) - } + d.tokens.AddMatch(uint32(prevLength-3), uint32(prevOffset-minOffsetSize)) + // Insert in the hash table all strings up to the end of the match. // index and index-1 are already inserted. If there is not enough // lookahead, the last two strings are not inserted into the hash // table. - if d.length <= d.fastSkipHashing { - var newIndex int - if d.fastSkipHashing != skipNever { - newIndex = d.index + d.length - } else { - newIndex = d.index + prevLength - 1 - } - index := d.index - for index++; index < newIndex; index++ { - if index < d.maxInsertIndex { - hash := hash4(d.window[index : index+minMatchLength]) - // Get previous value with the same hash. - // Our chain should point to the previous value. - hh := &d.hashHead[hash&hashMask] - d.hashPrev[index&windowMask] = *hh - // Set the head of the hash chain to us. - *hh = uint32(index + d.hashOffset) - } + newIndex := s.index + prevLength - 1 + // Calculate missing hashes + end := min(newIndex, s.maxInsertIndex) + end += minMatchLength - 1 + startindex := min(s.index+1, s.maxInsertIndex) + tocheck := d.window[startindex:end] + dstSize := len(tocheck) - minMatchLength + 1 + if dstSize > 0 { + dst := s.hashMatch[:dstSize] + bulkHash4(tocheck, dst) + var newH uint32 + for i, val := range dst { + di := i + startindex + newH = val & hashMask + // Get previous value with the same hash. + // Our chain should point to the previous value. + s.hashPrev[di&windowMask] = s.hashHead[newH] + // Set the head of the hash chain to us. + s.hashHead[newH] = uint32(di + s.hashOffset) } - d.index = index - - if d.fastSkipHashing == skipNever { - d.byteAvailable = false - d.length = minMatchLength - 1 - } - } else { - // For matches this long, we don't bother inserting each individual - // item into the table. 
- d.index += d.length } - if len(d.tokens) == maxFlateBlockTokens { + + s.index = newIndex + d.byteAvailable = false + s.length = minMatchLength - 1 + if d.tokens.n == maxFlateBlockTokens { // The block includes the current character - if d.err = d.writeBlock(d.tokens, d.index); d.err != nil { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { return } - d.tokens = d.tokens[:0] + d.tokens.Reset() } + s.ii = 0 } else { - if d.fastSkipHashing != skipNever || d.byteAvailable { - i := d.index - 1 - if d.fastSkipHashing != skipNever { - i = d.index - } - d.tokens = append(d.tokens, literalToken(uint32(d.window[i]))) - if len(d.tokens) == maxFlateBlockTokens { - if d.err = d.writeBlock(d.tokens, i+1); d.err != nil { + // Reset, if we got a match this run. + if s.length >= minMatchLength { + s.ii = 0 + } + // We have a byte waiting. Emit it. + if d.byteAvailable { + s.ii++ + d.tokens.AddLiteral(d.window[s.index-1]) + if d.tokens.n == maxFlateBlockTokens { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { return } - d.tokens = d.tokens[:0] + d.tokens.Reset() } - } - d.index++ - if d.fastSkipHashing == skipNever { + s.index++ + + // If we have a long run of no matches, skip additional bytes + // Resets when s.ii overflows after 64KB. + if n := int(s.ii) - d.chain; n > 0 { + n = 1 + int(n>>6) + for j := 0; j < n; j++ { + if s.index >= d.windowEnd-1 { + break + } + d.tokens.AddLiteral(d.window[s.index-1]) + if d.tokens.n == maxFlateBlockTokens { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { + return + } + d.tokens.Reset() + } + // Index... + if s.index < s.maxInsertIndex { + h := hash4(d.window[s.index:]) + ch := s.hashHead[h] + s.chainHead = int(ch) + s.hashPrev[s.index&windowMask] = ch + s.hashHead[h] = uint32(s.index + s.hashOffset) + } + s.index++ + } + // Flush last byte + d.tokens.AddLiteral(d.window[s.index-1]) + d.byteAvailable = false + // s.length = minMatchLength - 1 // not needed, since s.ii is reset above, so it should never be > minMatchLength + if d.tokens.n == maxFlateBlockTokens { + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { + return + } + d.tokens.Reset() + } + } + } else { + s.index++ d.byteAvailable = true } } } } -func (d *compressor) fillStore(b []byte) int { - n := copy(d.window[d.windowEnd:], b) - d.windowEnd += n - return n -} - func (d *compressor) store() { if d.windowEnd > 0 && (d.windowEnd == maxStoreBlockSize || d.sync) { d.err = d.writeStoredBlock(d.window[:d.windowEnd]) @@ -524,38 +610,93 @@ func (d *compressor) store() { } } -// storeHuff compresses and stores the currently added data -// when the d.window is full or we are at the end of the stream. +// fillWindow will fill the buffer with data for huffman-only compression. +// The number of bytes copied is returned. +func (d *compressor) fillBlock(b []byte) int { + n := copy(d.window[d.windowEnd:], b) + d.windowEnd += n + return n +} + +// storeHuff will compress and store the currently added data, +// if enough has been accumulated or we at the end of the stream. // Any error that occurred will be in d.err func (d *compressor) storeHuff() { if d.windowEnd < len(d.window) && !d.sync || d.windowEnd == 0 { return } - d.w.writeBlockHuff(false, d.window[:d.windowEnd]) + d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync) d.err = d.w.err d.windowEnd = 0 } +// storeFast will compress and store the currently added data, +// if enough has been accumulated or we at the end of the stream. 
+// Any error that occurred will be in d.err +func (d *compressor) storeFast() { + // We only compress if we have maxStoreBlockSize. + if d.windowEnd < len(d.window) { + if !d.sync { + return + } + // Handle extremely small sizes. + if d.windowEnd < 128 { + if d.windowEnd == 0 { + return + } + if d.windowEnd <= 32 { + d.err = d.writeStoredBlock(d.window[:d.windowEnd]) + } else { + d.w.writeBlockHuff(false, d.window[:d.windowEnd], true) + d.err = d.w.err + } + d.tokens.Reset() + d.windowEnd = 0 + d.fast.Reset() + return + } + } + + d.fast.Encode(&d.tokens, d.window[:d.windowEnd]) + // If we made zero matches, store the block as is. + if d.tokens.n == 0 { + d.err = d.writeStoredBlock(d.window[:d.windowEnd]) + // If we removed less than 1/16th, huffman compress the block. + } else if int(d.tokens.n) > d.windowEnd-(d.windowEnd>>4) { + d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync) + d.err = d.w.err + } else { + d.w.writeBlockDynamic(&d.tokens, false, d.window[:d.windowEnd], d.sync) + d.err = d.w.err + } + d.tokens.Reset() + d.windowEnd = 0 +} + +// write will add input byte to the stream. +// Unless an error occurs all bytes will be consumed. func (d *compressor) write(b []byte) (n int, err error) { if d.err != nil { return 0, d.err } n = len(b) for len(b) > 0 { - d.step(d) + if d.windowEnd == len(d.window) || d.sync { + d.step(d) + } b = b[d.fill(d, b):] if d.err != nil { return 0, d.err } } - return n, nil + return n, d.err } func (d *compressor) syncFlush() error { + d.sync = true if d.err != nil { return d.err } - d.sync = true d.step(d) if d.err == nil { d.w.writeStoredHeader(0, false) @@ -572,30 +713,33 @@ func (d *compressor) init(w io.Writer, level int) (err error) { switch { case level == NoCompression: d.window = make([]byte, maxStoreBlockSize) - d.fill = (*compressor).fillStore + d.fill = (*compressor).fillBlock d.step = (*compressor).store case level == HuffmanOnly: - d.window = make([]byte, maxStoreBlockSize) - d.fill = (*compressor).fillStore + d.w.logNewTablePenalty = 10 + d.window = make([]byte, 32<<10) + d.fill = (*compressor).fillBlock d.step = (*compressor).storeHuff - case level == BestSpeed: - d.compressionLevel = levels[level] - d.window = make([]byte, maxStoreBlockSize) - d.fill = (*compressor).fillStore - d.step = (*compressor).encSpeed - d.bestSpeed = newDeflateFast() - d.tokens = make([]token, maxStoreBlockSize) case level == DefaultCompression: level = 6 fallthrough - case 2 <= level && level <= 9: + case level >= 1 && level <= 6: + d.w.logNewTablePenalty = 7 + d.fast = newFastEnc(level) + d.window = make([]byte, maxStoreBlockSize) + d.fill = (*compressor).fillBlock + d.step = (*compressor).storeFast + case 7 <= level && level <= 9: + d.w.logNewTablePenalty = 8 + d.state = &advancedState{} d.compressionLevel = levels[level] d.initDeflate() d.fill = (*compressor).fillDeflate - d.step = (*compressor).deflate + d.step = (*compressor).deflateLazy default: return fmt.Errorf("flate: invalid compression level %d: want value in range [-2, 9]", level) } + d.level = level return nil } @@ -603,27 +747,39 @@ func (d *compressor) reset(w io.Writer) { d.w.reset(w) d.sync = false d.err = nil - switch d.compressionLevel.level { - case NoCompression: + // We only need to reset a few things for Snappy. + if d.fast != nil { + d.fast.Reset() d.windowEnd = 0 - case BestSpeed: + d.tokens.Reset() + return + } + switch d.compressionLevel.chain { + case 0: + // level was NoCompression or ConstantCompression. 
d.windowEnd = 0 - d.tokens = d.tokens[:0] - d.bestSpeed.reset() default: - d.chainHead = -1 - clear(d.hashHead[:]) - clear(d.hashPrev[:]) - d.hashOffset = 1 - d.index, d.windowEnd = 0, 0 + s := d.state + s.chainHead = -1 + for i := range s.hashHead { + s.hashHead[i] = 0 + } + for i := range s.hashPrev { + s.hashPrev[i] = 0 + } + s.hashOffset = 1 + s.index, d.windowEnd = 0, 0 d.blockStart, d.byteAvailable = 0, false - d.tokens = d.tokens[:0] - d.length = minMatchLength - 1 - d.offset = 0 - d.maxInsertIndex = 0 + d.tokens.Reset() + s.length = minMatchLength - 1 + s.offset = 0 + s.ii = 0 + s.maxInsertIndex = 0 } } +var errWriterClosed = errors.New("flate: closed writer") + func (d *compressor) close() error { if d.err == errWriterClosed { return nil @@ -644,6 +800,7 @@ func (d *compressor) close() error { return d.w.err } d.err = errWriterClosed + d.w.reset(nil) return nil } @@ -674,26 +831,15 @@ func NewWriter(w io.Writer, level int) (*Writer, error) { // can only be decompressed by a reader initialized with the // same dictionary (see [NewReaderDict]). func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error) { - dw := &dictWriter{w} - zw, err := NewWriter(dw, level) + zw, err := NewWriter(w, level) if err != nil { return nil, err } zw.d.fillWindow(dict) zw.dict = append(zw.dict, dict...) // duplicate dictionary for Reset method. - return zw, nil -} - -type dictWriter struct { - w io.Writer + return zw, err } -func (w *dictWriter) Write(b []byte) (n int, err error) { - return w.w.Write(b) -} - -var errWriterClosed = errors.New("flate: closed writer") - // A Writer takes data written to it and writes the compressed // form of that data to an underlying writer (see [NewWriter]). type Writer struct { @@ -728,16 +874,26 @@ func (w *Writer) Close() error { } // Reset discards the writer's state and makes it equivalent to -// the result of [NewWriter] or [NewWriterDict] called with dst +// the result of NewWriter or NewWriterDict called with dst // and w's level and dictionary. func (w *Writer) Reset(dst io.Writer) { - if dw, ok := w.d.w.writer.(*dictWriter); ok { + if len(w.dict) > 0 { // w was created with NewWriterDict - dw.w = dst - w.d.reset(dw) - w.d.fillWindow(w.dict) + w.d.reset(dst) + if dst != nil { + w.d.fillWindow(w.dict) + } } else { // w was created with NewWriter w.d.reset(dst) } } + +// ResetDict discards the writer's state and makes it equivalent to +// the result of NewWriter or NewWriterDict called with dst +// and w's level, but sets a specific dictionary. 
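+//
+// A minimal usage sketch (dst, dict and data stand in for the caller's
+// destination writer, preset dictionary and input):
+//
+//	zw, _ := NewWriter(nil, BestCompression)
+//	zw.ResetDict(dst, dict)
+//	zw.Write(data)
+//	zw.Close()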
+func (w *Writer) ResetDict(dst io.Writer, dict []byte) { + w.dict = dict + w.d.reset(dst) + w.d.fillWindow(w.dict) +} diff --git a/src/compress/flate/deflate_test.go b/src/compress/flate/deflate_test.go index 3610c7bf8763df..4bb89c61dcad0c 100644 --- a/src/compress/flate/deflate_test.go +++ b/src/compress/flate/deflate_test.go @@ -6,14 +6,11 @@ package flate import ( "bytes" - "errors" "fmt" - "internal/testenv" "io" - "math/rand" "os" "reflect" - "runtime/debug" + "strings" "sync" "testing" ) @@ -35,24 +32,24 @@ type reverseBitsTest struct { } var deflateTests = []*deflateTest{ - {[]byte{}, 0, []byte{1, 0, 0, 255, 255}}, - {[]byte{0x11}, -1, []byte{18, 4, 4, 0, 0, 255, 255}}, - {[]byte{0x11}, DefaultCompression, []byte{18, 4, 4, 0, 0, 255, 255}}, - {[]byte{0x11}, 4, []byte{18, 4, 4, 0, 0, 255, 255}}, - - {[]byte{0x11}, 0, []byte{0, 1, 0, 254, 255, 17, 1, 0, 0, 255, 255}}, - {[]byte{0x11, 0x12}, 0, []byte{0, 2, 0, 253, 255, 17, 18, 1, 0, 0, 255, 255}}, - {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 0, - []byte{0, 8, 0, 247, 255, 17, 17, 17, 17, 17, 17, 17, 17, 1, 0, 0, 255, 255}, + 0: {[]byte{}, 0, []byte{0x3, 0x0}}, + 1: {[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}}, + 2: {[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}}, + 3: {[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}}, + + 4: {[]byte{0x11}, 0, []byte{0x0, 0x1, 0x0, 0xfe, 0xff, 0x11, 0x3, 0x0}}, + 5: {[]byte{0x11, 0x12}, 0, []byte{0x0, 0x2, 0x0, 0xfd, 0xff, 0x11, 0x12, 0x3, 0x0}}, + 6: {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 0, + []byte{0x0, 0x8, 0x0, 0xf7, 0xff, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x3, 0x0}, }, - {[]byte{}, 2, []byte{1, 0, 0, 255, 255}}, - {[]byte{0x11}, 2, []byte{18, 4, 4, 0, 0, 255, 255}}, - {[]byte{0x11, 0x12}, 2, []byte{18, 20, 2, 4, 0, 0, 255, 255}}, - {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 2, []byte{18, 132, 2, 64, 0, 0, 0, 255, 255}}, - {[]byte{}, 9, []byte{1, 0, 0, 255, 255}}, - {[]byte{0x11}, 9, []byte{18, 4, 4, 0, 0, 255, 255}}, - {[]byte{0x11, 0x12}, 9, []byte{18, 20, 2, 4, 0, 0, 255, 255}}, - {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 9, []byte{18, 132, 2, 64, 0, 0, 0, 255, 255}}, + 7: {[]byte{}, 1, []byte{0x3, 0x0}}, + 8: {[]byte{0x11}, BestCompression, []byte{0x12, 0x4, 0xc, 0x0}}, + 9: {[]byte{0x11, 0x12}, BestCompression, []byte{0x12, 0x14, 0x2, 0xc, 0x0}}, + 10: {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, BestCompression, []byte{0x12, 0x84, 0x1, 0xc0, 0x0}}, + 11: {[]byte{}, 9, []byte{0x3, 0x0}}, + 12: {[]byte{0x11}, 9, []byte{0x12, 0x4, 0xc, 0x0}}, + 13: {[]byte{0x11, 0x12}, 9, []byte{0x12, 0x14, 0x2, 0xc, 0x0}}, + 14: {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 9, []byte{0x12, 0x84, 0x1, 0xc0, 0x0}}, } var deflateInflateTests = []*deflateInflateTest{ @@ -86,23 +83,24 @@ func largeDataChunk() []byte { func TestBulkHash4(t *testing.T) { for _, x := range deflateTests { y := x.out - if len(y) < minMatchLength { - continue - } - y = append(y, y...) - for j := 4; j < len(y); j++ { - y := y[:j] - dst := make([]uint32, len(y)-minMatchLength+1) - for i := range dst { - dst[i] = uint32(i + 100) - } - bulkHash4(y, dst) - for i, got := range dst { - want := hash4(y[i:]) - if got != want && got == uint32(i)+100 { - t.Errorf("Len:%d Index:%d, want 0x%08x but not modified", len(y), i, want) - } else if got != want { - t.Errorf("Len:%d Index:%d, got 0x%08x want:0x%08x", len(y), i, got, want) + if len(y) >= minMatchLength { + y = append(y, y...) 
+ for j := 4; j < len(y); j++ { + y := y[:j] + dst := make([]uint32, len(y)-minMatchLength+1) + for i := range dst { + dst[i] = uint32(i + 100) + } + bulkHash4(y, dst) + for i, got := range dst { + want := hash4(y[i:]) + if got != want && got == uint32(i)+100 { + t.Errorf("Len:%d Index:%d, expected 0x%08x but not modified", len(y), i, want) + } else if got != want { + t.Errorf("Len:%d Index:%d, got 0x%08x expected:0x%08x", len(y), i, got, want) + } else { + //t.Logf("Len:%d Index:%d OK (0x%08x)", len(y), i, got) + } } } } @@ -110,7 +108,7 @@ func TestBulkHash4(t *testing.T) { } func TestDeflate(t *testing.T) { - for _, h := range deflateTests { + for i, h := range deflateTests { var buf bytes.Buffer w, err := NewWriter(&buf, h.level) if err != nil { @@ -120,45 +118,11 @@ func TestDeflate(t *testing.T) { w.Write(h.in) w.Close() if !bytes.Equal(buf.Bytes(), h.out) { - t.Errorf("Deflate(%d, %x) = \n%#v, want \n%#v", h.level, h.in, buf.Bytes(), h.out) + t.Errorf("%d: Deflate(%d, %x) got \n%#v, want \n%#v", i, h.level, h.in, buf.Bytes(), h.out) } } } -func TestWriterClose(t *testing.T) { - b := new(bytes.Buffer) - zw, err := NewWriter(b, 6) - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - - if c, err := zw.Write([]byte("Test")); err != nil || c != 4 { - t.Fatalf("Write to not closed writer: %s, %d", err, c) - } - - if err := zw.Close(); err != nil { - t.Fatalf("Close: %v", err) - } - - afterClose := b.Len() - - if c, err := zw.Write([]byte("Test")); err == nil || c != 0 { - t.Fatalf("Write to closed writer: %v, %d", err, c) - } - - if err := zw.Flush(); err == nil { - t.Fatalf("Flush to closed writer: %s", err) - } - - if err := zw.Close(); err != nil { - t.Fatalf("Close: %v", err) - } - - if afterClose != b.Len() { - t.Fatalf("Writer wrote data after close. After close: %d. After writes on closed stream: %d", afterClose, b.Len()) - } -} - // A sparseReader returns a stream consisting of 0s followed by 1<<16 1s. // This tests missing hash references in a very large input. type sparseReader struct { @@ -191,7 +155,8 @@ func TestVeryLongSparseChunk(t *testing.T) { if testing.Short() { t.Skip("skipping sparse chunk during short test") } - w, err := NewWriter(io.Discard, 1) + var buf bytes.Buffer + w, err := NewWriter(&buf, 1) if err != nil { t.Errorf("NewWriter: %v", err) return @@ -200,6 +165,7 @@ func TestVeryLongSparseChunk(t *testing.T) { t.Errorf("Compress failed: %v", err) return } + t.Log("Length:", buf.Len()) } type syncBuffer struct { @@ -270,7 +236,7 @@ func testSync(t *testing.T, level int, input []byte, name string) { r := NewReader(buf) // Write half the input and read back. 
- for i := 0; i < 2; i++ { + for i := range 2 { var lo, hi int if i == 0 { lo, hi = 0, (len(input)+1)/2 @@ -348,13 +314,13 @@ func testToFromWithLevelAndLimit(t *testing.T, level int, input []byte, name str } w.Write(input) w.Close() + if limit > 0 { + t.Logf("level: %d - Size:%.2f%%, %d b\n", level, float64(buffer.Len()*100)/float64(limit), buffer.Len()) + } if limit > 0 && buffer.Len() > limit { t.Errorf("level: %d, len(compress(data)) = %d > limit = %d", level, buffer.Len(), limit) - return - } - if limit > 0 { - t.Logf("level: %d, size:%.2f%%, %d b\n", level, float64(buffer.Len()*100)/float64(limit), buffer.Len()) } + r := NewReader(&buffer) out, err := io.ReadAll(r) if err != nil { @@ -363,6 +329,8 @@ func testToFromWithLevelAndLimit(t *testing.T, level int, input []byte, name str } r.Close() if !bytes.Equal(input, out) { + os.WriteFile("testdata/fails/"+t.Name()+".got", out, os.ModePerm) + os.WriteFile("testdata/fails/"+t.Name()+".want", input, os.ModePerm) t.Errorf("decompress(compress(data)) != data: level=%d input=%s", level, name) return } @@ -370,19 +338,14 @@ func testToFromWithLevelAndLimit(t *testing.T, level int, input []byte, name str } func testToFromWithLimit(t *testing.T, input []byte, name string, limit [11]int) { - for i := 0; i < 10; i++ { + for i := range 10 { testToFromWithLevelAndLimit(t, i, input, name, limit[i]) } - // Test HuffmanCompression testToFromWithLevelAndLimit(t, -2, input, name, limit[10]) } func TestDeflateInflate(t *testing.T) { - t.Parallel() for i, h := range deflateInflateTests { - if testing.Short() && len(h.in) > 10000 { - continue - } testToFromWithLimit(t, h.in, fmt.Sprintf("#%d", i), [11]int{}) } } @@ -399,33 +362,38 @@ func TestReverseBits(t *testing.T) { type deflateInflateStringTest struct { filename string label string - limit [11]int + limit [11]int // Number 11 is ConstantCompression } var deflateInflateStringTests = []deflateInflateStringTest{ { "../testdata/e.txt", "2.718281828...", - [...]int{100018, 50650, 50960, 51150, 50930, 50790, 50790, 50790, 50790, 50790, 43683}, + [...]int{100018, 67900, 50960, 51150, 50930, 50790, 50790, 50790, 50790, 50790, 43683 + 100}, }, { "../../testdata/Isaac.Newton-Opticks.txt", "Isaac.Newton-Opticks", - [...]int{567248, 218338, 198211, 193152, 181100, 175427, 175427, 173597, 173422, 173422, 325240}, + [...]int{567248, 218338, 201354, 199101, 190627, 182587, 179765, 174982, 173422, 173422, 325240}, }, } func TestDeflateInflateString(t *testing.T) { - t.Parallel() - if testing.Short() && testenv.Builder() == "" { - t.Skip("skipping in short mode") - } for _, test := range deflateInflateStringTests { gold, err := os.ReadFile(test.filename) if err != nil { t.Error(err) } - testToFromWithLimit(t, gold, test.label, test.limit) + // Remove returns that may be present on Windows + neutral := strings.Map(func(r rune) rune { + if r != '\r' { + return r + } + return -1 + }, string(gold)) + + testToFromWithLimit(t, []byte(neutral), test.label, test.limit) + if testing.Short() { break } @@ -460,31 +428,36 @@ func TestReaderDict(t *testing.T) { func TestWriterDict(t *testing.T) { const ( - dict = "hello world" - text = "hello again world" + dict = "hello world Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua." 
+ text = "hello world Lorem ipsum dolor sit amet" ) - var b bytes.Buffer - w, err := NewWriter(&b, 5) - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - w.Write([]byte(dict)) - w.Flush() - b.Reset() - w.Write([]byte(text)) - w.Close() + // This test is sensitive to algorithm changes that skip + // data in favour of speed. Higher levels are less prone to this + // so we test level 4-9. + for l := 4; l < 9; l++ { + var b bytes.Buffer + w, err := NewWriter(&b, l) + if err != nil { + t.Fatalf("level %d, NewWriter: %v", l, err) + } + w.Write([]byte(dict)) + w.Flush() + b.Reset() + w.Write([]byte(text)) + w.Close() - var b1 bytes.Buffer - w, _ = NewWriterDict(&b1, 5, []byte(dict)) - w.Write([]byte(text)) - w.Close() + var b1 bytes.Buffer + w, _ = NewWriterDict(&b1, l, []byte(dict)) + w.Write([]byte(text)) + w.Close() - if !bytes.Equal(b1.Bytes(), b.Bytes()) { - t.Fatalf("writer wrote %q want %q", b1.Bytes(), b.Bytes()) + if !bytes.Equal(b1.Bytes(), b.Bytes()) { + t.Errorf("level %d, writer wrote\n%v\n want\n%v", l, b1.Bytes(), b.Bytes()) + } } } -// See https://golang.org/issue/2508 +// See http://code.google.com/p/go/issues/detail?id=2508 func TestRegression2508(t *testing.T) { if testing.Short() { t.Logf("test disabled with -short") @@ -495,7 +468,7 @@ func TestRegression2508(t *testing.T) { t.Fatalf("NewWriter: %v", err) } buf := make([]byte, 1024) - for i := 0; i < 131072; i++ { + for range 131072 { if _, err := w.Write(buf); err != nil { t.Fatalf("writer failed: %v", err) } @@ -504,8 +477,10 @@ func TestRegression2508(t *testing.T) { } func TestWriterReset(t *testing.T) { - t.Parallel() - for level := 0; level <= 9; level++ { + for level := -2; level <= 9; level++ { + if level == -1 { + level++ + } if testing.Short() && level > 1 { break } @@ -514,11 +489,7 @@ func TestWriterReset(t *testing.T) { t.Fatalf("NewWriter: %v", err) } buf := []byte("hello world") - n := 1024 - if testing.Short() { - n = 10 - } - for i := 0; i < n; i++ { + for range 1024 { w.Write(buf) } w.Reset(io.Discard) @@ -531,12 +502,12 @@ func TestWriterReset(t *testing.T) { // DeepEqual doesn't compare functions. w.d.fill, wref.d.fill = nil, nil w.d.step, wref.d.step = nil, nil - w.d.bulkHasher, wref.d.bulkHasher = nil, nil - w.d.bestSpeed, wref.d.bestSpeed = nil, nil + w.d.state, wref.d.state = nil, nil + w.d.fast, wref.d.fast = nil, nil + // hashMatch is always overwritten when used. - copy(w.d.hashMatch[:], wref.d.hashMatch[:]) - if len(w.d.tokens) != 0 { - t.Errorf("level %d Writer not reset after Reset. %d tokens were present", level, len(w.d.tokens)) + if w.d.tokens.n != 0 { + t.Errorf("level %d Writer not reset after Reset. %d tokens were present", level, w.d.tokens.n) } // As long as the length is 0, we don't care about the content. 
w.d.tokens = wref.d.tokens @@ -548,76 +519,64 @@ func TestWriterReset(t *testing.T) { } } - levels := []int{0, 1, 2, 5, 9} - for _, level := range levels { - t.Run(fmt.Sprint(level), func(t *testing.T) { - testResetOutput(t, level, nil) + for i := HuffmanOnly; i <= BestCompression; i++ { + testResetOutput(t, fmt.Sprint("level-", i), func(w io.Writer) (*Writer, error) { return NewWriter(w, i) }) + } + dict := []byte(strings.Repeat("we are the world - how are you?", 3)) + for i := HuffmanOnly; i <= BestCompression; i++ { + testResetOutput(t, fmt.Sprint("dict-level-", i), func(w io.Writer) (*Writer, error) { return NewWriterDict(w, i, dict) }) + } + for i := HuffmanOnly; i <= BestCompression; i++ { + testResetOutput(t, fmt.Sprint("dict-reset-level-", i), func(w io.Writer) (*Writer, error) { + w2, err := NewWriter(nil, i) + if err != nil { + return w2, err + } + w2.ResetDict(w, dict) + return w2, nil }) } - - t.Run("dict", func(t *testing.T) { - for _, level := range levels { - t.Run(fmt.Sprint(level), func(t *testing.T) { - testResetOutput(t, level, nil) - }) - } - }) } -func testResetOutput(t *testing.T, level int, dict []byte) { - writeData := func(w *Writer) { - msg := []byte("now is the time for all good gophers") - w.Write(msg) - w.Flush() - - hello := []byte("hello world") - for i := 0; i < 1024; i++ { - w.Write(hello) +func testResetOutput(t *testing.T, name string, newWriter func(w io.Writer) (*Writer, error)) { + t.Run(name, func(t *testing.T) { + buf := new(bytes.Buffer) + w, err := newWriter(buf) + if err != nil { + t.Fatalf("NewWriter: %v", err) } + b := []byte("hello world - how are you doing?") + for range 1024 { + w.Write(b) + } + w.Close() + out1 := buf.Bytes() - fill := bytes.Repeat([]byte("x"), 65000) - w.Write(fill) - } - - buf := new(bytes.Buffer) - var w *Writer - var err error - if dict == nil { - w, err = NewWriter(buf, level) - } else { - w, err = NewWriterDict(buf, level, dict) - } - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - - writeData(w) - w.Close() - out1 := buf.Bytes() - - buf2 := new(bytes.Buffer) - w.Reset(buf2) - writeData(w) - w.Close() - out2 := buf2.Bytes() + buf2 := new(bytes.Buffer) + w.Reset(buf2) + for range 1024 { + w.Write(b) + } + w.Close() + out2 := buf2.Bytes() - if len(out1) != len(out2) { - t.Errorf("got %d, expected %d bytes", len(out2), len(out1)) - return - } - if !bytes.Equal(out1, out2) { - mm := 0 - for i, b := range out1[:len(out2)] { - if b != out2[i] { - t.Errorf("mismatch index %d: %#02x, expected %#02x", i, out2[i], b) - } - mm++ - if mm == 10 { - t.Fatal("Stopping") + if len(out1) != len(out2) { + t.Errorf("got %d, expected %d bytes", len(out2), len(out1)) + } + if !bytes.Equal(out1, out2) { + mm := 0 + for i, b := range out1[:len(out2)] { + if b != out2[i] { + t.Errorf("mismatch index %d: %02x, expected %02x", i, out2[i], b) + } + mm++ + if mm == 10 { + t.Fatal("Stopping") + } } } - } - t.Logf("got %d bytes", len(out1)) + t.Logf("got %d bytes", len(out1)) + }) } // TestBestSpeed tests that round-tripping through deflate and then inflate @@ -625,7 +584,6 @@ func testResetOutput(t *testing.T, level int, dict []byte) { // compressor.encSpeed method (0, 16, 128), as well as near maxStoreBlockSize // (65535). 
func TestBestSpeed(t *testing.T) { - t.Parallel() abc := make([]byte, 128) for i := range abc { abc[i] = byte(i) @@ -653,8 +611,8 @@ func TestBestSpeed(t *testing.T) { } for i, tc := range testCases { - if i >= 3 && testing.Short() { - break + if testing.Short() && i > 5 { + t.Skip() } for _, firstN := range []int{1, 65534, 65535, 65536, 65537, 131072} { tc[0] = firstN @@ -703,368 +661,3 @@ func TestBestSpeed(t *testing.T) { } } } - -var errIO = errors.New("IO error") - -// failWriter fails with errIO exactly at the nth call to Write. -type failWriter struct{ n int } - -func (w *failWriter) Write(b []byte) (int, error) { - w.n-- - if w.n == -1 { - return 0, errIO - } - return len(b), nil -} - -func TestWriterPersistentWriteError(t *testing.T) { - t.Parallel() - d, err := os.ReadFile("../../testdata/Isaac.Newton-Opticks.txt") - if err != nil { - t.Fatalf("ReadFile: %v", err) - } - d = d[:10000] // Keep this test short - - zw, err := NewWriter(nil, DefaultCompression) - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - - // Sweep over the threshold at which an error is returned. - // The variable i makes it such that the ith call to failWriter.Write will - // return errIO. Since failWriter errors are not persistent, we must ensure - // that flate.Writer errors are persistent. - for i := 0; i < 1000; i++ { - fw := &failWriter{i} - zw.Reset(fw) - - _, werr := zw.Write(d) - cerr := zw.Close() - ferr := zw.Flush() - if werr != errIO && werr != nil { - t.Errorf("test %d, mismatching Write error: got %v, want %v", i, werr, errIO) - } - if cerr != errIO && fw.n < 0 { - t.Errorf("test %d, mismatching Close error: got %v, want %v", i, cerr, errIO) - } - if ferr != errIO && fw.n < 0 { - t.Errorf("test %d, mismatching Flush error: got %v, want %v", i, ferr, errIO) - } - if fw.n >= 0 { - // At this point, the failure threshold was sufficiently high enough - // that we wrote the whole stream without any errors. - return - } - } -} -func TestWriterPersistentFlushError(t *testing.T) { - zw, err := NewWriter(&failWriter{0}, DefaultCompression) - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - flushErr := zw.Flush() - closeErr := zw.Close() - _, writeErr := zw.Write([]byte("Test")) - checkErrors([]error{closeErr, flushErr, writeErr}, errIO, t) -} - -func TestWriterPersistentCloseError(t *testing.T) { - // If underlying writer return error on closing stream we should persistent this error across all writer calls. - zw, err := NewWriter(&failWriter{0}, DefaultCompression) - if err != nil { - t.Fatalf("NewWriter: %v", err) - } - closeErr := zw.Close() - flushErr := zw.Flush() - _, writeErr := zw.Write([]byte("Test")) - checkErrors([]error{closeErr, flushErr, writeErr}, errIO, t) - - // After closing writer we should persistent "write after close" error across Flush and Write calls, but return nil - // on next Close calls. 
- var b bytes.Buffer - zw.Reset(&b) - err = zw.Close() - if err != nil { - t.Fatalf("First call to close returned error: %s", err) - } - err = zw.Close() - if err != nil { - t.Fatalf("Second call to close returned error: %s", err) - } - - flushErr = zw.Flush() - _, writeErr = zw.Write([]byte("Test")) - checkErrors([]error{flushErr, writeErr}, errWriterClosed, t) -} - -func checkErrors(got []error, want error, t *testing.T) { - t.Helper() - for _, err := range got { - if err != want { - t.Errorf("Error doesn't match\nWant: %s\nGot: %s", want, got) - } - } -} - -func TestBestSpeedMatch(t *testing.T) { - t.Parallel() - cases := []struct { - previous, current []byte - t, s, want int32 - }{{ - previous: []byte{0, 0, 0, 1, 2}, - current: []byte{3, 4, 5, 0, 1, 2, 3, 4, 5}, - t: -3, - s: 3, - want: 6, - }, { - previous: []byte{0, 0, 0, 1, 2}, - current: []byte{2, 4, 5, 0, 1, 2, 3, 4, 5}, - t: -3, - s: 3, - want: 3, - }, { - previous: []byte{0, 0, 0, 1, 1}, - current: []byte{3, 4, 5, 0, 1, 2, 3, 4, 5}, - t: -3, - s: 3, - want: 2, - }, { - previous: []byte{0, 0, 0, 1, 2}, - current: []byte{2, 2, 2, 2, 1, 2, 3, 4, 5}, - t: -1, - s: 0, - want: 4, - }, { - previous: []byte{0, 0, 0, 1, 2, 3, 4, 5, 2, 2}, - current: []byte{2, 2, 2, 2, 1, 2, 3, 4, 5}, - t: -7, - s: 4, - want: 5, - }, { - previous: []byte{9, 9, 9, 9, 9}, - current: []byte{2, 2, 2, 2, 1, 2, 3, 4, 5}, - t: -1, - s: 0, - want: 0, - }, { - previous: []byte{9, 9, 9, 9, 9}, - current: []byte{9, 2, 2, 2, 1, 2, 3, 4, 5}, - t: 0, - s: 1, - want: 0, - }, { - previous: []byte{}, - current: []byte{9, 2, 2, 2, 1, 2, 3, 4, 5}, - t: -5, - s: 1, - want: 0, - }, { - previous: []byte{}, - current: []byte{9, 2, 2, 2, 1, 2, 3, 4, 5}, - t: -1, - s: 1, - want: 0, - }, { - previous: []byte{}, - current: []byte{2, 2, 2, 2, 1, 2, 3, 4, 5}, - t: 0, - s: 1, - want: 3, - }, { - previous: []byte{3, 4, 5}, - current: []byte{3, 4, 5}, - t: -3, - s: 0, - want: 3, - }, { - previous: make([]byte, 1000), - current: make([]byte, 1000), - t: -1000, - s: 0, - want: maxMatchLength - 4, - }, { - previous: make([]byte, 200), - current: make([]byte, 500), - t: -200, - s: 0, - want: maxMatchLength - 4, - }, { - previous: make([]byte, 200), - current: make([]byte, 500), - t: 0, - s: 1, - want: maxMatchLength - 4, - }, { - previous: make([]byte, maxMatchLength-4), - current: make([]byte, 500), - t: -(maxMatchLength - 4), - s: 0, - want: maxMatchLength - 4, - }, { - previous: make([]byte, 200), - current: make([]byte, 500), - t: -200, - s: 400, - want: 100, - }, { - previous: make([]byte, 10), - current: make([]byte, 500), - t: 200, - s: 400, - want: 100, - }} - for i, c := range cases { - e := deflateFast{prev: c.previous} - got := e.matchLen(c.s, c.t, c.current) - if got != c.want { - t.Errorf("Test %d: match length, want %d, got %d", i, c.want, got) - } - } -} - -func TestBestSpeedMaxMatchOffset(t *testing.T) { - t.Parallel() - const abc, xyz = "abcdefgh", "stuvwxyz" - for _, matchBefore := range []bool{false, true} { - for _, extra := range []int{0, inputMargin - 1, inputMargin, inputMargin + 1, 2 * inputMargin} { - for offsetAdj := -5; offsetAdj <= +5; offsetAdj++ { - report := func(desc string, err error) { - t.Errorf("matchBefore=%t, extra=%d, offsetAdj=%d: %s%v", - matchBefore, extra, offsetAdj, desc, err) - } - - offset := maxMatchOffset + offsetAdj - - // Make src to be a []byte of the form - // "%s%s%s%s%s" % (abc, zeros0, xyzMaybe, abc, zeros1) - // where: - // zeros0 is approximately maxMatchOffset zeros. - // xyzMaybe is either xyz or the empty string. 
- // zeros1 is between 0 and 30 zeros. - // The difference between the two abc's will be offset, which - // is maxMatchOffset plus or minus a small adjustment. - src := make([]byte, offset+len(abc)+extra) - copy(src, abc) - if !matchBefore { - copy(src[offset-len(xyz):], xyz) - } - copy(src[offset:], abc) - - buf := new(bytes.Buffer) - w, err := NewWriter(buf, BestSpeed) - if err != nil { - report("NewWriter: ", err) - continue - } - if _, err := w.Write(src); err != nil { - report("Write: ", err) - continue - } - if err := w.Close(); err != nil { - report("Writer.Close: ", err) - continue - } - - r := NewReader(buf) - dst, err := io.ReadAll(r) - r.Close() - if err != nil { - report("ReadAll: ", err) - continue - } - - if !bytes.Equal(dst, src) { - report("", fmt.Errorf("bytes differ after round-tripping")) - continue - } - } - } - } -} - -func TestBestSpeedShiftOffsets(t *testing.T) { - // Test if shiftoffsets properly preserves matches and resets out-of-range matches - // seen in https://github.com/golang/go/issues/4142 - enc := newDeflateFast() - - // testData may not generate internal matches. - testData := make([]byte, 32) - rng := rand.New(rand.NewSource(0)) - for i := range testData { - testData[i] = byte(rng.Uint32()) - } - - // Encode the testdata with clean state. - // Second part should pick up matches from the first block. - wantFirstTokens := len(enc.encode(nil, testData)) - wantSecondTokens := len(enc.encode(nil, testData)) - - if wantFirstTokens <= wantSecondTokens { - t.Fatalf("test needs matches between inputs to be generated") - } - // Forward the current indicator to before wraparound. - enc.cur = bufferReset - int32(len(testData)) - - // Part 1 before wrap, should match clean state. - got := len(enc.encode(nil, testData)) - if wantFirstTokens != got { - t.Errorf("got %d, want %d tokens", got, wantFirstTokens) - } - - // Verify we are about to wrap. - if enc.cur != bufferReset { - t.Errorf("got %d, want e.cur to be at bufferReset (%d)", enc.cur, bufferReset) - } - - // Part 2 should match clean state as well even if wrapped. - got = len(enc.encode(nil, testData)) - if wantSecondTokens != got { - t.Errorf("got %d, want %d token", got, wantSecondTokens) - } - - // Verify that we wrapped. - if enc.cur >= bufferReset { - t.Errorf("want e.cur to be < bufferReset (%d), got %d", bufferReset, enc.cur) - } - - // Forward the current buffer, leaving the matches at the bottom. - enc.cur = bufferReset - enc.shiftOffsets() - - // Ensure that no matches were picked up. - got = len(enc.encode(nil, testData)) - if wantFirstTokens != got { - t.Errorf("got %d, want %d tokens", got, wantFirstTokens) - } -} - -func TestMaxStackSize(t *testing.T) { - // This test must not run in parallel with other tests as debug.SetMaxStack - // affects all goroutines. - n := debug.SetMaxStack(1 << 16) - defer debug.SetMaxStack(n) - - var wg sync.WaitGroup - defer wg.Wait() - - b := make([]byte, 1<<20) - for level := HuffmanOnly; level <= BestCompression; level++ { - // Run in separate goroutine to increase probability of stack regrowth. 
- wg.Add(1) - go func(level int) { - defer wg.Done() - zw, err := NewWriter(io.Discard, level) - if err != nil { - t.Errorf("level %d, NewWriter() = %v, want nil", level, err) - } - if n, err := zw.Write(b); n != len(b) || err != nil { - t.Errorf("level %d, Write() = (%d, %v), want (%d, nil)", level, n, err, len(b)) - } - if err := zw.Close(); err != nil { - t.Errorf("level %d, Close() = %v, want nil", level, err) - } - zw.Reset(io.Discard) - }(level) - } -} diff --git a/src/compress/flate/deflatefast.go b/src/compress/flate/deflatefast.go index e5554d6fb40842..eef1896b6f5c63 100644 --- a/src/compress/flate/deflatefast.go +++ b/src/compress/flate/deflatefast.go @@ -4,304 +4,165 @@ package flate -import "math" - -// This encoding algorithm, which prioritizes speed over output size, is -// based on Snappy's LZ77-style encoder: github.com/golang/snappy - -const ( - tableBits = 14 // Bits used in the table. - tableSize = 1 << tableBits // Size of the table. - tableMask = tableSize - 1 // Mask for table indices. Redundant, but can eliminate bounds checks. - tableShift = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32. - - // Reset the buffer offset when reaching this. - // Offsets are stored between blocks as int32 values. - // Since the offset we are checking against is at the beginning - // of the buffer, we need to subtract the current and input - // buffer to not risk overflowing the int32. - bufferReset = math.MaxInt32 - maxStoreBlockSize*2 +import ( + "math/bits" ) -func load32(b []byte, i int32) uint32 { - b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line. - return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 +type fastEnc interface { + Encode(dst *tokens, src []byte) + Reset() } -func load64(b []byte, i int32) uint64 { - b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line. - return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | - uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +func newFastEnc(level int) fastEnc { + switch level { + case 1: + return &fastEncL1{fastGen: fastGen{cur: maxStoreBlockSize}} + case 2: + return &fastEncL2{fastGen: fastGen{cur: maxStoreBlockSize}} + case 3: + return &fastEncL3{fastGen: fastGen{cur: maxStoreBlockSize}} + case 4: + return &fastEncL4{fastGen: fastGen{cur: maxStoreBlockSize}} + case 5: + return &fastEncL5{fastGen: fastGen{cur: maxStoreBlockSize}} + case 6: + return &fastEncL6{fastGen: fastGen{cur: maxStoreBlockSize}} + default: + panic("invalid level specified") + } } -func hash(u uint32) uint32 { - return (u * 0x1e35a7bd) >> tableShift -} +const ( + tableBits = 15 // Bits used in the table + tableSize = 1 << tableBits // Size of the table + hashLongBytes = 7 // Bytes used for long table hash + baseMatchOffset = 1 // The smallest match offset + baseMatchLength = 3 // The smallest match length per the RFC section 3.2.5 + maxMatchOffset = 1 << 15 // The largest match offset + + bTableBits = 17 // Bits used in the big tables + bTableSize = 1 << bTableBits // Size of the table + allocHistory = maxStoreBlockSize * 5 // Size to preallocate for history. + bufferReset = (1 << 31) - allocHistory - maxStoreBlockSize - 1 // Reset the buffer offset when reaching this. +) -// These constants are defined by the Snappy implementation so that its -// assembly implementation can fast-path some 16-bytes-at-a-time copies. 
They -// aren't necessary in the pure Go implementation, as we don't use those same -// optimizations, but using the same thresholds doesn't really hurt. const ( - inputMargin = 16 - 1 - minNonLiteralBlockSize = 1 + 1 + inputMargin + prime3bytes = 506832829 + prime4bytes = 2654435761 + prime5bytes = 889523592379 + prime6bytes = 227718039650203 + prime7bytes = 58295818150454627 + prime8bytes = 0xcf1bbcdcb7a56463 ) type tableEntry struct { - val uint32 // Value at destination offset int32 } -// deflateFast maintains the table for matches, -// and the previous byte block for cross block matching. -type deflateFast struct { - table [tableSize]tableEntry - prev []byte // Previous block, zero length if unknown. - cur int32 // Current match offset. -} - -func newDeflateFast() *deflateFast { - return &deflateFast{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)} +// fastGen maintains the table for matches, +// and the previous byte block for level 2. +// This is the generic implementation. +type fastGen struct { + hist []byte + cur int32 } -// encode encodes a block given in src and appends tokens -// to dst and returns the result. -func (e *deflateFast) encode(dst []token, src []byte) []token { - // Ensure that e.cur doesn't wrap. - if e.cur >= bufferReset { - e.shiftOffsets() - } - - // This check isn't in the Snappy implementation, but there, the caller - // instead of the callee handles this case. - if len(src) < minNonLiteralBlockSize { - e.cur += maxStoreBlockSize - e.prev = e.prev[:0] - return emitLiteral(dst, src) - } - - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := int32(len(src) - inputMargin) - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := int32(0) - s := int32(0) - cv := load32(src, s) - nextHash := hash(cv) - - for { - // Copied from the C++ snappy implementation: - // - // Heuristic match skipping: If 32 bytes are scanned with no matches - // found, start looking only at every other byte. If 32 more bytes are - // scanned (or skipped), look at every third byte, etc.. When a match - // is found, immediately go back to looking at every byte. This is a - // small loss (~5% performance, ~0.1% density) for compressible data - // due to more bookkeeping, but for non-compressible data (such as - // JPEG) it's a huge win since the compressor quickly "realizes" the - // data is incompressible and doesn't bother looking for matches - // everywhere. - // - // The "skip" variable keeps track of how many bytes there are since - // the last match; dividing it by 32 (ie. right-shifting by five) gives - // the number of bytes to move ahead for each iteration. - skip := int32(32) - - nextS := s - var candidate tableEntry - for { - s = nextS - bytesBetweenHashLookups := skip >> 5 - nextS = s + bytesBetweenHashLookups - skip += bytesBetweenHashLookups - if nextS > sLimit { - goto emitRemainder - } - candidate = e.table[nextHash&tableMask] - now := load32(src, nextS) - e.table[nextHash&tableMask] = tableEntry{offset: s + e.cur, val: cv} - nextHash = hash(now) - - offset := s - (candidate.offset - e.cur) - if offset > maxMatchOffset || cv != candidate.val { - // Out of range or not matched. - cv = now - continue - } - break - } - - // A 4-byte match has been found. We'll later see if more than 4 bytes - // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit - // them as literal bytes. 
- dst = emitLiteral(dst, src[nextEmit:s]) - - // Call emitCopy, and then see if another emitCopy could be our next - // move. Repeat until we find no match for the input immediately after - // what was consumed by the last emitCopy call. - // - // If we exit this loop normally then we need to call emitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can - // exit this loop via goto if we get close to exhausting the input. - for { - // Invariant: we have a 4-byte match at s, and no need to emit any - // literal bytes prior to s. - - // Extend the 4-byte match as long as possible. - // - s += 4 - t := candidate.offset - e.cur + 4 - l := e.matchLen(s, t, src) - - // matchToken is flate's equivalent of Snappy's emitCopy. (length,offset) - dst = append(dst, matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset))) - s += l - nextEmit = s - if s >= sLimit { - goto emitRemainder - } - - // We could immediately start working at s now, but to improve - // compression we first update the hash table at s-1 and at s. If - // another emitCopy is not our next move, also calculate nextHash - // at s+1. At least on GOARCH=amd64, these three hash calculations - // are faster as one load64 call (with some shifts) instead of - // three load32 calls. - x := load64(src, s-1) - prevHash := hash(uint32(x)) - e.table[prevHash&tableMask] = tableEntry{offset: e.cur + s - 1, val: uint32(x)} - x >>= 8 - currHash := hash(uint32(x)) - candidate = e.table[currHash&tableMask] - e.table[currHash&tableMask] = tableEntry{offset: e.cur + s, val: uint32(x)} - - offset := s - (candidate.offset - e.cur) - if offset > maxMatchOffset || uint32(x) != candidate.val { - cv = uint32(x >> 8) - nextHash = hash(cv) - s++ - break +func (e *fastGen) addBlock(src []byte) int32 { + // check if we have space already + if len(e.hist)+len(src) > cap(e.hist) { + if cap(e.hist) == 0 { + e.hist = make([]byte, 0, allocHistory) + } else { + if cap(e.hist) < maxMatchOffset*2 { + panic("unexpected buffer size") } + // Move down + offset := int32(len(e.hist)) - maxMatchOffset + // copy(e.hist[0:maxMatchOffset], e.hist[offset:]) + *(*[maxMatchOffset]byte)(e.hist) = *(*[maxMatchOffset]byte)(e.hist[offset:]) + e.cur += offset + e.hist = e.hist[:maxMatchOffset] } } - -emitRemainder: - if int(nextEmit) < len(src) { - dst = emitLiteral(dst, src[nextEmit:]) - } - e.cur += int32(len(src)) - e.prev = e.prev[:len(src)] - copy(e.prev, src) - return dst + s := int32(len(e.hist)) + e.hist = append(e.hist, src...) + return s } -func emitLiteral(dst []token, lit []byte) []token { - for _, v := range lit { - dst = append(dst, literalToken(uint32(v))) - } - return dst +type tableEntryPrev struct { + Cur tableEntry + Prev tableEntry } -// matchLen returns the match length between src[s:] and src[t:]. -// t can be negative to indicate the match is starting in e.prev. -// We assume that src[s-4:s] and src[t-4:t] already match. -func (e *deflateFast) matchLen(s, t int32, src []byte) int32 { - s1 := int(s) + maxMatchLength - 4 - if s1 > len(src) { - s1 = len(src) +// hashLen returns a hash of the lowest mls bytes of with length output bits. +// mls must be >=3 and <=8. Any other value will return hash for 4 bytes. +// length should always be < 32. +// Preferably, length and mls should be a constant for inlining. 
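addBlock above keeps at most allocHistory bytes of history: once the buffer is full it retains only the last maxMatchOffset bytes, slides them to the front, and credits the discarded prefix to e.cur, so absolute positions recorded elsewhere stay comparable. A toy sketch of that bookkeeping, with made-up constants and a plain copy standing in for the array-pointer conversion used in the patch:

package main

import "fmt"

// Toy stand-ins for maxMatchOffset and allocHistory.
const (
	windowSize = 8
	capHist    = 32
)

type gen struct {
	hist []byte
	cur  int32 // absolute position of hist[0]
}

func (g *gen) addBlock(src []byte) int32 {
	if len(g.hist)+len(src) > cap(g.hist) {
		if cap(g.hist) == 0 {
			g.hist = make([]byte, 0, capHist)
		} else {
			// Keep only the last windowSize bytes, slide them down and
			// credit the discarded prefix to cur.
			offset := int32(len(g.hist)) - windowSize
			copy(g.hist[:windowSize], g.hist[offset:])
			g.cur += offset
			g.hist = g.hist[:windowSize]
		}
	}
	s := int32(len(g.hist))
	g.hist = append(g.hist, src...)
	return s
}

func main() {
	var g gen
	for i := 0; i < 6; i++ {
		s := g.addBlock([]byte("abcdefgh"))
		// cur+s is the absolute position of the block's first byte; it keeps
		// growing monotonically even after the history has been slid down.
		fmt.Println("block", i, "hist index:", s, "absolute:", g.cur+s)
	}
}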
+func hashLen(u uint64, length, mls uint8) uint32 { + switch mls { + case 3: + return (uint32(u<<8) * prime3bytes) >> (32 - length) + case 5: + return uint32(((u << (64 - 40)) * prime5bytes) >> (64 - length)) + case 6: + return uint32(((u << (64 - 48)) * prime6bytes) >> (64 - length)) + case 7: + return uint32(((u << (64 - 56)) * prime7bytes) >> (64 - length)) + case 8: + return uint32((u * prime8bytes) >> (64 - length)) + default: + return (uint32(u) * prime4bytes) >> (32 - length) } +} - // If we are inside the current block - if t >= 0 { - b := src[t:] - a := src[s:s1] - b = b[:len(a)] - // Extend the match to be as long as possible. - for i := range a { - if a[i] != b[i] { - return int32(i) - } - } - return int32(len(a)) - } +// matchLenLimited will return the match length between offsets and t in src. +// The maximum length returned is maxMatchLength - 4. +// It is assumed that s > t, that t >=0 and s < len(src). +func (e *fastGen) matchLenLimited(s, t int, src []byte) int32 { + a := src[s:min(s+maxMatchLength-4, len(src))] + b := src[t:] + return int32(matchLen(a, b)) +} - // We found a match in the previous block. - tp := int32(len(e.prev)) + t - if tp < 0 { - return 0 - } +// matchlenLong will return the match length between offsets and t in src. +// It is assumed that s > t, that t >=0 and s < len(src). +func (e *fastGen) matchlenLong(s, t int, src []byte) int32 { + return int32(matchLen(src[s:], src[t:])) +} - // Extend the match to be as long as possible. - a := src[s:s1] - b := e.prev[tp:] - if len(b) > len(a) { - b = b[:len(a)] +// Reset the encoding table. +func (e *fastGen) Reset() { + if cap(e.hist) < allocHistory { + e.hist = make([]byte, 0, allocHistory) } - a = a[:len(b)] - for i := range b { - if a[i] != b[i] { - return int32(i) - } + // We offset current position so everything will be out of reach. + // If we are above the buffer reset it will be cleared anyway since len(hist) == 0. + if e.cur <= bufferReset { + e.cur += maxMatchOffset + int32(len(e.hist)) } + e.hist = e.hist[:0] +} - // If we reached our limit, we matched everything we are - // allowed to in the previous block and we return. - n := int32(len(b)) - if int(s+n) == s1 { - return n +// matchLen returns the maximum common prefix length of a and b. +// a must be the shortest of the two. +func matchLen(a, b []byte) (n int) { + left := len(a) + for left >= 8 { + diff := loadLE64(a, n) ^ loadLE64(b, n) + if diff != 0 { + return n + bits.TrailingZeros64(diff)>>3 + } + n += 8 + left -= 8 } - // Continue looking for more matches in the current block. - a = src[s+n : s1] - b = src[:len(a)] + a = a[n:] + b = b[n:] for i := range a { if a[i] != b[i] { - return int32(i) + n - } - } - return int32(len(a)) + n -} - -// Reset resets the encoding history. -// This ensures that no matches are made to the previous block. -func (e *deflateFast) reset() { - e.prev = e.prev[:0] - // Bump the offset, so all matches will fail distance check. - // Nothing should be >= e.cur in the table. - e.cur += maxMatchOffset - - // Protect against e.cur wraparound. - if e.cur >= bufferReset { - e.shiftOffsets() - } -} - -// shiftOffsets will shift down all match offset. -// This is only called in rare situations to prevent integer overflow. -// -// See https://golang.org/issue/18636 and https://github.com/golang/go/issues/34121. -func (e *deflateFast) shiftOffsets() { - if len(e.prev) == 0 { - // We have no history; just clear the table. 
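matchLen above replaces the old byte-at-a-time comparison with a 64-bit XOR plus bits.TrailingZeros64: the first differing byte shows up as the lowest non-zero byte of the XOR, so the trailing-zero count divided by eight is the number of equal leading bytes. A minimal sketch of the same trick, with encoding/binary standing in for the package's loadLE64 helper (assumed here to be a little-endian 64-bit load); a must be no longer than b:

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// prefixLen returns the length of the common prefix of a and b,
// comparing eight bytes per iteration where possible.
func prefixLen(a, b []byte) int {
	n := 0
	for len(a)-n >= 8 {
		diff := binary.LittleEndian.Uint64(a[n:]) ^ binary.LittleEndian.Uint64(b[n:])
		if diff != 0 {
			return n + (bits.TrailingZeros64(diff) >> 3)
		}
		n += 8
	}
	for n < len(a) && a[n] == b[n] {
		n++
	}
	return n
}

func main() {
	a := []byte("hello gopher, hello flate!")
	b := []byte("hello gopher, hallo flate!")
	fmt.Println(prefixLen(a, b)) // 15: the inputs first differ at index 15
}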
- clear(e.table[:]) - e.cur = maxMatchOffset + 1 - return - } - - // Shift down everything in the table that isn't already too far away. - for i := range e.table[:] { - v := e.table[i].offset - e.cur + maxMatchOffset + 1 - if v < 0 { - // We want to reset e.cur to maxMatchOffset + 1, so we need to shift - // all table entries down by (e.cur - (maxMatchOffset + 1)). - // Because we ignore matches > maxMatchOffset, we can cap - // any negative offsets at 0. - v = 0 + break } - e.table[i].offset = v + n++ } - e.cur = maxMatchOffset + 1 + return n } diff --git a/src/compress/flate/dict_decoder.go b/src/compress/flate/dict_decoder.go index d2c19040f54f53..cb855abc4ba1d7 100644 --- a/src/compress/flate/dict_decoder.go +++ b/src/compress/flate/dict_decoder.go @@ -104,10 +104,7 @@ func (dd *dictDecoder) writeCopy(dist, length int) int { dstBase := dd.wrPos dstPos := dstBase srcPos := dstPos - dist - endPos := dstPos + length - if endPos > len(dd.hist) { - endPos = len(dd.hist) - } + endPos := min(dstPos+length, len(dd.hist)) // Copy non-overlapping section after destination position. // @@ -160,8 +157,10 @@ func (dd *dictDecoder) tryWriteCopy(dist, length int) int { srcPos := dstPos - dist // Copy possibly overlapping section before destination position. - for dstPos < endPos { - dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:dstPos]) +loop: + dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:dstPos]) + if dstPos < endPos { + goto loop // Avoid for-loop so that this function can be inlined } dd.wrPos = dstPos diff --git a/src/compress/flate/example_test.go b/src/compress/flate/example_test.go index 578009248f5704..3af5c1d95de1d1 100644 --- a/src/compress/flate/example_test.go +++ b/src/compress/flate/example_test.go @@ -93,7 +93,7 @@ func Example_dictionary() { var b bytes.Buffer // Compress the data using the specially crafted dictionary. - zw, err := flate.NewWriterDict(&b, flate.DefaultCompression, []byte(dict)) + zw, err := flate.NewWriterDict(&b, flate.BestCompression, []byte(dict)) if err != nil { log.Fatal(err) } @@ -168,6 +168,7 @@ func Example_synchronization() { wg.Add(1) go func() { defer wg.Done() + defer wp.Close() zw, err := flate.NewWriter(wp, flate.BestSpeed) if err != nil { diff --git a/src/compress/flate/fuzz_test.go b/src/compress/flate/fuzz_test.go new file mode 100644 index 00000000000000..1ea8cc49e54672 --- /dev/null +++ b/src/compress/flate/fuzz_test.go @@ -0,0 +1,111 @@ +package flate + +import ( + "bytes" + "flag" + "io" + "os" + "strconv" + "testing" +) + +// Fuzzing tweaks: +var fuzzStartF = flag.Int("start", HuffmanOnly, "Start fuzzing at this level") +var fuzzEndF = flag.Int("end", BestCompression, "End fuzzing at this level (inclusive)") +var fuzzMaxF = flag.Int("max", 1<<20, "Maximum input size") + +func TestMain(m *testing.M) { + flag.Parse() + os.Exit(m.Run()) +} + +// FuzzEncoding tests the fuzzer by doing roundtrips. +// Every input is run through the fuzzer at every level. +// Note: When running the fuzzer, it may hit the 10-second timeout on slower CPUs. 
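The dict_decoder.go hunk above keeps the forward-copy loop that expands overlapping LZ77 copies; the goto form in tryWriteCopy only exists so the function stays small enough to be inlined. A standalone sketch of why the repeated copy works even when source and destination ranges overlap (each pass can move at most dist bytes, so the copied region roughly doubles until the requested length is reached):

package main

import "fmt"

// expandCopy copies length bytes starting dist bytes back from dstPos,
// the same forward-copy idiom used by writeCopy/tryWriteCopy.
func expandCopy(hist []byte, dstPos, dist, length int) []byte {
	srcPos := dstPos - dist
	endPos := dstPos + length
	for dstPos < endPos {
		dstPos += copy(hist[dstPos:endPos], hist[srcPos:dstPos])
	}
	return hist[:endPos]
}

func main() {
	hist := make([]byte, 32)
	copy(hist, "ab")
	// LZ77 copy: distance 2, length 8, starting right after "ab".
	fmt.Println(string(expandCopy(hist, 2, 2, 8))) // ababababab
}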
+func FuzzEncoding(f *testing.F) { + startFuzz := *fuzzStartF + endFuzz := *fuzzEndF + maxSize := *fuzzMaxF + + decoder := NewReader(nil) + buf, buf2 := new(bytes.Buffer), new(bytes.Buffer) + encs := make([]*Writer, endFuzz-startFuzz+1) + for i := range encs { + var err error + encs[i], err = NewWriter(nil, i+startFuzz) + if err != nil { + f.Fatal(err.Error()) + } + } + + f.Fuzz(func(t *testing.T, data []byte) { + if len(data) > maxSize { + return + } + for level := startFuzz; level <= endFuzz; level++ { + if level == DefaultCompression { + continue // Already covered. + } + msg := "level " + strconv.Itoa(level) + ":" + buf.Reset() + fw := encs[level-startFuzz] + fw.Reset(buf) + n, err := fw.Write(data) + if n != len(data) { + t.Fatal(msg + "short write") + } + if err != nil { + t.Fatal(msg + err.Error()) + } + err = fw.Close() + if err != nil { + t.Fatal(msg + err.Error()) + } + compressed := buf.Bytes() + err = decoder.(Resetter).Reset(buf, nil) + if err != nil { + t.Fatal(msg + err.Error()) + } + data2, err := io.ReadAll(decoder) + if err != nil { + t.Fatal(msg + err.Error()) + } + if !bytes.Equal(data, data2) { + t.Fatal(msg + "decompressed not equal") + } + + // Do it again... + msg = "level " + strconv.Itoa(level) + " (reset):" + buf2.Reset() + fw.Reset(buf2) + n, err = fw.Write(data) + if n != len(data) { + t.Fatal(msg + "short write") + } + if err != nil { + t.Fatal(msg + err.Error()) + } + err = fw.Close() + if err != nil { + t.Fatal(msg + err.Error()) + } + compressed2 := buf2.Bytes() + err = decoder.(Resetter).Reset(buf2, nil) + if err != nil { + t.Fatal(msg + err.Error()) + } + data2, err = io.ReadAll(decoder) + if err != nil { + t.Fatal(msg + err.Error()) + } + if !bytes.Equal(data, data2) { + t.Fatal(msg + "decompressed not equal") + } + // Determinism checks will usually not be reproducible, + // since it often relies on the internal state of the compressor. + if !bytes.Equal(compressed, compressed2) { + t.Fatal(msg + "non-deterministic output") + } + } + }) +} diff --git a/src/compress/flate/huffman_bit_writer.go b/src/compress/flate/huffman_bit_writer.go index d68c77fb32e32a..585a9b4cf19032 100644 --- a/src/compress/flate/huffman_bit_writer.go +++ b/src/compress/flate/huffman_bit_writer.go @@ -6,6 +6,7 @@ package flate import ( "io" + "math" ) const ( @@ -22,20 +23,22 @@ const ( codegenCodeCount = 19 badCode = 255 + // maxPredefinedTokens is the maximum number of tokens + // where we check if fixed size is smaller. + maxPredefinedTokens = 250 + // bufferFlushSize indicates the buffer size // after which bytes are flushed to the writer. // Should preferably be a multiple of 6, since // we accumulate 6 bytes between writes to the buffer. - bufferFlushSize = 240 - - // bufferSize is the actual output byte buffer size. - // It must have additional headroom for a flush - // which can contain up to 8 bytes. - bufferSize = bufferFlushSize + 8 + bufferFlushSize = 246 ) +// Minimum length code that emits bits. +const lengthExtraBitsMinCode = 8 + // The number of extra bits needed by length code X - LENGTH_CODES_START. -var lengthExtraBits = []int8{ +var lengthExtraBits = [32]uint8{ /* 257 */ 0, 0, 0, /* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, /* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, @@ -43,26 +46,47 @@ var lengthExtraBits = []int8{ } // The length indicated by length code X - LENGTH_CODES_START. 
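The lengthExtraBits table above, together with lengthBase just below, determines how a match length is split into a length code plus extra bits. A self-contained sketch of that mapping, restating the 29 standard entries (the patch pads the arrays to 32) and checking a few lengths against RFC 1951 section 3.2.5; lengthBase stores the length minus baseMatchLength (3):

package main

import "fmt"

var lengthExtraBits = [29]uint8{
	0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
	3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0,
}

var lengthBase = [29]uint8{
	0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28,
	32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 255,
}

// lengthCodeInfo resolves a match length (3..258) to its DEFLATE length
// code, the number of extra bits, and the extra-bit value.
func lengthCodeInfo(length int) (code int, extraBits uint8, extraVal int) {
	biased := length - 3
	i := len(lengthBase) - 1
	for int(lengthBase[i]) > biased {
		i--
	}
	return 257 + i, lengthExtraBits[i], biased - int(lengthBase[i])
}

func main() {
	for _, l := range []int{3, 10, 130, 257, 258} {
		code, eb, ev := lengthCodeInfo(l)
		fmt.Printf("length %3d -> code %d, %d extra bit(s), value %d\n", l, code, eb, ev)
	}
}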
-var lengthBase = []uint32{ +var lengthBase = [32]uint8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 255, } +// Minimum offset code that emits bits. +const offsetExtraBitsMinCode = 4 + // offset code word extra bits. -var offsetExtraBits = []int8{ +var offsetExtraBits = [32]int8{ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, + /* extended window */ + 14, 14, } -var offsetBase = []uint32{ - 0x000000, 0x000001, 0x000002, 0x000003, 0x000004, - 0x000006, 0x000008, 0x00000c, 0x000010, 0x000018, - 0x000020, 0x000030, 0x000040, 0x000060, 0x000080, - 0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300, - 0x000400, 0x000600, 0x000800, 0x000c00, 0x001000, - 0x001800, 0x002000, 0x003000, 0x004000, 0x006000, +var offsetCombined = [32]uint32{} + +func init() { + var offsetBase = [32]uint32{ + /* normal deflate */ + 0x000000, 0x000001, 0x000002, 0x000003, 0x000004, + 0x000006, 0x000008, 0x00000c, 0x000010, 0x000018, + 0x000020, 0x000030, 0x000040, 0x000060, 0x000080, + 0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300, + 0x000400, 0x000600, 0x000800, 0x000c00, 0x001000, + 0x001800, 0x002000, 0x003000, 0x004000, 0x006000, + + /* extended window */ + 0x008000, 0x00c000, + } + + for i := range offsetCombined[:] { + // Don't use extended window values... + if offsetExtraBits[i] == 0 || offsetBase[i] > 0x006000 { + continue + } + offsetCombined[i] = uint32(offsetExtraBits[i]) | (offsetBase[i] << 8) + } } // The odd order in which the codegen code sizes are written. @@ -75,29 +99,49 @@ type huffmanBitWriter struct { writer io.Writer // Data waiting to be written is bytes[0:nbytes] - // and then the low nbits of bits. Data is always written - // sequentially into the bytes array. - bits uint64 - nbits uint - bytes [bufferSize]byte - codegenFreq [codegenCodeCount]int32 - nbytes int - literalFreq []int32 - offsetFreq []int32 - codegen []uint8 - literalEncoding *huffmanEncoder - offsetEncoding *huffmanEncoder - codegenEncoding *huffmanEncoder - err error + // and then the low nbits of bits. + bits uint64 + nbits uint8 + nbytes uint8 + lastHuffMan bool + literalEncoding *huffmanEncoder + tmpLitEncoding *huffmanEncoder + offsetEncoding *huffmanEncoder + codegenEncoding *huffmanEncoder + err error + lastHeader int + logNewTablePenalty uint // Bigger values will reduce the penalty of a new table. + bytes [256 + 8]byte + literalFreq [lengthCodesStart + 32]uint16 + offsetFreq [32]uint16 + codegenFreq [codegenCodeCount]uint16 + + // codegen must have an extra space for the final symbol. + codegen [literalCount + offsetCodeCount + 1]uint8 } +// The huffmanBitWriter supports reusing huffman tables and will combine +// blocks, if compression is less than creating a new table. +// +// This is controlled by several variables: +// +// If 'lastHeader' is non-zero the Huffman table can be reused. +// It also indicates that an EOB has not yet been emitted, so if a new table +// is generated, an EOB with the previous table must be written. +// +// If 'lastHuffMan' is set, a table for outputting literals +// has been generated and offsets are invalid. +// +// An incoming block estimates the output size of a new table using a +// 'fresh' by calculating the optimal size and adding a penalty. +// A Huffman table is not optimal, which is why we add a penalty, +// and generating a new table is slower for both compression and decompression. 
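A rough sketch of the trade-off the comment above describes, with invented numbers: the cost of a fresh table is its estimated optimal size plus a penalty of newSize>>logNewTablePenalty, so reuse wins unless the new table is clearly better. (In writeBlockDynamic further down, the inputs are w.lastHeader, tokens.EstimatedBits(), the EOB code length and dynamicReuseSize.)

package main

import "fmt"

// shouldReuse reports whether the penalized cost of a new table fails to
// beat the cost of reusing the previous one. All numbers are bit counts.
func shouldReuse(lastHeaderBits, estTokenBits, eobBits, reuseBits int, logNewTablePenalty uint) bool {
	newSize := lastHeaderBits + estTokenBits + eobBits
	newSize += newSize >> logNewTablePenalty // penalty: a real table is never optimal, and emitting it costs time
	return newSize >= reuseBits
}

func main() {
	// A new table that looks ~4% smaller still loses to reuse with a
	// 1/16th penalty (logNewTablePenalty = 4)...
	fmt.Println(shouldReuse(500, 9100, 7, 10000, 4)) // true
	// ...while a clearly better table overcomes the penalty.
	fmt.Println(shouldReuse(500, 8000, 7, 10000, 4)) // false
}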
+ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter { return &huffmanBitWriter{ writer: w, - literalFreq: make([]int32, maxNumLit), - offsetFreq: make([]int32, offsetCodeCount), - codegen: make([]uint8, maxNumLit+offsetCodeCount+1), - literalEncoding: newHuffmanEncoder(maxNumLit), + literalEncoding: newHuffmanEncoder(literalCount), + tmpLitEncoding: newHuffmanEncoder(literalCount), codegenEncoding: newHuffmanEncoder(codegenCodeCount), offsetEncoding: newHuffmanEncoder(offsetCodeCount), } @@ -106,6 +150,37 @@ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter { func (w *huffmanBitWriter) reset(writer io.Writer) { w.writer = writer w.bits, w.nbits, w.nbytes, w.err = 0, 0, 0, nil + w.lastHeader = 0 + w.lastHuffMan = false +} + +func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) { + a := t.offHist[:offsetCodeCount] + b := w.offsetEncoding.codes + b = b[:len(a)] + for i, v := range a { + if v != 0 && b[i].zero() { + return false + } + } + + a = t.extraHist[:literalCount-256] + b = w.literalEncoding.codes[256:literalCount] + b = b[:len(a)] + for i, v := range a { + if v != 0 && b[i].zero() { + return false + } + } + + a = t.litHist[:256] + b = w.literalEncoding.codes[:len(a)] + for i, v := range a { + if v != 0 && b[i].zero() { + return false + } + } + return true } func (w *huffmanBitWriter) flush() { @@ -113,6 +188,11 @@ func (w *huffmanBitWriter) flush() { w.nbits = 0 return } + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } n := w.nbytes for w.nbits != 0 { w.bytes[n] = byte(w.bits) @@ -125,7 +205,9 @@ func (w *huffmanBitWriter) flush() { n++ } w.bits = 0 - w.write(w.bytes[:n]) + if n > 0 { + w.write(w.bytes[:n]) + } w.nbytes = 0 } @@ -136,30 +218,11 @@ func (w *huffmanBitWriter) write(b []byte) { _, w.err = w.writer.Write(b) } -func (w *huffmanBitWriter) writeBits(b int32, nb uint) { - if w.err != nil { - return - } - w.bits |= uint64(b) << w.nbits +func (w *huffmanBitWriter) writeBits(b int32, nb uint8) { + w.bits |= uint64(b) << (w.nbits & 63) w.nbits += nb if w.nbits >= 48 { - bits := w.bits - w.bits >>= 48 - w.nbits -= 48 - n := w.nbytes - bytes := w.bytes[n : n+6] - bytes[0] = byte(bits) - bytes[1] = byte(bits >> 8) - bytes[2] = byte(bits >> 16) - bytes[3] = byte(bits >> 24) - bytes[4] = byte(bits >> 32) - bytes[5] = byte(bits >> 40) - n += 6 - if n >= bufferFlushSize { - w.write(w.bytes[:n]) - n = 0 - } - w.nbytes = n + w.writeOutBits() } } @@ -198,21 +261,23 @@ func (w *huffmanBitWriter) writeBytes(bytes []byte) { // numOffsets The number of offsets in offsetEncoding // litenc, offenc The literal and offset encoder to use func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litEnc, offEnc *huffmanEncoder) { - clear(w.codegenFreq[:]) + for i := range w.codegenFreq { + w.codegenFreq[i] = 0 + } // Note that we are using codegen both as a temporary variable for holding // a copy of the frequencies, and as the place where we put the result. // This is fine because the output is always shorter than the input used // so far. - codegen := w.codegen // cache + codegen := w.codegen[:] // cache // Copy the concatenated code sizes to codegen. Put a marker at the end. 
cgnl := codegen[:numLiterals] for i := range cgnl { - cgnl[i] = uint8(litEnc.codes[i].len) + cgnl[i] = litEnc.codes[i].len() } cgnl = codegen[numLiterals : numLiterals+numOffsets] for i := range cgnl { - cgnl[i] = uint8(offEnc.codes[i].len) + cgnl[i] = offEnc.codes[i].len() } codegen[numLiterals+numOffsets] = badCode @@ -234,10 +299,7 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE w.codegenFreq[size]++ count-- for count >= 3 { - n := 6 - if n > count { - n = count - } + n := min(6, count) codegen[outIndex] = 16 outIndex++ codegen[outIndex] = uint8(n - 3) @@ -247,10 +309,7 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE } } else { for count >= 11 { - n := 138 - if n > count { - n = count - } + n := min(138, count) codegen[outIndex] = 18 outIndex++ codegen[outIndex] = uint8(n - 11) @@ -282,30 +341,61 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE codegen[outIndex] = badCode } -// dynamicSize returns the size of dynamically encoded data in bits. -func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) { +func (w *huffmanBitWriter) codegens() int { + numCodegens := len(w.codegenFreq) + for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { + numCodegens-- + } + return numCodegens +} + +func (w *huffmanBitWriter) headerSize() (size, numCodegens int) { numCodegens = len(w.codegenFreq) for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { numCodegens-- } - header := 3 + 5 + 5 + 4 + (3 * numCodegens) + + return 3 + 5 + 5 + 4 + (3 * numCodegens) + w.codegenEncoding.bitLength(w.codegenFreq[:]) + int(w.codegenFreq[16])*2 + int(w.codegenFreq[17])*3 + - int(w.codegenFreq[18])*7 + int(w.codegenFreq[18])*7, numCodegens +} + +// dynamicSize returns the size of dynamically encoded data in bits. +func (w *huffmanBitWriter) dynamicReuseSize(litEnc, offEnc *huffmanEncoder) (size int) { + size = litEnc.bitLength(w.literalFreq[:]) + + offEnc.bitLength(w.offsetFreq[:]) + return size +} + +// dynamicSize returns the size of dynamically encoded data in bits. +func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) { + header, numCodegens := w.headerSize() size = header + - litEnc.bitLength(w.literalFreq) + - offEnc.bitLength(w.offsetFreq) + + litEnc.bitLength(w.literalFreq[:]) + + offEnc.bitLength(w.offsetFreq[:]) + extraBits - return size, numCodegens } +// extraBitSize will return the number of bits that will be written +// as "extra" bits on matches. +func (w *huffmanBitWriter) extraBitSize() int { + total := 0 + for i, n := range w.literalFreq[257:literalCount] { + total += int(n) * int(lengthExtraBits[i&31]) + } + for i, n := range w.offsetFreq[:offsetCodeCount] { + total += int(n) * int(offsetExtraBits[i&31]) + } + return total +} + // fixedSize returns the size of dynamically encoded data in bits. func (w *huffmanBitWriter) fixedSize(extraBits int) int { return 3 + - fixedLiteralEncoding.bitLength(w.literalFreq) + - fixedOffsetEncoding.bitLength(w.offsetFreq) + + fixedLiteralEncoding.bitLength(w.literalFreq[:]) + + fixedOffsetEncoding.bitLength(w.offsetFreq[:]) + extraBits } @@ -322,31 +412,37 @@ func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) { return 0, false } +// writeCode writes 'c' to the stream. +// Inline manually when performance is critical. 
func (w *huffmanBitWriter) writeCode(c hcode) { - if w.err != nil { - return - } - w.bits |= uint64(c.code) << w.nbits - w.nbits += uint(c.len) + w.bits |= c.code64() << (w.nbits & reg8SizeMask64) + w.nbits += c.len() if w.nbits >= 48 { - bits := w.bits - w.bits >>= 48 - w.nbits -= 48 - n := w.nbytes - bytes := w.bytes[n : n+6] - bytes[0] = byte(bits) - bytes[1] = byte(bits >> 8) - bytes[2] = byte(bits >> 16) - bytes[3] = byte(bits >> 24) - bytes[4] = byte(bits >> 32) - bytes[5] = byte(bits >> 40) - n += 6 - if n >= bufferFlushSize { - w.write(w.bytes[:n]) + w.writeOutBits() + } +} + +// writeOutBits will write bits to the buffer. +func (w *huffmanBitWriter) writeOutBits() { + bits := w.bits + w.bits >>= 48 + w.nbits -= 48 + n := w.nbytes + + // We overwrite, but faster... + storeLE64(w.bytes[n:], bits) + n += 6 + + if n >= bufferFlushSize { + if w.err != nil { n = 0 + return } - w.nbytes = n + w.write(w.bytes[:n]) + n = 0 } + + w.nbytes = n } // Write the header of a dynamic Huffman block to the output stream. @@ -367,19 +463,19 @@ func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, n w.writeBits(int32(numOffsets-1), 5) w.writeBits(int32(numCodegens-4), 4) - for i := 0; i < numCodegens; i++ { - value := uint(w.codegenEncoding.codes[codegenOrder[i]].len) + for i := range numCodegens { + value := uint(w.codegenEncoding.codes[codegenOrder[i]].len()) w.writeBits(int32(value), 3) } i := 0 for { - var codeWord int = int(w.codegen[i]) + var codeWord = uint32(w.codegen[i]) i++ if codeWord == badCode { break } - w.writeCode(w.codegenEncoding.codes[uint32(codeWord)]) + w.writeCode(w.codegenEncoding.codes[codeWord]) switch codeWord { case 16: @@ -395,10 +491,28 @@ func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, n } } +// writeStoredHeader will write a stored header. +// If the stored block is only used for EOF, +// it is replaced with a fixed huffman block. func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) { if w.err != nil { return } + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + + // To write EOF, use a fixed encoding block. 10 bits instead of 5 bytes. + if length == 0 && isEof { + w.writeFixedHeader(isEof) + // EOB: 7 bits, value: 0 + w.writeBits(0, 7) + w.flush() + return + } + var flag int32 if isEof { flag = 1 @@ -413,6 +527,12 @@ func (w *huffmanBitWriter) writeFixedHeader(isEof bool) { if w.err != nil { return } + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + // Indicate that we are a fixed Huffman block var value int32 = 2 if isEof { @@ -426,36 +546,33 @@ func (w *huffmanBitWriter) writeFixedHeader(isEof bool) { // is larger than the original bytes, the data will be written as a // stored block. // If the input is nil, the tokens will always be Huffman encoded. 
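writeOutBits above always stores eight bytes but only advances the output by six, so the pending bits never need a partial store; that is also why the byte buffer is declared as [256 + 8]byte. A self-contained sketch of the accumulator (bitSink and its fields are illustrative names, not part of the patch), with encoding/binary standing in for storeLE64:

package main

import (
	"encoding/binary"
	"fmt"
)

// bitSink packs codes LSB-first into a uint64 and spills six bytes at a
// time once at least 48 bits are pending.
type bitSink struct {
	bits  uint64
	nbits uint8
	out   []byte
}

func (s *bitSink) writeBits(v uint64, n uint8) {
	s.bits |= v << (s.nbits & 63)
	s.nbits += n
	if s.nbits >= 48 {
		off := len(s.out)
		s.out = append(s.out, 0, 0, 0, 0, 0, 0, 0, 0) // 6 bytes of data + 2 of headroom
		binary.LittleEndian.PutUint64(s.out[off:], s.bits)
		s.out = s.out[:off+6] // the 8-byte store is cheap; only 6 bytes count as written
		s.bits >>= 48
		s.nbits -= 48
	}
}

func (s *bitSink) flush() []byte {
	for s.nbits > 0 {
		s.out = append(s.out, byte(s.bits))
		s.bits >>= 8
		if s.nbits >= 8 {
			s.nbits -= 8
		} else {
			s.nbits = 0
		}
	}
	return s.out
}

func main() {
	var s bitSink
	for i := 0; i < 20; i++ {
		s.writeBits(0b101, 3) // 60 bits in total
	}
	fmt.Printf("%x\n", s.flush()) // 8 bytes: the 3-bit pattern repeated, LSB-first
}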
-func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { +func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) { if w.err != nil { return } - tokens = append(tokens, endBlockMarker) + tokens.AddEOB() + if w.lastHeader > 0 { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } numLiterals, numOffsets := w.indexTokens(tokens) - + w.generate() var extraBits int storedSize, storable := w.storedSize(input) if storable { - // We only bother calculating the costs of the extra bits required by - // the length of offset fields (which will be the same for both fixed - // and dynamic encoding), if we need to compare those two encodings - // against stored encoding. - for lengthCode := lengthCodesStart + 8; lengthCode < numLiterals; lengthCode++ { - // First eight length codes have extra size = 0. - extraBits += int(w.literalFreq[lengthCode]) * int(lengthExtraBits[lengthCode-lengthCodesStart]) - } - for offsetCode := 4; offsetCode < numOffsets; offsetCode++ { - // First four offset codes have extra size = 0. - extraBits += int(w.offsetFreq[offsetCode]) * int(offsetExtraBits[offsetCode]) - } + extraBits = w.extraBitSize() } // Figure out smallest code. // Fixed Huffman baseline. var literalEncoding = fixedLiteralEncoding var offsetEncoding = fixedOffsetEncoding - var size = w.fixedSize(extraBits) + var size = math.MaxInt32 + if tokens.n < maxPredefinedTokens { + size = w.fixedSize(extraBits) + } // Dynamic Huffman? var numCodegens int @@ -473,7 +590,7 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { } // Stored bytes? - if storable && storedSize < size { + if storable && storedSize <= size { w.writeStoredHeader(len(input), eof) w.writeBytes(input) return @@ -487,7 +604,7 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { } // Write the tokens. - w.writeTokens(tokens, literalEncoding.codes, offsetEncoding.codes) + w.writeTokens(tokens.Slice(), literalEncoding.codes, offsetEncoding.codes) } // writeBlockDynamic encodes a block using a dynamic Huffman table. @@ -495,53 +612,153 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { // histogram distribution. // If input is supplied and the compression savings are below 1/16th of the // input size the block is stored. -func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []byte) { +func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []byte, sync bool) { if w.err != nil { return } - tokens = append(tokens, endBlockMarker) + sync = sync || eof + if sync { + tokens.AddEOB() + } + + // We cannot reuse pure huffman table, and must mark as EOF. + if (w.lastHuffMan || eof) && w.lastHeader > 0 { + // We will not try to reuse. + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + w.lastHuffMan = false + } + + if w.lastHeader > 0 && !w.canReuse(tokens) { + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + } + numLiterals, numOffsets := w.indexTokens(tokens) + extraBits := 0 + ssize, storable := w.storedSize(input) - // Generate codegen and codegenFrequencies, which indicates how to encode - // the literalEncoding and the offsetEncoding. 
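writeBlock above picks the cheapest of three encodings: stored, fixed Huffman (only considered when the block has fewer than maxPredefinedTokens tokens; otherwise size starts at math.MaxInt32) and dynamic Huffman, with ties going to stored. A sketch of just that comparison, taking the three bit counts as plain parameters instead of deriving them from storedSize/fixedSize/dynamicSize:

package main

import (
	"fmt"
	"math"
)

// chooseBlock returns which block type the size comparison would select.
func chooseBlock(storedBits, fixedBits, dynamicBits int, storable bool) string {
	size := fixedBits
	if dynamicBits < size {
		size = dynamicBits
	}
	if storable && storedBits <= size {
		return "stored"
	}
	if size == fixedBits {
		return "fixed"
	}
	return "dynamic"
}

func main() {
	fmt.Println(chooseBlock(8000, 9000, 8500, true))           // stored
	fmt.Println(chooseBlock(8000, 7000, 7500, true))           // fixed
	fmt.Println(chooseBlock(math.MaxInt32, 9000, 7500, false)) // dynamic
}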
- w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) - w.codegenEncoding.generate(w.codegenFreq[:], 7) - size, numCodegens := w.dynamicSize(w.literalEncoding, w.offsetEncoding, 0) + if storable || w.lastHeader > 0 { + extraBits = w.extraBitSize() + } - // Store bytes, if we don't get a reasonable improvement. - if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { - w.writeStoredHeader(len(input), eof) - w.writeBytes(input) - return + var size int + + // Check if we should reuse. + if w.lastHeader > 0 { + // Estimate size for using a new table. + // Use the previous header size as the best estimate. + newSize := w.lastHeader + tokens.EstimatedBits() + + // The estimated size is calculated as an optimal table. + // We add a penalty to make it more realistic and re-use a bit more. + newSize += int(w.literalEncoding.codes[endBlockMarker].len()) + newSize>>w.logNewTablePenalty + + // Calculate the size for reusing the current table. + reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + extraBits + + // Check if a new table is better. + if newSize < reuseSize { + // Write the EOB we owe. + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + size = newSize + w.lastHeader = 0 + } else { + size = reuseSize + } + + // Small blocks can be more efficient with fixed encoding. + if tokens.n < maxPredefinedTokens { + if preSize := w.fixedSize(extraBits) + 7; preSize < size { + // Check if we get a reasonable size decrease. + if storable && ssize <= size { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + w.writeFixedHeader(eof) + if !sync { + tokens.AddEOB() + } + w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes) + return + } + } + + // Check if we get a reasonable size decrease. + if storable && ssize <= size { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } } - // Write Huffman table. - w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + // We want a new block/table + if w.lastHeader == 0 { + w.literalFreq[endBlockMarker] = 1 + + w.generate() + // Generate codegen and codegenFrequencies, which indicates how to encode + // the literalEncoding and the offsetEncoding. + w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) + w.codegenEncoding.generate(w.codegenFreq[:], 7) + + var numCodegens int + size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits) + + // Store predefined or raw, if we don't get a reasonable improvement. + if tokens.n < maxPredefinedTokens { + if preSize := w.fixedSize(extraBits); preSize <= size { + // Store bytes, if we don't get an improvement. + if storable && ssize <= preSize { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + w.writeFixedHeader(eof) + if !sync { + tokens.AddEOB() + } + w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes) + return + } + } + + if storable && ssize <= size { + // Store bytes, if we don't get an improvement. + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + + // Write Huffman table. + w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + if !sync { + w.lastHeader, _ = w.headerSize() + } + w.lastHuffMan = false + } + if sync { + w.lastHeader = 0 + } // Write the tokens. 
- w.writeTokens(tokens, w.literalEncoding.codes, w.offsetEncoding.codes) + w.writeTokens(tokens.Slice(), w.literalEncoding.codes, w.offsetEncoding.codes) } // indexTokens indexes a slice of tokens, and updates // literalFreq and offsetFreq, and generates literalEncoding // and offsetEncoding. // The number of literal and offset tokens is returned. -func (w *huffmanBitWriter) indexTokens(tokens []token) (numLiterals, numOffsets int) { - clear(w.literalFreq) - clear(w.offsetFreq) +func (w *huffmanBitWriter) indexTokens(t *tokens) (numLiterals, numOffsets int) { + *(*[256]uint16)(w.literalFreq[:]) = t.litHist + *(*[32]uint16)(w.literalFreq[256:]) = t.extraHist + w.offsetFreq = t.offHist - for _, t := range tokens { - if t < matchType { - w.literalFreq[t.literal()]++ - continue - } - length := t.length() - offset := t.offset() - w.literalFreq[lengthCodesStart+lengthCode(length)]++ - w.offsetFreq[offsetCode(offset)]++ + if t.n == 0 { + return } - // get the number of literals numLiterals = len(w.literalFreq) for w.literalFreq[numLiterals-1] == 0 { @@ -558,41 +775,153 @@ func (w *huffmanBitWriter) indexTokens(tokens []token) (numLiterals, numOffsets w.offsetFreq[0] = 1 numOffsets = 1 } - w.literalEncoding.generate(w.literalFreq, 15) - w.offsetEncoding.generate(w.offsetFreq, 15) return } +func (w *huffmanBitWriter) generate() { + w.literalEncoding.generate(w.literalFreq[:literalCount], 15) + w.offsetEncoding.generate(w.offsetFreq[:offsetCodeCount], 15) +} + // writeTokens writes a slice of tokens to the output. // codes for literal and offset encoding must be supplied. func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) { if w.err != nil { return } + if len(tokens) == 0 { + return + } + + // Only last token should be endBlockMarker. + var deferEOB bool + if tokens[len(tokens)-1] == endBlockMarker { + tokens = tokens[:len(tokens)-1] + deferEOB = true + } + + // Create slices up to the next power of two to avoid bounds checks. + lits := leCodes[:256] + offs := oeCodes[:32] + lengths := leCodes[lengthCodesStart:] + lengths = lengths[:32] + + // Go 1.16 LOVES having these on stack. 
+ bits, nbits, nbytes := w.bits, w.nbits, w.nbytes + for _, t := range tokens { - if t < matchType { - w.writeCode(leCodes[t.literal()]) + if t < 256 { + c := lits[t] + bits |= c.code64() << (nbits & 63) + nbits += c.len() + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } continue } + // Write the length length := t.length() - lengthCode := lengthCode(length) - w.writeCode(leCodes[lengthCode+lengthCodesStart]) - extraLengthBits := uint(lengthExtraBits[lengthCode]) - if extraLengthBits > 0 { - extraLength := int32(length - lengthBase[lengthCode]) - w.writeBits(extraLength, extraLengthBits) + lenCode := lengthCode(length) & 31 + // inlined 'w.writeCode(lengths[lengthCode])' + c := lengths[lenCode] + bits |= c.code64() << (nbits & 63) + nbits += c.len() + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } + + if lenCode >= lengthExtraBitsMinCode { + extraLengthBits := lengthExtraBits[lenCode] + //w.writeBits(extraLength, extraLengthBits) + extraLength := int32(length - lengthBase[lenCode]) + bits |= uint64(extraLength) << (nbits & 63) + nbits += extraLengthBits + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } } // Write the offset offset := t.offset() - offsetCode := offsetCode(offset) - w.writeCode(oeCodes[offsetCode]) - extraOffsetBits := uint(offsetExtraBits[offsetCode]) - if extraOffsetBits > 0 { - extraOffset := int32(offset - offsetBase[offsetCode]) - w.writeBits(extraOffset, extraOffsetBits) + offCode := (offset >> 16) & 31 + // inlined 'w.writeCode(offs[offCode])' + c = offs[offCode] + bits |= c.code64() << (nbits & 63) + nbits += c.len() + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } + + if offCode >= offsetExtraBitsMinCode { + offsetComb := offsetCombined[offCode] + bits |= uint64((offset-(offsetComb>>8))&matchOffsetOnlyMask) << (nbits & 63) + nbits += uint8(offsetComb) + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } } } + // Restore... + w.bits, w.nbits, w.nbytes = bits, nbits, nbytes + + if deferEOB { + w.writeCode(leCodes[endBlockMarker]) + } } // huffOffset is a static offset encoder used for huffman only encoding. 
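The offset path in the loop above reads a single offsetCombined entry instead of two separate tables; the packing is built in this file's init function, with the extra-bit count in the low byte and the base offset in the upper bits. A small sketch of that packing plus one RFC 1951 sanity check (offsets in the tables are stored minus baseMatchOffset, i.e. minus 1):

package main

import "fmt"

// packOffset/unpackOffset restate the offsetCombined layout:
// low 8 bits = number of extra bits, remaining bits = base offset.
func packOffset(extraBits uint8, base uint32) uint32 {
	return uint32(extraBits) | base<<8
}

func unpackOffset(combined uint32) (extraBits uint8, base uint32) {
	return uint8(combined), combined >> 8
}

func main() {
	// RFC 1951 distance code 10: base distance 33, 4 extra bits.
	// Stored biased by -1, that base is 0x20.
	c := packOffset(4, 0x20)
	eb, base := unpackOffset(c)
	fmt.Println(eb, base) // 4 32
}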
@@ -600,94 +929,168 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) var huffOffset *huffmanEncoder func init() { - offsetFreq := make([]int32, offsetCodeCount) - offsetFreq[0] = 1 + w := newHuffmanBitWriter(nil) + w.offsetFreq[0] = 1 huffOffset = newHuffmanEncoder(offsetCodeCount) - huffOffset.generate(offsetFreq, 15) + huffOffset.generate(w.offsetFreq[:offsetCodeCount], 15) } // writeBlockHuff encodes a block of bytes as either // Huffman encoded literals or uncompressed bytes if the // results only gains very little from compression. -func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) { +func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { if w.err != nil { return } // Clear histogram - clear(w.literalFreq) - - // Add everything as literals - histogram(input, w.literalFreq) - - w.literalFreq[endBlockMarker] = 1 + for i := range w.literalFreq[:] { + w.literalFreq[i] = 0 + } + if !w.lastHuffMan { + for i := range w.offsetFreq[:] { + w.offsetFreq[i] = 0 + } + } const numLiterals = endBlockMarker + 1 - w.offsetFreq[0] = 1 const numOffsets = 1 - w.literalEncoding.generate(w.literalFreq, 15) - - // Figure out smallest code. - // Always use dynamic Huffman or Store - var numCodegens int - - // Generate codegen and codegenFrequencies, which indicates how to encode - // the literalEncoding and the offsetEncoding. - w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset) - w.codegenEncoding.generate(w.codegenFreq[:], 7) - size, numCodegens := w.dynamicSize(w.literalEncoding, huffOffset, 0) + // Add everything as literals + // We have to estimate the header size. + // Assume header is around 70 bytes: + // https://stackoverflow.com/a/25454430 + const guessHeaderSizeBits = 70 * 8 + histogram(input, w.literalFreq[:numLiterals]) + ssize, storable := w.storedSize(input) + if storable && len(input) > 1024 { + // Quick check for incompressible content. + abs := float64(0) + avg := float64(len(input)) / 256 + max := float64(len(input) * 2) + for _, v := range w.literalFreq[:256] { + diff := float64(v) - avg + abs += diff * diff + if abs > max { + break + } + } + if abs < max { + // No chance we can compress this... + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + } + w.literalFreq[endBlockMarker] = 1 + w.tmpLitEncoding.generate(w.literalFreq[:numLiterals], 15) + estBits := w.tmpLitEncoding.canReuseBits(w.literalFreq[:numLiterals]) + if estBits < math.MaxInt32 { + estBits += w.lastHeader + if w.lastHeader == 0 { + estBits += guessHeaderSizeBits + } + estBits += estBits >> w.logNewTablePenalty + } // Store bytes, if we don't get a reasonable improvement. - if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { + if storable && ssize <= estBits { w.writeStoredHeader(len(input), eof) w.writeBytes(input) return } - // Huffman. 
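The quick check above skips Huffman coding for data that is clearly random: for storable inputs over 1 KiB it sums the squared deviations of the byte histogram from a flat distribution and stores the block if the sum stays under 2*len(input). A standalone sketch of the same test on random versus repetitive input:

package main

import (
	"bytes"
	"fmt"
	"math/rand"
)

// looksIncompressible reports whether the byte histogram of data is close
// enough to uniform that Huffman coding is unlikely to help.
func looksIncompressible(data []byte) bool {
	var hist [256]int
	for _, b := range data {
		hist[b]++
	}
	avg := float64(len(data)) / 256
	limit := float64(len(data) * 2)
	sum := 0.0
	for _, v := range hist {
		d := float64(v) - avg
		sum += d * d
		if sum > limit {
			return false
		}
	}
	return true
}

func main() {
	random := make([]byte, 4096)
	rand.New(rand.NewSource(1)).Read(random)
	text := bytes.Repeat([]byte("the quick brown fox "), 200)

	fmt.Println(looksIncompressible(random)) // true: nearly flat histogram
	fmt.Println(looksIncompressible(text))   // false: heavily skewed histogram
}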
- w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) - encoding := w.literalEncoding.codes[:257] - n := w.nbytes - for _, t := range input { - // Bitwriting inlined, ~30% speedup - c := encoding[t] - w.bits |= uint64(c.code) << w.nbits - w.nbits += uint(c.len) - if w.nbits < 48 { - continue + if w.lastHeader > 0 { + reuseSize := w.literalEncoding.canReuseBits(w.literalFreq[:256]) + + if estBits < reuseSize { + // We owe an EOB + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 } - // Store 6 bytes - bits := w.bits - w.bits >>= 48 - w.nbits -= 48 - bytes := w.bytes[n : n+6] - bytes[0] = byte(bits) - bytes[1] = byte(bits >> 8) - bytes[2] = byte(bits >> 16) - bytes[3] = byte(bits >> 24) - bytes[4] = byte(bits >> 32) - bytes[5] = byte(bits >> 40) - n += 6 - if n < bufferFlushSize { - continue + } + + if w.lastHeader == 0 { + // Use the temp encoding, so swap. + w.literalEncoding, w.tmpLitEncoding = w.tmpLitEncoding, w.literalEncoding + // Generate codegen and codegenFrequencies, which indicates how to encode + // the literalEncoding and the offsetEncoding. + w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset) + w.codegenEncoding.generate(w.codegenFreq[:], 7) + numCodegens := w.codegens() + + // Huffman. + w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) + w.lastHuffMan = true + w.lastHeader, _ = w.headerSize() + } + + encoding := w.literalEncoding.codes[:256] + // Go 1.16 LOVES having these on stack. At least 1.5x the speed. + bits, nbits, nbytes := w.bits, w.nbits, w.nbytes + + // Unroll, write 3 codes/loop. + // Fastest number of unrolls. + for len(input) > 3 { + // We must have at least 48 bits free. + if nbits >= 8 { + n := nbits >> 3 + storeLE64(w.bytes[nbytes:], bits) + bits >>= (n * 8) & 63 + nbits -= n * 8 + nbytes += n } - w.write(w.bytes[:n]) - if w.err != nil { - return // Return early in the event of write failures + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 } - n = 0 + a, b := encoding[input[0]], encoding[input[1]] + bits |= a.code64() << (nbits & 63) + bits |= b.code64() << ((nbits + a.len()) & 63) + c := encoding[input[2]] + nbits += b.len() + a.len() + bits |= c.code64() << (nbits & 63) + nbits += c.len() + input = input[3:] } - w.nbytes = n - w.writeCode(encoding[endBlockMarker]) -} -// histogram accumulates a histogram of b in h. -// -// len(h) must be >= 256, and h's elements must be all zeroes. -func histogram(b []byte, h []int32) { - h = h[:256] - for _, t := range b { - h[t]++ + // Remaining... + for _, t := range input { + if nbits >= 48 { + storeLE64(w.bytes[nbytes:], bits) + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } + // Bitwriting inlined, ~30% speedup + c := encoding[t] + bits |= c.code64() << (nbits & 63) + + nbits += c.len() + } + // Restore... + w.bits, w.nbits, w.nbytes = bits, nbits, nbytes + + // Flush if needed to have space. 
+ if w.nbits >= 48 { + w.writeOutBits() + } + + if eof || sync { + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 + w.lastHuffMan = false } } diff --git a/src/compress/flate/huffman_bit_writer_test.go b/src/compress/flate/huffman_bit_writer_test.go index a57799cae02685..dfb93e326c0871 100644 --- a/src/compress/flate/huffman_bit_writer_test.go +++ b/src/compress/flate/huffman_bit_writer_test.go @@ -32,7 +32,9 @@ func TestBlockHuff(t *testing.T) { if strings.HasSuffix(in, ".in") { out = in[:len(in)-len(".in")] + ".golden" } - testBlockHuff(t, in, out) + t.Run(in, func(t *testing.T) { + testBlockHuff(t, in, out) + }) } } @@ -44,7 +46,8 @@ func testBlockHuff(t *testing.T, in, out string) { } var buf bytes.Buffer bw := newHuffmanBitWriter(&buf) - bw.writeBlockHuff(false, all) + bw.logNewTablePenalty = 8 + bw.writeBlockHuff(false, all, false) bw.flush() got := buf.Bytes() @@ -79,7 +82,7 @@ func testBlockHuff(t *testing.T, in, out string) { // Test if the writer produces the same output after reset. buf.Reset() bw.reset(&buf) - bw.writeBlockHuff(false, all) + bw.writeBlockHuff(false, all, false) bw.flush() got = buf.Bytes() if !bytes.Equal(got, want) { @@ -175,13 +178,23 @@ func TestWriteBlockDynamic(t *testing.T) { } } +// TestWriteBlockDynamic tests if the writeBlockDynamic encoding has changed. +// To update the reference files use the "-update" flag on the test. +func TestWriteBlockDynamicSync(t *testing.T) { + for _, test := range writeBlockTests { + testBlock(t, test, "sync") + } +} + // testBlock tests a block against its references, // or regenerate the references, if "-update" flag is set. func testBlock(t *testing.T, test huffTest, ttype string) { if test.want != "" { test.want = fmt.Sprintf(test.want, ttype) } + const gotSuffix = ".got" test.wantNoInput = fmt.Sprintf(test.wantNoInput, ttype) + tokens := indexTokens(test.tokens) if *update { if test.input != "" { t.Logf("Updating %q", test.want) @@ -198,7 +211,7 @@ func testBlock(t *testing.T, test huffTest, ttype string) { } defer f.Close() bw := newHuffmanBitWriter(f) - writeToType(t, ttype, bw, test.tokens, input) + writeToType(t, ttype, bw, tokens, input) } t.Logf("Updating %q", test.wantNoInput) @@ -209,7 +222,7 @@ func testBlock(t *testing.T, test huffTest, ttype string) { } defer f.Close() bw := newHuffmanBitWriter(f) - writeToType(t, ttype, bw, test.tokens, nil) + writeToType(t, ttype, bw, tokens, nil) return } @@ -227,12 +240,12 @@ func testBlock(t *testing.T, test huffTest, ttype string) { } var buf bytes.Buffer bw := newHuffmanBitWriter(&buf) - writeToType(t, ttype, bw, test.tokens, input) + writeToType(t, ttype, bw, tokens, input) got := buf.Bytes() if !bytes.Equal(got, want) { - t.Errorf("writeBlock did not yield expected result for file %q with input. See %q", test.want, test.want+".got") - if err := os.WriteFile(test.want+".got", got, 0666); err != nil { + t.Errorf("writeBlock did not yield expected result for file %q with input. See %q", test.want, test.want+gotSuffix) + if err := os.WriteFile(test.want+gotSuffix, got, 0666); err != nil { t.Error(err) } } @@ -241,12 +254,12 @@ func testBlock(t *testing.T, test huffTest, ttype string) { // Test if the writer produces the same output after reset. buf.Reset() bw.reset(&buf) - writeToType(t, ttype, bw, test.tokens, input) + writeToType(t, ttype, bw, tokens, input) bw.flush() got = buf.Bytes() if !bytes.Equal(got, want) { - t.Errorf("reset: writeBlock did not yield expected result for file %q with input. 
See %q", test.want, test.want+".reset.got") - if err := os.WriteFile(test.want+".reset.got", got, 0666); err != nil { + t.Errorf("reset: writeBlock did not yield expected result for file %q with input. See %q", test.want, test.want+".reset"+gotSuffix) + if err := os.WriteFile(test.want+".reset"+gotSuffix, got, 0666); err != nil { t.Error(err) } return @@ -262,12 +275,12 @@ func testBlock(t *testing.T, test huffTest, ttype string) { } var buf bytes.Buffer bw := newHuffmanBitWriter(&buf) - writeToType(t, ttype, bw, test.tokens, nil) + writeToType(t, ttype, bw, tokens, nil) got := buf.Bytes() if !bytes.Equal(got, wantNI) { - t.Errorf("writeBlock did not yield expected result for file %q with input. See %q", test.wantNoInput, test.wantNoInput+".got") - if err := os.WriteFile(test.want+".got", got, 0666); err != nil { + t.Errorf("writeBlock did not yield expected result for file %q with input. See %q", test.wantNoInput, test.wantNoInput+gotSuffix) + if err := os.WriteFile(test.wantNoInput+gotSuffix, got, 0666); err != nil { t.Error(err) } } else if got[0]&1 == 1 { @@ -280,12 +293,12 @@ func testBlock(t *testing.T, test huffTest, ttype string) { // Test if the writer produces the same output after reset. buf.Reset() bw.reset(&buf) - writeToType(t, ttype, bw, test.tokens, nil) + writeToType(t, ttype, bw, tokens, nil) bw.flush() got = buf.Bytes() if !bytes.Equal(got, wantNI) { - t.Errorf("reset: writeBlock did not yield expected result for file %q without input. See %q", test.want, test.want+".reset.got") - if err := os.WriteFile(test.want+".reset.got", got, 0666); err != nil { + t.Errorf("reset: writeBlock did not yield expected result for file %q without input. See %q", test.wantNoInput, test.wantNoInput+".reset"+gotSuffix) + if err := os.WriteFile(test.wantNoInput+".reset"+gotSuffix, got, 0666); err != nil { t.Error(err) } return @@ -294,12 +307,14 @@ func testBlock(t *testing.T, test huffTest, ttype string) { testWriterEOF(t, "wb", test, false) } -func writeToType(t *testing.T, ttype string, bw *huffmanBitWriter, tok []token, input []byte) { +func writeToType(t *testing.T, ttype string, bw *huffmanBitWriter, tok tokens, input []byte) { switch ttype { case "wb": - bw.writeBlock(tok, false, input) + bw.writeBlock(&tok, false, input) case "dyn": - bw.writeBlockDynamic(tok, false, input) + bw.writeBlockDynamic(&tok, false, input, false) + case "sync": + bw.writeBlockDynamic(&tok, false, input, true) default: panic("unknown test type") } @@ -332,13 +347,14 @@ func testWriterEOF(t *testing.T, ttype string, test huffTest, useInput bool) { } var buf bytes.Buffer bw := newHuffmanBitWriter(&buf) + tokens := indexTokens(test.tokens) switch ttype { case "wb": - bw.writeBlock(test.tokens, true, input) + bw.writeBlock(&tokens, true, input) case "dyn": - bw.writeBlockDynamic(test.tokens, true, input) + bw.writeBlockDynamic(&tokens, true, input, true) case "huff": - bw.writeBlockHuff(true, input) + bw.writeBlockHuff(true, input, true) default: panic("unknown test type") } diff --git a/src/compress/flate/huffman_code.go b/src/compress/flate/huffman_code.go index 6f69cabfd060d4..f3e202430736d3 100644 --- a/src/compress/flate/huffman_code.go +++ b/src/compress/flate/huffman_code.go @@ -7,25 +7,42 @@ package flate import ( "math" "math/bits" - "sort" +) + +const ( + maxBitsLimit = 16 + // number of valid literals + literalCount = 286 ) // hcode is a huffman code with a bit code and bit length. 
-type hcode struct { - code, len uint16 +type hcode uint32 + +func (h hcode) len() uint8 { + return uint8(h) +} + +func (h hcode) code64() uint64 { + return uint64(h >> 8) +} + +func (h hcode) zero() bool { + return h == 0 } type huffmanEncoder struct { - codes []hcode - freqcache []literalNode - bitCount [17]int32 - lns byLiteral // stored to avoid repeated allocation in generate - lfs byFreq // stored to avoid repeated allocation in generate + codes []hcode + bitCount [17]int32 + + // Allocate a reusable buffer with the longest possible frequency table. + // Possible lengths are codegenCodeCount, offsetCodeCount and literalCount. + // The largest of these is literalCount, so we allocate for that case. + freqcache [literalCount + 1]literalNode } type literalNode struct { literal uint16 - freq int32 + freq uint16 } // A levelInfo describes the state of the constructed tree for a given depth. @@ -49,25 +66,34 @@ type levelInfo struct { } // set sets the code and length of an hcode. -func (h *hcode) set(code uint16, length uint16) { - h.len = length - h.code = code +func (h *hcode) set(code uint16, length uint8) { + *h = hcode(length) | (hcode(code) << 8) +} + +func newhcode(code uint16, length uint8) hcode { + return hcode(length) | (hcode(code) << 8) +} + +func reverseBits(number uint16, bitLength byte) uint16 { + return bits.Reverse16(number << ((16 - bitLength) & 15)) } -func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxInt32} } +func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxUint16} } func newHuffmanEncoder(size int) *huffmanEncoder { - return &huffmanEncoder{codes: make([]hcode, size)} + // Make capacity to next power of two. + c := uint(bits.Len32(uint32(size - 1))) + return &huffmanEncoder{codes: make([]hcode, size, 1<= 3. +// canReuseBits returns the number of bits or math.MaxInt32 if the encoder cannot be reused. +func (h *huffmanEncoder) canReuseBits(freq []uint16) int { + var total int + for i, f := range freq { + if f != 0 { + code := h.codes[i] + if code.zero() { + return math.MaxInt32 + } + total += int(f) * int(code.len()) + } + } + return total +} + +// Return the number of literals assigned to each bit size in the Huffman encoding +// +// This method is only called when list.length >= 3 // The cases of 0, 1, and 2 literals are handled by special case code. // -// list is an array of the literals with non-zero frequencies -// and their associated frequencies. The array is in order of increasing -// frequency and has as its last element a special element with frequency -// MaxInt32. +// list An array of the literals with non-zero frequencies +// +// and their associated frequencies. The array is in order of increasing +// frequency, and has as its last element a special element with frequency +// MaxInt32 +// +// maxBits The maximum number of bits that should be used to encode any literal. +// +// Must be less than 16. // -// maxBits is the maximum number of bits that should be used to encode any literal. -// It must be less than 16. +// return An integer array in which array[i] indicates the number of literals // -// bitCounts returns an integer slice in which slice[i] indicates the number of literals -// that should be encoded in i bits. +// that should be encoded in i bits. 
func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { if maxBits >= maxBitsLimit { panic("flate: maxBits too large") @@ -154,14 +205,19 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { // of the level j ancestor. var leafCounts [maxBitsLimit][maxBitsLimit]int32 + // Descending to only have 1 bounds check. + l2f := int32(list[2].freq) + l1f := int32(list[1].freq) + l0f := int32(list[0].freq) + int32(list[1].freq) + for level := int32(1); level <= maxBits; level++ { // For every level, the first two items are the first two characters. // We initialize the levels as if we had already figured this out. levels[level] = levelInfo{ level: level, - lastFreq: list[1].freq, - nextCharFreq: list[2].freq, - nextPairFreq: list[0].freq + list[1].freq, + lastFreq: l1f, + nextCharFreq: l2f, + nextPairFreq: l0f, } leafCounts[level][level] = 2 if level == 1 { @@ -172,11 +228,11 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { // We need a total of 2*n - 2 items at top level and have already generated 2. levels[maxBits].needed = 2*n - 4 - level := maxBits - for { + level := uint32(maxBits) + for level < 16 { l := &levels[level] if l.nextPairFreq == math.MaxInt32 && l.nextCharFreq == math.MaxInt32 { - // We've run out of both leaves and pairs. + // We've run out of both leafs and pairs. // End all calculations for this level. // To make sure we never come back to this level or any lower level, // set nextPairFreq impossibly large. @@ -193,14 +249,21 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { l.lastFreq = l.nextCharFreq // Lower leafCounts are the same of the previous node. leafCounts[level][level] = n - l.nextCharFreq = list[n].freq + e := list[n] + if e.literal < math.MaxUint16 { + l.nextCharFreq = int32(e.freq) + } else { + l.nextCharFreq = math.MaxInt32 + } } else { // The next item on this row is a pair from the previous row. // nextPairFreq isn't valid until we generate two // more values in the level below l.lastFreq = l.nextPairFreq // Take leaf counts from the lower level, except counts[level] remains the same. - copy(leafCounts[level][:level], leafCounts[level-1][:level]) + save := leafCounts[level][level] + leafCounts[level] = leafCounts[level-1] + leafCounts[level][level] = save levels[l.level-1].needed = 2 } @@ -256,9 +319,9 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN // assigned in literal order (not frequency order). chunk := list[len(list)-int(bits):] - h.lns.sort(chunk) + sortByLiteral(chunk) for _, node := range chunk { - h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint16(n)} + h.codes[node.literal] = newhcode(reverseBits(code, uint8(n)), uint8(n)) code++ } list = list[0 : len(list)-int(bits)] @@ -268,15 +331,10 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN // Update this Huffman Code object to be the minimum code for the specified frequency count. // // freq is an array of frequencies, in which freq[i] gives the frequency of literal i. -// maxBits The maximum number of bits to use for any literal. -func (h *huffmanEncoder) generate(freq []int32, maxBits int32) { - if h.freqcache == nil { - // Allocate a reusable buffer with the longest possible frequency table. - // Possible lengths are codegenCodeCount, offsetCodeCount and maxNumLit. - // The largest of these is maxNumLit, so we allocate for that case. 
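assignEncodingAndSize above hands out consecutive codes within each bit length, visiting symbols in literal order, which is the canonical construction of RFC 1951 section 3.2.2. The same result, stated directly from per-symbol code lengths as a standalone sketch rather than via the package's bit-count machinery:

package main

import "fmt"

// canonicalCodes assigns canonical Huffman codes from per-symbol bit
// lengths (0 = symbol unused), following RFC 1951 section 3.2.2.
func canonicalCodes(lengths []uint8) []uint16 {
	const maxBits = 15
	var blCount [maxBits + 1]uint16
	for _, l := range lengths {
		blCount[l]++
	}
	blCount[0] = 0

	// Smallest code for each bit length.
	var nextCode [maxBits + 1]uint16
	code := uint16(0)
	for b := 1; b <= maxBits; b++ {
		code = (code + blCount[b-1]) << 1
		nextCode[b] = code
	}

	codes := make([]uint16, len(lengths))
	for sym, l := range lengths {
		if l != 0 {
			codes[sym] = nextCode[l]
			nextCode[l]++
		}
	}
	return codes
}

func main() {
	// The worked example from RFC 1951: lengths 3,3,3,3,3,2,4,4 yield
	// codes 010,011,100,101,110,00,1110,1111.
	fmt.Println(canonicalCodes([]uint8{3, 3, 3, 3, 3, 2, 4, 4})) // [2 3 4 5 6 0 14 15]
}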
- h.freqcache = make([]literalNode, maxNumLit+1) - } +// maxBits is the maximum number of bits to use for any literal. +func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) { list := h.freqcache[:len(freq)+1] + codes := h.codes[:len(freq)] // Number of non-zero literals count := 0 // Set list to be the set of all non-zero literals and their frequencies @@ -285,9 +343,10 @@ func (h *huffmanEncoder) generate(freq []int32, maxBits int32) { list[count] = literalNode{uint16(i), f} count++ } else { - h.codes[i].len = 0 + codes[i] = 0 } } + list[count] = literalNode{} list = list[:count] if count <= 2 { @@ -299,7 +358,7 @@ func (h *huffmanEncoder) generate(freq []int32, maxBits int32) { } return } - h.lfs.sort(list) + sortByFreq(list) // Get the number of literals for each bit count bitCount := h.bitCounts(list, maxBits) @@ -307,39 +366,43 @@ func (h *huffmanEncoder) generate(freq []int32, maxBits int32) { h.assignEncodingAndSize(bitCount, list) } -type byLiteral []literalNode - -func (s *byLiteral) sort(a []literalNode) { - *s = byLiteral(a) - sort.Sort(s) +// atLeastOne clamps the result between 1 and 15. +func atLeastOne(v float32) float32 { + return min(15, max(1, v)) } -func (s byLiteral) Len() int { return len(s) } - -func (s byLiteral) Less(i, j int) bool { - return s[i].literal < s[j].literal -} - -func (s byLiteral) Swap(i, j int) { s[i], s[j] = s[j], s[i] } - -type byFreq []literalNode - -func (s *byFreq) sort(a []literalNode) { - *s = byFreq(a) - sort.Sort(s) -} - -func (s byFreq) Len() int { return len(s) } - -func (s byFreq) Less(i, j int) bool { - if s[i].freq == s[j].freq { - return s[i].literal < s[j].literal +func histogram(b []byte, h []uint16) { + if len(b) >= 8<<10 { + // Split for bigger inputs + histogramSplit(b, h) + } else { + h = h[:256] + for _, t := range b { + h[t]++ + } } - return s[i].freq < s[j].freq } -func (s byFreq) Swap(i, j int) { s[i], s[j] = s[j], s[i] } - -func reverseBits(number uint16, bitLength byte) uint16 { - return bits.Reverse16(number << (16 - bitLength)) +func histogramSplit(b []byte, h []uint16) { + // Tested, and slightly faster than 2-way. + // Writing to separate arrays and combining is also slightly slower. + h = h[:256] + // Make size divisible by 4 + for len(b)&3 != 0 { + h[b[0]]++ + b = b[1:] + } + n := len(b) / 4 + x, y, z, w := b[:n], b[n:], b[n+n:], b[n+n+n:] + y, z, w = y[:len(x)], z[:len(x)], w[:len(x)] + for i, t := range x { + v0 := &h[t] + v1 := &h[y[i]] + v3 := &h[w[i]] + v2 := &h[z[i]] + *v0++ + *v1++ + *v2++ + *v3++ + } } diff --git a/src/compress/flate/huffman_sortByFreq.go b/src/compress/flate/huffman_sortByFreq.go new file mode 100644 index 00000000000000..6c05ba8c1c2e2a --- /dev/null +++ b/src/compress/flate/huffman_sortByFreq.go @@ -0,0 +1,159 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Sort sorts data. +// It makes one call to data.Len to determine n, and O(n*log(n)) calls to +// data.Less and data.Swap. The sort is not guaranteed to be stable. +func sortByFreq(data []literalNode) { + n := len(data) + quickSortByFreq(data, 0, n, maxDepth(n)) +} + +func quickSortByFreq(data []literalNode, a, b, maxDepth int) { + for b-a > 12 { // Use ShellSort for slices <= 12 elements + if maxDepth == 0 { + heapSort(data, a, b) + return + } + maxDepth-- + mlo, mhi := doPivotByFreq(data, a, b) + // Avoiding recursion on the larger subproblem guarantees + // a stack depth of at most lg(b-a). 
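histogramSplit above counts four interleaved quarters of the input per iteration, so successive increments rarely land on the same counter back to back. A self-contained variant of the same idea (uint32 counters here; the patch uses uint16 counters and explicit pointer increments):

package main

import (
	"bytes"
	"fmt"
)

// histogram4 counts byte frequencies four elements at a time, using four
// independent index streams to reduce serial dependencies on the counters.
func histogram4(b []byte, h *[256]uint32) {
	for len(b)&3 != 0 { // peel until the length is a multiple of 4
		h[b[0]]++
		b = b[1:]
	}
	n := len(b) / 4
	x, y, z, w := b[:n], b[n:2*n], b[2*n:3*n], b[3*n:]
	for i := range x {
		h[x[i]]++
		h[y[i]]++
		h[z[i]]++
		h[w[i]]++
	}
}

func main() {
	data := bytes.Repeat([]byte("abcabcabd"), 1000)
	var fast, ref [256]uint32
	histogram4(data, &fast)
	for _, c := range data {
		ref[c]++
	}
	fmt.Println("equal:", fast == ref) // equal: true
}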
+ if mlo-a < b-mhi { + quickSortByFreq(data, a, mlo, maxDepth) + a = mhi // i.e., quickSortByFreq(data, mhi, b) + } else { + quickSortByFreq(data, mhi, b, maxDepth) + b = mlo // i.e., quickSortByFreq(data, a, mlo) + } + } + if b-a > 1 { + // Do ShellSort pass with gap 6 + // It could be written in this simplified form cause b-a <= 12 + for i := a + 6; i < b; i++ { + if data[i].freq == data[i-6].freq && data[i].literal < data[i-6].literal || data[i].freq < data[i-6].freq { + data[i], data[i-6] = data[i-6], data[i] + } + } + insertionSortByFreq(data, a, b) + } +} + +func doPivotByFreq(data []literalNode, lo, hi int) (midlo, midhi int) { + m := int(uint(lo+hi) >> 1) // Written like this to avoid integer overflow. + if hi-lo > 40 { + // Tukey's ``Ninther,'' median of three medians of three. + s := (hi - lo) / 8 + medianOfThreeSortByFreq(data, lo, lo+s, lo+2*s) + medianOfThreeSortByFreq(data, m, m-s, m+s) + medianOfThreeSortByFreq(data, hi-1, hi-1-s, hi-1-2*s) + } + medianOfThreeSortByFreq(data, lo, m, hi-1) + + // Invariants are: + // data[lo] = pivot (set up by ChoosePivot) + // data[lo < i < a] < pivot + // data[a <= i < b] <= pivot + // data[b <= i < c] unexamined + // data[c <= i < hi-1] > pivot + // data[hi-1] >= pivot + pivot := lo + a, c := lo+1, hi-1 + + for ; a < c && (data[a].freq == data[pivot].freq && data[a].literal < data[pivot].literal || data[a].freq < data[pivot].freq); a++ { + } + b := a + for { + for ; b < c && (data[pivot].freq == data[b].freq && data[pivot].literal > data[b].literal || data[pivot].freq > data[b].freq); b++ { // data[b] <= pivot + } + for ; b < c && (data[pivot].freq == data[c-1].freq && data[pivot].literal < data[c-1].literal || data[pivot].freq < data[c-1].freq); c-- { // data[c-1] > pivot + } + if b >= c { + break + } + // data[b] > pivot; data[c-1] <= pivot + data[b], data[c-1] = data[c-1], data[b] + b++ + c-- + } + // If hi-c<3 then there are duplicates (by property of median of nine). + // Let's be a bit more conservative, and set border to 5. 
+ protect := hi-c < 5 + if !protect && hi-c < (hi-lo)/4 { + // Lets test some points for equality to pivot + dups := 0 + if data[pivot].freq == data[hi-1].freq && data[pivot].literal > data[hi-1].literal || data[pivot].freq > data[hi-1].freq { // data[hi-1] = pivot + data[c], data[hi-1] = data[hi-1], data[c] + c++ + dups++ + } + if data[b-1].freq == data[pivot].freq && data[b-1].literal > data[pivot].literal || data[b-1].freq > data[pivot].freq { // data[b-1] = pivot + b-- + dups++ + } + // m-lo = (hi-lo)/2 > 6 + // b-lo > (hi-lo)*3/4-1 > 8 + // ==> m < b ==> data[m] <= pivot + if data[m].freq == data[pivot].freq && data[m].literal > data[pivot].literal || data[m].freq > data[pivot].freq { // data[m] = pivot + data[m], data[b-1] = data[b-1], data[m] + b-- + dups++ + } + // if at least 2 points are equal to pivot, assume skewed distribution + protect = dups > 1 + } + if protect { + // Protect against a lot of duplicates + // Add invariant: + // data[a <= i < b] unexamined + // data[b <= i < c] = pivot + for { + for ; a < b && (data[b-1].freq == data[pivot].freq && data[b-1].literal > data[pivot].literal || data[b-1].freq > data[pivot].freq); b-- { // data[b] == pivot + } + for ; a < b && (data[a].freq == data[pivot].freq && data[a].literal < data[pivot].literal || data[a].freq < data[pivot].freq); a++ { // data[a] < pivot + } + if a >= b { + break + } + // data[a] == pivot; data[b-1] < pivot + data[a], data[b-1] = data[b-1], data[a] + a++ + b-- + } + } + // Swap pivot into middle + data[pivot], data[b-1] = data[b-1], data[pivot] + return b - 1, c +} + +// Insertion sort +func insertionSortByFreq(data []literalNode, a, b int) { + for i := a + 1; i < b; i++ { + for j := i; j > a && (data[j].freq == data[j-1].freq && data[j].literal < data[j-1].literal || data[j].freq < data[j-1].freq); j-- { + data[j], data[j-1] = data[j-1], data[j] + } + } +} + +// quickSortByFreq, loosely following Bentley and McIlroy, +// ``Engineering a Sort Function,'' SP&E November 1993. + +// medianOfThreeSortByFreq moves the median of the three values data[m0], data[m1], data[m2] into data[m1]. +func medianOfThreeSortByFreq(data []literalNode, m1, m0, m2 int) { + // sort 3 elements + if data[m1].freq == data[m0].freq && data[m1].literal < data[m0].literal || data[m1].freq < data[m0].freq { + data[m1], data[m0] = data[m0], data[m1] + } + // data[m0] <= data[m1] + if data[m2].freq == data[m1].freq && data[m2].literal < data[m1].literal || data[m2].freq < data[m1].freq { + data[m2], data[m1] = data[m1], data[m2] + // data[m0] <= data[m2] && data[m1] < data[m2] + if data[m1].freq == data[m0].freq && data[m1].literal < data[m0].literal || data[m1].freq < data[m0].freq { + data[m1], data[m0] = data[m0], data[m1] + } + } + // now data[m0] <= data[m1] <= data[m2] +} diff --git a/src/compress/flate/huffman_sortByLiteral.go b/src/compress/flate/huffman_sortByLiteral.go new file mode 100644 index 00000000000000..93f1aea109e123 --- /dev/null +++ b/src/compress/flate/huffman_sortByLiteral.go @@ -0,0 +1,201 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Sort sorts data. +// It makes one call to data.Len to determine n, and O(n*log(n)) calls to +// data.Less and data.Swap. The sort is not guaranteed to be stable. 
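The specialized quicksort above exists so the encoder can sort literal nodes without the allocations and interface calls of sort.Sort. The ordering it implements, ascending frequency with the literal value as tiebreak, is the same as this sort.Slice call, shown only to make the predicate explicit:

package main

import (
	"fmt"
	"sort"
)

type literalNode struct {
	literal uint16
	freq    uint16
}

// sortByFreqSimple expresses the same ordering as sortByFreq above:
// ascending frequency, literal value as the tiebreak.
func sortByFreqSimple(data []literalNode) {
	sort.Slice(data, func(i, j int) bool {
		if data[i].freq == data[j].freq {
			return data[i].literal < data[j].literal
		}
		return data[i].freq < data[j].freq
	})
}

func main() {
	nodes := []literalNode{{literal: 'b', freq: 3}, {literal: 'a', freq: 3}, {literal: 'z', freq: 1}}
	sortByFreqSimple(nodes)
	fmt.Println(nodes) // [{122 1} {97 3} {98 3}]
}

The long freq-equal-and-literal-less-or-freq-less comparisons inside doPivotByFreq and insertionSortByFreq are this predicate inlined by hand.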
+func sortByLiteral(data []literalNode) { + n := len(data) + quickSort(data, 0, n, maxDepth(n)) +} + +func quickSort(data []literalNode, a, b, maxDepth int) { + for b-a > 12 { // Use ShellSort for slices <= 12 elements + if maxDepth == 0 { + heapSort(data, a, b) + return + } + maxDepth-- + mlo, mhi := doPivot(data, a, b) + // Avoiding recursion on the larger subproblem guarantees + // a stack depth of at most lg(b-a). + if mlo-a < b-mhi { + quickSort(data, a, mlo, maxDepth) + a = mhi // i.e., quickSort(data, mhi, b) + } else { + quickSort(data, mhi, b, maxDepth) + b = mlo // i.e., quickSort(data, a, mlo) + } + } + if b-a > 1 { + // Do ShellSort pass with gap 6 + // It could be written in this simplified form cause b-a <= 12 + for i := a + 6; i < b; i++ { + if data[i].literal < data[i-6].literal { + data[i], data[i-6] = data[i-6], data[i] + } + } + insertionSort(data, a, b) + } +} +func heapSort(data []literalNode, a, b int) { + first := a + lo := 0 + hi := b - a + + // Build heap with greatest element at top. + for i := (hi - 1) / 2; i >= 0; i-- { + siftDown(data, i, hi, first) + } + + // Pop elements, largest first, into end of data. + for i := hi - 1; i >= 0; i-- { + data[first], data[first+i] = data[first+i], data[first] + siftDown(data, lo, i, first) + } +} + +// siftDown implements the heap property on data[lo, hi). +// first is an offset into the array where the root of the heap lies. +func siftDown(data []literalNode, lo, hi, first int) { + root := lo + for { + child := 2*root + 1 + if child >= hi { + break + } + if child+1 < hi && data[first+child].literal < data[first+child+1].literal { + child++ + } + if data[first+root].literal > data[first+child].literal { + return + } + data[first+root], data[first+child] = data[first+child], data[first+root] + root = child + } +} +func doPivot(data []literalNode, lo, hi int) (midlo, midhi int) { + m := int(uint(lo+hi) >> 1) // Written like this to avoid integer overflow. + if hi-lo > 40 { + // Tukey's ``Ninther,'' median of three medians of three. + s := (hi - lo) / 8 + medianOfThree(data, lo, lo+s, lo+2*s) + medianOfThree(data, m, m-s, m+s) + medianOfThree(data, hi-1, hi-1-s, hi-1-2*s) + } + medianOfThree(data, lo, m, hi-1) + + // Invariants are: + // data[lo] = pivot (set up by ChoosePivot) + // data[lo < i < a] < pivot + // data[a <= i < b] <= pivot + // data[b <= i < c] unexamined + // data[c <= i < hi-1] > pivot + // data[hi-1] >= pivot + pivot := lo + a, c := lo+1, hi-1 + + for ; a < c && data[a].literal < data[pivot].literal; a++ { + } + b := a + for { + for ; b < c && data[pivot].literal > data[b].literal; b++ { // data[b] <= pivot + } + for ; b < c && data[pivot].literal < data[c-1].literal; c-- { // data[c-1] > pivot + } + if b >= c { + break + } + // data[b] > pivot; data[c-1] <= pivot + data[b], data[c-1] = data[c-1], data[b] + b++ + c-- + } + // If hi-c<3 then there are duplicates (by property of median of nine). + // Let's be a bit more conservative, and set border to 5. 
+ protect := hi-c < 5 + if !protect && hi-c < (hi-lo)/4 { + // Lets test some points for equality to pivot + dups := 0 + if data[pivot].literal > data[hi-1].literal { // data[hi-1] = pivot + data[c], data[hi-1] = data[hi-1], data[c] + c++ + dups++ + } + if data[b-1].literal > data[pivot].literal { // data[b-1] = pivot + b-- + dups++ + } + // m-lo = (hi-lo)/2 > 6 + // b-lo > (hi-lo)*3/4-1 > 8 + // ==> m < b ==> data[m] <= pivot + if data[m].literal > data[pivot].literal { // data[m] = pivot + data[m], data[b-1] = data[b-1], data[m] + b-- + dups++ + } + // if at least 2 points are equal to pivot, assume skewed distribution + protect = dups > 1 + } + if protect { + // Protect against a lot of duplicates + // Add invariant: + // data[a <= i < b] unexamined + // data[b <= i < c] = pivot + for { + for ; a < b && data[b-1].literal > data[pivot].literal; b-- { // data[b] == pivot + } + for ; a < b && data[a].literal < data[pivot].literal; a++ { // data[a] < pivot + } + if a >= b { + break + } + // data[a] == pivot; data[b-1] < pivot + data[a], data[b-1] = data[b-1], data[a] + a++ + b-- + } + } + // Swap pivot into middle + data[pivot], data[b-1] = data[b-1], data[pivot] + return b - 1, c +} + +// Insertion sort +func insertionSort(data []literalNode, a, b int) { + for i := a + 1; i < b; i++ { + for j := i; j > a && data[j].literal < data[j-1].literal; j-- { + data[j], data[j-1] = data[j-1], data[j] + } + } +} + +// maxDepth returns a threshold at which quicksort should switch +// to heapsort. It returns 2*ceil(lg(n+1)). +func maxDepth(n int) int { + var depth int + for i := n; i > 0; i >>= 1 { + depth++ + } + return depth * 2 +} + +// medianOfThree moves the median of the three values data[m0], data[m1], data[m2] into data[m1]. +func medianOfThree(data []literalNode, m1, m0, m2 int) { + // sort 3 elements + if data[m1].literal < data[m0].literal { + data[m1], data[m0] = data[m0], data[m1] + } + // data[m0] <= data[m1] + if data[m2].literal < data[m1].literal { + data[m2], data[m1] = data[m1], data[m2] + // data[m0] <= data[m2] && data[m1] < data[m2] + if data[m1].literal < data[m0].literal { + data[m1], data[m0] = data[m0], data[m1] + } + } + // now data[m0] <= data[m1] <= data[m2] +} diff --git a/src/compress/flate/level1.go b/src/compress/flate/level1.go new file mode 100644 index 00000000000000..2195df4fa38f93 --- /dev/null +++ b/src/compress/flate/level1.go @@ -0,0 +1,197 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 1 uses a single small table with 5 byte hashes. +type fastEncL1 struct { + fastGen + table [tableSize]tableEntry +} + +func (e *fastEncL1) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + hashBytes = 5 + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. 
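Each fast level opens with the same guard seen above: once e.cur nears bufferReset, every stored table offset is rebased so that entries older than the window collapse to zero (meaning "no candidate") while recent entries keep their relative distance. The rebase rule in isolation, a sketch that assumes only the 32 KiB window constant (bufferReset itself is defined elsewhere in the patch):

package main

import "fmt"

const maxMatchOffset = 1 << 15 // window size; matches farther back are invalid

// rebase shifts a stored absolute offset so that, after the cursor is
// reset to maxMatchOffset, entries outside the window become zero and
// entries inside it keep pointing at the same history position.
func rebase(offset, cur, histLen int32) int32 {
	minOff := cur + histLen - maxMatchOffset
	if offset <= minOff {
		return 0
	}
	return offset - cur + maxMatchOffset
}

func main() {
	cur, histLen := int32(1<<30), int32(4096)
	fmt.Println(rebase(cur+100, cur, histLen))            // recent entry survives: 32868
	fmt.Println(rebase(cur-maxMatchOffset, cur, histLen)) // stale entry cleared: 0
}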
+ dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + + // nextEmit is where in src the next emitLiterals should start from. + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + cv := loadLE64(src, s) + + for { + const skipLog = 5 + const doEvery = 2 + + nextS := s + var candidate tableEntry + var t int32 + for { + nextHash := hashLen(cv, tableBits, hashBytes) + candidate = e.table[nextHash] + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + + now := loadLE64(src, nextS) + e.table[nextHash] = tableEntry{offset: s + e.cur} + nextHash = hashLen(now, tableBits, hashBytes) + t = candidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + e.table[nextHash] = tableEntry{offset: nextS + e.cur} + break + } + + // Do one right away... + cv = now + s = nextS + nextS++ + candidate = e.table[nextHash] + now >>= 8 + e.table[nextHash] = tableEntry{offset: s + e.cur} + + t = candidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + e.table[nextHash] = tableEntry{offset: nextS + e.cur} + break + } + cv = now + s = nextS + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + + // Extend the 4-byte match as long as possible. + l := e.matchlenLong(int(s+4), int(t+4), src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && loadLE8(src, t-1) == loadLE8(src, s-1) { + s-- + t-- + l++ + } + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + // Save the match found. Same as 'dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))' + xOffset := uint32(s - t - baseMatchOffset) + xLength := l + oc := offsetCode(xOffset) + xOffset |= oc << 16 + for xLength > 0 { + xl := xLength + if xl > 258 { + if xl > 258+baseMatchLength { + xl = 258 + } else { + xl = 258 - baseMatchLength + } + } + xLength -= xl + xl -= baseMatchLength + dst.extraHist[lengthCodes1[uint8(xl)]]++ + dst.offHist[oc]++ + dst.tokens[dst.n] = token(matchType | uint32(xl)<= s { + s = nextS + 1 + } + if s >= sLimit { + // Index first pair after match end. + if int(s+l+8) < len(src) { + cv := loadLE64(src, s) + e.table[hashLen(cv, tableBits, hashBytes)] = tableEntry{offset: s + e.cur} + } + goto emitRemainder + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 and at s. If + // another emitCopy is not our next move, also calculate nextHash + // at s+1. At least on GOARCH=amd64, these three hash calculations + // are faster as one load64 call (with some shifts) instead of + // three load32 calls. 
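The search loops above advance by nextS = s + doEvery + (s-nextEmit)>>skipLog, so the probe step grows the longer the encoder goes without emitting a match. A small illustration with the level 1 constants (doEvery = 2, skipLog = 5):

package main

import "fmt"

// step reproduces the advance rule of the level 1 search loop: the further
// s has moved past the last emit without a match, the larger the jump.
func step(s, nextEmit int32) int32 {
	const doEvery = 2
	const skipLog = 5
	return doEvery + (s-nextEmit)>>skipLog
}

func main() {
	for _, gap := range []int32{0, 32, 256, 1024} {
		fmt.Printf("bytes since last emit %4d -> advance by %d\n", gap, step(gap, 0))
	}
}

Incompressible data is therefore skimmed in growing strides instead of being probed byte by byte.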
+ x := loadLE64(src, s-2) + o := e.cur + s - 2 + prevHash := hashLen(x, tableBits, hashBytes) + e.table[prevHash] = tableEntry{offset: o} + x >>= 16 + currHash := hashLen(x, tableBits, hashBytes) + candidate = e.table[currHash] + e.table[currHash] = tableEntry{offset: o + 2} + + t = candidate.offset - e.cur + if s-t > maxMatchOffset || uint32(x) != loadLE32(src, t) { + cv = x >> 8 + s++ + break + } + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/level2.go b/src/compress/flate/level2.go new file mode 100644 index 00000000000000..7a2fdf7abe6ddb --- /dev/null +++ b/src/compress/flate/level2.go @@ -0,0 +1,187 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 2 uses a similar algorithm to level 1, but with a larger table. +type fastEncL2 struct { + fastGen + table [bTableSize]tableEntry +} + +func (e *fastEncL2) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + hashBytes = 5 + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + + // nextEmit is where in src the next emitLiterals should start from. + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + cv := loadLE64(src, s) + for { + // When should we start skipping if we haven't found matches in a long while. + const skipLog = 5 + const doEvery = 2 + + nextS := s + var candidate tableEntry + for { + nextHash := hashLen(cv, bTableBits, hashBytes) + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + candidate = e.table[nextHash] + now := loadLE64(src, nextS) + e.table[nextHash] = tableEntry{offset: s + e.cur} + nextHash = hashLen(now, bTableBits, hashBytes) + + offset := s - (candidate.offset - e.cur) + if offset < maxMatchOffset && uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + e.table[nextHash] = tableEntry{offset: nextS + e.cur} + break + } + + // Do one right away... + cv = now + s = nextS + nextS++ + candidate = e.table[nextHash] + now >>= 8 + e.table[nextHash] = tableEntry{offset: s + e.cur} + + offset = s - (candidate.offset - e.cur) + if offset < maxMatchOffset && uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + break + } + cv = now + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes match. + for { + // Extend the 4-byte match as long as possible. 
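The block above refreshes the hash table at s-2 and s from one 8-byte load, deriving the second key by shifting rather than issuing another load. A sketch of the same trick; loadLE64 mirrors the patch's helper, while hash5 is only a stand-in for hashLen, whose exact mixing constant is not shown in this hunk:

package main

import (
	"encoding/binary"
	"fmt"
)

const tableBits = 15 // illustrative table size exponent

// loadLE64 reads 8 little-endian bytes starting at i.
func loadLE64(b []byte, i int) uint64 { return binary.LittleEndian.Uint64(b[i:]) }

// hash5 is a stand-in for hashLen with hashBytes=5: keep only the low
// 5 bytes and mix them with a multiplicative hash.
func hash5(u uint64) uint32 {
	const prime = 889523592379 // any odd mixing constant works for the sketch
	return uint32(((u << 24) * prime) >> (64 - tableBits))
}

func main() {
	src := []byte("abcdefghijklmnop")
	x := loadLE64(src, 2) // covers src[2:10]
	h0 := hash5(x)        // hash of the 5 bytes at position 2
	h1 := hash5(x >> 16)  // hash of the 5 bytes at position 4, no second load
	fmt.Println(h0, h1)
}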
+ t := candidate.offset - e.cur + l := e.matchlenLong(int(s+4), int(t+4), src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + // Index first pair after match end. + if int(s+l+8) < len(src) { + cv := loadLE64(src, s) + e.table[hashLen(cv, bTableBits, hashBytes)] = tableEntry{offset: s + e.cur} + } + goto emitRemainder + } + + // Store every second hash in-between, but offset by 1. + for i := s - l + 2; i < s-5; i += 7 { + x := loadLE64(src, i) + nextHash := hashLen(x, bTableBits, hashBytes) + e.table[nextHash] = tableEntry{offset: e.cur + i} + // Skip one + x >>= 16 + nextHash = hashLen(x, bTableBits, hashBytes) + e.table[nextHash] = tableEntry{offset: e.cur + i + 2} + // Skip one + x >>= 16 + nextHash = hashLen(x, bTableBits, hashBytes) + e.table[nextHash] = tableEntry{offset: e.cur + i + 4} + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 to s. If + // another emitCopy is not our next move, also calculate nextHash + // at s+1. + x := loadLE64(src, s-2) + o := e.cur + s - 2 + prevHash := hashLen(x, bTableBits, hashBytes) + prevHash2 := hashLen(x>>8, bTableBits, hashBytes) + e.table[prevHash] = tableEntry{offset: o} + e.table[prevHash2] = tableEntry{offset: o + 1} + currHash := hashLen(x>>16, bTableBits, hashBytes) + candidate = e.table[currHash] + e.table[currHash] = tableEntry{offset: o + 2} + + offset := s - (candidate.offset - e.cur) + if offset > maxMatchOffset || uint32(x>>16) != loadLE32(src, candidate.offset-e.cur) { + cv = x >> 24 + s++ + break + } + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/level3.go b/src/compress/flate/level3.go new file mode 100644 index 00000000000000..adda8714879c8d --- /dev/null +++ b/src/compress/flate/level3.go @@ -0,0 +1,226 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 3 uses a similar algorithm to level 2, with a smaller table, +// but will check up two candidates for each iteration with more +// entries added to the table. +type fastEncL3 struct { + fastGen + table [1 << 16]tableEntryPrev +} + +func (e *fastEncL3) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + tableBits = 16 + hashBytes = 5 + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntryPrev{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. 
+ minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i] + if v.Cur.offset <= minOff { + v.Cur.offset = 0 + } else { + v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset + } + if v.Prev.offset <= minOff { + v.Prev.offset = 0 + } else { + v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset + } + e.table[i] = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // Skip if too small. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiterals should start from. + cv := loadLE64(src, s) + for { + const skipLog = 7 + nextS := s + var candidate tableEntry + for { + nextHash := hashLen(cv, tableBits, hashBytes) + s = nextS + nextS = s + 1 + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + candidates := e.table[nextHash] + now := loadLE64(src, nextS) + + // Safe offset distance until s + 4... + minOffset := e.cur + s - (maxMatchOffset - 4) + e.table[nextHash] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur}} + + // Check both candidates + candidate = candidates.Cur + if candidate.offset < minOffset { + cv = now + // Previous will also be invalid, we have nothing. + continue + } + + if uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + if candidates.Prev.offset < minOffset || uint32(cv) != loadLE32(src, candidates.Prev.offset-e.cur) { + break + } + // Both match and are valid, pick longest. + offset := s - (candidate.offset - e.cur) + o2 := s - (candidates.Prev.offset - e.cur) + l1, l2 := matchLen(src[s+4:], src[s-offset+4:]), matchLen(src[s+4:], src[s-o2+4:]) + if l2 > l1 { + candidate = candidates.Prev + } + break + } else { + // We only check if value mismatches. + // Offset will always be invalid in other cases. + candidate = candidates.Prev + if candidate.offset > minOffset && uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + break + } + } + cv = now + } + + for { + // Extend the 4-byte match as long as possible. + // + t := candidate.offset - e.cur + l := e.matchlenLong(int(s+4), int(t+4), src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + // Emit literals. + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + // Emit match. + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + t += l + // Index first pair after match end. + if int(t+8) < len(src) && t > 0 { + cv = loadLE64(src, t) + nextHash := hashLen(cv, tableBits, hashBytes) + e.table[nextHash] = tableEntryPrev{ + Prev: e.table[nextHash].Cur, + Cur: tableEntry{offset: e.cur + t}, + } + } + goto emitRemainder + } + + // Store every 5th hash in-between. + for i := s - l + 2; i < s-5; i += 6 { + nextHash := hashLen(loadLE64(src, i), tableBits, hashBytes) + e.table[nextHash] = tableEntryPrev{ + Prev: e.table[nextHash].Cur, + Cur: tableEntry{offset: e.cur + i}} + } + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 to s. 
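Level 3 above keys its table on tableEntryPrev buckets, so each lookup can test both the current and the previous position that hashed to the slot. The bucket discipline in isolation:

package main

import "fmt"

// tableEntry and tableEntryPrev mirror the level 3 bucket shape above:
// each bucket remembers the current and the previous position that hashed
// there, giving two candidates per lookup.
type tableEntry struct{ offset int32 }

type tableEntryPrev struct {
	Cur  tableEntry
	Prev tableEntry
}

// insert pushes a new position into the bucket, demoting Cur to Prev.
func insert(b *tableEntryPrev, pos int32) {
	b.Prev = b.Cur
	b.Cur = tableEntry{offset: pos}
}

func main() {
	var bucket tableEntryPrev
	insert(&bucket, 100)
	insert(&bucket, 250)
	// A lookup now checks offset 250 first and falls back to 100.
	fmt.Printf("Cur=%d Prev=%d\n", bucket.Cur.offset, bucket.Prev.offset)
}

When both candidates are valid, the encoder keeps whichever yields the longer match, as in the l1/l2 comparison above.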
+ x := loadLE64(src, s-2) + prevHash := hashLen(x, tableBits, hashBytes) + + e.table[prevHash] = tableEntryPrev{ + Prev: e.table[prevHash].Cur, + Cur: tableEntry{offset: e.cur + s - 2}, + } + x >>= 8 + prevHash = hashLen(x, tableBits, hashBytes) + + e.table[prevHash] = tableEntryPrev{ + Prev: e.table[prevHash].Cur, + Cur: tableEntry{offset: e.cur + s - 1}, + } + x >>= 8 + currHash := hashLen(x, tableBits, hashBytes) + candidates := e.table[currHash] + cv = x + e.table[currHash] = tableEntryPrev{ + Prev: candidates.Cur, + Cur: tableEntry{offset: s + e.cur}, + } + + // Check both candidates + candidate = candidates.Cur + minOffset := e.cur + s - (maxMatchOffset - 4) + + if candidate.offset > minOffset { + if uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + // Found a match... + continue + } + candidate = candidates.Prev + if candidate.offset > minOffset && uint32(cv) == loadLE32(src, candidate.offset-e.cur) { + // Match at prev... + continue + } + } + cv = x >> 8 + s++ + break + } + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/level4.go b/src/compress/flate/level4.go new file mode 100644 index 00000000000000..ceb899793e3148 --- /dev/null +++ b/src/compress/flate/level4.go @@ -0,0 +1,204 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 4 uses two tables, one for short (4 bytes) and one for long (7 bytes) matches. +type fastEncL4 struct { + fastGen + table [tableSize]tableEntry + bTable [tableSize]tableEntry +} + +func (e *fastEncL4) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + hashShortBytes = 4 + ) + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.bTable[:] { + e.bTable[i] = tableEntry{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + for i := range e.bTable[:] { + v := e.bTable[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.bTable[i].offset = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiterals should start from. 
+ cv := loadLE64(src, s) + for { + const skipLog = 6 + const doEvery = 1 + + nextS := s + var t int32 + for { + nextHashS := hashLen(cv, tableBits, hashShortBytes) + nextHashL := hashLen(cv, tableBits, hashLongBytes) + + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + // Fetch a short+long candidate + sCandidate := e.table[nextHashS] + lCandidate := e.bTable[nextHashL] + next := loadLE64(src, nextS) + entry := tableEntry{offset: s + e.cur} + e.table[nextHashS] = entry + e.bTable[nextHashL] = entry + + t = lCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // We got a long match. Use that. + break + } + + t = sCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // Found a 4 match... + lCandidate = e.bTable[hashLen(next, tableBits, hashLongBytes)] + + // If the next long is a candidate, check if we should use that instead... + lOff := lCandidate.offset - e.cur + if nextS-lOff < maxMatchOffset && loadLE32(src, lOff) == uint32(next) { + l1, l2 := matchLen(src[s+4:], src[t+4:]), matchLen(src[nextS+4:], src[nextS-lOff+4:]) + if l2 > l1 { + s = nextS + t = lCandidate.offset - e.cur + } + } + break + } + cv = next + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + // Extend the 4-byte match as long as possible. + l := e.matchlenLong(int(s+4), int(t+4), src) + 4 + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + // Index first pair after match end. + if int(s+8) < len(src) { + cv := loadLE64(src, s) + e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: s + e.cur} + e.bTable[hashLen(cv, tableBits, hashLongBytes)] = tableEntry{offset: s + e.cur} + } + goto emitRemainder + } + + // Store every 3rd hash in-between + i := nextS + if i < s-1 { + cv := loadLE64(src, i) + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} + e.bTable[hashLen(cv, tableBits, hashLongBytes)] = t + e.bTable[hashLen(cv>>8, tableBits, hashLongBytes)] = t2 + e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 + + i += 3 + for ; i < s-1; i += 3 { + cv := loadLE64(src, i) + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} + e.bTable[hashLen(cv, tableBits, hashLongBytes)] = t + e.bTable[hashLen(cv>>8, tableBits, hashLongBytes)] = t2 + e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 + } + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-1 and at s. + x := loadLE64(src, s-1) + o := e.cur + s - 1 + prevHashS := hashLen(x, tableBits, hashShortBytes) + prevHashL := hashLen(x, tableBits, hashLongBytes) + e.table[prevHashS] = tableEntry{offset: o} + e.bTable[prevHashL] = tableEntry{offset: o} + cv = x >> 8 + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. 
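Level 4 above maintains two tables, one keyed on a 4-byte hash and one on a longer hash, and prefers the long-hash candidate because a hit there implies more matching context. A schematic of that lookup order, using maps and a stand-in hash purely for illustration (the patch uses fixed-size arrays and its own hashLen):

package main

import (
	"encoding/binary"
	"fmt"
)

const tableBits = 15

// hashN hashes the low n bytes of u; a stand-in, not the patch's helper.
func hashN(u uint64, n uint) uint32 {
	u <<= 8 * (8 - n) // keep only the low n bytes
	return uint32((u * 0x9E3779B185EBCA87) >> (64 - tableBits))
}

func main() {
	src := []byte("the quick brown fox jumps over the lazy dog")
	cv := binary.LittleEndian.Uint64(src)

	short := map[uint32]int{} // 4-byte hash -> position
	long := map[uint32]int{}  // 7-byte hash -> position
	long[hashN(cv, 7)] = 0
	short[hashN(cv, 4)] = 0

	if p, ok := long[hashN(cv, 7)]; ok {
		fmt.Println("long candidate at", p) // preferred: longer context, fewer false hits
	} else if p, ok := short[hashN(cv, 4)]; ok {
		fmt.Println("short candidate at", p)
	}
}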
+ if dst.n == 0 { + return + } + + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/level5.go b/src/compress/flate/level5.go new file mode 100644 index 00000000000000..29f1df27413b82 --- /dev/null +++ b/src/compress/flate/level5.go @@ -0,0 +1,291 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 5 is similar to level 4, but for long matches two candidates are tested. +// Once a match is found, when it stops it will attempt to find a match that extends further. +type fastEncL5 struct { + fastGen + table [tableSize]tableEntry + bTable [tableSize]tableEntryPrev +} + +func (e *fastEncL5) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + hashShortBytes = 4 + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.bTable[:] { + e.bTable[i] = tableEntryPrev{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + for i := range e.bTable[:] { + v := e.bTable[i] + if v.Cur.offset <= minOff { + v.Cur.offset = 0 + v.Prev.offset = 0 + } else { + v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset + if v.Prev.offset <= minOff { + v.Prev.offset = 0 + } else { + v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset + } + } + e.bTable[i] = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + + // nextEmit is where in src the next emitLiterals should start from. + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. 
+ sLimit := int32(len(src) - inputMargin) + + cv := loadLE64(src, s) + for { + const skipLog = 6 + const doEvery = 1 + + nextS := s + var l int32 + var t int32 + for { + nextHashS := hashLen(cv, tableBits, hashShortBytes) + nextHashL := hashLen(cv, tableBits, hashLongBytes) + + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + // Fetch a short+long candidate + sCandidate := e.table[nextHashS] + lCandidate := e.bTable[nextHashL] + next := loadLE64(src, nextS) + entry := tableEntry{offset: s + e.cur} + e.table[nextHashS] = entry + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = entry, eLong.Cur + + nextHashS = hashLen(next, tableBits, hashShortBytes) + nextHashL = hashLen(next, tableBits, hashLongBytes) + + t = lCandidate.Cur.offset - e.cur + if s-t < maxMatchOffset { + if uint32(cv) == loadLE32(src, t) { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + + t2 := lCandidate.Prev.offset - e.cur + if s-t2 < maxMatchOffset && uint32(cv) == loadLE32(src, t2) { + l = e.matchLenLimited(int(s+4), int(t+4), src) + 4 + ml1 := e.matchLenLimited(int(s+4), int(t2+4), src) + 4 + if ml1 > l { + t = t2 + l = ml1 + break + } + } + break + } + t = lCandidate.Prev.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + break + } + } + + t = sCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // Found a 4 match... + l = e.matchLenLimited(int(s+4), int(t+4), src) + 4 + lCandidate = e.bTable[nextHashL] + // Store the next match + + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + + // If the next long is a candidate, use that... + t2 := lCandidate.Cur.offset - e.cur + if nextS-t2 < maxMatchOffset { + if loadLE32(src, t2) == uint32(next) { + ml := e.matchLenLimited(int(nextS+4), int(t2+4), src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + // If the previous long is a candidate, use that... + t2 = lCandidate.Prev.offset - e.cur + if nextS-t2 < maxMatchOffset && loadLE32(src, t2) == uint32(next) { + ml := e.matchLenLimited(int(nextS+4), int(t2+4), src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + } + break + } + cv = next + } + + if l == 0 { + // Extend the 4-byte match as long as possible. + l = e.matchlenLong(int(s+4), int(t+4), src) + 4 + } else if l == maxMatchLength { + l += e.matchlenLong(int(s+l), int(t+l), src) + } + + // Try to locate a better match by checking the end of best match... + if sAt := s + l; l < 30 && sAt < sLimit { + // Allow some bytes at the beginning to mismatch. + // Sweet spot is 2/3 bytes depending on input. + // 3 is only a little better when it is but sometimes a lot worse. + // The skipped bytes are tested in Extend backwards, + // and still picked up as part of the match if they do. 
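After a candidate is confirmed, every level extends it forward (matchlenLong above) and then walks backwards while the bytes just before s and t also agree, as in the repeated "Extend backwards" loops. Both steps in a self-contained sketch:

package main

import "fmt"

// matchLen returns how many leading bytes of a and b are equal.
func matchLen(a, b []byte) int32 {
	var n int32
	for n < int32(len(a)) && n < int32(len(b)) && a[n] == b[n] {
		n++
	}
	return n
}

// extend grows a match of length l found at src[s:] against src[t:]
// (t < s): forward while bytes agree, then backward while the bytes
// just before s and t also agree.
func extend(src []byte, s, t, l int32) (newS, newT, newL int32) {
	l += matchLen(src[s+l:], src[t+l:])
	for t > 0 && s > 0 && src[t-1] == src[s-1] {
		s--
		t--
		l++
	}
	return s, t, l
}

func main() {
	src := []byte("xabcdefg....xabcdefgh")
	// Suppose the hash probe confirmed a 4-byte match of src[14:18] vs src[2:6].
	s, t, l := extend(src, 14, 2, 4)
	fmt.Println(s, t, l) // 12 0 8
}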
+ const skipBeginning = 2 + eLong := e.bTable[hashLen(loadLE64(src, sAt), tableBits, hashLongBytes)].Cur.offset + t2 := eLong - e.cur - l + skipBeginning + s2 := s + skipBeginning + off := s2 - t2 + if t2 >= 0 && off < maxMatchOffset && off > 0 { + if l2 := e.matchlenLong(int(s2), int(t2), src); l2 > l { + t = t2 + l = l2 + s = s2 + } + } + } + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + goto emitRemainder + } + + // Store every 3rd hash in-between. + const hashEvery = 3 + i := s - l + 1 + if i < s-1 { + cv := loadLE64(src, i) + t := tableEntry{offset: i + e.cur} + e.table[hashLen(cv, tableBits, hashShortBytes)] = t + eLong := &e.bTable[hashLen(cv, tableBits, hashLongBytes)] + eLong.Cur, eLong.Prev = t, eLong.Cur + + // Do an long at i+1 + cv >>= 8 + t = tableEntry{offset: t.offset + 1} + eLong = &e.bTable[hashLen(cv, tableBits, hashLongBytes)] + eLong.Cur, eLong.Prev = t, eLong.Cur + + // We only have enough bits for a short entry at i+2 + cv >>= 8 + t = tableEntry{offset: t.offset + 1} + e.table[hashLen(cv, tableBits, hashShortBytes)] = t + + // Skip one - otherwise we risk hitting 's' + i += 4 + for ; i < s-1; i += hashEvery { + cv := loadLE64(src, i) + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} + eLong := &e.bTable[hashLen(cv, tableBits, hashLongBytes)] + eLong.Cur, eLong.Prev = t, eLong.Cur + e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 + } + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-1 and at s. + x := loadLE64(src, s-1) + o := e.cur + s - 1 + prevHashS := hashLen(x, tableBits, hashShortBytes) + prevHashL := hashLen(x, tableBits, hashLongBytes) + e.table[prevHashS] = tableEntry{offset: o} + eLong := &e.bTable[prevHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: o}, eLong.Cur + cv = x >> 8 + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/level6.go b/src/compress/flate/level6.go new file mode 100644 index 00000000000000..d709f31e21fc42 --- /dev/null +++ b/src/compress/flate/level6.go @@ -0,0 +1,301 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +// Level 6 extends level 5, but does "repeat offset" check, +// as well as adding more hash entries to the tables. +type fastEncL6 struct { + fastGen + table [tableSize]tableEntry + bTable [tableSize]tableEntryPrev +} + +func (e *fastEncL6) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + hashShortBytes = 4 + ) + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.bTable[:] { + e.bTable[i] = tableEntryPrev{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. 
+ minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + for i := range e.bTable[:] { + v := e.bTable[i] + if v.Cur.offset <= minOff { + v.Cur.offset = 0 + v.Prev.offset = 0 + } else { + v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset + if v.Prev.offset <= minOff { + v.Prev.offset = 0 + } else { + v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset + } + } + e.bTable[i] = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + + // nextEmit is where in src the next emitLiterals should start from. + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiterals in the main loop, while we are + // looking for copies. + sLimit := int32(len(src) - inputMargin) + + cv := loadLE64(src, s) + // Repeat MUST be > 1 and within range + repeat := int32(1) + for { + const skipLog = 7 + const doEvery = 1 + + nextS := s + var l int32 + var t int32 + for { + nextHashS := hashLen(cv, tableBits, hashShortBytes) + nextHashL := hashLen(cv, tableBits, hashLongBytes) + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + // Fetch a short+long candidate + sCandidate := e.table[nextHashS] + lCandidate := e.bTable[nextHashL] + next := loadLE64(src, nextS) + entry := tableEntry{offset: s + e.cur} + e.table[nextHashS] = entry + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = entry, eLong.Cur + + // Calculate hashes of 'next' + nextHashS = hashLen(next, tableBits, hashShortBytes) + nextHashL = hashLen(next, tableBits, hashLongBytes) + + t = lCandidate.Cur.offset - e.cur + if s-t < maxMatchOffset { + if uint32(cv) == loadLE32(src, t) { + // Long candidate matches at least 4 bytes. + + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + + // Check the previous long candidate as well. + t2 := lCandidate.Prev.offset - e.cur + if s-t2 < maxMatchOffset && uint32(cv) == loadLE32(src, t2) { + l = e.matchLenLimited(int(s+4), int(t+4), src) + 4 + ml1 := e.matchLenLimited(int(s+4), int(t2+4), src) + 4 + if ml1 > l { + t = t2 + l = ml1 + break + } + } + break + } + // Current value did not match, but check if previous long value does. + t = lCandidate.Prev.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + break + } + } + + t = sCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == loadLE32(src, t) { + // Found a 4 match... 
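In the level 6 hunk that follows, the encoder also remembers the offset of the previous match and, after a short candidate hits, probes that same offset one byte ahead, keeping it when it gives a longer match. A simplified sketch of that repeat-offset check (the real code compares 4-byte prefixes and uses matchLenLimited; here full lengths are compared directly):

package main

import "fmt"

// matchLen returns how many leading bytes of a and b are equal.
func matchLen(a, b []byte) int32 {
	var n int32
	for n < int32(len(a)) && n < int32(len(b)) && a[n] == b[n] {
		n++
	}
	return n
}

// tryRepeat checks whether re-using the previous match offset at s+1
// beats the candidate match of length l just found at s, which is the
// essence of the level 6 repeat-offset branch below.
func tryRepeat(src []byte, s, repeat, l int32) (int32, bool) {
	t2 := s + 1 - repeat
	if t2 < 0 {
		return l, false
	}
	if ml := matchLen(src[s+1:], src[t2:]); ml > l {
		return ml, true
	}
	return l, false
}

func main() {
	src := []byte("abcabcabcabcabc")
	// Previous match used offset 3; a weak 4-byte candidate was just found at s=9.
	l, used := tryRepeat(src, 9, 3, 4)
	fmt.Println(l, used) // 5 true
}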
+ l = e.matchLenLimited(int(s+4), int(t+4), src) + 4 + + // Look up next long candidate (at nextS) + lCandidate = e.bTable[nextHashL] + + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + + // Check repeat at s + repOff + const repOff = 1 + t2 := s - repeat + repOff + if loadLE32(src, t2) == uint32(cv>>(8*repOff)) { + ml := e.matchLenLimited(int(s+4+repOff), int(t2+4), src) + 4 + if ml > l { + t = t2 + l = ml + s += repOff + // Not worth checking more. + break + } + } + + // If the next long is a candidate, use that... + t2 = lCandidate.Cur.offset - e.cur + if nextS-t2 < maxMatchOffset { + if loadLE32(src, t2) == uint32(next) { + ml := e.matchLenLimited(int(nextS+4), int(t2+4), src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + // This is ok, but check previous as well. + } + } + // If the previous long is a candidate, use that... + t2 = lCandidate.Prev.offset - e.cur + if nextS-t2 < maxMatchOffset && loadLE32(src, t2) == uint32(next) { + ml := e.matchLenLimited(int(nextS+4), int(t2+4), src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + } + break + } + cv = next + } + + // Extend the 4-byte match as long as possible. + if l == 0 { + l = e.matchlenLong(int(s+4), int(t+4), src) + 4 + } else if l == maxMatchLength { + l += e.matchlenLong(int(s+l), int(t+l), src) + } + + // Try to locate a better match by checking the end-of-match... + if sAt := s + l; sAt < sLimit { + // Allow some bytes at the beginning to mismatch. + // Sweet spot is 2/3 bytes depending on input. + // 3 is only a little better when it is but sometimes a lot worse. + // The skipped bytes are tested in extend backwards, + // and still picked up as part of the match if they do. + const skipBeginning = 2 + eLong := &e.bTable[hashLen(loadLE64(src, sAt), tableBits, hashLongBytes)] + // Test current + t2 := eLong.Cur.offset - e.cur - l + skipBeginning + s2 := s + skipBeginning + off := s2 - t2 + if off < maxMatchOffset { + if off > 0 && t2 >= 0 { + if l2 := e.matchlenLong(int(s2), int(t2), src); l2 > l { + t = t2 + l = l2 + s = s2 + } + } + // Test previous entry: + t2 = eLong.Prev.offset - e.cur - l + skipBeginning + off := s2 - t2 + if off > 0 && off < maxMatchOffset && t2 >= 0 { + if l2 := e.matchlenLong(int(s2), int(t2), src); l2 > l { + t = t2 + l = l2 + s = s2 + } + } + } + } + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + repeat = s - t + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + // Index after match end. + for i := nextS + 1; i < int32(len(src))-8; i += 2 { + cv := loadLE64(src, i) + e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: i + e.cur} + eLong := &e.bTable[hashLen(cv, tableBits, hashLongBytes)] + eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur}, eLong.Cur + } + goto emitRemainder + } + + // Store every long hash in-between and every second short. 
+ for i := nextS + 1; i < s-1; i += 2 { + cv := loadLE64(src, i) + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} + eLong := &e.bTable[hashLen(cv, tableBits, hashLongBytes)] + eLong2 := &e.bTable[hashLen(cv>>8, tableBits, hashLongBytes)] + e.table[hashLen(cv, tableBits, hashShortBytes)] = t + eLong.Cur, eLong.Prev = t, eLong.Cur + eLong2.Cur, eLong2.Prev = t2, eLong2.Cur + } + cv = loadLE64(src, s) + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiterals(dst, src[nextEmit:]) + } +} diff --git a/src/compress/flate/regmask_amd64.go b/src/compress/flate/regmask_amd64.go new file mode 100644 index 00000000000000..cd1469a909173d --- /dev/null +++ b/src/compress/flate/regmask_amd64.go @@ -0,0 +1,14 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +const ( + // Masks for shifts with register sizes of the shift value. + // This can be used to work around the x86 design of shifting by mod register size. + // It can be used when a variable shift is always smaller than the register size. + + // reg8SizeMask64 - shift value is 8 bits on 64 bit register. + reg8SizeMask64 = 63 +) diff --git a/src/compress/flate/regmask_other.go b/src/compress/flate/regmask_other.go new file mode 100644 index 00000000000000..e25fc87af1b0d2 --- /dev/null +++ b/src/compress/flate/regmask_other.go @@ -0,0 +1,18 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 +// +build !amd64 + +package flate + +const ( + // Masks for shifts with register sizes of the shift value. + // This can be used to work around the x86 design of shifting by mod register size. + // On other platforms the mask is ineffective so the AND can be removed by the compiler. + // It can be used when a variable shift is always smaller than the register size. + + // reg8SizeMask64 - shift value is 8 bits on 64 bit register. 
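+ // Illustrative use (hypothetical, not taken from this change): for a uint8
+ // shift count c, writing v >> (c & reg8SizeMask64) lets the amd64 build (mask
+ // 63) prove the count is below 64 and emit a bare shift, while here the 0xff
+ // mask is a no-op on the 8-bit value and the compiler drops the AND.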
+ reg8SizeMask64 = 0xff +) diff --git a/src/compress/flate/testdata/huffman-null-max.sync.expect b/src/compress/flate/testdata/huffman-null-max.sync.expect new file mode 100644 index 00000000000000..c08165143f2c57 Binary files /dev/null and b/src/compress/flate/testdata/huffman-null-max.sync.expect differ diff --git a/src/compress/flate/testdata/huffman-null-max.sync.expect-noinput b/src/compress/flate/testdata/huffman-null-max.sync.expect-noinput new file mode 100644 index 00000000000000..c08165143f2c57 Binary files /dev/null and b/src/compress/flate/testdata/huffman-null-max.sync.expect-noinput differ diff --git a/src/compress/flate/testdata/huffman-pi.sync.expect b/src/compress/flate/testdata/huffman-pi.sync.expect new file mode 100644 index 00000000000000..e4396ac6fe5e34 Binary files /dev/null and b/src/compress/flate/testdata/huffman-pi.sync.expect differ diff --git a/src/compress/flate/testdata/huffman-pi.sync.expect-noinput b/src/compress/flate/testdata/huffman-pi.sync.expect-noinput new file mode 100644 index 00000000000000..e4396ac6fe5e34 Binary files /dev/null and b/src/compress/flate/testdata/huffman-pi.sync.expect-noinput differ diff --git a/src/compress/flate/testdata/huffman-rand-1k.dyn.expect-noinput b/src/compress/flate/testdata/huffman-rand-1k.dyn.expect-noinput index 0c24742fde2487..016db5595c47c5 100644 Binary files a/src/compress/flate/testdata/huffman-rand-1k.dyn.expect-noinput and b/src/compress/flate/testdata/huffman-rand-1k.dyn.expect-noinput differ diff --git a/src/compress/flate/testdata/huffman-rand-1k.sync.expect b/src/compress/flate/testdata/huffman-rand-1k.sync.expect new file mode 100644 index 00000000000000..09dc798ee37df8 Binary files /dev/null and b/src/compress/flate/testdata/huffman-rand-1k.sync.expect differ diff --git a/src/compress/flate/testdata/huffman-rand-1k.sync.expect-noinput b/src/compress/flate/testdata/huffman-rand-1k.sync.expect-noinput new file mode 100644 index 00000000000000..0c24742fde2487 Binary files /dev/null and b/src/compress/flate/testdata/huffman-rand-1k.sync.expect-noinput differ diff --git a/src/compress/flate/testdata/huffman-rand-limit.dyn.expect b/src/compress/flate/testdata/huffman-rand-limit.dyn.expect index 2d6527934e9830..881e59c9ab9bb3 100644 Binary files a/src/compress/flate/testdata/huffman-rand-limit.dyn.expect and b/src/compress/flate/testdata/huffman-rand-limit.dyn.expect differ diff --git a/src/compress/flate/testdata/huffman-rand-limit.dyn.expect-noinput b/src/compress/flate/testdata/huffman-rand-limit.dyn.expect-noinput index 2d6527934e9830..881e59c9ab9bb3 100644 Binary files a/src/compress/flate/testdata/huffman-rand-limit.dyn.expect-noinput and b/src/compress/flate/testdata/huffman-rand-limit.dyn.expect-noinput differ diff --git a/src/compress/flate/testdata/huffman-rand-limit.golden b/src/compress/flate/testdata/huffman-rand-limit.golden index 57e59322e9884e..9ca0eb1ce22ff2 100644 Binary files a/src/compress/flate/testdata/huffman-rand-limit.golden and b/src/compress/flate/testdata/huffman-rand-limit.golden differ diff --git a/src/compress/flate/testdata/huffman-rand-limit.sync.expect b/src/compress/flate/testdata/huffman-rand-limit.sync.expect new file mode 100644 index 00000000000000..881e59c9ab9bb3 Binary files /dev/null and b/src/compress/flate/testdata/huffman-rand-limit.sync.expect differ diff --git a/src/compress/flate/testdata/huffman-rand-limit.sync.expect-noinput b/src/compress/flate/testdata/huffman-rand-limit.sync.expect-noinput new file mode 100644 index 00000000000000..881e59c9ab9bb3 Binary files 
/dev/null and b/src/compress/flate/testdata/huffman-rand-limit.sync.expect-noinput differ diff --git a/src/compress/flate/testdata/huffman-shifts.sync.expect b/src/compress/flate/testdata/huffman-shifts.sync.expect new file mode 100644 index 00000000000000..7812c1c62da3cb Binary files /dev/null and b/src/compress/flate/testdata/huffman-shifts.sync.expect differ diff --git a/src/compress/flate/testdata/huffman-shifts.sync.expect-noinput b/src/compress/flate/testdata/huffman-shifts.sync.expect-noinput new file mode 100644 index 00000000000000..7812c1c62da3cb Binary files /dev/null and b/src/compress/flate/testdata/huffman-shifts.sync.expect-noinput differ diff --git a/src/compress/flate/testdata/huffman-text-shift.sync.expect b/src/compress/flate/testdata/huffman-text-shift.sync.expect new file mode 100644 index 00000000000000..71ce3aeb75a86e Binary files /dev/null and b/src/compress/flate/testdata/huffman-text-shift.sync.expect differ diff --git a/src/compress/flate/testdata/huffman-text-shift.sync.expect-noinput b/src/compress/flate/testdata/huffman-text-shift.sync.expect-noinput new file mode 100644 index 00000000000000..71ce3aeb75a86e Binary files /dev/null and b/src/compress/flate/testdata/huffman-text-shift.sync.expect-noinput differ diff --git a/src/compress/flate/testdata/huffman-text.sync.expect b/src/compress/flate/testdata/huffman-text.sync.expect new file mode 100644 index 00000000000000..d448727c323caf --- /dev/null +++ b/src/compress/flate/testdata/huffman-text.sync.expect @@ -0,0 +1 @@ +_K0`K0Aasě)^HIɟb߻_>4 a=-^ 1`_ 1 ő:Y-F66!A`aC;ANyr4ߜU!GKС#r:B[G3.L׶bFRuM]^⇳(#Z ivBBH2S]u/ֽWTGnr \ No newline at end of file diff --git a/src/compress/flate/testdata/huffman-text.sync.expect-noinput b/src/compress/flate/testdata/huffman-text.sync.expect-noinput new file mode 100644 index 00000000000000..d448727c323caf --- /dev/null +++ b/src/compress/flate/testdata/huffman-text.sync.expect-noinput @@ -0,0 +1 @@ +_K0`K0Aasě)^HIɟb߻_>4 a=-^ 1`_ 1 ő:Y-F66!A`aC;ANyr4ߜU!GKС#r:B[G3.L׶bFRuM]^⇳(#Z ivBBH2S]u/ֽWTGnr \ No newline at end of file diff --git a/src/compress/flate/testdata/huffman-zero.dyn.expect b/src/compress/flate/testdata/huffman-zero.dyn.expect index 830348a79ad9ab..dbe401c54c4b6f 100644 Binary files a/src/compress/flate/testdata/huffman-zero.dyn.expect and b/src/compress/flate/testdata/huffman-zero.dyn.expect differ diff --git a/src/compress/flate/testdata/huffman-zero.dyn.expect-noinput b/src/compress/flate/testdata/huffman-zero.dyn.expect-noinput index 830348a79ad9ab..dbe401c54c4b6f 100644 Binary files a/src/compress/flate/testdata/huffman-zero.dyn.expect-noinput and b/src/compress/flate/testdata/huffman-zero.dyn.expect-noinput differ diff --git a/src/compress/flate/testdata/huffman-zero.sync.expect b/src/compress/flate/testdata/huffman-zero.sync.expect new file mode 100644 index 00000000000000..dbe401c54c4b6f Binary files /dev/null and b/src/compress/flate/testdata/huffman-zero.sync.expect differ diff --git a/src/compress/flate/testdata/huffman-zero.sync.expect-noinput b/src/compress/flate/testdata/huffman-zero.sync.expect-noinput new file mode 100644 index 00000000000000..dbe401c54c4b6f Binary files /dev/null and b/src/compress/flate/testdata/huffman-zero.sync.expect-noinput differ diff --git a/src/compress/flate/testdata/null-long-match.sync.expect-noinput b/src/compress/flate/testdata/null-long-match.sync.expect-noinput new file mode 100644 index 00000000000000..8b92d9fc20f1ee Binary files /dev/null and b/src/compress/flate/testdata/null-long-match.sync.expect-noinput differ diff 
--git a/src/compress/flate/token.go b/src/compress/flate/token.go index fc0e4941e7bcd2..3f0d1c358077b8 100644 --- a/src/compress/flate/token.go +++ b/src/compress/flate/token.go @@ -4,20 +4,26 @@ package flate +import ( + "math" +) + const ( - // 2 bits: type 0 = literal 1=EOF 2=Match 3=Unused - // 8 bits: xlength = length - MIN_MATCH_LENGTH - // 22 bits xoffset = offset - MIN_OFFSET_SIZE, or literal - lengthShift = 22 - offsetMask = 1<maxnumlit + offHist [32]uint16 // offset codes + litHist [256]uint16 // codes 0->255 + nFilled int + n uint16 // Must be able to contain maxStoreBlockSize + tokens [65536]token +} + +func (t *tokens) Reset() { + if t.n == 0 { + return + } + t.n = 0 + t.nFilled = 0 + clear(t.litHist[:]) + clear(t.extraHist[:]) + clear(t.offHist[:]) +} + +func indexTokens(in []token) tokens { + var t tokens + t.indexTokens(in) + return t +} + +func (t *tokens) indexTokens(in []token) { + t.Reset() + for _, tok := range in { + if tok < matchType { + t.AddLiteral(tok.literal()) + continue + } + t.AddMatch(uint32(tok.length()), tok.offset()&matchOffsetOnlyMask) + } +} + +// emitLiterals writes a literal chunk and returns the number of bytes written. +func emitLiterals(dst *tokens, lit []byte) { + for _, v := range lit { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } +} + +func (t *tokens) AddLiteral(lit byte) { + t.tokens[t.n] = token(lit) + t.litHist[lit]++ + t.n++ +} + +// from https://stackoverflow.com/a/28730362 +func mFastLog2(val float32) float32 { + ux := int32(math.Float32bits(val)) + log2 := (float32)(((ux >> 23) & 255) - 128) + ux &= -0x7f800001 + ux += 127 << 23 + uval := math.Float32frombits(uint32(ux)) + log2 += ((-0.34484843)*uval+2.02466578)*uval - 0.67487759 + return log2 +} -// Convert a < xlength, xoffset > pair into a match token. -func matchToken(xlength uint32, xoffset uint32) token { - return token(matchType + xlength< 0 { + invTotal := 1.0 / float32(total) + for _, v := range t.litHist[:] { + if v > 0 { + n := float32(v) + shannon += atLeastOne(-mFastLog2(n*invTotal)) * n + } + } + // Just add 15 for EOB + shannon += 15 + for i, v := range t.extraHist[1 : literalCount-256] { + if v > 0 { + n := float32(v) + shannon += atLeastOne(-mFastLog2(n*invTotal)) * n + bits += int(lengthExtraBits[i&31]) * int(v) + nMatches += int(v) + } + } + } + if nMatches > 0 { + invTotal := 1.0 / float32(nMatches) + for i, v := range t.offHist[:offsetCodeCount] { + if v > 0 { + n := float32(v) + shannon += atLeastOne(-mFastLog2(n*invTotal)) * n + bits += int(offsetExtraBits[i&31]) * int(v) + } + } + } + return int(shannon) + bits } -// Returns the literal of a literal token. -func (t token) literal() uint32 { return uint32(t - literalType) } +// AddMatch adds a match to the tokens. +// This function is very sensitive to inlining and right on the border. +func (t *tokens) AddMatch(xlength uint32, xoffset uint32) { + oCode := offsetCode(xoffset) + xoffset |= oCode << 16 -// Returns the extra offset of a match token. + t.extraHist[lengthCodes1[uint8(xlength)]]++ + t.offHist[oCode&31]++ + t.tokens[t.n] = token(matchType | xlength< 0 { + xl := xlength + if xl > 258 { + // We need to have at least baseMatchLength left over for next loop. 
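+ // A single DEFLATE match token carries at most maxMatchLength (258) bytes, so a
+ // longer match is emitted as several tokens at the same offset. If taking the
+ // full 258 would strand a tail shorter than baseMatchLength (3, the DEFLATE
+ // minimum), take 258-baseMatchLength instead so the final chunk stays encodable.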
+ if xl > 258+baseMatchLength { + xl = 258 + } else { + xl = 258 - baseMatchLength + } + } + xlength -= xl + xl -= baseMatchLength + t.extraHist[lengthCodes1[uint8(xl)]]++ + t.offHist[oc&31]++ + t.tokens[t.n] = token(matchType | uint32(xl)<> lengthShift) } +func (t token) length() uint8 { return uint8(t >> lengthShift) } -func lengthCode(len uint32) uint32 { return lengthCodes[len] } +// Convert length to code. +func lengthCode(len uint8) uint8 { return lengthCodes[len] } -// Returns the offset code corresponding to a specific offset. +// Returns the offset code corresponding to a specific offset func offsetCode(off uint32) uint32 { if off < uint32(len(offsetCodes)) { - return offsetCodes[off] - } - if off>>7 < uint32(len(offsetCodes)) { - return offsetCodes[off>>7] + 14 + return offsetCodes[uint8(off)] } - return offsetCodes[off>>14] + 28 + return offsetCodes14[uint8(off>>7)] } diff --git a/src/compress/flate/unsafe_disabled.go b/src/compress/flate/unsafe_disabled.go new file mode 100644 index 00000000000000..c4ecd0fd0a9bb1 --- /dev/null +++ b/src/compress/flate/unsafe_disabled.go @@ -0,0 +1,40 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package flate + +type indexer interface { + int | int8 | int16 | int32 | int64 | uint | uint8 | uint16 | uint32 | uint64 +} + +// loadLE8 will load from b at index i. +func loadLE8[I indexer](b []byte, i I) byte { + return b[i] +} + +// loadLE32 will load from b at index i. +func loadLE32[I indexer](b []byte, i I) uint32 { + b = b[i : i+4] + return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 +} + +// loadLE64 will load from b at index i. +func loadLE64[I indexer](b []byte, i I) uint64 { + b = b[i : i+8] + return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +} + +// storeLE64 will store v at start of b. +func storeLE64(b []byte, v uint64) { + _ = b[7] // early bounds check to guarantee safety of writes below + b[0] = byte(v) + b[1] = byte(v >> 8) + b[2] = byte(v >> 16) + b[3] = byte(v >> 24) + b[4] = byte(v >> 32) + b[5] = byte(v >> 40) + b[6] = byte(v >> 48) + b[7] = byte(v >> 56) +} diff --git a/src/compress/flate/writer_test.go b/src/compress/flate/writer_test.go index c413735cd2c9f3..43815b2e4787fd 100644 --- a/src/compress/flate/writer_test.go +++ b/src/compress/flate/writer_test.go @@ -8,6 +8,7 @@ import ( "bytes" "fmt" "io" + "math" "math/rand" "runtime" "testing" @@ -40,6 +41,34 @@ func BenchmarkEncode(b *testing.B) { }) } +func TestWriterMemUsage(t *testing.T) { + testMem := func(t *testing.T, fn func()) { + var before, after runtime.MemStats + runtime.GC() + runtime.ReadMemStats(&before) + fn() + runtime.GC() + runtime.ReadMemStats(&after) + t.Logf("%s: Memory Used: %dKB, %d allocs", t.Name(), (after.HeapInuse-before.HeapInuse)/1024, after.HeapObjects-before.HeapObjects) + } + data := make([]byte, 100000) + + for level := HuffmanOnly; level <= BestCompression; level++ { + t.Run(fmt.Sprint("level-", level), func(t *testing.T) { + var zr *Writer + var err error + testMem(t, func() { + zr, err = NewWriter(io.Discard, level) + if err != nil { + t.Fatal(err) + } + zr.Write(data) + }) + zr.Close() + }) + } +} + // errorWriter is a writer that fails after N writes. 
type errorWriter struct { N int @@ -67,7 +96,7 @@ func TestWriteError(t *testing.T) { in := buf.Bytes() // We create our own buffer to control number of writes. copyBuffer := make([]byte, 128) - for l := 0; l < 10; l++ { + for l := range 10 { for fail := 1; fail <= 256; fail *= 2 { // Fail after 'fail' writes ew := &errorWriter{N: fail} @@ -110,6 +139,75 @@ func TestWriteError(t *testing.T) { } } +// Test if errors from the underlying writer is passed upwards. +func TestWriter_Reset(t *testing.T) { + buf := new(bytes.Buffer) + n := 65536 + if !testing.Short() { + n *= 4 + } + for i := 0; i < n; i++ { + fmt.Fprintf(buf, "asdasfasf%d%dfghfgujyut%dyutyu\n", i, i, i) + } + in := buf.Bytes() + for l := range 10 { + l := l + if testing.Short() && l > 1 { + continue + } + t.Run(fmt.Sprintf("level-%d", l), func(t *testing.T) { + t.Parallel() + offset := 1 + if testing.Short() { + offset = 256 + } + for ; offset <= 256; offset *= 2 { + // Fail after 'fail' writes + w, err := NewWriter(io.Discard, l) + if err != nil { + t.Fatalf("NewWriter: level %d: %v", l, err) + } + if w.d.fast == nil { + t.Skip("Not Fast...") + return + } + for i := 0; i < (bufferReset-len(in)-offset-maxMatchOffset)/maxMatchOffset; i++ { + // skip ahead to where we are close to wrap around... + w.d.fast.Reset() + } + w.d.fast.Reset() + _, err = w.Write(in) + if err != nil { + t.Fatal(err) + } + for range 50 { + // skip ahead again... This should wrap around... + w.d.fast.Reset() + } + w.d.fast.Reset() + + _, err = w.Write(in) + if err != nil { + t.Fatal(err) + } + for range (math.MaxUint32 - bufferReset) / maxMatchOffset { + // skip ahead to where we are close to wrap around... + w.d.fast.Reset() + } + + _, err = w.Write(in) + if err != nil { + t.Fatal(err) + } + err = w.Close() + if err != nil { + t.Fatal(err) + } + } + }) + } +} + // Test if two runs produce identical results // even when writing different sizes to the Writer. func TestDeterministic(t *testing.T) { @@ -171,6 +269,24 @@ func testDeterministic(i int, t *testing.T) { if !bytes.Equal(b1b, b2b) { t.Errorf("level %d did not produce deterministic result, result mismatch, len(a) = %d, len(b) = %d", i, len(b1b), len(b2b)) } + + // Test using io.WriterTo interface. 
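+ // bytes.Buffer.WriteTo hands all remaining input to the Writer in a single
+ // Write call, so this checks that one large write yields the same stream as
+ // the size-varied writes compared above.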
+ var b3 bytes.Buffer + br = bytes.NewBuffer(t1) + w, err = NewWriter(&b3, i) + if err != nil { + t.Fatal(err) + } + _, err = br.WriteTo(w) + if err != nil { + t.Fatal(err) + } + w.Close() + + b3b := b3.Bytes() + if !bytes.Equal(b1b, b3b) { + t.Errorf("level %d (io.WriterTo) did not produce deterministic result, result mismatch, len(a) = %d, len(b) = %d", i, len(b1b), len(b3b)) + } } // TestDeflateFast_Reset will test that encoding is consistent diff --git a/src/compress/zlib/example_test.go b/src/compress/zlib/example_test.go index 70408895ffd5a0..7052973355eb92 100644 --- a/src/compress/zlib/example_test.go +++ b/src/compress/zlib/example_test.go @@ -19,7 +19,7 @@ func ExampleNewWriter() { w.Write([]byte("hello, world\n")) w.Close() fmt.Println(b.Bytes()) - // Output: [120 156 202 72 205 201 201 215 81 40 207 47 202 73 225 2 4 0 0 255 255 33 231 4 147] + // Output: [120 156 0 13 0 242 255 104 101 108 108 111 44 32 119 111 114 108 100 10 3 0 33 231 4 147] } func ExampleNewReader() { diff --git a/src/debug/elf/file_test.go b/src/debug/elf/file_test.go index 0c1a7cf18aeb6e..733daae57772c6 100644 --- a/src/debug/elf/file_test.go +++ b/src/debug/elf/file_test.go @@ -7,7 +7,6 @@ package elf import ( "bytes" "compress/gzip" - "compress/zlib" "debug/dwarf" "encoding/binary" "errors" @@ -1560,18 +1559,9 @@ func TestIssue59208(t *testing.T) { zoffset := sec.Offset + uint64(sec.compressionOffset) copy(dn, data[:zoffset]) - ozd, err := sec.Data() - if err != nil { - t.Fatal(err) - } - buf := bytes.NewBuffer(nil) - wr := zlib.NewWriter(buf) // corrupt origin data same as COMPRESS_ZLIB - copy(ozd, []byte{1, 0, 0, 0}) - wr.Write(ozd) - wr.Close() - - copy(dn[zoffset:], buf.Bytes()) + // Insert zlib compressed sec.Data() block with `[]byte{1, 0, 0, 0}` as the first 4 bytes + copy(dn[zoffset:], []byte{0x78, 0x9c, 0x5c, 0x4d, 0xb9, 0xd, 0x80, 0x30, 0xc, 0x3c, 0x7, 0x27, 0xdc, 0xe, 0xc, 0x46, 0x4b, 0x8b, 0x14, 0x51, 0x20, 0x16, 0xa1, 0x67, 0x8b, 0x2c, 0x88, 0xec, 0x44, 0xc2, 0xe2, 0x8a, 0xdc, 0x1b, 0x59, 0x0, 0x28, 0xc, 0x34, 0x9, 0x7f, 0x22, 0x96, 0xa0, 0x13, 0x67, 0x27, 0xa1, 0x53, 0xea, 0x4e, 0x47, 0x58, 0x7a, 0x98, 0x8d, 0x26, 0xcd, 0xfb, 0x71, 0x21, 0x31, 0x87, 0x7f, 0xca, 0xf3, 0x1b, 0x7a, 0x21, 0xfa, 0x3f, 0x23, 0x4f, 0x3, 0x50, 0x7a, 0xb9, 0xda, 0xfc, 0xae, 0xc3, 0x35, 0x77, 0x1b, 0x94, 0xd5, 0x82, 0x37, 0x0, 0x0, 0xff, 0xff, 0x65, 0xfb, 0x7, 0x6e}) copy(dn[sec.Offset+sec.FileSize:], data[sec.Offset+sec.FileSize:]) nf, err := NewFile(bytes.NewReader(dn))