Skip to content

Commit 2c020b2

Browse files
committed
wip precompress #14
1 parent daa31a0 commit 2c020b2

File tree

5 files changed

+570
-0
lines changed

5 files changed

+570
-0
lines changed

zipserver/archive.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -512,6 +512,39 @@ func (a *ArchiveExtractor) extractAndUploadOne(ctx context.Context, key string,
512512
expectedSize += uint64(len(htmlFooter))
513513
reader = newAppendReader(reader, htmlFooter)
514514
injected = true
515+
}
516+
517+
// Pre-compress if configured and applicable
518+
if resource.contentEncoding == "" && shouldPreCompress(key, expectedSize, a.Config) {
519+
// Use LimitReader to enforce expectedSize limit and prevent memory exhaustion
520+
// from malicious zips that lie about UncompressedSize64
521+
limitedForCompress := io.LimitReader(reader, int64(expectedSize)+1)
522+
data, err := io.ReadAll(limitedForCompress)
523+
if err != nil {
524+
return UploadFileResult{Error: err, Key: key}
525+
}
526+
527+
// Check if we read more than expected (zip lied about size)
528+
if uint64(len(data)) > expectedSize {
529+
return UploadFileResult{Error: fmt.Errorf("zip entry exceeds declared size"), Key: key}
530+
}
531+
532+
compressedData, err := gzipCompress(data)
533+
if err != nil {
534+
return UploadFileResult{Error: err, Key: key}
535+
}
536+
537+
// Only use compressed if actually smaller
538+
if len(compressedData) < len(data) {
539+
reader = bytes.NewReader(compressedData)
540+
resource.contentEncoding = "gzip"
541+
expectedSize = uint64(len(compressedData))
542+
} else {
543+
reader = bytes.NewReader(data)
544+
}
545+
}
546+
547+
if injected {
515548
log.Printf("Sending: %s (injected)", resource)
516549
} else {
517550
log.Printf("Sending: %s", resource)

zipserver/archive_test.go

Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package zipserver
33
import (
44
"archive/zip"
55
"bytes"
6+
"compress/gzip"
67
"context"
78
"errors"
89
"fmt"
@@ -1039,3 +1040,234 @@ func Test_isIndexHtml(t *testing.T) {
10391040
})
10401041
}
10411042
}
1043+
1044+
// Test_PreCompression exercises the pre-compression path of ExtractZip:
// eligible files are gzip-compressed before upload and tagged with
// Content-Encoding: gzip, while ineligible files are stored verbatim.
// Eligibility is driven by Config.PreCompressEnabled, PreCompressMinSize,
// and PreCompressExtensions.
func Test_PreCompression(t *testing.T) {
	ctx := context.Background()

	// withZip builds an in-memory zip from entries, stores it in the given
	// MemStorage under a fixed key, and hands an ArchiveExtractor wired to
	// that storage/config to cb. zipLayout.Write is a sibling test helper
	// (defined elsewhere in this package) that writes the entries.
	withZip := func(t *testing.T, storage *MemStorage, config *Config, entries []zipEntry, cb func(archiver *ArchiveExtractor, zipPath, prefix string)) {
		prefix := "zipserver_test/precompress_test"
		zipPath := "precompress_test.zip"

		var buf bytes.Buffer
		zw := zip.NewWriter(&buf)
		(&zipLayout{entries: entries}).Write(t, zw)
		err := zw.Close()
		assert.NoError(t, err)
		_, err = storage.PutFile(ctx, config.Bucket, zipPath, bytes.NewReader(buf.Bytes()), PutOptions{})
		assert.NoError(t, err)

		archiver := &ArchiveExtractor{Storage: storage, Config: config}
		cb(archiver, zipPath, prefix)
	}

	t.Run("compresses eligible files and sets Content-Encoding", func(t *testing.T) {
		storage, _ := NewMemStorage()
		config := emptyConfig()
		config.PreCompressEnabled = true
		config.PreCompressMinSize = 100
		config.PreCompressExtensions = []string{".html", ".js", ".css"}

		// Create compressible content (repetitive text compresses well)
		htmlContent := bytes.Repeat([]byte("<html><body>Hello World!</body></html>"), 50)
		jsContent := bytes.Repeat([]byte("function test() { console.log('hello'); }"), 50)

		withZip(t, storage, config, []zipEntry{
			{name: "index.html", data: htmlContent},
			{name: "app.js", data: jsContent},
		}, func(archiver *ArchiveExtractor, zipPath, prefix string) {
			files, err := archiver.ExtractZip(ctx, zipPath, prefix, testLimits())
			assert.NoError(t, err)
			assert.Len(t, files, 2)

			for _, f := range files {
				h, err := storage.HeadFile(ctx, config.Bucket, f.Key)
				assert.NoError(t, err)
				assert.Equal(t, "gzip", h.Get("content-encoding"), "file %s should have gzip encoding", f.Key)

				// Verify compressed size is smaller
				// NOTE(review): this compares every file's size against
				// len(htmlContent); it happens to hold for jsContent too
				// since both inputs are the same order of magnitude.
				assert.True(t, f.Size < uint64(len(htmlContent)), "compressed size should be smaller")
			}
		})
	})

	t.Run("skips files below minimum size", func(t *testing.T) {
		storage, _ := NewMemStorage()
		config := emptyConfig()
		config.PreCompressEnabled = true
		config.PreCompressMinSize = 1024
		config.PreCompressExtensions = []string{".html"}

		// 18 bytes — well below the 1024-byte minimum configured above.
		smallContent := []byte("<html>Small</html>")

		withZip(t, storage, config, []zipEntry{
			{name: "small.html", data: smallContent},
		}, func(archiver *ArchiveExtractor, zipPath, prefix string) {
			files, err := archiver.ExtractZip(ctx, zipPath, prefix, testLimits())
			assert.NoError(t, err)
			assert.Len(t, files, 1)

			h, _ := storage.HeadFile(ctx, config.Bucket, files[0].Key)
			assert.Empty(t, h.Get("content-encoding"), "small file should not be compressed")

			// Content should be unchanged
			reader, _, _ := storage.GetFile(ctx, config.Bucket, files[0].Key)
			data, _ := io.ReadAll(reader)
			reader.Close()
			assert.Equal(t, smallContent, data)
		})
	})

	t.Run("skips non-matching extensions", func(t *testing.T) {
		storage, _ := NewMemStorage()
		config := emptyConfig()
		config.PreCompressEnabled = true
		config.PreCompressMinSize = 100
		config.PreCompressExtensions = []string{".html", ".js"}

		// .json is deliberately absent from PreCompressExtensions.
		jsonContent := bytes.Repeat([]byte(`{"key": "value", "number": 123}`), 50)

		withZip(t, storage, config, []zipEntry{
			{name: "data.json", data: jsonContent},
		}, func(archiver *ArchiveExtractor, zipPath, prefix string) {
			files, err := archiver.ExtractZip(ctx, zipPath, prefix, testLimits())
			assert.NoError(t, err)
			assert.Len(t, files, 1)

			h, _ := storage.HeadFile(ctx, config.Bucket, files[0].Key)
			assert.Empty(t, h.Get("content-encoding"), "non-matching extension should not be compressed")
		})
	})

	t.Run("skips already-compressed files", func(t *testing.T) {
		storage, _ := NewMemStorage()
		config := emptyConfig()
		config.PreCompressEnabled = true
		config.PreCompressMinSize = 10
		// .png/.gz/.jpg are configured here but should still be skipped by
		// the alreadyCompressedExtensions deny-list, which wins over the
		// allow-list.
		config.PreCompressExtensions = []string{".png", ".gz", ".jpg"}

		// PNG magic bytes + some data
		pngData := append([]byte{0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A}, bytes.Repeat([]byte{0x00}, 100)...)
		// Gzip magic bytes + some data
		gzData := append([]byte{0x1F, 0x8B, 0x08}, bytes.Repeat([]byte{0x00}, 100)...)

		withZip(t, storage, config, []zipEntry{
			{name: "image.png", data: pngData},
			{name: "archive.gz", data: gzData},
		}, func(archiver *ArchiveExtractor, zipPath, prefix string) {
			files, err := archiver.ExtractZip(ctx, zipPath, prefix, testLimits())
			assert.NoError(t, err)
			assert.Len(t, files, 2)

			for _, f := range files {
				h, _ := storage.HeadFile(ctx, config.Bucket, f.Key)
				// These files are detected as gzip by content, so they'll have gzip encoding
				// but they shouldn't be double-compressed
				if strings.HasSuffix(f.Key, ".gz") {
					assert.Equal(t, "gzip", h.Get("content-encoding"))
				}
			}
		})
	})

	t.Run("compresses HTML footer with content", func(t *testing.T) {
		storage, _ := NewMemStorage()
		config := emptyConfig()
		config.PreCompressEnabled = true
		config.PreCompressMinSize = 100
		config.PreCompressExtensions = []string{".html"}

		htmlContent := bytes.Repeat([]byte("<html><body>Content</body></html>"), 20)
		footer := bytes.Repeat([]byte("<script>console.log('injected');</script>"), 10)

		withZip(t, storage, config, []zipEntry{
			{name: "index.html", data: htmlContent},
		}, func(archiver *ArchiveExtractor, zipPath, prefix string) {
			// HtmlFooter triggers the footer-injection path; compression must
			// happen after injection so the stored gzip stream contains it.
			limits := testLimits()
			limits.HtmlFooter = string(footer)

			files, err := archiver.ExtractZip(ctx, zipPath, prefix, limits)
			assert.NoError(t, err)
			assert.Len(t, files, 1)

			h, _ := storage.HeadFile(ctx, config.Bucket, files[0].Key)
			assert.Equal(t, "gzip", h.Get("content-encoding"))

			// Verify the footer was included before compression
			reader, _, _ := storage.GetFile(ctx, config.Bucket, files[0].Key)
			compressedData, _ := io.ReadAll(reader)
			reader.Close()

			// Decompress and verify content includes footer
			gzReader, err := gzip.NewReader(bytes.NewReader(compressedData))
			assert.NoError(t, err)
			decompressed, _ := io.ReadAll(gzReader)
			gzReader.Close()

			expected := string(htmlContent) + string(footer)
			assert.Equal(t, expected, string(decompressed))
		})
	})

	t.Run("skips compression when result would be larger", func(t *testing.T) {
		storage, _ := NewMemStorage()
		config := emptyConfig()
		config.PreCompressEnabled = true
		config.PreCompressMinSize = 10
		config.PreCompressExtensions = []string{".html"}

		// Random/incompressible data that gzip can't compress well
		incompressibleData := make([]byte, 100)
		for i := range incompressibleData {
			incompressibleData[i] = byte(i * 17 % 256)
		}

		withZip(t, storage, config, []zipEntry{
			{name: "random.html", data: incompressibleData},
		}, func(archiver *ArchiveExtractor, zipPath, prefix string) {
			files, err := archiver.ExtractZip(ctx, zipPath, prefix, testLimits())
			assert.NoError(t, err)
			assert.Len(t, files, 1)

			h, _ := storage.HeadFile(ctx, config.Bucket, files[0].Key)
			// If compressed would be larger, should not have gzip encoding
			// Note: The actual behavior depends on whether gzip makes it larger
			// For truly random data, gzip often makes it larger

			reader, _, _ := storage.GetFile(ctx, config.Bucket, files[0].Key)
			storedData, _ := io.ReadAll(reader)
			reader.Close()

			if h.Get("content-encoding") == "" {
				// Not compressed - verify original data
				assert.Equal(t, incompressibleData, storedData)
			}
			// If it was compressed, that's also fine - the test is that we don't
			// store a larger compressed version
		})
	})

	t.Run("disabled by default", func(t *testing.T) {
		storage, _ := NewMemStorage()
		config := emptyConfig()
		// PreCompressEnabled is false by default

		htmlContent := bytes.Repeat([]byte("<html><body>Hello World!</body></html>"), 50)

		withZip(t, storage, config, []zipEntry{
			{name: "index.html", data: htmlContent},
		}, func(archiver *ArchiveExtractor, zipPath, prefix string) {
			files, err := archiver.ExtractZip(ctx, zipPath, prefix, testLimits())
			assert.NoError(t, err)
			assert.Len(t, files, 1)

			h, _ := storage.HeadFile(ctx, config.Bucket, files[0].Key)
			assert.Empty(t, h.Get("content-encoding"), "compression should be disabled by default")

			// Content should be unchanged
			reader, _, _ := storage.GetFile(ctx, config.Bucket, files[0].Key)
			data, _ := io.ReadAll(reader)
			reader.Close()
			assert.Equal(t, htmlContent, data)
		})
	})
}

zipserver/compress.go

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
package zipserver
2+
3+
import (
4+
"bytes"
5+
"compress/gzip"
6+
"path"
7+
"strings"
8+
)
9+
10+
// alreadyCompressedExtensions lists file extensions whose contents are
// already compressed. Running gzip over them again wastes CPU and rarely
// shrinks the payload, so pre-compression always skips these regardless of
// the configured extension allow-list.
var alreadyCompressedExtensions = map[string]bool{
	// archives and generic compression formats
	".gz":  true,
	".br":  true,
	".zip": true,
	".rar": true,
	".7z":  true,
	".bz2": true,
	".xz":  true,
	".zst": true,
	// image formats with built-in compression
	".png":  true,
	".jpg":  true,
	".jpeg": true,
	".gif":  true,
	".webp": true,
	// audio/video formats with built-in compression
	".mp3":  true,
	".mp4":  true,
	".webm": true,
	".ogg":  true,
	".flac": true,
}
32+
33+
// shouldPreCompress checks if a file should be pre-compressed based on
34+
// filename, size, and configuration
35+
func shouldPreCompress(filename string, size uint64, config *Config) bool {
36+
if !config.PreCompressEnabled {
37+
return false
38+
}
39+
40+
if int64(size) < config.PreCompressMinSize {
41+
return false
42+
}
43+
44+
ext := strings.ToLower(path.Ext(filename))
45+
46+
// Skip already compressed files
47+
if alreadyCompressedExtensions[ext] {
48+
return false
49+
}
50+
51+
// Check if extension matches configured extensions
52+
for _, allowedExt := range config.PreCompressExtensions {
53+
if strings.EqualFold(ext, allowedExt) {
54+
return true
55+
}
56+
}
57+
58+
return false
59+
}
60+
61+
// gzipCompress returns data compressed as a complete gzip stream using the
// highest compression level (gzip.BestCompression), suitable for storage
// with Content-Encoding: gzip.
func gzipCompress(data []byte) ([]byte, error) {
	var out bytes.Buffer

	zw, err := gzip.NewWriterLevel(&out, gzip.BestCompression)
	if err != nil {
		return nil, err
	}

	if _, err := zw.Write(data); err != nil {
		// Release writer state; the write failure is what we report.
		zw.Close()
		return nil, err
	}

	// Close flushes buffered compressed bytes and writes the gzip footer;
	// skipping its error check could yield a truncated stream.
	if err := zw.Close(); err != nil {
		return nil, err
	}

	return out.Bytes(), nil
}

0 commit comments

Comments
 (0)