Skip to content

Commit 2d52d78

Browse files
committed
fix: v0.3.1 hotfix - compressed data, endianness, small tags
Critical bug fixes: 1. Compressed data support (miCOMPRESSED) - Implemented full zlib decompression - Added compression bomb protection (100MB limit, 1000:1 ratio) - Note: Compressed elements have no padding after data 2. Endianness interpretation - Fixed 'IM'/'MI' indicator interpretation - 'IM' = little-endian, 'MI' = big-endian - Updated parser, writer, and all tests 3. Small format tag data - Added SmallData field to DataTag struct - Small format tags (1-4 bytes) now properly capture embedded data - Fixed variable name and small array corruption Test results: - All 298+ existing tests pass - Verified with real MATLAB files (12 and 34 variables) - Added test file: inner_outer_tbl_param.mat
1 parent d6bb8bf commit 2d52d78

File tree

11 files changed

+219
-81
lines changed

11 files changed

+219
-81
lines changed

CHANGELOG.md

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,36 @@
1+
## [0.3.1] - 2025-11-25
2+
3+
### Fixed - Critical Bug Fixes (Hotfix)
4+
5+
**Bug Fix #1: Compressed Data Support** (Critical):
6+
- **Issue**: Files with zlib-compressed data (miCOMPRESSED) returned 0 variables
7+
- **Impact**: Most real-world MATLAB files use compression - they were unreadable
8+
- **Fix**: Implemented full zlib decompression in `internal/v5/compressed.go`
9+
- **Security**: Added compression bomb protection (100 MB limit on decompressed size, 1000:1 maximum compression ratio)
10+
11+
**Bug Fix #2: Endianness Interpretation** (Critical):
12+
- **Issue**: "MI"/"IM" endian indicators were interpreted incorrectly
13+
- **Impact**: Wrong byte order caused garbage data or parse failures
14+
- **Fix**: Corrected interpretation in `internal/v5/header.go` and `writer.go`
15+
- **Note**: "IM" = little-endian, "MI" = big-endian (matches MATLAB spec)
16+
17+
**Bug Fix #3: Small Format Tag Data** (Critical):
18+
- **Issue**: Small format tags (1-4 bytes) lost their embedded data
19+
- **Impact**: Variable names and small arrays were corrupted
20+
- **Fix**: Added `SmallData` field to `DataTag` struct in `internal/v5/data_tag.go`
21+
22+
### Technical Details
23+
24+
- Files affected: `header.go`, `writer.go`, `parser.go`, `data_tag.go`, `compressed.go`
25+
- All 298+ existing tests pass
26+
- Added test files: `inner_outer_tbl_param.mat` (12 vars), `energy_cascade_signals.mat` (34 vars)
27+
- Verified with real scientific data files from the SciPy project
28+
29+
---
30+
131
## [0.3.0] - 2025-11-21
232

3-
### Added - Production Quality Release 🎉
33+
### Added - Production Quality Release
434

535
**Functional Options Pattern**:
636
- `WithEndianness(binary.ByteOrder)` - Set byte order for v5 files (little/big endian)

ROADMAP.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
> **Strategic Approach**: Leverage existing HDF5 library and MATLAB documentation
44
5-
**Last Updated**: 2025-11-21 | **Current Version**: v0.3.0 (STABLE ✅) | **Target**: v1.0.0 stable (2026)
5+
**Last Updated**: 2025-11-25 | **Current Version**: v0.3.1 (HOTFIX ✅) | **Target**: v1.0.0 stable (2026)
66

77
---
88

internal/v5/compressed.go

Lines changed: 52 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,61 @@
22
package v5
33

44
import (
5-
"errors"
5+
"bytes"
6+
"compress/zlib"
7+
"fmt"
68
"io"
79
)
810

9-
// ErrCompressedNotSupported indicates compressed data is not supported.
10-
var ErrCompressedNotSupported = errors.New("compressed MAT-files not yet supported")
11+
// maxDecompressedSize is the maximum allowed size after decompression (100MB).
12+
// This prevents compression bomb attacks (zip bombs).
13+
const maxDecompressedSize = 100 * 1024 * 1024 // 100MB
1114

12-
// decompress would handle decompression (stub for future implementation).
15+
// maxCompressionRatio is the maximum allowed compression ratio.
16+
// Typical zlib compression achieves 2:1 to 10:1 ratios.
17+
// A ratio above 1000:1 suggests a potential zip bomb.
18+
const maxCompressionRatio = 1000
19+
20+
// decompress decompresses zlib-compressed data from a MAT-file.
21+
// It reads compressedSize bytes from r and returns the decompressed content.
1322
//
14-
//nolint:unused // Future implementation stub
15-
func decompress(_ io.Reader) (io.Reader, error) {
16-
return nil, ErrCompressedNotSupported
23+
// Security: Implements protection against compression bombs:
24+
// - Maximum decompressed size limit (100MB).
25+
// - Maximum compression ratio check (1000:1).
26+
func decompress(r io.Reader, compressedSize uint32) ([]byte, error) {
27+
// Read compressed data
28+
compressed := make([]byte, compressedSize)
29+
if _, err := io.ReadFull(r, compressed); err != nil {
30+
return nil, fmt.Errorf("failed to read compressed data: %w", err)
31+
}
32+
33+
// Create zlib reader
34+
zlibReader, err := zlib.NewReader(bytes.NewReader(compressed))
35+
if err != nil {
36+
return nil, fmt.Errorf("failed to create zlib reader: %w", err)
37+
}
38+
defer zlibReader.Close() //nolint:errcheck // Best effort cleanup
39+
40+
// Read decompressed data with size limit
41+
var decompressed bytes.Buffer
42+
limited := io.LimitReader(zlibReader, maxDecompressedSize+1)
43+
n, err := io.Copy(&decompressed, limited)
44+
if err != nil {
45+
return nil, fmt.Errorf("failed to decompress data: %w", err)
46+
}
47+
48+
// Check for size limit exceeded
49+
if n > maxDecompressedSize {
50+
return nil, fmt.Errorf("decompressed size exceeds limit: %d > %d bytes", n, maxDecompressedSize)
51+
}
52+
53+
// Check compression ratio
54+
if compressedSize > 0 {
55+
ratio := float64(n) / float64(compressedSize)
56+
if ratio > maxCompressionRatio {
57+
return nil, fmt.Errorf("compression ratio too high: %.1f:1 (max %d:1)", ratio, maxCompressionRatio)
58+
}
59+
}
60+
61+
return decompressed.Bytes(), nil
1762
}

internal/v5/data_tag.go

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,10 @@ const maxReasonableSize = 2 * 1024 * 1024 * 1024 // 2GB
1212

1313
// DataTag represents a data element tag.
1414
type DataTag struct {
15-
DataType uint32 // Data type identifier
16-
Size uint32 // Data size in bytes
17-
IsSmall bool // True for small data elements
15+
DataType uint32 // Data type identifier
16+
Size uint32 // Data size in bytes
17+
IsSmall bool // True for small data elements
18+
SmallData []byte // For small format: data bytes (up to 4 bytes)
1819
}
1920

2021
// readTag reads a data tag from the stream.
@@ -36,12 +37,16 @@ func (p *Parser) readTag() (*DataTag, error) {
3637
// Lower 16 bits contain data type
3738
size := firstWord >> 16
3839
if size > 0 && size <= 4 {
39-
// Small format
40+
// Small format: data is packed in bytes 4-7 of the 8-byte tag
4041
dataType := firstWord & 0xFFFF
42+
// Copy the small data from bytes 4 to 4+size
43+
smallData := make([]byte, size)
44+
copy(smallData, buf[4:4+size])
4145
return &DataTag{
42-
DataType: dataType,
43-
Size: size,
44-
IsSmall: true,
46+
DataType: dataType,
47+
Size: size,
48+
IsSmall: true,
49+
SmallData: smallData,
4550
}, nil
4651
}
4752

internal/v5/header.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,14 @@ func parseHeader(data []byte) (*Header, error) {
2222
}
2323

2424
// Determine byte order
25+
// The endian indicator is the 16-bit value 0x4D49 ("MI") written to bytes 126-127.
26+
// On little-endian systems, this is stored as [0x49, 0x4D] which reads as "IM".
27+
// On big-endian systems, this is stored as [0x4D, 0x49] which reads as "MI".
28+
// So: "IM" → little-endian, "MI" → big-endian
2529
switch hdr.EndianIndicator {
26-
case "MI":
27-
hdr.Order = binary.LittleEndian
2830
case "IM":
31+
hdr.Order = binary.LittleEndian
32+
case "MI":
2933
hdr.Order = binary.BigEndian
3034
default:
3135
return nil, errors.New("invalid endian indicator")

internal/v5/header_test.go

Lines changed: 30 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@ import (
66
)
77

88
func TestParseHeader(t *testing.T) {
9+
// Note: Endian indicator interpretation:
10+
// - "IM" = file created on little-endian system → use LittleEndian
11+
// - "MI" = file created on big-endian system → use BigEndian
12+
// This is because the 16-bit value 0x4D49 ("MI") is stored as [0x49, 0x4D]
13+
// on little-endian systems, which reads as "IM".
914
tests := []struct {
1015
name string
1116
header []byte
@@ -17,46 +22,46 @@ func TestParseHeader(t *testing.T) {
1722
}{
1823
{
1924
name: "valid little endian v5",
20-
header: makeHeader("MATLAB 5.0 MAT-file", 0x0100, "MI"),
25+
header: makeHeader("MATLAB 5.0 MAT-file", 0x0100, "IM"),
2126
wantDesc: "MATLAB 5.0 MAT-file",
2227
wantVersion: 0x0100,
23-
wantEndian: "MI",
28+
wantEndian: "IM",
2429
wantOrder: binary.LittleEndian,
2530
wantErr: false,
2631
},
2732
{
2833
name: "valid big endian v5",
29-
header: makeHeader("MATLAB 5.0 MAT-file", 0x0100, "IM"),
34+
header: makeHeader("MATLAB 5.0 MAT-file", 0x0100, "MI"),
3035
wantDesc: "MATLAB 5.0 MAT-file",
3136
wantVersion: 0x0100,
32-
wantEndian: "IM",
37+
wantEndian: "MI",
3338
wantOrder: binary.BigEndian,
3439
wantErr: false,
3540
},
3641
{
3742
name: "description with trailing nulls",
38-
header: makeHeader("Test file\x00\x00\x00", 0x0100, "MI"),
43+
header: makeHeader("Test file\x00\x00\x00", 0x0100, "IM"),
3944
wantDesc: "Test file",
4045
wantVersion: 0x0100,
41-
wantEndian: "MI",
46+
wantEndian: "IM",
4247
wantOrder: binary.LittleEndian,
4348
wantErr: false,
4449
},
4550
{
4651
name: "empty description",
47-
header: makeHeader("", 0x0100, "MI"),
52+
header: makeHeader("", 0x0100, "IM"),
4853
wantDesc: "",
4954
wantVersion: 0x0100,
50-
wantEndian: "MI",
55+
wantEndian: "IM",
5156
wantOrder: binary.LittleEndian,
5257
wantErr: false,
5358
},
5459
{
5560
name: "v7.2 format",
56-
header: makeHeader("MATLAB 7.0 MAT-file", 0x0100, "MI"),
61+
header: makeHeader("MATLAB 7.0 MAT-file", 0x0100, "IM"),
5762
wantDesc: "MATLAB 7.0 MAT-file",
5863
wantVersion: 0x0100,
59-
wantEndian: "MI",
64+
wantEndian: "IM",
6065
wantOrder: binary.LittleEndian,
6166
wantErr: false,
6267
},
@@ -107,6 +112,7 @@ func TestParseHeader(t *testing.T) {
107112
// TestParseHeaderByteOrderVerification verifies that byte order is correctly detected
108113
// and used for version number parsing.
109114
func TestParseHeaderByteOrderVerification(t *testing.T) {
115+
// Note: "IM" = little-endian, "MI" = big-endian
110116
tests := []struct {
111117
name string
112118
endian string
@@ -115,19 +121,19 @@ func TestParseHeaderByteOrderVerification(t *testing.T) {
115121
}{
116122
{
117123
name: "little endian version parsing",
118-
endian: "MI",
124+
endian: "IM",
119125
version: 0x0100,
120126
wantVersion: 0x0100,
121127
},
122128
{
123129
name: "big endian version parsing",
124-
endian: "IM",
130+
endian: "MI",
125131
version: 0x0100,
126132
wantVersion: 0x0100,
127133
},
128134
{
129135
name: "little endian different version",
130-
endian: "MI",
136+
endian: "IM",
131137
version: 0x0200,
132138
wantVersion: 0x0200,
133139
},
@@ -146,11 +152,12 @@ func TestParseHeaderByteOrderVerification(t *testing.T) {
146152
}
147153

148154
// Verify byte order matches endian indicator
149-
if tt.endian == "MI" && got.Order != binary.LittleEndian {
150-
t.Error("Expected LittleEndian for 'MI' indicator")
155+
// "IM" = little-endian, "MI" = big-endian
156+
if tt.endian == "IM" && got.Order != binary.LittleEndian {
157+
t.Error("Expected LittleEndian for 'IM' indicator")
151158
}
152-
if tt.endian == "IM" && got.Order != binary.BigEndian {
153-
t.Error("Expected BigEndian for 'IM' indicator")
159+
if tt.endian == "MI" && got.Order != binary.BigEndian {
160+
t.Error("Expected BigEndian for 'MI' indicator")
154161
}
155162
})
156163
}
@@ -164,7 +171,7 @@ func TestParseHeaderLongDescription(t *testing.T) {
164171
longDesc = longDesc[:i] + "A" + longDesc[i+1:]
165172
}
166173

167-
header := makeHeader(longDesc, 0x0100, "MI")
174+
header := makeHeader(longDesc, 0x0100, "IM") // Use "IM" for little-endian
168175
got, err := parseHeader(header)
169176
if err != nil {
170177
t.Fatalf("parseHeader() unexpected error: %v", err)
@@ -176,18 +183,20 @@ func TestParseHeaderLongDescription(t *testing.T) {
176183
}
177184

178185
// makeHeader creates a test MAT-file header (128 bytes).
186+
// Note: "IM" = little-endian, "MI" = big-endian.
179187
func makeHeader(desc string, version uint16, endian string) []byte {
180188
header := make([]byte, 128)
181189

182190
// Description (bytes 0-115)
183191
copy(header, desc)
184192

185193
// Determine byte order from endian indicator
194+
// "IM" = little-endian, "MI" = big-endian
186195
var order binary.ByteOrder
187196
switch endian {
188-
case "MI":
189-
order = binary.LittleEndian
190197
case "IM":
198+
order = binary.LittleEndian
199+
case "MI":
191200
order = binary.BigEndian
192201
default:
193202
// For invalid endian, use little endian but write invalid indicator
@@ -205,7 +214,7 @@ func makeHeader(desc string, version uint16, endian string) []byte {
205214

206215
// BenchmarkParseHeader benchmarks header parsing performance.
207216
func BenchmarkParseHeader(b *testing.B) {
208-
header := makeHeader("MATLAB 5.0 MAT-file", 0x0100, "MI")
217+
header := makeHeader("MATLAB 5.0 MAT-file", 0x0100, "IM") // Use "IM" for little-endian
209218

210219
b.ResetTimer()
211220
for i := 0; i < b.N; i++ {

0 commit comments

Comments
 (0)