Skip to content

Commit 7ff0b3e

Browse files
rpuneetclaude
andauthored
fix(tiff): decode UserComment EXIF tag (Bug #19) (#28)
Previously, UserComment (tag 0x9286) was returned as raw hex bytes. Now properly decodes the 8-byte charset prefix and text content. Supported encodings: - ASCII: "ASCII\x00\x00\x00" prefix - Unicode: "UNICODE\x00" prefix (UTF-16 LE/BE) - JIS: "JIS\x00\x00\x00\x00\x00" prefix - Undefined: null prefix (treated as UTF-8) - No prefix fallback for malformed data Before: "415343494900000053686f7420..." After: "Shot during golden hour, 3-stop graduated ND filter used" Closes #19 Co-authored-by: Claude Opus 4.5 <[email protected]>
1 parent c19be2b commit 7ff0b3e

File tree

4 files changed

+370
-1
lines changed

4 files changed

+370
-1
lines changed

api_integration_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,7 @@ func TestIntegration_CR2(t *testing.T) {
283283
{Name: "Flash", Value: "Off, Did not fire"},
284284
{Name: "FocalLength", Value: "85/1"},
285285
// MakerNote is now parsed into Canon directory
286-
{Name: "UserComment", Type: "[]byte"}, // Binary data - check presence only
286+
{Name: "UserComment", Type: "string"}, // Decoded comment text (Bug #19 fix)
287287
{Name: "FlashpixVersion", Value: "0100"},
288288
{Name: "ColorSpace", Value: "sRGB"},
289289
{Name: "PixelXDimension", Value: uint16(4992)},

internal/parser/tiff/ifd.go

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,14 @@ func (p *Parser) parseTag(r *imxbin.Reader, entry *IFDEntry, dirName string) (*p
141141
value = decoded
142142
}
143143

144+
// Decode UserComment (EXIF tag 0x9286)
145+
// Format: 8-byte charset prefix + variable-length text
146+
if entry.Tag == TagUserComment {
147+
if data, ok := value.([]byte); ok {
148+
value = decodeUserComment(data)
149+
}
150+
}
151+
144152
return &parser.Tag{
145153
ID: parser.TagID(fmt.Sprintf("%s:0x%04X", dirName, entry.Tag)),
146154
Name: tagName,
@@ -664,3 +672,163 @@ func getTagNameForDir(tag uint16, dirName string) string {
664672
return getTIFFTagName(tag) // Default to TIFF tags
665673
}
666674
}
675+
676+
// UserComment charset prefixes (8 bytes each)
677+
var (
678+
userCommentASCII = []byte("ASCII\x00\x00\x00")
679+
userCommentUnicode = []byte("UNICODE\x00")
680+
userCommentJIS = []byte("JIS\x00\x00\x00\x00\x00")
681+
userCommentUndefined = []byte("\x00\x00\x00\x00\x00\x00\x00\x00")
682+
)
683+
684+
// decodeUserComment decodes EXIF UserComment tag (0x9286).
685+
// Format: 8-byte charset prefix + variable-length text.
686+
//
687+
// Charset prefixes:
688+
// - "ASCII\x00\x00\x00" (8 bytes) - ASCII text
689+
// - "UNICODE\x00" (8 bytes) - UTF-16 text (big or little endian)
690+
// - "JIS\x00\x00\x00\x00\x00" (8 bytes) - JIS encoding
691+
// - "\x00\x00\x00\x00\x00\x00\x00\x00" (8 bytes) - undefined (often UTF-8)
692+
//
693+
// Returns decoded string or original bytes if decoding fails.
694+
func decodeUserComment(data []byte) interface{} {
695+
const prefixLen = 8
696+
697+
// Need at least the charset prefix
698+
if len(data) < prefixLen {
699+
return data
700+
}
701+
702+
prefix := data[:prefixLen]
703+
content := data[prefixLen:]
704+
705+
// Empty content
706+
if len(content) == 0 {
707+
return ""
708+
}
709+
710+
switch {
711+
case bytes.Equal(prefix, userCommentASCII):
712+
// Trim trailing nulls for ASCII
713+
return string(bytes.TrimRight(content, "\x00"))
714+
715+
case bytes.Equal(prefix, userCommentUnicode):
716+
// UTF-16 - don't trim nulls before decoding (nulls are part of UTF-16)
717+
// decodeUTF16 handles null termination internally
718+
return decodeUTF16(content)
719+
720+
case bytes.Equal(prefix, userCommentJIS):
721+
// JIS encoding - return as-is for now (rarely used)
722+
// Full JIS decoding would require additional dependencies
723+
return string(bytes.TrimRight(content, "\x00"))
724+
725+
case bytes.Equal(prefix, userCommentUndefined):
726+
// Undefined charset - often UTF-8 in practice
727+
return string(bytes.TrimRight(content, "\x00"))
728+
729+
default:
730+
// Unknown prefix - check if it looks like text without prefix
731+
// Some cameras omit the charset prefix entirely
732+
if isValidUTF8(data) {
733+
return string(bytes.TrimRight(data, "\x00"))
734+
}
735+
return data
736+
}
737+
}
738+
739+
// decodeUTF16 decodes UTF-16 encoded data to a UTF-8 string.
740+
// Detects byte order from BOM if present, otherwise assumes little-endian.
741+
func decodeUTF16(data []byte) string {
742+
if len(data) < 2 {
743+
return ""
744+
}
745+
746+
// Check for BOM
747+
var littleEndian bool
748+
start := 0
749+
if data[0] == 0xFF && data[1] == 0xFE {
750+
littleEndian = true
751+
start = 2
752+
} else if data[0] == 0xFE && data[1] == 0xFF {
753+
littleEndian = false
754+
start = 2
755+
} else {
756+
// No BOM - assume little-endian (most common on Windows)
757+
littleEndian = true
758+
}
759+
760+
// Decode UTF-16 to UTF-8
761+
data = data[start:]
762+
if len(data)%2 != 0 {
763+
data = data[:len(data)-1] // Truncate odd byte
764+
}
765+
766+
var result []rune
767+
for i := 0; i < len(data); i += 2 {
768+
var r uint16
769+
if littleEndian {
770+
r = uint16(data[i]) | uint16(data[i+1])<<8
771+
} else {
772+
r = uint16(data[i])<<8 | uint16(data[i+1])
773+
}
774+
775+
// Handle surrogate pairs for characters outside BMP
776+
if r >= 0xD800 && r <= 0xDBFF && i+2 < len(data) {
777+
var r2 uint16
778+
if littleEndian {
779+
r2 = uint16(data[i+2]) | uint16(data[i+3])<<8
780+
} else {
781+
r2 = uint16(data[i+2])<<8 | uint16(data[i+3])
782+
}
783+
if r2 >= 0xDC00 && r2 <= 0xDFFF {
784+
// Valid surrogate pair
785+
codePoint := 0x10000 + (int(r-0xD800) << 10) + int(r2-0xDC00)
786+
result = append(result, rune(codePoint))
787+
i += 2
788+
continue
789+
}
790+
}
791+
792+
if r == 0 {
793+
break // Null terminator
794+
}
795+
result = append(result, rune(r))
796+
}
797+
798+
return string(result)
799+
}
800+
801+
// isValidUTF8 checks if data is valid UTF-8 text (printable or whitespace).
802+
func isValidUTF8(data []byte) bool {
803+
if len(data) == 0 {
804+
return false
805+
}
806+
807+
// Trim trailing nulls first - they're common in fixed-size EXIF fields
808+
trimmed := bytes.TrimRight(data, "\x00")
809+
if len(trimmed) == 0 {
810+
return false
811+
}
812+
813+
// Quick check: count embedded nulls (after trimming trailing ones)
814+
nullCount := 0
815+
for _, b := range trimmed {
816+
if b == 0 {
817+
nullCount++
818+
}
819+
}
820+
// If more than 20% embedded nulls, probably binary
821+
if nullCount > len(trimmed)/5 {
822+
return false
823+
}
824+
825+
// Check if valid UTF-8
826+
s := string(trimmed)
827+
for _, r := range s {
828+
// Allow printable characters and common whitespace
829+
if r == '\uFFFD' { // Unicode replacement character = invalid UTF-8
830+
return false
831+
}
832+
}
833+
return true
834+
}

internal/parser/tiff/ifd_test.go

Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1421,3 +1421,203 @@ func TestGetTagNameForDir(t *testing.T) {
14211421
}
14221422
}
14231423
}
1424+
1425+
func TestDecodeUserComment(t *testing.T) {
1426+
tests := []struct {
1427+
name string
1428+
data []byte
1429+
want interface{}
1430+
wantStr bool // true if want is expected to be a string
1431+
}{
1432+
{
1433+
name: "ASCII encoding",
1434+
data: append([]byte("ASCII\x00\x00\x00"), []byte("Hello World")...),
1435+
want: "Hello World",
1436+
wantStr: true,
1437+
},
1438+
{
1439+
name: "ASCII with trailing nulls",
1440+
data: append([]byte("ASCII\x00\x00\x00"), []byte("Hello\x00\x00\x00")...),
1441+
want: "Hello",
1442+
wantStr: true,
1443+
},
1444+
{
1445+
name: "undefined encoding (UTF-8)",
1446+
data: append([]byte("\x00\x00\x00\x00\x00\x00\x00\x00"), []byte("Test comment")...),
1447+
want: "Test comment",
1448+
wantStr: true,
1449+
},
1450+
{
1451+
name: "empty content after prefix",
1452+
data: []byte("ASCII\x00\x00\x00"),
1453+
want: "",
1454+
wantStr: true,
1455+
},
1456+
{
1457+
name: "empty content with trailing nulls",
1458+
data: append([]byte("ASCII\x00\x00\x00"), []byte("\x00\x00\x00")...),
1459+
want: "",
1460+
wantStr: true,
1461+
},
1462+
{
1463+
name: "too short - less than prefix",
1464+
data: []byte("ASCII"),
1465+
want: []byte("ASCII"),
1466+
wantStr: false,
1467+
},
1468+
{
1469+
name: "JIS encoding",
1470+
data: append([]byte("JIS\x00\x00\x00\x00\x00"), []byte("Test")...),
1471+
want: "Test",
1472+
wantStr: true,
1473+
},
1474+
{
1475+
name: "Unicode UTF-16LE with BOM",
1476+
data: append([]byte("UNICODE\x00"), append([]byte{0xFF, 0xFE}, []byte{'H', 0, 'i', 0}...)...),
1477+
want: "Hi",
1478+
wantStr: true,
1479+
},
1480+
{
1481+
name: "Unicode UTF-16BE with BOM",
1482+
data: append([]byte("UNICODE\x00"), append([]byte{0xFE, 0xFF}, []byte{0, 'H', 0, 'i'}...)...),
1483+
want: "Hi",
1484+
wantStr: true,
1485+
},
1486+
{
1487+
name: "Unicode UTF-16LE without BOM",
1488+
data: append([]byte("UNICODE\x00"), []byte{'T', 0, 'e', 0, 's', 0, 't', 0}...),
1489+
want: "Test",
1490+
wantStr: true,
1491+
},
1492+
{
1493+
name: "no prefix - valid UTF-8 text",
1494+
data: []byte("Plain text comment"),
1495+
want: "Plain text comment",
1496+
wantStr: true,
1497+
},
1498+
}
1499+
1500+
for _, tt := range tests {
1501+
t.Run(tt.name, func(t *testing.T) {
1502+
got := decodeUserComment(tt.data)
1503+
1504+
if tt.wantStr {
1505+
gotStr, ok := got.(string)
1506+
if !ok {
1507+
t.Errorf("decodeUserComment() returned %T, want string", got)
1508+
return
1509+
}
1510+
wantStr := tt.want.(string)
1511+
if gotStr != wantStr {
1512+
t.Errorf("decodeUserComment() = %q, want %q", gotStr, wantStr)
1513+
}
1514+
} else {
1515+
gotBytes, ok := got.([]byte)
1516+
if !ok {
1517+
t.Errorf("decodeUserComment() returned %T, want []byte", got)
1518+
return
1519+
}
1520+
wantBytes := tt.want.([]byte)
1521+
if !bytes.Equal(gotBytes, wantBytes) {
1522+
t.Errorf("decodeUserComment() = %v, want %v", gotBytes, wantBytes)
1523+
}
1524+
}
1525+
})
1526+
}
1527+
}
1528+
1529+
func TestDecodeUTF16(t *testing.T) {
1530+
tests := []struct {
1531+
name string
1532+
data []byte
1533+
want string
1534+
}{
1535+
{
1536+
name: "empty",
1537+
data: []byte{},
1538+
want: "",
1539+
},
1540+
{
1541+
name: "single byte",
1542+
data: []byte{0x41},
1543+
want: "",
1544+
},
1545+
{
1546+
name: "simple ASCII as UTF-16LE",
1547+
data: []byte{'A', 0, 'B', 0, 'C', 0},
1548+
want: "ABC",
1549+
},
1550+
{
1551+
name: "with LE BOM",
1552+
data: []byte{0xFF, 0xFE, 'H', 0, 'i', 0},
1553+
want: "Hi",
1554+
},
1555+
{
1556+
name: "with BE BOM",
1557+
data: []byte{0xFE, 0xFF, 0, 'H', 0, 'i'},
1558+
want: "Hi",
1559+
},
1560+
{
1561+
name: "null terminated",
1562+
data: []byte{'A', 0, 0, 0},
1563+
want: "A",
1564+
},
1565+
{
1566+
name: "odd length truncated",
1567+
data: []byte{'A', 0, 'B'},
1568+
want: "A",
1569+
},
1570+
}
1571+
1572+
for _, tt := range tests {
1573+
t.Run(tt.name, func(t *testing.T) {
1574+
got := decodeUTF16(tt.data)
1575+
if got != tt.want {
1576+
t.Errorf("decodeUTF16() = %q, want %q", got, tt.want)
1577+
}
1578+
})
1579+
}
1580+
}
1581+
1582+
func TestIsValidUTF8(t *testing.T) {
1583+
tests := []struct {
1584+
name string
1585+
data []byte
1586+
want bool
1587+
}{
1588+
{
1589+
name: "empty",
1590+
data: []byte{},
1591+
want: false,
1592+
},
1593+
{
1594+
name: "valid ASCII",
1595+
data: []byte("Hello World"),
1596+
want: true,
1597+
},
1598+
{
1599+
name: "valid UTF-8",
1600+
data: []byte("Héllo Wörld"),
1601+
want: true,
1602+
},
1603+
{
1604+
name: "too many nulls",
1605+
data: []byte{0, 0, 0, 0, 'A'},
1606+
want: false,
1607+
},
1608+
{
1609+
name: "trailing nulls ok",
1610+
data: []byte("Test\x00\x00"),
1611+
want: true,
1612+
},
1613+
}
1614+
1615+
for _, tt := range tests {
1616+
t.Run(tt.name, func(t *testing.T) {
1617+
got := isValidUTF8(tt.data)
1618+
if got != tt.want {
1619+
t.Errorf("isValidUTF8() = %v, want %v", got, tt.want)
1620+
}
1621+
})
1622+
}
1623+
}

internal/parser/tiff/types.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ const (
3939
TagIPTC uint16 = 0x83BB
4040
TagXMP uint16 = 0x02BC // XMLPacket (decimal 700)
4141
TagMakerNote uint16 = 0x927C
42+
TagUserComment uint16 = 0x9286
4243
TagSubIFDs uint16 = 0x014A
4344
TagJPEGInterchange uint16 = 0x0201
4445
TagJPEGInterLength uint16 = 0x0202

0 commit comments

Comments
 (0)