fix(tiff): decode UserComment EXIF tag (Bug #19) (#28)

rpuneet · claude · web-flow · commit 7ff0b3e56b25 · 2026-02-08T00:56:50.000+05:30
Previously, UserComment (tag 0x9286) was returned as raw hex bytes. Now properly decodes the 8-byte charset prefix and text content. Supported encodings: - ASCII: "ASCII\x00\x00\x00" prefix - Unicode: "UNICODE\x00" prefix (UTF-16 LE/BE) - JIS: "JIS\x00\x00\x00\x00\x00" prefix - Undefined: null prefix (treated as UTF-8) - No prefix fallback for malformed data Before: "415343494900000053686f7420..." After: "Shot during golden hour, 3-stop graduated ND filter used" Closes #19 Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/api_integration_test.go b/api_integration_test.go
@@ -283,7 +283,7 @@ func TestIntegration_CR2(t *testing.T) {
 				{Name: "Flash", Value: "Off, Did not fire"},
 				{Name: "FocalLength", Value: "85/1"},
 				// MakerNote is now parsed into Canon directory
-				{Name: "UserComment", Type: "[]byte"}, // Binary data - check presence only
+				{Name: "UserComment", Type: "string"}, // Decoded comment text (Bug #19 fix)
 				{Name: "FlashpixVersion", Value: "0100"},
 				{Name: "ColorSpace", Value: "sRGB"},
 				{Name: "PixelXDimension", Value: uint16(4992)},
diff --git a/internal/parser/tiff/ifd.go b/internal/parser/tiff/ifd.go
@@ -141,6 +141,14 @@ func (p *Parser) parseTag(r *imxbin.Reader, entry *IFDEntry, dirName string) (*p
 		value = decoded
 	}
 
+	// Decode UserComment (EXIF tag 0x9286)
+	// Format: 8-byte charset prefix + variable-length text
+	if entry.Tag == TagUserComment {
+		if data, ok := value.([]byte); ok {
+			value = decodeUserComment(data)
+		}
+	}
+
 	return &parser.Tag{
 		ID:       parser.TagID(fmt.Sprintf("%s:0x%04X", dirName, entry.Tag)),
 		Name:     tagName,
@@ -664,3 +672,163 @@ func getTagNameForDir(tag uint16, dirName string) string {
 		return getTIFFTagName(tag) // Default to TIFF tags
 	}
 }
+
+// UserComment charset prefixes (8 bytes each)
+var (
+	userCommentASCII     = []byte("ASCII\x00\x00\x00")
+	userCommentUnicode   = []byte("UNICODE\x00")
+	userCommentJIS       = []byte("JIS\x00\x00\x00\x00\x00")
+	userCommentUndefined = []byte("\x00\x00\x00\x00\x00\x00\x00\x00")
+)
+
+// decodeUserComment decodes EXIF UserComment tag (0x9286).
+// Format: 8-byte charset prefix + variable-length text.
+//
+// Charset prefixes:
+//   - "ASCII\x00\x00\x00" (8 bytes) - ASCII text
+//   - "UNICODE\x00" (8 bytes) - UTF-16 text (big or little endian)
+//   - "JIS\x00\x00\x00\x00\x00" (8 bytes) - JIS encoding
+//   - "\x00\x00\x00\x00\x00\x00\x00\x00" (8 bytes) - undefined (often UTF-8)
+//
+// Returns decoded string or original bytes if decoding fails.
+func decodeUserComment(data []byte) interface{} {
+	const prefixLen = 8
+
+	// Need at least the charset prefix
+	if len(data) < prefixLen {
+		return data
+	}
+
+	prefix := data[:prefixLen]
+	content := data[prefixLen:]
+
+	// Empty content
+	if len(content) == 0 {
+		return ""
+	}
+
+	switch {
+	case bytes.Equal(prefix, userCommentASCII):
+		// Trim trailing nulls for ASCII
+		return string(bytes.TrimRight(content, "\x00"))
+
+	case bytes.Equal(prefix, userCommentUnicode):
+		// UTF-16 - don't trim nulls before decoding (nulls are part of UTF-16)
+		// decodeUTF16 handles null termination internally
+		return decodeUTF16(content)
+
+	case bytes.Equal(prefix, userCommentJIS):
+		// JIS encoding - return as-is for now (rarely used)
+		// Full JIS decoding would require additional dependencies
+		return string(bytes.TrimRight(content, "\x00"))
+
+	case bytes.Equal(prefix, userCommentUndefined):
+		// Undefined charset - often UTF-8 in practice
+		return string(bytes.TrimRight(content, "\x00"))
+
+	default:
+		// Unknown prefix - check if it looks like text without prefix
+		// Some cameras omit the charset prefix entirely
+		if isValidUTF8(data) {
+			return string(bytes.TrimRight(data, "\x00"))
+		}
+		return data
+	}
+}
+
+// decodeUTF16 decodes UTF-16 encoded data to a UTF-8 string.
+// Detects byte order from BOM if present, otherwise assumes little-endian.
+func decodeUTF16(data []byte) string {
+	if len(data) < 2 {
+		return ""
+	}
+
+	// Check for BOM
+	var littleEndian bool
+	start := 0
+	if data[0] == 0xFF && data[1] == 0xFE {
+		littleEndian = true
+		start = 2
+	} else if data[0] == 0xFE && data[1] == 0xFF {
+		littleEndian = false
+		start = 2
+	} else {
+		// No BOM - assume little-endian (most common on Windows)
+		littleEndian = true
+	}
+
+	// Decode UTF-16 to UTF-8
+	data = data[start:]
+	if len(data)%2 != 0 {
+		data = data[:len(data)-1] // Truncate odd byte
+	}
+
+	var result []rune
+	for i := 0; i < len(data); i += 2 {
+		var r uint16
+		if littleEndian {
+			r = uint16(data[i]) | uint16(data[i+1])<<8
+		} else {
+			r = uint16(data[i])<<8 | uint16(data[i+1])
+		}
+
+		// Handle surrogate pairs for characters outside BMP
+		if r >= 0xD800 && r <= 0xDBFF && i+2 < len(data) {
+			var r2 uint16
+			if littleEndian {
+				r2 = uint16(data[i+2]) | uint16(data[i+3])<<8
+			} else {
+				r2 = uint16(data[i+2])<<8 | uint16(data[i+3])
+			}
+			if r2 >= 0xDC00 && r2 <= 0xDFFF {
+				// Valid surrogate pair
+				codePoint := 0x10000 + (int(r-0xD800) << 10) + int(r2-0xDC00)
+				result = append(result, rune(codePoint))
+				i += 2
+				continue
+			}
+		}
+
+		if r == 0 {
+			break // Null terminator
+		}
+		result = append(result, rune(r))
+	}
+
+	return string(result)
+}
+
+// isValidUTF8 checks if data is valid UTF-8 text (printable or whitespace).
+func isValidUTF8(data []byte) bool {
+	if len(data) == 0 {
+		return false
+	}
+
+	// Trim trailing nulls first - they're common in fixed-size EXIF fields
+	trimmed := bytes.TrimRight(data, "\x00")
+	if len(trimmed) == 0 {
+		return false
+	}
+
+	// Quick check: count embedded nulls (after trimming trailing ones)
+	nullCount := 0
+	for _, b := range trimmed {
+		if b == 0 {
+			nullCount++
+		}
+	}
+	// If more than 20% embedded nulls, probably binary
+	if nullCount > len(trimmed)/5 {
+		return false
+	}
+
+	// Check if valid UTF-8
+	s := string(trimmed)
+	for _, r := range s {
+		// Allow printable characters and common whitespace
+		if r == '\uFFFD' { // Unicode replacement character = invalid UTF-8
+			return false
+		}
+	}
+	return true
+}
diff --git a/internal/parser/tiff/ifd_test.go b/internal/parser/tiff/ifd_test.go
@@ -1421,3 +1421,203 @@ func TestGetTagNameForDir(t *testing.T) {
 		}
 	}
 }
+
+func TestDecodeUserComment(t *testing.T) {
+	tests := []struct {
+		name    string
+		data    []byte
+		want    interface{}
+		wantStr bool // true if want is expected to be a string
+	}{
+		{
+			name:    "ASCII encoding",
+			data:    append([]byte("ASCII\x00\x00\x00"), []byte("Hello World")...),
+			want:    "Hello World",
+			wantStr: true,
+		},
+		{
+			name:    "ASCII with trailing nulls",
+			data:    append([]byte("ASCII\x00\x00\x00"), []byte("Hello\x00\x00\x00")...),
+			want:    "Hello",
+			wantStr: true,
+		},
+		{
+			name:    "undefined encoding (UTF-8)",
+			data:    append([]byte("\x00\x00\x00\x00\x00\x00\x00\x00"), []byte("Test comment")...),
+			want:    "Test comment",
+			wantStr: true,
+		},
+		{
+			name:    "empty content after prefix",
+			data:    []byte("ASCII\x00\x00\x00"),
+			want:    "",
+			wantStr: true,
+		},
+		{
+			name:    "empty content with trailing nulls",
+			data:    append([]byte("ASCII\x00\x00\x00"), []byte("\x00\x00\x00")...),
+			want:    "",
+			wantStr: true,
+		},
+		{
+			name:    "too short - less than prefix",
+			data:    []byte("ASCII"),
+			want:    []byte("ASCII"),
+			wantStr: false,
+		},
+		{
+			name:    "JIS encoding",
+			data:    append([]byte("JIS\x00\x00\x00\x00\x00"), []byte("Test")...),
+			want:    "Test",
+			wantStr: true,
+		},
+		{
+			name:    "Unicode UTF-16LE with BOM",
+			data:    append([]byte("UNICODE\x00"), append([]byte{0xFF, 0xFE}, []byte{'H', 0, 'i', 0}...)...),
+			want:    "Hi",
+			wantStr: true,
+		},
+		{
+			name:    "Unicode UTF-16BE with BOM",
+			data:    append([]byte("UNICODE\x00"), append([]byte{0xFE, 0xFF}, []byte{0, 'H', 0, 'i'}...)...),
+			want:    "Hi",
+			wantStr: true,
+		},
+		{
+			name:    "Unicode UTF-16LE without BOM",
+			data:    append([]byte("UNICODE\x00"), []byte{'T', 0, 'e', 0, 's', 0, 't', 0}...),
+			want:    "Test",
+			wantStr: true,
+		},
+		{
+			name:    "no prefix - valid UTF-8 text",
+			data:    []byte("Plain text comment"),
+			want:    "Plain text comment",
+			wantStr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := decodeUserComment(tt.data)
+
+			if tt.wantStr {
+				gotStr, ok := got.(string)
+				if !ok {
+					t.Errorf("decodeUserComment() returned %T, want string", got)
+					return
+				}
+				wantStr := tt.want.(string)
+				if gotStr != wantStr {
+					t.Errorf("decodeUserComment() = %q, want %q", gotStr, wantStr)
+				}
+			} else {
+				gotBytes, ok := got.([]byte)
+				if !ok {
+					t.Errorf("decodeUserComment() returned %T, want []byte", got)
+					return
+				}
+				wantBytes := tt.want.([]byte)
+				if !bytes.Equal(gotBytes, wantBytes) {
+					t.Errorf("decodeUserComment() = %v, want %v", gotBytes, wantBytes)
+				}
+			}
+		})
+	}
+}
+
+func TestDecodeUTF16(t *testing.T) {
+	tests := []struct {
+		name string
+		data []byte
+		want string
+	}{
+		{
+			name: "empty",
+			data: []byte{},
+			want: "",
+		},
+		{
+			name: "single byte",
+			data: []byte{0x41},
+			want: "",
+		},
+		{
+			name: "simple ASCII as UTF-16LE",
+			data: []byte{'A', 0, 'B', 0, 'C', 0},
+			want: "ABC",
+		},
+		{
+			name: "with LE BOM",
+			data: []byte{0xFF, 0xFE, 'H', 0, 'i', 0},
+			want: "Hi",
+		},
+		{
+			name: "with BE BOM",
+			data: []byte{0xFE, 0xFF, 0, 'H', 0, 'i'},
+			want: "Hi",
+		},
+		{
+			name: "null terminated",
+			data: []byte{'A', 0, 0, 0},
+			want: "A",
+		},
+		{
+			name: "odd length truncated",
+			data: []byte{'A', 0, 'B'},
+			want: "A",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := decodeUTF16(tt.data)
+			if got != tt.want {
+				t.Errorf("decodeUTF16() = %q, want %q", got, tt.want)
+			}
+		})
+	}
+}
+
+func TestIsValidUTF8(t *testing.T) {
+	tests := []struct {
+		name string
+		data []byte
+		want bool
+	}{
+		{
+			name: "empty",
+			data: []byte{},
+			want: false,
+		},
+		{
+			name: "valid ASCII",
+			data: []byte("Hello World"),
+			want: true,
+		},
+		{
+			name: "valid UTF-8",
+			data: []byte("Héllo Wörld"),
+			want: true,
+		},
+		{
+			name: "too many nulls",
+			data: []byte{0, 0, 0, 0, 'A'},
+			want: false,
+		},
+		{
+			name: "trailing nulls ok",
+			data: []byte("Test\x00\x00"),
+			want: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := isValidUTF8(tt.data)
+			if got != tt.want {
+				t.Errorf("isValidUTF8() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
diff --git a/internal/parser/tiff/types.go b/internal/parser/tiff/types.go
@@ -39,6 +39,7 @@ const (
 	TagIPTC            uint16 = 0x83BB
 	TagXMP             uint16 = 0x02BC // XMLPacket (decimal 700)
 	TagMakerNote       uint16 = 0x927C
+	TagUserComment     uint16 = 0x9286
 	TagSubIFDs         uint16 = 0x014A
 	TagJPEGInterchange uint16 = 0x0201
 	TagJPEGInterLength uint16 = 0x0202