Skip to content

Commit 9b9734f

Browse files
wesmclaudeliuxiaopai-ai
authored
Fix invalid UTF-8 during sync and suggest repair-encoding on error (#98)
## Summary - Validate participant address names and attachment filenames with `EnsureUTF8()` before database insertion during sync, closing a gap where subject/body/snippet were already validated but address names and filenames were not - The root cause: emails with mis-labeled RFC 2047 headers (e.g. `=?UTF-8?Q?Jane_Doe=C9ric?=` where the bytes are actually Latin-1) pass through enmime's address parser with invalid UTF-8, get inserted into the `participants` table as-is, then cause DuckDB errors when exported to Parquet - When invalid string encoding is encountered in query results, show a hint suggesting `msgvault repair-encoding` to fix existing data (cherry-picked from #97) Fixes #95. Supersedes #97 ## Test plan - [x] `TestFullSync_Latin1InFromName` - Latin-1 É in From name via mis-labeled RFC 2047 - [x] `TestFullSync_InvalidUTF8InAllAddressFields` - Windows-1252 smart quotes in From/To/Cc/Bcc - [x] `TestFullSync_InvalidUTF8InAttachmentFilename` - attachment filename validation - [x] `TestFullSync_MultipleEncodingIssuesSameMessage` - mixed Latin-1 + Windows-1252 in one email - [x] `TestEncodingErrorHint` - repair-encoding hint on DuckDB encoding errors - [x] Full test suite passes - [x] Linter passes 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: liuxiaopai-ai <liuxiaopai-ai@users.noreply.github.com>
1 parent 788c093 commit 9b9734f

File tree

10 files changed

+359
-9
lines changed

10 files changed

+359
-9
lines changed

cmd/msgvault/cmd/list_domains.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ Examples:
4040
// Execute aggregation
4141
results, err := engine.Aggregate(cmd.Context(), query.ViewDomains, opts)
4242
if err != nil {
43-
return fmt.Errorf("aggregate by domain: %w", err)
43+
return query.HintRepairEncoding(fmt.Errorf("aggregate by domain: %w", err))
4444
}
4545

4646
if len(results) == 0 {

cmd/msgvault/cmd/list_labels.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ Examples:
4040
// Execute aggregation
4141
results, err := engine.Aggregate(cmd.Context(), query.ViewLabels, opts)
4242
if err != nil {
43-
return fmt.Errorf("aggregate by label: %w", err)
43+
return query.HintRepairEncoding(fmt.Errorf("aggregate by label: %w", err))
4444
}
4545

4646
if len(results) == 0 {

cmd/msgvault/cmd/list_senders.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ Examples:
4040
// Execute aggregation
4141
results, err := engine.Aggregate(cmd.Context(), query.ViewSenders, opts)
4242
if err != nil {
43-
return fmt.Errorf("aggregate by sender: %w", err)
43+
return query.HintRepairEncoding(fmt.Errorf("aggregate by sender: %w", err))
4444
}
4545

4646
if len(results) == 0 {

cmd/msgvault/cmd/search.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ Examples:
8181
results, err := engine.Search(cmd.Context(), q, searchLimit, searchOffset)
8282
fmt.Fprintf(os.Stderr, "\r \r")
8383
if err != nil {
84-
return fmt.Errorf("search: %w", err)
84+
return query.HintRepairEncoding(fmt.Errorf("search: %w", err))
8585
}
8686

8787
if len(results) == 0 {

internal/query/encoding_hint.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
package query
2+
3+
import (
4+
"fmt"
5+
"strings"
6+
)
7+
8+
// encodingErrorSubstring is the error text emitted by DuckDB when it encounters
9+
// invalid UTF-8 in a Parquet file.
10+
const encodingErrorSubstring = "Invalid string encoding found in Parquet file"
11+
12+
// IsEncodingError reports whether err contains the DuckDB invalid-string-encoding
13+
// error that can be resolved by running `msgvault repair-encoding`.
14+
func IsEncodingError(err error) bool {
15+
return err != nil && strings.Contains(err.Error(), encodingErrorSubstring)
16+
}
17+
18+
// HintRepairEncoding wraps err with a user-facing hint suggesting
19+
// `msgvault repair-encoding` when the error is an encoding error.
20+
// If err is nil or unrelated, it is returned unchanged.
21+
func HintRepairEncoding(err error) error {
22+
if !IsEncodingError(err) {
23+
return err
24+
}
25+
return fmt.Errorf("%w\nHint: try running 'msgvault repair-encoding' to fix encoding issues", err)
26+
}
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
package query
2+
3+
import (
4+
"errors"
5+
"fmt"
6+
"testing"
7+
)
8+
9+
func TestIsEncodingError(t *testing.T) {
10+
tests := []struct {
11+
name string
12+
err error
13+
want bool
14+
}{
15+
{"nil error", nil, false},
16+
{"unrelated error", errors.New("connection refused"), false},
17+
{"encoding error", errors.New("Invalid string encoding found in Parquet file"), true},
18+
{"wrapped encoding error", fmt.Errorf("aggregate query: %w", errors.New("Invalid string encoding found in Parquet file")), true},
19+
{"encoding error substring", errors.New("scan: Invalid string encoding found in Parquet file foo.parquet"), true},
20+
}
21+
for _, tt := range tests {
22+
t.Run(tt.name, func(t *testing.T) {
23+
if got := IsEncodingError(tt.err); got != tt.want {
24+
t.Errorf("IsEncodingError() = %v, want %v", got, tt.want)
25+
}
26+
})
27+
}
28+
}
29+
30+
func TestHintRepairEncoding(t *testing.T) {
31+
t.Run("nil error", func(t *testing.T) {
32+
if got := HintRepairEncoding(nil); got != nil {
33+
t.Errorf("HintRepairEncoding(nil) = %v, want nil", got)
34+
}
35+
})
36+
37+
t.Run("unrelated error passes through", func(t *testing.T) {
38+
orig := errors.New("something else")
39+
got := HintRepairEncoding(orig)
40+
if got != orig {
41+
t.Errorf("HintRepairEncoding should return original error unchanged, got %v", got)
42+
}
43+
})
44+
45+
t.Run("encoding error gets hint", func(t *testing.T) {
46+
orig := errors.New("Invalid string encoding found in Parquet file")
47+
got := HintRepairEncoding(orig)
48+
if got == nil {
49+
t.Fatal("HintRepairEncoding returned nil")
50+
}
51+
msg := got.Error()
52+
if want := "repair-encoding"; !containsSubstring(msg, want) {
53+
t.Errorf("expected hint containing %q, got: %s", want, msg)
54+
}
55+
// Original error should be preserved in the chain
56+
if !errors.Is(got, orig) {
57+
t.Error("wrapped error should preserve original via errors.Is")
58+
}
59+
})
60+
61+
t.Run("wrapped encoding error gets hint", func(t *testing.T) {
62+
inner := errors.New("Invalid string encoding found in Parquet file")
63+
wrapped := fmt.Errorf("aggregate query: %w", inner)
64+
got := HintRepairEncoding(wrapped)
65+
msg := got.Error()
66+
if want := "repair-encoding"; !containsSubstring(msg, want) {
67+
t.Errorf("expected hint containing %q, got: %s", want, msg)
68+
}
69+
})
70+
}
71+
72+
func containsSubstring(s, sub string) bool {
73+
return len(s) >= len(sub) && (s == sub || len(s) > 0 && contains(s, sub))
74+
}
75+
76+
func contains(s, sub string) bool {
77+
for i := 0; i <= len(s)-len(sub); i++ {
78+
if s[i:i+len(sub)] == sub {
79+
return true
80+
}
81+
}
82+
return false
83+
}

internal/store/inspect.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,15 @@ func (s *Store) InspectDisplayName(sourceMessageID, recipientType, email string)
139139
return displayName, err
140140
}
141141

142+
// InspectParticipantDisplayName returns the display_name from the participants table for an email.
143+
func (s *Store) InspectParticipantDisplayName(email string) (string, error) {
144+
var displayName string
145+
err := s.db.QueryRow(s.Rebind(
146+
"SELECT display_name FROM participants WHERE email_address = ?"),
147+
email).Scan(&displayName)
148+
return displayName, err
149+
}
150+
142151
// InspectDeletedFromSource checks whether a message has deleted_from_source_at set.
143152
func (s *Store) InspectDeletedFromSource(sourceMessageID string) (bool, error) {
144153
var deletedAt sql.NullTime
@@ -188,6 +197,18 @@ func (s *Store) InspectThreadSourceID(sourceMessageID string) (string, error) {
188197
return threadSourceID, err
189198
}
190199

200+
// InspectAttachment returns the filename and mime_type for the first attachment (by ID) of a message.
201+
func (s *Store) InspectAttachment(sourceMessageID string) (filename, mimeType string, err error) {
202+
err = s.db.QueryRow(s.Rebind(`
203+
SELECT a.filename, a.mime_type FROM attachments a
204+
JOIN messages m ON a.message_id = m.id
205+
WHERE m.source_message_id = ?
206+
ORDER BY a.id
207+
LIMIT 1
208+
`), sourceMessageID).Scan(&filename, &mimeType)
209+
return
210+
}
211+
191212
// InspectMessageDates returns sent_at and internal_date for a message.
192213
func (s *Store) InspectMessageDates(sourceMessageID string) (sentAt, internalDate string, err error) {
193214
err = s.db.QueryRow(s.Rebind(

internal/sync/sync.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,18 @@ func (s *Syncer) parseToModel(sourceID int64, raw *gmail.RawMessage, threadID st
438438
bodyHTML := textutil.EnsureUTF8(parsed.BodyHTML)
439439
snippet := textutil.EnsureUTF8(raw.Snippet)
440440

441+
// Ensure participant names are valid UTF-8 before database insertion
442+
ensureAddressUTF8(parsed.From)
443+
ensureAddressUTF8(parsed.To)
444+
ensureAddressUTF8(parsed.Cc)
445+
ensureAddressUTF8(parsed.Bcc)
446+
447+
// Ensure attachment filenames and content types are valid UTF-8
448+
for i := range parsed.Attachments {
449+
parsed.Attachments[i].Filename = textutil.EnsureUTF8(parsed.Attachments[i].Filename)
450+
parsed.Attachments[i].ContentType = textutil.EnsureUTF8(parsed.Attachments[i].ContentType)
451+
}
452+
441453
// Ensure participants exist in database
442454
allAddresses := append(append(append(parsed.From, parsed.To...), parsed.Cc...), parsed.Bcc...)
443455
participantMap, err := s.store.EnsureParticipantsBatch(allAddresses)
@@ -586,6 +598,13 @@ func (s *Syncer) ingestMessage(ctx context.Context, sourceID int64, raw *gmail.R
586598
return s.persistMessage(data, labelMap)
587599
}
588600

601+
// ensureAddressUTF8 validates and converts address names to valid UTF-8 in place.
602+
func ensureAddressUTF8(addrs []mime.Address) {
603+
for i := range addrs {
604+
addrs[i].Name = textutil.EnsureUTF8(addrs[i].Name)
605+
}
606+
}
607+
589608
// storeRecipients stores recipient records.
590609
func (s *Syncer) storeRecipients(messageID int64, recipientType string, addresses []mime.Address, participantMap map[string]int64) error {
591610
if len(addresses) == 0 {

0 commit comments

Comments
 (0)