Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
162 commits
Select commit Hold shift + click to select a range
b6b55f4
Use signal-aware context for graceful shutdown of long-running commands
wesm Feb 3, 2026
1b8e250
Refactor config package: extract defaults, fix path expansion, rename…
wesm Feb 3, 2026
5d936a4
Refactor deletion executor: extract shared helpers to reduce duplication
wesm Feb 3, 2026
d3be767
Refactor executor tests: consolidate into table-driven tests, add ass…
wesm Feb 3, 2026
a37c84f
Refactor deletion manifest: centralize status-directory mapping, use …
wesm Feb 3, 2026
96a878b
Refactor manifest tests: use go-cmp, table-driven transitions, slices…
wesm Feb 3, 2026
1aa7803
Refactor attachment export: extract helpers, return structured data, …
wesm Feb 3, 2026
b465eb3
Improve attachment export tests: add happy path coverage, verify zip …
wesm Feb 3, 2026
9df28bb
Refactor gmail API: extract sync types, segregate interfaces
wesm Feb 3, 2026
5eff819
Refactor gmail client: extract response types, use RawURLEncoding, ex…
wesm Feb 3, 2026
51f9c86
Refactor gmail client tests: use http constants, extract slice helper
wesm Feb 3, 2026
5d0294b
Refactor deletion mock: remove atomics, extract error helper, add op …
wesm Feb 3, 2026
4340e50
Refactor deletion mock tests: strengthen Reset coverage, unify Hooks,…
wesm Feb 3, 2026
05bd771
Refactor gmail mock: clarify batch nil-slot contract, add SetupMessag…
wesm Feb 3, 2026
f8d566e
Refactor rate limiter: extract reserve method, centralize costs, remo…
wesm Feb 3, 2026
afef913
Refactor rate limiter tests: event-driven mock clock, shared construc…
wesm Feb 3, 2026
51891cf
Refactor MCP handlers: extract argument helpers, separate file I/O
wesm Feb 3, 2026
1bad67d
Refactor MCP server: extract common arg helpers, add tool name constants
wesm Feb 3, 2026
19dd3f9
Refactor MCP server tests: extract response types, table-driven attac…
wesm Feb 3, 2026
d13494f
Refactor MIME parser: extract helper, hoist date formats, use regex f…
wesm Feb 3, 2026
2d3a53b
Refactor MIME parse tests: strengthen assertions, consolidate date ta…
wesm Feb 3, 2026
1606eac
Refactor OAuth: fix global ServeMux, consolidate constructors, extrac…
wesm Feb 3, 2026
02664c0
Refactor OAuth tests: extract setup/token helpers, table-drive metada…
wesm Feb 3, 2026
5cbabe7
Refactor DuckDB aggregation: extract view defs, consolidate 7 methods…
wesm Feb 3, 2026
1e6b1fa
Refactor DuckDB tests: generic assertSetEqual, replace inline SQL wit…
wesm Feb 3, 2026
0ef09e4
Consolidate 7 AggregateBy* methods into single Aggregate(groupBy, opts)
wesm Feb 3, 2026
7ea8435
Refactor MessageFilter: replace MatchEmpty* bools with EmptyValueTarg…
wesm Feb 3, 2026
27f7452
Refactor SQLite aggregates: replace 7 methods + SubAggregate switch w…
wesm Feb 3, 2026
7ee78fb
Refactor SQLite aggregate tests: standardize assertions, consolidate …
wesm Feb 3, 2026
c5a3d6b
Refactor SQLite CRUD tests: consolidate filters into table-driven tes…
wesm Feb 3, 2026
f4735c9
Refactor SQLite search tests: consolidate filters into table-driven t…
wesm Feb 3, 2026
4b2214a
Refactor query test helpers: dynamic IDs, consolidate setup, map-base…
wesm Feb 3, 2026
ac3f730
Refactor test fixtures: extract toSQL, generic joinRows, decompose Build
wesm Feb 3, 2026
4b8ee1e
Refactor validation tests: extract MockTB to tbmock package, consolid…
wesm Feb 3, 2026
cd4aa01
Refactor assertQueryEqual: replace manual field checks with go-cmp
wesm Feb 3, 2026
fa80387
Refactor search parser: inject time dependency, strategy pattern for …
wesm Feb 3, 2026
abf20e8
Refactor TestParse: group test cases by feature into named sub-tests
wesm Feb 3, 2026
4b35369
Refactor store: transactional safety, batch inserts, extract helpers
wesm Feb 3, 2026
995f3af
Harden store error handling: fix silent failures in GetStats and Init…
wesm Feb 3, 2026
7278182
Refactor store tests: extract DB inspection helpers, standardize fixt…
wesm Feb 3, 2026
181d8ad
Refactor sync.go: extract row scanners, centralize time parsing const…
wesm Feb 3, 2026
e41954f
Refactor encoding tests: deterministic assertions, inline table-drive…
wesm Feb 3, 2026
feab5ee
Refactor sync fixtures: convert global vars to functions for test iso…
wesm Feb 3, 2026
1f0d440
Refactor incremental.go: extract event processors, convert helper to …
wesm Feb 3, 2026
4c2d27f
Refactor sync.go: extract encoding utils, batch processing, sync state
wesm Feb 3, 2026
0a19403
Refactor sync tests: table-driven message variations, extract mock as…
wesm Feb 3, 2026
9433c94
Refactor sync tests: extract store queries to inspection methods
wesm Feb 3, 2026
1e5f881
Refactor test helpers: consolidate with generics
wesm Feb 3, 2026
bbc8d58
Refactor test builders: improve API consistency and ergonomics
wesm Feb 3, 2026
54da291
Refactor EncodedSamples: use reflection for robust cloning
wesm Feb 3, 2026
925e59a
Refactor encoding test: add safety check and modernize slice copy
wesm Feb 3, 2026
332fbb3
Refactor testutil: split monolithic file and use t.TempDir()
wesm Feb 3, 2026
42eaf3b
Refactor testutil_test: encapsulate shared data and use stronger asse…
wesm Feb 3, 2026
a231cba
Refactor StageForDeletion: introduce parameter object and extract hel…
wesm Feb 3, 2026
d7d2749
Refactor actions_test: encapsulate helpers and add AssertStringSet
wesm Feb 3, 2026
b11b8f7
Refactor keys.go: extract helpers and decompose modal handling
wesm Feb 3, 2026
bb56301
Refactor model.go: extract safeCmd helper and decompose Update method
wesm Feb 3, 2026
0660e4f
Add comprehensive unit tests for TUI Model state machine
wesm Feb 3, 2026
1e967cc
Refactor nav_test.go: split into focused test files by domain
wesm Feb 3, 2026
81aa0ff
Refactor navigation.go: consolidate scroll logic and detail navigation
wesm Feb 3, 2026
4658c71
Refactor search_test.go: consolidate Tab tests and add assertion helpers
wesm Feb 3, 2026
1fd33c3
Refactor selection_test.go: consolidate deletion tests and add assert…
wesm Feb 3, 2026
8eae5bf
Refactor setup_test.go: decompose Builder and unify mock construction
wesm Feb 3, 2026
54bc1e3
Refactor view.go: extract utilities, decompose header and modal rende…
wesm Feb 3, 2026
032b962
Refactor view_render_test.go: consolidate tests and simplify ANSI val…
wesm Feb 3, 2026
5a7988a
Refactor view_test.go: consolidate helpers and remove redundant test
wesm Feb 3, 2026
5b298d4
Refactor update.go: use standard semver library and extract helpers
wesm Feb 3, 2026
76001cf
Refactor update_test.go: add AssertEqual helper and consolidate test …
wesm Feb 3, 2026
87c3b9d
Add Execute() wrapper for backwards compatibility and context cancell…
wesm Feb 3, 2026
2bdc92c
Add unit tests for config package expandPath and Load functions
wesm Feb 3, 2026
8a81e0f
Preserve batch deletion semantics: always mark as Completed
wesm Feb 3, 2026
0797a40
Add explicit status-to-directory mapping for deletion manifests
wesm Feb 3, 2026
fe4b7ad
Add list count assertions to TestManager_Transitions
wesm Feb 3, 2026
8b4a9a5
Add WriteError flag to ExportStats for proper error propagation
wesm Feb 3, 2026
29cafe1
Tolerate padded base64url in raw MIME decoding
wesm Feb 3, 2026
fee950a
Strengthen deletion mock tests with invocation verification
wesm Feb 3, 2026
50f8902
Add nil guards and tests for MockAPI.SetupMessages
wesm Feb 3, 2026
3ed3803
Add nil guard and test for RateLimiter clock invariant
wesm Feb 3, 2026
2a9c95f
Add lazy initialization for mockClock timerNotify channel
wesm Feb 3, 2026
b08c596
Add expected length parameter to assertAddress test helper
wesm Feb 3, 2026
169990c
Add unit tests for OAuth callback handler CSRF validation
wesm Feb 3, 2026
e4a27db
Add table-driven tests for DuckDB aggregation and time granularity
wesm Feb 3, 2026
30b78ca
Add unit tests for invalid ViewType values in Aggregate API
wesm Feb 3, 2026
dbceed0
Fix EmptyValueTarget to support multiple empty dimensions in drill-down
wesm Feb 3, 2026
6063bab
Add deterministic secondary sort to aggregate queries
wesm Feb 3, 2026
0ce3078
Improve test robustness in sqlite_crud_test.go
wesm Feb 3, 2026
4958c90
Improve test robustness in sqlite_search_test.go
wesm Feb 3, 2026
696125d
Replace hardcoded participant IDs with MustLookupParticipant in tests
wesm Feb 3, 2026
f079ede
Restore source ID verification in TestAddMessage_UsesFirstSource
wesm Feb 3, 2026
93404fd
Document EquateEmpty() usage rationale in assertQueryEqual
wesm Feb 3, 2026
75f62f8
Add nil guard for Parser.Now and test for Parse() wrapper
wesm Feb 3, 2026
6f60e5b
Use slice instead of map for deterministic test group ordering
wesm Feb 3, 2026
f19c773
Fix SQLite parameter limit regression in batch inserts
wesm Feb 3, 2026
ec776d6
Use errors.As for SQLite error detection instead of string matching
wesm Feb 3, 2026
cccd9b8
Standardize store tests to use storetest.Fixture
wesm Feb 3, 2026
cb97126
Return errors from timestamp parsing instead of silently failing
wesm Feb 3, 2026
b6ca0ce
Improve encoding test robustness against implementation changes
wesm Feb 3, 2026
0afc1ae
Fix potential test flakiness from nondeterministic MIME generation
wesm Feb 3, 2026
859d326
Harden textutil helpers and expand sync test coverage
wesm Feb 3, 2026
a90aff4
Keep SizeEstimate consistent when overwriting Raw in sync tests
wesm Feb 3, 2026
e7585a1
Fix error handling and driver compatibility in store inspection helpers
wesm Feb 3, 2026
09a223c
Harden reflection-based deep copy in EncodedSamples()
wesm Feb 3, 2026
79bc922
Harden validateRelativePath against Windows drive-relative paths and …
wesm Feb 3, 2026
28cb3d3
Fix AssertStringSet to properly detect missing items when got contain…
wesm Feb 3, 2026
b08bacb
Add TimeGranularityCount constant to replace magic number in view cyc…
wesm Feb 3, 2026
92d117c
Add unit tests for threadMessagesLoaded, detail search resize, and ap…
wesm Feb 3, 2026
34dc9a8
Add test for out-of-bounds detail index navigation with multiple mess…
wesm Feb 3, 2026
7659fda
Strengthen search test coverage with exact assertions and cmd checks
wesm Feb 3, 2026
be799bf
Fix WithSelectedAggregatesViewType to work with ViewSenders (iota 0)
wesm Feb 3, 2026
d0e643e
Document newTestModelWithRows loading behavior after builder refactor
wesm Feb 3, 2026
562efb2
Fix export modal to show message when no attachments available
wesm Feb 3, 2026
b2dfa20
Consolidate header tests into table-driven TestHeaderDisplay
wesm Feb 3, 2026
bbdeb05
Normalize prerelease versions and add tests for update helpers
wesm Feb 3, 2026
1825ca6
Improve test isolation and reliability in root command tests
wesm Feb 3, 2026
75f8ab9
Improve config test isolation and fix expandPath double-slash handling
wesm Feb 3, 2026
216e3ca
Fix cross-platform path separator in TestDirForStatus
wesm Feb 3, 2026
58721ad
Fix ExportResultMsg.Err to only set on actual errors
wesm Feb 3, 2026
1d45643
Validate base64url padding in decodeBase64URL to reject malformed input
wesm Feb 3, 2026
93426c6
Strengthen DeletionMockAPI Reset test with pre/post assertions
wesm Feb 3, 2026
e479648
Use Fatalf for length assertion in assertAddress test helper
wesm Feb 3, 2026
2fc414f
Strengthen OAuth callback handler tests with channel assertions
wesm Feb 3, 2026
cede16e
Assert wantFormat regex in TimeGranularity test
wesm Feb 3, 2026
5ef92ad
Simplify error assertions in SQLite invalid ViewType tests
wesm Feb 3, 2026
f03b7ad
Fix MessageFilter map mutation and HasEmptyTargets false positives
wesm Feb 3, 2026
f3ef25c
Add test for deterministic aggregate ordering on ties
wesm Feb 3, 2026
4d5ca8e
Replace hardcoded participant IDs with MustLookupParticipant
wesm Feb 3, 2026
71db26c
Add test for Parser nil Now guard
wesm Feb 3, 2026
62eb7e3
Handle both value and pointer sqlite3.Error in isSQLiteError
wesm Feb 3, 2026
4c7aa63
Use testutil.NewTestStore for truly empty DB stats test
wesm Feb 3, 2026
62cb317
Expand timestamp parsing formats and add NULL validation for required…
wesm Feb 3, 2026
aba2bea
Add correctness checks and discriminating sequences to encoding tests
wesm Feb 3, 2026
6c61ee1
Add tests for InspectMessage and InspectRawDataExists error handling
wesm Feb 3, 2026
1f805ff
Harden encoding tests for future-proofing and fix fragility issues
wesm Feb 3, 2026
4e60d71
Add tests for AssertStringSet multiset semantics
wesm Feb 3, 2026
16eabda
Tighten inline search tests with exact searchRequestID assertions
wesm Feb 3, 2026
85412ca
Fix ViewSenders iota 0 sentinel collision in test model builder
wesm Feb 3, 2026
1c09b67
Fix prerelease normalization and installBinaryTo backup test
wesm Feb 3, 2026
6bd3fcb
Strengthen Execute/ExecuteContext delegation tests
wesm Feb 3, 2026
46510b5
Fix export partial success handling to show detailed results
wesm Feb 3, 2026
f1cb428
Fix mislabeled base64url test cases and add padded URL-safe coverage
wesm Feb 3, 2026
8b765d3
Add idx bounds check to assertAddress test helper
wesm Feb 3, 2026
0dd5150
Use timeout in callback handler empty-channel assertions
wesm Feb 3, 2026
3c82d4c
Simplify DuckDB invalid ViewType tests with fail-fast pattern
wesm Feb 3, 2026
f3646c9
Use test helpers in TestAggregateDeterministicOrderOnTies
wesm Feb 3, 2026
0c35c7f
Tighten TestParser_NilNow assertion window for newer_than:1d
wesm Feb 3, 2026
d11def7
Add nil guard to isSQLiteError and tests for pointer-form errors
wesm Feb 3, 2026
f2e03c0
Fix incomplete TestScanSource_UnrecognizedFormat test
wesm Feb 3, 2026
c82ca42
Use errors.Is for sql.ErrNoRows comparison in inspect test
wesm Feb 3, 2026
71adc13
Harden EncodedSamples tests with robust slice mutation
wesm Feb 3, 2026
3cac763
Replace zero-value testing.T with proper TB stub in AssertStringSet t…
wesm Feb 3, 2026
a3495c1
Add table-driven tests for normalizePrereleaseIdentifiers function
wesm Feb 3, 2026
2b32d1d
Document parallelism constraints for rootCmd tests
wesm Feb 3, 2026
4508c6a
Add assertNoSend helper with 100ms timeout for channel assertions
wesm Feb 3, 2026
1772283
Thread explicit IDs in TestAggregateDeterministicOrderOnTies
wesm Feb 3, 2026
c510616
Improve mutation test helpers to handle all reflect.Kind types
wesm Feb 3, 2026
2b0a955
Wrap real testing.TB in errRecorder to prevent nil panic
wesm Feb 3, 2026
0b17c00
Fix mutation edge cases in test helpers
wesm Feb 3, 2026
5ced5d6
Improve mutation test coverage and naming clarity
wesm Feb 3, 2026
bcfadf7
Revert encoding.go to explicit field copying, simplify tests
wesm Feb 3, 2026
2b62cb0
Add cleanup for export test artifacts
wesm Feb 3, 2026
59df7e2
Deduplicate encoding functions using textutil package
wesm Feb 3, 2026
87aced4
Fix misleading comment about unkeyed struct literals
wesm Feb 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/msgvault/cmd/addaccount.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ Example:
}

// Initialize database (in case it's new)
dbPath := cfg.DatabasePath()
dbPath := cfg.DatabaseDSN()
s, err := store.Open(dbPath)
if err != nil {
return fmt.Errorf("open database: %w", err)
Expand Down
2 changes: 1 addition & 1 deletion cmd/msgvault/cmd/build_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ The cache files are stored in ~/.msgvault/analytics/:
By default, this performs an incremental update (only adding new messages).
Use --full-rebuild to recreate all cache files from scratch.`,
RunE: func(cmd *cobra.Command, args []string) error {
dbPath := cfg.DatabasePath()
dbPath := cfg.DatabaseDSN()
analyticsDir := cfg.AnalyticsDir()

// Check database exists
Expand Down
2 changes: 1 addition & 1 deletion cmd/msgvault/cmd/deletions.go
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ Examples:
}

// Open database
dbPath := cfg.DatabasePath()
dbPath := cfg.DatabaseDSN()
s, err := store.Open(dbPath)
if err != nil {
return fmt.Errorf("open database: %w", err)
Expand Down
2 changes: 1 addition & 1 deletion cmd/msgvault/cmd/export_eml.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ Examples:
idStr := args[0]

// Open database
dbPath := cfg.DatabasePath()
dbPath := cfg.DatabaseDSN()
s, err := store.Open(dbPath)
if err != nil {
return fmt.Errorf("open database: %w", err)
Expand Down
2 changes: 1 addition & 1 deletion cmd/msgvault/cmd/initdb.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ This command creates all necessary tables for storing emails, attachments,
labels, and sync state. It is safe to run multiple times - tables are only
created if they don't already exist.`,
RunE: func(cmd *cobra.Command, args []string) error {
dbPath := cfg.DatabasePath()
dbPath := cfg.DatabaseDSN()
logger.Info("initializing database", "path", dbPath)

s, err := store.Open(dbPath)
Expand Down
4 changes: 2 additions & 2 deletions cmd/msgvault/cmd/list_domains.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ Examples:
}

// Open database
dbPath := cfg.DatabasePath()
dbPath := cfg.DatabaseDSN()
s, err := store.Open(dbPath)
if err != nil {
return fmt.Errorf("open database: %w", err)
Expand All @@ -38,7 +38,7 @@ Examples:
engine := query.NewSQLiteEngine(s.DB())

// Execute aggregation
results, err := engine.AggregateByDomain(cmd.Context(), opts)
results, err := engine.Aggregate(cmd.Context(), query.ViewDomains, opts)
if err != nil {
return fmt.Errorf("aggregate by domain: %w", err)
}
Expand Down
4 changes: 2 additions & 2 deletions cmd/msgvault/cmd/list_labels.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ Examples:
}

// Open database
dbPath := cfg.DatabasePath()
dbPath := cfg.DatabaseDSN()
s, err := store.Open(dbPath)
if err != nil {
return fmt.Errorf("open database: %w", err)
Expand All @@ -38,7 +38,7 @@ Examples:
engine := query.NewSQLiteEngine(s.DB())

// Execute aggregation
results, err := engine.AggregateByLabel(cmd.Context(), opts)
results, err := engine.Aggregate(cmd.Context(), query.ViewLabels, opts)
if err != nil {
return fmt.Errorf("aggregate by label: %w", err)
}
Expand Down
4 changes: 2 additions & 2 deletions cmd/msgvault/cmd/list_senders.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ Examples:
}

// Open database
dbPath := cfg.DatabasePath()
dbPath := cfg.DatabaseDSN()
s, err := store.Open(dbPath)
if err != nil {
return fmt.Errorf("open database: %w", err)
Expand All @@ -38,7 +38,7 @@ Examples:
engine := query.NewSQLiteEngine(s.DB())

// Execute aggregation
results, err := engine.AggregateBySender(cmd.Context(), opts)
results, err := engine.Aggregate(cmd.Context(), query.ViewSenders, opts)
if err != nil {
return fmt.Errorf("aggregate by sender: %w", err)
}
Expand Down
2 changes: 1 addition & 1 deletion cmd/msgvault/cmd/mcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ Add to Claude Desktop config:
}
}`,
RunE: func(cmd *cobra.Command, args []string) error {
dbPath := cfg.DatabasePath()
dbPath := cfg.DatabaseDSN()
s, err := store.Open(dbPath)
if err != nil {
return fmt.Errorf("open database: %w", err)
Expand Down
131 changes: 8 additions & 123 deletions cmd/msgvault/cmd/repair_encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,10 @@ import (
"strings"
"unicode/utf8"

"github.com/gogs/chardet"
"github.com/spf13/cobra"
"github.com/wesm/msgvault/internal/mime"
"github.com/wesm/msgvault/internal/store"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/japanese"
"golang.org/x/text/encoding/korean"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/encoding/traditionalchinese"
"github.com/wesm/msgvault/internal/textutil"
)

var repairEncodingCmd = &cobra.Command{
Expand All @@ -40,7 +34,7 @@ For each invalid field, it:
This is useful after a sync that may have produced invalid UTF-8 due to
charset detection issues in the MIME parser.`,
RunE: func(cmd *cobra.Command, args []string) error {
dbPath := cfg.DatabasePath()
dbPath := cfg.DatabaseDSN()
s, err := store.Open(dbPath)
if err != nil {
return fmt.Errorf("open database: %w", err)
Expand Down Expand Up @@ -239,7 +233,7 @@ func repairMessageFields(s *store.Store, stats *repairStats) error {
if parsed != nil && utf8.ValidString(parsed.Subject) {
repair.newSubject = sql.NullString{String: parsed.Subject, Valid: true}
} else {
repair.newSubject = sql.NullString{String: ensureValidUTF8(subject.String), Valid: true}
repair.newSubject = sql.NullString{String: textutil.EnsureUTF8(subject.String), Valid: true}
}
needsRepair = true
stats.subjects++
Expand All @@ -253,7 +247,7 @@ func repairMessageFields(s *store.Store, stats *repairStats) error {
if parsed != nil && utf8.ValidString(parsed.GetBodyText()) {
repair.newBody = sql.NullString{String: parsed.GetBodyText(), Valid: true}
} else {
repair.newBody = sql.NullString{String: ensureValidUTF8(bodyText.String), Valid: true}
repair.newBody = sql.NullString{String: textutil.EnsureUTF8(bodyText.String), Valid: true}
}
needsRepair = true
stats.bodyTexts++
Expand All @@ -267,15 +261,15 @@ func repairMessageFields(s *store.Store, stats *repairStats) error {
if parsed != nil && utf8.ValidString(parsed.BodyHTML) {
repair.newHTML = sql.NullString{String: parsed.BodyHTML, Valid: true}
} else {
repair.newHTML = sql.NullString{String: ensureValidUTF8(bodyHTML.String), Valid: true}
repair.newHTML = sql.NullString{String: textutil.EnsureUTF8(bodyHTML.String), Valid: true}
}
needsRepair = true
stats.bodyHTMLs++
}

// Snippet (from Gmail API, not in raw MIME)
if snippet.Valid && !utf8.ValidString(snippet.String) {
repair.newSnippet = sql.NullString{String: ensureValidUTF8(snippet.String), Valid: true}
repair.newSnippet = sql.NullString{String: textutil.EnsureUTF8(snippet.String), Valid: true}
needsRepair = true
stats.snippets++
}
Expand Down Expand Up @@ -395,7 +389,7 @@ func repairDisplayNames(s *store.Store, stats *repairStats) error {
}

if !utf8.ValidString(name) {
repairs = append(repairs, nameRepair{id: id, newName: ensureValidUTF8(name)})
repairs = append(repairs, nameRepair{id: id, newName: textutil.EnsureUTF8(name)})
stats.displayNames++

// Apply batch when full
Expand Down Expand Up @@ -526,7 +520,7 @@ func repairOtherStrings(s *store.Store, stats *repairStats) error {
}

if !utf8.ValidString(value) {
repairs = append(repairs, repair{id: id, newValue: ensureValidUTF8(value)})
repairs = append(repairs, repair{id: id, newValue: textutil.EnsureUTF8(value)})
*table.counter++

if len(repairs) >= batchSize {
Expand Down Expand Up @@ -582,22 +576,6 @@ func tryParseMIME(rawData []byte, compression sql.NullString) *mime.Message {
return parsed
}

// ensureValidUTF8 returns s as valid UTF-8.
//
// A string that is already valid is returned unchanged. Otherwise the bytes
// are handed to detectAndDecode for charset detection and conversion; if that
// reports an error, sanitizeUTF8 is used as the final fallback so the result
// is always produced.
func ensureValidUTF8(s string) string {
	if utf8.ValidString(s) {
		return s
	}

	// Prefer a real charset conversion over lossy replacement.
	if converted, convErr := detectAndDecode([]byte(s)); convErr == nil {
		return converted
	}

	// Nothing decoded cleanly: substitute the invalid bytes instead.
	return sanitizeUTF8(s)
}

// byteReader wraps a byte slice for use with zlib.NewReader
type byteReader struct {
data []byte
Expand All @@ -613,99 +591,6 @@ func (r *byteReader) Read(p []byte) (n int, err error) {
return n, nil
}

// sanitizeUTF8 returns a copy of s in which every byte that does not belong
// to a valid UTF-8 sequence is replaced by U+FFFD (the Unicode replacement
// character). Each offending byte yields its own replacement character; valid
// runes, including an already-encoded U+FFFD, are copied through unchanged.
func sanitizeUTF8(s string) string {
	var out strings.Builder
	out.Grow(len(s))

	for rest := s; len(rest) > 0; {
		r, width := utf8.DecodeRuneInString(rest)
		rest = rest[width:]

		// A RuneError of width 1 signals an undecodable byte; a genuine
		// U+FFFD in the input decodes with width 3 and is kept as-is.
		if width == 1 && r == utf8.RuneError {
			out.WriteRune('\ufffd')
			continue
		}
		out.WriteRune(r)
	}

	return out.String()
}

// detectAndDecode converts data to a UTF-8 string.
//
// Input that is already valid UTF-8 is returned as-is. Otherwise two
// strategies are tried in order:
//
//  1. For inputs longer than 20 bytes, run the chardet text detector and, when
//     it reports a confidence of at least 50 for a charset known to
//     getEncodingByName, decode with that charset.
//  2. Decode with a fixed list of common encodings, returning the first
//     result that decodes without error and is valid UTF-8.
//
// An error is returned only when neither strategy yields valid UTF-8.
func detectAndDecode(data []byte) (string, error) {
	if utf8.Valid(data) {
		return string(data), nil
	}

	// Detection is skipped for short samples (20 bytes or fewer).
	if len(data) > 20 {
		best, detectErr := chardet.NewTextDetector().DetectBest(data)
		if detectErr == nil && best.Confidence >= 50 {
			if detected := getEncodingByName(best.Charset); detected != nil {
				out, decErr := detected.NewDecoder().Bytes(data)
				if decErr == nil && utf8.Valid(out) {
					return string(out), nil
				}
			}
		}
	}

	// Fallback candidates, tried strictly in this order.
	// NOTE(review): the single-byte code pages listed first presumably accept
	// almost any byte sequence, so later entries may rarely be reached —
	// confirm against the x/text decoders before relying on the ordering.
	candidates := []encoding.Encoding{
		charmap.Windows1252,
		charmap.ISO8859_1,
		charmap.ISO8859_15,
		japanese.ShiftJIS,
		japanese.EUCJP,
		korean.EUCKR,
		simplifiedchinese.GBK,
		traditionalchinese.Big5,
	}
	for _, candidate := range candidates {
		out, decErr := candidate.NewDecoder().Bytes(data)
		if decErr != nil || !utf8.Valid(out) {
			continue
		}
		return string(out), nil
	}

	return "", fmt.Errorf("could not decode to valid UTF-8")
}

// charsetEncodings maps each charset label accepted by getEncodingByName to
// its decoder. Lookups are exact and case-sensitive, so every supported
// spelling is listed explicitly.
var charsetEncodings = map[string]encoding.Encoding{
	// Western / Central European single-byte code pages.
	"windows-1252": charmap.Windows1252,
	"CP1252":       charmap.Windows1252,
	"cp1252":       charmap.Windows1252,
	"ISO-8859-1":   charmap.ISO8859_1,
	"iso-8859-1":   charmap.ISO8859_1,
	"latin1":       charmap.ISO8859_1,
	"latin-1":      charmap.ISO8859_1,
	"ISO-8859-15":  charmap.ISO8859_15,
	"iso-8859-15":  charmap.ISO8859_15,
	"latin9":       charmap.ISO8859_15,
	"ISO-8859-2":   charmap.ISO8859_2,
	"iso-8859-2":   charmap.ISO8859_2,
	"latin2":       charmap.ISO8859_2,

	// Japanese.
	"Shift_JIS":   japanese.ShiftJIS,
	"shift_jis":   japanese.ShiftJIS,
	"shift-jis":   japanese.ShiftJIS,
	"sjis":        japanese.ShiftJIS,
	"EUC-JP":      japanese.EUCJP,
	"euc-jp":      japanese.EUCJP,
	"eucjp":       japanese.EUCJP,
	"ISO-2022-JP": japanese.ISO2022JP,
	"iso-2022-jp": japanese.ISO2022JP,

	// Korean.
	"EUC-KR": korean.EUCKR,
	"euc-kr": korean.EUCKR,
	"euckr":  korean.EUCKR,

	// Chinese.
	"GB2312":  simplifiedchinese.GBK,
	"gb2312":  simplifiedchinese.GBK,
	"GBK":     simplifiedchinese.GBK,
	"gbk":     simplifiedchinese.GBK,
	"GB18030": simplifiedchinese.GB18030,
	"gb18030": simplifiedchinese.GB18030,
	"Big5":    traditionalchinese.Big5,
	"big5":    traditionalchinese.Big5,
	"big-5":   traditionalchinese.Big5,

	// Cyrillic.
	"KOI8-R": charmap.KOI8R,
	"koi8-r": charmap.KOI8R,
	"KOI8-U": charmap.KOI8U,
	"koi8-u": charmap.KOI8U,
}

// getEncodingByName looks up the encoding registered for the given charset
// name. The match is exact (case-sensitive); nil is returned for any name
// that is not in the table.
func getEncodingByName(name string) encoding.Encoding {
	if enc, known := charsetEncodings[name]; known {
		return enc
	}
	return nil
}

// init attaches repairEncodingCmd to rootCmd via AddCommand when the package
// is initialized.
func init() {
rootCmd.AddCommand(repairEncodingCmd)
}
Loading