Skip to content

Commit 6e5b3a5

Browse files
authored
feat(cmd): add --dedup-stats flag to ls; show per-file dedup stats (#102)
* cmd: add --dedup-stats flag to ls; show per-file dedup stats * test(cmd): add coverage for ls helpers (chunk index, dedup stats, display) * refactor(ls): move UI helpers to internal/ls; move dedup stat computation to internal/deduplication; update tests * Add --dedup-stats flag to 'sietch ls' for per-file deduplication stats * Add --dedup-stats flag to 'sietch ls' for per-file deduplication stats
1 parent d345826 commit 6e5b3a5

4 files changed

Lines changed: 381 additions & 12 deletions

File tree

cmd/ls.go

Lines changed: 52 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@ import (
1414
"github.com/spf13/cobra"
1515

1616
"github.com/substantialcattle5/sietch/internal/config"
17+
"github.com/substantialcattle5/sietch/internal/deduplication"
1718
"github.com/substantialcattle5/sietch/internal/fs"
19+
lsui "github.com/substantialcattle5/sietch/internal/ls"
1820
"github.com/substantialcattle5/sietch/util"
1921
)
2022

@@ -64,10 +66,17 @@ Examples:
6466
long, _ := cmd.Flags().GetBool("long")
6567
showTags, _ := cmd.Flags().GetBool("tags")
6668
sortBy, _ := cmd.Flags().GetString("sort")
69+
showDedup, _ := cmd.Flags().GetBool("dedup-stats")
6770

6871
// Filter and sort files
6972
files := filterAndSortFiles(manifest.Files, filterPath, sortBy)
7073

74+
// Build chunk -> files index only if dedup stats requested
75+
var chunkRefs map[string][]string
76+
if showDedup {
77+
chunkRefs = buildChunkIndex(manifest.Files)
78+
}
79+
7180
// Display the files
7281
if len(files) == 0 {
7382
if filterPath != "" {
@@ -79,9 +88,9 @@ Examples:
7988
}
8089

8190
if long {
82-
displayLongFormat(files, showTags)
91+
displayLongFormat(files, showTags, showDedup, chunkRefs)
8392
} else {
84-
displayShortFormat(files, showTags)
93+
lsui.DisplayShortFormat(files, showTags, showDedup, chunkRefs)
8594
}
8695

8796
return nil
@@ -125,7 +134,8 @@ func filterAndSortFiles(files []config.FileManifest, filterPath, sortBy string)
125134
}
126135

127136
// Display files in long format with detailed information
128-
func displayLongFormat(files []config.FileManifest, showTags bool) {
137+
// showDedup = whether to include dedup stats; chunkRefs is map[chunkID][]filePaths
138+
func displayLongFormat(files []config.FileManifest, showTags, showDedup bool, chunkRefs map[string][]string) {
129139
// Create a tabwriter for aligned columns
130140
w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
131141
defer w.Flush()
@@ -159,20 +169,47 @@ func displayLongFormat(files []config.FileManifest, showTags bool) {
159169
len(file.Chunks),
160170
file.Destination+file.FilePath)
161171
}
172+
173+
// Dedup stats (print an indented stats line after the file line)
174+
if showDedup && chunkRefs != nil {
175+
sharedChunks, savedBytes, sharedWith := deduplication.ComputeDedupStatsForFile(file, chunkRefs)
176+
// Format saved size
177+
savedStr := util.HumanReadableSize(savedBytes)
178+
// Format shared_with string with truncation
179+
sharedWithStr := lsui.FormatSharedWith(sharedWith, 10)
180+
// Print the stats as an indented follow-up line (written through the same tabwriter w)
181+
if len(sharedWith) == 0 {
182+
fmt.Fprintf(w, "%s\t%s\t%s\t%s\n", "", "", "", "") // ensure tabwriter alignment
183+
fmt.Fprintf(w, " shared_chunks: %d\t saved: %s\n", sharedChunks, savedStr)
184+
} else {
185+
fmt.Fprintf(w, "%s\t%s\t%s\t%s\n", "", "", "", "") // alignment spacer
186+
fmt.Fprintf(w, " shared_chunks: %d\t saved: %s\t shared_with: %s\n", sharedChunks, savedStr, sharedWithStr)
187+
}
188+
}
162189
}
163190
}
164191

165-
// Display files in short format
166-
func displayShortFormat(files []config.FileManifest, showTags bool) {
167-
for _, file := range files {
168-
path := file.Destination + file.FilePath
169-
if showTags && len(file.Tags) > 0 {
170-
tags := strings.Join(file.Tags, ", ")
171-
fmt.Printf("%s [%s]\n", path, tags)
172-
} else {
173-
fmt.Println(path)
192+
// buildChunkIndex creates a mapping chunkID -> []filePaths using the manifest file list.
193+
// Uses ChunkRef.Hash as the chunk identifier.
194+
func buildChunkIndex(files []config.FileManifest) map[string][]string {
195+
chunkRefs := make(map[string][]string)
196+
for _, f := range files {
197+
fp := f.Destination + f.FilePath
198+
for _, c := range f.Chunks {
199+
// use the Hash field as the chunk identifier
200+
chunkID := c.Hash
201+
if chunkID == "" {
202+
// fallback: if Hash is empty, use EncryptedHash
203+
chunkID = c.EncryptedHash
204+
}
205+
if chunkID == "" {
206+
// skip weird entries
207+
continue
208+
}
209+
chunkRefs[chunkID] = append(chunkRefs[chunkID], fp)
174210
}
175211
}
212+
return chunkRefs
176213
}
177214

178215
func init() {
@@ -182,4 +219,7 @@ func init() {
182219
lsCmd.Flags().BoolP("long", "l", false, "Use long listing format")
183220
lsCmd.Flags().BoolP("tags", "t", false, "Show file tags")
184221
lsCmd.Flags().StringP("sort", "s", "path", "Sort by: name, size, time, path")
222+
223+
// New dedup-stats flag
224+
lsCmd.Flags().BoolP("dedup-stats", "d", false, "Show per-file deduplication statistics")
185225
}

cmd/ls_helpers_test.go

Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
package cmd
2+
3+
import (
4+
"bytes"
5+
"fmt"
6+
"io"
7+
"os"
8+
"sort"
9+
"strings"
10+
"testing"
11+
"time"
12+
13+
"github.com/substantialcattle5/sietch/internal/config"
14+
dedup "github.com/substantialcattle5/sietch/internal/deduplication"
15+
lsui "github.com/substantialcattle5/sietch/internal/ls"
16+
)
17+
18+
// captureStdout runs fn with os.Stdout redirected to a pipe and returns
// everything fn wrote to it.
//
// The pipe is drained by a goroutine while fn is running, so fn cannot block
// on a full pipe buffer no matter how much it prints. os.Stdout is restored
// and the pipe is closed from deferred calls, which therefore also run when fn
// panics or leaves the test goroutine through t.Fatalf.
func captureStdout(t *testing.T, fn func()) string {
	t.Helper()

	r, w, err := os.Pipe()
	if err != nil {
		t.Fatalf("pipe: %v", err)
	}
	defer r.Close()

	type captured struct {
		out string
		err error
	}
	// Buffered so the reader goroutine can always finish, even if nobody
	// receives because fn panicked.
	done := make(chan captured, 1)
	go func() {
		var buf bytes.Buffer
		_, copyErr := io.Copy(&buf, r)
		done <- captured{out: buf.String(), err: copyErr}
	}()

	old := os.Stdout
	os.Stdout = w
	func() {
		defer func() {
			os.Stdout = old
			// Closing the write end delivers EOF to the reader goroutine.
			// The close error is ignored: nothing useful can be done with it here.
			_ = w.Close()
		}()
		fn()
	}()

	res := <-done
	if res.err != nil {
		t.Fatalf("copy: %v", res.err)
	}
	return res.out
}
40+
41+
func TestFilterAndSortFiles_Basic(t *testing.T) {
42+
now := time.Now().UTC().Format(time.RFC3339)
43+
f1 := config.FileManifest{FilePath: "a.txt", Destination: "docs/", Size: 100, ModTime: now}
44+
f2 := config.FileManifest{FilePath: "b.txt", Destination: "docs/", Size: 200, ModTime: now}
45+
f3 := config.FileManifest{FilePath: "c.txt", Destination: "data/", Size: 50, ModTime: now}
46+
47+
files := []config.FileManifest{f1, f2, f3}
48+
49+
// sort by name
50+
out := filterAndSortFiles(files, "", "name")
51+
if out[0].FilePath != "a.txt" || out[1].FilePath != "b.txt" || out[2].FilePath != "c.txt" {
52+
t.Fatalf("unexpected order by name: %v", []string{out[0].FilePath, out[1].FilePath, out[2].FilePath})
53+
}
54+
55+
// sort by size (desc)
56+
out = filterAndSortFiles(files, "", "size")
57+
if out[0].Size < out[1].Size || out[1].Size < out[2].Size {
58+
t.Fatalf("unexpected order by size: %v", []int64{out[0].Size, out[1].Size, out[2].Size})
59+
}
60+
61+
// filter by destination prefix
62+
out = filterAndSortFiles(files, "docs/", "path")
63+
if len(out) != 2 {
64+
t.Fatalf("expected 2 files in docs/, got %d", len(out))
65+
}
66+
}
67+
68+
func TestBuildChunkIndexAndComputeDedupStats(t *testing.T) {
69+
now := time.Now().UTC().Format(time.RFC3339)
70+
71+
// file1 has chunks c1 and c2
72+
f1 := config.FileManifest{
73+
FilePath: "a.txt",
74+
Destination: "test/",
75+
Size: 1024,
76+
ModTime: now,
77+
Chunks: []config.ChunkRef{
78+
{Hash: "c1", EncryptedSize: 128},
79+
{Hash: "c2", EncryptedSize: 256},
80+
},
81+
}
82+
// file2 shares c1
83+
f2 := config.FileManifest{
84+
FilePath: "b.txt",
85+
Destination: "test/",
86+
Size: 1024,
87+
ModTime: now,
88+
Chunks: []config.ChunkRef{
89+
{Hash: "c1", EncryptedSize: 128},
90+
},
91+
}
92+
// file3 no share
93+
f3 := config.FileManifest{
94+
FilePath: "c.txt",
95+
Destination: "other/",
96+
Size: 512,
97+
ModTime: now,
98+
Chunks: []config.ChunkRef{
99+
{Hash: "c3", EncryptedSize: 64},
100+
},
101+
}
102+
103+
files := []config.FileManifest{f1, f2, f3}
104+
105+
idx := buildChunkIndex(files)
106+
107+
// verify chunk index
108+
if len(idx["c1"]) != 2 {
109+
t.Fatalf("expected c1 refs length 2, got %d", len(idx["c1"]))
110+
}
111+
if len(idx["c2"]) != 1 {
112+
t.Fatalf("expected c2 refs length 1, got %d", len(idx["c2"]))
113+
}
114+
115+
sharedChunks, savedBytes, sharedWith := dedup.ComputeDedupStatsForFile(f1, idx)
116+
if sharedChunks != 1 {
117+
t.Fatalf("expected sharedChunks 1 for f1, got %d", sharedChunks)
118+
}
119+
if savedBytes != 128 {
120+
t.Fatalf("expected savedBytes 128 for f1, got %d", savedBytes)
121+
}
122+
if len(sharedWith) != 1 {
123+
t.Fatalf("expected sharedWith length 1, got %d", len(sharedWith))
124+
}
125+
if sharedWith[0] != "test/b.txt" {
126+
t.Fatalf("expected shared with test/b.txt got %v", sharedWith)
127+
}
128+
129+
// file with no shared chunks
130+
sc, sb, sw := dedup.ComputeDedupStatsForFile(f3, idx)
131+
if sc != 0 || sb != 0 || len(sw) != 0 {
132+
t.Fatalf("expected no shared chunks for f3, got sc=%d sb=%d sw=%v", sc, sb, sw)
133+
}
134+
}
135+
136+
func TestFormatSharedWith_Truncation(t *testing.T) {
137+
list := make([]string, 0, 12)
138+
for i := 0; i < 12; i++ {
139+
// use numeric suffixes to avoid rune/int confusion
140+
list = append(list, fmt.Sprintf("file%d", i))
141+
}
142+
out := lsui.FormatSharedWith(list, 10)
143+
if !strings.Contains(out, "(+2 more)") {
144+
t.Fatalf("expected truncation info (+2 more) in '%s'", out)
145+
}
146+
}
147+
148+
func TestDisplayShortAndLongFormat_OutputContainsStats(t *testing.T) {
149+
now := time.Now().UTC().Format(time.RFC3339)
150+
151+
f1 := config.FileManifest{
152+
FilePath: "a.txt",
153+
Destination: "test/",
154+
Size: 100,
155+
ModTime: now,
156+
Chunks: []config.ChunkRef{{Hash: "c1", EncryptedSize: 128}},
157+
}
158+
f2 := config.FileManifest{
159+
FilePath: "b.txt",
160+
Destination: "test/",
161+
Size: 200,
162+
ModTime: now,
163+
Chunks: []config.ChunkRef{{Hash: "c1", EncryptedSize: 128}},
164+
}
165+
files := []config.FileManifest{f1, f2}
166+
chunkRefs := buildChunkIndex(files)
167+
168+
// short format capture
169+
outShort := captureStdout(t, func() {
170+
lsui.DisplayShortFormat(files, true, true, chunkRefs)
171+
})
172+
if !strings.Contains(outShort, "shared_chunks:") || !strings.Contains(outShort, "saved:") {
173+
t.Fatalf("short output missing dedup info: %s", outShort)
174+
}
175+
176+
// long format capture
177+
outLong := captureStdout(t, func() {
178+
displayLongFormat(files, false, true, chunkRefs)
179+
})
180+
if !strings.Contains(outLong, "SIZE") || !strings.Contains(outLong, "shared_chunks:") {
181+
t.Fatalf("long output missing dedup info: %s", outLong)
182+
}
183+
}
184+
185+
func TestBuildChunkIndex_DeterministicOrder(t *testing.T) {
186+
now := time.Now().UTC().Format(time.RFC3339)
187+
188+
f1 := config.FileManifest{
189+
FilePath: "a.txt",
190+
Destination: "x/",
191+
Size: 10,
192+
ModTime: now,
193+
Chunks: []config.ChunkRef{{Hash: "c1", EncryptedSize: 10}},
194+
}
195+
f2 := config.FileManifest{
196+
FilePath: "b.txt",
197+
Destination: "y/",
198+
Size: 20,
199+
ModTime: now,
200+
Chunks: []config.ChunkRef{{Hash: "c1", EncryptedSize: 10}},
201+
}
202+
files := []config.FileManifest{f1, f2}
203+
idx := buildChunkIndex(files)
204+
205+
// ensure entries are present
206+
if len(idx["c1"]) != 2 {
207+
t.Fatalf("expected 2 refs for c1; got %d", len(idx["c1"]))
208+
}
209+
210+
// ensure computeDedupStatsForFile sorts sharedWith deterministically
211+
_, _, sw := dedup.ComputeDedupStatsForFile(f1, idx)
212+
// sw should be sorted (we call sort.Strings), check monotonic property
213+
if !sort.StringsAreSorted(sw) {
214+
t.Fatalf("sharedWith not sorted: %v", sw)
215+
}
216+
}

internal/deduplication/util.go

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
package deduplication
2+
3+
import (
4+
"sort"
5+
6+
"github.com/substantialcattle5/sietch/internal/config"
7+
)
8+
9+
// ComputeDedupStatsForFile calculates dedup stats by consulting chunkRefs map.
10+
// Uses EncryptedSize if present, otherwise Size, otherwise falls back to default chunk size.
11+
func ComputeDedupStatsForFile(file config.FileManifest, chunkRefs map[string][]string) (sharedChunks int, savedBytes int64, sharedWith []string) {
12+
// Default chunk size assumption (matches docs): 4 MiB
13+
const defaultChunkSize int64 = 4 * 1024 * 1024
14+
15+
sharedWithSet := make(map[string]struct{})
16+
filePath := file.Destination + file.FilePath
17+
18+
for _, c := range file.Chunks {
19+
chunkID := c.Hash
20+
if chunkID == "" {
21+
chunkID = c.EncryptedHash
22+
}
23+
if chunkID == "" {
24+
continue
25+
}
26+
27+
refs, ok := chunkRefs[chunkID]
28+
if !ok {
29+
continue
30+
}
31+
if len(refs) > 1 {
32+
sharedChunks++
33+
34+
// Prefer encrypted size if available (actual stored size), fallback to plaintext size
35+
var chunkSize int64
36+
if c.EncryptedSize > 0 {
37+
chunkSize = c.EncryptedSize
38+
} else if c.Size > 0 {
39+
chunkSize = c.Size
40+
} else {
41+
chunkSize = defaultChunkSize
42+
}
43+
savedBytes += chunkSize
44+
45+
for _, other := range refs {
46+
if other == filePath {
47+
continue
48+
}
49+
sharedWithSet[other] = struct{}{}
50+
}
51+
}
52+
}
53+
54+
sharedWith = make([]string, 0, len(sharedWithSet))
55+
for s := range sharedWithSet {
56+
sharedWith = append(sharedWith, s)
57+
}
58+
// sort for deterministic output
59+
sort.Strings(sharedWith)
60+
return
61+
}

0 commit comments

Comments
 (0)