Skip to content

Commit 07f017f

Browse files
authored
docs(provide): validation and reprovide cycle visualization (#10977)
* docs: improve slow reprovide warning messages — simplify warning text and provide actionable solutions in order of preference
* feat(config): add validation for Provide.DHT settings
  - validate interval doesn't exceed DHT record validity (48h)
  - validate worker counts and other parameters are within valid ranges
  - improve slow reprovide warning messages to reference config parameter
  - add tests for all validation cases
* docs: add reprovide cycle visualization — shows traffic patterns of legacy vs sweep vs accelerated DHT
1 parent 9faefe3 commit 07f017f

File tree

6 files changed

+203
-26
lines changed

6 files changed

+203
-26
lines changed

config/provide.go

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
package config
22

33
import (
4+
"fmt"
45
"strings"
56
"time"
7+
8+
"github.com/libp2p/go-libp2p-kad-dht/amino"
69
)
710

811
const (
@@ -101,3 +104,67 @@ func ParseProvideStrategy(s string) ProvideStrategy {
101104
}
102105
return strategy
103106
}
107+
108+
// ValidateProvideConfig validates the Provide configuration according to DHT requirements.
109+
func ValidateProvideConfig(cfg *Provide) error {
110+
// Validate Provide.DHT.Interval
111+
if !cfg.DHT.Interval.IsDefault() {
112+
interval := cfg.DHT.Interval.WithDefault(DefaultProvideDHTInterval)
113+
if interval > amino.DefaultProvideValidity {
114+
return fmt.Errorf("Provide.DHT.Interval (%v) must be less than or equal to DHT provider record validity (%v)", interval, amino.DefaultProvideValidity)
115+
}
116+
if interval < 0 {
117+
return fmt.Errorf("Provide.DHT.Interval must be non-negative, got %v", interval)
118+
}
119+
}
120+
121+
// Validate MaxWorkers
122+
if !cfg.DHT.MaxWorkers.IsDefault() {
123+
maxWorkers := cfg.DHT.MaxWorkers.WithDefault(DefaultProvideDHTMaxWorkers)
124+
if maxWorkers <= 0 {
125+
return fmt.Errorf("Provide.DHT.MaxWorkers must be positive, got %d", maxWorkers)
126+
}
127+
}
128+
129+
// Validate DedicatedPeriodicWorkers
130+
if !cfg.DHT.DedicatedPeriodicWorkers.IsDefault() {
131+
workers := cfg.DHT.DedicatedPeriodicWorkers.WithDefault(DefaultProvideDHTDedicatedPeriodicWorkers)
132+
if workers < 0 {
133+
return fmt.Errorf("Provide.DHT.DedicatedPeriodicWorkers must be non-negative, got %d", workers)
134+
}
135+
}
136+
137+
// Validate DedicatedBurstWorkers
138+
if !cfg.DHT.DedicatedBurstWorkers.IsDefault() {
139+
workers := cfg.DHT.DedicatedBurstWorkers.WithDefault(DefaultProvideDHTDedicatedBurstWorkers)
140+
if workers < 0 {
141+
return fmt.Errorf("Provide.DHT.DedicatedBurstWorkers must be non-negative, got %d", workers)
142+
}
143+
}
144+
145+
// Validate MaxProvideConnsPerWorker
146+
if !cfg.DHT.MaxProvideConnsPerWorker.IsDefault() {
147+
conns := cfg.DHT.MaxProvideConnsPerWorker.WithDefault(DefaultProvideDHTMaxProvideConnsPerWorker)
148+
if conns <= 0 {
149+
return fmt.Errorf("Provide.DHT.MaxProvideConnsPerWorker must be positive, got %d", conns)
150+
}
151+
}
152+
153+
// Validate KeyStoreBatchSize
154+
if !cfg.DHT.KeyStoreBatchSize.IsDefault() {
155+
batchSize := cfg.DHT.KeyStoreBatchSize.WithDefault(DefaultProvideDHTKeyStoreBatchSize)
156+
if batchSize <= 0 {
157+
return fmt.Errorf("Provide.DHT.KeyStoreBatchSize must be positive, got %d", batchSize)
158+
}
159+
}
160+
161+
// Validate OfflineDelay
162+
if !cfg.DHT.OfflineDelay.IsDefault() {
163+
delay := cfg.DHT.OfflineDelay.WithDefault(DefaultProvideDHTOfflineDelay)
164+
if delay < 0 {
165+
return fmt.Errorf("Provide.DHT.OfflineDelay must be non-negative, got %v", delay)
166+
}
167+
}
168+
169+
return nil
170+
}

config/provide_test.go

Lines changed: 81 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
package config
22

3-
import "testing"
3+
import (
4+
"testing"
5+
"time"
6+
7+
"github.com/stretchr/testify/assert"
8+
"github.com/stretchr/testify/require"
9+
)
410

511
func TestParseProvideStrategy(t *testing.T) {
612
tests := []struct {
@@ -25,3 +31,77 @@ func TestParseProvideStrategy(t *testing.T) {
2531
}
2632
}
2733
}
34+
35+
func TestValidateProvideConfig_Interval(t *testing.T) {
36+
tests := []struct {
37+
name string
38+
interval time.Duration
39+
wantErr bool
40+
errMsg string
41+
}{
42+
{"valid default (22h)", 22 * time.Hour, false, ""},
43+
{"valid max (48h)", 48 * time.Hour, false, ""},
44+
{"valid small (1h)", 1 * time.Hour, false, ""},
45+
{"valid zero (disabled)", 0, false, ""},
46+
{"invalid over limit (49h)", 49 * time.Hour, true, "must be less than or equal to DHT provider record validity"},
47+
{"invalid over limit (72h)", 72 * time.Hour, true, "must be less than or equal to DHT provider record validity"},
48+
{"invalid negative", -1 * time.Hour, true, "must be non-negative"},
49+
}
50+
51+
for _, tt := range tests {
52+
t.Run(tt.name, func(t *testing.T) {
53+
cfg := &Provide{
54+
DHT: ProvideDHT{
55+
Interval: NewOptionalDuration(tt.interval),
56+
},
57+
}
58+
59+
err := ValidateProvideConfig(cfg)
60+
61+
if tt.wantErr {
62+
require.Error(t, err, "expected error for interval=%v", tt.interval)
63+
if tt.errMsg != "" {
64+
assert.Contains(t, err.Error(), tt.errMsg, "error message mismatch")
65+
}
66+
} else {
67+
require.NoError(t, err, "unexpected error for interval=%v", tt.interval)
68+
}
69+
})
70+
}
71+
}
72+
73+
func TestValidateProvideConfig_MaxWorkers(t *testing.T) {
74+
tests := []struct {
75+
name string
76+
maxWorkers int64
77+
wantErr bool
78+
errMsg string
79+
}{
80+
{"valid default", 16, false, ""},
81+
{"valid high", 100, false, ""},
82+
{"valid low", 1, false, ""},
83+
{"invalid zero", 0, true, "must be positive"},
84+
{"invalid negative", -1, true, "must be positive"},
85+
}
86+
87+
for _, tt := range tests {
88+
t.Run(tt.name, func(t *testing.T) {
89+
cfg := &Provide{
90+
DHT: ProvideDHT{
91+
MaxWorkers: NewOptionalInteger(tt.maxWorkers),
92+
},
93+
}
94+
95+
err := ValidateProvideConfig(cfg)
96+
97+
if tt.wantErr {
98+
require.Error(t, err, "expected error for maxWorkers=%d", tt.maxWorkers)
99+
if tt.errMsg != "" {
100+
assert.Contains(t, err.Error(), tt.errMsg, "error message mismatch")
101+
}
102+
} else {
103+
require.NoError(t, err, "unexpected error for maxWorkers=%d", tt.maxWorkers)
104+
}
105+
})
106+
}
107+
}

core/node/groups.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,11 @@ func IPFS(ctx context.Context, bcfg *BuildCfg) fx.Option {
431431
return fx.Error(err)
432432
}
433433

434+
// Validate Provide configuration
435+
if err := config.ValidateProvideConfig(&cfg.Provide); err != nil {
436+
return fx.Error(err)
437+
}
438+
434439
// Auto-sharding settings
435440
shardingThresholdString := cfg.Import.UnixFSHAMTDirectorySizeThreshold.WithDefault(config.DefaultUnixFSHAMTDirectorySizeThreshold)
436441
shardSingThresholdInt, err := humanize.ParseBytes(shardingThresholdString)

core/node/provider.go

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -208,20 +208,20 @@ func LegacyProviderOpt(reprovideInterval time.Duration, strategy string, acceler
208208
expectedProvideSpeed := reprovideInterval / probableBigBlockstore
209209
if avgProvideSpeed > expectedProvideSpeed {
210210
logger.Errorf(`
211-
🔔🔔🔔 YOU MAY BE FALLING BEHIND DHT REPROVIDES! 🔔🔔🔔
211+
🔔🔔🔔 Reprovide Operations Too Slow 🔔🔔🔔
212212
213-
⚠️ Your system might be struggling to keep up with DHT reprovides!
214-
This means your content could be partially or completely inaccessible on the network.
215-
We observed that you recently provided %d keys at an average rate of %v per key.
213+
Your node may be falling behind on DHT reprovides, which could affect content availability.
216214
217-
🕑 An attempt to estimate your blockstore size timed out after 5 minutes,
218-
implying your blockstore might be exceedingly large. Assuming a considerable
219-
size of 10TiB, it would take %v to provide the complete set.
215+
Observed: %d keys at %v per key
216+
Estimated: Assuming 10TiB blockstore, would take %v to complete
217+
⏰ Must finish within %v (Provide.DHT.Interval)
220218
221-
⏰ The total provide time needs to stay under your reprovide interval (%v) to prevent falling behind!
219+
Solutions (try in order):
220+
1. Enable Provide.DHT.SweepEnabled=true (recommended)
221+
2. Increase Provide.DHT.MaxWorkers if needed
222+
3. Enable Routing.AcceleratedDHTClient=true (last resort, resource intensive)
222223
223-
💡 Consider enabling the Accelerated DHT to enhance your system performance. See:
224-
https://github.com/ipfs/kubo/blob/master/docs/config.md#routingaccelerateddhtclient`,
224+
Learn more: https://github.com/ipfs/kubo/blob/master/docs/config.md#provide`,
225225
keysProvided, avgProvideSpeed, avgProvideSpeed*probableBigBlockstore, reprovideInterval)
226226
return false
227227
}
@@ -237,18 +237,20 @@ https://github.com/ipfs/kubo/blob/master/docs/config.md#routingaccelerateddhtcli
237237

238238
if avgProvideSpeed > expectedProvideSpeed {
239239
logger.Errorf(`
240-
🔔🔔🔔 YOU ARE FALLING BEHIND DHT REPROVIDES! 🔔🔔🔔
240+
🔔🔔🔔 Reprovide Operations Too Slow 🔔🔔🔔
241241
242-
⚠️ Your system is struggling to keep up with DHT reprovides!
243-
This means your content could be partially or completely inaccessible on the network.
244-
We observed that you recently provided %d keys at an average rate of %v per key.
242+
Your node is falling behind on DHT reprovides, which will affect content availability.
245243
246-
💾 Your total CID count is ~%d which would total at %v reprovide process.
244+
Observed: %d keys at %v per key
245+
Confirmed: ~%d total CIDs requiring %v to complete
246+
⏰ Must finish within %v (Provide.DHT.Interval)
247247
248-
⏰ The total provide time needs to stay under your reprovide interval (%v) to prevent falling behind!
248+
Solutions (try in order):
249+
1. Enable Provide.DHT.SweepEnabled=true (recommended)
250+
2. Increase Provide.DHT.MaxWorkers if needed
251+
3. Enable Routing.AcceleratedDHTClient=true (last resort, resource intensive)
249252
250-
💡 Consider enabling the Accelerated DHT to enhance your reprovide throughput. See:
251-
https://github.com/ipfs/kubo/blob/master/docs/config.md#routingaccelerateddhtclient`,
253+
Learn more: https://github.com/ipfs/kubo/blob/master/docs/config.md#provide`,
252254
keysProvided, avgProvideSpeed, count, avgProvideSpeed*time.Duration(count), reprovideInterval)
253255
}
254256
return false

docs/changelogs/v0.38.md

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,17 +38,25 @@ Read more about the new system below.
3838

3939
#### 🧹 Experimental Sweeping DHT Provider
4040

41-
A new experimental DHT provider is available as an alternative to both the default provider and the resource-intensive [accelerated DHT client](https://github.com/ipfs/kubo/blob/master/docs/config.md#routingaccelerateddhtclient). Enable it via [`Provide.DHT.SweepEnabled`](https://github.com/ipfs/kubo/blob/master/docs/config.md#providedhtssweepenabled).
42-
43-
> [!NOTE]
44-
> This feature is experimental and opt-in. In the future, it will become the default and replace the legacy system. Some commands like `ipfs stats provide` and `ipfs routing provide` are not yet available with sweep mode. Run `ipfs provide --help` for alternatives.
41+
A new experimental DHT provider is available as an alternative to both the default provider and the resource-intensive [accelerated DHT client](https://github.com/ipfs/kubo/blob/master/docs/config.md#routingaccelerateddhtclient). Enable it via [`Provide.DHT.SweepEnabled`](https://github.com/ipfs/kubo/blob/master/docs/config.md#providedhtsweepenabled).
4542

4643
**How it works:** Instead of providing keys one-by-one, the sweep provider systematically explores DHT keyspace regions in batches.
4744

45+
> <picture>
46+
> <source media="(prefers-color-scheme: dark)" srcset="https://github.com/user-attachments/assets/f6e06b08-7fee-490c-a681-1bf440e16e27">
47+
> <source media="(prefers-color-scheme: light)" srcset="https://github.com/user-attachments/assets/e1662d7c-f1be-4275-a9ed-f2752fcdcabe">
48+
> <img alt="Reprovide Cycle Comparison" src="https://github.com/user-attachments/assets/e1662d7c-f1be-4275-a9ed-f2752fcdcabe">
49+
> </picture>
50+
>
51+
> The diagram shows how sweep mode avoids the hourly traffic spikes of Accelerated DHT while maintaining similar effectiveness. By grouping CIDs into keyspace regions and processing them in batches, sweep mode reduces memory overhead and creates predictable network patterns.
52+
4853
**Benefits for large-scale operations:** Handles hundreds of thousands of CIDs with reduced memory and network connections, spreads operations evenly to eliminate resource spikes, maintains state across restarts through persistent keystore, and provides better metrics visibility.
4954

5055
**Monitoring and debugging:** Legacy mode (`SweepEnabled=false`) tracks `provider_reprovider_provide_count` and `provider_reprovider_reprovide_count`, while sweep mode (`SweepEnabled=true`) tracks `total_provide_count_total`. Enable debug logging with `GOLOG_LOG_LEVEL=error,provider=debug,dht/provider=debug` to see detailed logs from either system.
5156

57+
> [!NOTE]
58+
> This feature is experimental and opt-in. In the future, it will become the default and replace the legacy system. Some commands like `ipfs stats provide` and `ipfs routing provide` are not yet available with sweep mode. Run `ipfs provide --help` for alternatives.
59+
5260
For configuration details, see [`Provide.DHT`](https://github.com/ipfs/kubo/blob/master/docs/config.md#providedht). For metrics documentation, see [Provide metrics](https://github.com/ipfs/kubo/blob/master/docs/metrics.md#provide).
5361

5462
#### 📊 Exposed DHT metrics

docs/config.md

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ config file at runtime.
131131
- [`Provide.DHT`](#providedht)
132132
- [`Provide.DHT.MaxWorkers`](#providedhtmaxworkers)
133133
- [`Provide.DHT.Interval`](#providedhtinterval)
134-
- [`Provide.DHT.SweepEnabled`](#providedhtssweepenabled)
134+
- [`Provide.DHT.SweepEnabled`](#providedhtsweepenabled)
135135
- [`Provide.DHT.DedicatedPeriodicWorkers`](#providedhtdedicatedperiodicworkers)
136136
- [`Provide.DHT.DedicatedBurstWorkers`](#providedhtdedicatedburstworkers)
137137
- [`Provide.DHT.MaxProvideConnsPerWorker`](#providedhtmaxprovideconnsperworker)
@@ -2026,6 +2026,21 @@ by providing it a channel of all the keys it is expected to contain according
20262026
to the [`Provide.Strategy`](#providestrategy). During this operation,
20272027
all keys in the `Keystore` are purged, and only the given ones remain scheduled.
20282028

2029+
> <picture>
2030+
> <source media="(prefers-color-scheme: dark)" srcset="https://github.com/user-attachments/assets/f6e06b08-7fee-490c-a681-1bf440e16e27">
2031+
> <source media="(prefers-color-scheme: light)" srcset="https://github.com/user-attachments/assets/e1662d7c-f1be-4275-a9ed-f2752fcdcabe">
2032+
> <img alt="Reprovide Cycle Comparison" src="https://github.com/user-attachments/assets/e1662d7c-f1be-4275-a9ed-f2752fcdcabe">
2033+
> </picture>
2034+
>
2035+
> The diagram above visualizes the performance patterns:
2036+
>
2037+
> - **Legacy mode**: Individual (slow) provides per CID, can struggle with large datasets
2038+
> - **Sweep mode**: Even distribution of provides across the reprovide interval (the keyspace sweep described above), with low resource usage
2039+
> - **Accelerated DHT**: Hourly traffic spikes with high resource usage
2040+
>
2041+
> Sweep mode provides similar effectiveness to Accelerated DHT but with steady resource usage - better for machines with limited CPU, memory, or network bandwidth.
2042+
2043+
20292044
> [!NOTE]
20302045
> This feature is opt-in for now, but will become the default in a future release.
20312046
> Eventually, this configuration flag will be removed once the feature is stable.
@@ -2400,8 +2415,8 @@ When it is enabled:
24002415
- The provider will now use a keyspace sweeping mode allowing to keep alive
24012416
CID sets that are multiple orders of magnitude larger.
24022417
- **Note:** For improved provide/reprovide operations specifically, consider using
2403-
[`Provide.DHT.SweepEnabled`](#providedhtssweepenabled) instead, which offers similar
2404-
benefits with lower resource consumption.
2418+
[`Provide.DHT.SweepEnabled`](#providedhtsweepenabled) instead, which offers similar
2419+
benefits without the hourly traffic spikes.
24052420
- The standard Bucket-Routing-Table DHT will still run for the DHT server (if
24062421
the DHT server is enabled). This means the classical routing table will
24072422
still be used to answer other nodes.

0 commit comments

Comments
 (0)