Skip to content

Commit 07f017f

Browse files
authored
docs(provide): validation and reprovide cycle visualization (#10977)
* docs: improve slow reprovide warning messages — simplify warning text and provide actionable solutions in order of preference
* feat(config): add validation for Provide.DHT settings
  - validate interval doesn't exceed DHT record validity (48h)
  - validate worker counts and other parameters are within valid ranges
  - improve slow reprovide warning messages to reference config parameter
  - add tests for all validation cases
* docs: add reprovide cycle visualization — shows traffic patterns of legacy vs sweep vs accelerated DHT
1 parent 9faefe3 commit 07f017f

File tree

6 files changed

+203
-26
lines changed

6 files changed

+203
-26
lines changed

config/provide.go

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
package config
22

33
import (
4+
"fmt"
45
"strings"
56
"time"
7+
8+
"github.com/libp2p/go-libp2p-kad-dht/amino"
69
)
710

811
const (
@@ -101,3 +104,67 @@ func ParseProvideStrategy(s string) ProvideStrategy {
101104
}
102105
return strategy
103106
}
107+
108+
// ValidateProvideConfig validates the Provide configuration according to DHT requirements.
109+
func ValidateProvideConfig(cfg *Provide) error {
110+
// Validate Provide.DHT.Interval
111+
if !cfg.DHT.Interval.IsDefault() {
112+
interval := cfg.DHT.Interval.WithDefault(DefaultProvideDHTInterval)
113+
if interval > amino.DefaultProvideValidity {
114+
return fmt.Errorf("Provide.DHT.Interval (%v) must be less than or equal to DHT provider record validity (%v)", interval, amino.DefaultProvideValidity)
115+
}
116+
if interval < 0 {
117+
return fmt.Errorf("Provide.DHT.Interval must be non-negative, got %v", interval)
118+
}
119+
}
120+
121+
// Validate MaxWorkers
122+
if !cfg.DHT.MaxWorkers.IsDefault() {
123+
maxWorkers := cfg.DHT.MaxWorkers.WithDefault(DefaultProvideDHTMaxWorkers)
124+
if maxWorkers <= 0 {
125+
return fmt.Errorf("Provide.DHT.MaxWorkers must be positive, got %d", maxWorkers)
126+
}
127+
}
128+
129+
// Validate DedicatedPeriodicWorkers
130+
if !cfg.DHT.DedicatedPeriodicWorkers.IsDefault() {
131+
workers := cfg.DHT.DedicatedPeriodicWorkers.WithDefault(DefaultProvideDHTDedicatedPeriodicWorkers)
132+
if workers < 0 {
133+
return fmt.Errorf("Provide.DHT.DedicatedPeriodicWorkers must be non-negative, got %d", workers)
134+
}
135+
}
136+
137+
// Validate DedicatedBurstWorkers
138+
if !cfg.DHT.DedicatedBurstWorkers.IsDefault() {
139+
workers := cfg.DHT.DedicatedBurstWorkers.WithDefault(DefaultProvideDHTDedicatedBurstWorkers)
140+
if workers < 0 {
141+
return fmt.Errorf("Provide.DHT.DedicatedBurstWorkers must be non-negative, got %d", workers)
142+
}
143+
}
144+
145+
// Validate MaxProvideConnsPerWorker
146+
if !cfg.DHT.MaxProvideConnsPerWorker.IsDefault() {
147+
conns := cfg.DHT.MaxProvideConnsPerWorker.WithDefault(DefaultProvideDHTMaxProvideConnsPerWorker)
148+
if conns <= 0 {
149+
return fmt.Errorf("Provide.DHT.MaxProvideConnsPerWorker must be positive, got %d", conns)
150+
}
151+
}
152+
153+
// Validate KeyStoreBatchSize
154+
if !cfg.DHT.KeyStoreBatchSize.IsDefault() {
155+
batchSize := cfg.DHT.KeyStoreBatchSize.WithDefault(DefaultProvideDHTKeyStoreBatchSize)
156+
if batchSize <= 0 {
157+
return fmt.Errorf("Provide.DHT.KeyStoreBatchSize must be positive, got %d", batchSize)
158+
}
159+
}
160+
161+
// Validate OfflineDelay
162+
if !cfg.DHT.OfflineDelay.IsDefault() {
163+
delay := cfg.DHT.OfflineDelay.WithDefault(DefaultProvideDHTOfflineDelay)
164+
if delay < 0 {
165+
return fmt.Errorf("Provide.DHT.OfflineDelay must be non-negative, got %v", delay)
166+
}
167+
}
168+
169+
return nil
170+
}

config/provide_test.go

Lines changed: 81 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
package config
22

3-
import "testing"
3+
import (
4+
"testing"
5+
"time"
6+
7+
"github.com/stretchr/testify/assert"
8+
"github.com/stretchr/testify/require"
9+
)
410

511
func TestParseProvideStrategy(t *testing.T) {
612
tests := []struct {
@@ -25,3 +31,77 @@ func TestParseProvideStrategy(t *testing.T) {
2531
}
2632
}
2733
}
34+
35+
func TestValidateProvideConfig_Interval(t *testing.T) {
36+
tests := []struct {
37+
name string
38+
interval time.Duration
39+
wantErr bool
40+
errMsg string
41+
}{
42+
{"valid default (22h)", 22 * time.Hour, false, ""},
43+
{"valid max (48h)", 48 * time.Hour, false, ""},
44+
{"valid small (1h)", 1 * time.Hour, false, ""},
45+
{"valid zero (disabled)", 0, false, ""},
46+
{"invalid over limit (49h)", 49 * time.Hour, true, "must be less than or equal to DHT provider record validity"},
47+
{"invalid over limit (72h)", 72 * time.Hour, true, "must be less than or equal to DHT provider record validity"},
48+
{"invalid negative", -1 * time.Hour, true, "must be non-negative"},
49+
}
50+
51+
for _, tt := range tests {
52+
t.Run(tt.name, func(t *testing.T) {
53+
cfg := &Provide{
54+
DHT: ProvideDHT{
55+
Interval: NewOptionalDuration(tt.interval),
56+
},
57+
}
58+
59+
err := ValidateProvideConfig(cfg)
60+
61+
if tt.wantErr {
62+
require.Error(t, err, "expected error for interval=%v", tt.interval)
63+
if tt.errMsg != "" {
64+
assert.Contains(t, err.Error(), tt.errMsg, "error message mismatch")
65+
}
66+
} else {
67+
require.NoError(t, err, "unexpected error for interval=%v", tt.interval)
68+
}
69+
})
70+
}
71+
}
72+
73+
func TestValidateProvideConfig_MaxWorkers(t *testing.T) {
74+
tests := []struct {
75+
name string
76+
maxWorkers int64
77+
wantErr bool
78+
errMsg string
79+
}{
80+
{"valid default", 16, false, ""},
81+
{"valid high", 100, false, ""},
82+
{"valid low", 1, false, ""},
83+
{"invalid zero", 0, true, "must be positive"},
84+
{"invalid negative", -1, true, "must be positive"},
85+
}
86+
87+
for _, tt := range tests {
88+
t.Run(tt.name, func(t *testing.T) {
89+
cfg := &Provide{
90+
DHT: ProvideDHT{
91+
MaxWorkers: NewOptionalInteger(tt.maxWorkers),
92+
},
93+
}
94+
95+
err := ValidateProvideConfig(cfg)
96+
97+
if tt.wantErr {
98+
require.Error(t, err, "expected error for maxWorkers=%d", tt.maxWorkers)
99+
if tt.errMsg != "" {
100+
assert.Contains(t, err.Error(), tt.errMsg, "error message mismatch")
101+
}
102+
} else {
103+
require.NoError(t, err, "unexpected error for maxWorkers=%d", tt.maxWorkers)
104+
}
105+
})
106+
}
107+
}

core/node/groups.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,11 @@ func IPFS(ctx context.Context, bcfg *BuildCfg) fx.Option {
431431
return fx.Error(err)
432432
}
433433

434+
// Validate Provide configuration
435+
if err := config.ValidateProvideConfig(&cfg.Provide); err != nil {
436+
return fx.Error(err)
437+
}
438+
434439
// Auto-sharding settings
435440
shardingThresholdString := cfg.Import.UnixFSHAMTDirectorySizeThreshold.WithDefault(config.DefaultUnixFSHAMTDirectorySizeThreshold)
436441
shardSingThresholdInt, err := humanize.ParseBytes(shardingThresholdString)

core/node/provider.go

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -208,20 +208,20 @@ func LegacyProviderOpt(reprovideInterval time.Duration, strategy string, acceler
208208
expectedProvideSpeed := reprovideInterval / probableBigBlockstore
209209
if avgProvideSpeed > expectedProvideSpeed {
210210
logger.Errorf(`
211-
🔔🔔🔔 YOU MAY BE FALLING BEHIND DHT REPROVIDES! 🔔🔔🔔
211+
🔔🔔🔔 Reprovide Operations Too Slow 🔔🔔🔔
212212
213-
⚠️ Your system might be struggling to keep up with DHT reprovides!
214-
This means your content could be partially or completely inaccessible on the network.
215-
We observed that you recently provided %d keys at an average rate of %v per key.
213+
Your node may be falling behind on DHT reprovides, which could affect content availability.
216214
217-
🕑 An attempt to estimate your blockstore size timed out after 5 minutes,
218-
implying your blockstore might be exceedingly large. Assuming a considerable
219-
size of 10TiB, it would take %v to provide the complete set.
215+
Observed: %d keys at %v per key
216+
Estimated: Assuming 10TiB blockstore, would take %v to complete
217+
⏰ Must finish within %v (Provide.DHT.Interval)
220218
221-
⏰ The total provide time needs to stay under your reprovide interval (%v) to prevent falling behind!
219+
Solutions (try in order):
220+
1. Enable Provide.DHT.SweepEnabled=true (recommended)
221+
2. Increase Provide.DHT.MaxWorkers if needed
222+
3. Enable Routing.AcceleratedDHTClient=true (last resort, resource intensive)
222223
223-
💡 Consider enabling the Accelerated DHT to enhance your system performance. See:
224-
https://github.com/ipfs/kubo/blob/master/docs/config.md#routingaccelerateddhtclient`,
224+
Learn more: https://github.com/ipfs/kubo/blob/master/docs/config.md#provide`,
225225
keysProvided, avgProvideSpeed, avgProvideSpeed*probableBigBlockstore, reprovideInterval)
226226
return false
227227
}
@@ -237,18 +237,20 @@ https://github.com/ipfs/kubo/blob/master/docs/config.md#routingaccelerateddhtcli
237237

238238
if avgProvideSpeed > expectedProvideSpeed {
239239
logger.Errorf(`
240-
🔔🔔🔔 YOU ARE FALLING BEHIND DHT REPROVIDES! 🔔🔔🔔
240+
🔔🔔🔔 Reprovide Operations Too Slow 🔔🔔🔔
241241
242-
⚠️ Your system is struggling to keep up with DHT reprovides!
243-
This means your content could be partially or completely inaccessible on the network.
244-
We observed that you recently provided %d keys at an average rate of %v per key.
242+
Your node is falling behind on DHT reprovides, which will affect content availability.
245243
246-
💾 Your total CID count is ~%d which would total at %v reprovide process.
244+
Observed: %d keys at %v per key
245+
Confirmed: ~%d total CIDs requiring %v to complete
246+
⏰ Must finish within %v (Provide.DHT.Interval)
247247
248-
⏰ The total provide time needs to stay under your reprovide interval (%v) to prevent falling behind!
248+
Solutions (try in order):
249+
1. Enable Provide.DHT.SweepEnabled=true (recommended)
250+
2. Increase Provide.DHT.MaxWorkers if needed
251+
3. Enable Routing.AcceleratedDHTClient=true (last resort, resource intensive)
249252
250-
💡 Consider enabling the Accelerated DHT to enhance your reprovide throughput. See:
251-
https://github.com/ipfs/kubo/blob/master/docs/config.md#routingaccelerateddhtclient`,
253+
Learn more: https://github.com/ipfs/kubo/blob/master/docs/config.md#provide`,
252254
keysProvided, avgProvideSpeed, count, avgProvideSpeed*time.Duration(count), reprovideInterval)
253255
}
254256
return false

docs/changelogs/v0.38.md

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,17 +38,25 @@ Read more about the new system below.
3838

3939
#### 🧹 Experimental Sweeping DHT Provider
4040

41-
A new experimental DHT provider is available as an alternative to both the default provider and the resource-intensive [accelerated DHT client](https://github.com/ipfs/kubo/blob/master/docs/config.md#routingaccelerateddhtclient). Enable it via [`Provide.DHT.SweepEnabled`](https://github.com/ipfs/kubo/blob/master/docs/config.md#providedhtssweepenabled).
42-
43-
> [!NOTE]
44-
> This feature is experimental and opt-in. In the future, it will become the default and replace the legacy system. Some commands like `ipfs stats provide` and `ipfs routing provide` are not yet available with sweep mode. Run `ipfs provide --help` for alternatives.
41+
A new experimental DHT provider is available as an alternative to both the default provider and the resource-intensive [accelerated DHT client](https://github.com/ipfs/kubo/blob/master/docs/config.md#routingaccelerateddhtclient). Enable it via [`Provide.DHT.SweepEnabled`](https://github.com/ipfs/kubo/blob/master/docs/config.md#providedhtsweepenabled).
4542

4643
**How it works:** Instead of providing keys one-by-one, the sweep provider systematically explores DHT keyspace regions in batches.
4744

45+
> <picture>
46+
> <source media="(prefers-color-scheme: dark)" srcset="https://github.com/user-attachments/assets/f6e06b08-7fee-490c-a681-1bf440e16e27">
47+
> <source media="(prefers-color-scheme: light)" srcset="https://github.com/user-attachments/assets/e1662d7c-f1be-4275-a9ed-f2752fcdcabe">
48+
> <img alt="Reprovide Cycle Comparison" src="https://github.com/user-attachments/assets/e1662d7c-f1be-4275-a9ed-f2752fcdcabe">
49+
> </picture>
50+
>
51+
> The diagram shows how sweep mode avoids the hourly traffic spikes of Accelerated DHT while maintaining similar effectiveness. By grouping CIDs into keyspace regions and processing them in batches, sweep mode reduces memory overhead and creates predictable network patterns.
52+
4853
**Benefits for large-scale operations:** Handles hundreds of thousands of CIDs with reduced memory and network connections, spreads operations evenly to eliminate resource spikes, maintains state across restarts through persistent keystore, and provides better metrics visibility.
4954

5055
**Monitoring and debugging:** Legacy mode (`SweepEnabled=false`) tracks `provider_reprovider_provide_count` and `provider_reprovider_reprovide_count`, while sweep mode (`SweepEnabled=true`) tracks `total_provide_count_total`. Enable debug logging with `GOLOG_LOG_LEVEL=error,provider=debug,dht/provider=debug` to see detailed logs from either system.
5156

57+
> [!NOTE]
58+
> This feature is experimental and opt-in. In the future, it will become the default and replace the legacy system. Some commands like `ipfs stats provide` and `ipfs routing provide` are not yet available with sweep mode. Run `ipfs provide --help` for alternatives.
59+
5260
For configuration details, see [`Provide.DHT`](https://github.com/ipfs/kubo/blob/master/docs/config.md#providedht). For metrics documentation, see [Provide metrics](https://github.com/ipfs/kubo/blob/master/docs/metrics.md#provide).
5361

5462
#### 📊 Exposed DHT metrics

docs/config.md

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ config file at runtime.
131131
- [`Provide.DHT`](#providedht)
132132
- [`Provide.DHT.MaxWorkers`](#providedhtmaxworkers)
133133
- [`Provide.DHT.Interval`](#providedhtinterval)
134-
- [`Provide.DHT.SweepEnabled`](#providedhtssweepenabled)
134+
- [`Provide.DHT.SweepEnabled`](#providedhtsweepenabled)
135135
- [`Provide.DHT.DedicatedPeriodicWorkers`](#providedhtdedicatedperiodicworkers)
136136
- [`Provide.DHT.DedicatedBurstWorkers`](#providedhtdedicatedburstworkers)
137137
- [`Provide.DHT.MaxProvideConnsPerWorker`](#providedhtmaxprovideconnsperworker)
@@ -2026,6 +2026,21 @@ by providing it a channel of all the keys it is expected to contain according
20262026
to the [`Provide.Strategy`](#providestrategy). During this operation,
20272027
all keys in the `Keystore` are purged, and only the given ones remain scheduled.
20282028

2029+
> <picture>
2030+
> <source media="(prefers-color-scheme: dark)" srcset="https://github.com/user-attachments/assets/f6e06b08-7fee-490c-a681-1bf440e16e27">
2031+
> <source media="(prefers-color-scheme: light)" srcset="https://github.com/user-attachments/assets/e1662d7c-f1be-4275-a9ed-f2752fcdcabe">
2032+
> <img alt="Reprovide Cycle Comparison" src="https://github.com/user-attachments/assets/e1662d7c-f1be-4275-a9ed-f2752fcdcabe">
2033+
> </picture>
2034+
>
2035+
> The diagram above visualizes the performance patterns:
2036+
>
2037+
> - **Legacy mode**: Individual (slow) provides per CID, can struggle with large datasets
2038+
> - **Sweep mode**: Even distribution of provides across the reprovide interval (the keyspace sweep described above), with low resource usage
2039+
> - **Accelerated DHT**: Hourly traffic spikes with high resource usage
2040+
>
2041+
> Sweep mode provides similar effectiveness to Accelerated DHT but with steady resource usage - better for machines with limited CPU, memory, or network bandwidth.
2042+
2043+
20292044
> [!NOTE]
20302045
> This feature is opt-in for now, but will become the default in a future release.
20312046
> Eventually, this configuration flag will be removed once the feature is stable.
@@ -2400,8 +2415,8 @@ When it is enabled:
24002415
- The provider will now use a keyspace sweeping mode allowing to keep alive
24012416
CID sets that are multiple orders of magnitude larger.
24022417
- **Note:** For improved provide/reprovide operations specifically, consider using
2403-
[`Provide.DHT.SweepEnabled`](#providedhtssweepenabled) instead, which offers similar
2404-
benefits with lower resource consumption.
2418+
[`Provide.DHT.SweepEnabled`](#providedhtsweepenabled) instead, which offers similar
2419+
benefits without the hourly traffic spikes.
24052420
- The standard Bucket-Routing-Table DHT will still run for the DHT server (if
24062421
the DHT server is enabled). This means the classical routing table will
24072422
still be used to answer other nodes.

0 commit comments

Comments
 (0)