From 109785cf952d7952cd63ae8dc148521b1257e1f7 Mon Sep 17 00:00:00 2001 From: Luke Van Drie Date: Thu, 4 Sep 2025 01:00:56 +0000 Subject: [PATCH 1/5] feat(flowcontrol): Refactor FlowRegistry contracts This commit refactors some of the core Flow Control contracts to improve clarity and better align with their intended roles. The goal is to create a more intuitive and robust interface for the upcoming top-level FlowController. Key changes include: - The `FlowRegistryClient` interface is renamed to `FlowRegistryDataPlane` to more accurately reflect its role in the high-throughput request path. - The `FlowRegistryAdmin` interface is renamed to `FlowRegistryObserver` to clarify its read-only, observational nature. - The `ActiveFlowConnection.Shards()` method is renamed to `ActiveFlowConnection.ActiveShards()` to make it explicit that it returns only active, schedulable shards. This removes ambiguity for the distributor logic. - `ShardStats` is enriched with `ID` and `IsActive` fields, providing consumers with more context about the shard's state at the time the snapshot was taken. - The registry implementation has been updated to match these new contract definitions. --- pkg/epp/flowcontrol/contracts/registry.go | 39 ++++++++++-------- pkg/epp/flowcontrol/registry/connection.go | 6 +-- pkg/epp/flowcontrol/registry/registry.go | 4 +- pkg/epp/flowcontrol/registry/registry_test.go | 40 +++++++++---------- pkg/epp/flowcontrol/registry/shard.go | 2 + pkg/epp/flowcontrol/registry/shard_test.go | 2 + pkg/epp/flowcontrol/types/errors.go | 6 +-- 7 files changed, 54 insertions(+), 45 deletions(-) diff --git a/pkg/epp/flowcontrol/contracts/registry.go b/pkg/epp/flowcontrol/contracts/registry.go index e2c42fdbc..fe0b790b9 100644 --- a/pkg/epp/flowcontrol/contracts/registry.go +++ b/pkg/epp/flowcontrol/contracts/registry.go @@ -22,8 +22,8 @@ import ( ) // FlowRegistry is the complete interface for the global flow control plane. -// It composes the client-facing data path interface and the administrative interface. A concrete implementation of this -// interface is the single source of truth for all flow control state. +// It composes all role-based interfaces. A concrete implementation of this interface is the single source of truth for +// all flow control state. // // # Conformance: Implementations MUST be goroutine-safe. // @@ -48,22 +48,21 @@ import ( // 2. Capacity Partitioning: Global and per-band capacity limits must be uniformly partitioned across all Active // shards. type FlowRegistry interface { - FlowRegistryClient - FlowRegistryAdmin + FlowRegistryObserver + FlowRegistryDataPlane } -// FlowRegistryAdmin defines the administrative interface for the global control plane. -type FlowRegistryAdmin interface { - // Stats returns globally aggregated statistics for the entire `FlowRegistry`. +// FlowRegistryObserver defines the read-only, observation interface for the registry. +type FlowRegistryObserver interface { + // Stats returns a near-consistent snapshot globally aggregated statistics for the entire `FlowRegistry`. Stats() AggregateStats - // ShardStats returns a slice of statistics, one for each internal shard. + // ShardStats returns a near-consistent slice of statistics snapshots, one for each `RegistryShard`. ShardStats() []ShardStats } -// FlowRegistryClient defines the primary, client-facing interface for the registry. -// This is the interface that the `controller.FlowController`'s data path depends upon. 
-type FlowRegistryClient interface { +// FlowRegistryDataPlane defines the high-throughput, request-path interface for the registry. +type FlowRegistryDataPlane interface { // WithConnection manages a scoped, leased session for a given flow. // It is the primary and sole entry point for interacting with the data path. // @@ -90,9 +89,8 @@ type FlowRegistryClient interface { // Its purpose is to ensure that any interaction with the flow's state (e.g., accessing its shards and queues) occurs // safely while the flow is guaranteed to be protected from garbage collection. type ActiveFlowConnection interface { - // Shards returns a stable snapshot of accessors for all internal state shards (both Active and Draining). - // Consumers MUST check `RegistryShard.IsActive()` before routing new work to a shard from this slice. - Shards() []RegistryShard + // ActiveShards returns a stable snapshot of accessors for all Active internal state shards. + ActiveShards() []RegistryShard } // RegistryShard defines the interface for a single slice (shard) of the `FlowRegistry`'s state. @@ -139,7 +137,7 @@ type RegistryShard interface { // `controller.FlowController` worker's dispatch loop. AllOrderedPriorityLevels() []int - // Stats returns a snapshot of the statistics for this specific shard. + // Stats returns a near consistent snapshot of the shard's state. Stats() ShardStats } @@ -162,6 +160,7 @@ type ManagedQueue interface { } // AggregateStats holds globally aggregated statistics for the entire `FlowRegistry`. +// It is a read-only data object representing a near-consistent snapshot of the registry's state. type AggregateStats struct { // TotalCapacityBytes is the globally configured maximum total byte size limit across all priority bands and shards. TotalCapacityBytes uint64 @@ -173,8 +172,15 @@ type AggregateStats struct { PerPriorityBandStats map[int]PriorityBandStats } -// ShardStats holds statistics for a single internal shard within the `FlowRegistry`. +// ShardStats holds statistics and identifying information for a `RegistryShard` within the `FlowRegistry`. +// It is a read-only data object representing a near-consistent snapshot of the shard's state. type ShardStats struct { + // ID is the unique, stable identifier for this shard. + ID string + // IsActive indicates if the shard was accepting new work at the time this stats snapshot was generated. + // A value of `false` means the shard is in the process of being gracefully drained. + // Due to the concurrent nature of the system, this state could change immediately after the snapshot is taken. + IsActive bool // TotalCapacityBytes is the optional, maximum total byte size limit aggregated across all priority bands within this // shard. Its value represents the globally configured limit for the `FlowRegistry` partitioned for this shard. // The `controller.FlowController` enforces this limit in addition to any per-band capacity limits. @@ -192,6 +198,7 @@ type ShardStats struct { } // PriorityBandStats holds aggregated statistics for a single priority band. +// It is a read-only data object representing a near-consistent snapshot of the priority band's state. type PriorityBandStats struct { // Priority is the numerical priority level this struct describes. 
Priority int diff --git a/pkg/epp/flowcontrol/registry/connection.go b/pkg/epp/flowcontrol/registry/connection.go index 995f23c13..cb9831655 100644 --- a/pkg/epp/flowcontrol/registry/connection.go +++ b/pkg/epp/flowcontrol/registry/connection.go @@ -31,13 +31,13 @@ type connection struct { var _ contracts.ActiveFlowConnection = &connection{} // Shards returns a stable snapshot of accessors for all internal state shards. -func (c *connection) Shards() []contracts.RegistryShard { +func (c *connection) ActiveShards() []contracts.RegistryShard { c.registry.mu.RLock() defer c.registry.mu.RUnlock() // Return a copy to ensure the caller cannot modify the registry's internal slice. - shardsCopy := make([]contracts.RegistryShard, len(c.registry.allShards)) - for i, s := range c.registry.allShards { + shardsCopy := make([]contracts.RegistryShard, len(c.registry.activeShards)) + for i, s := range c.registry.activeShards { shardsCopy[i] = s } return shardsCopy diff --git a/pkg/epp/flowcontrol/registry/registry.go b/pkg/epp/flowcontrol/registry/registry.go index 12f83e2ad..5f295541d 100644 --- a/pkg/epp/flowcontrol/registry/registry.go +++ b/pkg/epp/flowcontrol/registry/registry.go @@ -198,7 +198,7 @@ func (fr *FlowRegistry) Run(ctx context.Context) { } } -// --- `contracts.FlowRegistryClient` Implementation --- +// --- `contracts.FlowRegistryDataPlane` Implementation --- // Connect establishes a session for a given flow, acquiring a lifecycle lease. // This is the primary entry point for the data path. @@ -275,7 +275,7 @@ func (fr *FlowRegistry) prepareNewFlow(key types.FlowKey) (*flowState, error) { return &flowState{key: key}, nil } -// --- `contracts.FlowRegistryAdmin` Implementation --- +// --- `contracts.FlowRegistryObserver` Implementation --- // Stats returns globally aggregated statistics for the entire `FlowRegistry`. // diff --git a/pkg/epp/flowcontrol/registry/registry_test.go b/pkg/epp/flowcontrol/registry/registry_test.go index 0e1bee194..b63b151c2 100644 --- a/pkg/epp/flowcontrol/registry/registry_test.go +++ b/pkg/epp/flowcontrol/registry/registry_test.go @@ -261,7 +261,7 @@ func TestFlowRegistry_WithConnection_AndHandle(t *testing.T) { assert.ErrorContains(t, err, "injected factory failure", "The returned error must propagate the reason") }) - t.Run("Handle_Shards_ShouldReturnAllShardsAndBeACopy", func(t *testing.T) { + t.Run("Handle_Shards_ShouldReturnAllActiveShardsAndBeACopy", func(t *testing.T) { t.Parallel() // Create a registry with a known mixed topology of Active and Draining shards. h := newRegistryTestHarness(t, harnessOptions{initialShardCount: 3}) @@ -272,9 +272,9 @@ func TestFlowRegistry_WithConnection_AndHandle(t *testing.T) { key := types.FlowKey{ID: "test-flow", Priority: highPriority} err = h.fr.WithConnection(key, func(conn contracts.ActiveFlowConnection) error { - shards := conn.Shards() + shards := conn.ActiveShards() - assert.Len(t, shards, 3, "Shards() must return all configured shards, including Draining ones") + assert.Len(t, shards, 2, "ActiveShards() must only return the Active shards") // Assert it's a copy by maliciously modifying it. require.NotEmpty(t, shards, "Test setup assumes shards are present") @@ -285,8 +285,8 @@ func TestFlowRegistry_WithConnection_AndHandle(t *testing.T) { require.NoError(t, err) // Prove the registry's internal state was not mutated by the modification. 
- assert.NotNil(t, h.fr.allShards[0], - "Modifying the slice returned by Shards() must not affect the registry's internal state") + assert.NotNil(t, h.fr.activeShards[0], + "Modifying the slice returned by ActiveShards() must not affect the registry's internal state") }) } @@ -323,6 +323,9 @@ func TestFlowRegistry_Stats(t *testing.T) { require.Len(t, shardStats, 2, "Should return stats for 2 shards") var totalShardLen, totalShardBytes uint64 for _, ss := range shardStats { + assert.True(t, ss.IsActive, "All shards should be active in this test") + assert.NotEmpty(t, ss.PerPriorityBandStats, "Each shard should have stats for its priority bands") + assert.NotEmpty(t, ss.ID, "Each shard should have a non-empty ID") totalShardLen += ss.TotalLen totalShardBytes += ss.TotalByteSize } @@ -600,24 +603,19 @@ func TestFlowRegistry_UpdateShardCount(t *testing.T) { require.NoError(t, err, "UpdateShardCount should not have returned an error") } - var finalActiveCount, finalDrainingCount int globalCapacities := make(map[uint64]int) bandCapacities := make(map[uint64]int) - err = h.fr.WithConnection(key, func(conn contracts.ActiveFlowConnection) error { - for _, shard := range conn.Shards() { - if shard.IsActive() { - finalActiveCount++ - stats := shard.Stats() - globalCapacities[stats.TotalCapacityBytes]++ - bandCapacities[stats.PerPriorityBandStats[highPriority].CapacityBytes]++ - h.assertFlowExists(key, "Shard %s should contain the existing flow", shard.ID()) - } else { - finalDrainingCount++ - } - } - return nil - }) - require.NoError(t, err, "WithConnection should not fail") + + h.fr.mu.RLock() + finalActiveCount := len(h.fr.activeShards) + finalDrainingCount := len(h.fr.drainingShards) + for _, shard := range h.fr.activeShards { + stats := shard.Stats() + globalCapacities[stats.TotalCapacityBytes]++ + bandCapacities[stats.PerPriorityBandStats[highPriority].CapacityBytes]++ + h.assertFlowExists(key, "Shard %s should contain the existing flow", shard.ID()) + } + h.fr.mu.RUnlock() expectedDrainingCount := 0 if tc.initialShardCount > tc.expectedActiveCount { diff --git a/pkg/epp/flowcontrol/registry/shard.go b/pkg/epp/flowcontrol/registry/shard.go index c87a8e485..4a7918e79 100644 --- a/pkg/epp/flowcontrol/registry/shard.go +++ b/pkg/epp/flowcontrol/registry/shard.go @@ -228,6 +228,8 @@ func (s *registryShard) Stats() contracts.ShardStats { // Casts from `int64` to `uint64` are safe because the non-negative invariant is strictly enforced at the // `managedQueue` level. 
stats := contracts.ShardStats{ + ID: s.id, + IsActive: s.IsActive(), TotalCapacityBytes: s.config.MaxBytes, TotalByteSize: uint64(s.totalByteSize.Load()), TotalLen: uint64(s.totalLen.Load()), diff --git a/pkg/epp/flowcontrol/registry/shard_test.go b/pkg/epp/flowcontrol/registry/shard_test.go index 7fbda572e..afb2fac34 100644 --- a/pkg/epp/flowcontrol/registry/shard_test.go +++ b/pkg/epp/flowcontrol/registry/shard_test.go @@ -165,6 +165,8 @@ func TestShard_Stats(t *testing.T) { stats := h.shard.Stats() + assert.Equal(t, h.shard.ID(), stats.ID, "Stats ID must match the shard ID") + assert.True(t, stats.IsActive, "Shard must report itself as active in the stats snapshot") assert.Equal(t, uint64(2), stats.TotalLen, "Total shard length must aggregate counts from all bands") assert.Equal(t, uint64(150), stats.TotalByteSize, "Total shard byte size must aggregate sizes from all bands") diff --git a/pkg/epp/flowcontrol/types/errors.go b/pkg/epp/flowcontrol/types/errors.go index f7dffbd4d..5d2c25516 100644 --- a/pkg/epp/flowcontrol/types/errors.go +++ b/pkg/epp/flowcontrol/types/errors.go @@ -68,10 +68,10 @@ var ( // --- General `controller.FlowController` Errors --- var ( - // ErrFlowControllerShutdown indicates that an operation could not complete or an item was evicted because the - // `controller.FlowController` is shutting down or has stopped. + // ErrFlowControllerNotRunning indicates that an operation could not complete or an item was evicted because the + // `controller.FlowController` is not running or is in the process of shutting down. // // When returned by `FlowController.EnqueueAndWait()`, this will be wrapped by `ErrRejected` (if rejection happens // before internal queuing) or `ErrEvicted` (if eviction happens after internal queuing). - ErrFlowControllerShutdown = errors.New("FlowController is shutting down") + ErrFlowControllerNotRunning = errors.New("flow controller is not running") ) From 44efb4501f86d3ae55b2785b2ca581aa3ee2bb93 Mon Sep 17 00:00:00 2001 From: Luke Van Drie Date: Thu, 4 Sep 2025 01:02:48 +0000 Subject: [PATCH 2/5] refactor: Adapt ShardProcessor to a worker role This commit refactors the `ShardProcessor` to function as a stateful worker managed by a higher-level supervisor. This is a preparatory step for the introduction of the new top-level `FlowController`. The public API of the processor is changed from a direct `Enqueue` method to a more sophisticated, channel-based submission model with `Submit` (non-blocking) and `SubmitOrBlock` (blocking). This decouples the producer from the processor's main loop, enabling better backpressure signals and higher throughput. Key changes include: - Introduction of `Submit` and `SubmitOrBlock` for asynchronous request handoff. - `FlowItem`'s finalization logic is improved to be more robust and channel-based. - Error handling within the dispatch cycle is refactored (no logic change) to be more clear about how it promotes work conservation by isolating failures to a single priority band. 
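
For illustration, the intended caller-side handoff looks roughly like the
sketch below. The helper name, the `processors` slice, and the surrounding
distributor wiring are assumptions for this example only; the real
distributor arrives with the top-level `FlowController` in a later change
(assuming the usual `context`, `errors`, and flow control package imports
from within the `controller` package):

    // submitAndWait shows the intended handoff pattern:
    //  1. Fast failover: try a non-blocking Submit on each candidate worker.
    //  2. Backpressure: if every worker is busy, block on one via SubmitOrBlock.
    //  3. Wait for the item's terminal outcome on its Done() channel.
    func submitAndWait(
        ctx context.Context,
        processors []*internal.ShardProcessor, // assumed non-empty
        item *internal.FlowItem,
    ) (types.QueueOutcome, error) {
        submitted := false
        for _, p := range processors {
            err := p.Submit(item)
            if err == nil {
                submitted = true // accepted, or already finalized by a shutting-down processor
                break
            }
            if !errors.Is(err, internal.ErrProcessorBusy) {
                return types.QueueOutcomeRejectedOther, err
            }
            // ErrProcessorBusy: fail over to the next candidate worker.
        }
        if !submitted {
            // Every worker was momentarily full; apply backpressure by blocking on one.
            if err := processors[0].SubmitOrBlock(ctx, item); err != nil {
                return types.QueueOutcomeRejectedOther, err
            }
        }
        // The processor now owns the item's lifecycle; wait for its terminal outcome.
        finalState := <-item.Done()
        return finalState.Outcome, finalState.Err
    }

Note that `Submit` returning nil can also mean the item was immediately
finalized because the processor is shutting down; callers should therefore
always wait on the item's `Done()` channel for the authoritative outcome.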
--- .../flowcontrol/controller/internal/doc.go | 37 +- .../flowcontrol/controller/internal/item.go | 154 +++----- .../controller/internal/item_test.go | 61 ++- .../controller/internal/processor.go | 291 +++++++------- .../controller/internal/processor_test.go | 361 +++++++++--------- 5 files changed, 418 insertions(+), 486 deletions(-) diff --git a/pkg/epp/flowcontrol/controller/internal/doc.go b/pkg/epp/flowcontrol/controller/internal/doc.go index 3f39b5791..083654682 100644 --- a/pkg/epp/flowcontrol/controller/internal/doc.go +++ b/pkg/epp/flowcontrol/controller/internal/doc.go @@ -14,34 +14,19 @@ See the License for the specific language governing permissions and limitations under the License. */ -// Package internal provides the core worker implementation for the `controller.FlowController`. +// Package internal provides the core worker implementation for the controller.FlowController. // -// The components in this package are the private, internal building blocks of the `controller` package. This separation -// enforces a clean public API at the `controller` level and allows the internal mechanics of the engine to evolve -// independently. +// The components in this package are the private, internal building blocks of the controller. This separation enforces +// a clean public API at the `controller` level and allows the internal mechanics of the engine to evolve independently. // -// # Design Philosophy: A Single-Writer Actor Model +// # Design Philosophy: The Single-Writer Actor Model // -// The concurrency model for this package is deliberately built around a single-writer, channel-based actor pattern, as -// implemented in the `ShardProcessor`. While a simple lock-based approach might seem easier, it is insufficient for the -// system's requirements. The "enqueue" operation is a complex, stateful transaction that requires a **hierarchical -// capacity check** against both the overall shard and a request's specific priority band. +// The concurrency model for this package is built around a single-writer, channel-based actor pattern, as implemented +// in the `ShardProcessor`. All state-mutating operations for a given shard (primarily enqueuing new requests) are +// funneled through a single "Run" goroutine. // -// A coarse, shard-wide lock would be required to make this transaction atomic, creating a major performance bottleneck -// and causing head-of-line blocking at the top-level `controller.FlowController`. The single-writer model, where all -// state mutations are funneled through a single goroutine, makes this transaction atomic *without locks*. -// -// This design provides two critical benefits: -// 1. **Decoupling:** The `controller.FlowController` is decoupled via a non-blocking channel send, allowing for much -// higher throughput. -// 2. **Backpressure:** The state of the channel buffer serves as a high-fidelity, real-time backpressure signal, -// enabling more intelligent load balancing. -// -// # Future-Proofing for Complex Transactions -// -// This model's true power is that it provides a robust foundation for future features like **displacement** (a -// high-priority item evicting lower-priority ones). This is an "all-or-nothing" atomic transaction that is -// exceptionally difficult to implement correctly in a lock-free or coarse-grained locking model without significant -// performance penalties. 
The single-writer model contains the performance cost of such a potentially long transaction -// to the single `ShardProcessor`, preventing it from blocking the entire `controller.FlowController`. +// This design makes complex, multi-step transactions (like a hierarchical capacity check against both a shard's total +// limit and a priority band's limit) inherently atomic without locks. This avoids the performance bottleneck of a +// coarse, shard-wide lock and allows the top-level `controller.FlowController` to remain decoupled and highly +// concurrent. package internal diff --git a/pkg/epp/flowcontrol/controller/internal/item.go b/pkg/epp/flowcontrol/controller/internal/item.go index 86aeb8a0c..d5bdaaf2e 100644 --- a/pkg/epp/flowcontrol/controller/internal/item.go +++ b/pkg/epp/flowcontrol/controller/internal/item.go @@ -18,140 +18,96 @@ package internal import ( "sync" - "sync/atomic" "time" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types" ) -// flowItem is the internal representation of a request managed by the `FlowController`. It implements the -// `types.QueueItemAccessor` interface, which is the primary view of the item used by queue and policy implementations. -// It wraps the original `types.FlowControlRequest` and adds metadata for queuing, lifecycle management, and policy -// interaction. -// -// # Concurrency -// -// The `finalize` method is the primary point of concurrency concern. It is designed to be atomic and idempotent through -// the use of `sync.Once`. This guarantees that an item's final outcome can be set exactly once, even if multiple -// goroutines (e.g., the main dispatch loop and the expiry cleanup loop) race to finalize it. All other fields are set -// at creation time and are not modified thereafter, making them safe for concurrent access. -type flowItem struct { - // enqueueTime is the timestamp when the item was logically accepted by the `FlowController`. - enqueueTime time.Time - // effectiveTTL is the actual time-to-live assigned to this item. - effectiveTTL time.Duration - // originalRequest is the underlying request object. +// FinalState encapsulates the terminal outcome of a `FlowItem`'s lifecycle. +// It is sent over the item's `Done()` channel exactly once. +type FinalState struct { + Outcome types.QueueOutcome + Err error +} + +// FlowItem is the internal representation of a request managed by the `FlowController`. +type FlowItem struct { + // --- Immutable fields (set at creation) --- + + enqueueTime time.Time + effectiveTTL time.Duration originalRequest types.FlowControlRequest - // handle is the unique identifier for this item within a specific queue instance. - handle types.QueueItemHandle - - // done is closed exactly once when the item is finalized (dispatched or evicted/rejected). - done chan struct{} - // err stores the final error state if the item was not successfully dispatched. - // It is written to exactly once, protected by `onceFinalize`. - err atomic.Value // Stores error - // outcome stores the final `types.QueueOutcome` of the item's lifecycle. - // It is written to exactly once, protected by `onceFinalize`. - outcome atomic.Value // Stores `types.QueueOutcome` + handle types.QueueItemHandle + + // --- Finalization state (protected by onceFinalize) --- + + // done is closed exactly once when the item is finalized. 
+ // The closing of this channel establishes a "happens-before" memory barrier, guaranteeing that writes to `outcome` + // and `err` are visible to any goroutine that has successfully read from `done`. + done chan FinalState + + // finalState is safely visible to any goroutine after it has confirmed the channel is closed. + finalState FinalState + // onceFinalize ensures the `finalize()` logic is idempotent. onceFinalize sync.Once } -// ensure flowItem implements the interface. -var _ types.QueueItemAccessor = &flowItem{} +// ensure FlowItem implements the interface. +var _ types.QueueItemAccessor = &FlowItem{} -// NewItem creates a new `flowItem`, which is the internal representation of a request inside the `FlowController`. -// This constructor is exported so that the parent `controller` package can create items to be passed into the -// `internal` package's processors. It initializes the item with a "NotYetFinalized" outcome and an open `done` channel. -func NewItem(req types.FlowControlRequest, effectiveTTL time.Duration, enqueueTime time.Time) *flowItem { - fi := &flowItem{ +// NewItem creates a new `FlowItem`. +func NewItem(req types.FlowControlRequest, effectiveTTL time.Duration, enqueueTime time.Time) *FlowItem { + return &FlowItem{ enqueueTime: enqueueTime, effectiveTTL: effectiveTTL, originalRequest: req, - done: make(chan struct{}), + // Buffer to size one, preventing finalizing goroutine (e.g., the dispatcher) from blocking if the waiting + // goroutine has already timed out and is no longer reading. + done: make(chan FinalState, 1), } - // Initialize the outcome to its zero state. - fi.outcome.Store(types.QueueOutcomeNotYetFinalized) - return fi } // EnqueueTime returns the time the item was logically accepted by the `FlowController` for queuing. This is used as the // basis for TTL calculations. -func (fi *flowItem) EnqueueTime() time.Time { return fi.enqueueTime } +func (fi *FlowItem) EnqueueTime() time.Time { return fi.enqueueTime } // EffectiveTTL returns the actual time-to-live assigned to this item by the `FlowController`. -func (fi *flowItem) EffectiveTTL() time.Duration { return fi.effectiveTTL } +func (fi *FlowItem) EffectiveTTL() time.Duration { return fi.effectiveTTL } // OriginalRequest returns the original, underlying `types.FlowControlRequest` object. -func (fi *flowItem) OriginalRequest() types.FlowControlRequest { return fi.originalRequest } +func (fi *FlowItem) OriginalRequest() types.FlowControlRequest { return fi.originalRequest } // Handle returns the `types.QueueItemHandle` that uniquely identifies this item within a specific queue instance. It // returns nil if the item has not yet been added to a queue. -func (fi *flowItem) Handle() types.QueueItemHandle { return fi.handle } +func (fi *FlowItem) Handle() types.QueueItemHandle { return fi.handle } // SetHandle associates a `types.QueueItemHandle` with this item. This method is called by a `framework.SafeQueue` // implementation immediately after the item is added to the queue. -func (fi *flowItem) SetHandle(handle types.QueueItemHandle) { fi.handle = handle } - -// Done returns a channel that is closed when the item has been finalized (e.g., dispatched or evicted). -// This is the primary mechanism for consumers to wait for an item's outcome. It is designed to be used in a `select` -// statement, allowing the caller to simultaneously wait for other events, such as context cancellation. -// -// # Example Usage -// -// select { -// case <-item.Done(): -// outcome, err := item.FinalState() -// // ... 
handle outcome -// case <-ctx.Done(): -// // ... handle cancellation -// } -func (fi *flowItem) Done() <-chan struct{} { - return fi.done -} - -// FinalState returns the terminal outcome and error for the item. -// -// CRITICAL: This method must only be called after the channel returned by `Done()` has been closed. Calling it before -// the item is finalized may result in a race condition where the final state has not yet been written. -func (fi *flowItem) FinalState() (types.QueueOutcome, error) { - outcomeVal := fi.outcome.Load() - errVal := fi.err.Load() - - var finalOutcome types.QueueOutcome - if oc, ok := outcomeVal.(types.QueueOutcome); ok { - finalOutcome = oc - } else { - // This case should not happen if finalize is always called correctly, but we default to a safe value. - finalOutcome = types.QueueOutcomeNotYetFinalized - } +func (fi *FlowItem) SetHandle(handle types.QueueItemHandle) { fi.handle = handle } - var finalErr error - if e, ok := errVal.(error); ok { - finalErr = e - } - return finalOutcome, finalErr +// Done returns a channel that is closed when the item has been finalized (e.g., dispatched, rejected, or evicted). +func (fi *FlowItem) Done() <-chan FinalState { + return fi.done } -// finalize sets the item's terminal state (`outcome`, `error`) and closes its `done` channel idempotently using -// `sync.Once`. This is the single, internal point where an item's lifecycle within the `FlowController` concludes. -func (fi *flowItem) finalize(outcome types.QueueOutcome, err error) { +// Finalize sets the item's terminal state and signals the waiting goroutine by closing its `done` channel idempotently. +// This method is idempotent and is the single point where an item's lifecycle concludes. +// It is intended to be called only by the component that owns the item's lifecycle, such as a `ShardProcessor`. +func (fi *FlowItem) Finalize(outcome types.QueueOutcome, err error) { fi.onceFinalize.Do(func() { - if err != nil { - fi.err.Store(err) - } - fi.outcome.Store(outcome) + finalState := FinalState{Outcome: outcome, Err: err} + fi.finalState = finalState + fi.done <- finalState close(fi.done) }) } -// isFinalized checks if the item has been finalized without blocking. It is used internally by the `ShardProcessor` as -// a defensive check to avoid operating on items that have already been completed. -func (fi *flowItem) isFinalized() bool { - select { - case <-fi.done: - return true - default: - return false - } +// isFinalized checks if the item has been finalized without blocking or consuming the final state. +// It is a side-effect-free check used by the `ShardProcessor` as a defensive measure to avoid operating on +// already-completed items. +func (fi *FlowItem) isFinalized() bool { + // A buffered channel of size 1 can be safely and non-blockingly checked by its length. + // If the finalize function has run, it will have sent a value, and the length will be 1. 
+ return len(fi.done) > 0 } diff --git a/pkg/epp/flowcontrol/controller/internal/item_test.go b/pkg/epp/flowcontrol/controller/internal/item_test.go index d50aaed41..1f713e913 100644 --- a/pkg/epp/flowcontrol/controller/internal/item_test.go +++ b/pkg/epp/flowcontrol/controller/internal/item_test.go @@ -18,6 +18,7 @@ package internal import ( "context" + "errors" "testing" "time" @@ -28,25 +29,47 @@ import ( typesmocks "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types/mocks" ) -func TestItem(t *testing.T) { +func TestFlowItem_New(t *testing.T) { t.Parallel() + req := typesmocks.NewMockFlowControlRequest(100, "req-1", types.FlowKey{}, context.Background()) - t.Run("should correctly set and get handle", func(t *testing.T) { - t.Parallel() - item := &flowItem{} - handle := &typesmocks.MockQueueItemHandle{} - item.SetHandle(handle) - assert.Same(t, handle, item.Handle(), "Handle() should retrieve the same handle instance set by SetHandle()") - }) - - t.Run("should have a non-finalized state upon creation", func(t *testing.T) { - t.Parallel() - key := types.FlowKey{ID: "flow-a", Priority: 10} - req := typesmocks.NewMockFlowControlRequest(100, "req-1", key, context.Background()) - item := NewItem(req, time.Minute, time.Now()) - require.NotNil(t, item, "NewItem should not return nil") - outcome, err := item.FinalState() - assert.Equal(t, types.QueueOutcomeNotYetFinalized, outcome, "A new item's outcome should be NotYetFinalized") - assert.NoError(t, err, "A new item should have a nil error") - }) + item := NewItem(req, time.Minute, time.Now()) + + require.NotNil(t, item, "NewItem should not return a nil item") + assert.False(t, item.isFinalized(), "a new item must not be in a finalized state") + select { + case <-item.Done(): + t.Fatal("Done() channel for a new item must block, but it was closed") + default: + // This is the expected path, as the channel would have blocked. 
+ } +} + +func TestFlowItem_Handle(t *testing.T) { + t.Parallel() + item := &FlowItem{} + handle := &typesmocks.MockQueueItemHandle{} + item.SetHandle(handle) + assert.Same(t, handle, item.Handle(), "Handle() must retrieve the identical handle instance set by SetHandle()") +} + +func TestFlowItem_Finalize_Idempotency(t *testing.T) { + t.Parallel() + req := typesmocks.NewMockFlowControlRequest(100, "req-1", types.FlowKey{}, context.Background()) + item := NewItem(req, time.Minute, time.Now()) + expectedErr := errors.New("first-error") + + item.Finalize(types.QueueOutcomeEvictedTTL, expectedErr) + item.Finalize(types.QueueOutcomeDispatched, nil) // Should take no effect + + assert.True(t, item.isFinalized(), "item must be in a finalized state after a call to finalize()") + select { + case finalState, ok := <-item.Done(): + require.True(t, ok, "Done() channel should be readable with a value, not just closed") + assert.Equal(t, types.QueueOutcomeEvictedTTL, finalState.Outcome, + "the outcome from Done() must match the first finalized outcome") + assert.Equal(t, expectedErr, finalState.Err, "the error from Done() must match the first finalized error") + case <-time.After(50 * time.Millisecond): + t.Fatal("Done() channel must not block after finalization") + } } diff --git a/pkg/epp/flowcontrol/controller/internal/processor.go b/pkg/epp/flowcontrol/controller/internal/processor.go index abcd9f2bc..49a6dd675 100644 --- a/pkg/epp/flowcontrol/controller/internal/processor.go +++ b/pkg/epp/flowcontrol/controller/internal/processor.go @@ -26,6 +26,7 @@ import ( "time" "github.com/go-logr/logr" + "k8s.io/utils/clock" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/contracts" @@ -34,70 +35,43 @@ import ( logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) -const ( - // enqueueChannelBufferSize sets the size of the buffered channel that accepts incoming requests for the shard - // processor. This buffer acts as a "shock absorber," decoupling the upstream distributor from the processor's main - // loop and allowing the system to handle short, intense bursts of traffic without blocking the distributor. - enqueueChannelBufferSize = 100 +// maxCleanupWorkers caps the number of concurrent workers for background cleanup tasks. This prevents a single shard +// from overwhelming the Go scheduler with too many goroutines. +const maxCleanupWorkers = 4 - // maxCleanupWorkers caps the number of concurrent workers for background cleanup tasks. This prevents a single shard - // from overwhelming the Go scheduler with too many goroutines. - maxCleanupWorkers = 4 -) - -var ( - // errInterFlow is a sentinel error for failures during the inter-flow dispatch phase (e.g., a - // `framework.InterFlowDispatchPolicy` fails to select a queue). - // - // Strategy: When this error is encountered, the dispatch cycle aborts processing for the current priority band and - // immediately moves to the next, promoting work conservation. A failure in one band should not halt progress in - // others. - errInterFlow = errors.New("inter-flow policy failure") - - // errIntraFlow is a sentinel error for failures *after* a specific flow's queue has been selected (e.g., a - // `framework.IntraFlowDispatchPolicy` fails or a queue `Remove` fails). - // - // Strategy: When this error is encountered, the dispatch cycle aborts processing for the entire priority band for the - // current cycle. This acts as a critical circuit breaker. 
A stateless inter-flow policy could otherwise repeatedly - // select the same problematic queue in a tight loop of failures. Halting the band for one cycle prevents this. - errIntraFlow = errors.New("intra-flow operation failure") -) - -// clock defines an interface for getting the current time, allowing for dependency injection in tests. -type clock interface { - Now() time.Time -} +// ErrProcessorBusy is a sentinel error returned by a the processor's `Submit` method. +// It indicates that the processor's internal buffer is momentarily full and cannot accept new work. +// This is used as a signal for the `controller.FlowController`'s "fast failover" logic. +var ErrProcessorBusy = errors.New("shard processor is busy") -// ShardProcessor is the core worker of the `controller.FlowController`. It is paired one-to-one with a -// `contracts.RegistryShard` instance and is responsible for all request lifecycle operations on that shard, including -// enqueueing, dispatching, and expiry cleanup. It acts as the "data plane" worker that executes against the -// concurrent-safe state provided by its shard. +// ShardProcessor is the core worker of the `controller.FlowController`. +// It is paired one-to-one with a `contracts.RegistryShard` instance and is responsible for all request lifecycle +// operations on that shard. It acts as the "data plane" worker that executes against the concurrent-safe state provided +// by its shard. // -// For a full rationale on the single-writer concurrency model, see the package-level documentation in `doc.go`. +// # Concurrency Model: The Single-Writer Actor // -// # Concurrency Guarantees and Race Conditions +// To ensure correctness and high performance, the processor uses a single-goroutine, actor-based model. The main `Run` +// loop is the sole "writer" for all state-mutating operations, particularly enqueueing. This makes complex transactions +// inherently atomic without coarse-grained locks. // -// This model provides two key guarantees: +// # Concurrency Guarantees // -// 1. **Safe Enqueueing**: The `Run` method's goroutine has exclusive ownership of all operations that *add* items to -// queues. This makes the "check-then-act" sequence in `enqueue` (calling `hasCapacity` then `managedQ.Add`) -// inherently atomic from a writer's perspective, preventing capacity breaches. While the background -// `runExpiryCleanup` goroutine can concurrently *remove* items, this is a benign race; a concurrent removal only -// creates more available capacity, ensuring the `hasCapacity` check remains valid. -// -// 2. **Idempotent Finalization**: The primary internal race is between the main `dispatchCycle` and the background -// `runExpiryCleanup` goroutine, which might try to finalize the same `flowItem` simultaneously. This race is -// resolved by the `flowItem.finalize` method, which uses `sync.Once` to guarantee that only one of these goroutines -// can set the item's final state. +// 1. Safe Enqueueing: The "check-then-act" sequence for capacity is safe because it is only ever performed by the +// single `Run` goroutine. +// 2. Idempotent Finalization: The primary internal race condition is between the main `dispatchCycle` and the +// background `runExpiryCleanup` goroutine, both of which might try to finalize an item. This is resolved by the +// `FlowItem.Finalize` method, which uses `sync.Once` to guarantee that only the first attempt to finalize an item +// succeeds. 
type ShardProcessor struct { shard contracts.RegistryShard dispatchFilter BandFilter - clock clock + clock clock.Clock expiryCleanupInterval time.Duration logger logr.Logger // enqueueChan is the entry point for new requests to be processed by this shard's `Run` loop. - enqueueChan chan *flowItem + enqueueChan chan *FlowItem // wg is used to wait for background tasks like expiry cleanup to complete on shutdown. wg sync.WaitGroup isShuttingDown atomic.Bool @@ -108,8 +82,9 @@ type ShardProcessor struct { func NewShardProcessor( shard contracts.RegistryShard, dispatchFilter BandFilter, - clock clock, + clock clock.Clock, expiryCleanupInterval time.Duration, + enqueueChannelBufferSize int, logger logr.Logger, ) *ShardProcessor { return &ShardProcessor{ @@ -120,22 +95,61 @@ func NewShardProcessor( logger: logger, // A buffered channel decouples the processor from the distributor, allowing for a fast, asynchronous handoff of new // requests. - enqueueChan: make(chan *flowItem, enqueueChannelBufferSize), + enqueueChan: make(chan *FlowItem, enqueueChannelBufferSize), } } -// Run is the main operational loop for the shard processor. It must be run as a goroutine. +// Submit attempts a non-blocking handoff of an item to the processor's internal channel for asynchronous processing. // -// # Loop Strategy: Interleaving Enqueue and Dispatch +// It returns nil if the item was accepted by the processor, or if the processor is shutting down (in which case the +// item is immediately finalized with a shutdown error). In both cases, a nil return means the item's lifecycle has been +// handled by this processor and the caller should not retry. +// It returns `ErrProcessorBusy` if the processor's channel is momentarily full, signaling that the caller should try +// another processor. +func (sp *ShardProcessor) Submit(item *FlowItem) error { + if sp.isShuttingDown.Load() { + item.Finalize(types.QueueOutcomeRejectedOther, + fmt.Errorf("%w: %w", types.ErrRejected, types.ErrFlowControllerNotRunning)) + return nil // Success from the caller's perspective; the item is terminal. + } + + select { + case sp.enqueueChan <- item: + return nil // Success + default: + // The channel buffer is full, signaling transient backpressure. + return ErrProcessorBusy + } +} + +// SubmitOrBlock performs a blocking submission of an item to the processor's internal channel. +// It will wait until either the submission succeeds or the provided context is cancelled. // -// The loop uses a `select` statement to interleave two primary tasks: -// 1. Accepting new requests from the `enqueueChan`. -// 2. Attempting to dispatch existing requests from queues via `dispatchCycle`. +// This method is the fallback used by the distributor when all processors are busy, providing graceful backpressure +// instead of immediate rejection. // -// This strategy is crucial for balancing responsiveness and throughput. When a new item arrives, it is immediately -// enqueued, and a dispatch cycle is triggered. This gives high-priority new arrivals a chance to be dispatched quickly. -// When no new items are arriving, the loop's `default` case continuously calls `dispatchCycle` to drain the existing -// backlog, ensuring work continues. +// It returns the `ctx.Err()` if the context is cancelled during the wait. +func (sp *ShardProcessor) SubmitOrBlock(ctx context.Context, item *FlowItem) error { + if sp.isShuttingDown.Load() { + // Here, we return an error because the caller, expecting to block, was prevented from doing so by the shutdown. 
+ // This is a failure of the operation. + item.Finalize(types.QueueOutcomeRejectedOther, + fmt.Errorf("%w: %w", types.ErrRejected, types.ErrFlowControllerNotRunning)) + return types.ErrFlowControllerNotRunning + } + + select { + case sp.enqueueChan <- item: + return nil // Success + case <-ctx.Done(): + // The caller's context was cancelled while we were blocked. + return ctx.Err() + } +} + +// Run is the main operational loop for the shard processor. It must be run as a goroutine. +// It uses a `select` statement to interleave accepting new requests with dispatching existing ones, balancing +// responsiveness with throughput. func (sp *ShardProcessor) Run(ctx context.Context) { sp.logger.V(logutil.DEFAULT).Info("Shard processor run loop starting.") defer sp.logger.V(logutil.DEFAULT).Info("Shard processor run loop stopped.") @@ -148,10 +162,8 @@ func (sp *ShardProcessor) Run(ctx context.Context) { // // 1. Context Cancellation: The highest priority is shutting down. If the context's `Done` channel is closed, the // loop will drain all queues and exit. This is the primary exit condition. - // // 2. New Item Arrival: If an item is available on `enqueueChan`, it will be processed. This ensures that the // processor is responsive to new work. - // // 3. Default (Dispatch): If neither of the above cases is ready, the `default` case executes, ensuring the loop is // non-blocking. It continuously attempts to dispatch items from the existing backlog, preventing starvation and // ensuring queues are drained. @@ -175,8 +187,9 @@ func (sp *ShardProcessor) Run(ctx context.Context) { sp.enqueue(item) sp.dispatchCycle(ctx) default: + // If no new items are arriving, continuously try to dispatch from the backlog. if !sp.dispatchCycle(ctx) { - // If no work was done, yield to other goroutines to prevent a tight, busy-loop when idle, but allow for + // If no work was done, yield to the scheduler to prevent a tight, busy-loop when idle, while still allowing for // immediate rescheduling. runtime.Gosched() } @@ -184,20 +197,10 @@ func (sp *ShardProcessor) Run(ctx context.Context) { } } -// Enqueue sends a new flow item to the processor's internal channel for asynchronous processing by its main `Run` loop. -// If the processor is shutting down, it immediately finalizes the item with a shutdown error. -func (sp *ShardProcessor) Enqueue(item *flowItem) { - if sp.isShuttingDown.Load() { - item.finalize(types.QueueOutcomeRejectedOther, - fmt.Errorf("%w: %w", types.ErrRejected, types.ErrFlowControllerShutdown)) - return - } - sp.enqueueChan <- item -} - -// enqueue is the internal implementation for adding a new item to a managed queue. It is always run from the single -// main `Run` goroutine, making its "check-then-act" logic for capacity safe. -func (sp *ShardProcessor) enqueue(item *flowItem) { +// enqueue is responsible for adding a new item to its designated queue. It is always run from the single main `Run` +// goroutine, which makes its multi-step "check-then-act" logic for capacity management inherently atomic and safe from +// race conditions. 
+func (sp *ShardProcessor) enqueue(item *FlowItem) { req := item.OriginalRequest() key := req.FlowKey() @@ -213,7 +216,7 @@ func (sp *ShardProcessor) enqueue(item *flowItem) { if err != nil { finalErr := fmt.Errorf("configuration error: failed to get queue for flow key %s: %w", key, err) logger.Error(finalErr, "Rejecting item.") - item.finalize(types.QueueOutcomeRejectedOther, fmt.Errorf("%w: %w", types.ErrRejected, finalErr)) + item.Finalize(types.QueueOutcomeRejectedOther, fmt.Errorf("%w: %w", types.ErrRejected, finalErr)) return } @@ -221,7 +224,7 @@ func (sp *ShardProcessor) enqueue(item *flowItem) { if err != nil { finalErr := fmt.Errorf("configuration error: failed to get priority band for priority %d: %w", key.Priority, err) logger.Error(finalErr, "Rejecting item.") - item.finalize(types.QueueOutcomeRejectedOther, fmt.Errorf("%w: %w", types.ErrRejected, finalErr)) + item.Finalize(types.QueueOutcomeRejectedOther, fmt.Errorf("%w: %w", types.ErrRejected, finalErr)) return } logger = logger.WithValues("priorityName", band.PriorityName()) @@ -237,7 +240,7 @@ func (sp *ShardProcessor) enqueue(item *flowItem) { "bandTotalBytes", bandStats.ByteSize, "bandCapacityBytes", bandStats.CapacityBytes, ) - item.finalize(types.QueueOutcomeRejectedCapacity, fmt.Errorf("%w: %w", types.ErrRejected, types.ErrQueueAtCapacity)) + item.Finalize(types.QueueOutcomeRejectedCapacity, fmt.Errorf("%w: %w", types.ErrRejected, types.ErrQueueAtCapacity)) return } @@ -250,7 +253,8 @@ func (sp *ShardProcessor) enqueue(item *flowItem) { // 2. The background `runExpiryCleanup` goroutine acts as the ultimate guarantor of correctness, as it will // eventually find and evict any finalized item that slips through this check and is added to a queue. if item.isFinalized() { - outcome, err := item.FinalState() + finalState := item.finalState + outcome, err := finalState.Outcome, finalState.Err logger.V(logutil.VERBOSE).Info("Item finalized before adding to queue, ignoring.", "outcome", outcome, "err", err) return } @@ -260,14 +264,14 @@ func (sp *ShardProcessor) enqueue(item *flowItem) { if err := managedQ.Add(item); err != nil { finalErr := fmt.Errorf("failed to add item to queue for flow key %s: %w", key, err) logger.Error(finalErr, "Rejecting item.") - item.finalize(types.QueueOutcomeRejectedOther, fmt.Errorf("%w: %w", types.ErrRejected, finalErr)) + item.Finalize(types.QueueOutcomeRejectedOther, fmt.Errorf("%w: %w", types.ErrRejected, finalErr)) return } logger.V(logutil.TRACE).Info("Item enqueued.") } // hasCapacity checks if the shard and the specific priority band have enough capacity to accommodate an item of a given -// size. +// size. This check is only safe because it is called from the single-writer `enqueue` method. func (sp *ShardProcessor) hasCapacity(priority int, itemByteSize uint64) bool { if itemByteSize == 0 { return true @@ -286,22 +290,19 @@ func (sp *ShardProcessor) hasCapacity(priority int, itemByteSize uint64) bool { // dispatchCycle attempts to dispatch a single item by iterating through all priority bands from highest to lowest. // It applies the configured policies for each band to select an item and then attempts to dispatch it. -// It returns true if an item was successfully dispatched, and false otherwise. -// -// # Error Handling Philosophy +// It returns true if an item was successfully dispatched, and false otherwise, indicating that no work was done in this +// cycle. // -// The engine employs a robust, two-tiered error handling strategy to isolate failures and maximize system availability. 
-// This is managed via the `errInterFlow` and `errIntraFlow` sentinel errors. +// # Error Handling Philosophy: Failure Isolation & Work Conservation // -// - Inter-Flow Failures: If a failure occurs while selecting a flow (e.g., the `InterFlowDispatchPolicy` fails), the -// processor aborts the *current priority band* and immediately moves to the next one. This promotes work -// conservation, ensuring a single misconfigured band does not halt progress for the entire system. +// The engine is designed to be resilient to failures in individual policies or transient errors within a specific flow. +// The core principle is failure isolation. A problem in one priority band must not be allowed to halt processing for +// other, healthy bands. // -// - Intra-Flow Failures: If a failure occurs *after* a flow has been selected (e.g., the `IntraFlowDispatchPolicy` -// fails), the processor aborts the *entire priority band* for the current cycle. This is a critical circuit -// breaker. An inter-flow policy that is not stateful with respect to past failures could otherwise repeatedly -// select the same problematic queue, causing a tight loop of failures. Halting the band for one cycle prevents -// this. +// To achieve this, any error encountered during the selection or dispatch process for a given priority band is treated +// as a non-critical failure for that band, in this cycle. The processor will log the error for observability and then +// immediately continue its attempt to find work in the next-lower priority band. This promotes work conservation and +// maximizes system throughput even in the presence of partial failures. func (sp *ShardProcessor) dispatchCycle(ctx context.Context) bool { baseLogger := sp.logger.WithName("dispatchCycle") @@ -329,12 +330,7 @@ func (sp *ShardProcessor) dispatchCycle(ctx context.Context) bool { // Pass the (potentially filtered) band to the policies. item, err := sp.selectItem(dispatchableBand, logger) if err != nil { - // The error handling strategy depends on the type of failure (inter- vs. intra-flow). - if errors.Is(err, errIntraFlow) { - logger.Error(err, "Intra-flow policy failure, skipping priority band for this cycle") - } else { - logger.Error(err, "Inter-flow policy or configuration failure, skipping priority band for this cycle") - } + logger.Error(err, "Failed to select item, skipping priority band for this cycle") continue } if item == nil { @@ -350,7 +346,6 @@ func (sp *ShardProcessor) dispatchCycle(ctx context.Context) bool { "reqByteSize", item.OriginalRequest().ByteSize()) if err := sp.dispatchItem(item, logger); err != nil { - // All errors from dispatchItem are considered intra-flow and unrecoverable for this band in this cycle. 
logger.Error(err, "Failed to dispatch item, skipping priority band for this cycle") continue } @@ -369,12 +364,11 @@ func (sp *ShardProcessor) selectItem( ) (types.QueueItemAccessor, error) { interP, err := sp.shard.InterFlowDispatchPolicy(band.Priority()) if err != nil { - return nil, fmt.Errorf("%w: could not get InterFlowDispatchPolicy: %w", errInterFlow, err) + return nil, fmt.Errorf("could not get InterFlowDispatchPolicy: %w", err) } queue, err := interP.SelectQueue(band) if err != nil { - return nil, fmt.Errorf("%w: InterFlowDispatchPolicy %q failed to select queue: %w", - errInterFlow, interP.Name(), err) + return nil, fmt.Errorf("InterFlowDispatchPolicy %q failed to select queue: %w", interP.Name(), err) } if queue == nil { logger.V(logutil.TRACE).Info("No queue selected by InterFlowDispatchPolicy") @@ -387,13 +381,11 @@ func (sp *ShardProcessor) selectItem( "selectedFlowPriority", key.Priority) intraP, err := sp.shard.IntraFlowDispatchPolicy(key) if err != nil { - // This is an intra-flow failure because we have already successfully selected a queue. - return nil, fmt.Errorf("%w: could not get IntraFlowDispatchPolicy for flow %q: %w", errIntraFlow, key, err) + return nil, fmt.Errorf("could not get IntraFlowDispatchPolicy for flow %s: %w", key, err) } item, err := intraP.SelectItem(queue) if err != nil { - return nil, fmt.Errorf("%w: IntraFlowDispatchPolicy %q failed to select item for flow %q: %w", - errIntraFlow, intraP.Name(), key, err) + return nil, fmt.Errorf("IntraFlowDispatchPolicy %q failed to select item for flow %s: %w", intraP.Name(), key, err) } if item == nil { logger.V(logutil.TRACE).Info("No item selected by IntraFlowDispatchPolicy") @@ -403,7 +395,7 @@ func (sp *ShardProcessor) selectItem( } // dispatchItem handles the final steps of dispatching an item after it has been selected by policies. This includes -// removing it from its queue, checking for last-minute expiry, and finalizing its outcome. +// removing it from its queue and finalizing its outcome. func (sp *ShardProcessor) dispatchItem(itemAcc types.QueueItemAccessor, logger logr.Logger) error { logger = logger.WithName("dispatchItem") @@ -411,7 +403,7 @@ func (sp *ShardProcessor) dispatchItem(itemAcc types.QueueItemAccessor, logger l // We must look up the queue by its specific priority, as a flow might have draining queues at other levels. managedQ, err := sp.shard.ManagedQueue(req.FlowKey()) if err != nil { - return fmt.Errorf("%w: failed to get ManagedQueue for flow %q: %w", errIntraFlow, req.FlowKey(), err) + return fmt.Errorf("failed to get ManagedQueue for flow %s: %w", req.FlowKey(), err) } // The core mutation: remove the item from the queue. @@ -420,21 +412,11 @@ func (sp *ShardProcessor) dispatchItem(itemAcc types.QueueItemAccessor, logger l // This can happen benignly if the item was already removed by the expiry cleanup loop between the time it was // selected by the policy and the time this function is called. logger.V(logutil.VERBOSE).Info("Item already removed from queue, likely by expiry cleanup", "err", err) - return fmt.Errorf("%w: failed to remove item %q from queue for flow %q: %w", - errIntraFlow, req.ID(), req.FlowKey(), err) - } - - removedItem, ok := removedItemAcc.(*flowItem) - if !ok { - // This indicates a severe logic error where a queue returns an item of an unexpected type. This violates a - // core system invariant: all items managed by the processor must be of type *flowItem. This is an unrecoverable - // state for this shard. 
- unexpectedItemErr := fmt.Errorf("%w: internal error: item %q of type %T is not a *flowItem", - errIntraFlow, removedItemAcc.OriginalRequest().ID(), removedItemAcc) - panic(unexpectedItemErr) + return fmt.Errorf("failed to remove item %q from queue for flow %s: %w", req.ID(), req.FlowKey(), err) } // Final check for expiry/cancellation right before dispatch. + removedItem := removedItemAcc.(*FlowItem) isExpired, outcome, expiryErr := checkItemExpiry(removedItem, sp.clock.Now()) if isExpired { // Ensure we always have a non-nil error to wrap for consistent logging and error handling. @@ -444,43 +426,34 @@ func (sp *ShardProcessor) dispatchItem(itemAcc types.QueueItemAccessor, logger l } logger.V(logutil.VERBOSE).Info("Item expired at time of dispatch, evicting", "outcome", outcome, "err", finalErr) - removedItem.finalize(outcome, fmt.Errorf("%w: %w", types.ErrEvicted, finalErr)) + removedItem.Finalize(outcome, fmt.Errorf("%w: %w", types.ErrEvicted, finalErr)) // Return an error to signal that the dispatch did not succeed. - return fmt.Errorf("%w: item %q expired before dispatch: %w", errIntraFlow, req.ID(), finalErr) + return fmt.Errorf("item %q expired before dispatch: %w", req.ID(), finalErr) } // Finalize the item as dispatched. - removedItem.finalize(types.QueueOutcomeDispatched, nil) + removedItem.Finalize(types.QueueOutcomeDispatched, nil) logger.V(logutil.TRACE).Info("Item dispatched.") return nil } -// checkItemExpiry checks if an item has been cancelled (via its context) or has exceeded its TTL. It returns true if -// the item is expired, along with the corresponding outcome and error. +// checkItemExpiry provides the authoritative check to determine if an item should be evicted due to TTL expiry or +// context cancellation. // -// This function provides "defense in depth" against race conditions. It is the authoritative check that is called from -// multiple locations (the dispatch loop and the cleanup loop) to determine if an item should be evicted. Its first -// action is to check if the item has *already* been finalized by a competing goroutine, ensuring that the final outcome -// is decided exactly once. +// It provides "defense in depth" against race conditions. Its first action is to check if the item has already been +// finalized by a competing goroutine (e.g., the cleanup loop finalizing an item the dispatch loop is trying to +// process). This ensures that the final outcome is decided exactly once. func checkItemExpiry( itemAcc types.QueueItemAccessor, now time.Time, ) (isExpired bool, outcome types.QueueOutcome, err error) { - item, ok := itemAcc.(*flowItem) - if !ok { - // This indicates a severe logic error where a queue returns an item of an unexpected type. This violates a - // core system invariant: all items managed by the processor must be of type *flowItem. This is an unrecoverable - // state for this shard. - unexpectedItemErr := fmt.Errorf("internal error: item %q of type %T is not a *flowItem", - itemAcc.OriginalRequest().ID(), itemAcc) - panic(unexpectedItemErr) - } + item := itemAcc.(*FlowItem) // This check is a critical defense against race conditions. If another goroutine (e.g., the cleanup loop) has // already finalized this item, we must respect that outcome. if item.isFinalized() { - outcome, err := item.FinalState() - return true, outcome, err + finalState := item.finalState + return true, finalState.Outcome, finalState.Err } // Check if the request's context has been cancelled. 
@@ -517,7 +490,7 @@ func (sp *ShardProcessor) runExpiryCleanup(ctx context.Context) { } // cleanupExpired performs a single scan of all queues on the shard, removing and finalizing any items that have -// expired due to TTL or context cancellation. +// expired. func (sp *ShardProcessor) cleanupExpired(now time.Time) { processFn := func(managedQ contracts.ManagedQueue, queueLogger logr.Logger) { // This predicate identifies items to be removed by the Cleanup call. @@ -537,9 +510,8 @@ func (sp *ShardProcessor) cleanupExpired(now time.Time) { sp.processAllQueuesConcurrently("cleanupExpired", processFn) } -// shutdown handles the graceful termination of the processor. It uses sync.Once to guarantee that the shutdown logic is -// executed exactly once, regardless of whether it's triggered by context cancellation or the closing of the enqueue -// channel. +// shutdown handles the graceful termination of the processor, ensuring any pending items in the enqueue channel or in +// the queues are finalized correctly. func (sp *ShardProcessor) shutdown() { sp.shutdownOnce.Do(func() { // Set the atomic bool so that any new calls to Enqueue will fail fast. @@ -555,8 +527,8 @@ func (sp *ShardProcessor) shutdown() { if item == nil { // This is a safeguard against logic errors in the distributor. continue } - item.finalize(types.QueueOutcomeRejectedOther, - fmt.Errorf("%w: %w", types.ErrRejected, types.ErrFlowControllerShutdown)) + item.Finalize(types.QueueOutcomeRejectedOther, + fmt.Errorf("%w: %w", types.ErrRejected, types.ErrFlowControllerNotRunning)) default: // The channel is empty, we can now safely close it. break DrainLoop @@ -569,8 +541,7 @@ func (sp *ShardProcessor) shutdown() { }) } -// evictAll drains all queues on the shard and finalizes every item with a shutdown error. This is called when the -// processor is shutting down to ensure no requests are left in a pending state. +// evictAll drains all queues on the shard and finalizes every item with a shutdown error. func (sp *ShardProcessor) evictAll() { processFn := func(managedQ contracts.ManagedQueue, queueLogger logr.Logger) { removedItems, err := managedQ.Drain() @@ -580,7 +551,7 @@ func (sp *ShardProcessor) evictAll() { // Finalize all the items that were removed. getOutcome := func(_ types.QueueItemAccessor) (types.QueueOutcome, error) { - return types.QueueOutcomeEvictedOther, fmt.Errorf("%w: %w", types.ErrEvicted, types.ErrFlowControllerShutdown) + return types.QueueOutcomeEvictedOther, fmt.Errorf("%w: %w", types.ErrEvicted, types.ErrFlowControllerNotRunning) } sp.finalizeItems(removedItems, queueLogger, getOutcome) } @@ -659,23 +630,23 @@ func (sp *ShardProcessor) finalizeItems( getOutcome func(item types.QueueItemAccessor) (types.QueueOutcome, error), ) { for _, i := range items { - item, ok := i.(*flowItem) + item, ok := i.(*FlowItem) if !ok { - unexpectedItemErr := fmt.Errorf("internal error: item %q of type %T is not a *flowItem", + unexpectedItemErr := fmt.Errorf("internal error: item %q of type %T is not a *FlowItem", i.OriginalRequest().ID(), i) logger.Error(unexpectedItemErr, "Panic condition detected during finalization", "item", i) continue } outcome, err := getOutcome(i) - item.finalize(outcome, err) + item.Finalize(outcome, err) logger.V(logutil.TRACE).Info("Item finalized", "reqID", item.OriginalRequest().ID(), "outcome", outcome, "err", err) } } -// finalizeExpiredItems is a specialized version of finalizeItems for items that are known to be expired. 
It determines -// the precise reason for expiry and finalizes the item accordingly. +// finalizeExpiredItems is a specialized version of finalizeItems for items that are known to be expired. +// It determines the precise reason for expiry and finalizes the item accordingly. func (sp *ShardProcessor) finalizeExpiredItems(items []types.QueueItemAccessor, now time.Time, logger logr.Logger) { getOutcome := func(item types.QueueItemAccessor) (types.QueueOutcome, error) { // We don't need the `isExpired` boolean here because we know it's true, but this function conveniently returns the diff --git a/pkg/epp/flowcontrol/controller/internal/processor_test.go b/pkg/epp/flowcontrol/controller/internal/processor_test.go index 81807e0e3..461cb3440 100644 --- a/pkg/epp/flowcontrol/controller/internal/processor_test.go +++ b/pkg/epp/flowcontrol/controller/internal/processor_test.go @@ -53,6 +53,7 @@ import ( "github.com/go-logr/logr" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + testclock "k8s.io/utils/clock/testing" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log/zap" @@ -79,28 +80,6 @@ func TestMain(m *testing.M) { os.Exit(m.Run()) } -// mockClock allows for controlling time in tests. -type mockClock struct { - mu sync.Mutex - currentTime time.Time -} - -func newMockClock() *mockClock { - return &mockClock{currentTime: time.Now()} -} - -func (c *mockClock) Now() time.Time { - c.mu.Lock() - defer c.mu.Unlock() - return c.currentTime -} - -func (c *mockClock) Advance(d time.Duration) { - c.mu.Lock() - defer c.mu.Unlock() - c.currentTime = c.currentTime.Add(d) -} - // testHarness provides a unified, mock-based testing environment for the ShardProcessor. It centralizes all mock state // and provides helper methods for setting up tests and managing the processor's lifecycle. type testHarness struct { @@ -115,7 +94,7 @@ type testHarness struct { // Core components under test processor *ShardProcessor - mockClock *mockClock + clock *testclock.FakeClock logger logr.Logger // --- Centralized Mock State --- @@ -135,7 +114,7 @@ func newTestHarness(t *testing.T, expiryCleanupInterval time.Duration) *testHarn h := &testHarness{ t: t, MockRegistryShard: &mocks.MockRegistryShard{}, - mockClock: newMockClock(), + clock: testclock.NewFakeClock(time.Now()), logger: logr.Discard(), startSignal: make(chan struct{}), queues: make(map[types.FlowKey]*mocks.MockManagedQueue), @@ -167,8 +146,11 @@ func newTestHarness(t *testing.T, expiryCleanupInterval time.Duration) *testHarn ) (framework.PriorityBandAccessor, bool) { return nil, false } - h.processor = NewShardProcessor(h, filter, h.mockClock, expiryCleanupInterval, h.logger) + h.processor = NewShardProcessor(h, filter, h.clock, expiryCleanupInterval, 100, h.logger) require.NotNil(t, h.processor, "NewShardProcessor should not return nil") + + t.Cleanup(func() { h.Stop() }) + return h } @@ -202,23 +184,23 @@ func (h *testHarness) Stop() { } // waitForFinalization blocks until an item is finalized or a timeout is reached. 
-func (h *testHarness) waitForFinalization(item *flowItem) (types.QueueOutcome, error) { +func (h *testHarness) waitForFinalization(item *FlowItem) (types.QueueOutcome, error) { h.t.Helper() select { - case <-item.Done(): - return item.FinalState() + case finalState := <-item.Done(): + return finalState.Outcome, finalState.Err case <-time.After(testWaitTimeout): h.t.Fatalf("Timed out waiting for item %q to be finalized", item.OriginalRequest().ID()) return types.QueueOutcomeNotYetFinalized, nil } } -// newTestItem creates a new flowItem for testing purposes. -func (h *testHarness) newTestItem(id string, key types.FlowKey, ttl time.Duration) *flowItem { +// newTestItem creates a new FlowItem for testing purposes. +func (h *testHarness) newTestItem(id string, key types.FlowKey, ttl time.Duration) *FlowItem { h.t.Helper() ctx := log.IntoContext(context.Background(), h.logger) req := typesmocks.NewMockFlowControlRequest(100, id, key, ctx) - return NewItem(req, ttl, h.mockClock.Now()) + return NewItem(req, ttl, h.clock.Now()) } // addQueue centrally registers a new mock queue for a given flow, ensuring all harness components are aware of it. @@ -226,13 +208,9 @@ func (h *testHarness) addQueue(key types.FlowKey) *mocks.MockManagedQueue { h.t.Helper() h.mu.Lock() defer h.mu.Unlock() - mockQueue := &mocks.MockManagedQueue{FlowKeyV: key} h.queues[key] = mockQueue - - // Add the key to the correct priority band, creating the band if needed. h.priorityFlows[key.Priority] = append(h.priorityFlows[key.Priority], key) - return mockQueue } @@ -334,9 +312,9 @@ func (h *testHarness) intraFlowDispatchPolicy(types.FlowKey) (framework.IntraFlo func TestShardProcessor(t *testing.T) { t.Parallel() - // Lifecycle tests use the processor's main `Run` loop to verify the complete end-to-end lifecycle of a request, from + // Integration tests use the processor's main `Run` loop to verify the complete end-to-end lifecycle of a request, from // `Enqueue` to its final outcome. - t.Run("Lifecycle", func(t *testing.T) { + t.Run("Integration", func(t *testing.T) { t.Parallel() t.Run("should dispatch item successfully", func(t *testing.T) { @@ -344,12 +322,11 @@ func TestShardProcessor(t *testing.T) { // --- ARRANGE --- h := newTestHarness(t, testCleanupTick) item := h.newTestItem("req-dispatch-success", testFlow, testTTL) - h.addQueue(types.FlowKey{ID: testFlow.ID, Priority: testFlow.Priority}) + h.addQueue(testFlow) // --- ACT --- h.Start() - defer h.Stop() - h.processor.Enqueue(item) + require.NoError(t, h.processor.Submit(item), "precondition: Submit should not fail") h.Go() // --- ASSERT --- @@ -372,8 +349,7 @@ func TestShardProcessor(t *testing.T) { // --- ACT --- h.Start() - defer h.Stop() - h.processor.Enqueue(item) + require.NoError(t, h.processor.Submit(item), "precondition: Submit should not fail") h.Go() // --- ASSERT --- @@ -396,7 +372,7 @@ func TestShardProcessor(t *testing.T) { // --- ACT --- h.Start() defer h.Stop() - h.processor.Enqueue(item) + require.NoError(t, h.processor.Submit(item), "precondition: Submit should not fail") h.Go() // --- ASSERT --- @@ -416,15 +392,14 @@ func TestShardProcessor(t *testing.T) { // --- ACT --- h.Start() h.Go() - // Stop the processor, then immediately try to enqueue. - h.Stop() - h.processor.Enqueue(item) + h.Stop() // Stop the processor, then immediately try to enqueue. 
+ require.NoError(t, h.processor.Submit(item), "precondition: Submit should not fail, even on shutdown") // --- ASSERT --- outcome, err := h.waitForFinalization(item) assert.Equal(t, types.QueueOutcomeRejectedOther, outcome, "The outcome should be RejectedOther") require.Error(t, err, "An eviction on shutdown should produce an error") - assert.ErrorIs(t, err, types.ErrFlowControllerShutdown, "The error should be of type ErrFlowControllerShutdown") + assert.ErrorIs(t, err, types.ErrFlowControllerNotRunning, "The error should be of type ErrFlowControllerNotRunning") }) t.Run("should evict item on TTL expiry via background cleanup", func(t *testing.T) { @@ -436,13 +411,12 @@ func TestShardProcessor(t *testing.T) { // --- ACT --- h.Start() - defer h.Stop() - h.processor.Enqueue(item) + require.NoError(t, h.processor.Submit(item), "precondition: Submit should not fail") h.Go() - // Let time pass for the item to expire and for the background cleanup to run. - h.mockClock.Advance(testShortTTL * 2) - time.Sleep(testCleanupTick * 3) // Allow the cleanup goroutine time to run. + h.clock.Step(testShortTTL * 2) // Let time pass for the item to expire. + // Manually invoke the cleanup logic to simulate a tick of the cleanup loop deterministically. + h.processor.cleanupExpired(h.clock.Now()) // --- ASSERT --- outcome, err := h.waitForFinalization(item) @@ -451,52 +425,22 @@ func TestShardProcessor(t *testing.T) { assert.ErrorIs(t, err, types.ErrTTLExpired, "The error should be of type ErrTTLExpired") }) - t.Run("should evict item at moment of dispatch if TTL has expired", func(t *testing.T) { - t.Parallel() - // --- ARRANGE --- - h := newTestHarness(t, 1*time.Hour) // Disable background cleanup to isolate dispatch logic. - item := h.newTestItem("req-expired-dispatch-evict", testFlow, testShortTTL) - mockQueue := h.addQueue(testFlow) - require.NoError(t, mockQueue.Add(item), "Adding item to mock queue should not fail") - - // Have the policy select the item, but then advance time so it's expired by the time dispatchItem actually runs. - h.interFlowPolicySelectQueue = func(band framework.PriorityBandAccessor) (framework.FlowQueueAccessor, error) { - h.mockClock.Advance(testShortTTL * 2) - return mockQueue.FlowQueueAccessor(), nil - } - - // --- ACT --- - h.Start() - defer h.Stop() - h.Go() - - // The run loop will pick up the item and attempt dispatch, which will fail internally. - // We need a small sleep to allow the non-blocking run loop to process. - time.Sleep(50 * time.Millisecond) - - // --- ASSERT --- - outcome, err := h.waitForFinalization(item) - assert.Equal(t, types.QueueOutcomeEvictedTTL, outcome, "The final outcome should be EvictedTTL") - require.Error(t, err, "An eviction at dispatch time should produce an error") - assert.ErrorIs(t, err, types.ErrTTLExpired, "The error should be of type ErrTTLExpired") - }) - t.Run("should evict item on context cancellation", func(t *testing.T) { t.Parallel() // --- ARRANGE --- h := newTestHarness(t, testCleanupTick) ctx, cancel := context.WithCancel(context.Background()) req := typesmocks.NewMockFlowControlRequest(100, "req-ctx-cancel", testFlow, ctx) - item := NewItem(req, testTTL, h.mockClock.Now()) + item := NewItem(req, testTTL, h.clock.Now()) h.addQueue(testFlow) // --- ACT --- h.Start() - defer h.Stop() - h.processor.Enqueue(item) + require.NoError(t, h.processor.Submit(item), "precondition: Submit should not fail") h.Go() - cancel() // Cancel the context *after* the item is enqueued. 
- time.Sleep(testCleanupTick * 3) // Allow the cleanup goroutine time to run. + cancel() // Cancel the context after the item is enqueued. + // Manually invoke the cleanup logic to deterministically check for the cancelled context. + h.processor.cleanupExpired(h.clock.Now()) // --- ASSERT --- outcome, err := h.waitForFinalization(item) @@ -528,7 +472,7 @@ func TestShardProcessor(t *testing.T) { outcome, err := h.waitForFinalization(item) assert.Equal(t, types.QueueOutcomeEvictedOther, outcome, "The outcome should be EvictedOther") require.Error(t, err, "An eviction on shutdown should produce an error") - assert.ErrorIs(t, err, types.ErrFlowControllerShutdown, "The error should be of type ErrFlowControllerShutdown") + assert.ErrorIs(t, err, types.ErrFlowControllerNotRunning, "The error should be of type ErrFlowControllerNotRunning") }) t.Run("should handle concurrent enqueues and dispatch all items", func(t *testing.T) { @@ -537,7 +481,7 @@ func TestShardProcessor(t *testing.T) { h := newTestHarness(t, testCleanupTick) const numConcurrentItems = 20 q := h.addQueue(testFlow) - itemsToTest := make([]*flowItem, 0, numConcurrentItems) + itemsToTest := make([]*FlowItem, 0, numConcurrentItems) for i := 0; i < numConcurrentItems; i++ { item := h.newTestItem(fmt.Sprintf("req-concurrent-%d", i), testFlow, testTTL) itemsToTest = append(itemsToTest, item) @@ -549,9 +493,9 @@ func TestShardProcessor(t *testing.T) { var wg sync.WaitGroup for _, item := range itemsToTest { wg.Add(1) - go func(fi *flowItem) { + go func(fi *FlowItem) { defer wg.Done() - h.processor.Enqueue(fi) + require.NoError(t, h.processor.Submit(fi), "Submit should not fail") }(item) } h.Go() @@ -600,8 +544,8 @@ func TestShardProcessor(t *testing.T) { <-itemIsBeingDispatched // 3. The dispatch goroutine is now paused. We can now safely win the "race" by running cleanup logic. - h.mockClock.Advance(testShortTTL * 2) - h.processor.cleanupExpired(h.mockClock.Now()) // This will remove and finalize the item. + h.clock.Step(testShortTTL * 2) + h.processor.cleanupExpired(h.clock.Now()) // This will remove and finalize the item. // 5. Un-pause the dispatch goroutine. It will now fail to remove the item and the `dispatchCycle` will // correctly conclude without finalizing the item a second time. @@ -650,7 +594,7 @@ func TestShardProcessor(t *testing.T) { h.Start() defer h.Stop() h.Go() - h.processor.Enqueue(nil) + require.NoError(t, h.processor.Submit(nil), "Submit should not fail") // --- ASSERT --- // Allow a moment for the processor to potentially process the nil item. 
@@ -669,19 +613,18 @@ func TestShardProcessor(t *testing.T) { testCases := []struct { name string setupHarness func(h *testHarness) - item *flowItem - assert func(t *testing.T, h *testHarness, item *flowItem) + item *FlowItem + assert func(t *testing.T, h *testHarness, item *FlowItem) }{ { name: "should reject item on registry queue lookup failure", setupHarness: func(h *testHarness) { h.ManagedQueueFunc = func(types.FlowKey) (contracts.ManagedQueue, error) { return nil, testErr } }, - assert: func(t *testing.T, h *testHarness, item *flowItem) { - outcome, err := item.FinalState() - assert.Equal(t, types.QueueOutcomeRejectedOther, outcome, "Outcome should be RejectedOther") - require.Error(t, err, "An error should be returned") - assert.ErrorIs(t, err, testErr, "The underlying error should be preserved") + assert: func(t *testing.T, h *testHarness, item *FlowItem) { + assert.Equal(t, types.QueueOutcomeRejectedOther, item.finalState.Outcome, "Outcome should be RejectedOther") + require.Error(t, item.finalState.Err, "An error should be returned") + assert.ErrorIs(t, item.finalState.Err, testErr, "The underlying error should be preserved") }, }, { @@ -690,11 +633,10 @@ func TestShardProcessor(t *testing.T) { h.addQueue(testFlow) h.PriorityBandAccessorFunc = func(int) (framework.PriorityBandAccessor, error) { return nil, testErr } }, - assert: func(t *testing.T, h *testHarness, item *flowItem) { - outcome, err := item.FinalState() - assert.Equal(t, types.QueueOutcomeRejectedOther, outcome, "Outcome should be RejectedOther") - require.Error(t, err, "An error should be returned") - assert.ErrorIs(t, err, testErr, "The underlying error should be preserved") + assert: func(t *testing.T, h *testHarness, item *FlowItem) { + assert.Equal(t, types.QueueOutcomeRejectedOther, item.finalState.Outcome, "Outcome should be RejectedOther") + require.Error(t, item.finalState.Err, "An error should be returned") + assert.ErrorIs(t, item.finalState.Err, testErr, "The underlying error should be preserved") }, }, { @@ -703,11 +645,10 @@ func TestShardProcessor(t *testing.T) { mockQueue := h.addQueue(testFlow) mockQueue.AddFunc = func(types.QueueItemAccessor) error { return testErr } }, - assert: func(t *testing.T, h *testHarness, item *flowItem) { - outcome, err := item.FinalState() - assert.Equal(t, types.QueueOutcomeRejectedOther, outcome, "Outcome should be RejectedOther") - require.Error(t, err, "An error should be returned") - assert.ErrorIs(t, err, testErr, "The underlying error should be preserved") + assert: func(t *testing.T, h *testHarness, item *FlowItem) { + assert.Equal(t, types.QueueOutcomeRejectedOther, item.finalState.Outcome, "Outcome should be RejectedOther") + require.Error(t, item.finalState.Err, "An error should be returned") + assert.ErrorIs(t, item.finalState.Err, testErr, "The underlying error should be preserved") }, }, { @@ -724,17 +665,16 @@ func TestShardProcessor(t *testing.T) { assert.Equal(t, 0, addCallCount, "Queue.Add should not have been called for a finalized item") }) }, - item: func() *flowItem { + item: func() *FlowItem { // Create a pre-finalized item. item := newTestHarness(t, 0).newTestItem("req-finalized", testFlow, testTTL) - item.finalize(types.QueueOutcomeDispatched, nil) + item.Finalize(types.QueueOutcomeDispatched, nil) return item }(), - assert: func(t *testing.T, h *testHarness, item *flowItem) { + assert: func(t *testing.T, h *testHarness, item *FlowItem) { // The item was already finalized, so its state should not change. 
- outcome, err := item.FinalState() - assert.Equal(t, types.QueueOutcomeDispatched, outcome, "Outcome should remain unchanged") - assert.NoError(t, err, "Error should remain unchanged") + assert.Equal(t, types.QueueOutcomeDispatched, item.finalState.Outcome, "Outcome should remain unchanged") + assert.NoError(t, item.finalState.Err, "Error should remain unchanged") }, }, } @@ -997,9 +937,9 @@ func TestShardProcessor(t *testing.T) { // --- ASSERT --- assert.True(t, dispatched, "An item should have been dispatched from the filtered flow") - outcome, err := itemB.FinalState() - assert.Equal(t, types.QueueOutcomeDispatched, outcome, "The dispatched item's outcome should be correct") - assert.NoError(t, err, "The dispatched item should not have an error") + assert.Equal(t, types.QueueOutcomeDispatched, itemB.finalState.Outcome, + "The dispatched item's outcome should be correct") + assert.NoError(t, itemB.finalState.Err, "The dispatched item should not have an error") }) t.Run("should guarantee strict priority by starving lower priority items", func(t *testing.T) { @@ -1012,8 +952,8 @@ func TestShardProcessor(t *testing.T) { qLow := h.addQueue(keyLow) const numItems = 3 - highPrioItems := make([]*flowItem, numItems) - lowPrioItems := make([]*flowItem, numItems) + highPrioItems := make([]*FlowItem, numItems) + lowPrioItems := make([]*FlowItem, numItems) for i := range numItems { // Add high priority items. itemH := h.newTestItem(fmt.Sprintf("req-high-%d", i), keyHigh, testTTL) @@ -1035,9 +975,9 @@ func TestShardProcessor(t *testing.T) { // Verify all high-priority items are gone and low-priority items remain. for _, item := range highPrioItems { - outcome, err := item.FinalState() - assert.Equal(t, types.QueueOutcomeDispatched, outcome, "High-priority item should be dispatched") - assert.NoError(t, err, "Dispatched high-priority item should not have an error") + assert.Equal(t, types.QueueOutcomeDispatched, item.finalState.Outcome, + "High-priority item should be dispatched") + assert.NoError(t, item.finalState.Err, "Dispatched high-priority item should not have an error") } assert.Equal(t, numItems, qLow.Len(), "Low-priority queue should still be full") @@ -1112,7 +1052,7 @@ func TestShardProcessor(t *testing.T) { } // --- ACT --- - h.mockClock.Advance(testShortTTL * 2) // Make the item expire. + h.clock.Step(testShortTTL * 2) // Make the item expire. err := h.processor.dispatchItem(item, h.logger) // --- ASSERT --- @@ -1121,36 +1061,11 @@ func TestShardProcessor(t *testing.T) { assert.ErrorIs(t, err, types.ErrTTLExpired, "The error should be of type ErrTTLExpired") // Second, check the final state of the item itself. 
- outcome, finalErr := item.FinalState() - assert.Equal(t, types.QueueOutcomeEvictedTTL, outcome, "The item's final outcome should be EvictedTTL") - require.Error(t, finalErr, "The item's final state should contain an error") - assert.ErrorIs(t, finalErr, types.ErrTTLExpired, "The item's final error should be of type ErrTTLExpired") - }) - - t.Run("should panic if queue returns item of wrong type", func(t *testing.T) { - t.Parallel() - // --- ARRANGE --- - h := newTestHarness(t, testCleanupTick) - badItem := &typesmocks.MockQueueItemAccessor{ - OriginalRequestV: typesmocks.NewMockFlowControlRequest(0, "bad-item", testFlow, context.Background()), - } - - h.ManagedQueueFunc = func(types.FlowKey) (contracts.ManagedQueue, error) { - return &mocks.MockManagedQueue{ - RemoveFunc: func(types.QueueItemHandle) (types.QueueItemAccessor, error) { - return badItem, nil - }, - }, nil - } - - itemToDispatch := h.newTestItem("req-dispatch-panic", testFlow, testTTL) - expectedPanicMsg := fmt.Sprintf("%s: internal error: item %q of type %T is not a *flowItem", - errIntraFlow, "bad-item", badItem) - - // --- ACT & ASSERT --- - assert.PanicsWithError(t, expectedPanicMsg, func() { - _ = h.processor.dispatchItem(itemToDispatch, h.logger) - }, "A type mismatch from a queue should cause a panic") + assert.Equal(t, types.QueueOutcomeEvictedTTL, item.finalState.Outcome, + "The item's final outcome should be EvictedTTL") + require.Error(t, item.finalState.Err, "The item's final state should contain an error") + assert.ErrorIs(t, item.finalState.Err, types.ErrTTLExpired, + "The item's final error should be of type ErrTTLExpired") }) }) @@ -1165,16 +1080,15 @@ func TestShardProcessor(t *testing.T) { item := h.newTestItem("req-expired", testFlow, 1*time.Millisecond) q := h.addQueue(testFlow) require.NoError(t, q.Add(item)) - cleanupTime := h.mockClock.Now().Add(10 * time.Millisecond) + cleanupTime := h.clock.Now().Add(10 * time.Millisecond) // --- ACT --- h.processor.cleanupExpired(cleanupTime) // --- ASSERT --- - outcome, err := item.FinalState() - assert.Equal(t, types.QueueOutcomeEvictedTTL, outcome, "Item outcome should be EvictedTTL") - require.Error(t, err, "Item should have an error") - assert.ErrorIs(t, err, types.ErrTTLExpired, "Item error should be ErrTTLExpired") + assert.Equal(t, types.QueueOutcomeEvictedTTL, item.finalState.Outcome, "Item outcome should be EvictedTTL") + require.Error(t, item.finalState.Err, "Item should have an error") + assert.ErrorIs(t, item.finalState.Err, types.ErrTTLExpired, "Item error should be ErrTTLExpired") }) t.Run("should evict all items on shutdown", func(t *testing.T) { @@ -1189,10 +1103,10 @@ func TestShardProcessor(t *testing.T) { h.processor.evictAll() // --- ASSERT --- - outcome, err := item.FinalState() - assert.Equal(t, types.QueueOutcomeEvictedOther, outcome, "Item outcome should be EvictedOther") - require.Error(t, err, "Item should have an error") - assert.ErrorIs(t, err, types.ErrFlowControllerShutdown, "Item error should be ErrFlowControllerShutdown") + assert.Equal(t, types.QueueOutcomeEvictedOther, item.finalState.Outcome, "Item outcome should be EvictedOther") + require.Error(t, item.finalState.Err, "Item should have an error") + assert.ErrorIs(t, item.finalState.Err, types.ErrFlowControllerNotRunning, + "Item error should be ErrFlowControllerNotRunning") }) t.Run("should handle registry errors gracefully during concurrent processing", func(t *testing.T) { @@ -1260,6 +1174,105 @@ func TestShardProcessor(t *testing.T) { }) }) }) + + t.Run("Public API", func(t 
*testing.T) { + t.Parallel() + + t.Run("Submit", func(t *testing.T) { + t.Parallel() + + t.Run("should return ErrProcessorBusy when channel is full", func(t *testing.T) { + t.Parallel() + h := newTestHarness(t, testCleanupTick) + h.processor.enqueueChan = make(chan *FlowItem, 1) + h.processor.enqueueChan <- h.newTestItem("item-filler", testFlow, testTTL) // Fill the channel to capacity. + + // The next submit should be non-blocking and fail immediately. + err := h.processor.Submit(h.newTestItem("item-to-reject", testFlow, testTTL)) + require.Error(t, err, "Submit must return an error when the channel is full") + assert.ErrorIs(t, err, ErrProcessorBusy, "The returned error must be ErrProcessorBusy") + }) + }) + + t.Run("SubmitOrBlock", func(t *testing.T) { + t.Parallel() + + t.Run("should block when channel is full and succeed when space becomes available", func(t *testing.T) { + t.Parallel() + h := newTestHarness(t, testCleanupTick) + h.processor.enqueueChan = make(chan *FlowItem, 1) + h.processor.enqueueChan <- h.newTestItem("item-filler", testFlow, testTTL) // Fill the channel to capacity. + + itemToSubmit := h.newTestItem("item-to-block", testFlow, testTTL) + submitErr := make(chan error, 1) + + // Run `SubmitOrBlock` in a separate goroutine, as it will block. + go func() { + submitErr <- h.processor.SubmitOrBlock(context.Background(), itemToSubmit) + }() + + // Prove that the call is blocking by ensuring it hasn't returned an error yet. + time.Sleep(20 * time.Millisecond) + require.Len(t, submitErr, 0, "SubmitOrBlock should be blocking and not have returned yet") + <-h.processor.enqueueChan // Make space in the channel. This should unblock the goroutine. + + select { + case err := <-submitErr: + require.NoError(t, err, "SubmitOrBlock should succeed and return no error after being unblocked") + case <-time.After(testWaitTimeout): + t.Fatal("SubmitOrBlock did not return after space was made in the channel") + } + }) + + t.Run("should unblock and return context error on cancellation", func(t *testing.T) { + t.Parallel() + h := newTestHarness(t, testCleanupTick) + h.processor.enqueueChan = make(chan *FlowItem) // Use an unbuffered channel to guarantee the first send blocks. + itemToSubmit := h.newTestItem("item-to-cancel", testFlow, testTTL) + submitErr := make(chan error, 1) + ctx, cancel := context.WithCancel(context.Background()) + + // Run `SubmitOrBlock` in a separate goroutine, as it will block. + go func() { + submitErr <- h.processor.SubmitOrBlock(ctx, itemToSubmit) + }() + + // Prove that the call is blocking. + time.Sleep(20 * time.Millisecond) + require.Len(t, submitErr, 0, "SubmitOrBlock should be blocking and not have returned yet") + cancel() // Cancel the context. This should unblock the goroutine. + + select { + case err := <-submitErr: + require.Error(t, err, "SubmitOrBlock should return an error after context cancellation") + assert.ErrorIs(t, err, context.Canceled, "The returned error must be context.Canceled") + case <-time.After(testWaitTimeout): + t.Fatal("SubmitOrBlock did not return after context was cancelled") + } + }) + + t.Run("should reject immediately if shutting down", func(t *testing.T) { + t.Parallel() + h := newTestHarness(t, testCleanupTick) + item := h.newTestItem("req-shutdown-reject", testFlow, testTTL) + h.addQueue(testFlow) + + h.Start() + h.Go() + h.Stop() // Stop the processor, then immediately try to enqueue. 
+				err := h.processor.SubmitOrBlock(context.Background(), item)
+
+				require.Error(t, err, "SubmitOrBlock should return an error when shutting down")
+				assert.ErrorIs(t, err, types.ErrFlowControllerNotRunning, "The error should be ErrFlowControllerNotRunning")
+
+				outcome, err := h.waitForFinalization(item)
+				assert.Equal(t, types.QueueOutcomeRejectedOther, outcome, "The outcome should be RejectedOther")
+				require.Error(t, err, "Finalization should include an error")
+				assert.ErrorIs(t, err, types.ErrFlowControllerNotRunning,
+					"The finalization error should be ErrFlowControllerNotRunning")
+			})
+		})
+	})
 }
 
 func TestCheckItemExpiry(t *testing.T) {
@@ -1329,7 +1342,7 @@ func TestCheckItemExpiry(t *testing.T) {
 					typesmocks.NewMockFlowControlRequest(100, "req-finalized", testFlow, context.Background()),
 					testTTL,
 					now)
-				i.finalize(types.QueueOutcomeDispatched, nil)
+				i.Finalize(types.QueueOutcomeDispatched, nil)
 				return i
 			}(),
 			now: now,
@@ -1362,20 +1375,4 @@ func TestCheckItemExpiry(t *testing.T) {
 			}
 		})
 	}
-
-	t.Run("should panic on item of an unexpected type", func(t *testing.T) {
-		t.Parallel()
-		// --- ARRANGE ---
-		badItem := &typesmocks.MockQueueItemAccessor{
-			OriginalRequestV: typesmocks.NewMockFlowControlRequest(0, "item-bad-type", testFlow, context.Background()),
-		}
-
-		expectedPanicMsg := fmt.Sprintf("internal error: item %q of type %T is not a *flowItem",
-			badItem.OriginalRequestV.ID(), badItem)
-
-		// --- ACT & ASSERT ---
-		assert.PanicsWithError(t, expectedPanicMsg, func() {
-			_, _, _ = checkItemExpiry(badItem, time.Now())
-		}, "A type mismatch from a queue should cause a panic")
-	})
 }

From 02c3f2e40e09b8affce3ce888bceec7d5b53ba51 Mon Sep 17 00:00:00 2001
From: Luke Van Drie
Date: Thu, 4 Sep 2025 01:07:54 +0000
Subject: [PATCH 3/5] feat: Introduce the FlowController supervisor

This commit introduces the `FlowController`, a high-throughput, sharded
supervisor that orchestrates a pool of stateful `ShardProcessor` workers.
This new component is the central processing engine of the Flow Control
system, implementing a "supervisor-worker" pattern.

Key features of the `FlowController` include:

- Supervisor-Worker Architecture: Acts as a stateless supervisor, managing
  the lifecycle of stateful `ShardProcessor` workers. It includes a
  reconciliation loop to garbage-collect workers for stale shards.
- Flow-Aware Load Balancing: Implements a "Join-Shortest-Queue-by-Bytes"
  (JSQ-Bytes) algorithm to distribute incoming requests to the least-loaded
  worker, promoting emergent fairness.
- Synchronous API: Exposes a blocking `EnqueueAndWait` method, which
  simplifies client integration (e.g., with Envoy `ext_proc`) and provides
  direct backpressure.
- Lazy Worker Initialization: Workers are created on-demand when a shard
  first becomes active to conserve resources and reduce contention on the
  hot path.
- Configuration: A new `Config` object allows for tuning parameters like
  TTLs, buffer sizes, and reconciliation intervals.
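
For reviewers, the intended integration shape is sketched below. This is an
illustrative example only: `startFlowControl`, `admitRequest`, and the
`registry`/`detector`/`req` values are hypothetical placeholders supplied by
the integrating component (e.g., the ext_proc request handler); only the
`controller` API calls come from this patch.

```go
package example // illustrative sketch, not part of this patch

import (
	"context"
	"time"

	"github.com/go-logr/logr"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/contracts"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/controller"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types"
)

// startFlowControl constructs the controller and starts its supervisor loop.
// The registry and detector are assumed to be built elsewhere.
func startFlowControl(
	ctx context.Context,
	registry contracts.FlowRegistry,
	detector contracts.SaturationDetector,
	logger logr.Logger,
) (*controller.FlowController, error) {
	cfg := controller.Config{DefaultRequestTTL: 30 * time.Second} // remaining fields fall back to defaults
	fc, err := controller.NewFlowController(cfg, registry, detector, logger)
	if err != nil {
		return nil, err
	}
	// Run blocks until ctx is cancelled and all workers have terminated, so it gets its own goroutine.
	go fc.Run(ctx)
	return fc, nil
}

// admitRequest is the per-request call site; req is assumed to implement types.FlowControlRequest.
func admitRequest(fc *controller.FlowController, req types.FlowControlRequest) error {
	outcome, err := fc.EnqueueAndWait(req) // blocks until dispatched, rejected, or evicted
	if outcome != types.QueueOutcomeDispatched {
		// err wraps types.ErrRejected or types.ErrEvicted with the specific cause.
		return err
	}
	return nil // safe to forward the request
}
```

The sketch assumes the caller maps a non-dispatched outcome to an immediate
error response; the actual mapping to HTTP/ext_proc status codes is out of
scope for this patch.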
--- pkg/epp/flowcontrol/controller/config.go | 110 +++ pkg/epp/flowcontrol/controller/config_test.go | 135 +++ pkg/epp/flowcontrol/controller/controller.go | 478 ++++++++++ .../flowcontrol/controller/controller_test.go | 862 ++++++++++++++++++ pkg/epp/flowcontrol/controller/doc.go | 110 +-- 5 files changed, 1597 insertions(+), 98 deletions(-) create mode 100644 pkg/epp/flowcontrol/controller/config.go create mode 100644 pkg/epp/flowcontrol/controller/config_test.go create mode 100644 pkg/epp/flowcontrol/controller/controller.go create mode 100644 pkg/epp/flowcontrol/controller/controller_test.go diff --git a/pkg/epp/flowcontrol/controller/config.go b/pkg/epp/flowcontrol/controller/config.go new file mode 100644 index 000000000..17c1429aa --- /dev/null +++ b/pkg/epp/flowcontrol/controller/config.go @@ -0,0 +1,110 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "fmt" + "time" +) + +const ( + // defaultExpiryCleanupInterval is the default frequency for scanning for expired items. + defaultExpiryCleanupInterval = 1 * time.Second + // defaultProcessorReconciliationInterval is the default frequency for the supervisor loop. + defaultProcessorReconciliationInterval = 5 * time.Second + // defaultEnqueueChannelBufferSize is the default size of a worker's incoming request buffer. + defaultEnqueueChannelBufferSize = 100 +) + +// Config holds the configuration for the `FlowController`. +type Config struct { + // DefaultRequestTTL is the default Time-To-Live applied to requests that do not + // specify their own TTL hint. + // Optional: If zero, no TTL is applied by default and we rely solely on request context cancellation. + DefaultRequestTTL time.Duration + + // ExpiryCleanupInterval is the interval at which each shard processor scans its queues for expired items. + // Optional: Defaults to `defaultExpiryCleanupInterval` (1 second). + ExpiryCleanupInterval time.Duration + + // ProcessorReconciliationInterval is the frequency at which the `FlowController`'s supervisor loop garbage collects + // stale workers. + // Optional: Defaults to `defaultProcessorReconciliationInterval` (5 seconds). + ProcessorReconciliationInterval time.Duration + + // EnqueueChannelBufferSize is the size of the buffered channel that accepts incoming requests for each shard + // processor. This buffer acts as a shock absorber, decoupling the high-frequency distributor from the processor's + // serial execution loop and allowing the system to handle short bursts of traffic without blocking. + // Optional: Defaults to `defaultEnqueueChannelBufferSize` (100). + EnqueueChannelBufferSize int +} + +// newConfig performs validation and initialization, returning a guaranteed-valid `Config` object. +// This is the required constructor for creating a new configuration. +// It does not mutate the input `cfg`. 
+func newConfig(cfg Config) (*Config, error) {
+	newCfg := cfg.deepCopy()
+	if err := newCfg.validateAndApplyDefaults(); err != nil {
+		return nil, err
+	}
+	return newCfg, nil
+}
+
+// validateAndApplyDefaults checks the global configuration for validity and then mutates the receiver to populate any
+// empty fields with system defaults.
+func (c *Config) validateAndApplyDefaults() error {
+	// --- Validation ---
+	if c.DefaultRequestTTL < 0 {
+		return fmt.Errorf("DefaultRequestTTL cannot be negative, but got %v", c.DefaultRequestTTL)
+	}
+	if c.ExpiryCleanupInterval < 0 {
+		return fmt.Errorf("ExpiryCleanupInterval cannot be negative, but got %v", c.ExpiryCleanupInterval)
+	}
+	if c.ProcessorReconciliationInterval < 0 {
+		return fmt.Errorf("ProcessorReconciliationInterval cannot be negative, but got %v",
+			c.ProcessorReconciliationInterval)
+	}
+	if c.EnqueueChannelBufferSize < 0 {
+		return fmt.Errorf("EnqueueChannelBufferSize cannot be negative, but got %d", c.EnqueueChannelBufferSize)
+	}
+
+	// --- Defaulting ---
+	if c.ExpiryCleanupInterval == 0 {
+		c.ExpiryCleanupInterval = defaultExpiryCleanupInterval
+	}
+	if c.ProcessorReconciliationInterval == 0 {
+		c.ProcessorReconciliationInterval = defaultProcessorReconciliationInterval
+	}
+	if c.EnqueueChannelBufferSize == 0 {
+		c.EnqueueChannelBufferSize = defaultEnqueueChannelBufferSize
+	}
+	return nil
+}
+
+// deepCopy creates a deep copy of the `Config` object.
+func (c *Config) deepCopy() *Config {
+	if c == nil {
+		return nil
+	}
+	newCfg := &Config{
+		DefaultRequestTTL:               c.DefaultRequestTTL,
+		ExpiryCleanupInterval:           c.ExpiryCleanupInterval,
+		ProcessorReconciliationInterval: c.ProcessorReconciliationInterval,
+		EnqueueChannelBufferSize:        c.EnqueueChannelBufferSize,
+	}
+	return newCfg
+}
diff --git a/pkg/epp/flowcontrol/controller/config_test.go b/pkg/epp/flowcontrol/controller/config_test.go
new file mode 100644
index 000000000..6dd28ec8c
--- /dev/null
+++ b/pkg/epp/flowcontrol/controller/config_test.go
@@ -0,0 +1,135 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package controller + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestNewConfig(t *testing.T) { + t.Parallel() + + testCases := []struct { + name string + input Config + expectErr bool + expectedCfg Config + shouldDefault bool + }{ + { + name: "ValidConfig_NoChanges", + input: Config{ + DefaultRequestTTL: 10 * time.Second, + ExpiryCleanupInterval: 2 * time.Second, + ProcessorReconciliationInterval: 10 * time.Second, + EnqueueChannelBufferSize: 200, + }, + expectErr: false, + expectedCfg: Config{ + DefaultRequestTTL: 10 * time.Second, + ExpiryCleanupInterval: 2 * time.Second, + ProcessorReconciliationInterval: 10 * time.Second, + EnqueueChannelBufferSize: 200, + }, + }, + { + name: "EmptyConfig_ShouldApplyDefaults", + input: Config{}, + expectErr: false, + expectedCfg: Config{ + DefaultRequestTTL: 0, + ExpiryCleanupInterval: defaultExpiryCleanupInterval, + ProcessorReconciliationInterval: defaultProcessorReconciliationInterval, + EnqueueChannelBufferSize: defaultEnqueueChannelBufferSize, + }, + shouldDefault: true, + }, + { + name: "NegativeDefaultRequestTTL_Invalid", + input: Config{DefaultRequestTTL: -1}, + expectErr: true, + }, + { + name: "NegativeExpiryCleanupInterval_Invalid", + input: Config{ExpiryCleanupInterval: -1}, + expectErr: true, + }, + { + name: "NegativeProcessorReconciliationInterval_Invalid", + input: Config{ProcessorReconciliationInterval: -1}, + expectErr: true, + }, + { + name: "NegativeEnqueueChannelBufferSize_Invalid", + input: Config{EnqueueChannelBufferSize: -1}, + expectErr: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + originalInput := tc.input.deepCopy() + validatedCfg, err := newConfig(tc.input) + + if tc.expectErr { + require.Error(t, err, "expected an error but got nil") + assert.Nil(t, validatedCfg, "validatedCfg should be nil on error") + } else { + require.NoError(t, err, "expected no error but got: %v", err) + require.NotNil(t, validatedCfg, "validatedCfg should not be nil on success") + assert.Equal(t, tc.expectedCfg, *validatedCfg, "validatedCfg should match expected config") + + // Ensure the original config is not mutated. + assert.Equal(t, *originalInput, tc.input, "input config should not be mutated") + } + }) + } +} + +func TestConfig_DeepCopy(t *testing.T) { + t.Parallel() + + t.Run("ShouldReturnNil_ForNilReceiver", func(t *testing.T) { + t.Parallel() + var nilConfig *Config + assert.Nil(t, nilConfig.deepCopy(), "Deep copy of a nil config should be nil") + }) + + t.Run("ShouldCreateIdenticalButSeparateObject", func(t *testing.T) { + t.Parallel() + original := &Config{ + DefaultRequestTTL: 1 * time.Second, + ExpiryCleanupInterval: 2 * time.Second, + ProcessorReconciliationInterval: 3 * time.Second, + EnqueueChannelBufferSize: 4, + } + clone := original.deepCopy() + + require.NotSame(t, original, clone, "Clone should be a new object in memory") + assert.Equal(t, *original, *clone, "Cloned object should have identical values") + + // Modify the clone and ensure the original is unchanged. 
+ clone.DefaultRequestTTL = 99 * time.Second + assert.NotEqual(t, original.DefaultRequestTTL, clone.DefaultRequestTTL, + "Original should not be mutated after clone is changed") + }) +} diff --git a/pkg/epp/flowcontrol/controller/controller.go b/pkg/epp/flowcontrol/controller/controller.go new file mode 100644 index 000000000..9d67b0e42 --- /dev/null +++ b/pkg/epp/flowcontrol/controller/controller.go @@ -0,0 +1,478 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package controller contains the implementation of the `FlowController` engine. +// +// The FlowController is the central processing engine of the Flow Control system. It is a sharded, high-throughput +// component responsible for managing the lifecycle of all incoming requests. It achieves this by acting as a stateless +// supervisor that orchestrates a pool of stateful workers (`internal.ShardProcessor`), distributing incoming requests +// among them using a sophisticated load-balancing algorithm. +package controller + +import ( + "cmp" + "context" + "errors" + "fmt" + "slices" + "sync" + "sync/atomic" + "time" + + "github.com/go-logr/logr" + "k8s.io/utils/clock" + + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/contracts" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/controller/internal" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +// registryClient defines the minimal interface that the `FlowController` needs to interact with the `FlowRegistry`. +type registryClient interface { + contracts.FlowRegistryObserver + contracts.FlowRegistryDataPlane +} + +// shardProcessor is the minimal internal interface that the `FlowController` requires from its workers. +// This abstraction allows for the injection of mock processors during testing. +type shardProcessor interface { + Run(ctx context.Context) + Submit(item *internal.FlowItem) error + SubmitOrBlock(ctx context.Context, item *internal.FlowItem) error +} + +// shardProcessorFactory defines the signature for a function that creates a `shardProcessor`. +// This enables dependency injection for testing. +type shardProcessorFactory func( + shard contracts.RegistryShard, + dispatchFilter internal.BandFilter, + clock clock.Clock, + expiryCleanupInterval time.Duration, + enqueueChannelBufferSize int, + logger logr.Logger, +) shardProcessor + +var _ shardProcessor = &internal.ShardProcessor{} + +// managedWorker holds the state for a single supervised worker. +type managedWorker struct { + processor shardProcessor + cancel context.CancelFunc +} + +// FlowController is the central, high-throughput engine of the Flow Control system. +// It is designed as a stateless distributor that orchestrates a pool of stateful workers (`internal.ShardProcessor`), +// following a "supervisor-worker" pattern. 
+// +// The controller's `Run` loop executes periodically, acting as a garbage collector that keeps the pool of running +// workers synchronized with the dynamic shard topology of the `FlowRegistry`. +type FlowController struct { + // --- Immutable dependencies (set at construction) --- + + config Config + registry registryClient + saturationDetector contracts.SaturationDetector + clock clock.WithTicker + logger logr.Logger + shardProcessorFactory shardProcessorFactory + + // --- Lifecycle state --- + + // parentCtx is the root context for the controller's lifecycle, established when `Run` is called. + // It is the parent for all long-lived worker goroutines. + parentCtx context.Context + + // --- Concurrent state --- + + // workers is a highly concurrent map storing the `managedWorker` for each shard. + // It is the controller's source of truth for the worker pool. + // The key is the shard ID (`string`), and the value is a `*managedWorker`. + workers sync.Map + + // ready is closed by the Run method once initialization is complete, including setting the `parentCtx`. + // This acts as a memory barrier and synchronization point for all other concurrent methods. + ready chan struct{} + + isRunning atomic.Bool + wg sync.WaitGroup +} + +// flowControllerOption is a function that applies a configuration change to a `FlowController`. +// test-only +type flowControllerOption func(*FlowController) + +// withClock returns a test-only option to inject a clock. +// test-only +func withClock(c clock.WithTicker) flowControllerOption { + return func(fc *FlowController) { + fc.clock = c + } +} + +// withRegistryClient returns a test-only option to inject a mock or fake registry client. +// test-only +func withRegistryClient(client registryClient) flowControllerOption { + return func(fc *FlowController) { + fc.registry = client + } +} + +// withShardProcessorFactory returns a test-only option to inject a processor factory. +// test-only +func withShardProcessorFactory(factory shardProcessorFactory) flowControllerOption { + return func(fc *FlowController) { + fc.shardProcessorFactory = factory + } +} + +// NewFlowController creates a new `FlowController` instance. +func NewFlowController( + config Config, + registry contracts.FlowRegistry, + sd contracts.SaturationDetector, + logger logr.Logger, + opts ...flowControllerOption, +) (*FlowController, error) { + validatedConfig, err := newConfig(config) + if err != nil { + return nil, fmt.Errorf("invalid flow controller configuration: %w", err) + } + + fc := &FlowController{ + config: *validatedConfig, + registry: registry, + saturationDetector: sd, + clock: clock.RealClock{}, + logger: logger.WithName("flow-controller"), + parentCtx: context.Background(), // Will be set in `Run` + ready: make(chan struct{}), + } + + // Use the real shard processor implementation by default. + fc.shardProcessorFactory = func( + shard contracts.RegistryShard, + dispatchFilter internal.BandFilter, + clock clock.Clock, + expiryCleanupInterval time.Duration, + enqueueChannelBufferSize int, + logger logr.Logger, + ) shardProcessor { + return internal.NewShardProcessor( + shard, + dispatchFilter, + clock, + expiryCleanupInterval, + enqueueChannelBufferSize, + logger) + } + + for _, opt := range opts { + opt(fc) + } + return fc, nil +} + +// Run starts the `FlowController`'s main reconciliation loop. +// This loop is responsible for garbage collecting workers whose shards no longer exist in the registry. 
+// This method blocks until the provided context is cancelled and ALL worker goroutines have fully terminated.
+func (fc *FlowController) Run(ctx context.Context) {
+	if !fc.isRunning.CompareAndSwap(false, true) {
+		fc.logger.Error(nil, "FlowController Run loop already started or controller is shut down")
+		return
+	}
+
+	fc.parentCtx = ctx
+	close(fc.ready)
+
+	fc.logger.Info("Starting FlowController reconciliation loop.")
+	defer fc.logger.Info("FlowController reconciliation loop stopped.")
+
+	ticker := fc.clock.NewTicker(fc.config.ProcessorReconciliationInterval)
+	defer ticker.Stop()
+
+	fc.reconcileProcessors() // Initial reconciliation
+
+	for {
+		select {
+		case <-ctx.Done():
+			fc.shutdown()
+			return
+		case <-ticker.C():
+			fc.reconcileProcessors()
+		}
+	}
+}
+
+// EnqueueAndWait is the primary, synchronous entry point to the Flow Control system. It submits a request and blocks
+// until the request reaches a terminal outcome (dispatched, rejected, or evicted).
+//
+// # Design Rationale: The Synchronous Model
+//
+// This blocking model is deliberately chosen for its simplicity and robustness, especially in the context of Envoy
+// External Processing (`ext_proc`), which operates on a stream-based protocol.
+//
+//   - `ext_proc` Alignment: A single goroutine typically manages the stream for a given HTTP request.
+//     `EnqueueAndWait` fits this perfectly: the request-handling goroutine calls it, blocks, and upon return, has a
+//     definitive outcome to act upon.
+//   - Simplified State Management: The state of a "waiting" request is implicitly managed by the blocked goroutine's
+//     stack and its `context.Context`. The system only needs to signal this specific goroutine to unblock it.
+//   - Direct Backpressure: If queues are full, `EnqueueAndWait` returns an error immediately, providing direct
+//     backpressure to the caller.
+func (fc *FlowController) EnqueueAndWait(req types.FlowControlRequest) (types.QueueOutcome, error) {
+	if req == nil {
+		return types.QueueOutcomeRejectedOther, fmt.Errorf("%w: %w", types.ErrRejected, types.ErrNilRequest)
+	}
+	effectiveTTL := req.InitialEffectiveTTL()
+	if effectiveTTL <= 0 {
+		effectiveTTL = fc.config.DefaultRequestTTL
+	}
+	enqueueTime := fc.clock.Now()
+
+	for {
+		if !fc.isRunning.Load() {
+			return types.QueueOutcomeRejectedOther, fmt.Errorf("%w: %w", types.ErrRejected, types.ErrFlowControllerNotRunning)
+		}
+
+		// We must create a fresh `FlowItem` on each attempt because finalization is idempotent: once an item has been
+		// finalized, it cannot be re-enqueued. However, we reuse the original, preserved `enqueueTime` so that retries
+		// do not reset the request's TTL.
+		item := internal.NewItem(req, effectiveTTL, enqueueTime)
+		if outcome, err := fc.distributeRequest(item); err != nil {
+			return outcome, fmt.Errorf("%w: %w", types.ErrRejected, err)
+		}
+
+		finalState := <-item.Done() // Finalization handles monitoring of request context cancellation and TTL expiry.
+		if errors.Is(finalState.Err, contracts.ErrShardDraining) {
+			fc.logger.V(logutil.DEBUG).Info("Shard is draining, retrying request", "requestID", req.ID())
+			// This is a benign race: the chosen `contracts.RegistryShard` became Draining after selection but before the
+			// item was enqueued into its `contracts.ManagedQueue`. Simply try again.
+			continue
+		}
+
+		return finalState.Outcome, finalState.Err
+	}
+}
+
+// distributeRequest selects the optimal shard processor for a given item and attempts to submit it.
+//
+// # Architectural Deep Dive: Achieving Emergent Fairness with Data Parallelism
+//
+// To achieve high throughput and prevent a central bottleneck, the `FlowController` is built on a sharded,
+// data-parallel architecture. It runs multiple `internal.ShardProcessor` workers, and every logical flow is
+// represented by a dedicated queue on every Active shard. This design grants the distributor maximum flexibility to
+// route traffic based on real-time load.
+//
+// This function implements a sophisticated distribution strategy: Flow-Aware, Two-Phase Join-Shortest-Queue-by-Bytes
+// (JSQ-Bytes) with Graceful Backpressure. It is designed to balance load, prevent unnecessary rejections under
+// transient spikes, and create the necessary conditions for global fairness goals to emerge from local, independent
+// worker actions.
+//
+// # The Algorithm in Detail
+//
+//  1. Flow-Aware Candidate Selection: For an incoming request, the controller inspects the queue depth (in bytes) for
+//     that specific flow across all Active shards. It then sorts these shards from least-loaded to most-loaded,
+//     creating a ranked list of candidates.
+//  2. Phase 1 (Non-blocking Fast Failover): The controller iterates through the sorted candidates and attempts a
+//     non-blocking `Submit()` to each. If any processor accepts the item, the operation succeeds immediately.
+//     This prevents a single, momentarily busy worker from stalling the entire system.
+//  3. Phase 2 (Blocking Fallback): If all processors are busy, it attempts a single, blocking `SubmitOrBlock()` on the
+//     best candidate. This provides graceful backpressure and increases the likelihood of success during transient
+//     traffic bursts.
+//
+// # Design Rationale and Critical Assumptions
+//
+// ### 1. The Flow Homogeneity Assumption
+//
+// The first assumption is that traffic within a single logical flow is roughly homogeneous. The `types.FlowKey` is
+// the primary mechanism for grouping requests that are expected to have statistically similar behavior (e.g., prefill,
+// decode). For this to be effective, a flow must meaningfully represent a single workload (e.g., the same model, user
+// cohort, or task type). The more closely this assumption is satisfied in practice, the more stable and predictable
+// the system dynamics will be.
+//
+// ### Robustness Through Real-Time Adaptation
+//
+// The system is designed to be robust even when the homogeneity assumption is imperfect. The distribution algorithm
+// does not need to predict workload characteristics; it only needs to react to their consequences in real time.
+// If a shard becomes slow or congested, the backlogs of its queues will grow. The JSQ-Bytes algorithm will naturally
+// observe this increase in byte size and adaptively steer new work away from the congested shard.
+//
+// ### 2. The Shard Homogeneity Constraint (Enabling Stateful Policies)
+//
+// The second, and most critical, constraint of this data-parallel design relates to the policies executed by the
+// workers. The fairness (`InterFlowDispatchPolicy`) and temporal scheduling (`IntraFlowDispatchPolicy`) policies may
+// be stateful (e.g., a fairness algorithm tracking historical tokens served).
+//
+// For the independent decisions of these stateful policies to result in coherent, globally fair outcomes, the state
+// they observe on each shard must be statistically similar. This is the shard homogeneity constraint.
+//
+// This constraint is actively enforced by the Flow-Aware JSQ-Bytes algorithm.
By constantly balancing the load for each +// flow individually, the distributor ensures that, over time, the mix of traffic on each shard is roughly proportional. +// It actively prevents one shard from becoming specialized in serving a single, dominant flow. +// +// This creates the necessary foundation for our model: local, stateful policy decisions, when aggregated across +// statistically similar shards, result in an emergent, approximately correct global fairness objective. This is key to +// unlocking scalable performance without a central, bottlenecked scheduler. +func (fc *FlowController) distributeRequest(item *internal.FlowItem) (types.QueueOutcome, error) { + key := item.OriginalRequest().FlowKey() + reqID := item.OriginalRequest().ID() + type candidate struct { + processor shardProcessor + shardID string + byteSize uint64 + } + var candidates []candidate + err := fc.registry.WithConnection(key, func(conn contracts.ActiveFlowConnection) error { + shards := conn.ActiveShards() + candidates = make([]candidate, len(shards)) + for i, shard := range shards { + worker := fc.getOrStartWorker(shard) + mq, err := shard.ManagedQueue(key) + if err != nil { + panic(fmt.Sprintf("invariant violation: ManagedQueue for leased flow %s failed on shard %s: %v", + key, shard.ID(), err)) + } + candidates[i] = candidate{worker.processor, shard.ID(), mq.ByteSize()} + } + return nil + }) + if err != nil { + return types.QueueOutcomeRejectedOther, fmt.Errorf("failed to establish connection for request %q (flow %s): %w", + reqID, key, err) + } + + if len(candidates) == 0 { + return types.QueueOutcomeRejectedCapacity, fmt.Errorf("no viable Active shards available for request %q (flow %s)", + reqID, key) + } + + slices.SortFunc(candidates, func(a, b candidate) int { + return cmp.Compare(a.byteSize, b.byteSize) + }) + + // --- Phase 1: Fast, non-blocking failover attempt --- + for _, c := range candidates { + if err := c.processor.Submit(item); err == nil { + return types.QueueOutcomeNotYetFinalized, nil // Success + } + fc.logger.V(logutil.DEBUG).Info("Processor busy during fast failover, trying next candidate", + "shardID", c.shardID, "requestID", reqID) + } + + // --- Phase 2: All processors busy. Attempt a single blocking send to the best candidate. --- + bestCandidate := candidates[0] + fc.logger.V(logutil.DEBUG).Info("All processors busy, attempting blocking submit to best candidate", + "shardID", bestCandidate.shardID, "requestID", reqID, "queueByteSize", bestCandidate.byteSize) + + err = bestCandidate.processor.SubmitOrBlock(item.OriginalRequest().Context(), item) + if err != nil { + // If even the blocking attempt fails (e.g., context cancelled or processor shut down), the request is definitively + // rejected. + return types.QueueOutcomeRejectedCapacity, fmt.Errorf( + "all viable shard processors are at capacity for request %q (flow %s): %w", reqID, key, err) + } + return types.QueueOutcomeNotYetFinalized, nil +} + +// getOrStartWorker implements the lazy-loading and startup of shard processors. +// It attempts to retrieve an existing worker for a shard, and if one doesn't exist, it creates, starts, and +// registers it atomically. +// This ensures that workers are only created on-demand when a shard first becomes Active. +func (fc *FlowController) getOrStartWorker(shard contracts.RegistryShard) *managedWorker { + if w, ok := fc.workers.Load(shard.ID()); ok { + return w.(*managedWorker) + } + + // Atomically load or store. 
+ // This handles the race condition where multiple goroutines try to create the same worker. + newWorker := fc.startNewWorker(shard) + actual, loaded := fc.workers.LoadOrStore(shard.ID(), newWorker) + if loaded { + // Another goroutine beat us to it; the `newWorker` we created was not stored. + // We must clean it up immediately to prevent resource leaks. + newWorker.cancel() + return actual.(*managedWorker) + } + return newWorker +} + +// startNewWorker encapsulates the logic for creating and starting a new worker goroutine. +func (fc *FlowController) startNewWorker(shard contracts.RegistryShard) *managedWorker { + <-fc.ready // We must wait until the parent context is initialized. + processorCtx, cancel := context.WithCancel(fc.parentCtx) + dispatchFilter := internal.NewSaturationFilter(fc.saturationDetector) + processor := fc.shardProcessorFactory( + shard, + dispatchFilter, + fc.clock, + fc.config.ExpiryCleanupInterval, + fc.config.EnqueueChannelBufferSize, + fc.logger.WithValues("shardID", shard.ID()), + ) + + worker := &managedWorker{ + processor: processor, + cancel: cancel, + } + + fc.wg.Add(1) + go func() { + defer fc.wg.Done() + processor.Run(processorCtx) + }() + + return worker +} + +// reconcileProcessors is the supervisor's core garbage collection loop. +// It fetches the current list of Active shards from the registry and removes any workers whose corresponding shards +// have been fully drained and garbage collected by the registry. +func (fc *FlowController) reconcileProcessors() { + stats := fc.registry.ShardStats() + shards := make(map[string]struct{}, len(stats)) // `map[shardID] -> isActive` + for _, s := range stats { + shards[s.ID] = struct{}{} + } + + fc.workers.Range(func(key, value any) bool { + shardID := key.(string) + worker := value.(*managedWorker) + + // GC check: Is the shard no longer in the registry at all? + if _, exists := shards[shardID]; !exists { + fc.logger.Info("Stale worker detected for GC'd shard, shutting down.", "shardID", shardID) + worker.cancel() + fc.workers.Delete(shardID) + } + return true + }) +} + +// shutdown gracefully terminates all running `shardProcessor` goroutines. +// It signals all workers to stop and waits for them to complete their shutdown procedures. +func (fc *FlowController) shutdown() { + fc.isRunning.Store(false) + fc.logger.Info("Shutting down FlowController and all shard processors.") + fc.workers.Range(func(key, value any) bool { + shardID := key.(string) + worker := value.(*managedWorker) + fc.logger.V(logutil.VERBOSE).Info("Sending shutdown signal to processor", "shardID", shardID) + worker.cancel() + return true + }) + + fc.wg.Wait() + fc.logger.Info("All shard processors have shut down.") +} diff --git a/pkg/epp/flowcontrol/controller/controller_test.go b/pkg/epp/flowcontrol/controller/controller_test.go new file mode 100644 index 000000000..20bf70175 --- /dev/null +++ b/pkg/epp/flowcontrol/controller/controller_test.go @@ -0,0 +1,862 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package controller + +import ( + "context" + "errors" + "fmt" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/go-logr/logr" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "k8s.io/utils/clock" + testclock "k8s.io/utils/clock/testing" + + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/contracts" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/contracts/mocks" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/controller/internal" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/framework" + frameworkmocks "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/framework/mocks" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types" + typesmocks "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types/mocks" +) + +// --- Test Harness & Fixtures --- + +// testHarness holds the `FlowController` and its dependencies under test. +type testHarness struct { + fc *FlowController + cfg Config + mockRegistry *mockRegistryClient + mockDetector *mocks.MockSaturationDetector + mockClock *testclock.FakeClock + mockProcessorFactory *mockShardProcessorFactory +} + +func (h *testHarness) start(t *testing.T) { + t.Helper() + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + + go h.fc.Run(ctx) + + select { + case <-h.fc.ready: + // Success + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for FlowController to become ready") + } +} + +// newUnitHarness creates a test environment with a mock processor factory, suitable for focused unit tests of the +// controller's logic. +func newUnitHarness(t *testing.T, cfg Config) *testHarness { + t.Helper() + mockRegistry := &mockRegistryClient{} + mockDetector := &mocks.MockSaturationDetector{} + mockClock := testclock.NewFakeClock(time.Now()) + mockProcessorFactory := &mockShardProcessorFactory{ + processors: make(map[string]*mockShardProcessor), + } + opts := []flowControllerOption{ + withRegistryClient(mockRegistry), + withClock(mockClock), + withShardProcessorFactory(mockProcessorFactory.new), + } + fc, err := NewFlowController(cfg, mockRegistry, mockDetector, logr.Discard(), opts...) + require.NoError(t, err, "failed to create FlowController for unit test harness") + return &testHarness{ + fc: fc, + cfg: cfg, + mockRegistry: mockRegistry, + mockDetector: mockDetector, + mockClock: mockClock, + mockProcessorFactory: mockProcessorFactory, + } +} + +// newIntegrationHarness creates a test environment that uses real `ShardProcessor`s, suitable for integration tests +// validating the controller-processor interaction. +func newIntegrationHarness(t *testing.T, cfg Config) *testHarness { + t.Helper() + mockRegistry := &mockRegistryClient{} + mockDetector := &mocks.MockSaturationDetector{} + mockClock := testclock.NewFakeClock(time.Now()) + opts := []flowControllerOption{ + withRegistryClient(mockRegistry), + withClock(mockClock), + } + fc, err := NewFlowController(cfg, mockRegistry, mockDetector, logr.Discard(), opts...) + require.NoError(t, err, "failed to create FlowController for integration test harness") + return &testHarness{ + fc: fc, + cfg: cfg, + mockRegistry: mockRegistry, + mockDetector: mockDetector, + mockClock: mockClock, + } +} + +// mockActiveFlowConnection is a local mock for the `contracts.ActiveFlowConnection` interface. 
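+// It vends the configured `ActiveShardsV` slice, letting tests control exactly which shards the distributor sees.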
+type mockActiveFlowConnection struct { + contracts.ActiveFlowConnection + ActiveShardsV []contracts.RegistryShard +} + +func (m *mockActiveFlowConnection) ActiveShards() []contracts.RegistryShard { + return m.ActiveShardsV +} + +// mockRegistryClient is a mock for the private `registryClient` interface. +type mockRegistryClient struct { + contracts.FlowRegistryObserver + contracts.FlowRegistryDataPlane + WithConnectionFunc func(key types.FlowKey, fn func(conn contracts.ActiveFlowConnection) error) error + ShardStatsFunc func() []contracts.ShardStats +} + +func (m *mockRegistryClient) WithConnection( + key types.FlowKey, + fn func(conn contracts.ActiveFlowConnection) error, +) error { + if m.WithConnectionFunc != nil { + return m.WithConnectionFunc(key, fn) + } + return fn(&mockActiveFlowConnection{}) +} + +func (m *mockRegistryClient) ShardStats() []contracts.ShardStats { + if m.ShardStatsFunc != nil { + return m.ShardStatsFunc() + } + return nil +} + +// mockShardProcessor is a mock for the internal `shardProcessor` interface. +type mockShardProcessor struct { + SubmitFunc func(item *internal.FlowItem) error + SubmitOrBlockFunc func(ctx context.Context, item *internal.FlowItem) error + runCtx context.Context + runCtxMu sync.RWMutex + runStarted chan struct{} +} + +func (m *mockShardProcessor) Submit(item *internal.FlowItem) error { + if m.SubmitFunc != nil { + return m.SubmitFunc(item) + } + return nil +} + +func (m *mockShardProcessor) SubmitOrBlock(ctx context.Context, item *internal.FlowItem) error { + if m.SubmitOrBlockFunc != nil { + return m.SubmitOrBlockFunc(ctx, item) + } + return nil +} + +func (m *mockShardProcessor) Run(ctx context.Context) { + m.runCtxMu.Lock() + m.runCtx = ctx + m.runCtxMu.Unlock() + if m.runStarted != nil { + close(m.runStarted) + } + <-ctx.Done() +} + +func (m *mockShardProcessor) Context() context.Context { + m.runCtxMu.RLock() + defer m.runCtxMu.RUnlock() + return m.runCtx +} + +// mockShardProcessorFactory allows tests to inject specific `mockShardProcessor` instances. +type mockShardProcessorFactory struct { + mu sync.Mutex + processors map[string]*mockShardProcessor +} + +func (f *mockShardProcessorFactory) new( + shard contracts.RegistryShard, + _ internal.BandFilter, + _ clock.Clock, + _ time.Duration, + _ int, + _ logr.Logger, +) shardProcessor { + f.mu.Lock() + defer f.mu.Unlock() + if proc, ok := f.processors[shard.ID()]; ok { + return proc + } + // Return a default mock processor if one is not registered. + return &mockShardProcessor{} +} + +// stubManagedQueue is a simple stub for the `contracts.ManagedQueue` interface. +type stubManagedQueue struct { + contracts.ManagedQueue + byteSizeV uint64 +} + +func (s *stubManagedQueue) ByteSize() uint64 { return s.byteSizeV } + +func (s *stubManagedQueue) FlowQueueAccessor() framework.FlowQueueAccessor { + return &frameworkmocks.MockFlowQueueAccessor{ByteSizeV: s.byteSizeV} +} + +// mockShardBuilder is a fixture to declaratively build mock `contracts.RegistryShard` for tests. 
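+// The configured byte size is surfaced through the built shard's `ManagedQueue`, which is what drives the JSQ-Bytes
+// ranking in these tests.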
+type mockShardBuilder struct { + id string + byteSize uint64 +} + +func newMockShard(id string) *mockShardBuilder { + return &mockShardBuilder{id: id} +} + +func (b *mockShardBuilder) withByteSize(size uint64) *mockShardBuilder { + b.byteSize = size + return b +} + +func (b *mockShardBuilder) build() contracts.RegistryShard { + return &mocks.MockRegistryShard{ + IDFunc: func() string { return b.id }, + ManagedQueueFunc: func(_ types.FlowKey) (contracts.ManagedQueue, error) { + return &stubManagedQueue{byteSizeV: b.byteSize}, nil + }, + } +} + +var defaultFlowKey = types.FlowKey{ID: "test-flow", Priority: 100} + +func newTestRequest(ctx context.Context, key types.FlowKey) *typesmocks.MockFlowControlRequest { + return &typesmocks.MockFlowControlRequest{ + Ctx: ctx, + FlowKeyV: key, + ByteSizeV: 100, + IDV: "req-" + key.ID, + } +} + +// --- Test Cases --- + +func TestNewFlowController(t *testing.T) { + t.Parallel() + + t.Run("ErrorOnInvalidConfig", func(t *testing.T) { + t.Parallel() + invalidCfg := Config{ProcessorReconciliationInterval: -1 * time.Second} + _, err := NewFlowController(invalidCfg, &mockRegistryClient{}, &mocks.MockSaturationDetector{}, logr.Discard()) + require.Error(t, err, "NewFlowController must return an error for invalid configuration") + }) +} + +func TestFlowController_EnqueueAndWait(t *testing.T) { + t.Parallel() + + t.Run("Rejections", func(t *testing.T) { + t.Parallel() + + t.Run("OnNilRequest", func(t *testing.T) { + t.Parallel() + h := newUnitHarness(t, Config{}) // Do not start the controller, as we are testing the pre-run validation. + outcome, err := h.fc.EnqueueAndWait(nil) + require.Error(t, err, "EnqueueAndWait must reject a nil request") + assert.ErrorIs(t, err, types.ErrRejected, "error should wrap ErrRejected") + assert.ErrorIs(t, err, types.ErrNilRequest, "error should wrap ErrNilRequest") + assert.Equal(t, types.QueueOutcomeRejectedOther, outcome, "outcome should be QueueOutcomeRejectedOther") + }) + + t.Run("OnControllerNotRunning", func(t *testing.T) { + t.Parallel() + h := newUnitHarness(t, Config{}) // Harness is created but `Run()` is never called. 
+ req := newTestRequest(context.Background(), defaultFlowKey) + outcome, err := h.fc.EnqueueAndWait(req) + require.Error(t, err, "EnqueueAndWait must reject requests if controller is not running") + assert.ErrorIs(t, err, types.ErrRejected, "error should wrap ErrRejected") + assert.ErrorIs(t, err, types.ErrFlowControllerNotRunning, "error should wrap ErrFlowControllerNotRunning") + assert.Equal(t, types.QueueOutcomeRejectedOther, outcome, "outcome should be QueueOutcomeRejectedOther") + }) + + t.Run("OnNoShardsAvailable", func(t *testing.T) { + t.Parallel() + h := newUnitHarness(t, Config{}) + h.start(t) + + req := newTestRequest(context.Background(), defaultFlowKey) + outcome, err := h.fc.EnqueueAndWait(req) + require.Error(t, err, "EnqueueAndWait must reject requests if no shards are available") + assert.ErrorIs(t, err, types.ErrRejected, "error should wrap ErrRejected") + assert.Equal(t, types.QueueOutcomeRejectedCapacity, outcome, "outcome should be QueueOutcomeRejectedCapacity") + }) + + t.Run("OnRegistryConnectionError", func(t *testing.T) { + t.Parallel() + h := newUnitHarness(t, Config{}) + h.start(t) + + expectedErr := errors.New("connection failed") + h.mockRegistry.WithConnectionFunc = func( + _ types.FlowKey, + _ func(conn contracts.ActiveFlowConnection) error, + ) error { + return expectedErr + } + + req := newTestRequest(context.Background(), defaultFlowKey) + outcome, err := h.fc.EnqueueAndWait(req) + require.Error(t, err, "EnqueueAndWait must reject requests if registry connection fails") + assert.ErrorIs(t, err, types.ErrRejected, "error should wrap ErrRejected") + assert.ErrorIs(t, err, expectedErr, "error should wrap the underlying connection error") + assert.Equal(t, types.QueueOutcomeRejectedOther, outcome, "outcome should be QueueOutcomeRejectedOther") + }) + + t.Run("PanicsOnManagedQueueError", func(t *testing.T) { + t.Parallel() + h := newUnitHarness(t, Config{}) + h.start(t) + + faultyShard := &mocks.MockRegistryShard{ + IDFunc: func() string { return "faulty-shard" }, + ManagedQueueFunc: func(_ types.FlowKey) (contracts.ManagedQueue, error) { + return nil, errors.New("queue retrieval failed") + }, + } + h.mockRegistry.WithConnectionFunc = func( + _ types.FlowKey, + fn func(conn contracts.ActiveFlowConnection) error, + ) error { + return fn(&mockActiveFlowConnection{ActiveShardsV: []contracts.RegistryShard{faultyShard}}) + } + + req := newTestRequest(context.Background(), defaultFlowKey) + assert.Panics(t, func() { + _, _ = h.fc.EnqueueAndWait(req) + }, "EnqueueAndWait did not panic as expected on a ManagedQueue error") + }) + }) + + t.Run("Distribution", func(t *testing.T) { + t.Parallel() + + testCases := []struct { + name string + shards []contracts.RegistryShard + setupProcessors func(t *testing.T, h *testHarness) + expectedOutcome types.QueueOutcome + expectErr bool + }{ + { + name: "SubmitSucceeds_NonBlocking_WithSingleActiveShard", + shards: []contracts.RegistryShard{newMockShard("shard-A").build()}, + setupProcessors: func(t *testing.T, h *testHarness) { + h.mockProcessorFactory.processors["shard-A"] = &mockShardProcessor{ + SubmitFunc: func(item *internal.FlowItem) error { + go item.Finalize(types.QueueOutcomeDispatched, nil) + return nil + }, + } + }, + expectedOutcome: types.QueueOutcomeDispatched, + }, + { + name: "DistributesToLeastLoadedShard_WithMultipleActiveShards", + shards: []contracts.RegistryShard{ + newMockShard("shard-A").withByteSize(1000).build(), + newMockShard("shard-B").withByteSize(100).build(), + }, + setupProcessors: func(t 
*testing.T, h *testHarness) { + h.mockProcessorFactory.processors["shard-A"] = &mockShardProcessor{ + SubmitFunc: func(_ *internal.FlowItem) error { + t.Error("Submit was called on the more loaded shard (shard-A), which is incorrect") + return internal.ErrProcessorBusy + }, + } + h.mockProcessorFactory.processors["shard-B"] = &mockShardProcessor{ + SubmitFunc: func(item *internal.FlowItem) error { + go item.Finalize(types.QueueOutcomeDispatched, nil) + return nil + }, + } + }, + expectedOutcome: types.QueueOutcomeDispatched, + }, + { + name: "SubmitSucceeds_AfterBlocking_WithAllProcessorsBusy", + shards: []contracts.RegistryShard{ + newMockShard("shard-A").withByteSize(1000).build(), + newMockShard("shard-B").withByteSize(100).build(), + }, + setupProcessors: func(t *testing.T, h *testHarness) { + h.mockProcessorFactory.processors["shard-A"] = &mockShardProcessor{ + SubmitFunc: func(_ *internal.FlowItem) error { return internal.ErrProcessorBusy }, + } + h.mockProcessorFactory.processors["shard-B"] = &mockShardProcessor{ + SubmitFunc: func(_ *internal.FlowItem) error { return internal.ErrProcessorBusy }, + SubmitOrBlockFunc: func(_ context.Context, item *internal.FlowItem) error { + go item.Finalize(types.QueueOutcomeDispatched, nil) + return nil + }, + } + }, + expectedOutcome: types.QueueOutcomeDispatched, + }, + { + name: "Rejects_AfterBlocking_WithAllProcessorsRemainingBusy", + shards: []contracts.RegistryShard{newMockShard("shard-A").build()}, + setupProcessors: func(t *testing.T, h *testHarness) { + h.mockProcessorFactory.processors["shard-A"] = &mockShardProcessor{ + SubmitFunc: func(_ *internal.FlowItem) error { return internal.ErrProcessorBusy }, + SubmitOrBlockFunc: func(_ context.Context, _ *internal.FlowItem) error { return context.DeadlineExceeded }, + } + }, + expectedOutcome: types.QueueOutcomeRejectedCapacity, + expectErr: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + h := newUnitHarness(t, Config{}) + h.start(t) + + h.mockRegistry.WithConnectionFunc = func( + _ types.FlowKey, + fn func(conn contracts.ActiveFlowConnection) error, + ) error { + return fn(&mockActiveFlowConnection{ActiveShardsV: tc.shards}) + } + tc.setupProcessors(t, h) + outcome, err := h.fc.EnqueueAndWait(newTestRequest(context.Background(), defaultFlowKey)) + if tc.expectErr { + require.Error(t, err, "expected an error but got nil") + } else { + require.NoError(t, err, "expected no error but got: %v", err) + } + assert.Equal(t, tc.expectedOutcome, outcome, "outcome did not match expected value") + }) + } + }) + + t.Run("Retry", func(t *testing.T) { + t.Parallel() + + t.Run("Rejects_OnRequestContextCancelledWhileBlocking", func(t *testing.T) { + t.Parallel() + h := newUnitHarness(t, Config{}) + h.start(t) + + h.mockProcessorFactory.processors["shard-A"] = &mockShardProcessor{ + SubmitFunc: func(_ *internal.FlowItem) error { return internal.ErrProcessorBusy }, + SubmitOrBlockFunc: func(ctx context.Context, _ *internal.FlowItem) error { <-ctx.Done(); return ctx.Err() }, + } + h.mockRegistry.WithConnectionFunc = func( + _ types.FlowKey, + fn func(conn contracts.ActiveFlowConnection, + ) error) error { + return fn(&mockActiveFlowConnection{ActiveShardsV: []contracts.RegistryShard{newMockShard("shard-A").build()}}) + } + reqCtx, cancelReq := context.WithCancel(context.Background()) + go func() { time.Sleep(50 * time.Millisecond); cancelReq() }() + + outcome, err := h.fc.EnqueueAndWait(newTestRequest(reqCtx, defaultFlowKey)) + + require.Error(t, err, 
"EnqueueAndWait must fail when context is cancelled during a blocking submit") + assert.ErrorIs(t, err, types.ErrRejected, "error should wrap ErrRejected") + assert.ErrorIs(t, err, context.Canceled, "error should wrap the underlying ctx.Err()") + assert.Equal(t, types.QueueOutcomeRejectedCapacity, outcome, "outcome should be QueueOutcomeRejectedCapacity") + }) + + t.Run("RetriesAndSucceeds_OnProcessorReportsShardDraining", func(t *testing.T) { + t.Parallel() + h := newUnitHarness(t, Config{}) + h.start(t) + + h.mockProcessorFactory.processors["shard-A"] = &mockShardProcessor{ + SubmitFunc: func(item *internal.FlowItem) error { + go item.Finalize(types.QueueOutcomeRejectedOther, contracts.ErrShardDraining) + return nil + }, + } + h.mockProcessorFactory.processors["shard-B"] = &mockShardProcessor{ + SubmitFunc: func(item *internal.FlowItem) error { + go item.Finalize(types.QueueOutcomeDispatched, nil) + return nil + }, + } + + var callCount atomic.Int32 + h.mockRegistry.WithConnectionFunc = func( + _ types.FlowKey, + fn func(conn contracts.ActiveFlowConnection) error, + ) error { + attempt := callCount.Add(1) + shardA := newMockShard("shard-A").withByteSize(100).build() + shardB := newMockShard("shard-B").withByteSize(1000).build() + if attempt == 1 { + return fn(&mockActiveFlowConnection{ActiveShardsV: []contracts.RegistryShard{shardA, shardB}}) + } + return fn(&mockActiveFlowConnection{ActiveShardsV: []contracts.RegistryShard{shardB}}) + } + + outcome, err := h.fc.EnqueueAndWait(newTestRequest(context.Background(), defaultFlowKey)) + require.NoError(t, err, "EnqueueAndWait must succeed after retrying on a healthy shard") + assert.Equal(t, types.QueueOutcomeDispatched, outcome, "outcome should be QueueOutcomeDispatched") + assert.Equal(t, int32(2), callCount.Load(), "registry must be consulted for Active shards on each retry attempt") + }) + }) +} + +func TestFlowController_Run(t *testing.T) { + t.Parallel() + + t.Run("Idempotency", func(t *testing.T) { + t.Parallel() + h := newUnitHarness(t, Config{}) + + // Start the controller in the background. + ctx := t.Context() + go h.fc.Run(ctx) + + // Wait for the first `Run` call to complete its startup sequence. + select { + case <-h.fc.ready: + // Success + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for FlowController to become ready") + } + + // The second call to Run should return immediately without blocking or panicking. + // We wrap it in a channel to add a timeout, ensuring it doesn't block. + done := make(chan struct{}) + go func() { + h.fc.Run(ctx) + close(done) + }() + + select { + case <-done: + // Success + case <-time.After(100 * time.Millisecond): + t.Error("second call to Run() blocked, indicating it is not idempotent") + } + }) + + t.Run("Reconciliation", func(t *testing.T) { + t.Parallel() + h := newUnitHarness(t, Config{}) + + // Manually simulate the controller starting up so we can test the reconciliation logic in isolation. + parentCtx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + h.fc.parentCtx = parentCtx + close(h.fc.ready) + + // Pre-populate the controller with initial workers. 
+ initialShards := []string{"shard-A", "stale-shard"} + for _, shardID := range initialShards { + currentShardID := shardID + h.mockProcessorFactory.processors[currentShardID] = &mockShardProcessor{runStarted: make(chan struct{})} + shard := &mocks.MockRegistryShard{IDFunc: func() string { return currentShardID }} + h.fc.getOrStartWorker(shard) + } + require.Len(t, h.mockProcessorFactory.processors, 2, "pre-condition: initial workers not set up correctly") + + // Wait for all workers to have started and set their contexts before proceeding with the test. + for _, p := range h.mockProcessorFactory.processors { + proc := p + select { + case <-proc.runStarted: + // Success + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for worker to start") + } + } + + // Configure the mock registry to report the new state without the stale shard. + h.mockRegistry.ShardStatsFunc = func() []contracts.ShardStats { + return []contracts.ShardStats{{ID: "shard-A"}} + } + + // Manually trigger the reconciliation loop logic. + h.fc.reconcileProcessors() + + t.Run("StaleWorkerIsCancelled", func(t *testing.T) { + staleProc := h.mockProcessorFactory.processors["stale-shard"] + require.NotNil(t, staleProc.Context(), "precondition: stale processor context should have been captured") + select { + case <-staleProc.Context().Done(): + // Success + case <-time.After(100 * time.Millisecond): + t.Error("context of removed worker must be cancelled") + } + }) + + t.Run("ActiveWorkerIsNotCancelled", func(t *testing.T) { + activeProc := h.mockProcessorFactory.processors["shard-A"] + require.NotNil(t, activeProc.Context(), "precondition: active processor context should have been captured") + select { + case <-activeProc.Context().Done(): + t.Error("context of remaining worker must not be cancelled") + default: + // Success + } + }) + + t.Run("WorkerMapIsUpdated", func(t *testing.T) { + _, ok := h.fc.workers.Load("stale-shard") + assert.False(t, ok, "stale worker must be removed from the controller's map") + _, ok = h.fc.workers.Load("shard-A") + assert.True(t, ok, "active worker must remain in the controller's map") + }) + }) + + t.Run("Reconciliation_IsTriggeredByTicker", func(t *testing.T) { + t.Parallel() + reconciliationInterval := 10 * time.Second + h := newUnitHarness(t, Config{ + ProcessorReconciliationInterval: reconciliationInterval, + }) + + var reconcileCount atomic.Int32 + h.mockRegistry.ShardStatsFunc = func() []contracts.ShardStats { + reconcileCount.Add(1) + return nil + } + + h.start(t) // This will perform the initial reconciliation. + + // Assert that the initial reconciliation occurred on startup. + require.Equal(t, int32(1), reconcileCount.Load(), "initial reconciliation did not occur on start") + + // Advance the clock to trigger the next reconciliation. + h.mockClock.Step(reconciliationInterval) + + assert.Eventually(t, func() bool { + return reconcileCount.Load() == 2 + }, time.Second, 10*time.Millisecond, "reconciliation was not triggered by the ticker") + + // Advance the clock again to ensure it continues to fire. + h.mockClock.Step(reconciliationInterval) + assert.Eventually(t, func() bool { + return reconcileCount.Load() == 3 + }, time.Second, 10*time.Millisecond, "reconciliation did not fire on the second tick") + }) + + t.Run("WorkerCreationRace", func(t *testing.T) { + t.Parallel() + h := newUnitHarness(t, Config{}) + + // This test requires manual control over the shard processor factory to deterministically create a race. 
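+		// Both racing goroutines are paused inside the factory, so each constructs its own processor before
+		// `LoadOrStore` picks a winner.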
+ factoryEntered := make(chan *mockShardProcessor, 2) + continueFactory := make(chan struct{}) + h.fc.shardProcessorFactory = func( + shard contracts.RegistryShard, + _ internal.BandFilter, + _ clock.Clock, + _ time.Duration, + _ int, + _ logr.Logger, + ) shardProcessor { + // This factory function will be called by `startNewWorker`. + // We use channels to pause execution here, allowing two goroutines to enter this function before one "wins" + // the `LoadOrStore` race. + proc := &mockShardProcessor{runStarted: make(chan struct{})} + factoryEntered <- proc + <-continueFactory + return proc + } + + h.start(t) + shard := newMockShard("race-shard").build() + var wg sync.WaitGroup + wg.Add(2) + + // Start two goroutines that will race to create the same worker. + go func() { + defer wg.Done() + h.fc.getOrStartWorker(shard) + }() + go func() { + defer wg.Done() + h.fc.getOrStartWorker(shard) + }() + + // Wait for both goroutines to have entered the factory and created a processor. + // This confirms they both missed the initial `workers.Load` check. + proc1 := <-factoryEntered + proc2 := <-factoryEntered + + // Unblock both goroutines, allowing them to race to `workers.LoadOrStore`. + close(continueFactory) + wg.Wait() + + // Wait for the `Run` method to be called on both processors to ensure their contexts are non-nil before asserting. + <-proc1.runStarted + <-proc2.runStarted + + // One processor "won" and was stored, the other "lost" and should have been cancelled. + actual, ok := h.fc.workers.Load("race-shard") + require.True(t, ok, "a worker should have been stored in the map") + + storedWorker := actual.(*managedWorker) + winnerProc := storedWorker.processor.(*mockShardProcessor) + + var loserProc *mockShardProcessor + if winnerProc == proc1 { + loserProc = proc2 + } else { + loserProc = proc1 + } + + // The losing processor's context must be cancelled to prevent a goroutine leak. + select { + case <-loserProc.Context().Done(): + assert.ErrorIs(t, loserProc.Context().Err(), context.Canceled, "loser's context should be canceled") + case <-time.After(100 * time.Millisecond): + t.Error("context of the losing worker was not cancelled") + } + + // The winning processor's context must remain active. + select { + case <-winnerProc.Context().Done(): + t.Error("context of the winning worker should not be cancelled") + default: + // Success + } + }) +} + +func TestFlowController_Concurrency(t *testing.T) { + const ( + numShards = 4 + numGoroutines = 50 + numRequests = 200 + ) + h := newIntegrationHarness(t, Config{ + // Use a generous buffer to prevent flakes in the test due to transient queuing delays. + EnqueueChannelBufferSize: numRequests, + DefaultRequestTTL: 1 * time.Second, + }) + + // Manually manage the controller's lifecycle to prevent a race with `t.Cleanup`. + controllerCtx := t.Context() + go h.fc.Run(controllerCtx) + select { + case <-h.fc.ready: + // Success + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for FlowController to become ready") + } + + // Set up a realistic registry that vends real components to the processor. + shards := make([]contracts.RegistryShard, numShards) + queues := make(map[string]contracts.ManagedQueue) + for i := range numShards { + shardID := fmt.Sprintf("shard-%d", i) + queues[shardID] = &mocks.MockManagedQueue{FlowKeyV: defaultFlowKey} // Use the high-fidelity mock queue. 
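+		// The shard mock below wires this queue into the accessors and policies a real `ShardProcessor` needs to
+		// dispatch.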
+ shards[i] = &mocks.MockRegistryShard{ + IDFunc: func() string { return shardID }, + ManagedQueueFunc: func(_ types.FlowKey) (contracts.ManagedQueue, error) { + return queues[shardID], nil + }, + AllOrderedPriorityLevelsFunc: func() []uint { return []uint{100} }, + PriorityBandAccessorFunc: func(priority uint) (framework.PriorityBandAccessor, error) { + if priority == 100 { + return &frameworkmocks.MockPriorityBandAccessor{ + PriorityNameV: "high", + PriorityV: 100, + IterateQueuesFunc: func(f func(framework.FlowQueueAccessor) bool) { + f(queues[shardID].FlowQueueAccessor()) + }, + }, nil + } + return nil, fmt.Errorf("unexpected priority %d", priority) + }, + IntraFlowDispatchPolicyFunc: func(_ types.FlowKey) (framework.IntraFlowDispatchPolicy, error) { + return &frameworkmocks.MockIntraFlowDispatchPolicy{ + SelectItemFunc: func(qa framework.FlowQueueAccessor) (types.QueueItemAccessor, error) { + return qa.PeekHead() + }, + }, nil + }, + InterFlowDispatchPolicyFunc: func(_ uint) (framework.InterFlowDispatchPolicy, error) { + return &frameworkmocks.MockInterFlowDispatchPolicy{ + SelectQueueFunc: func(band framework.PriorityBandAccessor) (framework.FlowQueueAccessor, error) { + return queues[shardID].FlowQueueAccessor(), nil + }, + }, nil + }, + StatsFunc: func() contracts.ShardStats { + return contracts.ShardStats{ + ID: shardID, + TotalLen: uint64(queues[shardID].Len()), + TotalByteSize: queues[shardID].ByteSize(), + PerPriorityBandStats: map[uint]contracts.PriorityBandStats{ + 100: { + Len: uint64(queues[shardID].Len()), + ByteSize: queues[shardID].ByteSize(), + CapacityBytes: 1e9, // Effectively unlimited capacity + }, + }, + } + }, + } + } + h.mockRegistry.WithConnectionFunc = func(_ types.FlowKey, fn func(conn contracts.ActiveFlowConnection) error) error { + return fn(&mockActiveFlowConnection{ActiveShardsV: shards}) + } + + reqCtx := logr.NewContext(context.Background(), logr.Discard()) + + var wg sync.WaitGroup + wg.Add(numGoroutines) + outcomes := make(chan types.QueueOutcome, numRequests) + for range numGoroutines { + go func() { + defer wg.Done() + for range numRequests / numGoroutines { + req := newTestRequest(reqCtx, defaultFlowKey) + outcome, err := h.fc.EnqueueAndWait(req) + if err != nil { + // Use `t.Errorf` for concurrent tests to avoid halting execution on a single failure. + t.Errorf("EnqueueAndWait failed unexpectedly: %v", err) + } + outcomes <- outcome + } + }() + } + wg.Wait() + close(outcomes) + + successCount := 0 + for outcome := range outcomes { + if outcome == types.QueueOutcomeDispatched { + successCount++ + } + } + require.Equal(t, numRequests, successCount, "all concurrent requests should be dispatched successfully") +} diff --git a/pkg/epp/flowcontrol/controller/doc.go b/pkg/epp/flowcontrol/controller/doc.go index 8c96bbc18..4f2620c5a 100644 --- a/pkg/epp/flowcontrol/controller/doc.go +++ b/pkg/epp/flowcontrol/controller/doc.go @@ -18,105 +18,19 @@ limitations under the License. // // # Overview // -// The `FlowController` is the central processing engine of the flow control system. It is a sharded, high-throughput -// component responsible for managing the lifecycle of all incoming requests—from initial submission via the synchronous -// `EnqueueAndWait` method to a terminal outcome (dispatch, rejection, or eviction). It achieves this by orchestrating -// its dependencies—the `contracts.FlowRegistry`, the pluggable `Policy` framework, and the -// `contracts.SaturationDetector`—to make continuous, state-aware decisions. 
+// The `FlowController` is the central processing engine of the Flow Control system. It acts as a stateless supervisor +// that orchestrates a pool of stateful workers (`internal.ShardProcessor`), managing the lifecycle of all incoming +// requests from initial submission to a terminal outcome (dispatch, rejection, or eviction). // -// # Architecture: The Processor-Shard Relationship +// # Architecture: Supervisor-Worker Pattern // -// The `FlowController` engine is designed around a clear separation of state and execution. This "control plane vs. -// data plane" separation is key to enabling dynamic, concurrent-safe configuration updates. +// This package implements a classic "supervisor-worker" pattern to achieve high throughput and dynamic scalability. // -// - The `contracts.FlowRegistry` is the **control plane**. It is the single source of truth for all configuration. -// When an administrative action occurs (e.g., `RegisterOrUpdateFlow`), the `contracts.FlowRegistry` is responsible -// for safely applying that change to each of its managed `contracts.RegistryShard` instances. -// -// - The `contracts.RegistryShard` is the **concurrent-safe state port**. It defines the contract for a state store -// that holds the `contracts.ManagedQueue` and framework `Policy` instances for a single shard. -// -// - The `internal.ShardProcessor` is the **data plane worker**. It is given a single `contracts.RegistryShard` to -// operate on. Its main `dispatchCycle` continuously acquires a read lock on the shard to get a consistent view of -// the active queues and policies, and then executes its dispatch logic. -// -// This separation is what enables dynamic updates. The `internal.ShardProcessor` is stateless; it simply executes -// against the state presented by its `contracts.RegistryShard` on each cycle. This allows the control plane -// (`contracts.FlowRegistry`) to safely change that state in the background. -// -// # Architectural Deep Dive: The `EnqueueAndWait` Model -// -// A fundamental design choice is the synchronous, blocking `EnqueueAndWait` method. In the context of the Gateway API -// Inference Extension's Endpoint Picker (EPP), which operates as an Envoy External Processing (`ext_proc`) server, this -// model is deliberately chosen for its simplicity and robustness. -// -// - Alignment with `ext_proc`: The `ext_proc` protocol is stream-based. A single goroutine within the EPP manages the -// stream for a given HTTP request. `EnqueueAndWait` fits this perfectly: the request-handling goroutine calls it, -// blocks, and upon return, has the definitive outcome. It can then immediately act on that outcome, maintaining -// clear request-goroutine affinity. -// -// - Simplified State Management: The state of a "waiting" request is implicitly managed by the blocked goroutine's -// stack and its `context.Context`. The `FlowController` only needs to signal this specific goroutine to unblock it. -// -// - Direct Backpressure: If queues are full, `EnqueueAndWait` returns `types.ErrQueueAtCapacity`. This provides -// immediate, direct backpressure to the earliest point of contact. -// -// # Architectural Deep Dive: The Sharded Model & JSQ-Bytes -// -// The `FlowController` is built on a sharded architecture to enable parallel processing and prevent a central dispatch -// loop from becoming a bottleneck. The `FlowController` consists of a top-level manager and a pool of independent -// `internal.ShardProcessor` workers. 
The `contracts.FlowRegistry` guarantees that every logical flow is represented by -// a distinct queue instance on every active shard. -// -// This architecture trades deterministic global state for high throughput and scalability. The key challenge, and the -// system's most critical assumption, revolves around ensuring this distributed model can still achieve global fairness -// objectives. -// -// ## The Critical Assumption: Homogeneity Within Flows -// -// The effectiveness of the sharded model hinges on a critical assumption: while the system as a whole manages a -// heterogeneous set of flows, the traffic *within a single logical flow* is assumed to be roughly homogeneous in its -// characteristics. A logical flow is intended to represent a single workload or tenant; therefore, the most -// unpredictable variables (effecting decode behavior) are expected to be statistically similar *within* that flow. -// -// ## The Hedge: Join the Shortest Queue by Bytes (JSQ-Bytes) -// -// To make this assumption as robust as possible, the `FlowController` uses a "Join the Shortest Queue by Bytes -// (JSQ-Bytes)" algorithm. `ByteSize` is an excellent proxy for the resources the `FlowController` explicitly manages -// (host memory pressure and queuing capacity) and is also a reasonable proxy for prefill compute time. -// -// Crucially, the goal of the distributor is not to perfectly predict backend compute time, but to intelligently balance -// the load at the controller level. JSQ-Bytes achieves this by: -// -// 1. Reflecting True Load: It distributes work based on each shard's current queue size in bytes—a direct measure of -// its memory and capacity congestion. -// -// 2. Adapting to Congestion: The byte-size of a queue is a real-time signal of a shard's overall congestion. If a -// shard is slow (e.g., due to long-decoding downstream requests), its queues will remain full, and JSQ-Bytes will -// adaptively steer new work away. -// -// 3. Hedging Against Assumption Violations: This adaptive, self-correcting nature makes it a powerful hedge. It -// doesn't just distribute; it actively *load balances* based on the most relevant feedback available. -// -// # Architectural Deep Dive: Pre-Policy Gating -// -// Before policies are invoked, the `internal.ShardProcessor` applies an `internal.BandFilter`. This function determines -// which flows within a priority band are eligible for a given operation (e.g., dispatch). This pattern is a deliberate -// architectural choice to decouple the logic of *viability* from the logic of *selection*. -// -// - An `internal.BandFilter` (e.g., the `internal.NewSaturationFilter`) determines if a flow is viable based on -// external signals like backend load. -// - The `framework.InterFlowDispatchPolicy` then selects from among the viable candidates based on its own fairness -// logic. -// -// This abstraction provides two major benefits: -// -// 1. Low Contributor Burden: It makes the mental model for policy contributors significantly simpler. An author of a -// new fairness policy does not need to be concerned with the complexities of saturation detection or other gating -// concerns. They are given a simple, pre-filtered view of the world and can focus solely on their selection logic. -// -// 2. Correctness by Construction: The `internal.subsetPriorityBandAccessor` wrapper guarantees that a policy operates -// on a consistent, filtered view, regardless of which accessor method it calls (`FlowIDs`, `Queue`, etc.). 
This -// prevents an entire class of subtle bugs where a policy might otherwise see a stale or unfiltered view of the -// system state. +// - The `FlowController` (Supervisor): The public-facing API of the system. Its primary responsibilities are to +// execute a distribution algorithm to select the optimal worker for a new request and to manage the lifecycle of +// the worker pool, ensuring it stays synchronized with the underlying shard topology defined by the +// `contracts.FlowRegistry`. +// - The `internal.ShardProcessor` (Worker): A stateful, single-goroutine actor responsible for the entire lifecycle +// of requests on a single shard. The supervisor manages a pool of these workers, one for each +// `contracts.RegistryShard`. package controller From 196e5eb4280f3ab41e753b831fd210eb27700292 Mon Sep 17 00:00:00 2001 From: Luke Van Drie Date: Thu, 4 Sep 2025 01:10:04 +0000 Subject: [PATCH 4/5] docs: Update comments to align with FlowController This commit updates documentation and code comments across various framework components to align with the concepts and architecture introduced by the `FlowController`. Key changes include: - FCFS Policy: Clarified the distinction between "logical" and "physical" enqueue time and the behavioral trade-offs when pairing with different queue capabilities. - ListQueue: Expanded the documentation to explain its role as a high-performance, approximate FCFS queue in the context of the `FlowController`'s retry mechanics. - Request Types: Refined the comments for `QueueItemAccessor` to be more precise about the meaning of `EnqueueTime`. --- .../policies/intraflow/dispatch/fcfs/fcfs.go | 35 +++++++++++++++++-- .../intraflow/dispatch/fcfs/fcfs_test.go | 3 +- .../plugins/queue/listqueue/listqueue.go | 31 +++++++++++++--- pkg/epp/flowcontrol/registry/config_test.go | 22 ++++++++---- pkg/epp/flowcontrol/types/request.go | 3 +- 5 files changed, 77 insertions(+), 17 deletions(-) diff --git a/pkg/epp/flowcontrol/framework/plugins/policies/intraflow/dispatch/fcfs/fcfs.go b/pkg/epp/flowcontrol/framework/plugins/policies/intraflow/dispatch/fcfs/fcfs.go index edcc02ac6..7addb9d13 100644 --- a/pkg/epp/flowcontrol/framework/plugins/policies/intraflow/dispatch/fcfs/fcfs.go +++ b/pkg/epp/flowcontrol/framework/plugins/policies/intraflow/dispatch/fcfs/fcfs.go @@ -26,6 +26,32 @@ import ( ) // FCFSPolicyName is the name of the FCFS policy implementation. +// +// This policy implements a First-Come, First-Served (FCFS) strategy by selecting the item with the earliest logical +// enqueue time. +// +// # Behavior and Queue Pairing +// +// The behavioral guarantees of this policy are critically dependent on the capabilities of the `framework.SafeQueue` it +// is paired with. The system distinguishes between: +// - "Logical Enqueue Time": The timestamp when a request first arrives at the `controller.FlowController`. +// - "Physical Enqueue Time": The timestamp when a request is added to a specific shard's queue, which happens later. +// +// This policy's behavior changes accordingly: +// - Paired with a `CapabilityPriorityConfigurable` queue, it provides strict FCFS ordering based on logical enqueue +// time, aligning with this policy's vended `framework.ItemComparator`. +// This configuration ensures that requests are processed in the order they arrived at the controller, providing the +// most intuitive behavior. +// - Paired with a `CapabilityFIFO` queue, it provides approximate FCFS ordering based on physical arrival order at +// the `framework.SafeQueue`. 
+// This configuration offers higher performance at the cost of strict logical-time ordering, as the
+// `controller.FlowController`'s "bounce-and-retry" mechanic for Draining shards means a bounced request may be
+// processed after a request that logically arrived later.
+//
+// Given that true end-to-end ordering is non-deterministic in a distributed system, this policy defaults to pairing
+// with a `CapabilityFIFO` queue (like "ListQueue") to prioritize performance and high throughput. For users who require
+// the strictest possible logical-time ordering that this layer can provide, explicitly pairing this policy with a
+// `CapabilityPriorityConfigurable` queue is recommended.
 const FCFSPolicyName = "FCFS"
 
 func init() {
@@ -35,7 +61,9 @@ func init() {
 	})
 }
 
-// fcfs (First-Come, First-Served) implements the `framework.IntraFlowDispatchPolicy` interface.
+// fcfs is the internal implementation of the FCFS policy.
+// See the documentation for the exported `FCFSPolicyName` constant for detailed user-facing information about its
+// behavior.
 type fcfs struct {
 	comparator framework.ItemComparator
 }
@@ -70,9 +98,10 @@ func (p *fcfs) Comparator() framework.ItemComparator {
 	return p.comparator
 }
 
-// RequiredQueueCapabilities specifies that this policy needs a queue that supports FIFO operations.
+// RequiredQueueCapabilities returns an empty slice, indicating that this policy can operate with any queue.
+// See the `FCFSPolicyName` constant's documentation for details on the behavioral trade-offs.
 func (p *fcfs) RequiredQueueCapabilities() []framework.QueueCapability {
-	return []framework.QueueCapability{framework.CapabilityFIFO}
+	return []framework.QueueCapability{}
 }
 
 // --- enqueueTimeComparator ---
diff --git a/pkg/epp/flowcontrol/framework/plugins/policies/intraflow/dispatch/fcfs/fcfs_test.go b/pkg/epp/flowcontrol/framework/plugins/policies/intraflow/dispatch/fcfs/fcfs_test.go
index 45c144238..cc6bceecf 100644
--- a/pkg/epp/flowcontrol/framework/plugins/policies/intraflow/dispatch/fcfs/fcfs_test.go
+++ b/pkg/epp/flowcontrol/framework/plugins/policies/intraflow/dispatch/fcfs/fcfs_test.go
@@ -41,8 +41,7 @@ func TestFCFS_RequiredQueueCapabilities(t *testing.T) {
 	t.Parallel()
 	policy := newFCFS()
 	caps := policy.RequiredQueueCapabilities()
-	require.Len(t, caps, 1, "RequiredQueueCapabilities should return one capability")
-	assert.Equal(t, framework.CapabilityFIFO, caps[0], "Required capability should be FIFO")
+	require.Empty(t, caps, "No required capabilities should be returned")
 }
 
 func TestFCFS_SelectItem(t *testing.T) {
diff --git a/pkg/epp/flowcontrol/framework/plugins/queue/listqueue/listqueue.go b/pkg/epp/flowcontrol/framework/plugins/queue/listqueue/listqueue.go
index 8e123b631..792e3a46d 100644
--- a/pkg/epp/flowcontrol/framework/plugins/queue/listqueue/listqueue.go
+++ b/pkg/epp/flowcontrol/framework/plugins/queue/listqueue/listqueue.go
@@ -14,8 +14,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-// Package listqueue provides a simple, concurrent-safe queue implementation using a standard library
-// `container/list.List` as the underlying data structure for FIFO (First-In, First-Out) behavior.
+// Package listqueue provides a high-performance, concurrent-safe FIFO (First-In, First-Out) implementation of
+// the `framework.SafeQueue` based on the standard library's `container/list`.
 package listqueue
 
 import (
@@ -28,7 +28,28 @@ import (
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types"
 )
 
-// ListQueueName is the name of the list queue implementation.
+// ListQueueName is the name of the list-based queue implementation.
+//
+// This queue provides a high-performance, low-overhead implementation based on a standard `container/list`.
+// It advertises `CapabilityFIFO`.
+//
+// # Behavioral Guarantees
+//
+// The core guarantee of this queue is strict physical First-In, First-Out (FIFO) ordering. It processes items in the
+// exact order they are added to the queue on a specific shard.
+//
+// # Performance and Trade-offs
+//
+// Because the physical insertion order may not match a request's logical arrival time (due to the
+// `controller.FlowController`'s internal "bounce-and-retry" mechanic), this queue provides an approximate FCFS behavior
+// from a system-wide perspective.
+//
+// Given that true end-to-end ordering is non-deterministic in a distributed system, this high-performance queue is the
+// recommended default for most FCFS-like policies. It prioritizes throughput and efficiency, which aligns with the
+// primary goal of the Flow Control system.
+//
+// For workloads that require the strictest possible logical-time ordering that this layer can provide, explicitly
+// using a queue that supports `CapabilityPriorityConfigurable` is the appropriate choice.
 const ListQueueName = "ListQueue"
 
 func init() {
@@ -39,8 +60,8 @@ func init() {
 	})
 }
 
-// listQueue implements the `framework.SafeQueue` interface using a standard `container/list.List` for FIFO behavior.
-// This implementation is concurrent-safe.
+// listQueue is the internal implementation of the ListQueue.
+// See the documentation for the exported `ListQueueName` constant for detailed user-facing information.
 type listQueue struct {
 	requests *list.List
 	byteSize atomic.Uint64
diff --git a/pkg/epp/flowcontrol/registry/config_test.go b/pkg/epp/flowcontrol/registry/config_test.go
index 376282ffe..ea9c19b7d 100644
--- a/pkg/epp/flowcontrol/registry/config_test.go
+++ b/pkg/epp/flowcontrol/registry/config_test.go
@@ -211,16 +211,26 @@ func TestConfig_NewConfig(t *testing.T) {
 			name: "ShouldError_WhenQueueFactoryFails",
 			input: Config{
 				PriorityBands: []PriorityBandConfig{{
-					Priority:     1,
-					PriorityName: "High",
-					Queue:        queue.RegisteredQueueName("failing-queue"),
+					Priority:                1,
+					PriorityName:            "High",
+					Queue:                   queue.RegisteredQueueName("failing-queue"),
+					IntraFlowDispatchPolicy: intra.RegisteredPolicyName("policy-with-req"),
 				}},
 			},
 			expectErr: true,
-			opts: []configOption{withQueueFactory(
-				func(_ queue.RegisteredQueueName, _ framework.ItemComparator) (framework.SafeQueue, error) {
+			opts: []configOption{
+				withIntraFlowDispatchPolicyFactory( // Forces queue instance creation for validating capabilities.
+					func(name intra.RegisteredPolicyName) (framework.IntraFlowDispatchPolicy, error) {
+						return &mocks.MockIntraFlowDispatchPolicy{
+							NameV:                      string(name),
+							RequiredQueueCapabilitiesV: []framework.QueueCapability{"required-capability"},
+						}, nil
+					},
+				),
+				withQueueFactory(func(_ queue.RegisteredQueueName, _ framework.ItemComparator) (framework.SafeQueue, error) {
 					return nil, errors.New("queue creation failed")
-				})},
+				}),
+			},
 		},
 		{
 			name: "ShouldError_WhenPolicyFactoryFails",
diff --git a/pkg/epp/flowcontrol/types/request.go b/pkg/epp/flowcontrol/types/request.go
index 756940704..5dfc18eb6 100644
--- a/pkg/epp/flowcontrol/types/request.go
+++ b/pkg/epp/flowcontrol/types/request.go
@@ -92,7 +92,8 @@ type QueueItemAccessor interface {
 	OriginalRequest() FlowControlRequest
 
 	// EnqueueTime is the timestamp when the item was logically accepted by the `controller.FlowController` for queuing
-	// (i.e., when `controller.FlowController.EnqueueAndWait()` was called).
+	// (i.e., when `controller.FlowController.EnqueueAndWait()` was called). It does not reflect the time the request
+	// landed in a `framework.SafeQueue` instance.
 	EnqueueTime() time.Time
 
 	// EffectiveTTL is the actual Time-To-Live assigned to this item by the `controller.FlowController`, taking into

From 16bc1299025c1a1e872507fab4b2cdd7e808ad16 Mon Sep 17 00:00:00 2001
From: Luke Van Drie
Date: Fri, 12 Sep 2025 22:14:59 +0000
Subject: [PATCH 5/5] refactor: Simplify controller lifecycle

This commit refactors the `FlowController` to simplify its startup and
shutdown lifecycle, making it more robust and easier to reason about.
It also incorporates several smaller improvements based on reviewer
feedback.

The primary change addresses a complex lifecycle implementation that
used an `atomic.Bool` (`isRunning`) and a `ready` channel to manage
state.

Key changes:

- **Simplified Lifecycle:** The controller's lifecycle is now tied
  directly to a `context` passed into `NewFlowController`. The `Run`
  method has been unexported, and the main `run` loop is started as a
  goroutine from the constructor. This eliminates the `ready` channel
  and `isRunning` flag in addition to simplifying the interface for
  callers.
- **Robust Worker Creation:** The `getOrStartWorker` logic has been
  improved to ensure that in a race to create a worker, the "losing"
  goroutine correctly cleans up its resources and does not start a
  redundant processor. This fixes a bug where the losing worker would
  evict all items from its queues on shutdown; because those queue
  instances were shared with the winning worker, this caused premature
  request finalization.
- **Comment Reduction:** The extensive explanatory comments in
  `distributeRequest` have been condensed to be more concise while
  retaining the essential details of the algorithm.
- **Minor Cleanups:**
  - The initial, unnecessary call to `reconcileProcessors()` at startup
    has been removed.
  - Error messages have been clarified (e.g., "acquire lease" instead of
    "establish connection").
  - A typed error for nil requests was replaced with a standard
    `errors.New`.
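
For illustration, a minimal caller sketch under the new lifecycle (the
`startFlowControl` helper is hypothetical and the zero-value `Config`,
which relies on defaulting, is an assumption for brevity; only the
`NewFlowController` signature reflects this change):

    // Sketch: the run loop starts inside the constructor and is bound
    // to ctx; cancelling ctx is the only shutdown signal required.
    func startFlowControl(
        ctx context.Context,
        registry contracts.FlowRegistry,
        detector contracts.SaturationDetector,
        logger logr.Logger,
    ) (*controller.FlowController, error) {
        // No Run() call: the controller and all shard processors stop
        // when ctx is cancelled.
        return controller.NewFlowController(ctx, controller.Config{}, registry, detector, logger)
    }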
--- pkg/epp/flowcontrol/controller/config_test.go | 2 +- pkg/epp/flowcontrol/controller/controller.go | 175 +++------- .../flowcontrol/controller/controller_test.go | 305 +++++++++--------- .../controller/internal/processor.go | 6 +- pkg/epp/flowcontrol/types/errors.go | 5 +- 5 files changed, 193 insertions(+), 300 deletions(-) diff --git a/pkg/epp/flowcontrol/controller/config_test.go b/pkg/epp/flowcontrol/controller/config_test.go index 6dd28ec8c..a89db25ff 100644 --- a/pkg/epp/flowcontrol/controller/config_test.go +++ b/pkg/epp/flowcontrol/controller/config_test.go @@ -9,7 +9,7 @@ You may obtain a copy of the License at Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, -WITHOUTHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ diff --git a/pkg/epp/flowcontrol/controller/controller.go b/pkg/epp/flowcontrol/controller/controller.go index 9d67b0e42..e25eda969 100644 --- a/pkg/epp/flowcontrol/controller/controller.go +++ b/pkg/epp/flowcontrol/controller/controller.go @@ -29,7 +29,6 @@ import ( "fmt" "slices" "sync" - "sync/atomic" "time" "github.com/go-logr/logr" @@ -103,44 +102,16 @@ type FlowController struct { // The key is the shard ID (`string`), and the value is a `*managedWorker`. workers sync.Map - // ready is closed by the Run method once initialization is complete, including setting the `parentCtx`. - // This acts as a memory barrier and synchronization point for all other concurrent methods. - ready chan struct{} - - isRunning atomic.Bool - wg sync.WaitGroup + wg sync.WaitGroup } // flowControllerOption is a function that applies a configuration change to a `FlowController`. // test-only type flowControllerOption func(*FlowController) -// withClock returns a test-only option to inject a clock. -// test-only -func withClock(c clock.WithTicker) flowControllerOption { - return func(fc *FlowController) { - fc.clock = c - } -} - -// withRegistryClient returns a test-only option to inject a mock or fake registry client. -// test-only -func withRegistryClient(client registryClient) flowControllerOption { - return func(fc *FlowController) { - fc.registry = client - } -} - -// withShardProcessorFactory returns a test-only option to inject a processor factory. -// test-only -func withShardProcessorFactory(factory shardProcessorFactory) flowControllerOption { - return func(fc *FlowController) { - fc.shardProcessorFactory = factory - } -} - // NewFlowController creates a new `FlowController` instance. func NewFlowController( + ctx context.Context, config Config, registry contracts.FlowRegistry, sd contracts.SaturationDetector, @@ -158,8 +129,7 @@ func NewFlowController( saturationDetector: sd, clock: clock.RealClock{}, logger: logger.WithName("flow-controller"), - parentCtx: context.Background(), // Will be set in `Run` - ready: make(chan struct{}), + parentCtx: ctx, } // Use the real shard processor implementation by default. @@ -183,29 +153,21 @@ func NewFlowController( for _, opt := range opts { opt(fc) } + + go fc.run(ctx) return fc, nil } -// Run starts the `FlowController`'s main reconciliation loop. +// run starts the `FlowController`'s main reconciliation loop. // This loop is responsible for garbage collecting workers whose shards no longer exist in the registry. 
// This method blocks until the provided context is cancelled and ALL worker goroutines have fully terminated. -func (fc *FlowController) Run(ctx context.Context) { - if !fc.isRunning.CompareAndSwap(false, true) { - fc.logger.Error(nil, "FlowController Run loop already started or controller is shut down") - return - } - - fc.parentCtx = ctx - close(fc.ready) - +func (fc *FlowController) run(ctx context.Context) { fc.logger.Info("Starting FlowController reconciliation loop.") defer fc.logger.Info("FlowController reconciliation loop stopped.") ticker := fc.clock.NewTicker(fc.config.ProcessorReconciliationInterval) defer ticker.Stop() - fc.reconcileProcessors() // Initial reconciliation - for { select { case <-ctx.Done(): @@ -234,7 +196,7 @@ func (fc *FlowController) Run(ctx context.Context) { // backpressure to the caller. func (fc *FlowController) EnqueueAndWait(req types.FlowControlRequest) (types.QueueOutcome, error) { if req == nil { - return types.QueueOutcomeRejectedOther, fmt.Errorf("%w: %w", types.ErrRejected, types.ErrNilRequest) + return types.QueueOutcomeRejectedOther, errors.New("request cannot be nil") } effectiveTTL := req.InitialEffectiveTTL() if effectiveTTL <= 0 { @@ -243,18 +205,23 @@ func (fc *FlowController) EnqueueAndWait(req types.FlowControlRequest) (types.Qu enqueueTime := fc.clock.Now() for { - if !fc.isRunning.Load() { + select { + case <-fc.parentCtx.Done(): return types.QueueOutcomeRejectedOther, fmt.Errorf("%w: %w", types.ErrRejected, types.ErrFlowControllerNotRunning) + default: + // The controller is running, proceed. } // We must create a fresh `FlowItem` on each attempt since finalization is idempotent. - // However, it we use the original, preserved `enqueueTime`. + // However, we use the original, preserved `enqueueTime`. item := internal.NewItem(req, effectiveTTL, enqueueTime) if outcome, err := fc.distributeRequest(item); err != nil { return outcome, fmt.Errorf("%w: %w", types.ErrRejected, err) } - finalState := <-item.Done() // finalization handles monitoring request context cancellation and TTL expiry + // Block until the request is finalized (dispatched, rejected, or evicted). + // The finalization logic internally monitors for context cancellation and TTL expiry. + finalState := <-item.Done() if errors.Is(finalState.Err, contracts.ErrShardDraining) { fc.logger.V(logutil.DEBUG).Info("Shard is draining, retrying request", "requestID", req.ID()) // Benign race with the chosen `contracts.RegistryShard` becoming Draining post selection but before the item was @@ -266,65 +233,16 @@ func (fc *FlowController) EnqueueAndWait(req types.FlowControlRequest) (types.Qu } } -// distributeRequest selects the optimal shard processor for a given item and attempts to submit it. -// -// # Architectural Deep Dive: Achieving Emergent Fairness with Data Parallelism -// -// To achieve high throughput and prevent a central bottleneck, the `FlowController` is built on a sharded, -// data-parallel architecture. It runs multiple `internal`ShardProcessor` workers, and every logical flow is represented -// by a dedicated queue on every Active shard. This design grants the distributor maximum flexibility to route traffic -// based on real-time load. -// -// This function implements a sophisticated distribution strategy: Flow-Aware, Two-Phase Join-Shortest-Queue-by-Bytes -// (JSQ-Bytes) with Graceful Backpressure. 
It is designed to balance load, prevent unnecessary rejections under -// transient spikes, and create the necessary conditions for global fairness goals to emerge from local, independent -// worker actions. -// -// # The Algorithm in Detail -// -// 1. Flow-Aware Candidate Selection: For an incoming request, the controller inspects the queue depth (in bytes) for -// that specific flow across all Active shards. It then sorts these shards from least-loaded to most-loaded, -// creating a ranked list of candidates. -// 2. Phase 1 (Non-blocking Fast Failover): The controller iterates through the sorted candidates and attempts a -// non-blocking `Submit()` to each. If any processor accepts the item, the operation succeeds immediately. -// This prevents a single, momentarily busy worker from stalling the entire system. -// 3. Phase 2 (Blocking Fallback): If all processors are busy, it attempts a single, blocking `SubmitOrBlock()` on the -// single best candidate. This provides graceful backpressure and increases the likelihood of success during -// transient traffic bursts. +// distributeRequest implements a flow-aware, two-phase "Join-Shortest-Queue-by-Bytes" (JSQ-Bytes) distribution strategy +// with graceful backpressure. It selects the optimal worker for a given item and attempts to submit it. // -// # Design Rationale and Critical Assumptions -// -// ### 1. The Flow Homogeneity Assumption -// -// The first assumption is that traffic within a single logical flow is roughly homogeneous. The `types.FlowKey` is -// the primary mechanism for grouping requests that are expected to have statistically similar behavior (e.g., prefill, -// decode). For this to be effective, a flow must meaningfully represent a single workload (e.g., the same model, user -// cohort, or task type). The more closely this assumption is satisfied in practice, the more stable and predictable the -// system dynamics will be. -// -// ### Robustness Through Real-Time Adaptation -// -// The system is designed to be robust even when the homogeneity assumption is imperfect. The distribution algorithm -// does not need to predict workload characteristics; it only needs to react to their consequences in real time. -// If a shard becomes slow or congested, the backlogs of its queues will grow. The JSQ-Bytes algorithm will naturally -// observe this increase in byte size and adaptively steer new work away from the congested shard. -// -// ### 2. The Shard Homogeneity Constraint (Enabling Stateful Policies) -// -// The second, and most critical, constraint of this data-parallel design relates to the policies executed by the -// workers. The fairness (`InterFlowDispatchPolicy`) and temporal scheduling (`IntraFlowDispatchPolicy`) policies may be -// stateful (e.g., a fairness algorithm tracking historical tokens served). -// -// For the independent decisions of these stateful policies to result in coherent, globally fair outcomes, the state -// they observe on each shard must be statistically similar. This is the shard homogeneity constraint. -// -// This constraint is actively enforced by the Flow-Aware JSQ-Bytes algorithm. By constantly balancing the load for each -// flow individually, the distributor ensures that, over time, the mix of traffic on each shard is roughly proportional. -// It actively prevents one shard from becoming specialized in serving a single, dominant flow. 
-// -// This creates the necessary foundation for our model: local, stateful policy decisions, when aggregated across -// statistically similar shards, result in an emergent, approximately correct global fairness objective. This is key to -// unlocking scalable performance without a central, bottlenecked scheduler. +// The algorithm operates as follows: +// 1. Candidate Selection: It identifies all Active shards for the item's flow and ranks them by the current byte size +// of that flow's queue, from least to most loaded. +// 2. Phase 1 (Non-blocking Fast Failover): It iterates through the ranked candidates and attempts a non-blocking +// submission. The first successful submission wins. +// 3. Phase 2 (Blocking Fallback): If all non-blocking attempts fail, it performs a single blocking submission to the +// least-loaded candidate, providing backpressure. func (fc *FlowController) distributeRequest(item *internal.FlowItem) (types.QueueOutcome, error) { key := item.OriginalRequest().FlowKey() reqID := item.OriginalRequest().ID() @@ -349,7 +267,7 @@ func (fc *FlowController) distributeRequest(item *internal.FlowItem) (types.Queu return nil }) if err != nil { - return types.QueueOutcomeRejectedOther, fmt.Errorf("failed to establish connection for request %q (flow %s): %w", + return types.QueueOutcomeRejectedOther, fmt.Errorf("failed to acquire lease for request %q (flow %s): %w", reqID, key, err) } @@ -387,30 +305,15 @@ func (fc *FlowController) distributeRequest(item *internal.FlowItem) (types.Queu } // getOrStartWorker implements the lazy-loading and startup of shard processors. -// It attempts to retrieve an existing worker for a shard, and if one doesn't exist, it creates, starts, and -// registers it atomically. -// This ensures that workers are only created on-demand when a shard first becomes Active. +// It attempts to retrieve an existing worker for a shard. If one doesn't exist, it constructs a new worker and attempts +// to register it atomically. The worker's processor goroutine is only started *after* it has successfully been +// registered, preventing race conditions where multiple goroutines create and start the same worker. func (fc *FlowController) getOrStartWorker(shard contracts.RegistryShard) *managedWorker { if w, ok := fc.workers.Load(shard.ID()); ok { return w.(*managedWorker) } - // Atomically load or store. - // This handles the race condition where multiple goroutines try to create the same worker. - newWorker := fc.startNewWorker(shard) - actual, loaded := fc.workers.LoadOrStore(shard.ID(), newWorker) - if loaded { - // Another goroutine beat us to it; the `newWorker` we created was not stored. - // We must clean it up immediately to prevent resource leaks. - newWorker.cancel() - return actual.(*managedWorker) - } - return newWorker -} - -// startNewWorker encapsulates the logic for creating and starting a new worker goroutine. -func (fc *FlowController) startNewWorker(shard contracts.RegistryShard) *managedWorker { - <-fc.ready // We must wait until the parent context is initialized. + // Construct a new worker, but do not start its processor goroutine yet. 
	processorCtx, cancel := context.WithCancel(fc.parentCtx)
	dispatchFilter := internal.NewSaturationFilter(fc.saturationDetector)
	processor := fc.shardProcessorFactory(
@@ -421,19 +324,30 @@ func (fc *FlowController) startNewWorker(shard contracts.RegistryShard) *managed
 		fc.config.EnqueueChannelBufferSize,
 		fc.logger.WithValues("shardID", shard.ID()),
 	)
-
-	worker := &managedWorker{
+	newWorker := &managedWorker{
 		processor: processor,
 		cancel:    cancel,
 	}
+	// Atomically load or store. This is the critical step for preventing race conditions.
+	actual, loaded := fc.workers.LoadOrStore(shard.ID(), newWorker)
+	if loaded {
+		// Another goroutine beat us to it. The `newWorker` we created was not stored.
+		// We must cancel the context we created for it to prevent a leak, but we do not need to do anything else, as its
+		// processor was never started.
+		cancel()
+		return actual.(*managedWorker)
+	}
+
+	// We won the race. The `newWorker` was successfully stored.
+	// Now, and only now, do we start the processor's long-running goroutine.
 	fc.wg.Add(1)
 	go func() {
 		defer fc.wg.Done()
 		processor.Run(processorCtx)
 	}()
-	return worker
+	return newWorker
 }
 
 // reconcileProcessors is the supervisor's core garbage collection loop.
@@ -463,7 +377,6 @@ func (fc *FlowController) reconcileProcessors() {
 // shutdown gracefully terminates all running `shardProcessor` goroutines.
 // It signals all workers to stop and waits for them to complete their shutdown procedures.
 func (fc *FlowController) shutdown() {
-	fc.isRunning.Store(false)
 	fc.logger.Info("Shutting down FlowController and all shard processors.")
 	fc.workers.Range(func(key, value any) bool {
 		shardID := key.(string)
diff --git a/pkg/epp/flowcontrol/controller/controller_test.go b/pkg/epp/flowcontrol/controller/controller_test.go
index 20bf70175..511bae8ce 100644
--- a/pkg/epp/flowcontrol/controller/controller_test.go
+++ b/pkg/epp/flowcontrol/controller/controller_test.go
@@ -42,6 +42,30 @@ import (
 
 // --- Test Harness & Fixtures ---
 
+// withClock returns a test-only option to inject a clock.
+// test-only
+func withClock(c clock.WithTicker) flowControllerOption {
+	return func(fc *FlowController) {
+		fc.clock = c
+	}
+}
+
+// withRegistryClient returns a test-only option to inject a mock or fake registry client.
+// test-only
+func withRegistryClient(client registryClient) flowControllerOption {
+	return func(fc *FlowController) {
+		fc.registry = client
+	}
+}
+
+// withShardProcessorFactory returns a test-only option to inject a processor factory.
+// test-only
+func withShardProcessorFactory(factory shardProcessorFactory) flowControllerOption {
+	return func(fc *FlowController) {
+		fc.shardProcessorFactory = factory
+	}
+}
+
 // testHarness holds the `FlowController` and its dependencies under test.
 type testHarness struct {
 	fc *FlowController
@@ -52,68 +76,55 @@ type testHarness struct {
 	mockProcessorFactory *mockShardProcessorFactory
 }
 
-func (h *testHarness) start(t *testing.T) {
-	t.Helper()
-	ctx, cancel := context.WithCancel(context.Background())
-	t.Cleanup(cancel)
-
-	go h.fc.Run(ctx)
-
-	select {
-	case <-h.fc.ready:
-		// Success
-	case <-time.After(2 * time.Second):
-		t.Fatal("timed out waiting for FlowController to become ready")
-	}
-}
-
 // newUnitHarness creates a test environment with a mock processor factory, suitable for focused unit tests of the
-// controller's logic.
-func newUnitHarness(t *testing.T, cfg Config) *testHarness {
+// controller's logic. The controller's run loop starts automatically; cancel the provided context to stop it.
+func newUnitHarness(t *testing.T, ctx context.Context, cfg Config, registry *mockRegistryClient) *testHarness { t.Helper() - mockRegistry := &mockRegistryClient{} mockDetector := &mocks.MockSaturationDetector{} mockClock := testclock.NewFakeClock(time.Now()) mockProcessorFactory := &mockShardProcessorFactory{ processors: make(map[string]*mockShardProcessor), } opts := []flowControllerOption{ - withRegistryClient(mockRegistry), + withRegistryClient(registry), withClock(mockClock), withShardProcessorFactory(mockProcessorFactory.new), } - fc, err := NewFlowController(cfg, mockRegistry, mockDetector, logr.Discard(), opts...) + fc, err := NewFlowController(ctx, cfg, registry, mockDetector, logr.Discard(), opts...) require.NoError(t, err, "failed to create FlowController for unit test harness") - return &testHarness{ + + h := &testHarness{ fc: fc, cfg: cfg, - mockRegistry: mockRegistry, + mockRegistry: registry, mockDetector: mockDetector, mockClock: mockClock, mockProcessorFactory: mockProcessorFactory, } + return h } // newIntegrationHarness creates a test environment that uses real `ShardProcessor`s, suitable for integration tests // validating the controller-processor interaction. -func newIntegrationHarness(t *testing.T, cfg Config) *testHarness { +func newIntegrationHarness(t *testing.T, ctx context.Context, cfg Config, registry *mockRegistryClient) *testHarness { t.Helper() - mockRegistry := &mockRegistryClient{} mockDetector := &mocks.MockSaturationDetector{} mockClock := testclock.NewFakeClock(time.Now()) opts := []flowControllerOption{ - withRegistryClient(mockRegistry), + withRegistryClient(registry), withClock(mockClock), } - fc, err := NewFlowController(cfg, mockRegistry, mockDetector, logr.Discard(), opts...) + fc, err := NewFlowController(ctx, cfg, registry, mockDetector, logr.Discard(), opts...) require.NoError(t, err, "failed to create FlowController for integration test harness") - return &testHarness{ + + h := &testHarness{ fc: fc, cfg: cfg, - mockRegistry: mockRegistry, + mockRegistry: registry, mockDetector: mockDetector, mockClock: mockClock, } + return h } // mockActiveFlowConnection is a local mock for the `contracts.ActiveFlowConnection` interface. @@ -268,7 +279,13 @@ func TestNewFlowController(t *testing.T) { t.Run("ErrorOnInvalidConfig", func(t *testing.T) { t.Parallel() invalidCfg := Config{ProcessorReconciliationInterval: -1 * time.Second} - _, err := NewFlowController(invalidCfg, &mockRegistryClient{}, &mocks.MockSaturationDetector{}, logr.Discard()) + _, err := NewFlowController( + context.Background(), + invalidCfg, + &mockRegistryClient{}, + &mocks.MockSaturationDetector{}, + logr.Discard(), + ) require.Error(t, err, "NewFlowController must return an error for invalid configuration") }) } @@ -281,17 +298,20 @@ func TestFlowController_EnqueueAndWait(t *testing.T) { t.Run("OnNilRequest", func(t *testing.T) { t.Parallel() - h := newUnitHarness(t, Config{}) // Do not start the controller, as we are testing the pre-run validation. 
+ h := newUnitHarness(t, t.Context(), Config{}, &mockRegistryClient{}) + outcome, err := h.fc.EnqueueAndWait(nil) require.Error(t, err, "EnqueueAndWait must reject a nil request") - assert.ErrorIs(t, err, types.ErrRejected, "error should wrap ErrRejected") - assert.ErrorIs(t, err, types.ErrNilRequest, "error should wrap ErrNilRequest") + assert.Equal(t, "request cannot be nil", err.Error()) assert.Equal(t, types.QueueOutcomeRejectedOther, outcome, "outcome should be QueueOutcomeRejectedOther") }) - t.Run("OnControllerNotRunning", func(t *testing.T) { + t.Run("OnControllerShutdown", func(t *testing.T) { t.Parallel() - h := newUnitHarness(t, Config{}) // Harness is created but `Run()` is never called. + ctx, cancel := context.WithCancel(t.Context()) + h := newUnitHarness(t, ctx, Config{}, &mockRegistryClient{}) + cancel() // Immediately stop the controller. + req := newTestRequest(context.Background(), defaultFlowKey) outcome, err := h.fc.EnqueueAndWait(req) require.Error(t, err, "EnqueueAndWait must reject requests if controller is not running") @@ -302,8 +322,7 @@ func TestFlowController_EnqueueAndWait(t *testing.T) { t.Run("OnNoShardsAvailable", func(t *testing.T) { t.Parallel() - h := newUnitHarness(t, Config{}) - h.start(t) + h := newUnitHarness(t, t.Context(), Config{}, &mockRegistryClient{}) req := newTestRequest(context.Background(), defaultFlowKey) outcome, err := h.fc.EnqueueAndWait(req) @@ -314,11 +333,11 @@ func TestFlowController_EnqueueAndWait(t *testing.T) { t.Run("OnRegistryConnectionError", func(t *testing.T) { t.Parallel() - h := newUnitHarness(t, Config{}) - h.start(t) + mockRegistry := &mockRegistryClient{} + h := newUnitHarness(t, t.Context(), Config{}, mockRegistry) expectedErr := errors.New("connection failed") - h.mockRegistry.WithConnectionFunc = func( + mockRegistry.WithConnectionFunc = func( _ types.FlowKey, _ func(conn contracts.ActiveFlowConnection) error, ) error { @@ -335,8 +354,8 @@ func TestFlowController_EnqueueAndWait(t *testing.T) { t.Run("PanicsOnManagedQueueError", func(t *testing.T) { t.Parallel() - h := newUnitHarness(t, Config{}) - h.start(t) + mockRegistry := &mockRegistryClient{} + h := newUnitHarness(t, t.Context(), Config{}, mockRegistry) faultyShard := &mocks.MockRegistryShard{ IDFunc: func() string { return "faulty-shard" }, @@ -344,7 +363,7 @@ func TestFlowController_EnqueueAndWait(t *testing.T) { return nil, errors.New("queue retrieval failed") }, } - h.mockRegistry.WithConnectionFunc = func( + mockRegistry.WithConnectionFunc = func( _ types.FlowKey, fn func(conn contracts.ActiveFlowConnection) error, ) error { @@ -440,10 +459,9 @@ func TestFlowController_EnqueueAndWait(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { t.Parallel() - h := newUnitHarness(t, Config{}) - h.start(t) - - h.mockRegistry.WithConnectionFunc = func( + mockRegistry := &mockRegistryClient{} + h := newUnitHarness(t, t.Context(), Config{}, mockRegistry) + mockRegistry.WithConnectionFunc = func( _ types.FlowKey, fn func(conn contracts.ActiveFlowConnection) error, ) error { @@ -466,19 +484,21 @@ func TestFlowController_EnqueueAndWait(t *testing.T) { t.Run("Rejects_OnRequestContextCancelledWhileBlocking", func(t *testing.T) { t.Parallel() - h := newUnitHarness(t, Config{}) - h.start(t) - + mockRegistry := &mockRegistryClient{ + WithConnectionFunc: func( + _ types.FlowKey, + fn func(conn contracts.ActiveFlowConnection, + ) error) error { + return fn(&mockActiveFlowConnection{ + ActiveShardsV: 
[]contracts.RegistryShard{newMockShard("shard-A").build()}, + }) + }, + } + h := newUnitHarness(t, t.Context(), Config{}, mockRegistry) h.mockProcessorFactory.processors["shard-A"] = &mockShardProcessor{ SubmitFunc: func(_ *internal.FlowItem) error { return internal.ErrProcessorBusy }, SubmitOrBlockFunc: func(ctx context.Context, _ *internal.FlowItem) error { <-ctx.Done(); return ctx.Err() }, } - h.mockRegistry.WithConnectionFunc = func( - _ types.FlowKey, - fn func(conn contracts.ActiveFlowConnection, - ) error) error { - return fn(&mockActiveFlowConnection{ActiveShardsV: []contracts.RegistryShard{newMockShard("shard-A").build()}}) - } reqCtx, cancelReq := context.WithCancel(context.Background()) go func() { time.Sleep(50 * time.Millisecond); cancelReq() }() @@ -492,9 +512,22 @@ func TestFlowController_EnqueueAndWait(t *testing.T) { t.Run("RetriesAndSucceeds_OnProcessorReportsShardDraining", func(t *testing.T) { t.Parallel() - h := newUnitHarness(t, Config{}) - h.start(t) - + var callCount atomic.Int32 + mockRegistry := &mockRegistryClient{ + WithConnectionFunc: func( + _ types.FlowKey, + fn func(conn contracts.ActiveFlowConnection) error, + ) error { + attempt := callCount.Add(1) + shardA := newMockShard("shard-A").withByteSize(100).build() + shardB := newMockShard("shard-B").withByteSize(1000).build() + if attempt == 1 { + return fn(&mockActiveFlowConnection{ActiveShardsV: []contracts.RegistryShard{shardA, shardB}}) + } + return fn(&mockActiveFlowConnection{ActiveShardsV: []contracts.RegistryShard{shardB}}) + }, + } + h := newUnitHarness(t, t.Context(), Config{}, mockRegistry) h.mockProcessorFactory.processors["shard-A"] = &mockShardProcessor{ SubmitFunc: func(item *internal.FlowItem) error { go item.Finalize(types.QueueOutcomeRejectedOther, contracts.ErrShardDraining) @@ -508,20 +541,6 @@ func TestFlowController_EnqueueAndWait(t *testing.T) { }, } - var callCount atomic.Int32 - h.mockRegistry.WithConnectionFunc = func( - _ types.FlowKey, - fn func(conn contracts.ActiveFlowConnection) error, - ) error { - attempt := callCount.Add(1) - shardA := newMockShard("shard-A").withByteSize(100).build() - shardB := newMockShard("shard-B").withByteSize(1000).build() - if attempt == 1 { - return fn(&mockActiveFlowConnection{ActiveShardsV: []contracts.RegistryShard{shardA, shardB}}) - } - return fn(&mockActiveFlowConnection{ActiveShardsV: []contracts.RegistryShard{shardB}}) - } - outcome, err := h.fc.EnqueueAndWait(newTestRequest(context.Background(), defaultFlowKey)) require.NoError(t, err, "EnqueueAndWait must succeed after retrying on a healthy shard") assert.Equal(t, types.QueueOutcomeDispatched, outcome, "outcome should be QueueOutcomeDispatched") @@ -530,50 +549,18 @@ func TestFlowController_EnqueueAndWait(t *testing.T) { }) } -func TestFlowController_Run(t *testing.T) { +func TestFlowController_Lifecycle(t *testing.T) { t.Parallel() - t.Run("Idempotency", func(t *testing.T) { - t.Parallel() - h := newUnitHarness(t, Config{}) - - // Start the controller in the background. - ctx := t.Context() - go h.fc.Run(ctx) - - // Wait for the first `Run` call to complete its startup sequence. - select { - case <-h.fc.ready: - // Success - case <-time.After(2 * time.Second): - t.Fatal("timed out waiting for FlowController to become ready") - } - - // The second call to Run should return immediately without blocking or panicking. - // We wrap it in a channel to add a timeout, ensuring it doesn't block. 
- done := make(chan struct{}) - go func() { - h.fc.Run(ctx) - close(done) - }() - - select { - case <-done: - // Success - case <-time.After(100 * time.Millisecond): - t.Error("second call to Run() blocked, indicating it is not idempotent") - } - }) - t.Run("Reconciliation", func(t *testing.T) { t.Parallel() - h := newUnitHarness(t, Config{}) - // Manually simulate the controller starting up so we can test the reconciliation logic in isolation. - parentCtx, cancel := context.WithCancel(context.Background()) - t.Cleanup(cancel) - h.fc.parentCtx = parentCtx - close(h.fc.ready) + mockRegistry := &mockRegistryClient{ + // Configure the mock registry to report the new state without the stale shard. + ShardStatsFunc: func() []contracts.ShardStats { + return []contracts.ShardStats{{ID: "shard-A"}} + }} + h := newUnitHarness(t, t.Context(), Config{}, mockRegistry) // Pre-populate the controller with initial workers. initialShards := []string{"shard-A", "stale-shard"} @@ -586,21 +573,16 @@ func TestFlowController_Run(t *testing.T) { require.Len(t, h.mockProcessorFactory.processors, 2, "pre-condition: initial workers not set up correctly") // Wait for all workers to have started and set their contexts before proceeding with the test. - for _, p := range h.mockProcessorFactory.processors { + for id, p := range h.mockProcessorFactory.processors { proc := p select { case <-proc.runStarted: // Success case <-time.After(2 * time.Second): - t.Fatal("timed out waiting for worker to start") + t.Fatalf("timed out waiting for worker %s to start", id) } } - // Configure the mock registry to report the new state without the stale shard. - h.mockRegistry.ShardStatsFunc = func() []contracts.ShardStats { - return []contracts.ShardStats{{ID: "shard-A"}} - } - // Manually trigger the reconciliation loop logic. h.fc.reconcileProcessors() @@ -637,42 +619,42 @@ func TestFlowController_Run(t *testing.T) { t.Run("Reconciliation_IsTriggeredByTicker", func(t *testing.T) { t.Parallel() reconciliationInterval := 10 * time.Second - h := newUnitHarness(t, Config{ - ProcessorReconciliationInterval: reconciliationInterval, - }) + mockRegistry := &mockRegistryClient{} var reconcileCount atomic.Int32 - h.mockRegistry.ShardStatsFunc = func() []contracts.ShardStats { + mockRegistry.ShardStatsFunc = func() []contracts.ShardStats { reconcileCount.Add(1) return nil } - h.start(t) // This will perform the initial reconciliation. + h := newUnitHarness(t, t.Context(), Config{ProcessorReconciliationInterval: reconciliationInterval}, mockRegistry) - // Assert that the initial reconciliation occurred on startup. - require.Equal(t, int32(1), reconcileCount.Load(), "initial reconciliation did not occur on start") + // Wait for the reconciliation loop to start and create the ticker. + // This prevents a race where the clock is stepped before the ticker is registered. + require.Eventually(t, h.mockClock.HasWaiters, time.Second, 10*time.Millisecond, "ticker was not created") // Advance the clock to trigger the next reconciliation. h.mockClock.Step(reconciliationInterval) assert.Eventually(t, func() bool { - return reconcileCount.Load() == 2 + return reconcileCount.Load() == 1 }, time.Second, 10*time.Millisecond, "reconciliation was not triggered by the ticker") // Advance the clock again to ensure it continues to fire. 
h.mockClock.Step(reconciliationInterval) assert.Eventually(t, func() bool { - return reconcileCount.Load() == 3 + return reconcileCount.Load() == 2 }, time.Second, 10*time.Millisecond, "reconciliation did not fire on the second tick") }) t.Run("WorkerCreationRace", func(t *testing.T) { t.Parallel() - h := newUnitHarness(t, Config{}) // This test requires manual control over the shard processor factory to deterministically create a race. factoryEntered := make(chan *mockShardProcessor, 2) continueFactory := make(chan struct{}) + + h := newUnitHarness(t, t.Context(), Config{}, &mockRegistryClient{}) h.fc.shardProcessorFactory = func( shard contracts.RegistryShard, _ internal.BandFilter, @@ -690,7 +672,6 @@ func TestFlowController_Run(t *testing.T) { return proc } - h.start(t) shard := newMockShard("race-shard").build() var wg sync.WaitGroup wg.Add(2) @@ -714,10 +695,6 @@ func TestFlowController_Run(t *testing.T) { close(continueFactory) wg.Wait() - // Wait for the `Run` method to be called on both processors to ensure their contexts are non-nil before asserting. - <-proc1.runStarted - <-proc2.runStarted - // One processor "won" and was stored, the other "lost" and should have been cancelled. actual, ok := h.fc.workers.Load("race-shard") require.True(t, ok, "a worker should have been stored in the map") @@ -732,21 +709,31 @@ func TestFlowController_Run(t *testing.T) { loserProc = proc1 } - // The losing processor's context must be cancelled to prevent a goroutine leak. + // Wait for the `Run` method to be called on the winning processor to ensure its context is available. select { - case <-loserProc.Context().Done(): - assert.ErrorIs(t, loserProc.Context().Err(), context.Canceled, "loser's context should be canceled") - case <-time.After(100 * time.Millisecond): - t.Error("context of the losing worker was not cancelled") + case <-winnerProc.runStarted: + // Success. + case <-time.After(1 * time.Second): + t.Fatal("timed out waiting for winning worker to start") } // The winning processor's context must remain active. + require.NotNil(t, winnerProc.Context(), "winner's context should not be nil") select { case <-winnerProc.Context().Done(): t.Error("context of the winning worker should not be cancelled") default: // Success } + + // The losing processor's `Run` method must not be called, and its context should be nil. + select { + case <-loserProc.runStarted: + t.Error("Run was called on the losing worker, but it should not have been") + default: + // Success + } + assert.Nil(t, loserProc.Context(), "loser's context should be nil as Run is never called") }) } @@ -756,23 +743,9 @@ func TestFlowController_Concurrency(t *testing.T) { numGoroutines = 50 numRequests = 200 ) - h := newIntegrationHarness(t, Config{ - // Use a generous buffer to prevent flakes in the test due to transient queuing delays. - EnqueueChannelBufferSize: numRequests, - DefaultRequestTTL: 1 * time.Second, - }) - - // Manually manage the controller's lifecycle to prevent a race with `t.Cleanup`. - controllerCtx := t.Context() - go h.fc.Run(controllerCtx) - select { - case <-h.fc.ready: - // Success - case <-time.After(2 * time.Second): - t.Fatal("timed out waiting for FlowController to become ready") - } // Set up a realistic registry that vends real components to the processor. 
+ mockRegistry := &mockRegistryClient{} shards := make([]contracts.RegistryShard, numShards) queues := make(map[string]contracts.ManagedQueue) for i := range numShards { @@ -783,8 +756,8 @@ func TestFlowController_Concurrency(t *testing.T) { ManagedQueueFunc: func(_ types.FlowKey) (contracts.ManagedQueue, error) { return queues[shardID], nil }, - AllOrderedPriorityLevelsFunc: func() []uint { return []uint{100} }, - PriorityBandAccessorFunc: func(priority uint) (framework.PriorityBandAccessor, error) { + AllOrderedPriorityLevelsFunc: func() []int { return []int{100} }, + PriorityBandAccessorFunc: func(priority int) (framework.PriorityBandAccessor, error) { if priority == 100 { return &frameworkmocks.MockPriorityBandAccessor{ PriorityNameV: "high", @@ -803,7 +776,7 @@ func TestFlowController_Concurrency(t *testing.T) { }, }, nil }, - InterFlowDispatchPolicyFunc: func(_ uint) (framework.InterFlowDispatchPolicy, error) { + InterFlowDispatchPolicyFunc: func(_ int) (framework.InterFlowDispatchPolicy, error) { return &frameworkmocks.MockInterFlowDispatchPolicy{ SelectQueueFunc: func(band framework.PriorityBandAccessor) (framework.FlowQueueAccessor, error) { return queues[shardID].FlowQueueAccessor(), nil @@ -815,7 +788,7 @@ func TestFlowController_Concurrency(t *testing.T) { ID: shardID, TotalLen: uint64(queues[shardID].Len()), TotalByteSize: queues[shardID].ByteSize(), - PerPriorityBandStats: map[uint]contracts.PriorityBandStats{ + PerPriorityBandStats: map[int]contracts.PriorityBandStats{ 100: { Len: uint64(queues[shardID].Len()), ByteSize: queues[shardID].ByteSize(), @@ -826,11 +799,21 @@ func TestFlowController_Concurrency(t *testing.T) { }, } } - h.mockRegistry.WithConnectionFunc = func(_ types.FlowKey, fn func(conn contracts.ActiveFlowConnection) error) error { + mockRegistry.WithConnectionFunc = func(_ types.FlowKey, fn func(conn contracts.ActiveFlowConnection) error) error { return fn(&mockActiveFlowConnection{ActiveShardsV: shards}) } - - reqCtx := logr.NewContext(context.Background(), logr.Discard()) + mockRegistry.ShardStatsFunc = func() []contracts.ShardStats { + stats := make([]contracts.ShardStats, len(shards)) + for i, shard := range shards { + stats[i] = shard.Stats() + } + return stats + } + h := newIntegrationHarness(t, t.Context(), Config{ + // Use a generous buffer to prevent flakes in the test due to transient queuing delays. + EnqueueChannelBufferSize: numRequests, + DefaultRequestTTL: 1 * time.Second, + }, mockRegistry) var wg sync.WaitGroup wg.Add(numGoroutines) @@ -839,7 +822,7 @@ func TestFlowController_Concurrency(t *testing.T) { go func() { defer wg.Done() for range numRequests / numGoroutines { - req := newTestRequest(reqCtx, defaultFlowKey) + req := newTestRequest(logr.NewContext(context.Background(), logr.Discard()), defaultFlowKey) outcome, err := h.fc.EnqueueAndWait(req) if err != nil { // Use `t.Errorf` for concurrent tests to avoid halting execution on a single failure. diff --git a/pkg/epp/flowcontrol/controller/internal/processor.go b/pkg/epp/flowcontrol/controller/internal/processor.go index 49a6dd675..fa0118270 100644 --- a/pkg/epp/flowcontrol/controller/internal/processor.go +++ b/pkg/epp/flowcontrol/controller/internal/processor.go @@ -440,9 +440,9 @@ func (sp *ShardProcessor) dispatchItem(itemAcc types.QueueItemAccessor, logger l // checkItemExpiry provides the authoritative check to determine if an item should be evicted due to TTL expiry or // context cancellation. // -// It provides "defense in depth" against race conditions. 
Its first action is to check if the item has already been
-// finalized by a competing goroutine (e.g., the cleanup loop finalizing an item the dispatch loop is trying to
-// process). This ensures that the final outcome is decided exactly once.
+// It serves as a safeguard against race conditions. Its first action is to check if the item has already been finalized
+// by a competing goroutine (e.g., the cleanup loop finalizing an item the dispatch loop is trying to process).
+// This ensures that the final outcome is decided exactly once.
 func checkItemExpiry(
 	itemAcc types.QueueItemAccessor,
 	now time.Time,
diff --git a/pkg/epp/flowcontrol/types/errors.go b/pkg/epp/flowcontrol/types/errors.go
index 5d2c25516..8c966bb45 100644
--- a/pkg/epp/flowcontrol/types/errors.go
+++ b/pkg/epp/flowcontrol/types/errors.go
@@ -43,11 +43,8 @@ var (
 // The following errors can occur before a request is formally added to a `framework.SafeQueue`. When returned by
 // `FlowController.EnqueueAndWait()`, these specific errors will typically be wrapped by `ErrRejected`.
 var (
-	// ErrNilRequest indicates that a nil `types.FlowControlRequest` was provided.
-	ErrNilRequest = errors.New("FlowControlRequest cannot be nil")
 
-	// ErrQueueAtCapacity indicates that a request could not be enqueued because queue capacity limits were met.
-	ErrQueueAtCapacity = errors.New("queue at capacity and displacement failed to make space")
+	ErrQueueAtCapacity = errors.New("queue at capacity")
 )
 
 // --- Post-Enqueue Eviction Errors ---
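
A minimal caller-side sketch of the refactored lifecycle (hypothetical helper and variable names; assumes it lives in
the `controller` package with `context`, `logr`, `contracts`, and `types` imported): `NewFlowController` now receives
the parent context and starts its reconciliation loop itself, so cancelling that context is the only shutdown signal,
and `EnqueueAndWait` blocks until the request reaches a final outcome.

	// enqueueOnce is an illustrative helper, not part of this patch.
	func enqueueOnce(cfg Config, registry contracts.FlowRegistry, sd contracts.SaturationDetector,
		logger logr.Logger, req types.FlowControlRequest) (types.QueueOutcome, error) {
		// Cancelling this context stops the run loop and all shard processors; there is no separate Run method.
		ctx, cancel := context.WithCancel(context.Background())
		defer cancel()

		fc, err := NewFlowController(ctx, cfg, registry, sd, logger)
		if err != nil {
			return types.QueueOutcomeRejectedOther, err // Outcome chosen arbitrarily for this sketch.
		}

		// Blocks until the request is dispatched, rejected, or evicted.
		return fc.EnqueueAndWait(req)
	}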