Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions internal/nvmeof/controller/controllerserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -796,12 +796,12 @@ func (cs *Server) createNVMeoFResources(

// Step 6: If using auto-listeners, query them back for storing in metadata
if networkMask != "" {
autoListeners, err := gateway.ListListeners(ctx, nvmeofData.SubsystemNQN)
listenersDetailsList, err := gateway.GetListeners(ctx, nvmeofData.SubsystemNQN)
if err != nil {
return nvmeofData, fmt.Errorf("failed to list auto-created listeners: %w", err)
return nvmeofData, fmt.Errorf("failed to retrieve auto-created listeners after retries: %w", err)
}
nvmeofData.ListenerInfo = nvmeof.ConvertListenersFromProto(autoListeners.GetListeners())
log.DebugLog(ctx, "Retrieved %d auto-created listeners", len(nvmeofData.ListenerInfo))
log.DebugLog(ctx, "Retrieved %d auto-created listeners", len(listenersDetailsList))
nvmeofData.ListenerInfo = listenersDetailsList
}

uuid, err := gateway.GetUUIDBySubsystemAndNameSpaceID(ctx, nvmeofData.SubsystemNQN, nvmeofData.NamespaceID)
Expand Down
25 changes: 25 additions & 0 deletions internal/nvmeof/nvmeof.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"math/big"
"syscall"

"github.com/avast/retry-go/v4"
pb "github.com/ceph/ceph-nvmeof/lib/go/nvmeof"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
Expand Down Expand Up @@ -536,6 +537,30 @@ func ConvertListenersFromProto(protoListeners []*pb.ListenerInfo) []ListenerDeta
return listeners
}

// GetListeners retrieves listeners for a subsystem with retry logic.
// The auto-listeners feature may take time to sync to OMAP state, so this retries with
// exponential backoff.
func (gw *GatewayRpcClient) GetListeners(
ctx context.Context,
subsystemNQN string,
) ([]ListenerDetails, error) {
return retry.DoWithData(
func() ([]ListenerDetails, error) {
autoListeners, err := gw.ListListeners(ctx, subsystemNQN)
if err != nil {
return nil, fmt.Errorf("failed to list auto-created listeners: %w", err)
}

if len(autoListeners.GetListeners()) == 0 {
return nil, fmt.Errorf("no auto-listeners found for subsystem %s", subsystemNQN)
}

return ConvertListenersFromProto(autoListeners.GetListeners()), nil
},
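retry.Attempts(6), // at most 5 backoff waits between the 6 attempts: ~100ms, 200ms, 400ms, 800ms, 1.6s = ~3.1s total plus jitter (assumes retry-go default delay settings; TODO confirm)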
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gadididi Will this work in all clusters, for example ones where we have 100 PVCs, or when the Ceph cluster is under some stress? Retries are always very tricky.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will try to check it soon

)
}

// Connect to Gateway gRPC server.
func (c *GatewayRpcClient) connect() error {
// Create connection using new gRPC API
Expand Down