Skip to content

Commit a081c75

Browse files
committed
OCPBUGS-29469: fix Azure API SKU calls timing out
If the API call to list SKUs times out, we don't get a context error (e.g., "context deadline exceeded") from Azure, just an empty response. The problem with that is that we return the following errors to the user:

```
ERROR failed to fetch Master Machines: failed to load asset "Install Config": failed to create install config: [controlPlane.platform.azure.type: Invalid value: "Standard_D8s_v3": not found in region eastus, controlPlane.platform.azure.type: Invalid value: "Standard_D8s_v3": unable to determine HyperVGeneration version, compute[0].platform.azure.type: Invalid value: "Standard_D4s_v3": not found in region eastus]
```

which confuses them when they verify that information with the az client:

```
$ az vm list-sizes --location "eastus" | grep -i Standard_D8s_v3
"name": "Standard_D8s_v3",
```

To improve the situation, let's increase the timeout from 30s to 2min and explicitly check for context errors before reporting that the VM type was not found.
1 parent 7645fef commit a081c75

File tree

1 file changed

+19
-2
lines changed

1 file changed

+19
-2
lines changed

pkg/asset/installconfig/azure/client.go

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,9 @@ func (c *Client) getProvidersClient(ctx context.Context) (azres.ProvidersClient,
173173
func (c *Client) GetDiskSkus(ctx context.Context, region string) ([]azsku.ResourceSku, error) {
174174
client := azsku.NewResourceSkusClientWithBaseURI(c.ssn.Environment.ResourceManagerEndpoint, c.ssn.Credentials.SubscriptionID)
175175
client.Authorizer = c.ssn.Authorizer
176-
ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
176+
177+
// See https://issues.redhat.com/browse/OCPBUGS-29469 before changing this timeout
178+
ctx, cancel := context.WithTimeout(ctx, 2*time.Minute)
177179
defer cancel()
178180

179181
var sku []azsku.ResourceSku
@@ -195,6 +197,12 @@ func (c *Client) GetDiskSkus(ctx context.Context, region string) ([]azsku.Resour
195197
return sku, nil
196198
}
197199

200+
// Azure does not return an error in case of context deadline, so we need
201+
// to check it ourselves
202+
if err := ctx.Err(); err != nil {
203+
return nil, fmt.Errorf("failed to list SKUs: %w", err)
204+
}
205+
198206
return nil, fmt.Errorf("no disks for specified subscription in region %s", region)
199207
}
200208

@@ -235,7 +243,9 @@ func (c *Client) ListResourceIDsByGroup(ctx context.Context, groupName string) (
235243
func (c *Client) GetVirtualMachineSku(ctx context.Context, name, region string) (*azsku.ResourceSku, error) {
236244
client := azsku.NewResourceSkusClientWithBaseURI(c.ssn.Environment.ResourceManagerEndpoint, c.ssn.Credentials.SubscriptionID)
237245
client.Authorizer = c.ssn.Authorizer
238-
ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
246+
247+
// See https://issues.redhat.com/browse/OCPBUGS-29469 before changing this timeout
248+
ctx, cancel := context.WithTimeout(ctx, 2*time.Minute)
239249
defer cancel()
240250

241251
for page, err := client.List(ctx); page.NotDone(); err = page.NextWithContext(ctx) {
@@ -259,6 +269,13 @@ func (c *Client) GetVirtualMachineSku(ctx context.Context, name, region string)
259269
}
260270
}
261271
}
272+
273+
// Azure does not return an error in case of context deadline, so we need
274+
// to check it ourselves
275+
if err := ctx.Err(); err != nil {
276+
return nil, fmt.Errorf("failed to list SKUs: %w", err)
277+
}
278+
262279
return nil, nil
263280
}
264281

0 commit comments

Comments
 (0)