Skip to content

Commit 180edab

Browse files
nullfunc and lionello authored
add GPU validation for range of GPU amounts (#852)
* add GPU validation for range of GPU amounts * get service quota to determine if there are any GPUs * add test fix zero only quotas * fix comments and re-order vars * skip test needing more doc * update vendorHash * Update src/pkg/cli/client/byoc/aws/validation.go --------- Co-authored-by: Lio李歐 <[email protected]>
1 parent c9be979 commit 180edab

File tree

6 files changed

+389
-13
lines changed

6 files changed

+389
-13
lines changed

pkgs/defang/cli.nix

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ buildGoModule {
66
pname = "defang-cli";
77
version = "git";
88
src = ../../src;
9-
vendorHash = "sha256-f8vNFPtx2a9+UYve95VFxgbdZk+CAwwddNvVfArs+qo=";
9+
vendorHash = "sha256-LMvY3gejg2mS/71wwW9aCpHo5r9uKzaUcKFm+9l8B6s=";
1010

1111
subPackages = [ "cmd/cli" ];
1212

src/go.mod

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,18 @@ toolchain go1.22.3
66

77
require (
88
github.com/AlecAivazis/survey/v2 v2.3.7
9-
github.com/aws/aws-sdk-go-v2 v1.27.0
9+
github.com/aws/aws-sdk-go-v2 v1.32.4
1010
github.com/aws/aws-sdk-go-v2/config v1.26.6
1111
github.com/aws/aws-sdk-go-v2/service/cloudformation v1.42.6
1212
github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.35.4
1313
github.com/aws/aws-sdk-go-v2/service/ec2 v1.145.0
1414
github.com/aws/aws-sdk-go-v2/service/ecs v1.38.1
1515
github.com/aws/aws-sdk-go-v2/service/route53 v1.37.1
1616
github.com/aws/aws-sdk-go-v2/service/s3 v1.48.1
17+
github.com/aws/aws-sdk-go-v2/service/servicequotas v1.25.5
1718
github.com/aws/aws-sdk-go-v2/service/ssm v1.44.7
1819
github.com/aws/aws-sdk-go-v2/service/sts v1.26.7
19-
github.com/aws/smithy-go v1.20.2
20+
github.com/aws/smithy-go v1.22.0
2021
github.com/awslabs/goformation/v7 v7.13.1
2122
github.com/bufbuild/connect-go v1.10.0
2223
github.com/compose-spec/compose-go/v2 v2.4.3
@@ -67,8 +68,8 @@ require (
6768
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.2 // indirect
6869
github.com/aws/aws-sdk-go-v2/credentials v1.16.16
6970
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.14.11 // indirect
70-
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.7 // indirect
71-
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.7 // indirect
71+
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.23 // indirect
72+
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.23 // indirect
7273
github.com/aws/aws-sdk-go-v2/internal/ini v1.7.3 // indirect
7374
github.com/aws/aws-sdk-go-v2/internal/v4a v1.2.10 // indirect
7475
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.10.4 // indirect

src/go.sum

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migc
66
github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM=
77
github.com/Netflix/go-expect v0.0.0-20220104043353-73e0943537d2 h1:+vx7roKuyA63nhn5WAunQHLTznkw5W8b1Xc0dNjp83s=
88
github.com/Netflix/go-expect v0.0.0-20220104043353-73e0943537d2/go.mod h1:HBCaDeC1lPdgDeDbhX8XFpy1jqjK0IBG8W5K+xYqA0w=
9-
github.com/aws/aws-sdk-go-v2 v1.27.0 h1:7bZWKoXhzI+mMR/HjdMx8ZCC5+6fY0lS5tr0bbgiLlo=
10-
github.com/aws/aws-sdk-go-v2 v1.27.0/go.mod h1:ffIFB97e2yNsv4aTSGkqtHnppsIJzw7G7BReUZ3jCXM=
9+
github.com/aws/aws-sdk-go-v2 v1.32.4 h1:S13INUiTxgrPueTmrm5DZ+MiAo99zYzHEFh1UNkOxNE=
10+
github.com/aws/aws-sdk-go-v2 v1.32.4/go.mod h1:2SK5n0a2karNTv5tbP1SjsX0uhttou00v/HpXKM1ZUo=
1111
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.2 h1:x6xsQXGSmW6frevwDA+vi/wqhp1ct18mVXYN08/93to=
1212
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.2/go.mod h1:lPprDr1e6cJdyYeGXnRaJoP4Md+cDBvi2eOj00BlGmg=
1313
github.com/aws/aws-sdk-go-v2/config v1.26.6 h1:Z/7w9bUqlRI0FFQpetVuFYEsjzE3h7fpU6HuGmfPL/o=
@@ -16,10 +16,10 @@ github.com/aws/aws-sdk-go-v2/credentials v1.16.16 h1:8q6Rliyv0aUFAVtzaldUEcS+T5g
1616
github.com/aws/aws-sdk-go-v2/credentials v1.16.16/go.mod h1:UHVZrdUsv63hPXFo1H7c5fEneoVo9UXiz36QG1GEPi0=
1717
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.14.11 h1:c5I5iH+DZcH3xOIMlz3/tCKJDaHFwYEmxvlh2fAcFo8=
1818
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.14.11/go.mod h1:cRrYDYAMUohBJUtUnOhydaMHtiK/1NZ0Otc9lIb6O0Y=
19-
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.7 h1:lf/8VTF2cM+N4SLzaYJERKEWAXq8MOMpZfU6wEPWsPk=
20-
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.7/go.mod h1:4SjkU7QiqK2M9oozyMzfZ/23LmUY+h3oFqhdeP5OMiI=
21-
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.7 h1:4OYVp0705xu8yjdyoWix0r9wPIRXnIzzOoUpQVHIJ/g=
22-
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.7/go.mod h1:vd7ESTEvI76T2Na050gODNmNU7+OyKrIKroYTu4ABiI=
19+
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.23 h1:A2w6m6Tmr+BNXjDsr7M90zkWjsu4JXHwrzPg235STs4=
20+
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.23/go.mod h1:35EVp9wyeANdujZruvHiQUAo9E3vbhnIO1mTCAxMlY0=
21+
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.23 h1:pgYW9FCabt2M25MoHYCfMrVY2ghiiBKYWUVXfwZs+sU=
22+
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.23/go.mod h1:c48kLgzO19wAu3CPkDWC28JbaJ+hfQlsdl7I2+oqIbk=
2323
github.com/aws/aws-sdk-go-v2/internal/ini v1.7.3 h1:n3GDfwqF2tzEkXlv5cuy4iy7LpKDtqDMcNLfZDu9rls=
2424
github.com/aws/aws-sdk-go-v2/internal/ini v1.7.3/go.mod h1:6fQQgfuGmw8Al/3M2IgIllycxV7ZW7WCdVSqfBeUiCY=
2525
github.com/aws/aws-sdk-go-v2/internal/v4a v1.2.10 h1:5oE2WzJE56/mVveuDZPJESKlg/00AaS2pY2QZcnxg4M=
@@ -44,6 +44,8 @@ github.com/aws/aws-sdk-go-v2/service/route53 v1.37.1 h1:U7OksynDSIFScG+7sGqOuJh+
4444
github.com/aws/aws-sdk-go-v2/service/route53 v1.37.1/go.mod h1:8qqfpG4mug2JLlEyWPSFhEGvJiaZ9iPmMDDMYc5Xtas=
4545
github.com/aws/aws-sdk-go-v2/service/s3 v1.48.1 h1:5XNlsBsEvBZBMO6p82y+sqpWg8j5aBCe+5C2GBFgqBQ=
4646
github.com/aws/aws-sdk-go-v2/service/s3 v1.48.1/go.mod h1:4qXHrG1Ne3VGIMZPCB8OjH/pLFO94sKABIusjh0KWPU=
47+
github.com/aws/aws-sdk-go-v2/service/servicequotas v1.25.5 h1:jt3Uxl/IlqWyy5PcaUIVoPbdaDB5kVQ8osqJSOeebS8=
48+
github.com/aws/aws-sdk-go-v2/service/servicequotas v1.25.5/go.mod h1:3A0rDgx/TxFalmvJ9coSZNeOIpxg0z6esHdSVt6TdeM=
4749
github.com/aws/aws-sdk-go-v2/service/ssm v1.44.7 h1:a8HvP/+ew3tKwSXqL3BCSjiuicr+XTU2eFYeogV9GJE=
4850
github.com/aws/aws-sdk-go-v2/service/ssm v1.44.7/go.mod h1:Q7XIWsMo0JcMpI/6TGD6XXcXcV1DbTj6e9BKNntIMIM=
4951
github.com/aws/aws-sdk-go-v2/service/sso v1.18.7 h1:eajuO3nykDPdYicLlP3AGgOyVN3MOlFmZv7WGTuJPow=
@@ -52,8 +54,8 @@ github.com/aws/aws-sdk-go-v2/service/ssooidc v1.21.7 h1:QPMJf+Jw8E1l7zqhZmMlFw6w
5254
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.21.7/go.mod h1:ykf3COxYI0UJmxcfcxcVuz7b6uADi1FkiUz6Eb7AgM8=
5355
github.com/aws/aws-sdk-go-v2/service/sts v1.26.7 h1:NzO4Vrau795RkUdSHKEwiR01FaGzGOH1EETJ+5QHnm0=
5456
github.com/aws/aws-sdk-go-v2/service/sts v1.26.7/go.mod h1:6h2YuIoxaMSCFf5fi1EgZAwdfkGMgDY+DVfa61uLe4U=
55-
github.com/aws/smithy-go v1.20.2 h1:tbp628ireGtzcHDDmLT/6ADHidqnwgF57XOXZe6tp4Q=
56-
github.com/aws/smithy-go v1.20.2/go.mod h1:krry+ya/rV9RDcV/Q16kpu6ypI4K2czasz0NC3qS14E=
57+
github.com/aws/smithy-go v1.22.0 h1:uunKnWlcoL3zO7q+gG2Pk53joueEOsnNB28QdMsmiMM=
58+
github.com/aws/smithy-go v1.22.0/go.mod h1:irrKGvNn1InZwb2d7fkIRNucdfwR8R+Ts3wxYa/cJHg=
5759
github.com/awslabs/goformation/v7 v7.13.1 h1:QlPn8qwNCqYhrb4GW8kLjT4j1J49n5Qh/anpurCHxUA=
5860
github.com/awslabs/goformation/v7 v7.13.1/go.mod h1:FTCFMNesubEX0LAd6kIR+YkDD1U+5UaMbXtgPUgsck0=
5961
github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=

src/pkg/cli/client/byoc/aws/byoc.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,11 @@ func (b *ByocAws) Preview(ctx context.Context, req *defangv1.DeployRequest) (*de
185185
}
186186

187187
func (b *ByocAws) deploy(ctx context.Context, req *defangv1.DeployRequest, cmd string) (*defangv1.DeployResponse, error) {
188+
cfg, err := b.driver.LoadConfig(ctx)
189+
if err != nil {
190+
return nil, byoc.AnnotateAwsError(err)
191+
}
192+
188193
// If multiple Compose files were provided, req.Compose is the merged representation of all the files
189194
project, err := compose.LoadFromContent(ctx, req.Compose, "")
190195
if err != nil {
@@ -201,6 +206,11 @@ func (b *ByocAws) deploy(ctx context.Context, req *defangv1.DeployRequest, cmd s
201206
return nil, errors.New("maximum number of services reached")
202207
}
203208

209+
quotaClient = NewServiceQuotasClient(ctx, cfg)
210+
if err = ValidateGPUResources(ctx, project); err != nil {
211+
return nil, err
212+
}
213+
204214
serviceInfos := []*defangv1.ServiceInfo{}
205215
for _, service := range project.Services {
206216
serviceInfo, err := b.update(ctx, project.Name, req.DelegateDomain, service)
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
package aws
2+
3+
import (
4+
"context"
5+
"errors"
6+
"slices"
7+
8+
"github.com/aws/aws-sdk-go-v2/aws"
9+
"github.com/aws/aws-sdk-go-v2/service/servicequotas"
10+
composeTypes "github.com/compose-spec/compose-go/v2/types"
11+
)
12+
13+
// gpuQuotaCodes lists the Service Quotas quota codes that hasGPUQuota queries
// to decide whether the account may run GPU instances.
// These are the GPU quota codes from cd.
var gpuQuotaCodes = []string{"L-7212CCBC", "L-3819A6DF"}

// serviceCode is the Service Quotas service code sent with every quota lookup.
var serviceCode = "ec2"
17+
18+
type QuotaClientAPI interface {
19+
ListServiceQuotas(ctx context.Context, params *servicequotas.ListServiceQuotasInput, optFns ...func(*servicequotas.Options)) (*servicequotas.ListServiceQuotasOutput, error)
20+
}
21+
22+
var quotaClient QuotaClientAPI
23+
24+
var ErrAWSNoConnection = errors.New("no connect to AWS service quotas")
25+
var ErrGPUQuotaZero = errors.New("GPU quota is 0, no GPUs allowed")
26+
var ErrNoQuotasReceived = errors.New("no service quotas received")
27+
28+
func NewServiceQuotasClient(ctx context.Context, cfg aws.Config) *servicequotas.Client {
29+
return servicequotas.NewFromConfig(cfg)
30+
}
31+
32+
func hasGPUQuota(ctx context.Context) (bool, error) {
33+
if quotaClient == nil {
34+
return false, ErrAWSNoConnection
35+
}
36+
37+
var token *string
38+
for _, quotaCode := range gpuQuotaCodes {
39+
for {
40+
quotas, err := quotaClient.ListServiceQuotas(ctx, &servicequotas.ListServiceQuotasInput{
41+
ServiceCode: aws.String(serviceCode),
42+
QuotaCode: aws.String(quotaCode),
43+
NextToken: token,
44+
})
45+
if err != nil {
46+
return false, err
47+
}
48+
if len(quotas.Quotas) == 0 {
49+
return false, ErrNoQuotasReceived
50+
}
51+
52+
// the quota.Value is actually the number of CPUs, but since we only
53+
// alllocate GPU enabled instances, as soon as we know there
54+
// is a non-zero CPU instance we know that there is at least one GPU
55+
for _, quota := range quotas.Quotas {
56+
if *(quota.Value) > 0.0 {
57+
return true, nil
58+
}
59+
}
60+
61+
token = quotas.NextToken
62+
if token == nil {
63+
break
64+
}
65+
}
66+
}
67+
68+
// if we've reached this point, no GPU quota was found or all quotas were zero
69+
return false, nil
70+
}
71+
72+
func ValidateGPUResources(ctx context.Context, project *composeTypes.Project) error {
73+
// return after checking if there are actually non-zero GPUs requested
74+
hasGPUs, quotaErr := hasGPUQuota(ctx)
75+
76+
for _, service := range project.Services {
77+
if service.Deploy != nil &&
78+
service.Deploy.Resources.Reservations != nil {
79+
for _, device := range service.Deploy.Resources.Reservations.Devices {
80+
if slices.Contains(device.Capabilities, "gpu") {
81+
if device.Count == 0 {
82+
continue
83+
}
84+
85+
// if there was an error getting the quota
86+
if quotaErr != nil {
87+
return quotaErr
88+
}
89+
90+
if !hasGPUs {
91+
return ErrGPUQuotaZero
92+
}
93+
94+
break
95+
}
96+
}
97+
}
98+
}
99+
100+
return nil
101+
}

0 commit comments

Comments
 (0)