Skip to content

Commit 9f0b094

Browse files
committed
Add missing dependencies for vendor sdk tests and adjust dockerfile for local tests
Signed-off-by: Rodrigo Sampaio Vaz <rvaz@nvidia.com>
1 parent 4b8e899 commit 9f0b094

30 files changed

+1509
-213
lines changed

.dockerignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ build
77
**/*.test
88
**/*.out
99
**/*.log
10+
!third_party/fleet-intelligence-sdk/**/testdata/**/*.log
1011
**/node_modules
1112
deployments/packages
1213

.gitignore

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,18 @@ Thumbs.db
3939
# Log files
4040
*.log
4141
logs/
42+
!third_party/fleet-intelligence-sdk/components/accelerator/nvidia/fabric-manager/testdata/
43+
!third_party/fleet-intelligence-sdk/components/accelerator/nvidia/fabric-manager/testdata/**
44+
!third_party/fleet-intelligence-sdk/components/accelerator/nvidia/nccl/testdata/
45+
!third_party/fleet-intelligence-sdk/components/accelerator/nvidia/nccl/testdata/**
46+
!third_party/fleet-intelligence-sdk/components/accelerator/nvidia/xid/testdata/
47+
!third_party/fleet-intelligence-sdk/components/accelerator/nvidia/xid/testdata/**
48+
!third_party/fleet-intelligence-sdk/components/memory/testdata/
49+
!third_party/fleet-intelligence-sdk/components/memory/testdata/**
50+
!third_party/fleet-intelligence-sdk/components/os/testdata/
51+
!third_party/fleet-intelligence-sdk/components/os/testdata/**
52+
!third_party/fleet-intelligence-sdk/pkg/kmsg/testdata/
53+
!third_party/fleet-intelligence-sdk/pkg/kmsg/testdata/**
4254

4355
# Configuration files with secrets
4456
*.env
@@ -68,10 +80,14 @@ profile.out
6880
release/
6981
!third_party/fleet-intelligence-sdk/pkg/release/
7082
!third_party/fleet-intelligence-sdk/pkg/release/**
83+
!third_party/fleet-intelligence-sdk/cmd/gpud/release/
84+
!third_party/fleet-intelligence-sdk/cmd/gpud/release/**
7185

7286
# Local development
7387
local/
7488
dev/
89+
!third_party/fleet-intelligence-sdk/pkg/nvidia/dev/
90+
!third_party/fleet-intelligence-sdk/pkg/nvidia/dev/**
7591

7692
CLAUDE.md
7793
AGENTS.md

Dockerfile.citests

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,5 @@
11
# Dockerfile for running CI tests in a container.
22
#
3-
# Build (requires SSH agent for private modules, same as main Dockerfile):
4-
# eval "$(ssh-agent -s)" && ssh-add ~/.ssh/id_ed25519
5-
# docker build -f Dockerfile.citests --ssh default -t fleet-intelligence-agent:test .
63
#
74
# Default: run lint, tests, then vulnerability check (all must pass).
85
# docker run --rm fleet-intelligence-agent:test
@@ -11,6 +8,7 @@
118
# docker run --rm -v "$(pwd)/coverage:/src/coverage" fleet-intelligence-agent:test
129
#
1310
# Run only one check:
11+
# docker run --rm fleet-intelligence-agent:test sh -c "cd third_party/fleet-intelligence-sdk && go test ./..."
1412
# docker run --rm fleet-intelligence-agent:test make test
1513
# docker run --rm fleet-intelligence-agent:test make lint
1614
# docker run --rm fleet-intelligence-agent:test make vuln
@@ -20,6 +18,8 @@
2018
#
2119
FROM golang:1.24.13
2220

21+
ARG TARGETARCH
22+
2323
RUN apt-get update && apt-get install -y --no-install-recommends \
2424
ca-certificates \
2525
curl \
@@ -28,6 +28,23 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
2828
build-essential \
2929
&& rm -rf /var/lib/apt/lists/*
3030

31+
# Install DCGM runtime from NVIDIA repo.
32+
# - amd64: package is datacenter-gpu-manager in debian12/x86_64.
33+
# - arm64: package is datacenter-gpu-manager-4-core in debian12/sbsa.
34+
RUN if [ "${TARGETARCH}" = "amd64" ]; then \
35+
echo "deb [trusted=yes] https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64 /" > /etc/apt/sources.list.d/nvidia-cuda.list && \
36+
apt-get update && \
37+
apt-get install -y --no-install-recommends datacenter-gpu-manager && \
38+
rm -rf /var/lib/apt/lists/*; \
39+
elif [ "${TARGETARCH}" = "arm64" ]; then \
40+
echo "deb [trusted=yes] https://developer.download.nvidia.com/compute/cuda/repos/debian12/sbsa /" > /etc/apt/sources.list.d/nvidia-cuda.list && \
41+
apt-get update && \
42+
apt-get install -y --no-install-recommends datacenter-gpu-manager-4-core && \
43+
rm -rf /var/lib/apt/lists/*; \
44+
else \
45+
echo "Skipping DCGM install: unsupported TARGETARCH=${TARGETARCH}"; \
46+
fi
47+
3148
# Install golangci-lint (matches CI / make lint)
3249
RUN curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b /usr/local/bin
3350
# Install govulncheck up front (avoids runtime installs as non-root user)
@@ -42,4 +59,4 @@ ENV GOFLAGS=-trimpath
4259
COPY . .
4360

4461
# Default: run lint, tests, then vulnerability check (exit on first failure).
45-
CMD ["sh", "-c", "make lint && make test && make vuln"]
62+
CMD ["sh", "-c", "make lint && make test && make vuln && cd third_party/fleet-intelligence-sdk && go test ./..."]
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
// Package v1 is a stub for the pruned vendored SDK (client not used by fleet-intelligence-agent).
1+
// Package v1 provides the fleet-intelligence-sdk v1 client for the server.
22
package v1
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
package v1
2+
3+
import (
4+
"bytes"
5+
"context"
6+
"encoding/json"
7+
"errors"
8+
"fmt"
9+
"io"
10+
"net/http"
11+
"time"
12+
13+
"github.com/NVIDIA/fleet-intelligence-sdk/pkg/server"
14+
)
15+
16+
// ErrServerNotReady is returned by BlockUntilServerReady when none of its
// health check attempts succeeded.
var ErrServerNotReady = errors.New("server not ready, timeout waiting")
17+
18+
func CheckHealthz(ctx context.Context, addr string, opts ...OpOption) error {
19+
op := &Op{}
20+
if err := op.applyOpts(opts); err != nil {
21+
return err
22+
}
23+
24+
req, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("%s/healthz", addr), nil)
25+
if err != nil {
26+
return fmt.Errorf("failed to create request: %w", err)
27+
}
28+
29+
exp, err := json.Marshal(server.DefaultHealthz)
30+
if err != nil {
31+
return fmt.Errorf("failed to marshal expected healthz response: %w", err)
32+
}
33+
34+
return checkHealthz(createDefaultHTTPClient(), req, exp)
35+
}
36+
37+
// checkHealthz sends req with cli and verifies the health response.
//
// It returns an error when the request fails, when the status code is not
// 200, when the body cannot be read, or when the body is not byte-for-byte
// equal to exp. The response body is always closed before returning.
func checkHealthz(cli *http.Client, req *http.Request, exp []byte) error {
	resp, doErr := cli.Do(req)
	if doErr != nil {
		return fmt.Errorf("failed to make request to /healthz: %w", doErr)
	}
	// A close failure is deliberately ignored: the outcome is decided by the
	// status code and body checks below.
	defer func() { _ = resp.Body.Close() }()

	if code := resp.StatusCode; code != http.StatusOK {
		return fmt.Errorf("server not ready, response not 200")
	}

	body, readErr := io.ReadAll(resp.Body)
	switch {
	case readErr != nil:
		return fmt.Errorf("failed to read healthz response: %w", readErr)
	case !bytes.Equal(body, exp):
		return fmt.Errorf("unexpected healthz response: %s", string(body))
	default:
		return nil
	}
}
61+
62+
func BlockUntilServerReady(ctx context.Context, addr string, opts ...OpOption) error {
63+
op := &Op{}
64+
if err := op.applyOpts(opts); err != nil {
65+
return err
66+
}
67+
68+
req, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("%s/healthz", addr), nil)
69+
if err != nil {
70+
return fmt.Errorf("failed to create request: %w", err)
71+
}
72+
73+
exp, err := json.Marshal(server.DefaultHealthz)
74+
if err != nil {
75+
return fmt.Errorf("failed to marshal expected healthz response: %w", err)
76+
}
77+
78+
httpClient := createDefaultHTTPClient()
79+
80+
ticker := time.NewTicker(time.Second)
81+
defer ticker.Stop()
82+
83+
for range 30 {
84+
select {
85+
case <-ticker.C:
86+
if err := checkHealthz(httpClient, req, exp); err == nil {
87+
return nil
88+
}
89+
case <-ctx.Done():
90+
return fmt.Errorf("context done: %w", ctx.Err())
91+
}
92+
}
93+
return ErrServerNotReady
94+
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
package v1
2+
3+
import (
4+
"crypto/tls"
5+
"net/http"
6+
)
7+
8+
// createDefaultHTTPClient builds a new HTTP client on every call.
//
// The client honors the proxy settings from the environment and has TLS
// certificate verification disabled (InsecureSkipVerify). No client-level
// timeout is set.
func createDefaultHTTPClient() *http.Client {
	tlsCfg := &tls.Config{InsecureSkipVerify: true}
	transport := &http.Transport{
		Proxy:           http.ProxyFromEnvironment,
		TLSClientConfig: tlsCfg,
	}
	return &http.Client{Transport: transport}
}
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
package v1
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"fmt"
7+
"net/http"
8+
9+
apiv1 "github.com/NVIDIA/fleet-intelligence-sdk/api/v1"
10+
"github.com/NVIDIA/fleet-intelligence-sdk/pkg/server"
11+
)
12+
13+
func GetMachineInfo(ctx context.Context, addr string, opts ...OpOption) (*apiv1.MachineInfo, error) {
14+
op := &Op{}
15+
if err := op.applyOpts(opts); err != nil {
16+
return nil, err
17+
}
18+
19+
req, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("%s%s", addr, server.URLPathMachineInfo), nil)
20+
if err != nil {
21+
return nil, fmt.Errorf("failed to create request: %w", err)
22+
}
23+
24+
return getMachineInfo(createDefaultHTTPClient(), req)
25+
}
26+
27+
func getMachineInfo(cli *http.Client, req *http.Request) (*apiv1.MachineInfo, error) {
28+
resp, err := cli.Do(req)
29+
if err != nil {
30+
return nil, fmt.Errorf("failed to make request to %q: %w", req.URL, err)
31+
}
32+
defer func() {
33+
_ = resp.Body.Close()
34+
}()
35+
36+
if resp.StatusCode != http.StatusOK {
37+
return nil, fmt.Errorf("server not ready, response not 200")
38+
}
39+
40+
var info apiv1.MachineInfo
41+
if err := json.NewDecoder(resp.Body).Decode(&info); err != nil {
42+
return nil, fmt.Errorf("failed to decode machine info: %w", err)
43+
}
44+
45+
return &info, nil
46+
}
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
package v1
2+
3+
import "github.com/NVIDIA/fleet-intelligence-sdk/pkg/httputil"
4+
5+
// Op collects the settings produced by a list of OpOption values.
type Op struct {
	// requestContentType and requestAcceptEncoding hold the header values
	// selected by the content-type and accept-encoding options.
	requestContentType, requestAcceptEncoding string

	// components is used as a set of component names: WithComponent stores
	// each name as a key with a nil value.
	components map[string]any
}

// OpOption mutates an Op to apply a single setting.
type OpOption func(*Op)

// applyOpts runs every option against op, in the order given. It always
// returns nil.
func (op *Op) applyOpts(opts []OpOption) error {
	for i := range opts {
		opts[i](op)
	}
	return nil
}
19+
20+
// WithRequestContentTypeYAML sets the request content type to YAML.
21+
func WithRequestContentTypeYAML() OpOption {
22+
return func(op *Op) {
23+
op.requestContentType = httputil.RequestHeaderYAML
24+
}
25+
}
26+
27+
// WithRequestContentTypeJSON sets the request content type to JSON.
28+
func WithRequestContentTypeJSON() OpOption {
29+
return func(op *Op) {
30+
op.requestContentType = httputil.RequestHeaderJSON
31+
}
32+
}
33+
34+
// WithAcceptEncodingGzip requests gzip encoding for the response.
35+
func WithAcceptEncodingGzip() OpOption {
36+
return func(op *Op) {
37+
op.requestAcceptEncoding = httputil.RequestHeaderEncodingGzip
38+
}
39+
}
40+
41+
func WithComponent(component string) OpOption {
42+
return func(op *Op) {
43+
if op.components == nil {
44+
op.components = make(map[string]any)
45+
}
46+
op.components[component] = nil
47+
}
48+
}
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
package v1
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"fmt"
7+
"io"
8+
"net/http"
9+
10+
"github.com/NVIDIA/fleet-intelligence-sdk/pkg/gpud-manager/packages"
11+
)
12+
13+
// GetPackageStatus fetches the GPUd package status from the GPUd admin API.
14+
func GetPackageStatus(ctx context.Context, url string, opts ...OpOption) ([]packages.PackageStatus, error) {
15+
op := &Op{}
16+
if err := op.applyOpts(opts); err != nil {
17+
return nil, err
18+
}
19+
20+
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
21+
if err != nil {
22+
return nil, err
23+
}
24+
25+
resp, err := createDefaultHTTPClient().Do(req)
26+
if err != nil {
27+
return nil, err
28+
}
29+
defer func() {
30+
_ = resp.Body.Close()
31+
}()
32+
33+
if resp.StatusCode != http.StatusOK {
34+
return nil, fmt.Errorf("unexpected status code %v received", resp.StatusCode)
35+
}
36+
37+
rawBody, err := io.ReadAll(resp.Body)
38+
if err != nil {
39+
return nil, err
40+
}
41+
42+
var ret []packages.PackageStatus
43+
if err := json.Unmarshal(rawBody, &ret); err != nil {
44+
return nil, err
45+
}
46+
return ret, nil
47+
}

0 commit comments

Comments
 (0)