Skip to content

Commit bae5f31

Browse files
authored
Add DCGM info to the agent data. Upgrade golangci-lint (#42)
* Add DCGM info to the agent data. Upgrade golangci-lint * Update action * Fix lint
1 parent f6f12b6 commit bae5f31

File tree

9 files changed

+438
-61
lines changed

9 files changed

+438
-61
lines changed

.github/workflows/check.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ jobs:
2020
go-version: '1.22'
2121
cache: false
2222
- name: golangci-lint
23-
uses: golangci/golangci-lint-action@v3
23+
uses: golangci/golangci-lint-action@v8
2424
test:
2525
runs-on: ubuntu-latest
2626
steps:

.golangci.yaml

Lines changed: 63 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,58 +1,68 @@
1+
version: "2"
12
run:
2-
timeout: 30m # big timeout as github actions are sloow
33
allow-parallel-runners: true
44
linters:
5-
disable-all: true
5+
default: none
66
enable:
7-
- errcheck # Errcheck is a program for checking for unchecked errors in Go code. These unchecked errors can be critical bugs in some cases
8-
- gosimple # Linter for Go source code that specializes in simplifying code
9-
- govet # Vet examines Go source code and reports suspicious constructs, such as Printf calls whose arguments do not align with the format string.
10-
- ineffassign # Detects when assignments to existing variables are not used
11-
- staticcheck # It's a set of rules from staticcheck. It's not the same thing as the staticcheck binary. The author of staticcheck doesn't support or approve the use of staticcheck as a library inside golangci-lint.
12-
- typecheck # like the front-end of a Go compiler, parses and type-checks Go code
13-
- unused # Checks Go code for unused constants, variables, functions and types.
14-
- bodyclose # Checks whether HTTP response body is closed successfully.
15-
- decorder # Check declaration order and count of types, constants, variables and functions.
16-
- dogsled # Checks assignments with too many blank identifiers (e.g. x, , , _, := f()).
17-
- dupword # Checks for duplicate words in the source code
18-
- durationcheck # Check for two durations multiplied together.
19-
- errchkjson # Checks types passed to the json encoding functions. Reports unsupported types and optionally reports occasions, where the check for the returned error can be omitted.
20-
- errname # Checks that sentinel errors are prefixed with the Err and error types are suffixed with the Error.
21-
- errorlint # Errorlint is a linter for that can be used to find code that will cause problems with the error wrapping scheme introduced in Go 1.13.
7+
- bodyclose
228
- copyloopvar
23-
- gocheckcompilerdirectives # Checks that go compiler directive comments (//go:) are valid.
24-
- gocognit # Computes and checks the cognitive complexity of functions.
25-
- goconst # Finds repeated strings that could be replaced by a constant.
26-
- gocritic # Provides diagnostics that check for bugs, performance and style issues.
27-
- gocyclo # Computes and checks the cyclomatic complexity of functions.
28-
- gofmt # Gofmt checks whether code was gofmt-ed. By default this tool runs with -s option to check for code simplification.
29-
- importas # Enforces consistent import
30-
- interfacebloat # A linter that checks the number of methods inside an interface.
31-
- lll # Reports long lines.
32-
- makezero # Finds slice declarations with non-zero initial length.
33-
- misspell # Finds commonly misspelled English words in comments.
34-
- nakedret # Finds naked returns in functions greater than a specified function length.
35-
- nestif # Reports deeply nested if statements.
36-
- noctx # Noctx finds sending http request without context.Context.
37-
- prealloc # Finds slice declarations that could potentially be pre-allocated.
38-
- predeclared # Find code that shadows one of Go's predeclared identifiers.
39-
- reassign # Checks that package variables are not reassigned.
40-
- tenv # Tenv is analyzer that detects using os.Setenv instead of t.Setenv since Go1.17.
41-
- unconvert # Remove unnecessary type conversions.
42-
- unparam # Reports unused function parameters.
43-
- usestdlibvars # A linter that detect the possibility to use variables/constants from the Go standard library.
44-
- whitespace # Tool for detection of leading and trailing whitespace.
45-
linters-settings:
46-
gocognit:
47-
min-complexity: 30
48-
govet:
49-
# report about shadowed variables
50-
check-shadowing: false
51-
lll:
52-
# Max line length, lines longer will be reported.
53-
# '\t' is counted as 1 character by default, and can be changed with the tab-width option.
54-
# Default: 120.
55-
line-length: 200
56-
gocritic:
57-
disabled-checks:
58-
- ifElseChain
9+
- decorder
10+
- dogsled
11+
- dupword
12+
- durationcheck
13+
- errcheck
14+
- errchkjson
15+
- errname
16+
- errorlint
17+
- gocheckcompilerdirectives
18+
- gocognit
19+
- goconst
20+
- gocritic
21+
- gocyclo
22+
- govet
23+
- importas
24+
- ineffassign
25+
- interfacebloat
26+
- lll
27+
- makezero
28+
- misspell
29+
- nakedret
30+
- nestif
31+
- noctx
32+
- prealloc
33+
- predeclared
34+
- reassign
35+
- staticcheck
36+
- unconvert
37+
- unparam
38+
- unused
39+
- usestdlibvars
40+
- whitespace
41+
settings:
42+
gocognit:
43+
min-complexity: 30
44+
gocritic:
45+
disabled-checks:
46+
- ifElseChain
47+
lll:
48+
line-length: 200
49+
exclusions:
50+
generated: lax
51+
presets:
52+
- comments
53+
- common-false-positives
54+
- legacy
55+
- std-error-handling
56+
paths:
57+
- third_party$
58+
- builtin$
59+
- examples$
60+
formatters:
61+
enable:
62+
- gofmt
63+
exclusions:
64+
generated: lax
65+
paths:
66+
- third_party$
67+
- builtin$
68+
- examples$

cmd/nebius-observability-agent-updater/main.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"github.com/nebius/nebius-observability-agent-updater/internal/application"
88
"github.com/nebius/nebius-observability-agent-updater/internal/client"
99
"github.com/nebius/nebius-observability-agent-updater/internal/config"
10+
"github.com/nebius/nebius-observability-agent-updater/internal/dcgm"
1011
"github.com/nebius/nebius-observability-agent-updater/internal/loggerhelper"
1112
"github.com/nebius/nebius-observability-agent-updater/internal/metadata"
1213
"github.com/nebius/nebius-observability-agent-updater/internal/osutils"
@@ -32,7 +33,8 @@ func main() {
3233
logger := loggerhelper.InitLogger(&cfg.Logger)
3334
metadataReader := metadata.NewReader(cfg.Metadata, logger)
3435
oh := osutils.NewOsHelper()
35-
cli, err := client.New(metadataReader, oh, cfg, logger, metadataReader.GetIamToken)
36+
dh := dcgm.NewDcgmHelper()
37+
cli, err := client.New(metadataReader, oh, dh, cfg, logger, metadataReader.GetIamToken)
3638
if err != nil {
3739
logger.Error("failed to create client", "error", err)
3840
defer syscall.Exit(1)

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ toolchain go1.23.3
66

77
require (
88
github.com/cenkalti/backoff/v4 v4.3.0
9-
github.com/nebius/gosdk v0.0.0-20250321225023-d65cca70d9a3
9+
github.com/nebius/gosdk v0.0.0-20250505140419-9e3daa7d020f
1010
github.com/shirou/gopsutil/v3 v3.24.5
1111
github.com/stretchr/testify v1.9.0
1212
go.uber.org/goleak v1.3.0

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
1414
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
1515
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4=
1616
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I=
17-
github.com/nebius/gosdk v0.0.0-20250321225023-d65cca70d9a3 h1:v5ET1Sv93c8y6+o3/GTGFfJiAag/LYIVdLJDGRP1z4I=
18-
github.com/nebius/gosdk v0.0.0-20250321225023-d65cca70d9a3/go.mod h1:d0iaJFMQofs+hV0K25+8wVvMHdNxPmxMbIYfhOpZAt8=
17+
github.com/nebius/gosdk v0.0.0-20250505140419-9e3daa7d020f h1:gPkLTbF+zYVsbid0drTulDbiPbcbKcVGKMdzlgJW/cY=
18+
github.com/nebius/gosdk v0.0.0-20250505140419-9e3daa7d020f/go.mod h1:d0iaJFMQofs+hV0K25+8wVvMHdNxPmxMbIYfhOpZAt8=
1919
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
2020
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
2121
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw=

internal/client/client.go

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@ type oshelper interface {
4242
GetLastLogs(serviceName string, lines int) (string, error)
4343
}
4444

45+
type dcgmhelper interface {
46+
GetDCGMVersion() (string, error)
47+
GetGpuInfo() (model string, number int, err error)
48+
}
49+
4550
const (
4651
ENDPOINT_ENV = "NEBIUS_OBSERVABILITY_AGENT_UPDATER_ENDPOINT"
4752
UserAgent = "nebius-observability-agent-updater"
@@ -58,11 +63,12 @@ type Client struct {
5863
client agentmanager.VersionServiceClient
5964
logger *slog.Logger
6065
oh oshelper
66+
dh dcgmhelper
6167
retryBackoff backoff.BackOff
6268
getTokenCallback func() (string, error)
6369
}
6470

65-
func New(metadata metadataReader, oh oshelper, config *config.Config, logger *slog.Logger, getTokenCallback func() (string, error)) (*Client, error) {
71+
func New(metadata metadataReader, oh oshelper, dh dcgmhelper, config *config.Config, logger *slog.Logger, getTokenCallback func() (string, error)) (*Client, error) {
6672
if config.GRPC.Endpoint == "" {
6773
endpoint := os.Getenv(ENDPOINT_ENV)
6874
if endpoint == "" {
@@ -96,6 +102,7 @@ func New(metadata metadataReader, oh oshelper, config *config.Config, logger *sl
96102
client: client,
97103
logger: logger,
98104
oh: oh,
105+
dh: dh,
99106
retryBackoff: getRetryBackoff(config.GRPC.Retry),
100107
getTokenCallback: getTokenCallback,
101108
}, nil
@@ -321,5 +328,20 @@ func (s *Client) fillRequest(agent agents.AgentData) *agentmanager.GetVersionReq
321328
req.CloudInitStatus = cloudInitStatus
322329
}
323330

331+
dcgmVersion, err := s.dh.GetDCGMVersion()
332+
if err != nil {
333+
s.logger.Error("failed to get DCGM version", "error", err)
334+
} else {
335+
req.DcgmVersion = dcgmVersion
336+
}
337+
338+
gpuModel, gpuNumber, err := s.dh.GetGpuInfo()
339+
if err != nil {
340+
s.logger.Error("failed to get GPU info", "error", err)
341+
} else {
342+
req.GpuModel = gpuModel
343+
req.GpuNumber = int32(gpuNumber)
344+
}
345+
324346
return &req
325347
}

internal/client/client_test.go

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,20 @@ func (m *mockOSHelper) GetLastLogs(string, int) (string, error) {
9292
return "logs", nil
9393
}
9494

95+
type mockDcgmHelper struct {
96+
mock.Mock
97+
}
98+
99+
func (m *mockDcgmHelper) GetDCGMVersion() (string, error) {
100+
args := m.Called()
101+
return args.String(0), args.Error(1)
102+
}
103+
104+
func (m *mockDcgmHelper) GetGpuInfo() (string, int, error) {
105+
args := m.Called()
106+
return args.String(0), args.Int(1), args.Error(2)
107+
}
108+
95109
type mockVersionServiceClient struct {
96110
mock.Mock
97111
}
@@ -154,8 +168,8 @@ func TestNew(t *testing.T) {
154168
Timeout: 5 * time.Second,
155169
},
156170
}
157-
158-
client, err := New(metadata, oh, &cfg, nil, tokenFunc)
171+
dh := &mockDcgmHelper{}
172+
client, err := New(metadata, oh, dh, &cfg, nil, tokenFunc)
159173
assert.NoError(t, err)
160174
assert.NotNil(t, client)
161175
assert.NotNil(t, client.conn)
@@ -165,11 +179,13 @@ func TestNew(t *testing.T) {
165179
func TestSendAgentData(t *testing.T) {
166180
metadata := &mockMetadataReader{}
167181
oh := &mockOSHelper{}
182+
dh := &mockDcgmHelper{}
168183
mockClient := &mockVersionServiceClient{}
169184

170185
client := &Client{
171186
metadata: metadata,
172187
oh: oh,
188+
dh: dh,
173189
client: mockClient,
174190
retryBackoff: getRetryBackoff(clientconfig.GetDefaultRetryConfig()),
175191
config: &config.Config{
@@ -191,6 +207,9 @@ func TestSendAgentData(t *testing.T) {
191207
oh.On("GetArch").Return("x86_64", nil)
192208
oh.On("GetMk8sClusterId").Return("abcd", nil)
193209

210+
dh.On("GetDCGMVersion").Return("3.3.7", nil)
211+
dh.On("GetGpuInfo").Return("NVIDIA H200", 2, nil)
212+
194213
expectedResponse := &agentmanager.GetVersionResponse{
195214
Action: agentmanager.Action_NOP,
196215
}
@@ -219,10 +238,12 @@ func TestSendAgentData(t *testing.T) {
219238
func TestFillRequest(t *testing.T) {
220239
metadata := &mockMetadataReader{}
221240
oh := &mockOSHelper{}
241+
dh := &mockDcgmHelper{}
222242

223243
client := &Client{
224244
metadata: metadata,
225245
oh: oh,
246+
dh: dh,
226247
retryBackoff: getRetryBackoff(clientconfig.GetDefaultRetryConfig()),
227248
logger: slog.New(slog.NewTextHandler(os.Stdout, nil)),
228249
config: &config.Config{},
@@ -239,6 +260,9 @@ func TestFillRequest(t *testing.T) {
239260
oh.On("GetArch").Return("x86_64", nil)
240261
oh.On("GetMk8sClusterId", mock.Anything).Return("abcd", nil)
241262

263+
dh.On("GetDCGMVersion").Return("3.3.7", nil)
264+
dh.On("GetGpuInfo").Return("NVIDIA H200", 2, nil)
265+
242266
// Create mock health check response using the correct structure
243267
checkStatuses := map[string]healthcheck.CheckStatus{
244268
"process": {
@@ -294,6 +318,9 @@ func TestFillRequest(t *testing.T) {
294318
assert.Equal(t, durationpb.New(1*time.Hour), req.SystemUptime)
295319
assert.Equal(t, "some-error", req.LastUpdateError)
296320
assert.Equal(t, "abcd", req.Mk8SClusterId)
321+
assert.Equal(t, "3.3.7", req.DcgmVersion)
322+
assert.Equal(t, "NVIDIA H200", req.GpuModel)
323+
assert.Equal(t, int32(2), req.GpuNumber)
297324

298325
// Verify module health statuses
299326
assert.NotNil(t, req.ModulesHealth)
@@ -313,11 +340,13 @@ func TestFillRequest(t *testing.T) {
313340
func TestSendAgentDataWithRetry(t *testing.T) {
314341
metadata := &mockMetadataReader{}
315342
oh := &mockOSHelper{}
343+
dh := &mockDcgmHelper{}
316344
mockClient := &mockVersionServiceClient{}
317345

318346
client := &Client{
319347
metadata: metadata,
320348
oh: oh,
349+
dh: dh,
321350
client: mockClient,
322351
config: &config.Config{
323352
GRPC: clientconfig.GRPCConfig{
@@ -344,6 +373,8 @@ func TestSendAgentDataWithRetry(t *testing.T) {
344373
oh.On("GetArch").Return("x86_64", nil)
345374
oh.On("GetMk8sClusterId").Return("abcd", nil)
346375

376+
dh.On("GetDCGMVersion").Return("3.3.7", nil)
377+
dh.On("GetGpuInfo").Return("NVIDIA H200", 2, nil)
347378
expectedResponse := &agentmanager.GetVersionResponse{
348379
Action: agentmanager.Action_NOP,
349380
}
@@ -381,11 +412,13 @@ func TestSendAgentDataWithRetry(t *testing.T) {
381412
func TestSendAgentDataWithRetryFailure(t *testing.T) {
382413
metadata := &mockMetadataReader{}
383414
oh := &mockOSHelper{}
415+
dh := &mockDcgmHelper{}
384416
mockClient := &mockVersionServiceClient{}
385417

386418
client := &Client{
387419
metadata: metadata,
388420
oh: oh,
421+
dh: dh,
389422
client: mockClient,
390423
config: &config.Config{
391424
GRPC: clientconfig.GRPCConfig{
@@ -414,6 +447,9 @@ func TestSendAgentDataWithRetryFailure(t *testing.T) {
414447
oh.On("GetArch").Return("x86_64", nil)
415448
oh.On("GetMk8sClusterId").Return("abcd", nil)
416449

450+
dh.On("GetDCGMVersion").Return("3.3.7", nil)
451+
dh.On("GetGpuInfo").Return("NVIDIA H200", 2, nil)
452+
417453
// Simulate continuous failures
418454
mockClient.On("GetVersion", mock.Anything, mock.Anything, mock.Anything).
419455
Return(nil, status.Error(codes.Unavailable, "Service unavailable")).Times(4)
@@ -444,10 +480,12 @@ func TestSendAgentDataWithRetryFailure(t *testing.T) {
444480
func TestFillRequestDebNotFound(t *testing.T) {
445481
metadata := &mockMetadataReader{}
446482
oh := &mockOSHelper{}
483+
dh := &mockDcgmHelper{}
447484

448485
client := &Client{
449486
metadata: metadata,
450487
oh: oh,
488+
dh: dh,
451489
logger: slog.New(slog.NewTextHandler(os.Stdout, nil)),
452490
retryBackoff: getRetryBackoff(clientconfig.GetDefaultRetryConfig()),
453491
config: &config.Config{},
@@ -472,6 +510,9 @@ func TestFillRequestDebNotFound(t *testing.T) {
472510
oh.On("GetSystemUptime").Return(1*time.Hour, nil)
473511
oh.On("GetMk8sClusterId").Return("abcd", nil)
474512

513+
dh.On("GetDCGMVersion").Return("3.3.7", nil)
514+
dh.On("GetGpuInfo").Return("NVIDIA H200", 2, nil)
515+
475516
req := client.fillRequest(agentData)
476517

477518
assert.NotNil(t, req)

0 commit comments

Comments
 (0)