Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
884f4f5
feat(gpu): enhance DGX Spark support and update GPU type handling
eball Jan 22, 2026
9194c0c
feat(amdgpu): refactor AMD GPU detection and support for GB10 chip an…
eball Jan 26, 2026
1102751
feat(connector): enhance GB10 chip detection with environment variabl…
eball Jan 28, 2026
dee6474
feat(gpu): enhance DGX Spark support and update GPU type handling
eball Jan 22, 2026
f33a1f1
feat(amdgpu): refactor AMD GPU detection and support for GB10 chip an…
eball Jan 26, 2026
979cd37
feat(connector): enhance GB10 chip detection with environment variabl…
eball Jan 28, 2026
5159ecb
feat: add nvidia device plugin for gb10
dkeven Jan 28, 2026
7037a7a
Merge branch 'cli/feat/install_on_spark' of github.com:beclab/olares …
eball Feb 2, 2026
9c0b2d4
Merge branch 'main' into cli/feat/install_on_spark
eball Feb 2, 2026
4004acf
Merge commit '375dfceacbdae1430a27929525cbabd9b5991d38' into cli/feat…
eball Feb 2, 2026
0fa4f29
fix(gpu): update pod selector for hami-device-plugin based on GB10 ch…
eball Feb 3, 2026
997c9d4
Merge branch 'main' into cli/feat/install_on_spark
eball Feb 3, 2026
90408a9
Merge branch 'main' into cli/feat/install_on_spark
eball Feb 5, 2026
0776188
feat: enable CGO for building on ARM architecture and adjust build co…
eball Feb 5, 2026
4beb697
feat: enhance multi-architecture support for ARM64 in release workflow
eball Feb 5, 2026
b79f36d
feat: update multi-arch setup for ARM64 in release workflow
eball Feb 5, 2026
8904168
feat: enhance ARM64 multi-architecture support in release workflow
eball Feb 5, 2026
989f540
feat: streamline ARM64 cross-compilation setup in release workflow
eball Feb 5, 2026
9d59d56
feat: enhance ARM64 support by adding architecture-specific package i…
eball Feb 5, 2026
3f9e38f
feat: update ARM64 package sources in release workflow for improved c…
eball Feb 5, 2026
5bd7561
Merge commit '5109ad001c62c59e049c36b6e3ee9c1a59e3c96e' into cli/feat…
eball Feb 5, 2026
06b3c44
feat: amd device plugin and container toolkit install
hysyeah Feb 10, 2026
b378a09
refactor: remove GB10 chip type check from GPU info update
eball Feb 10, 2026
9dd0a27
Merge branch 'main' into cli/feat/install_on_spark
eball Feb 10, 2026
1c971d8
feat(gpu): update hami version to v2.6.10-compatible for spark
dkeven Feb 10, 2026
8f3744f
fix: remove gb10 device plugin checking
eball Feb 11, 2026
0edae50
Merge branch 'cli/feat/install_on_spark' of github.com:beclab/olares …
eball Feb 11, 2026
45428fd
fix: update klauspost/cpuid to v2.3.0
eball Feb 11, 2026
476aee0
fix: amd gpu check (#2522)
hysyeah Feb 11, 2026
1aa3e2d
Merge branch 'cli/feat/install_on_spark' of github.com:beclab/olares …
eball Feb 12, 2026
ffaedaf
Merge branch 'main' into cli/feat/install_on_spark
eball Feb 12, 2026
0bb131a
feat: enhance storage device detection with USB serial properties
eball Feb 12, 2026
5855b8a
Merge branch 'main' into cli/feat/install_on_spark
eball Feb 13, 2026
c00eec3
feat: update hami version to v2.6.11-compatible-arm
dkeven Feb 13, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 121 additions & 0 deletions cli/pkg/amdgpu/module.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
package amdgpu

import (
"time"

"github.com/beclab/Olares/cli/pkg/common"
"github.com/beclab/Olares/cli/pkg/core/prepare"
"github.com/beclab/Olares/cli/pkg/core/task"
)

// InstallAmdContainerToolkitModule is the installer module whose Init registers
// the tasks that configure the AMD container toolkit APT source, install the
// toolkit package, and generate and validate the AMD CDI spec.
type InstallAmdContainerToolkitModule struct {
common.KubeModule
Skip bool // when true, IsSkip reports true and Init registers no tasks
SkipRocmCheck bool // when true, Init leaves the RocmInstalled prepare check out of the tasks
}

// IsSkip reports whether the module is disabled; it returns the Skip field.
func (m *InstallAmdContainerToolkitModule) IsSkip() bool {
return m.Skip
}

// Init sets the module name and, unless IsSkip reports true, registers three
// tasks in this order: configure the toolkit APT source, install the toolkit
// package, then generate and validate the CDI spec. Each task targets the
// hosts holding the master role, has Parallel disabled and Retry set to 1.
func (m *InstallAmdContainerToolkitModule) Init() {
	m.Name = "InstallAmdContainerToolkit"
	if m.IsSkip() {
		return
	}

	// A single prepare collection is shared (by pointer) by all three tasks.
	// It holds the RocmInstalled check unless SkipRocmCheck is set.
	checks := prepare.PrepareCollection{}
	if !m.SkipRocmCheck {
		checks = append(checks, new(RocmInstalled))
	}

	m.Tasks = []task.Interface{
		&task.RemoteTask{
			Name:     "UpdateAmdContainerToolkitSource",
			Hosts:    m.Runtime.GetHostsByRole(common.Master),
			Action:   new(UpdateAmdContainerToolkitSource),
			Prepare:  &checks,
			Parallel: false,
			Retry:    1,
		},
		&task.RemoteTask{
			Name:     "InstallAmdContainerToolkit",
			Hosts:    m.Runtime.GetHostsByRole(common.Master),
			Prepare:  &checks,
			Action:   new(InstallAmdContainerToolkit),
			Parallel: false,
			Retry:    1,
		},
		&task.RemoteTask{
			Name:     "GenerateAndValidateAmdCDI",
			Hosts:    m.Runtime.GetHostsByRole(common.Master),
			Prepare:  &checks,
			Action:   new(GenerateAndValidateAmdCDI),
			Parallel: false,
			Retry:    1,
		},
	}
}

// InstallAmdPluginModule is the installer module whose Init registers the tasks
// that update the node's AMD GPU labels, apply the AMD GPU device plugin
// manifest, and check that the device plugin reports Running.
type InstallAmdPluginModule struct {
common.KubeModule
Skip bool // value returned by IsSkip; presumably derived from GPU enablement by the caller — confirm
}

// IsSkip reports whether the module is disabled; it returns the Skip field.
func (m *InstallAmdPluginModule) IsSkip() bool {
return m.Skip
}

// Init sets the module name and registers three tasks in this order: update
// the node's AMD GPU labels, apply the device plugin manifest, and check the
// device plugin state. Every task targets the hosts holding the master role
// and is gated by the OnlyFirstMaster prepare check.
func (m *InstallAmdPluginModule) Init() {
	m.Name = "InstallAmdPlugin"

	m.Tasks = []task.Interface{
		// Label the node with AMD GPU information.
		&task.RemoteTask{
			Name:  "UpdateNodeAmdGPUInfo",
			Hosts: m.Runtime.GetHostsByRole(common.Master),
			Prepare: &prepare.PrepareCollection{
				new(common.OnlyFirstMaster),
			},
			Action:   new(UpdateNodeAmdGPUInfo),
			Parallel: false,
			Retry:    1,
		},
		// Apply the device plugin manifest.
		&task.RemoteTask{
			Name:  "InstallAmdPlugin",
			Hosts: m.Runtime.GetHostsByRole(common.Master),
			Prepare: &prepare.PrepareCollection{
				new(common.OnlyFirstMaster),
			},
			Action:   new(InstallAmdPlugin),
			Parallel: false,
			Retry:    1,
		},
		// Check the plugin state. This is the only task that also requires
		// RocmInstalled, and the only one configured with Retry 50 and a
		// 10 second Delay.
		&task.RemoteTask{
			Name:  "CheckAmdGPUState",
			Hosts: m.Runtime.GetHostsByRole(common.Master),
			Prepare: &prepare.PrepareCollection{
				new(common.OnlyFirstMaster),
				new(RocmInstalled),
			},
			Action:   new(CheckAmdGpuStatus),
			Parallel: false,
			Retry:    50,
			Delay:    10 * time.Second,
		},
	}
}
56 changes: 56 additions & 0 deletions cli/pkg/amdgpu/prepares.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package amdgpu

import (
"github.com/beclab/Olares/cli/pkg/bootstrap/precheck"
"github.com/beclab/Olares/cli/pkg/common"
"github.com/beclab/Olares/cli/pkg/core/connector"
"github.com/beclab/Olares/cli/pkg/core/logger"
)

// RocmInstalled is a prepare check that passes when a ROCm version can be
// detected through connector.RocmVersion.
type RocmInstalled struct {
	common.KubePrepare
}

// PreCheck returns true when connector.RocmVersion yields a non-nil version.
// A detection error is logged at debug level and reported as "not installed"
// (false, nil); this method never returns a non-nil error.
func (p *RocmInstalled) PreCheck(runtime connector.Runtime) (bool, error) {
	version, err := connector.RocmVersion()
	switch {
	case err != nil:
		logger.Debugf("ROCm version check error: %v", err)
		return false, nil
	case version == nil:
		return false, nil
	}

	logger.Infof("Detected ROCm version: %s", version.Original())
	return true, nil
}

// RocmNotInstalled is the inverse of RocmInstalled: it passes when no ROCm
// version is detected. It embeds common.KubePrepare directly and again
// through the embedded RocmInstalled.
type RocmNotInstalled struct {
	common.KubePrepare
	RocmInstalled
}

// PreCheck delegates to the embedded RocmInstalled check and negates its
// result. If the delegate returns an error, that error is returned together
// with false.
func (p *RocmNotInstalled) PreCheck(runtime connector.Runtime) (bool, error) {
	installed, err := p.RocmInstalled.PreCheck(runtime)
	return err == nil && !installed, err
}

// ContainerdInstalled is a prepare check that passes when
// precheck.ConflictingContainerdCheck reports an error for the host.
type ContainerdInstalled struct {
	common.KubePrepare
}

// PreCheck runs precheck.ConflictingContainerdCheck and treats any error it
// returns as "containerd is installed" (true, nil). When the check returns no
// error, an info message is logged and (false, nil) is returned. This method
// never returns a non-nil error.
func (p *ContainerdInstalled) PreCheck(runtime connector.Runtime) (bool, error) {
	var conflict precheck.ConflictingContainerdCheck
	if conflict.Check(runtime) == nil {
		logger.Info("containerd is not installed, ignore task")
		return false, nil
	}
	return true, nil
}
173 changes: 168 additions & 5 deletions cli/pkg/amdgpu/tasks.go
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
package amdgpu

import (
"context"
"fmt"
"os/exec"
"path"
"path/filepath"
"strings"

"github.com/beclab/Olares/cli/pkg/clientset"
"github.com/beclab/Olares/cli/pkg/common"
cc "github.com/beclab/Olares/cli/pkg/core/common"
"github.com/beclab/Olares/cli/pkg/core/connector"
"github.com/beclab/Olares/cli/pkg/core/logger"
"github.com/beclab/Olares/cli/pkg/core/task"
"github.com/beclab/Olares/cli/pkg/utils"
"github.com/beclab/Olares/cli/pkg/core/util"
"github.com/beclab/Olares/cli/pkg/gpu"

"github.com/Masterminds/semver/v3"
"github.com/pkg/errors"
Expand All @@ -26,8 +29,8 @@ func (m *InstallAmdRocmModule) Init() {
m.Name = "InstallAMDGPU"

installAmd := &task.RemoteTask{
Name: "InstallAmdRocm",
Hosts: m.Runtime.GetHostsByRole(common.Master),
Name: "InstallAmdRocm",
Hosts: m.Runtime.GetHostsByRole(common.Master),
Action: &InstallAmdRocm{
// no manifest needed
},
Expand All @@ -51,15 +54,15 @@ func (t *InstallAmdRocm) Execute(runtime connector.Runtime) error {
return nil
}

amdGPUExists, err := utils.HasAmdIGPU(runtime)
amdGPUExists, err := connector.HasAmdAPUOrGPU(runtime)
if err != nil {
return err
}
// skip rocm install
if !amdGPUExists {
return nil
}
rocmV, _ := utils.RocmVersion()
rocmV, _ := connector.RocmVersion()
min := semver.MustParse("7.1.1")
if rocmV != nil && rocmV.LessThan(min) {
return fmt.Errorf("detected ROCm version %s, which is lower than required %s; please uninstall existing ROCm/AMDGPU components before installation with command: olares-cli amdgpu uninstall", rocmV.Original(), min.Original())
Expand Down Expand Up @@ -131,3 +134,163 @@ func (t *AmdgpuUninstallAction) Execute(runtime connector.Runtime) error {
logger.Warn("Warning: Please reboot your machine after uninstall to fully remove ROCm components.")
return nil
}

// UpdateAmdContainerToolkitSource configures the AMD container toolkit APT repository.
type UpdateAmdContainerToolkitSource struct {
	common.KubeAction
}

// Execute registers the AMD container toolkit APT repository on the host.
//
// It maps the host's Ubuntu release to an APT codename (24.04 -> noble,
// 22.04 -> jammy), installs wget and gnupg2, creates /etc/apt/keyrings,
// stores the dearmored ROCm signing key there, and writes
// /etc/apt/sources.list.d/amd-container-toolkit.list.
//
// It returns an error for any other release, or when one of the commands
// fails.
func (t *UpdateAmdContainerToolkitSource) Execute(runtime connector.Runtime) error {
	// Resolve the codename first so that an unsupported host is rejected
	// before any package, key or source file is touched.
	si := runtime.GetSystemInfo()
	var ubuntuCodename string
	switch {
	case si.IsUbuntuVersionEqual(connector.Ubuntu2404):
		ubuntuCodename = "noble"
	case si.IsUbuntuVersionEqual(connector.Ubuntu2204):
		ubuntuCodename = "jammy"
	default:
		return fmt.Errorf("unsupported Ubuntu version for AMD container toolkit")
	}

	// Install prerequisites
	if _, err := runtime.GetRunner().SudoCmd("apt update && apt install -y wget gnupg2", false, true); err != nil {
		return errors.Wrap(errors.WithStack(err), "failed to install prerequisites for AMD container toolkit")
	}

	if _, err := runtime.GetRunner().SudoCmd("install -d -m 0755 /etc/apt/keyrings", false, true); err != nil {
		return errors.Wrap(errors.WithStack(err), "failed to create /etc/apt/keyrings directory")
	}

	// NOTE(review): a shell pipeline normally reports only the status of its
	// last command (tee), so a failed download may go unnoticed here; consider
	// enabling pipefail if the runner's shell supports it — confirm.
	cmd := "wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | tee /etc/apt/keyrings/rocm.gpg > /dev/null"
	if _, err := runtime.GetRunner().SudoCmd(cmd, false, true); err != nil {
		return errors.Wrap(errors.WithStack(err), "failed to download and install AMD ROCm GPG key")
	}

	aptSourceLine := fmt.Sprintf("deb [signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amd-container-toolkit/apt/ %s main", ubuntuCodename)
	cmd = fmt.Sprintf("echo '%s' > /etc/apt/sources.list.d/amd-container-toolkit.list", aptSourceLine)
	if _, err := runtime.GetRunner().SudoCmd(cmd, false, true); err != nil {
		return errors.Wrap(errors.WithStack(err), "failed to add AMD container toolkit APT source")
	}

	logger.Infof("AMD container toolkit repository configured successfully")
	return nil
}

// InstallAmdContainerToolkit is the action that installs the
// amd-container-toolkit package through apt.
type InstallAmdContainerToolkit struct {
	common.KubeAction
}

// Execute refreshes the APT index and installs amd-container-toolkit via the
// runner's SudoCmd, logging before and after. A command failure is returned
// wrapped with context.
func (t *InstallAmdContainerToolkit) Execute(runtime connector.Runtime) error {
	const installCmd = "apt update && apt install -y amd-container-toolkit"

	logger.Infof("Installing AMD container toolkit...")
	_, err := runtime.GetRunner().SudoCmd(installCmd, false, true)
	if err != nil {
		return errors.Wrap(errors.WithStack(err), "failed to install AMD container toolkit")
	}

	logger.Infof("AMD container toolkit installed successfully")
	return nil
}

// GenerateAndValidateAmdCDI is the action that writes the AMD CDI spec to
// /etc/cdi/amd.json with amd-ctk and then validates it.
type GenerateAndValidateAmdCDI struct {
	common.KubeAction
}

// Execute creates /etc/cdi, runs "amd-ctk cdi generate" to write the spec and
// "amd-ctk cdi validate" to check it, stopping at the first failing command
// and returning its error wrapped with context.
func (t *GenerateAndValidateAmdCDI) Execute(runtime connector.Runtime) error {
	const (
		makeDirCmd  = "install -d -m 0755 /etc/cdi"
		generateCmd = "amd-ctk cdi generate --output=/etc/cdi/amd.json"
		validateCmd = "amd-ctk cdi validate --path=/etc/cdi/amd.json"
	)

	// The spec directory must exist before amd-ctk writes into it.
	_, err := runtime.GetRunner().SudoCmd(makeDirCmd, false, true)
	if err != nil {
		return errors.Wrap(errors.WithStack(err), "failed to create /etc/cdi directory")
	}

	logger.Infof("Generating AMD CDI spec...")
	_, err = runtime.GetRunner().SudoCmd(generateCmd, false, true)
	if err != nil {
		return errors.Wrap(errors.WithStack(err), "failed to generate AMD CDI spec")
	}

	logger.Infof("Validating AMD CDI spec...")
	_, err = runtime.GetRunner().SudoCmd(validateCmd, false, true)
	if err != nil {
		return errors.Wrap(errors.WithStack(err), "failed to validate AMD CDI spec")
	}

	logger.Infof("AMD CDI spec generated and validated successfully")
	return nil
}

// UpdateNodeAmdGPUInfo updates Kubernetes node labels with AMD GPU information.
type UpdateNodeAmdGPUInfo struct {
	common.KubeAction
}

// Execute labels the node with the detected ROCm version and AMD card type.
//
// It is a no-op (returns nil) when no AMD GPU/APU is detected or when no ROCm
// version can be determined. It returns an error when the Kubernetes client
// cannot be created, when GPU detection fails, or when the label update
// itself fails.
func (u *UpdateNodeAmdGPUInfo) Execute(runtime connector.Runtime) error {
	client, err := clientset.NewKubeClient()
	if err != nil {
		return errors.Wrap(errors.WithStack(err), "kubeclient create error")
	}

	// Check if AMD GPU/APU exists
	amdGPUExists, err := connector.HasAmdAPUOrGPU(runtime)
	if err != nil {
		return err
	}
	if !amdGPUExists {
		logger.Info("AMD GPU/APU is not detected")
		return nil
	}

	// Get ROCm version. A detection failure is treated as "not installed",
	// but the cause is logged at debug level (as RocmInstalled.PreCheck does)
	// rather than discarded silently.
	rocmV, err := connector.RocmVersion()
	if err != nil {
		logger.Debugf("ROCm version check error: %v", err)
	}
	if err != nil || rocmV == nil {
		logger.Info("ROCm is not installed")
		return nil
	}

	rocmVersion := rocmV.Original()

	// Determine GPU type (APU vs discrete GPU)
	gpuType := gpu.AmdGpuCardType
	if runtime.GetSystemInfo().IsAmdApu() {
		gpuType = gpu.AmdApuCardType
	}

	// Only the ROCm version and the card type are supplied; the two middle
	// arguments are passed as nil.
	// NOTE(review): confirm against gpu.UpdateNodeGpuLabel which label each
	// positional argument feeds.
	return gpu.UpdateNodeGpuLabel(context.Background(), client.Kubernetes(), &rocmVersion, nil, nil, &gpuType)
}

// InstallAmdPlugin is the action that applies the AMD GPU device plugin
// manifest with kubectl.
type InstallAmdPlugin struct {
	common.KubeAction
}

// Execute runs "kubectl apply -f" on the amdgpu-device-plugin.yaml manifest
// located under the installer directory and returns a wrapped error when the
// command fails.
func (t *InstallAmdPlugin) Execute(runtime connector.Runtime) error {
	// NOTE(review): the AMD manifest is read from the gpu/nvidia directory;
	// confirm that is where it is shipped.
	manifest := path.Join(runtime.GetInstallerDir(), "wizard/config/gpu/nvidia/amdgpu-device-plugin.yaml")
	applyCmd := "kubectl apply -f " + manifest

	if _, err := runtime.GetRunner().SudoCmd(applyCmd, false, true); err != nil {
		return errors.Wrap(errors.WithStack(err), "failed to apply AMD GPU device plugin")
	}

	logger.Infof("AMD GPU device plugin installed successfully")
	return nil
}

// CheckAmdGpuStatus checks if the AMD GPU device plugin pods are running.
type CheckAmdGpuStatus struct {
	common.KubeAction
}

// Execute queries the phase of every pod in kube-system labelled
// name=amdgpu-dp-ds and succeeds only when at least one pod exists and all of
// them report "Running".
//
// It returns an error when kubectl cannot be located, when the query command
// fails, when no pod matches the selector, or when any pod is in another
// phase.
func (t *CheckAmdGpuStatus) Execute(runtime connector.Runtime) error {
	kubectlpath, err := util.GetCommand(common.CommandKubectl)
	if err != nil {
		// Keep the underlying cause instead of replacing it.
		return errors.Wrap(err, "kubectl not found")
	}

	// Check AMD device plugin pod status using the label from amdgpu-device-plugin.yaml
	selector := "name=amdgpu-dp-ds"
	cmd := fmt.Sprintf("%s get pod -n kube-system -l '%s' -o jsonpath='{.items[*].status.phase}'", kubectlpath, selector)

	rphase, err := runtime.GetRunner().SudoCmd(cmd, false, false)
	if err != nil {
		return errors.Wrap(errors.WithStack(err), "failed to query AMD GPU device plugin pods")
	}

	// The jsonpath expression prints one phase per matching pod, separated by
	// whitespace (e.g. "Running Running" when two pods match), so the output
	// must be checked pod by pod rather than compared to a single "Running".
	phases := strings.Fields(rphase)
	if len(phases) == 0 {
		return fmt.Errorf("AMD GPU device plugin pod not found (selector: %s)", selector)
	}
	for _, phase := range phases {
		if phase != "Running" {
			return fmt.Errorf("AMD GPU device plugin state is not Running (current: %s)", rphase)
		}
	}

	logger.Infof("AMD GPU device plugin is running")
	return nil
}
Loading
Loading