diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000..b2d955148 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,22 @@ +**What type of PR is this?** + + + +**What this PR does / why we need it**: + +**Which issue(s) this PR fixes**: +Fixes # + +**Special notes for your reviewer**: + +**Does this PR introduce a user-facing change?**: \ No newline at end of file diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 000000000..c6044996b --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,21 @@ + +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + + +version: 2 +updates: + - package-ecosystem: "gomod" + directory: "/" + schedule: + interval: "daily" + - package-ecosystem: "docker" + directory: "/docker" + schedule: + interval: "daily" + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "daily" diff --git a/.github/issue_template.md b/.github/issue_template.md new file mode 100644 index 000000000..ec3aaa304 --- /dev/null +++ b/.github/issue_template.md @@ -0,0 +1,21 @@ +_The template below is mostly useful for bug reports and support questions. Feel free to remove anything which doesn't apply to you and add more information where it makes sense._ +--- + +### 1. Issue or feature description + +### 2. Steps to reproduce the issue + +### 3. 
Information to [attach](https://help.github.com/articles/file-attachments-on-issues-and-pull-requests/) (optional if deemed irrelevant) + +Common error checking: +- [ ] The output of `nvidia-smi -a` on your host +- [ ] Your docker or containerd configuration file (e.g: `/etc/docker/daemon.json`) +- [ ] The vgpu-device-plugin container logs +- [ ] The vgpu-scheduler container logs +- [ ] The kubelet logs on the node (e.g: `sudo journalctl -r -u kubelet`) + +Additional information that might help better understand your environment and reproduce the bug: +- [ ] Docker version from `docker version` +- [ ] Docker command, image and tag used +- [ ] Kernel version from `uname -a` +- [ ] Any relevant kernel output lines from `dmesg` \ No newline at end of file diff --git a/.github/workflows/build-helm-release.yaml b/.github/workflows/build-helm-release.yaml new file mode 100644 index 000000000..d890cec67 --- /dev/null +++ b/.github/workflows/build-helm-release.yaml @@ -0,0 +1,36 @@ +name: Release helm + +on: + workflow_dispatch: + push: + branches: + - main + tags: + - v* + +jobs: + helm-release: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Configure Git + run: | + git config user.name "$GITHUB_ACTOR" + git config user.email "$GITHUB_ACTOR@users.noreply.github.com" + + - name: Install Helm + uses: azure/setup-helm@v4 + with: + version: v3.7.1 + + - name: Run chart-releaser + uses: helm/chart-releaser-action@v1.6.0 + with: + charts_dir: charts + env: + CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" + diff --git a/.github/workflows/build-image-release.yaml b/.github/workflows/build-image-release.yaml new file mode 100644 index 000000000..0a31cad82 --- /dev/null +++ b/.github/workflows/build-image-release.yaml @@ -0,0 +1,77 @@ +name: Release Arm Image + +env: + REGISTRY: docker.io + IMAGE_REPO: projecthami/hami + IMAGE_ROOT_PATH: docker + BUILD_PLATFORM: linux/arm64 + REGISTER_USER: ${{ github.actor }} + 
REGISTER_PASSWORD: ${{ secrets.GITHUB_TOKEN }} + +on: + workflow_dispatch: + push: + tags: + - v* + +jobs: + docker-build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@master + - name: Get the version + id: get_version + run: | + VERSION=${GITHUB_REF#refs/tags/} + if [[ ${GITHUB_REF} == "refs/heads/main" ]]; then + VERSION=latest + fi + echo ::set-output name=VERSION::${VERSION} + + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Checkout submodule + uses: Mushus/checkout-submodule@v1.0.1 + with: + basePath: # optional, default is . + submodulePath: libvgpu + + - name: Docker Login + uses: docker/login-action@v3.3.0 + with: + username: ${{ secrets.DOCKERHUB_TOKEN }} + password: ${{ secrets.DOCKERHUB_PASSWD }} + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: image=moby/buildkit:master + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_REPO }} + + - name: Build & Pushing hami image + uses: docker/build-push-action@v6.6.1 + with: + context: . 
+ file: ${{ env.IMAGE_ROOT_PATH }}/Dockerfile + labels: ${{ steps.meta.outputs.labels }} + platforms: ${{ env.BUILD_PLATFORM }} + build-args: | + VERSION=${{ steps.get_version.outputs.VERSION }} + GOLANG_IMAGE=golang:1.22.5-bullseye + NVIDIA_IMAGE=nvidia/cuda:12.2.0-devel-ubuntu20.04 + DEST_DIR=/usr/local + tags: ${{ steps.meta.outputs.tags }} + push: true + github-token: ${{ env.REGISTER_PASSWORD }} + diff --git a/.github/workflows/ci-image-scanning.yaml b/.github/workflows/ci-image-scanning.yaml new file mode 100644 index 000000000..188c6b6e9 --- /dev/null +++ b/.github/workflows/ci-image-scanning.yaml @@ -0,0 +1,59 @@ +name: Trivy Scan +on: + schedule: + - cron: "0 0 * * *" +jobs: + trivy-scan: + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + runs-on: ubuntu-22.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Checkout submodule + uses: Mushus/checkout-submodule@v1.0.1 + with: + basePath: # optional, default is . 
+ submodulePath: libvgpu + - name: Get branch name + uses: nelonoel/branch-name@v1.0.1 + - name: Docker Login + uses: docker/login-action@v3.3.0 + with: + username: ${{ secrets.DOCKERHUB_TOKEN }} + password: ${{ secrets.DOCKERHUB_PASSWD }} + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v3 + - name: Generating image tag + id: runtime-tag + run: | + echo tag="$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT + - run: make tidy + - run: SHORT_VERSION=${{ steps.runtime-tag.outputs.tag }} bash ./hack/build.sh + - name: Run Trivy vulnerability scanner (table output) + uses: aquasecurity/trivy-action@0.24.0 + with: + image-ref: "projecthami/hami:${{ steps.runtime-tag.outputs.tag }}" + format: "table" + ignore-unfixed: true + severity: "HIGH,CRITICAL" + vuln-type: "os,library" + trivyignores: .trivyignore + - name: Run Trivy vulnerability scanner (SARIF) + uses: aquasecurity/trivy-action@0.24.0 + with: + image-ref: "projecthami/hami:${{ steps.runtime-tag.outputs.tag }}" + format: "sarif" + output: "trivy-results.sarif" + ignore-unfixed: true + vuln-type: "os,library" + trivyignores: .trivyignore + if: always() && github.repository == 'Project-HAMi/HAMi' + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v3 + with: + sarif_file: "trivy-results.sarif" + if: always() && github.repository == 'Project-HAMi/HAMi' diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 000000000..91db8d40e --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,45 @@ +name: CI +on: + pull_request: + push: + branches: ["master"] + +env: + GO_VERSION: "1.21.6" + +jobs: + golangci: + name: lint + runs-on: ubuntu-22.04 + steps: + - name: checkout code + uses: actions/checkout@v4 + - name: install Go + uses: actions/setup-go@v5 + with: + go-version: "1.21" + - name: verify license + run: hack/verify-license.sh + - name: go tidy + run: make tidy + - name: lint + run: make lint + - 
name: import alias + run: hack/verify-import-aliases.sh + test: + name: Unit test + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Checkout submodule + uses: Mushus/checkout-submodule@v1.0.1 + with: + basePath: # optional, default is . + submodulePath: libvgpu + - name: Install Go + uses: actions/setup-go@v5 + with: + go-version: "1.21" + - run: make tidy + - run: make test diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml new file mode 100644 index 000000000..681fe2c0e --- /dev/null +++ b/.github/workflows/codeql-analysis.yml @@ -0,0 +1,65 @@ +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +name: "CodeQL" + +on: + workflow_dispatch: + push: + branches: ["master"] + paths-ignore: + - "**/*.json" + - "**/*.md" + - "**/*.txt" + - "**/*.yml" + schedule: + - cron: "0 4 * * 6" + +permissions: + security-events: write + # required to fetch internal or private CodeQL packs + packages: read + + # only required for workflows in private repositories + actions: read + contents: read + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + if: github.repository == 'Project-HAMi/HAMi' + + strategy: + fail-fast: false + matrix: + language: ["cpp", "go"] + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Checkout submodule + uses: Mushus/checkout-submodule@v1.0.1 + with: + basePath: # optional, default is . + submodulePath: libvgpu + - if: matrix.language == 'go' + name: Set go version + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + # Initializes the CodeQL tools for scanning. 
+ - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + # queries: ./path/to/local/query, your-org/your-repo/queries@main + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 diff --git a/.github/workflows/dev-image-build.yaml b/.github/workflows/dev-image-build.yaml new file mode 100644 index 000000000..82d754e30 --- /dev/null +++ b/.github/workflows/dev-image-build.yaml @@ -0,0 +1,45 @@ +name: Build dev image +on: + pull_request_target: + types: + - opened + - synchronize + - reopened + push: + branches: ["master"] + +jobs: + build: + name: build-dev-image + runs-on: ubuntu-latest + env: + IMAGE: ${{ secrets.IMAGE || 'projecthami/hami' }} + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Checkout submodule + uses: Mushus/checkout-submodule@v1.0.1 + with: + basePath: # optional, default is . 
+ submodulePath: libvgpu + + - name: Get branch name + uses: nelonoel/branch-name@v1.0.1 + + - name: Docker Login + uses: docker/login-action@v3.3.0 + with: + username: ${{ secrets.DOCKERHUB_TOKEN }} + password: ${{ secrets.DOCKERHUB_PASSWD }} + + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v3 + + - name: Generating image tag + id: runtime-tag + run: | + echo tag="$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT + - run: make tidy + - run: SHORT_VERSION=${{ steps.runtime-tag.outputs.tag }} bash ./hack/build.sh diff --git a/.github/workflows/lint-chart.yaml b/.github/workflows/lint-chart.yaml new file mode 100644 index 000000000..4122ea7b6 --- /dev/null +++ b/.github/workflows/lint-chart.yaml @@ -0,0 +1,24 @@ +name: Chart Lint + +on: + pull_request: + push: + branches: ["master"] + +jobs: + chart-lint-test: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Helm + uses: azure/setup-helm@v4 + with: + version: v3.7.1 + + - name: Check chart version + run: bash ./hack/verify-chart-version.sh + diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 000000000..f500e6b93 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,51 @@ +# This is a basic workflow to help you get started with Actions + +name: Release + +# Controls when the workflow will run +on: + # Triggers the workflow on push or pull request events but only for the master branch + push: + tags: + - v[0-9]+.[0-9]+.[0-9]+.[0-9]+ + - v[0-9]+.[0-9]+.[0-9]+ + - v[0-9]+.[0-9]+ + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + # This workflow contains a single job called "build" + build: + # The type of runner that the job will run on + runs-on: ubuntu-latest + + # Steps represent a sequence of tasks that will be executed as part 
of the job + steps: + - name: Checkout + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + uses: actions/checkout@v4 + + - name: Checkout submodule + uses: Mushus/checkout-submodule@v1.0.1 + with: + basePath: # optional, default is . + submodulePath: libvgpu + + - name: Get branch name + uses: nelonoel/branch-name@v1.0.1 + + - name: Docker Login + uses: docker/login-action@v3.3.0 + with: + username: ${{ secrets.DOCKERHUB_TOKEN }} + password: ${{ secrets.DOCKERHUB_PASSWD }} + + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v3 + + - run: make tidy + - run: SHORT_VERSION="${BRANCH_NAME}" bash ./hack/build.sh + diff --git a/.gitignore b/.gitignore index e660fd93d..08e658f73 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,11 @@ bin/ +run_device_plugin.sh +run_scheduler.sh +device_plugin.sh +libvgpu/build +updateso.sh +libvgpu.so +.idea +vendor +license +vgpuvalidator \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml deleted file mode 100644 index 8765cac77..000000000 --- a/.gitlab-ci.yml +++ /dev/null @@ -1,87 +0,0 @@ -stages: - - build_image - - deploy - -variables: - IMAGE_NAME: k8s-vgpu - -.build_image: - stage: build_image - image: '${DIND_IMAGE}' - script: - - IMAGE_FULL_NAME=${IMAGE_REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG} - - > - docker build -t ${IMAGE_FULL_NAME} - --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} - --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} - --build-arg VERSION=${VERSION} - --build-arg GOPROXY=${GOPROXY} -f ./docker/Dockerfile . 
- - docker push ${IMAGE_FULL_NAME} - -build_dev_image: - extends: .build_image - variables: - IMAGE_TAG: ${CI_COMMIT_SHA} - VERSION: ${CI_COMMIT_SHA} - only: - - master - -build_release_image: - extends: .build_image - variables: - IMAGE_TAG: ${CI_COMMIT_TAG} - VERSION: ${CI_COMMIT_TAG}-${CI_COMMIT_SHA} - only: - - tags - -.deploy: - stage: deploy - image: '${HELM_IMAGE}' - variables: - RELEASE_NAME: vgpu - RELEASE_NAMESPACE: vgpu - EXTRA_ARGS: '' - script: - - IMAGE_FULL_NAME=${IMAGE_REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG} - - > - helm upgrade --install ${RELEASE_NAME} ./deployments/4pd-vgpu - -n ${RELEASE_NAMESPACE} - --set scheduler.extender.image=${IMAGE_FULL_NAME} - --set devicePlugin.image=${IMAGE_FULL_NAME} - ${EXTRA_ARGS} - -deploy_develop: - extends: .deploy - variables: - IMAGE_TAG: ${CI_COMMIT_SHA} - environment: - name: vgpu-develop - only: - - master - tags: - - deploy-test - -deploy_pre_product: - extends: .deploy - variables: - IMAGE_TAG: ${CI_COMMIT_TAG} - EXTRA_ARGS: "--wait --timeout=30m" - environment: - name: vgpu-develop - only: - - tags - tags: - - deploy-test - -deploy_product: - extends: .deploy - variables: - IMAGE_TAG: ${CI_COMMIT_TAG} - environment: - name: vgpu-product - only: - - tags - tags: - - deploy-product - when: manual - diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..0f8525526 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "libvgpu"] + path = libvgpu + url = https://github.com/Project-HAMi/HAMi-core.git + branch = main diff --git a/.golangci.yaml b/.golangci.yaml new file mode 100644 index 000000000..fcce9e5f7 --- /dev/null +++ b/.golangci.yaml @@ -0,0 +1,137 @@ +run: + timeout: 10m + + # The default concurrency value is the number of available CPU. 
+ concurrency: 4 + + # which dirs to skip: issues from them won't be reported; + # can use regexp here: generated.*, regexp is applied on full path; + # default value is empty list, but default dirs are skipped independently + # from this option's value (see skip-dirs-use-default). + # "/" will be replaced by current OS file path separator to properly work + # on Windows. + skip-dirs: + - pkg/device-plugin # This code is directly lifted from the Kubernetes codebase, skip checking + + # default is true. Enables skipping of directories: + # vendor$, third_party$, testdata$, examples$, Godeps$, builtin$ + skip-dirs-use-default: true + + # One of 'readonly' and 'vendor'. + # - readonly: the go command is disallowed from the implicit automatic updating of go.mod described above. + # Instead, it fails when any changes to go.mod are needed. This setting is most useful to check + # that go.mod does not need updates, such as in a continuous integration and testing system. + # - vendor: the go command assumes that the vendor directory holds the correct copies of dependencies and ignores + # the dependency descriptions in go.mod. 
+ modules-download-mode: readonly + +linters-settings: + depguard: + list-type: blacklist + include-go-root: false + dupl: + threshold: 800 + errcheck: + check-type-assertions: true + check-blank: true + # exclude: .errcheckignore + errorlint: + errorf: true + asserts: true + comparison: true + goconst: + min-len: 3 + min-occurrences: 3 + gocritic: + enabled-tags: + - diagnostic + - experimental + - opinionated + - performance + - style + disabled-checks: + - commentedOutCode + - whyNoLint + settings: + hugeParam: + sizeThreshold: 80 + rangeExprCopy: + sizeThreshold: 512 + rangeValCopy: + sizeThreshold: 128 + godot: + scope: declarations + capital: false + gofmt: + simplify: true + gofumpt: + extra-rules: true + goimports: + local-prefixes: github.com/Project-HAMi/HAMi + gocyclo: + # minimal code complexity to report, 30 by default (but we recommend 10-20) + min-complexity: 20 + nestif: + min-complexity: 20 + +output: + format: colored-line-number + print-issued-lines: true + print-linter-name: true + uniq-by-line: true + sort-results: true + +linters: + disable-all: true + disabled: + - exhaustivestruct # Checks if all struct's fields are initialized + - forbidigo # Forbids identifiers + - forcetypeassert # finds forced type assertions + - gci # Gci control golang package import order and make it always deterministic. + - gochecknoglobals # check that no global variables exist + - gochecknoinits # Checks that no init functions are present in Go code + - goconst # Finds repeated strings that could be replaced by a constant + - godox # Tool for detection of FIXME, TODO and other comment keywords + - goerr113 # Golang linter to check the errors handling expressions + - golint # Golint differs from gofmt. Gofmt reformats Go source code, whereas golint prints out style mistakes + - gomnd # An analyzer to detect magic numbers. + - gomoddirectives # Manage the use of 'replace', 'retract', and 'excludes' directives in go.mod. 
+ - gomodguard # Allow and block list linter for direct Go module dependencies. + - interfacer # Linter that suggests narrower interface types + - lll # Reports long lines + - maligned # Tool to detect Go structs that would take less memory if their fields were sorted + - promlinter # Check Prometheus metrics naming via promlint + - scopelint # Scopelint checks for unpinned variables in go programs + - sqlclosecheck # Checks that sql.Rows and sql.Stmt are closed. + - testpackage # Linter that makes you use a separate _test package + - tparallel # tparallel detects inappropriate usage of t.Parallel() method in your Go test codes + - wrapcheck # Checks that errors returned from external packages are wrapped + - wsl # Whitespace Linter + - paralleltest # paralleltest detects missing usage of t.Parallel() method in your Go test + - noctx # noctx finds sending http request without context.Context + - wastedassign # wastedassign finds wasted assignment statements. + - exhaustive # check exhaustiveness of enum switch statements + - cyclop # checks function and package cyclomatic complexity + - errcheck # Errcheck is a program for checking for unchecked errors in go programs. These unchecked errors can be critical bugs in some cases + - unparam # Reports unused function parameters + - gosec # Inspects source code for security problems + - funlen # Tool for detection of long functions + - gocognit # Computes and checks the cognitive complexity of functions + - gocyclo # Computes and checks the cyclomatic complexity of functions + - nlreturn # nlreturn checks for a new line before return and branch statements to increase code clarity + - gocritic # Provides many diagnostics that check for bugs, performance and style issues. + - errorlint # errorlint is a linter for that can be used to find code that will cause problems with the error wrapping scheme introduced in Go 1.13. + - tagliatelle # Checks the struct tags. 
+ # need to enable + - nestif # Reports deeply nested if statements + - ineffassign # Detects when assignments to existing variables are not used + + + enable: + - asciicheck + - forcetypeassert + - godot + - gofmt + - goimports + - misspell + - stylecheck diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 000000000..d50f7577d --- /dev/null +++ b/AUTHORS @@ -0,0 +1,14 @@ +The following people, in alphabetical order, have either authored or signed +off on commits in the HAMi repository: + +archlitchi limengxuan@4paradigm.com +peizhaoyou peizhaoyou@4paradigm.com +chaunceyjiang chaunceyjiang@gmail.com +wawa0210 xiaozhang0210@hotmail.com +whybeyoung +gsakun +CoderTH +lengrongfu +chaunceyjiang +atttx123 +zhengbingxian diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 000000000..cd21797f5 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,236 @@ +# CHANGELOG + +## v1.0.1 + +**Add MIG support:"mixed strategy"** + +**Add support for kubernetes v1.22+** + +## v1.0.1.1 + +**Bugs fixed** + +a pod can be scheduled to a node where its core usage is 100 - Fixed + +cudevshr.cache can't be modified with non-root users - Fixed + +## v1.0.1.2 + +**Add custom resource name** + +A task with cores=100 will allocate all device memory(virtual device memory excluded) + +## v1.0.1.3 + +**nvidia.com/gpucores will limit the GPU utilization inside container** +Prior than v1.0.1.3, nvidia.com/gpucores will not limit utilization inside container, we have fixed it in v1.0.1.3 + +## v1.0.1.4 + +**Add nvidia.com/gpumem-percentage resoure name** +This resource indicates the device memory percentage of GPU, can not be used with "nvidia.com/gpumem". If you want an exclusive GPU, specify both the "nvidia.com/gpucores" and "nvidia.com/gpumem-percentage" to 100 + +**Add GPU type specification** +You can set "nvidia.com/use-gputype" annotation to specify which type of GPU to use. "nvidia.com/nouse-gputype" annotation to specify which type of GPU to avoid. 
+ +## v1.0.1.5 + +Fix a monitor "desc not found" error + +Add "devicePlugin.sockPath" parameter to set the location of vgpu.sock + +## v1.1.0.0 + +**Major Update: Device Memory will be counted more accurately** +several device memory usages, including cuda context, modules, parameters, reserved addresses will be counted in v1.1.0.0 + +**Update to be compatible with CUDA 11.6 and Driver 500+** + +**Rework monitor strategy** +Monitor will mmap control file into address space instead of reading it in each query. + +## v1.1.1.0 + +**Fix segmentation fault when invoking cuMallocAsync** + +**Core Utilization Oversubscribe and priority-based scheduling** +Currently we have two priorities, 0 for high and 1 for low. The core utilization of high priority task won't be limited to resourceCores unless sharing GPU node with other high priority tasks. +The core utilization of low priority task won't be limited to resourceCores if no other tasks sharing its GPU. +See example.yaml for more details + +**Add Container Core Utilization policy** +See details in docs/config.md(docs/config_cn.md) + +## v2.2 + +**Update device memory counting mechanism to be compatible with CUDA 11.3+ tasks** +sometimes vgpu-scheduler won't be able to collect device memory usage when running cuda11.3+ compiled tasks in v1.x version of vgpu-scheduler. We solve this problem by reworking device memory counting mechanism. + +**Use node annotation instead of grpc to communicate between scheduler and device-plugin** +In v1.x version of vgpu-scheduler, we use grpc to communicate between scheduler and device-plugin, but we reimplement this communication in v2.x by using node annotation, to make it more stable and readable. + +**modified nvidia-container-runtime is no longer needed** +We remove self-modified nvidia-container-runtime in v1.x, because we now use node lock to track pod and container information. So this nvidia-container-runtime is no longer needed. 
+ +## v2.2.7 + +**BUG fix** + +fix tasks with "gpumem-percentage" not working properly + +fix deadlock when a process dies with its lock not released + +**Adjust certain logs** + +**update go modules to more recent version in order to support k8s v1.25** + +## v2.2.8 + +**BUG fix** + +fix vGPUmonitor not working properly with containerd + +fix installation error on k8s v1.25+ + +## v2.2.9 + +**BUG fix** + +fix non-root user in container can't access /tmp/vgpulock, resulting in "unified lock error" + +**Rework device registration** + +device registration used to be done in gRpc between device-plugin and scheduler. However, in some clusters, this communication may be blocked by firewall or selinux configuration. So, we reimplement device registration mechanism by using node annotations: +A-device-plugin will put its usable device and its status in "Node-A-device-register" annotation +scheduler will read from this annotation and acknowledge this registration. So, gRpc will no longer be used. + +**Optimization in code** + +Put nvidia-device-plugin related code in a separate directory "nvidiadevice" + +**Libvgpu log adjusting** + +Downgrade the following API from LOG:WARN to LOG:INFO +cuFuncSetCacheConfig, cuFuncSetCacheConfig, cuModuleGetTexRef, cuModuleGetSurfRef + +## v2.2.10 + +**BUG fix** + +fix process can't initialize properly in driver 440 + +fix cuCtxCreate failed in some tensorRT task + +fix env CUDA_VISIBLE_DEVICES not working properly sometimes. 
+ +## v2.2.11 + +**BUG fix** + +fix initialization failed with non-root users + +fix core limitation not working properly on A30 + +## v2.2.12 + +Downgrade core control output from LOG:WARN to LOG:DEBUG + +## v2.2.13 + +Adjust default memory to 0, which means use 100% device memory + +Move cache file directory from /tmp/vgpu/containers to /usr/local/vgpu/containers + +## v2.2.14 + +Fix device memory calculation error after container crashloop + +Fix env cuda_oversubscribe not set properly when MemoryScaling < 1 + +Fix MemoryScaling not working when set < 1 + +## v2.2.15 + +Move shared-memory from /tmp/xxx.cache to /usr/local/vgpu/xxx.cache inside container + +Add Deviceidx to scheduler monitor apis(31993) + +## v2.2.16 + +Fix crash during initialization in vGPUmonitor + +# v2.3 + +## v2.3.0 + +Fix oom can't be triggered when loading module + +Update device-plugin version to v0.14.0 + +## v2.3.1 + +Fix a bug where a cuda process can't be launched properly + +## v2.3.2 + +Remove node selector for scheduler + +Fix an issue where mlu device-plugin can't be launched properly + +Major rework on devices-related code + +Add support for hygon DCU device + +## v2.3.3 + +Fix an issue where pods are pending on nodes with multi-architecture devices. 
+ +## v2.3.4 + +Fix an issue where 31993 port can't list all GPU nodes + +Add a switch on cuda_control by setting env "CUDA_DISABLE_ENV=true" in container + +## v2.3.6 + +Fix initialization error when using ray + +## v2.3.7 + +Fix error when "N/A" is shown in command "nvidia-smi topo -m" +Fix core utilization not working in some cases +Adjust some documents + +## v2.3.8 + +Fix device-plugin launch error on driver version < 500 + +support manual config MutatingWebhookConfiguration failurePolicy + +add metrics bind address flag for scheduler + +Improved log messages + +fix: loss of metrics after vdevice restart + +bugfix: device-plugin monitor serves too slowly in big cluster + +## v2.3.9 + +Add support for iluvatar GPU devices + +Fix issue on "get_host_pid" func in HAMi-core + +Regular devices API, make it easier to add new devices + +## v2.3.10 + +Fix issue where device-plugin failed to start + +## v2.3.11 + +Add support for Ascend910B device + +Add "NVIDIA_VISIBLE_DEVICES=none" to none-gpu tasks + + diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..f2e8bcb20 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,3 @@ +# HAMi Community Code of Conduct + +Please refer to our [HAMi Community Code of Conduct](https://github.com/Project-HAMi/community/blob/main/CODE_OF_CONDUCT.md). diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..09788f5f6 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,103 @@ +# Contributing + +Welcome to HAMi! 
+ +- [Contributing](#contributing) +- [Before you get started](#before-you-get-started) + - [Code of Conduct](#code-of-conduct) + - [Community Expectations](#community-expectations) +- [Getting started](#getting-started) +- [Your First Contribution](#your-first-contribution) + - [Find something to work on](#find-something-to-work-on) + - [Find a good first topic](#find-a-good-first-topic) + - [Work on an issue](#work-on-an-issue) + - [File an Issue](#file-an-issue) +- [Contributor Workflow](#contributor-workflow) + - [Creating Pull Requests](#creating-pull-requests) + - [Code Review](#code-review) + +# Before you get started + +## Code of Conduct + +Please make sure to read and observe our [Code of Conduct](/CODE_OF_CONDUCT.md). + +## Community Expectations + +HAMi is a community project driven by its community which strives to promote a healthy, friendly and productive environment. + +# Getting started + +- Fork the repository on GitHub. +- Make your changes on your fork repository. +- Submit a PR. + + +# Your First Contribution + +We will help you to contribute in different areas like filing issues, developing features, fixing critical bugs and +getting your work reviewed and merged. + +If you have questions about the development process, +feel free to [file an issue](https://github.com/Project-HAMi/HAMi/issues/new/choose). + +## Find something to work on + +We are always in need of help, be it fixing documentation, reporting bugs or writing some code. +Look at places where you feel best coding practices aren't followed, code refactoring is needed or tests are missing. +Here is how you get started. + +### Find a good first topic + +There are [multiple repositories](https://github.com/Project-HAMi/) within the HAMi organization. +Each repository has beginner-friendly issues that provide a good first issue. 
+For example, [Project-HAMi/HAMi](https://github.com/Project-HAMi/HAMi) has +[help wanted](https://github.com/Project-HAMi/HAMi/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22) and +[good first issue](https://github.com/Project-HAMi/HAMi/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) +labels for issues that should not need deep knowledge of the system. +We can help new contributors who wish to work on such issues. + +Another good way to contribute is to find a documentation improvement, such as a missing/broken link. +Please see [Contributing](#contributing) below for the workflow. + +#### Work on an issue + +When you are willing to take on an issue, just reply on the issue. The maintainer will assign it to you. + +### File an Issue + +While we encourage everyone to contribute code, it is also appreciated when someone reports an issue. +Issues should be filed under the appropriate HAMi sub-repository. + +*Example:* a HAMi issue should be opened to [Project-HAMi/HAMi](https://github.com/Project-HAMi/HAMi/issues). + +Please follow the prompted submission guidelines while opening an issue. + +# Contributor Workflow + +Please do not ever hesitate to ask a question or send a pull request. + +This is a rough outline of what a contributor's workflow looks like: + +- Create a topic branch from where to base the contribution. This is usually master. +- Make commits of logical units. +- Push changes in a topic branch to a personal fork of the repository. +- Submit a pull request to [Project-HAMi/HAMi](https://github.com/Project-HAMi/HAMi). + +## Creating Pull Requests + +Pull requests are often called simply "PR". +HAMi generally follows the standard [github pull request](https://help.github.com/articles/about-pull-requests/) process. +To submit a proposed change, please develop the code/fix and add new test cases. +After that, run these local verifications before submitting pull request to predict the pass or +fail of continuous integration. 
+ +* Run and pass `make verify` + +## Code Review + +To make it easier for your PR to receive reviews, consider the reviewers will need you to: + +* follow [good coding guidelines](https://github.com/golang/go/wiki/CodeReviewComments). +* write [good commit messages](https://chris.beams.io/posts/git-commit/). +* break large changes into a logical series of smaller patches which individually make easily understandable changes, and in aggregate solve a broader issue. \ No newline at end of file diff --git a/HAMi.jpg b/HAMi.jpg new file mode 100644 index 000000000..53ecf9432 Binary files /dev/null and b/HAMi.jpg differ diff --git a/MAINTAINERS.md b/MAINTAINERS.md new file mode 100644 index 000000000..4650f3b73 --- /dev/null +++ b/MAINTAINERS.md @@ -0,0 +1,13 @@ +# Maintainers + +See the [HAMi Community Membership](https://github.com/Project-HAMi/community/blob/main/CONTRIBUTOR-LADDER.md) to learn how to level up through the project. + +Please see the [AUTHORS](./AUTHORS) file for the full list of contributors to the project + +## HAMi Committers + +| Maintainer | Email | Emplolyer | +|---------------------------------------------------|-----------|-----------| +| [Li Mengxuan](https://github.com/archlitchi) | limengxuan@4paradigm.com | 4Paradigm | +| [Xiao Zhang](https://github.com/wawa0210) | xiaozhang0210@hotmail.com | DaoCloud | +| [Wang Leibo](https://github.com/william-wang) | wang.platform@gmail.com | HuaweiCloud | diff --git a/Makefile b/Makefile index 81fd816a0..1102089e7 100644 --- a/Makefile +++ b/Makefile @@ -1,19 +1,55 @@ -GO=go -GO111MODULE=on -CMDS=scheduler device-plugin -OUTPUT_DIR=bin - -VERSION ?= unknown +##### Global variables ##### +include version.mk all: build -build: $(CMDS) +docker: + docker build \ + --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} \ + --build-arg TARGET_ARCH=${TARGET_ARCH} \ + --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} \ + --build-arg DEST_DIR=${DEST_DIR} \ + --build-arg GOPROXY=https://goproxy.cn,direct \ + . 
-f=docker/Dockerfile -t ${IMG_TAG} + +dockerwithlib: + docker build \ + --no-cache \ + --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} \ + --build-arg TARGET_ARCH=${TARGET_ARCH} \ + --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} \ + --build-arg DEST_DIR=${DEST_DIR} \ + --build-arg GOPROXY=https://goproxy.cn,direct \ + . -f=docker/Dockerfile.withlib -t ${IMG_TAG} + +tidy: + $(GO) mod tidy + +proto: + $(GO) get github.com/gogo/protobuf/protoc-gen-gofast@v1.3.2 + protoc --gofast_out=plugins=grpc:. ./pkg/api/*.proto + +build: $(CMDS) $(DEVICES) $(CMDS): - $(GO) build -ldflags '-s -w -X 4pd.io/k8s-vgpu/pkg/version.version=$(VERSION)' -o ${OUTPUT_DIR}/$@ ./cmd/$@ + $(GO) build -ldflags '-s -w -X github.com/Project-HAMi/HAMi/pkg/version.version=$(VERSION)' -o ${OUTPUT_DIR}/$@ ./cmd/$@ + +$(DEVICES): + $(GO) build -ldflags '-s -w -X github.com/Project-HAMi/HAMi/pkg/version.version=$(VERSION)' -o ${OUTPUT_DIR}/$@-device-plugin ./cmd/device-plugin/$@ clean: $(GO) clean -r -x ./cmd/... -rm -rf $(OUTPUT_DIR) -.PHONY: all build clean $(CMDS) +.PHONY: all build docker clean $(CMDS) + +test: + bash hack/unit-test.sh + +lint: + bash hack/verify-staticcheck.sh + +.PHONY: verify +verify: + hack/verify-all.sh + diff --git a/OWNERS b/OWNERS new file mode 100644 index 000000000..5318d8da7 --- /dev/null +++ b/OWNERS @@ -0,0 +1,6 @@ +reviewers: + - archlitchi + - wawa0210 +approvers: + - archlitchi + - wawa0210 diff --git a/README.md b/README.md new file mode 100644 index 000000000..93e0fd768 --- /dev/null +++ b/README.md @@ -0,0 +1,367 @@ +English version|[中文版](README_cn.md) + + + +# Heterogeneous AI Computing Virtualization Middleware + +[![build status](https://github.com/Project-HAMi/HAMi/actions/workflows/main.yml/badge.svg)](https://github.com/Project-HAMi/HAMi/actions/workflows/main.yml) +[![docker pulls](https://img.shields.io/docker/pulls/4pdosc/k8s-vgpu.svg)](https://hub.docker.com/r/4pdosc/k8s-vgpu) 
+[![slack](https://img.shields.io/badge/Slack-Join%20Slack-blue)](https://join.slack.com/t/hami-hsf3791/shared_invite/zt-2gcteqiph-Ls8Atnpky6clrspCAQ_eGQ) +[![discuss](https://img.shields.io/badge/Discuss-Ask%20Questions-blue)](https://github.com/Project-HAMi/HAMi/discussions) +[![Contact Me](https://img.shields.io/badge/Contact%20Me-blue)](https://github.com/Project-HAMi/HAMi#contact) +[![FOSSA Status](https://app.fossa.com/api/projects/git%2Bgithub.com%2FProject-HAMi%2FHAMi.svg?type=shield)](https://app.fossa.com/projects/git%2Bgithub.com%2FProject-HAMi%2FHAMi?ref=badge_shield) + +--- +

+ +

+ +**HAMi is a [Cloud Native Computing Foundation](https://cncf.io/) sandbox project & [Landscape project](https://landscape.cncf.io/?item=orchestration-management--scheduling-orchestration--hami) & [CNAI Landscape project](https://landscape.cncf.io/?group=cnai&item=cnai--general-orchestration--hami).** + +## Supperted devices + +[![nvidia GPU](https://img.shields.io/badge/Nvidia-GPU-blue)](https://github.com/Project-HAMi/HAMi#preparing-your-gpu-nodes) +[![cambricon MLU](https://img.shields.io/badge/Cambricon-Mlu-blue)](docs/cambricon-mlu-support.md) +[![hygon DCU](https://img.shields.io/badge/Hygon-DCU-blue)](docs/hygon-dcu-support.md) +[![iluvatar GPU](https://img.shields.io/badge/Iluvatar-GPU-blue)](docs/iluvatar-gpu-support.md) + +## Introduction + + + +**Heterogeneous AI Computing Virtualization Middleware (HAMi), formerly known as k8s-vGPU-scheduler, is an "all-in-one" chart designed to manage Heterogeneous AI Computing Devices in a k8s cluster.** It includes everything you would expect, such as: + +***Device sharing***: Each task can allocate a portion of a device instead of the entire device, allowing a device to be shared among multiple tasks. + +***Device Memory Control***: Devices can be allocated a specific device memory size (e.g., 3000M) or a percentage of the whole GPU's memory (e.g., 50%), ensuring it does not exceed the specified boundaries. + +***Device Type Specification***: You can specify the type of device to use or avoid for a particular task by setting annotations, such as "nvidia.com/use-gputype" or "nvidia.com/nouse-gputype". + +***Device UUID Specification***: You can specify the UUID of device to use or avoid for a particular task by setting annotations, such as "nvidia.com/use-gpuuuid" or "nvidia.com/nouse-gpuuuid". + +***Easy to use***: You don't need to modify your task YAML to use our scheduler. All your jobs will be automatically supported after installation. 
Additionally, you can specify a resource name other than "nvidia.com/gpu" if you prefer. + +***Scheduling Policy***: The vGPU scheduler supports various scheduling policies, including node-level and GPU-level policies. These can be set by default through scheduler parameters, and can also be selected based on application scenarios by setting the Pod's annotation, such as "hami.io/node-scheduler-policy" or "hami.io/gpu-scheduler-policy". Both dimensions support two policies: `binpack` and `spread`. + +## Major Features + +- Hard Limit on Device Memory. + +A simple demostration for Hard Limit: +A task with the following resources. + +``` + resources: + limits: + nvidia.com/gpu: 1 # requesting 1 vGPU + nvidia.com/gpumem: 3000 # Each vGPU contains 3000m device memory +``` + +will see 3G device memory inside container + +![img](./imgs/hard_limit.jpg) + +- Allows partial device allocation by specifying device memory. +- Imposes a hard limit on streaming multiprocessors. +- Permits partial device allocation by specifying device core usage. +- Requires zero changes to existing programs. + +## Architect + + + +HAMi consists of several components, including a unified mutatingwebhook, a unified scheduler extender, different device-plugins and different in-container virtualization technics for each heterogeneous AI devices. + +## Application Scenarios + +1. Device sharing (or device virtualization) on Kubernetes. +2. Scenarios where pods need to be allocated with specific device memory 3. usage or device cores. +3. Need to balance GPU usage in a cluster with multiple GPU nodes. +4. Low utilization of device memory and computing units, such as running 10 TensorFlow servings on one GPU. +5. Situations that require a large number of small GPUs, such as teaching scenarios where one GPU is provided for multiple students to use, and cloud platforms that offer small GPU instances. 
+ +## Quick Start + +### Choose your orchestrator + +[![kube-scheduler](https://img.shields.io/badge/kube-scheduler-blue)](https://github.com/Project-HAMi/HAMi#quick-start) +[![volcano-scheduler](https://img.shields.io/badge/volcano-scheduler-orange)](docs/how-to-use-volcano-vgpu.md) + +### Prerequisites + +The list of prerequisites for running the NVIDIA device plugin is described below: + +- NVIDIA drivers >= 440 +- nvidia-docker version > 2.0 +- config default runtime is nvidia for containerd/docker/cri-o container runtime. +- Kubernetes version >= 1.16 +- glibc >= 2.17 & glibc < 2.3.0 +- kernel version >= 3.10 +- helm > 3.0 + +### Preparing your GPU Nodes + +
Configure nvidia-container-toolkit + +Execute the following steps on all your GPU nodes. + +This README assumes pre-installation of NVIDIA drivers and the `nvidia-container-toolkit`. Additionally, it assumes configuration of the `nvidia-container-runtime` as the default low-level runtime. + +Please see: + +#### Example for debian-based systems with `Docker` and `containerd` + +##### Install the `nvidia-container-toolkit` + +```bash +distribution=$(. /etc/os-release;echo $ID$VERSION_ID) +curl -s -L https://nvidia.github.io/libnvidia-container/gpgkey | sudo apt-key add - +curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | sudo tee /etc/apt/sources.list.d/libnvidia-container.list + +sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit +``` + +##### Configure `Docker` + +When running `Kubernetes` with `Docker`, edit the configuration file, typically located at `/etc/docker/daemon.json`, to set up `nvidia-container-runtime` as the default low-level runtime: + +```json +{ + "default-runtime": "nvidia", + "runtimes": { + "nvidia": { + "path": "/usr/bin/nvidia-container-runtime", + "runtimeArgs": [] + } + } +} +``` + +And then restart `Docker`: + +``` +sudo systemctl daemon-reload && systemctl restart docker +``` + +##### Configure `containerd` + +When running `Kubernetes` with `containerd`, modify the configuration file typically located at `/etc/containerd/config.toml`, to set up +`nvidia-container-runtime` as the default low-level runtime: + +``` +version = 2 +[plugins] + [plugins."io.containerd.grpc.v1.cri"] + [plugins."io.containerd.grpc.v1.cri".containerd] + default_runtime_name = "nvidia" + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia] + privileged_without_host_devices = false + runtime_engine = "" + runtime_root = "" + runtime_type = "io.containerd.runc.v2" + 
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options] + BinaryName = "/usr/bin/nvidia-container-runtime" +``` + +And then restart `containerd`: + +``` +sudo systemctl daemon-reload && systemctl restart containerd +``` + +
+ +
Label your nodes + +Label your GPU nodes for scheduling with HAMi by adding the label "gpu=on". Without this label, the nodes cannot be managed by our scheduler. + +``` +kubectl label nodes {nodeid} gpu=on +``` + +
+ +### Install and Uninstall + +
Installation + +First, you need to check your Kubernetes version by using the following command: + +``` +kubectl version +``` + +Then, add our repo in helm + +``` +helm repo add hami-charts https://project-hami.github.io/HAMi/ +``` + +During installation, set the Kubernetes scheduler image version to match your Kubernetes server version. For instance, if your cluster server version is 1.16.8, use the following command for deployment: + +``` +helm install hami hami-charts/hami --set scheduler.kubeScheduler.imageTag=v1.16.8 -n kube-system +``` + +Customize your installation by adjusting the [configs](docs/config.md). + +Verify your installation using the following command: + +``` +kubectl get pods -n kube-system +``` + +If both `vgpu-device-plugin` and `vgpu-scheduler` pods are in the *Running* state, your installation is successful. + +
+ +
Upgrade + +Upgrading HAMi to the latest version is a simple process, update the repository and restart the chart: + +``` +helm uninstall hami -n kube-system +helm repo update +helm install hami hami-charts/hami -n kube-system +``` + +> **WARNING:** *If you upgrade HAMi without clearing your submitted tasks, it may result in segmentation fault.* + +
+ +
Uninstall + +``` +helm uninstall hami -n kube-system +``` + +> **NOTICE:** *Uninstallation won't kill running tasks.* + +
+ +### Submit Task + +
Task example + +Containers can now request NVIDIA vGPUs using the `nvidia.com/gpu`` resource type. + +``` +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod +spec: + containers: + - name: ubuntu-container + image: ubuntu:18.04 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + nvidia.com/gpu: 2 # requesting 2 vGPUs + nvidia.com/gpumem: 3000 # Each vGPU contains 3000m device memory (Optional,Integer) + nvidia.com/gpucores: 30 # Each vGPU uses 30% of the entire GPU (Optional,Integer) +``` + +Exercise caution; if a task cannot fit into any GPU node (i.e., the requested number of `nvidia.com/gpu` exceeds the available GPUs in any node), the task will remain in a `pending` state. + +You can now execute the `nvidia-smi` command in the container to observe the difference in GPU memory between vGPU and physical GPU. + +> **WARNING:** +> +> *1. if you don't request vGPUs when using the device plugin with NVIDIA images all +> the vGPUs on the machine will be exposed inside your container.* +> +> *2. Do not set "nodeName" field, use "nodeSelector" instead.* + +#### More examples + +Click [here](examples/nvidia/) + +
+ +### Monitor + +
Get cluster overview + +Monitoring is automatically enabled after installation. Obtain an overview of cluster information by visiting the following URL: + +``` +http://{scheduler ip}:{monitorPort}/metrics +``` + +The default monitorPort is 31993; other values can be set using `--set devicePlugin.service.httpPort` during installation. + +Grafana dashboard [example](docs/dashboard.md) + +> **Note** The status of a node won't be collected before you submit a task + +
+ +## [Benchmarks](docs/benchmark.md) + +## Known Issues + +- Currently, A100 MIG can be supported in only "none" and "mixed" modes. +- Tasks with the "nodeName" field cannot be scheduled at the moment; please use "nodeSelector" instead. +- Only computing tasks are currently supported; video codec processing is not supported. +- We change `device-plugin` env var name from `NodeName` to `NODE_NAME`, if you use the image version `v2.3.9`, you may encounter the situation that `device-plugin` cannot start, there are two ways to fix it: + - Manually execute `kubectl edit daemonset` to modify the `device-plugin` env var from `NodeName` to `NODE_NAME`. + - Upgrade to the latest version using helm, the latest version of `device-plugin` image version is `v2.3.10`, execute `helm upgrade hami hami/hami -n kube-system`, it will be fixed automatically. + +## Roadmap + +Heterogeneous AI Computing device to support + + +| Production | manufactor | Type |MemoryIsolation | CoreIsolation | MultiCard support | +|-------------|------------|-------------|-----------|---------------|-------------------| +| GPU | NVIDIA | All | ✅ | ✅ | ✅ | +| MLU | Cambricon | 370, 590 | ✅ | ❌ | ❌ | +| DCU | Hygon | Z100, Z100L | ✅ | ✅ | ❌ | +| Ascend | Huawei | 910B | ✅ | ✅ | ❌ | +| GPU | iluvatar | All | ✅ | ✅ | ❌ | +| DPU | Teco | Checking | In progress | In progress | ❌ | + +- [ ] Support video codec processing +- [ ] Support Multi-Instance GPUs (MIG) +- [ ] Support Flexible scheduling policies + - [x] binpack + - [x] spread + - [ ] numa affinity +- [ ] integrated gpu-operator +- [ ] Rich observability support +- [ ] DRA Support +- [ ] Support Intel GPU device +- [ ] Support AMD GPU device + +## Governance + +The project is governed by a group of [Maintainers and Committers](https://github.com/Project-HAMi/HAMi/blob/master/AUTHORS). How they are selected and govern is outlined in our [Governance Document](https://github.com/Project-HAMi/community/blob/main/governance.md). 
+
+## Contributing
+
+If you're interested in being a contributor and want to get involved in
+developing the HAMi code, please see [CONTRIBUTING](CONTRIBUTING.md) for
+details on submitting patches and the contribution workflow.
+
+## Meeting & Contact
+
+The HAMi community is committed to fostering an open and welcoming environment, with several ways to engage with other users and developers.
+
+If you have any questions, please feel free to reach out to us through the following channels:
+
+- Regular Community Meeting: Friday at 16:00 UTC+8 (Chinese, weekly). [Convert to your timezone](https://www.thetimezoneconverter.com/?t=14%3A30&tz=GMT%2B8&).
+  - [Meeting Notes and Agenda](https://docs.google.com/document/d/1YC6hco03_oXbF9IOUPJ29VWEddmITIKIfSmBX8JtGBw/edit#heading=h.g61sgp7w0d0c)
+  - [Meeting Link](https://meeting.tencent.com/dm/Ntiwq1BICD1P)
+- Email: refer to the [MAINTAINERS.md](MAINTAINERS.md) to find the email addresses of all maintainers. Feel free to contact them via email to report any issues or ask questions.
+- [mailing list](https://groups.google.com/forum/#!forum/hami-project)
+- [slack](https://join.slack.com/t/hami-hsf3791/shared_invite/zt-2gcteqiph-Ls8Atnpky6clrspCAQ_eGQ)
+
+## License
+
+HAMi is under the Apache 2.0 license. See the [LICENSE](LICENSE) file for details. 
+ + +[![FOSSA Status](https://app.fossa.com/api/projects/git%2Bgithub.com%2FProject-HAMi%2FHAMi.svg?type=large)](https://app.fossa.com/projects/git%2Bgithub.com%2FProject-HAMi%2FHAMi?ref=badge_large) \ No newline at end of file diff --git a/README_cn.md b/README_cn.md new file mode 100644 index 000000000..2208c21f6 --- /dev/null +++ b/README_cn.md @@ -0,0 +1,317 @@ + + +# HAMi--异构算力虚拟化中间件 + +[![build status](https://github.com/Project-HAMi/HAMi/actions/workflows/main.yml/badge.svg)](https://github.com/Project-HAMi/HAMi/actions/workflows/build.yml) +[![docker pulls](https://img.shields.io/docker/pulls/4pdosc/k8s-vgpu.svg)](https://hub.docker.com/r/4pdosc/k8s-vgpu) +[![slack](https://img.shields.io/badge/Slack-Join%20Slack-blue)](https://join.slack.com/t/hami-hsf3791/shared_invite/zt-2gcteqiph-Ls8Atnpky6clrspCAQ_eGQ) +[![discuss](https://img.shields.io/badge/Discuss-Ask%20Questions-blue)](https://github.com/Project-HAMi/HAMi/discussions) +[![Contact Me](https://img.shields.io/badge/Contact%20Me-blue)](https://github.com/Project-HAMi/HAMi#contact) + +--- +

+ +

+ +**HAMi is a [Cloud Native Computing Foundation](https://cncf.io/) sandbox project & [Landscape project](https://landscape.cncf.io/?item=orchestration-management--scheduling-orchestration--hami) & [CNAI Landscape project](https://landscape.cncf.io/?group=cnai&item=cnai--general-orchestration--hami).** + +## 支持设备: + +[![英伟达 GPU](https://img.shields.io/badge/Nvidia-GPU-blue)](https://github.com/Project-HAMi/HAMi#preparing-your-gpu-nodes) +[![寒武纪 MLU](https://img.shields.io/badge/寒武纪-Mlu-blue)](docs/cambricon-mlu-support_cn.md) +[![海光 DCU](https://img.shields.io/badge/海光-DCU-blue)](docs/hygon-dcu-support.md) +[![天数智芯 GPU](https://img.shields.io/badge/天数智芯-GPU-blue)](docs/iluvatar-gpu-support_cn.md) + + +## 简介 + +! + +异构算力虚拟化中间件HAMi满足了所有你对于管理异构算力集群所需要的能力,包括: + +***设备复用***: 每个任务可以只占用一部分显卡,多个任务可以共享一张显卡 + +***可限制分配的显存大小***: 你现在可以用显存值(例如3000M)或者显存比例(例如50%)来分配GPU,vGPU调度器会确保任务使用的显存不会超过分配数值 + +***指定设备型号***:当前任务可以通过设置annotation的方式,来选择使用或者不使用某些具体型号的设备 + +***设备指定UUID***:当前任务可以通过设置`annotation`的方式,来选择使用或者不使用指定的设备,比如:"nvidia.com/use-gpuuuid" or "nvidia.com/nouse-gpuuuid" + +***无侵入***: vGPU调度器兼容nvidia官方插件的显卡分配方式,所以安装完毕后,你不需要修改原有的任务文件就可以使用vGPU的功能。当然,你也可以自定义的资源名称 + +***调度策略***: vGPU调度器支持多种调度策略,包括节点、GPU卡纬度的调度策略,可以通过调度器的参数来进行默认设置,同时也可以根据应用场景,通过设置 Pod 的`annotation`来选择,比如:"hami.io/node-scheduler-policy" or "hami.io/gpu-scheduler-policy",两个纬度都支持`binpack`和`spread`两种策略。 + +## 使用场景 + +1. 云原生场景下需要复用算力设备的场合 +2. 需要定制异构算力申请的场合,如申请特定显存大小的虚拟GPU,每个虚拟GPU使用特定比例的算力。 +3. 在多个异构算力节点组成的集群中,任务需要根据自身的显卡需求分配到合适的节点执行。 +4. 显存、计算单元利用率低的情况,如在一张GPU卡上运行10个tf-serving。 +5. 
需要大量小显卡的情况,如教学场景把一张GPU提供给多个学生使用、云平台提供小GPU实例。 + +## 产品设计 + + + +HAMi 包含以下几个组件,一个统一的mutatingwebhook,一个统一的调度器,以及针对各种不同的异构算力设备对应的设备插件和容器内的控制组件,整体的架构特性如上图所示。 + +## 产品特性 + +- 显存资源的硬隔离 + +一个硬隔离的简单展示: +一个使用以下方式定义的任务提交后 +```yaml + resources: + limits: + nvidia.com/gpu: 1 # requesting 1 vGPU + nvidia.com/gpumem: 3000 # Each vGPU contains 3000m device memory +``` +会只有3G可见显存 + +![img](./imgs/hard_limit.jpg) + +- 允许通过指定显存来申请算力设备 +- 算力资源的硬隔离 +- 允许通过指定算力使用比例来申请算力设备 +- 对已有程序零改动 + +## 安装要求 + +* NVIDIA drivers >= 440 +* nvidia-docker version > 2.0 +* docker/containerd/cri-o已配置nvidia作为默认runtime +* Kubernetes version >= 1.16 +* glibc >= 2.17 & glibc < 2.3.0 +* kernel version >= 3.10 +* helm > 3.0 + +## 快速入门 + +### 选择你的集群调度器 + +[![kube-scheduler](https://img.shields.io/badge/kube-scheduler-blue)](https://github.com/Project-HAMi/HAMi#quick-start) +[![volcano-scheduler](https://img.shields.io/badge/volcano-scheduler-orange)](docs/how-to-use-volcano-vgpu.md) + +### 准备节点 + +
配置 nvidia-container-toolkit + +### GPU节点准备 + +以下步骤要在所有GPU节点执行,这份README文档假定GPU节点已经安装NVIDIA驱动。它还假设您已经安装docker或container并且需要将nvidia-container-runtime配置为要使用的默认低级运行时。 + +#### 安装步骤举例: + +```bash +# 加入套件仓库 +distribution=$(. /etc/os-release;echo $ID$VERSION_ID) +curl -s -L https://nvidia.github.io/libnvidia-container/gpgkey | sudo apt-key add - +curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | sudo tee /etc/apt/sources.list.d/libnvidia-container.list + +sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit +``` + +##### 配置docker +你需要在节点上将nvidia runtime做为你的docker runtime预设值。我们将编辑docker daemon的配置文件,此文件通常在`/etc/docker/daemon.json`路径: + +```json +{ + "default-runtime": "nvidia", + "runtimes": { + "nvidia": { + "path": "/usr/bin/nvidia-container-runtime", + "runtimeArgs": [] + } + } +} +``` +```bash +systemctl daemon-reload && systemctl restart docker +``` +##### 配置containerd +你需要在节点上将nvidia runtime做为你的containerd runtime预设值。我们将编辑containerd daemon的配置文件,此文件通常在`/etc/containerd/config.toml`路径 +```toml +version = 2 +[plugins] + [plugins."io.containerd.grpc.v1.cri"] + [plugins."io.containerd.grpc.v1.cri".containerd] + default_runtime_name = "nvidia" + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia] + privileged_without_host_devices = false + runtime_engine = "" + runtime_root = "" + runtime_type = "io.containerd.runc.v2" + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options] + BinaryName = "/usr/bin/nvidia-container-runtime" +``` +```bash +systemctl daemon-reload && systemctl restart containerd +``` + +
+ +
为GPU节点打上标签 + +最后,你需要将所有要使用到的GPU节点打上gpu=on标签,否则该节点不会被调度到 + +```bash +kubectl label nodes {nodeid} gpu=on +``` + +
+ +### 安装,更新与卸载 + +
安装 + +首先使用helm添加我们的 repo + +```bash +helm repo add hami-charts https://project-hami.github.io/HAMi/ +``` + +随后,使用下列指令获取集群服务端版本 + +```bash +kubectl version +``` + +在安装过程中须根据集群服务端版本(上一条指令的结果)指定调度器镜像版本,例如集群服务端版本为1.16.8,则可以使用如下指令进行安装 + +```bash +helm install hami hami-charts/hami --set scheduler.kubeScheduler.imageTag=v1.16.8 -n kube-system +``` + +你可以修改这里的[配置](docs/config_cn.md)来定制安装 + +通过kubectl get pods指令看到 `vgpu-device-plugin` 与 `vgpu-scheduler` 两个pod 状态为*Running* 即为安装成功 + +```bash +kubectl get pods -n kube-system +``` + +
+ +
更新 + +只需要更新helm repo,并重新启动整个Chart即可自动完成更新,最新的镜像会被自动下载 + +```bash +helm uninstall hami -n kube-system +helm repo update +helm install hami hami-charts/hami -n kube-system +``` + +> **注意:** *如果你没有清理完任务就进行热更新的话,正在运行的任务可能会出现段错误等报错.* + +
+ +
卸载 + +```bash +helm uninstall hami -n kube-system +``` + +> **注意:** *卸载组件并不会使正在运行的任务失败.* + +
+ +### 提交任务 + +
任务样例 + +NVIDIA vGPUs 现在能透过资源类型`nvidia.com/gpu`被容器请求: + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod +spec: + containers: + - name: ubuntu-container + image: ubuntu:18.04 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + nvidia.com/gpu: 2 # 请求2个vGPUs + nvidia.com/gpumem: 3000 # 每个vGPU申请3000m显存 (可选,整数类型) + nvidia.com/gpucores: 30 # 每个vGPU的算力为30%实际显卡的算力 (可选,整数类型) +``` + +如果你的任务无法运行在任何一个节点上(例如任务的`nvidia.com/gpu`大于集群中任意一个GPU节点的实际GPU数量),那么任务会卡在`pending`状态 + +现在你可以在容器执行`nvidia-smi`命令,然后比较vGPU和实际GPU显存大小的不同。 + +> **注意:** *1. 如果你使用privileged字段的话,本任务将不会被调度,因为它可见所有的GPU,会对其它任务造成影响.* +> +> *2. 不要设置nodeName字段,类似需求请使用nodeSelector.* + +
+ +#### 更多范例 + +点击 [范例](examples/nvidia) + + +### 监控: + +
访问集群算力视图 + +调度器部署成功后,监控默认自动开启,你可以通过 + +```http +http://{nodeip}:{monitorPort}/metrics +``` + +来获取监控数据,其中monitorPort可以在Values中进行配置,默认为31992 + +grafana dashboard [示例](docs/dashboard_cn.md) + +> **注意** 节点上的vGPU状态只有在其使用vGPU后才会被统计 + +
+ +## [性能测试](docs/benchmark_cn.md) + +## 已知问题 + +- 目前仅支持计算任务,不支持视频编解码处理。 +- 暂时仅支持MIG的"none"和"mixed"模式,暂时不支持single模式 +- 当任务有字段“nodeName“时会出现无法调度的情况,有类似需求的请使用"nodeSelector"代替 +- 我们修改了 `device-plugin` 组件的环境变量,从 `NodeName` 改为 `NODE_NAME`, 如果使用的是镜像版本是 `v2.3.9`, 则可能会出现 `device-plugin` 无法启动的情况,目前有两种修复建议: + - 手动执行`kubectl edit daemonset` 修改 `device-plugin` 的环境变量从`NodeName` 改为 `NODE_NAME`。 + - 使用helm升级到最新版本,最新版`device-plugin`的镜像版本是`v2.3.10`,执行 `helm upgrade hami hami/hami -n kube-system`, 会自动修复。 + +## 开发计划 + +- 目前支持的异构算力设备及其对应的复用特性如下表所示 + +| 产品 | 制造商 | 显存隔离 | 算力隔离 | 多卡支持 | +|-------------|------------|-----------------|---------------|-------------------| +| GPU | NVIDIA | ✅ | ✅ | ✅ | +| MLU | 寒武纪 | ✅ | ❌ | ❌ | +| DCU | 海光 | ✅ | ✅ | ❌ | +| Ascend | 华为 | 开发中 | 开发中 | ❌ | +| GPU | 天数智芯 | 开发中 | 开发中 | ❌ | +| DPU | 太初 | 开发中 | 开发中 | ❌ | + +- [ ] 支持视频编解码处理。 +- [ ] 支持Multi-Instance GPUs (MIG)。 +- [ ] 支持更加灵活的调度策略。 + - [ ] binpack + - [ ] spread + - [ ] numa affinity +- [ ] 与 nvidia gpu-operator 集成。 +- [ ] 更丰富的可观测性能力支持。 +- [ ] 支持 DRA。 +- [ ] 支持 Intel GPU device。 +- [ ] 支持 AMD GPU device。 + + +## 参与贡献 + +如果你想成为 HAMi 的贡献者,请参[考贡献者指南](CONTRIBUTING.md),里面有详细的贡献流程。 diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 000000000..1727ff381 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,25 @@ +# Benchmarking the vGPU scheduler + +## Prerequisites + +### how to build the benchmark image + +```bash +cd HAMi/benchmarks/ai-benchmark + +sh build.sh +``` + +## How to install the official nvidia device plugin + +Please refer to [Quick Start](https://github.com/NVIDIA/k8s-device-plugin?tab=readme-ov-file#quick-start) in the official nvidia device plugin repository. 
+
+## Run the benchmark
+
+```bash
+cd HAMi/benchmarks/deployments
+
+kubectl apply -f job-on-hami.yml
+
+kubectl apply -f job-on-nvidia-device-plugin.yml
+```
\ No newline at end of file
diff --git a/benchmarks/ai-benchmark/Dockerfile b/benchmarks/ai-benchmark/Dockerfile
new file mode 100644
index 000000000..008f089b8
--- /dev/null
+++ b/benchmarks/ai-benchmark/Dockerfile
@@ -0,0 +1,18 @@
+# This Dockerfile is used to build a Docker image for running the AI Benchmark.
+# It is based on the tensorflow/tensorflow:latest-gpu image.
+
+FROM tensorflow/tensorflow:latest-gpu
+
+# Set the working directory to /ai-benchmark
+WORKDIR ai-benchmark
+
+# Update the package list and install git and apt-utils
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends apt-utils git && \
+    rm -rf /var/lib/apt/lists/* && \
+    pip install --no-cache-dir --upgrade pip && \
+    git clone https://github.com/Project-HAMi/ai-benchmark . && \
+    pip install --no-cache-dir -r requirements.txt
+
+# Set the default command to run when the container starts
+CMD ["python", "./main.py"]
\ No newline at end of file
diff --git a/benchmarks/ai-benchmark/build.sh b/benchmarks/ai-benchmark/build.sh
new file mode 100644
index 000000000..917770845
--- /dev/null
+++ b/benchmarks/ai-benchmark/build.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -e
+
+IMAGE="vgpu-benchmark"
+TAG="v0.0.1"
+PLATFORM="linux/amd64"
+
+docker buildx build --push \
+  --platform $PLATFORM \
+  --no-cache \
+  -t "$IMAGE:$TAG" \
+  -f Dockerfile . 
\ No newline at end of file diff --git a/benchmarks/deployments/job-on-hami.yml b/benchmarks/deployments/job-on-hami.yml new file mode 100644 index 000000000..210a9586a --- /dev/null +++ b/benchmarks/deployments/job-on-hami.yml @@ -0,0 +1,20 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: ai-benchmark-on-hami +spec: + template: + metadata: + name: ai-benchmark-on-hami + spec: + containers: + - name: ai-benchmark-on-hami + image: 4pdosc/ai-benchmark:2.4.1-gpu + resources: + requests: + nvidia.com/gpu: 1 + nvidia.com/gpumem-percentage: 50 + limits: + nvidia.com/gpu: 1 + nvidia.com/gpumem-percentage: 50 + restartPolicy: Never \ No newline at end of file diff --git a/benchmarks/deployments/job-on-nvidia-device-plugin.yml b/benchmarks/deployments/job-on-nvidia-device-plugin.yml new file mode 100644 index 000000000..63cc68e73 --- /dev/null +++ b/benchmarks/deployments/job-on-nvidia-device-plugin.yml @@ -0,0 +1,18 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: ai-benchmark-on-official +spec: + template: + metadata: + name: ai-benchmark-on-official + spec: + containers: + - name: ai-benchmark-on-official + image: 4pdosc/ai-benchmark:2.4.1-gpu + resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 + restartPolicy: Never \ No newline at end of file diff --git a/charts/hami/Chart.yaml b/charts/hami/Chart.yaml new file mode 100644 index 000000000..455683cb6 --- /dev/null +++ b/charts/hami/Chart.yaml @@ -0,0 +1,14 @@ +apiVersion: v2 +name: hami +version: 2.3 +kubeVersion: ">= 1.16.0" +description: Heterogeneous AI Computing Virtualization Middleware +keywords: + - vgpu + - gpu +type: application +maintainers: + - name: limengxuan + email: limengxuan@4paradigm.com +appVersion: 2.3 + diff --git a/deployments/4pd-vgpu/templates/NOTES.txt b/charts/hami/templates/NOTES.txt similarity index 100% rename from deployments/4pd-vgpu/templates/NOTES.txt rename to charts/hami/templates/NOTES.txt diff --git a/deployments/4pd-vgpu/templates/_helpers.tpl 
b/charts/hami/templates/_helpers.tpl similarity index 62% rename from deployments/4pd-vgpu/templates/_helpers.tpl rename to charts/hami/templates/_helpers.tpl index 22626a782..fc7944c98 100644 --- a/deployments/4pd-vgpu/templates/_helpers.tpl +++ b/charts/hami/templates/_helpers.tpl @@ -1,7 +1,7 @@ {{/* Expand the name of the chart. */}} -{{- define "4pd-vgpu.name" -}} +{{- define "hami-vgpu.name" -}} {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} {{- end -}} @@ -10,7 +10,7 @@ Create a default fully qualified app name. We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). If release name contains chart name it will be used as a full name. */}} -{{- define "4pd-vgpu.fullname" -}} +{{- define "hami-vgpu.fullname" -}} {{- if .Values.fullnameOverride -}} {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} {{- else }} @@ -26,44 +26,44 @@ If release name contains chart name it will be used as a full name. {{/* The app name for Scheduler */}} -{{- define "4pd-vgpu.scheduler" -}} -{{- printf "%s-scheduler" ( include "4pd-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}} +{{- define "hami-vgpu.scheduler" -}} +{{- printf "%s-scheduler" ( include "hami-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}} {{- end -}} {{/* The app name for DevicePlugin */}} -{{- define "4pd-vgpu.device-plugin" -}} -{{- printf "%s-device-plugin" ( include "4pd-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}} +{{- define "hami-vgpu.device-plugin" -}} +{{- printf "%s-device-plugin" ( include "hami-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}} {{- end -}} {{/* The tls secret name for Scheduler */}} -{{- define "4pd-vgpu.scheduler.tls" -}} -{{- printf "%s-scheduler-tls" ( include "4pd-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}} +{{- define "hami-vgpu.scheduler.tls" -}} +{{- printf "%s-scheduler-tls" ( include "hami-vgpu.fullname" . 
) | trunc 63 | trimSuffix "-" -}} {{- end -}} {{/* The webhook name */}} -{{- define "4pd-vgpu.scheduler.webhook" -}} -{{- printf "%s-webhook" ( include "4pd-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}} +{{- define "hami-vgpu.scheduler.webhook" -}} +{{- printf "%s-webhook" ( include "hami-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}} {{- end -}} {{/* Create chart name and version as used by the chart label. */}} -{{- define "4pd-vgpu.chart" -}} +{{- define "hami-vgpu.chart" -}} {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} {{- end }} {{/* Common labels */}} -{{- define "4pd-vgpu.labels" -}} -helm.sh/chart: {{ include "4pd-vgpu.chart" . }} -{{ include "4pd-vgpu.selectorLabels" . }} +{{- define "hami-vgpu.labels" -}} +helm.sh/chart: {{ include "hami-vgpu.chart" . }} +{{ include "hami-vgpu.selectorLabels" . }} {{- if .Chart.AppVersion }} app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} {{- end }} @@ -73,15 +73,15 @@ app.kubernetes.io/managed-by: {{ .Release.Service }} {{/* Selector labels */}} -{{- define "4pd-vgpu.selectorLabels" -}} -app.kubernetes.io/name: {{ include "4pd-vgpu.name" . }} +{{- define "hami-vgpu.selectorLabels" -}} +app.kubernetes.io/name: {{ include "hami-vgpu.name" . }} app.kubernetes.io/instance: {{ .Release.Name }} {{- end }} {{/* Image registry secret name */}} -{{- define "4pd-vgpu.imagePullSecrets" -}} +{{- define "hami-vgpu.imagePullSecrets" -}} imagePullSecrets: {{ toYaml .Values.imagePullSecrets | nindent 2 }} {{- end }} diff --git a/charts/hami/templates/device-plugin/configmap.yaml b/charts/hami/templates/device-plugin/configmap.yaml new file mode 100644 index 000000000..302252055 --- /dev/null +++ b/charts/hami/templates/device-plugin/configmap.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "hami-vgpu.device-plugin" . }} + labels: + app.kubernetes.io/component: hami-device-plugin + {{- include "hami-vgpu.labels" . 
| nindent 4 }} +data: + config.json: | + { + "nodeconfig": [ + { + "name": "m5-cloudinfra-online02", + "devicememoryscaling": 1.8, + "devicesplitcount": 10, + "migstrategy":"none" + } + ] + } \ No newline at end of file diff --git a/charts/hami/templates/device-plugin/daemonsetnvidia.yaml b/charts/hami/templates/device-plugin/daemonsetnvidia.yaml new file mode 100644 index 000000000..ae7cf694f --- /dev/null +++ b/charts/hami/templates/device-plugin/daemonsetnvidia.yaml @@ -0,0 +1,145 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{ include "hami-vgpu.device-plugin" . }} + labels: + app.kubernetes.io/component: hami-device-plugin + {{- include "hami-vgpu.labels" . | nindent 4 }} + {{- with .Values.global.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} + {{- if .Values.global.annotations }} + annotations: {{ toYaml .Values.global.annotations | nindent 4}} + {{- end }} +spec: + selector: + matchLabels: + app.kubernetes.io/component: hami-device-plugin + {{- include "hami-vgpu.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + app.kubernetes.io/component: hami-device-plugin + hami.io/webhook: ignore + {{- include "hami-vgpu.selectorLabels" . | nindent 8 }} + {{- if .Values.devicePlugin.podAnnotations }} + annotations: {{ toYaml .Values.devicePlugin.podAnnotations | nindent 8 }} + {{- end }} + spec: + {{- if .Values.devicePlugin.runtimeClassName }} + runtimeClassName: {{ .Values.devicePlugin.runtimeClassName }} + {{- end }} + {{- include "hami-vgpu.imagePullSecrets" . | nindent 6}} + # serviceAccountName: + serviceAccountName: {{ include "hami-vgpu.device-plugin" . 
}} + priorityClassName: system-node-critical + hostPID: true + hostNetwork: true + containers: + - name: device-plugin + image: {{ .Values.devicePlugin.image }}:{{ .Values.version }} + imagePullPolicy: {{ .Values.devicePlugin.imagePullPolicy | quote }} + lifecycle: + postStart: + exec: + command: ["/bin/sh","-c", {{ printf "cp -f /k8s-vgpu/lib/nvidia/* %s/vgpu/" .Values.global.gpuHookPath | quote }}] + command: + - nvidia-device-plugin + - --resource-name={{ .Values.resourceName }} + - --mig-strategy={{ .Values.devicePlugin.migStrategy }} + - --device-memory-scaling={{ .Values.devicePlugin.deviceMemoryScaling }} + - --device-cores-scaling={{ .Values.devicePlugin.deviceCoreScaling }} + - --device-split-count={{ .Values.devicePlugin.deviceSplitCount }} + - --disable-core-limit={{ .Values.devicePlugin.disablecorelimit }} + {{- range .Values.devicePlugin.extraArgs }} + - {{ . }} + {{- end }} + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: NVIDIA_MIG_MONITOR_DEVICES + value: all + - name: HOOK_PATH + value: {{ .Values.global.gpuHookPath }} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + add: ["SYS_ADMIN"] + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: lib + mountPath: {{ printf "%s%s" .Values.global.gpuHookPath "/vgpu" }} + - name: usrbin + mountPath: /usrbin + - name: deviceconfig + mountPath: /config + - name: hosttmp + mountPath: /tmp + - name: vgpu-monitor + image: {{ .Values.devicePlugin.image }}:{{ .Values.version }} + imagePullPolicy: {{ .Values.devicePlugin.imagePullPolicy | quote }} + command: ["vGPUmonitor"] + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + add: ["SYS_ADMIN"] + env: + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + - name: NVIDIA_MIG_MONITOR_DEVICES + value: all + - name: HOOK_PATH + value: {{ .Values.global.gpuHookPath }}/vgpu + volumeMounts: + - name: ctrs + mountPath: {{ 
.Values.devicePlugin.monitorctrPath }} + - name: dockers + mountPath: /run/docker + - name: containerds + mountPath: /run/containerd + - name: sysinfo + mountPath: /sysinfo + - name: hostvar + mountPath: /hostvar + volumes: + - name: ctrs + hostPath: + path: {{ .Values.devicePlugin.monitorctrPath }} + - name: hosttmp + hostPath: + path: /tmp + - name: dockers + hostPath: + path: /run/docker + - name: containerds + hostPath: + path: /run/containerd + - name: device-plugin + hostPath: + path: {{ .Values.devicePlugin.pluginPath }} + - name: lib + hostPath: + path: {{ .Values.devicePlugin.libPath }} + - name: usrbin + hostPath: + path: /usr/bin + - name: sysinfo + hostPath: + path: /sys + - name: hostvar + hostPath: + path: /var + - name: deviceconfig + configMap: + name: {{ template "hami-vgpu.device-plugin" . }} + {{- if .Values.devicePlugin.nvidianodeSelector }} + nodeSelector: {{ toYaml .Values.devicePlugin.nvidianodeSelector | nindent 8 }} + {{- end }} + {{- if .Values.devicePlugin.tolerations }} + tolerations: {{ toYaml .Values.devicePlugin.tolerations | nindent 8 }} + {{- end }} diff --git a/charts/hami/templates/device-plugin/monitorrole.yaml b/charts/hami/templates/device-plugin/monitorrole.yaml new file mode 100644 index 000000000..2c5e8e70c --- /dev/null +++ b/charts/hami/templates/device-plugin/monitorrole.yaml @@ -0,0 +1,27 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "hami-vgpu.device-plugin" . 
}}-monitor +rules: + - apiGroups: + - "" + resources: + - pods + verbs: + - get + - create + - watch + - list + - update + - patch + - apiGroups: + - "" + resources: + - nodes + verbs: + - get + - update + - list + - patch + + diff --git a/charts/hami/templates/device-plugin/monitorrolebinding.yaml b/charts/hami/templates/device-plugin/monitorrolebinding.yaml new file mode 100644 index 000000000..3d45e3a10 --- /dev/null +++ b/charts/hami/templates/device-plugin/monitorrolebinding.yaml @@ -0,0 +1,16 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "hami-vgpu.device-plugin" . }} + labels: + app.kubernetes.io/component: "hami-device-plugin" + {{- include "hami-vgpu.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + #name: cluster-admin + name: {{ include "hami-vgpu.device-plugin" . }}-monitor +subjects: + - kind: ServiceAccount + name: {{ include "hami-vgpu.device-plugin" . }} + namespace: {{ .Release.Namespace | quote }} diff --git a/charts/hami/templates/device-plugin/monitorservice.yaml b/charts/hami/templates/device-plugin/monitorservice.yaml new file mode 100644 index 000000000..edfc38034 --- /dev/null +++ b/charts/hami/templates/device-plugin/monitorservice.yaml @@ -0,0 +1,23 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "hami-vgpu.device-plugin" . }}-monitor + labels: + app.kubernetes.io/component: hami-device-plugin + {{- include "hami-vgpu.labels" . 
| nindent 4 }} + {{- if .Values.scheduler.service.labels }} + {{ toYaml .Values.scheduler.service.labels | indent 4 }} + {{- end }} + {{- if .Values.scheduler.service.annotations }} + annotations: {{ toYaml .Values.scheduler.service.annotations | nindent 4 }} + {{- end }} +spec: + externalTrafficPolicy: Local + selector: + app.kubernetes.io/component: hami-device-plugin + type: NodePort + ports: + - name: monitorport + port: {{ .Values.devicePlugin.service.httpPort }} + targetPort: 9394 + nodePort: {{ .Values.devicePlugin.service.httpPort }} \ No newline at end of file diff --git a/charts/hami/templates/device-plugin/monitorserviceaccount.yaml b/charts/hami/templates/device-plugin/monitorserviceaccount.yaml new file mode 100644 index 000000000..076d9dd08 --- /dev/null +++ b/charts/hami/templates/device-plugin/monitorserviceaccount.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "hami-vgpu.device-plugin" . }} + namespace: {{ .Release.Namespace | quote }} + labels: + app.kubernetes.io/component: "hami-device-plugin" + {{- include "hami-vgpu.labels" . | nindent 4 }} diff --git a/charts/hami/templates/scheduler/configmap.yaml b/charts/hami/templates/scheduler/configmap.yaml new file mode 100644 index 000000000..a1766b48b --- /dev/null +++ b/charts/hami/templates/scheduler/configmap.yaml @@ -0,0 +1,88 @@ +{{- if .Values.scheduler.kubeScheduler.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "hami-vgpu.scheduler" . }} + labels: + app.kubernetes.io/component: hami-scheduler + {{- include "hami-vgpu.labels" . 
| nindent 4 }} +data: + config.json: | + { + "kind": "Policy", + "apiVersion": "v1", + "extenders": [ + { + "urlPrefix": "https://127.0.0.1:443", + "filterVerb": "filter", + "bindVerb": "bind", + "enableHttps": true, + "weight": 1, + "nodeCacheCapable": true, + "httpTimeout": 30000000000, + "tlsConfig": { + "insecure": true + }, + "managedResources": [ + { + "name": "{{ .Values.resourceName }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.resourceMem }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.resourceCores }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.resourceMemPercentage }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.resourcePriority }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.mluResourceName }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.dcuResourceName }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.dcuResourceMem }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.dcuResourceCores }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.iluvatarResourceName }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.ascendResourceMem }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.ascendResourceName }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.ascend310PResourceName }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.ascend310PResourceMem }}", + "ignoredByScheduler": true + } + ], + "ignoreable": false + } + ] + } +{{- end }} diff --git a/charts/hami/templates/scheduler/configmapnew.yaml b/charts/hami/templates/scheduler/configmapnew.yaml new file mode 100644 index 000000000..ec483f4c4 --- /dev/null +++ b/charts/hami/templates/scheduler/configmapnew.yaml @@ -0,0 +1,60 @@ +{{- if .Values.scheduler.kubeScheduler.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "hami-vgpu.scheduler" . 
}}-newversion + labels: + app.kubernetes.io/component: hami-scheduler + {{- include "hami-vgpu.labels" . | nindent 4 }} +data: + config.yaml: | + {{- if gt (.Values.scheduler.kubeScheduler.imageTag | substr 3 5| atoi) 25}} + apiVersion: kubescheduler.config.k8s.io/v1 + {{- else }} + apiVersion: kubescheduler.config.k8s.io/v1beta2 + {{- end }} + kind: KubeSchedulerConfiguration + leaderElection: + leaderElect: false + profiles: + - schedulerName: {{ .Values.schedulerName }} + extenders: + - urlPrefix: "https://127.0.0.1:443" + filterVerb: filter + bindVerb: bind + nodeCacheCapable: true + weight: 1 + httpTimeout: 30s + enableHTTPS: true + tlsConfig: + insecure: true + managedResources: + - name: {{ .Values.resourceName }} + ignoredByScheduler: true + - name: {{ .Values.resourceMem }} + ignoredByScheduler: true + - name: {{ .Values.resourceCores }} + ignoredByScheduler: true + - name: {{ .Values.resourceMemPercentage }} + ignoredByScheduler: true + - name: {{ .Values.resourcePriority }} + ignoredByScheduler: true + - name: {{ .Values.mluResourceName }} + ignoredByScheduler: true + - name: {{ .Values.dcuResourceName }} + ignoredByScheduler: true + - name: {{ .Values.dcuResourceMem }} + ignoredByScheduler: true + - name: {{ .Values.dcuResourceCores }} + ignoredByScheduler: true + - name: {{ .Values.iluvatarResourceName }} + ignoredByScheduler: true + - name: {{ .Values.ascendResourceMem }} + ignoredByScheduler: true + - name: {{ .Values.ascendResourceName }} + ignoredByScheduler: true + - name: {{ .Values.ascend310PResourceMem }} + ignoredByScheduler: true + - name: {{ .Values.ascend310PResourceName }} + ignoredByScheduler: true +{{- end }} diff --git a/charts/hami/templates/scheduler/deployment.yaml b/charts/hami/templates/scheduler/deployment.yaml new file mode 100644 index 000000000..c53fd642c --- /dev/null +++ b/charts/hami/templates/scheduler/deployment.yaml @@ -0,0 +1,156 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include 
"hami-vgpu.scheduler" . }} + labels: + app.kubernetes.io/component: hami-scheduler + {{- include "hami-vgpu.labels" . | nindent 4 }} + {{- with .Values.global.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} + {{- if .Values.global.annotations }} + annotations: {{ toYaml .Values.global.annotations | nindent 4}} + {{- end }} +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: hami-scheduler + {{- include "hami-vgpu.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + app.kubernetes.io/component: hami-scheduler + {{- include "hami-vgpu.selectorLabels" . | nindent 8 }} + hami.io/webhook: ignore + {{- if .Values.scheduler.podAnnotations }} + annotations: {{ toYaml .Values.scheduler.podAnnotations | nindent 8 }} + {{- end }} + spec: + {{- include "hami-vgpu.imagePullSecrets" . | nindent 6}} + serviceAccountName: {{ include "hami-vgpu.scheduler" . }} + priorityClassName: system-node-critical + containers: + {{- if .Values.scheduler.kubeScheduler.enabled }} + - name: kube-scheduler + image: {{ .Values.scheduler.kubeScheduler.image }}:{{ .Values.scheduler.kubeScheduler.imageTag }} + imagePullPolicy: {{ .Values.scheduler.kubeScheduler.imagePullPolicy | quote }} + command: + - kube-scheduler + {{- if ge (.Values.scheduler.kubeScheduler.imageTag | substr 3 5| atoi) 22}} + {{- range .Values.scheduler.kubeScheduler.extraNewArgs }} + - {{ . }} + {{- end }} + {{- else }} + - --scheduler-name={{ .Values.schedulerName }} + {{- range .Values.scheduler.kubeScheduler.extraArgs }} + - {{ . 
}} + {{- end }} + {{- end }} + - --leader-elect={{ .Values.scheduler.leaderElect }} + - --leader-elect-resource-name={{ .Values.schedulerName }} + - --leader-elect-resource-namespace={{ .Release.Namespace }} + volumeMounts: + - name: scheduler-config + mountPath: /config + {{- end }} + {{- if .Values.scheduler.livenessProbe }} + livenessProbe: + failureThreshold: 8 + httpGet: + path: /healthz + port: 10259 + scheme: HTTPS + initialDelaySeconds: 10 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 15 + {{- end }} + - name: vgpu-scheduler-extender + image: {{ .Values.scheduler.extender.image }}:{{ .Values.version }} + imagePullPolicy: {{ .Values.scheduler.extender.imagePullPolicy | quote }} + env: + {{- if .Values.global.managedNodeSelectorEnable }} + {{- range $key, $value := .Values.global.managedNodeSelector }} + - name: NODE_SELECTOR_{{ $key | upper | replace "-" "_" }} + value: "{{ $value }}" + {{- end }} + {{- end }} + command: + - scheduler + - --resource-name={{ .Values.resourceName }} + - --resource-mem={{ .Values.resourceMem }} + - --resource-cores={{ .Values.resourceCores }} + - --resource-mem-percentage={{ .Values.resourceMemPercentage }} + - --resource-priority={{ .Values.resourcePriority }} + - --http_bind=0.0.0.0:443 + - --cert_file=/tls/tls.crt + - --key_file=/tls/tls.key + - --scheduler-name={{ .Values.schedulerName }} + - --metrics-bind-address={{ .Values.scheduler.metricsBindAddress }} + - --default-mem={{ .Values.scheduler.defaultMem }} + - --default-gpu={{ .Values.scheduler.defaultGPUNum }} + - --default-cores={{ .Values.scheduler.defaultCores }} + - --iluvatar-memory={{ .Values.iluvatarResourceMem }} + - --iluvatar-cores={{ .Values.iluvatarResourceCore }} + - --cambricon-mlu-name={{ .Values.mluResourceName }} + - --cambricon-mlu-memory={{ .Values.mluResourceMem }} + - --cambricon-mlu-cores={{ .Values.mluResourceCores }} + - --ascend-name={{ .Values.ascendResourceName }} + - --ascend-memory={{ .Values.ascendResourceMem }} + - 
--ascend310p-name={{ .Values.ascend310PResourceName }} + - --ascend310p-memory={{ .Values.ascend310PResourceMem }} + - --overwrite-env={{ .Values.scheduler.overwriteEnv }} + - --node-scheduler-policy={{ .Values.scheduler.defaultSchedulerPolicy.nodeSchedulerPolicy }} + - --gpu-scheduler-policy={{ .Values.scheduler.defaultSchedulerPolicy.gpuSchedulerPolicy }} + {{- if .Values.scheduler.nodeLabelSelector }} + - --node-label-selector={{- $first := true -}} + {{- range $key, $value := .Values.scheduler.nodeLabelSelector -}} + {{- if not $first }},{{ end -}} + {{- $key }}={{ $value -}} + {{- $first = false -}} + {{- end -}} + {{- end }} + {{- range .Values.scheduler.extender.extraArgs }} + - {{ . }} + {{- end }} + ports: + - name: http + containerPort: 443 + protocol: TCP + volumeMounts: + - name: tls-config + mountPath: /tls + {{- if .Values.scheduler.livenessProbe }} + livenessProbe: + httpGet: + path: /healthz + port: 443 + scheme: HTTPS + initialDelaySeconds: 10 + periodSeconds: 10 + failureThreshold: 3 + timeoutSeconds: 5 + {{- end }} + volumes: + - name: tls-config + secret: + secretName: {{ template "hami-vgpu.scheduler.tls" . }} + {{- if .Values.scheduler.kubeScheduler.enabled }} + - name: scheduler-config + configMap: + {{- if ge (.Values.scheduler.kubeScheduler.imageTag | substr 3 5| atoi) 22 }} + name: {{ template "hami-vgpu.scheduler" . }}-newversion + {{- else }} + name: {{ template "hami-vgpu.scheduler" . 
}} + {{- end }} + {{- end }} + {{- if .Values.scheduler.nodeSelector }} + nodeSelector: {{ toYaml .Values.scheduler.nodeSelector | nindent 8 }} + {{- end }} + {{- if .Values.scheduler.tolerations }} + tolerations: {{ toYaml .Values.scheduler.tolerations | nindent 8 }} + {{- end }} + {{- if .Values.scheduler.nodeName }} + nodeName: {{ .Values.scheduler.nodeName }} + {{- end }} diff --git a/deployments/4pd-vgpu/templates/scheduler/job-patch/clusterrole.yaml b/charts/hami/templates/scheduler/job-patch/clusterrole.yaml similarity index 80% rename from deployments/4pd-vgpu/templates/scheduler/job-patch/clusterrole.yaml rename to charts/hami/templates/scheduler/job-patch/clusterrole.yaml index eb8bb33fc..ef6d986b1 100644 --- a/deployments/4pd-vgpu/templates/scheduler/job-patch/clusterrole.yaml +++ b/charts/hami/templates/scheduler/job-patch/clusterrole.yaml @@ -1,12 +1,12 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: {{ include "4pd-vgpu.fullname" . }}-admission + name: {{ include "hami-vgpu.fullname" . }}-admission annotations: "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded labels: - {{- include "4pd-vgpu.labels" . | nindent 4 }} + {{- include "hami-vgpu.labels" . | nindent 4 }} app.kubernetes.io/component: admission-webhook rules: - apiGroups: @@ -22,5 +22,5 @@ rules: resources: ['podsecuritypolicies'] verbs: ['use'] resourceNames: - - {{ include "4pd-vgpu.fullname" . }}-admission + - {{ include "hami-vgpu.fullname" . 
}}-admission {{- end }} diff --git a/deployments/4pd-vgpu/templates/scheduler/job-patch/clusterrolebinding.yaml b/charts/hami/templates/scheduler/job-patch/clusterrolebinding.yaml similarity index 66% rename from deployments/4pd-vgpu/templates/scheduler/job-patch/clusterrolebinding.yaml rename to charts/hami/templates/scheduler/job-patch/clusterrolebinding.yaml index cbff8d4be..469419e4a 100644 --- a/deployments/4pd-vgpu/templates/scheduler/job-patch/clusterrolebinding.yaml +++ b/charts/hami/templates/scheduler/job-patch/clusterrolebinding.yaml @@ -1,18 +1,18 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: {{ include "4pd-vgpu.fullname" . }}-admission + name: {{ include "hami-vgpu.fullname" . }}-admission annotations: "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded labels: - {{- include "4pd-vgpu.labels" . | nindent 4 }} + {{- include "hami-vgpu.labels" . | nindent 4 }} app.kubernetes.io/component: admission-webhook roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: {{ include "4pd-vgpu.fullname" . }}-admission + name: {{ include "hami-vgpu.fullname" . }}-admission subjects: - kind: ServiceAccount - name: {{ include "4pd-vgpu.fullname" . }}-admission + name: {{ include "hami-vgpu.fullname" . 
}}-admission namespace: {{ .Release.Namespace | quote }} diff --git a/deployments/4pd-vgpu/templates/scheduler/job-patch/job-createSecret.yaml b/charts/hami/templates/scheduler/job-patch/job-createSecret.yaml similarity index 60% rename from deployments/4pd-vgpu/templates/scheduler/job-patch/job-createSecret.yaml rename to charts/hami/templates/scheduler/job-patch/job-createSecret.yaml index 0cc35df1a..63bab0b82 100644 --- a/deployments/4pd-vgpu/templates/scheduler/job-patch/job-createSecret.yaml +++ b/charts/hami/templates/scheduler/job-patch/job-createSecret.yaml @@ -1,12 +1,12 @@ apiVersion: batch/v1 kind: Job metadata: - name: {{ include "4pd-vgpu.fullname" . }}-admission-create + name: {{ include "hami-vgpu.fullname" . }}-admission-create annotations: "helm.sh/hook": pre-install,pre-upgrade "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded labels: - {{- include "4pd-vgpu.labels" . | nindent 4 }} + {{- include "hami-vgpu.labels" . | nindent 4 }} app.kubernetes.io/component: admission-webhook spec: {{- if .Capabilities.APIVersions.Has "batch/v1alpha1" }} @@ -15,32 +15,40 @@ spec: {{- end }} template: metadata: - name: {{ include "4pd-vgpu.fullname" . }}-admission-create + name: {{ include "hami-vgpu.fullname" . }}-admission-create {{- if .Values.scheduler.patch.podAnnotations }} annotations: {{ toYaml .Values.scheduler.patch.podAnnotations | nindent 8 }} {{- end }} labels: - {{- include "4pd-vgpu.labels" . | nindent 8 }} + {{- include "hami-vgpu.labels" . | nindent 8 }} app.kubernetes.io/component: admission-webhook - 4pd.io/webhook: ignore + hami.io/webhook: ignore spec: - {{- include "4pd-vgpu.imagePullSecrets" . | nindent 6}} + {{- include "hami-vgpu.imagePullSecrets" . 
| nindent 6}} {{- if .Values.scheduler.patch.priorityClassName }} priorityClassName: {{ .Values.scheduler.patch.priorityClassName }} {{- end }} containers: - name: create + {{- if ge (.Values.scheduler.kubeScheduler.imageTag | substr 3 5| atoi) 22 }} + image: {{ .Values.scheduler.patch.imageNew }} + {{- else }} image: {{ .Values.scheduler.patch.image }} + {{- end }} imagePullPolicy: {{ .Values.scheduler.patch.imagePullPolicy }} args: - create - --cert-name=tls.crt - --key-name=tls.key - - --host={{ printf "%s.%s.svc,127.0.0.1" (include "4pd-vgpu.scheduler" .) .Release.Namespace }} + {{- if .Values.scheduler.customWebhook.enabled }} + - --host={{ printf "%s.%s.svc,127.0.0.1,%s" (include "hami-vgpu.scheduler" .) .Release.Namespace .Values.scheduler.customWebhook.host}} + {{- else }} + - --host={{ printf "%s.%s.svc,127.0.0.1" (include "hami-vgpu.scheduler" .) .Release.Namespace }} + {{- end }} - --namespace={{ .Release.Namespace }} - - --secret-name={{ include "4pd-vgpu.scheduler.tls" . }} + - --secret-name={{ include "hami-vgpu.scheduler.tls" . }} restartPolicy: OnFailure - serviceAccountName: {{ include "4pd-vgpu.fullname" . }}-admission + serviceAccountName: {{ include "hami-vgpu.fullname" . }}-admission {{- if .Values.scheduler.patch.nodeSelector }} nodeSelector: {{ toYaml .Values.scheduler.patch.nodeSelector | nindent 8 }} {{- end }} diff --git a/deployments/4pd-vgpu/templates/scheduler/job-patch/job-patchWebhook.yaml b/charts/hami/templates/scheduler/job-patch/job-patchWebhook.yaml similarity index 67% rename from deployments/4pd-vgpu/templates/scheduler/job-patch/job-patchWebhook.yaml rename to charts/hami/templates/scheduler/job-patch/job-patchWebhook.yaml index 2700546bf..a26fd4261 100644 --- a/deployments/4pd-vgpu/templates/scheduler/job-patch/job-patchWebhook.yaml +++ b/charts/hami/templates/scheduler/job-patch/job-patchWebhook.yaml @@ -1,12 +1,12 @@ apiVersion: batch/v1 kind: Job metadata: - name: {{ include "4pd-vgpu.fullname" . 
}}-admission-patch + name: {{ include "hami-vgpu.fullname" . }}-admission-patch annotations: "helm.sh/hook": post-install,post-upgrade "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded labels: - {{- include "4pd-vgpu.labels" . | nindent 4 }} + {{- include "hami-vgpu.labels" . | nindent 4 }} app.kubernetes.io/component: admission-webhook spec: {{- if .Capabilities.APIVersions.Has "batch/v1alpha1" }} @@ -15,32 +15,35 @@ spec: {{- end }} template: metadata: - name: {{ include "4pd-vgpu.fullname" . }}-admission-patch + name: {{ include "hami-vgpu.fullname" . }}-admission-patch {{- if .Values.scheduler.patch.podAnnotations }} annotations: {{ toYaml .Values.scheduler.patch.podAnnotations | nindent 8 }} {{- end }} labels: - {{- include "4pd-vgpu.labels" . | nindent 8 }} + {{- include "hami-vgpu.labels" . | nindent 8 }} app.kubernetes.io/component: admission-webhook - 4pd.io/webhook: ignore + hami.io/webhook: ignore spec: - {{- include "4pd-vgpu.imagePullSecrets" . | nindent 6}} + {{- include "hami-vgpu.imagePullSecrets" . | nindent 6}} {{- if .Values.scheduler.patch.priorityClassName }} priorityClassName: {{ .Values.scheduler.patch.priorityClassName }} {{- end }} containers: - name: patch + {{- if ge (.Values.scheduler.kubeScheduler.imageTag | substr 3 5| atoi) 22 }} + image: {{ .Values.scheduler.patch.imageNew }} + {{- else }} image: {{ .Values.scheduler.patch.image }} + {{- end }} imagePullPolicy: {{ .Values.scheduler.patch.imagePullPolicy }} args: - patch - - --webhook-name={{ include "4pd-vgpu.scheduler.webhook" . }} + - --webhook-name={{ include "hami-vgpu.scheduler.webhook" . }} - --namespace={{ .Release.Namespace }} - --patch-validating=false - - --secret-name={{ include "4pd-vgpu.scheduler.tls" . }} - - --patch-failure-policy=Fail + - --secret-name={{ include "hami-vgpu.scheduler.tls" . }} restartPolicy: OnFailure - serviceAccountName: {{ include "4pd-vgpu.fullname" . }}-admission + serviceAccountName: {{ include "hami-vgpu.fullname" . 
}}-admission {{- if .Values.scheduler.patch.nodeSelector }} nodeSelector: {{ toYaml .Values.scheduler.patch.nodeSelector | nindent 8 }} {{- end }} diff --git a/deployments/4pd-vgpu/templates/scheduler/job-patch/psp.yaml b/charts/hami/templates/scheduler/job-patch/psp.yaml similarity index 80% rename from deployments/4pd-vgpu/templates/scheduler/job-patch/psp.yaml rename to charts/hami/templates/scheduler/job-patch/psp.yaml index 9a616abd7..5716585b4 100644 --- a/deployments/4pd-vgpu/templates/scheduler/job-patch/psp.yaml +++ b/charts/hami/templates/scheduler/job-patch/psp.yaml @@ -1,12 +1,13 @@ +{{- if .Values.podSecurityPolicy.enabled }} apiVersion: policy/v1beta1 kind: PodSecurityPolicy metadata: - name: {{ include "4pd-vgpu.fullname" . }}-admission + name: {{ include "hami-vgpu.fullname" . }}-admission annotations: "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded labels: - {{- include "4pd-vgpu.labels" . | nindent 4 }} + {{- include "hami-vgpu.labels" . | nindent 4 }} app.kubernetes.io/component: admission-webhook spec: allowPrivilegeEscalation: false @@ -32,3 +33,4 @@ spec: - projected - secret - downwardAPI +{{- end }} diff --git a/deployments/4pd-vgpu/templates/scheduler/job-patch/role.yaml b/charts/hami/templates/scheduler/job-patch/role.yaml similarity index 77% rename from deployments/4pd-vgpu/templates/scheduler/job-patch/role.yaml rename to charts/hami/templates/scheduler/job-patch/role.yaml index c5bb7e6b0..7a77cbd89 100644 --- a/deployments/4pd-vgpu/templates/scheduler/job-patch/role.yaml +++ b/charts/hami/templates/scheduler/job-patch/role.yaml @@ -1,12 +1,12 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: - name: {{ include "4pd-vgpu.fullname" . }}-admission + name: {{ include "hami-vgpu.fullname" . 
}}-admission annotations: "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded labels: - {{- include "4pd-vgpu.labels" . | nindent 4 }} + {{- include "hami-vgpu.labels" . | nindent 4 }} app.kubernetes.io/component: admission-webhook rules: - apiGroups: diff --git a/deployments/4pd-vgpu/templates/scheduler/job-patch/rolebinding.yaml b/charts/hami/templates/scheduler/job-patch/rolebinding.yaml similarity index 66% rename from deployments/4pd-vgpu/templates/scheduler/job-patch/rolebinding.yaml rename to charts/hami/templates/scheduler/job-patch/rolebinding.yaml index d60c77dd9..955ffe8f4 100644 --- a/deployments/4pd-vgpu/templates/scheduler/job-patch/rolebinding.yaml +++ b/charts/hami/templates/scheduler/job-patch/rolebinding.yaml @@ -1,18 +1,18 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: - name: {{ include "4pd-vgpu.fullname" . }}-admission + name: {{ include "hami-vgpu.fullname" . }}-admission annotations: "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded labels: - {{- include "4pd-vgpu.labels" . | nindent 4 }} + {{- include "hami-vgpu.labels" . | nindent 4 }} app.kubernetes.io/component: admission-webhook roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: {{ include "4pd-vgpu.fullname" . }}-admission + name: {{ include "hami-vgpu.fullname" . }}-admission subjects: - kind: ServiceAccount - name: {{ include "4pd-vgpu.fullname" . }}-admission + name: {{ include "hami-vgpu.fullname" . 
}}-admission namespace: {{ .Release.Namespace | quote }} diff --git a/deployments/4pd-vgpu/templates/scheduler/job-patch/serviceaccount.yaml b/charts/hami/templates/scheduler/job-patch/serviceaccount.yaml similarity index 71% rename from deployments/4pd-vgpu/templates/scheduler/job-patch/serviceaccount.yaml rename to charts/hami/templates/scheduler/job-patch/serviceaccount.yaml index 5e4a897ad..813d2b3ee 100644 --- a/deployments/4pd-vgpu/templates/scheduler/job-patch/serviceaccount.yaml +++ b/charts/hami/templates/scheduler/job-patch/serviceaccount.yaml @@ -1,10 +1,10 @@ apiVersion: v1 kind: ServiceAccount metadata: - name: {{ include "4pd-vgpu.fullname" . }}-admission + name: {{ include "hami-vgpu.fullname" . }}-admission annotations: "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded labels: - {{- include "4pd-vgpu.labels" . | nindent 4 }} + {{- include "hami-vgpu.labels" . | nindent 4 }} app.kubernetes.io/component: admission-webhook diff --git a/deployments/4pd-vgpu/templates/scheduler/rolebinding.yaml b/charts/hami/templates/scheduler/rolebinding.yaml similarity index 56% rename from deployments/4pd-vgpu/templates/scheduler/rolebinding.yaml rename to charts/hami/templates/scheduler/rolebinding.yaml index 0cc0c7619..37f3d8693 100644 --- a/deployments/4pd-vgpu/templates/scheduler/rolebinding.yaml +++ b/charts/hami/templates/scheduler/rolebinding.yaml @@ -1,15 +1,15 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: {{ include "4pd-vgpu.scheduler" . }} + name: {{ include "hami-vgpu.scheduler" . }} labels: - app.kubernetes.io/component: "4pd-scheduler" - {{- include "4pd-vgpu.labels" . | nindent 4 }} + app.kubernetes.io/component: "hami-scheduler" + {{- include "hami-vgpu.labels" . 
| nindent 4 }} roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: cluster-admin subjects: - kind: ServiceAccount - name: {{ include "4pd-vgpu.scheduler" . }} + name: {{ include "hami-vgpu.scheduler" . }} namespace: {{ .Release.Namespace | quote }} diff --git a/charts/hami/templates/scheduler/service.yaml b/charts/hami/templates/scheduler/service.yaml new file mode 100644 index 000000000..836991187 --- /dev/null +++ b/charts/hami/templates/scheduler/service.yaml @@ -0,0 +1,30 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "hami-vgpu.scheduler" . }} + labels: + app.kubernetes.io/component: hami-scheduler + {{- include "hami-vgpu.labels" . | nindent 4 }} + {{- if .Values.scheduler.service.labels }} + {{ toYaml .Values.scheduler.service.labels | indent 4 }} + {{- end }} + {{- if .Values.scheduler.service.annotations }} + annotations: {{ toYaml .Values.scheduler.service.annotations | nindent 4 }} + {{- end }} +spec: + type: NodePort + ports: + - name: http + port: {{ .Values.scheduler.service.httpPort }} + targetPort: 443 + nodePort: {{ .Values.scheduler.service.schedulerPort }} + protocol: TCP + - name: monitor + port: {{ .Values.scheduler.service.monitorPort }} + targetPort: 9395 + nodePort: {{ .Values.scheduler.service.monitorPort }} + protocol: TCP + selector: + app.kubernetes.io/component: hami-scheduler + {{- include "hami-vgpu.selectorLabels" . | nindent 4 }} + diff --git a/charts/hami/templates/scheduler/serviceaccount.yaml b/charts/hami/templates/scheduler/serviceaccount.yaml new file mode 100644 index 000000000..c9d129df3 --- /dev/null +++ b/charts/hami/templates/scheduler/serviceaccount.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "hami-vgpu.scheduler" . }} + namespace: {{ .Release.Namespace | quote }} + labels: + app.kubernetes.io/component: "hami-scheduler" + {{- include "hami-vgpu.labels" . 
| nindent 4 }} diff --git a/charts/hami/templates/scheduler/webhook.yaml b/charts/hami/templates/scheduler/webhook.yaml new file mode 100644 index 000000000..314b9255c --- /dev/null +++ b/charts/hami/templates/scheduler/webhook.yaml @@ -0,0 +1,51 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: {{ include "hami-vgpu.scheduler.webhook" . }} +webhooks: + - admissionReviewVersions: + - v1beta1 + clientConfig: + {{- if .Values.scheduler.customWebhook.enabled }} + url: https://{{ .Values.scheduler.customWebhook.host}}:{{.Values.scheduler.customWebhook.port}}{{.Values.scheduler.customWebhook.path}} + {{- else }} + service: + name: {{ include "hami-vgpu.scheduler" . }} + namespace: {{ .Release.Namespace }} + path: /webhook + port: {{ .Values.scheduler.service.httpPort }} + {{- end }} + failurePolicy: {{ .Values.scheduler.mutatingWebhookConfiguration.failurePolicy }} + matchPolicy: Equivalent + name: vgpu.hami.io + namespaceSelector: + matchExpressions: + - key: hami.io/webhook + operator: NotIn + values: + - ignore + {{- if .Values.scheduler.customWebhook.whitelistNamespaces }} + - key: kubernetes.io/metadata.name + operator: NotIn + values: + {{- toYaml .Values.scheduler.customWebhook.whitelistNamespaces | nindent 10 }} + {{- end }} + objectSelector: + matchExpressions: + - key: hami.io/webhook + operator: NotIn + values: + - ignore + reinvocationPolicy: Never + rules: + - apiGroups: + - "" + apiVersions: + - v1 + operations: + - CREATE + resources: + - pods + scope: '*' + sideEffects: None + timeoutSeconds: 10 diff --git a/charts/hami/values.yaml b/charts/hami/values.yaml new file mode 100644 index 000000000..65e899f64 --- /dev/null +++ b/charts/hami/values.yaml @@ -0,0 +1,145 @@ +# Default values for hami-vgpu. 
+ +nameOverride: "" +fullnameOverride: "" +imagePullSecrets: [ ] +version: "v2.3.13" + +#Nvidia GPU Parameters +resourceName: "nvidia.com/gpu" +resourceMem: "nvidia.com/gpumem" +resourceMemPercentage: "nvidia.com/gpumem-percentage" +resourceCores: "nvidia.com/gpucores" +resourcePriority: "nvidia.com/priority" + +#MLU Parameters +mluResourceName: "cambricon.com/vmlu" +mluResourceMem: "cambricon.com/mlu.smlu.vmemory" +mluResourceCores: "cambricon.com/mlu.smlu.vcore" + +#Hygon DCU Parameters +dcuResourceName: "hygon.com/dcunum" +dcuResourceMem: "hygon.com/dcumem" +dcuResourceCores: "hygon.com/dcucores" + +#Iluvatar GPU Parameters +iluvatarResourceName: "iluvatar.ai/vgpu" +iluvatarResourceMem: "iluvatar.ai/vcuda-memory" +iluvatarResourceCore: "iluvatar.ai/vcuda-core" + +#Ascend 910B Parameters +ascendResourceName: "huawei.com/Ascend910" +ascendResourceMem: "huawei.com/Ascend910-memory" + +#Ascend 310P Parameters +ascend310PResourceName: "huawei.com/Ascend310P" +ascend310PResourceMem: "huawei.com/Ascend310P-memory" + + +schedulerName: "hami-scheduler" + +podSecurityPolicy: + enabled: false + +global: + gpuHookPath: /usr/local + labels: {} + annotations: {} + managedNodeSelectorEnable: false + managedNodeSelector: + usage: "gpu" + + +scheduler: + # @param nodeName defines the node name and the nvidia-vgpu-scheduler-scheduler will schedule to the node. + # if we install the nvidia-vgpu-scheduler-scheduler as default scheduler, we need to remove the k8s default + # scheduler pod from the cluster first, we must specify node name to skip the schedule workflow. + nodeName: "" +# nodeLabelSelector: +# "gpu": "on" + defaultMem: 0 + defaultCores: 0 + defaultGPUNum: 1 + overwriteEnv: "false" + defaultSchedulerPolicy: + nodeSchedulerPolicy: binpack + gpuSchedulerPolicy: spread + metricsBindAddress: ":9395" + livenessProbe: false + leaderElect: true + kubeScheduler: + # @param enabled indicate whether to run kube-scheduler container in the scheduler pod, it's true by default. 
+ enabled: true + imageTag: "v1.20.0" + image: registry.cn-hangzhou.aliyuncs.com/google_containers/kube-scheduler + imagePullPolicy: IfNotPresent + extraNewArgs: + - --config=/config/config.yaml + - -v=4 + extraArgs: + - --policy-config-file=/config/config.json + - -v=4 + extender: + image: "projecthami/hami" + imagePullPolicy: IfNotPresent + extraArgs: + - --debug + - -v=4 + podAnnotations: {} + tolerations: [] + #serviceAccountName: "hami-vgpu-scheduler-sa" + customWebhook: + enabled: false + # must be an endpoint using https. + # should generate host certs here + host: 127.0.0.1 # hostname or ip, can be your node'IP if you want to use https://:/ + port: 31998 + path: /webhook + whitelistNamespaces: + # Specify the namespaces that the webhook will not be applied to. + # - default + # - kube-system + # - istio-system + patch: + image: docker.io/jettech/kube-webhook-certgen:v1.5.2 + imageNew: liangjw/kube-webhook-certgen:v1.1.1 + imagePullPolicy: IfNotPresent + priorityClassName: "" + podAnnotations: {} + nodeSelector: {} + tolerations: [] + runAsUser: 2000 + mutatingWebhookConfiguration: + failurePolicy: Ignore + service: + httpPort: 443 + schedulerPort: 31998 + monitorPort: 31993 + labels: {} + annotations: {} + +devicePlugin: + image: "projecthami/hami" + monitorimage: "projecthami/hami" + monitorctrPath: /usr/local/vgpu/containers + imagePullPolicy: IfNotPresent + deviceSplitCount: 10 + deviceMemoryScaling: 1 + deviceCoreScaling: 1 + runtimeClassName: "" + migStrategy: "none" + disablecorelimit: "false" + extraArgs: + - -v=false + + service: + httpPort: 31992 + + pluginPath: /var/lib/kubelet/device-plugins + libPath: /usr/local/vgpu + + podAnnotations: {} + nvidianodeSelector: + gpu: "on" + tolerations: [] + diff --git a/cmd/device-plugin/main.go b/cmd/device-plugin/main.go deleted file mode 100644 index 78c7c12ef..000000000 --- a/cmd/device-plugin/main.go +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache 
License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package main - -import ( - "4pd.io/k8s-vgpu/pkg/version" - "fmt" - "net" - "syscall" - - "4pd.io/k8s-vgpu/pkg/api" - "4pd.io/k8s-vgpu/pkg/device-plugin" - "4pd.io/k8s-vgpu/pkg/device-plugin/config" - "4pd.io/k8s-vgpu/pkg/util" - "github.com/NVIDIA/go-gpuallocator/gpuallocator" - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml" - "github.com/fsnotify/fsnotify" - "github.com/spf13/cobra" - "github.com/spf13/viper" - "google.golang.org/grpc" - "k8s.io/klog/v2" - pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" -) - -var ( - failOnInitErrorFlag bool - //nvidiaDriverRootFlag string - //enableLegacyPreferredFlag bool - - rootCmd = &cobra.Command{ - Use: "device-plugin", - Short: "kubernetes vgpu device-plugin", - Run: func(cmd *cobra.Command, args []string) { - if err := start(); err != nil { - klog.Fatal(err) - } - }, - } -) - -func init() { - // https://github.com/spf13/viper/issues/461 - viper.BindEnv("node-name", "NODE_NAME") - - rootCmd.Flags().SortFlags = false - rootCmd.PersistentFlags().SortFlags = false - - rootCmd.Flags().BoolVar(&failOnInitErrorFlag, "fail-on-init-error", true, "fail the plugin if an error is encountered during initialization, otherwise block indefinitely") - rootCmd.Flags().StringVar(&config.RuntimeSocketFlag, "runtime-socket", "/var/lib/vgpu/vgpu.sock", "runtime socket") - rootCmd.Flags().UintVar(&config.DeviceSplitCount, "device-split-count", 2, "the number for NVIDIA device split") - 
rootCmd.Flags().Float64Var(&config.DeviceMemoryScaling, "device-memory-scaling", 1.0, "the ratio for NVIDIA device memory scaling") - rootCmd.Flags().Float64Var(&config.DeviceCoresScaling, "device-cores-scaling", 1.0, "the ratio for NVIDIA device cores scaling") - rootCmd.Flags().StringVar(&config.SchedulerEndpoint, "scheduler-endpoint", "127.0.0.1:9090", "scheduler extender endpoint") - rootCmd.Flags().IntVar(&config.SchedulerTimeout, "scheduler-timeout", 10, "scheduler connection timeout") - rootCmd.Flags().StringVar(&config.NodeName, "node-name", viper.GetString("node-name"), "node name") - - rootCmd.PersistentFlags().AddGoFlagSet(util.GlobalFlagSet()) - rootCmd.AddCommand(version.VersionCmd) -} - -func start() error { - klog.Info("Loading NVML") - if err := nvml.Init(); err != nil { - klog.Infof("Failed to initialize NVML: %v.", err) - klog.Infof("If this is a GPU node, did you set the docker default runtime to `nvidia`?") - klog.Infof("You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites") - klog.Infof("You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start") - klog.Infof("If this is not a GPU node, you should set up a toleration or nodeSelector to only deploy this plugin on GPU nodes") - if failOnInitErrorFlag { - return fmt.Errorf("failed to initialize NVML: %v", err) - } - select {} - } - defer func() { klog.Info("Shutdown of NVML returned:", nvml.Shutdown()) }() - - klog.Info("Starting FS watcher.") - watcher, err := NewFSWatcher(pluginapi.DevicePluginPath) - if err != nil { - return fmt.Errorf("failed to create FS watcher: %v", err) - } - defer watcher.Close() - - klog.Info("Starting OS watcher.") - sigs := NewOSWatcher(syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) - - cache := device_plugin.NewDeviceCache() - cache.Start() - defer cache.Stop() - register := device_plugin.NewDeviceRegister(cache) - register.Start() - defer register.Stop() - rt := 
device_plugin.NewVGPURuntimeService(cache) - - // start runtime grpc server - lisGrpc, err := net.Listen("unix", config.RuntimeSocketFlag) - if err != nil { - klog.Fatalf("bind unix socket %v failed, %v", err) - } - defer lisGrpc.Close() - runtimeServer := grpc.NewServer() - api.RegisterVGPURuntimeServiceServer(runtimeServer, rt) - go func() { - err := runtimeServer.Serve(lisGrpc) - if err != nil { - klog.Fatal(err) - } - }() - defer runtimeServer.Stop() - - var plugins []*device_plugin.NvidiaDevicePlugin -restart: - // If we are restarting, idempotently stop any running plugins before - // recreating them below. - for _, p := range plugins { - p.Stop() - } - - klog.Info("Retreiving plugins.") - plugins = []*device_plugin.NvidiaDevicePlugin{ - device_plugin.NewNvidiaDevicePlugin( - util.ResourceName, - cache, - gpuallocator.NewBestEffortPolicy(), - pluginapi.DevicePluginPath+"nvidia-gpu.sock"), - } - - // Loop through all plugins, starting them if they have any devices - // to serve. If even one plugin fails to start properly, try - // starting them all again. - started := 0 - pluginStartError := make(chan struct{}) - for _, p := range plugins { - // Just continue if there are no devices to serve for plugin p. - if len(p.Devices()) == 0 { - continue - } - - // Start the gRPC server for plugin p and connect it with the kubelet. - if err := p.Start(); err != nil { - //klog.SetOutput(os.Stderr) - klog.Info("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?") - klog.Info("You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites") - klog.Info("You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start") - close(pluginStartError) - goto events - } - started++ - } - - if started == 0 { - klog.Info("No devices found. 
Waiting indefinitely.") - } - -events: - // Start an infinite loop, waiting for several indicators to either log - // some messages, trigger a restart of the plugins, or exit the program. - for { - select { - // If there was an error starting any plugins, restart them all. - case <-pluginStartError: - goto restart - - // Detect a kubelet restart by watching for a newly created - // 'pluginapi.KubeletSocket' file. When this occurs, restart this loop, - // restarting all of the plugins in the process. - case event := <-watcher.Events: - if event.Name == pluginapi.KubeletSocket && event.Op&fsnotify.Create == fsnotify.Create { - klog.Infof("inotify: %s created, restarting.", pluginapi.KubeletSocket) - goto restart - } - - // Watch for any other fs errors and log them. - case err := <-watcher.Errors: - klog.Infof("inotify: %s", err) - - // Watch for any signals from the OS. On SIGHUP, restart this loop, - // restarting all of the plugins in the process. On all other - // signals, exit the loop and exit the program. - case s := <-sigs: - switch s { - case syscall.SIGHUP: - klog.Info("Received SIGHUP, restarting.") - goto restart - default: - klog.Infof("Received signal %v, shutting down.", s) - for _, p := range plugins { - p.Stop() - } - break events - } - } - } - return nil -} - -func main() { - if err := rootCmd.Execute(); err != nil { - klog.Fatal(err) - } -} diff --git a/cmd/device-plugin/nvidia/main.go b/cmd/device-plugin/nvidia/main.go new file mode 100644 index 000000000..4dc5c604a --- /dev/null +++ b/cmd/device-plugin/nvidia/main.go @@ -0,0 +1,352 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "encoding/json" + "fmt" + "os" + "syscall" + "time" + + "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/info" + "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin" + "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/rm" + "github.com/Project-HAMi/HAMi/pkg/util" + + spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" + "github.com/fsnotify/fsnotify" + cli "github.com/urfave/cli/v2" + + errorsutil "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/klog/v2" + kubeletdevicepluginv1beta1 "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" +) + +func main() { + var configFile string + + c := cli.NewApp() + c.Name = "NVIDIA Device Plugin" + c.Usage = "NVIDIA device plugin for Kubernetes" + c.Version = info.GetVersionString() + c.Action = func(ctx *cli.Context) error { + return start(ctx, c.Flags) + } + + c.Flags = []cli.Flag{ + &cli.StringFlag{ + Name: "mig-strategy", + Value: spec.MigStrategyNone, + Usage: "the desired strategy for exposing MIG devices on GPUs that support it:\n\t\t[none | single | mixed]", + EnvVars: []string{"MIG_STRATEGY"}, + }, + &cli.BoolFlag{ + Name: "fail-on-init-error", + Value: true, + Usage: "fail the plugin if an error is encountered during initialization, otherwise block indefinitely", + EnvVars: []string{"FAIL_ON_INIT_ERROR"}, + }, + &cli.StringFlag{ + Name: "nvidia-driver-root", + Value: "/", + Usage: "the root path for the NVIDIA driver installation (typical values are '/' or '/run/nvidia/driver')", + EnvVars: []string{"NVIDIA_DRIVER_ROOT"}, + 
}, + &cli.BoolFlag{ + Name: "pass-device-specs", + Value: false, + Usage: "pass the list of DeviceSpecs to the kubelet on Allocate()", + EnvVars: []string{"PASS_DEVICE_SPECS"}, + }, + &cli.StringSliceFlag{ + Name: "device-list-strategy", + Value: cli.NewStringSlice(string(spec.DeviceListStrategyEnvvar)), + Usage: "the desired strategy for passing the device list to the underlying runtime:\n\t\t[envvar | volume-mounts | cdi-annotations]", + EnvVars: []string{"DEVICE_LIST_STRATEGY"}, + }, + &cli.StringFlag{ + Name: "device-id-strategy", + Value: spec.DeviceIDStrategyUUID, + Usage: "the desired strategy for passing device IDs to the underlying runtime:\n\t\t[uuid | index]", + EnvVars: []string{"DEVICE_ID_STRATEGY"}, + }, + &cli.BoolFlag{ + Name: "gds-enabled", + Usage: "ensure that containers are started with NVIDIA_GDS=enabled", + EnvVars: []string{"GDS_ENABLED"}, + }, + &cli.BoolFlag{ + Name: "mofed-enabled", + Usage: "ensure that containers are started with NVIDIA_MOFED=enabled", + EnvVars: []string{"MOFED_ENABLED"}, + }, + &cli.StringFlag{ + Name: "config-file", + Usage: "the path to a config file as an alternative to command line options or environment variables", + Destination: &configFile, + EnvVars: []string{"CONFIG_FILE"}, + }, + &cli.StringFlag{ + Name: "cdi-annotation-prefix", + Value: spec.DefaultCDIAnnotationPrefix, + Usage: "the prefix to use for CDI container annotation keys", + EnvVars: []string{"CDI_ANNOTATION_PREFIX"}, + }, + &cli.StringFlag{ + Name: "nvidia-ctk-path", + Value: spec.DefaultNvidiaCTKPath, + Usage: "the path to use for the nvidia-ctk in the generated CDI specification", + EnvVars: []string{"NVIDIA_CTK_PATH"}, + }, + &cli.StringFlag{ + Name: "container-driver-root", + Value: spec.DefaultContainerDriverRoot, + Usage: "the path where the NVIDIA driver root is mounted in the container; used for generating CDI specifications", + EnvVars: []string{"CONTAINER_DRIVER_ROOT"}, + }, + } + c.Flags = append(c.Flags, addFlags()...) 
+ err := c.Run(os.Args) + if err != nil { + klog.Error(err) + os.Exit(1) + } +} + +func validateFlags(config *spec.Config) error { + _, err := spec.NewDeviceListStrategies(*config.Flags.Plugin.DeviceListStrategy) + if err != nil { + return fmt.Errorf("invalid --device-list-strategy option: %v", err) + } + + if *config.Flags.Plugin.DeviceIDStrategy != spec.DeviceIDStrategyUUID && *config.Flags.Plugin.DeviceIDStrategy != spec.DeviceIDStrategyIndex { + return fmt.Errorf("invalid --device-id-strategy option: %v", *config.Flags.Plugin.DeviceIDStrategy) + } + return nil +} + +func loadConfig(c *cli.Context, flags []cli.Flag) (*spec.Config, error) { + config, err := spec.NewConfig(c, flags) + if err != nil { + return nil, fmt.Errorf("unable to finalize config: %v", err) + } + err = validateFlags(config) + if err != nil { + return nil, fmt.Errorf("unable to validate flags: %v", err) + } + config.Flags.GFD = nil + return config, nil +} + +func start(c *cli.Context, flags []cli.Flag) error { + klog.Info("Starting FS watcher.") + util.NodeName = os.Getenv(util.NodeNameEnvName) + watcher, err := newFSWatcher(kubeletdevicepluginv1beta1.DevicePluginPath) + if err != nil { + return fmt.Errorf("failed to create FS watcher: %v", err) + } + defer watcher.Close() + + /*Loading config files*/ + klog.Infof("Start working on node %s", util.NodeName) + klog.Info("Starting OS watcher.") + sigs := newOSWatcher(syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) + + var restarting bool + var restartTimeout <-chan time.Time + var plugins []plugin.Interface +restart: + // If we are restarting, stop plugins from previous run. 
+ if restarting { + err := stopPlugins(plugins) + if err != nil { + return fmt.Errorf("error stopping plugins from previous run: %v", err) + } + } + + klog.Info("Starting Plugins.") + plugins, restartPlugins, err := startPlugins(c, flags, restarting) + if err != nil { + return fmt.Errorf("error starting plugins: %v", err) + } + + if restartPlugins { + klog.Info("Failed to start one or more plugins. Retrying in 30s...") + restartTimeout = time.After(30 * time.Second) + } + + restarting = true + + // Start an infinite loop, waiting for several indicators to either log + // some messages, trigger a restart of the plugins, or exit the program. + for { + select { + // If the restart timeout has expired, then restart the plugins + case <-restartTimeout: + goto restart + + // Detect a kubelet restart by watching for a newly created + // 'kubeletdevicepluginv1beta1.KubeletSocket' file. When this occurs, restart this loop, + // restarting all of the plugins in the process. + case event := <-watcher.Events: + if event.Name == kubeletdevicepluginv1beta1.KubeletSocket && event.Op&fsnotify.Create == fsnotify.Create { + klog.Infof("inotify: %s created, restarting.", kubeletdevicepluginv1beta1.KubeletSocket) + goto restart + } + + // Watch for any other fs errors and log them. + case err := <-watcher.Errors: + klog.Errorf("inotify: %s", err) + + // Watch for any signals from the OS. On SIGHUP, restart this loop, + // restarting all of the plugins in the process. On all other + // signals, exit the loop and exit the program. 
+ case s := <-sigs: + switch s { + case syscall.SIGHUP: + klog.Info("Received SIGHUP, restarting.") + goto restart + default: + klog.Infof("Received signal \"%v\", shutting down.", s) + goto exit + } + } + } +exit: + err = stopPlugins(plugins) + if err != nil { + return fmt.Errorf("error stopping plugins: %v", err) + } + return nil +} + +func startPlugins(c *cli.Context, flags []cli.Flag, restarting bool) ([]plugin.Interface, bool, error) { + // Load the configuration file + klog.Info("Loading configuration.") + config, err := loadConfig(c, flags) + if err != nil { + return nil, false, fmt.Errorf("unable to load config: %v", err) + } + disableResourceRenamingInConfig(config) + + /*Loading config files*/ + //fmt.Println("NodeName=", config.NodeName) + devConfig, err := generateDeviceConfigFromNvidia(config, c, flags) + if err != nil { + fmt.Printf("failed to load config file %s", err.Error()) + } + + // Update the configuration file with default resources. + klog.Info("Updating config with default resource matching patterns.") + err = rm.AddDefaultResourcesToConfig(&devConfig) + if err != nil { + return nil, false, fmt.Errorf("unable to add default resources to config: %v", err) + } + + // Print the config to the output. + configJSON, err := json.MarshalIndent(devConfig, "", " ") + if err != nil { + return nil, false, fmt.Errorf("failed to marshal config to JSON: %v", err) + } + klog.Infof("\nRunning with config:\n%v", string(configJSON)) + + // Get the set of plugins. + klog.Info("Retrieving plugins.") + pluginManager, err := NewPluginManager(&devConfig) + if err != nil { + return nil, false, fmt.Errorf("error creating plugin manager: %v", err) + } + plugins, err := pluginManager.GetPlugins() + if err != nil { + return nil, false, fmt.Errorf("error getting plugins: %v", err) + } + + // Loop through all plugins, starting them if they have any devices + // to serve. If even one plugin fails to start properly, try + // starting them all again. 
+ started := 0 + for _, p := range plugins { + // Just continue if there are no devices to serve for plugin p. + if len(p.Devices()) == 0 { + continue + } + + // Start the gRPC server for plugin p and connect it with the kubelet. + if err := p.Start(); err != nil { + klog.Error("Could not contact Kubelet. Did you enable the device plugin feature gate?") + klog.Error("You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites") + klog.Error("You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start") + return plugins, true, nil + } + started++ + } + + if started == 0 { + klog.Info("No devices found. Waiting indefinitely.") + } + + return plugins, false, nil +} + +func stopPlugins(plugins []plugin.Interface) error { + klog.Info("Stopping plugins.") + errs := []error{} + for _, p := range plugins { + err := p.Stop() + errs = append(errs, err) + } + return errorsutil.NewAggregate(errs) +} + +// disableResourceRenamingInConfig temporarily disables the resource renaming feature of the plugin. +// We plan to re-enable this feature in a future release. +func disableResourceRenamingInConfig(config *spec.Config) { + // Disable resource renaming through config.Resource + if len(config.Resources.GPUs) > 0 || len(config.Resources.MIGs) > 0 { + klog.Infof("Customizing the 'resources' field is not yet supported in the config. 
Ignoring...") + } + config.Resources.GPUs = nil + config.Resources.MIGs = nil + + // Disable renaming / device selection in Sharing.TimeSlicing.Resources + renameByDefault := config.Sharing.TimeSlicing.RenameByDefault + setsNonDefaultRename := false + setsDevices := false + for i, r := range config.Sharing.TimeSlicing.Resources { + if !renameByDefault && r.Rename != "" { + setsNonDefaultRename = true + config.Sharing.TimeSlicing.Resources[i].Rename = "" + } + if renameByDefault && r.Rename != r.Name.DefaultSharedRename() { + setsNonDefaultRename = true + config.Sharing.TimeSlicing.Resources[i].Rename = r.Name.DefaultSharedRename() + } + if !r.Devices.All { + setsDevices = true + config.Sharing.TimeSlicing.Resources[i].Devices.All = true + config.Sharing.TimeSlicing.Resources[i].Devices.Count = 0 + config.Sharing.TimeSlicing.Resources[i].Devices.List = nil + } + } + if setsNonDefaultRename { + klog.Warning("Setting the 'rename' field in sharing.timeSlicing.resources is not yet supported in the config. Ignoring...") + } + if setsDevices { + klog.Warning("Customizing the 'devices' field in sharing.timeSlicing.resources is not yet supported in the config. Ignoring...") + } +} diff --git a/cmd/device-plugin/nvidia/plugin-manager.go b/cmd/device-plugin/nvidia/plugin-manager.go new file mode 100644 index 000000000..d22fecbd2 --- /dev/null +++ b/cmd/device-plugin/nvidia/plugin-manager.go @@ -0,0 +1,82 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package main + +import ( + "fmt" + + "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/cdi" + "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager" + "github.com/Project-HAMi/HAMi/pkg/util" + + "github.com/NVIDIA/go-nvlib/pkg/nvml" + spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" +) + +// NewPluginManager creates an NVML-based plugin manager. +func NewPluginManager(config *util.DeviceConfig) (manager.Interface, error) { + var err error + switch *config.Flags.MigStrategy { + case spec.MigStrategyNone: + case spec.MigStrategySingle: + case spec.MigStrategyMixed: + default: + return nil, fmt.Errorf("unknown strategy: %v", *config.Flags.MigStrategy) + } + + nvmllib := nvml.New() + + deviceListStrategies, err := spec.NewDeviceListStrategies(*config.Flags.Plugin.DeviceListStrategy) + if err != nil { + return nil, fmt.Errorf("invalid device list strategy: %v", err) + } + + cdiEnabled := deviceListStrategies.IsCDIEnabled() + + cdiHandler, err := cdi.New( + cdi.WithEnabled(cdiEnabled), + cdi.WithDriverRoot(*config.Flags.Plugin.ContainerDriverRoot), + cdi.WithTargetDriverRoot(*config.Flags.NvidiaDriverRoot), + cdi.WithNvidiaCTKPath(*config.Flags.Plugin.NvidiaCTKPath), + cdi.WithNvml(nvmllib), + cdi.WithDeviceIDStrategy(*config.Flags.Plugin.DeviceIDStrategy), + cdi.WithVendor("k8s.device-plugin.nvidia.com"), + cdi.WithGdsEnabled(*config.Flags.GDSEnabled), + cdi.WithMofedEnabled(*config.Flags.MOFEDEnabled), + ) + if err != nil { + return nil, fmt.Errorf("unable to create cdi handler: %v", err) + } + + m, err := manager.New( + manager.WithNVML(nvmllib), + manager.WithCDIEnabled(cdiEnabled), + manager.WithCDIHandler(cdiHandler), + manager.WithConfig(config), + manager.WithFailOnInitError(*config.Flags.FailOnInitError), + manager.WithMigStrategy(*config.Flags.MigStrategy), + ) + if err != nil { + return nil, fmt.Errorf("unable to create plugin manager: %v", err) + } + + if err := m.CreateCDISpecFile(); err 
!= nil { + return nil, fmt.Errorf("unable to create cdi spec file: %v", err) + } + + return m, nil +} diff --git a/cmd/device-plugin/nvidia/vgpucfg.go b/cmd/device-plugin/nvidia/vgpucfg.go new file mode 100644 index 000000000..5ac45cc96 --- /dev/null +++ b/cmd/device-plugin/nvidia/vgpucfg.go @@ -0,0 +1,154 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "encoding/json" + "fmt" + "os" + "strings" + + "github.com/Project-HAMi/HAMi/pkg/util" + + spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" + cli "github.com/urfave/cli/v2" + "k8s.io/klog/v2" +) + +func addFlags() []cli.Flag { + addition := []cli.Flag{ + &cli.StringFlag{ + Name: "node-name", + Value: os.Getenv(util.NodeNameEnvName), + Usage: "node name", + EnvVars: []string{"NodeName"}, + }, + &cli.UintFlag{ + Name: "device-split-count", + Value: 2, + Usage: "the number for NVIDIA device split", + EnvVars: []string{"DEVICE_SPLIT_COUNT"}, + }, + &cli.Float64Flag{ + Name: "device-memory-scaling", + Value: 1.0, + Usage: "the ratio for NVIDIA device memory scaling", + EnvVars: []string{"DEVICE_MEMORY_SCALING"}, + }, + &cli.Float64Flag{ + Name: "device-cores-scaling", + Value: 1.0, + Usage: "the ratio for NVIDIA device cores scaling", + EnvVars: []string{"DEVICE_CORES_SCALING"}, + }, + &cli.BoolFlag{ + Name: "disable-core-limit", + Value: false, + Usage: "If set, the core utilization limit will be ignored", + EnvVars: []string{"DISABLE_CORE_LIMIT"}, + }, + 
&cli.StringFlag{ + Name: "resource-name", + Value: "nvidia.com/gpu", + Usage: "the name of field for number GPU visible in container", + }, + } + return addition +} + +// ptr returns a reference to whatever type is passed into it. +func ptr[T any](x T) *T { + return &x +} + +// updateFromCLIFlag conditionally updates the config flag at 'pflag' to the value of the CLI flag with name 'flagName'. +func updateFromCLIFlag[T any](pflag **T, c *cli.Context, flagName string) { + if c.IsSet(flagName) || *pflag == (*T)(nil) { + switch flag := any(pflag).(type) { + case **string: + *flag = ptr(c.String(flagName)) + case **[]string: + *flag = ptr(c.StringSlice(flagName)) + case **bool: + *flag = ptr(c.Bool(flagName)) + case **float64: + *flag = ptr(c.Float64(flagName)) + case **uint: + *flag = ptr(c.Uint(flagName)) + default: + panic(fmt.Errorf("unsupported flag type for %v: %T", flagName, flag)) + } + } +} + +func readFromConfigFile() error { + jsonbyte, err := os.ReadFile("/config/config.json") + if err != nil { + return err + } + var deviceConfigs util.DevicePluginConfigs + err = json.Unmarshal(jsonbyte, &deviceConfigs) + if err != nil { + return err + } + klog.Infof("Device Plugin Configs: %v", fmt.Sprintf("%v", deviceConfigs)) + for _, val := range deviceConfigs.Nodeconfig { + if strings.Compare(os.Getenv(util.NodeNameEnvName), val.Name) == 0 { + klog.Infof("Reading config from file %s", val.Name) + if val.Devicememoryscaling > 0 { + *util.DeviceMemoryScaling = val.Devicememoryscaling + } + if val.Devicecorescaling > 0 { + *util.DeviceCoresScaling = val.Devicecorescaling + } + if val.Devicesplitcount > 0 { + *util.DeviceSplitCount = val.Devicesplitcount + } + } + } + return nil +} + +func generateDeviceConfigFromNvidia(cfg *spec.Config, c *cli.Context, flags []cli.Flag) (util.DeviceConfig, error) { + devcfg := util.DeviceConfig{} + devcfg.Config = cfg + + klog.Infoln("flags=", flags) + for _, flag := range flags { + for _, n := range flag.Names() { + // Common flags + if 
strings.Compare(n, "device-split-count") == 0 { + updateFromCLIFlag(&util.DeviceSplitCount, c, n) + } + if strings.Compare(n, "device-memory-scaling") == 0 { + updateFromCLIFlag(&util.DeviceMemoryScaling, c, n) + klog.Infoln("DeviceMemoryScaling", *util.DeviceMemoryScaling) + } + if strings.Compare(n, "device-cores-scaling") == 0 { + updateFromCLIFlag(&util.DeviceCoresScaling, c, n) + } + if strings.Compare(n, "disable-core-limit") == 0 { + updateFromCLIFlag(&util.DisableCoreLimit, c, n) + } + if strings.Compare(n, "resource-name") == 0 { + updateFromCLIFlag(&devcfg.ResourceName, c, n) + } + } + } + readFromConfigFile() + return devcfg, nil +} diff --git a/cmd/device-plugin/nvidia/watchers.go b/cmd/device-plugin/nvidia/watchers.go new file mode 100644 index 000000000..6e90ff4ff --- /dev/null +++ b/cmd/device-plugin/nvidia/watchers.go @@ -0,0 +1,48 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "os" + "os/signal" + + "github.com/fsnotify/fsnotify" +) + +func newFSWatcher(files ...string) (*fsnotify.Watcher, error) { + watcher, err := fsnotify.NewWatcher() + if err != nil { + return nil, err + } + + for _, f := range files { + err = watcher.Add(f) + if err != nil { + watcher.Close() + return nil, err + } + } + + return watcher, nil +} + +func newOSWatcher(sigs ...os.Signal) chan os.Signal { + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, sigs...) 
+ + return sigChan +} diff --git a/cmd/device-plugin/watchers.go b/cmd/device-plugin/watchers.go deleted file mode 100644 index ec99eb3ff..000000000 --- a/cmd/device-plugin/watchers.go +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package main - -import ( - "github.com/fsnotify/fsnotify" - "os" - "os/signal" -) - -func NewFSWatcher(files ...string) (*fsnotify.Watcher, error) { - watcher, err := fsnotify.NewWatcher() - if err != nil { - return nil, err - } - - for _, f := range files { - err = watcher.Add(f) - if err != nil { - watcher.Close() - return nil, err - } - } - - return watcher, nil -} - -func NewOSWatcher(sigs ...os.Signal) chan os.Signal { - sigChan := make(chan os.Signal, 1) - signal.Notify(sigChan, sigs...) - - return sigChan -} diff --git a/cmd/scheduler/main.go b/cmd/scheduler/main.go index 01c5df8e4..9dad76c8e 100644 --- a/cmd/scheduler/main.go +++ b/cmd/scheduler/main.go @@ -1,99 +1,101 @@ /* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package main import ( - "4pd.io/k8s-vgpu/pkg/util" - "4pd.io/k8s-vgpu/pkg/version" - "net" - "net/http" + "net/http" + + "github.com/Project-HAMi/HAMi/pkg/device" + "github.com/Project-HAMi/HAMi/pkg/scheduler" + "github.com/Project-HAMi/HAMi/pkg/scheduler/config" + "github.com/Project-HAMi/HAMi/pkg/scheduler/policy" + "github.com/Project-HAMi/HAMi/pkg/scheduler/routes" + "github.com/Project-HAMi/HAMi/pkg/util" + "github.com/Project-HAMi/HAMi/pkg/version" - pb "4pd.io/k8s-vgpu/pkg/api" - "4pd.io/k8s-vgpu/pkg/scheduler" - "4pd.io/k8s-vgpu/pkg/scheduler/config" - "4pd.io/k8s-vgpu/pkg/scheduler/routes" - "github.com/julienschmidt/httprouter" - "github.com/spf13/cobra" - "google.golang.org/grpc" - "k8s.io/klog/v2" + "github.com/julienschmidt/httprouter" + "github.com/spf13/cobra" + klog "k8s.io/klog/v2" ) //var version string var ( - tlsKeyFile string - tlsCertFile string - rootCmd = &cobra.Command{ - Use: "scheduler", - Short: "kubernetes vgpu scheduler", - Run: func(cmd 
*cobra.Command, args []string) { - start() - }, - } + sher *scheduler.Scheduler + tlsKeyFile string + tlsCertFile string + rootCmd = &cobra.Command{ + Use: "scheduler", + Short: "kubernetes vgpu scheduler", + Run: func(cmd *cobra.Command, args []string) { + start() + }, + } ) func init() { - rootCmd.Flags().SortFlags = false - rootCmd.PersistentFlags().SortFlags = false - - rootCmd.Flags().StringVar(&config.GrpcBind, "grpc_bind", "127.0.0.1:9090", "grpc server bind address") - rootCmd.Flags().StringVar(&config.HttpBind, "http_bind", "127.0.0.1:8080", "http server bind address") - rootCmd.Flags().StringVar(&tlsCertFile, "cert_file", "", "tls cert file") - rootCmd.Flags().StringVar(&tlsKeyFile, "key_file", "", "tls key file") - rootCmd.Flags().StringVar(&config.SchedulerName, "scheduler-name", "", "the name to be added to pod.spec.schedulerName if not empty") + rootCmd.Flags().SortFlags = false + rootCmd.PersistentFlags().SortFlags = false - rootCmd.PersistentFlags().AddGoFlagSet(util.GlobalFlagSet()) - rootCmd.AddCommand(version.VersionCmd) + rootCmd.Flags().StringVar(&config.HTTPBind, "http_bind", "127.0.0.1:8080", "http server bind address") + rootCmd.Flags().StringVar(&tlsCertFile, "cert_file", "", "tls cert file") + rootCmd.Flags().StringVar(&tlsKeyFile, "key_file", "", "tls key file") + rootCmd.Flags().StringVar(&config.SchedulerName, "scheduler-name", "", "the name to be added to pod.spec.schedulerName if not empty") + rootCmd.Flags().Int32Var(&config.DefaultMem, "default-mem", 0, "default gpu device memory to allocate") + rootCmd.Flags().Int32Var(&config.DefaultCores, "default-cores", 0, "default gpu core percentage to allocate") + rootCmd.Flags().Int32Var(&config.DefaultResourceNum, "default-gpu", 1, "default gpu to allocate") + rootCmd.Flags().StringVar(&config.NodeSchedulerPolicy, "node-scheduler-policy", policy.NodeSchedulerPolicyBinpack.String(), "node scheduler policy") + rootCmd.Flags().StringVar(&config.GPUSchedulerPolicy, "gpu-scheduler-policy", 
policy.GPUSchedulerPolicySpread.String(), "GPU scheduler policy") + rootCmd.Flags().StringVar(&config.MetricsBindAddress, "metrics-bind-address", ":9395", "The TCP address that the scheduler should bind to for serving prometheus metrics(e.g. 127.0.0.1:9395, :9395)") + rootCmd.Flags().StringToStringVar(&config.NodeLabelSelector, "node-label-selector", nil, "key=value pairs separated by commas") + rootCmd.PersistentFlags().AddGoFlagSet(device.GlobalFlagSet()) + rootCmd.AddCommand(version.VersionCmd) + rootCmd.Flags().AddGoFlagSet(util.InitKlogFlags()) } func start() { - sher := scheduler.NewScheduler() - sher.Start() - defer sher.Stop() + sher = scheduler.NewScheduler() + sher.Start() + defer sher.Stop() - // start grpc server - lisGrpc, _ := net.Listen("tcp", config.GrpcBind) - defer lisGrpc.Close() - s := grpc.NewServer() - pb.RegisterDeviceServiceServer(s, sher) - go func() { - err := s.Serve(lisGrpc) - if err != nil { - klog.Fatal(err) - } - }() + // start monitor metrics + go sher.RegisterFromNodeAnnotations() + go initMetrics(config.MetricsBindAddress) - // start http server - router := httprouter.New() - router.POST("/filter", routes.PredicateRoute(sher)) - router.POST("/webhook", routes.WebHookRoute()) - klog.Info("listen on ", config.HttpBind) - if len(tlsCertFile) == 0 || len(tlsKeyFile) == 0 { - if err := http.ListenAndServe(config.HttpBind, router); err != nil { - klog.Fatal("Listen and Serve error, ", err) - } - } else { - if err := http.ListenAndServeTLS(config.HttpBind, tlsCertFile, tlsKeyFile, router); err != nil { - klog.Fatal("Listen and Serve error, ", err) - } - } + // start http server + router := httprouter.New() + router.POST("/filter", routes.PredicateRoute(sher)) + router.POST("/bind", routes.Bind(sher)) + router.POST("/webhook", routes.WebHookRoute()) + router.GET("/healthz", routes.HealthzRoute()) + klog.Info("listen on ", config.HTTPBind) + if len(tlsCertFile) == 0 || len(tlsKeyFile) == 0 { + if err := http.ListenAndServe(config.HTTPBind, 
router); err != nil { + klog.Fatal("Listen and Serve error, ", err) + } + } else { + if err := http.ListenAndServeTLS(config.HTTPBind, tlsCertFile, tlsKeyFile, router); err != nil { + klog.Fatal("Listen and Serve error, ", err) + } + } } func main() { - if err := rootCmd.Execute(); err != nil { - klog.Fatal(err) - } + if err := rootCmd.Execute(); err != nil { + klog.Fatal(err) + } } diff --git a/cmd/scheduler/metrics.go b/cmd/scheduler/metrics.go new file mode 100644 index 000000000..5305bd6d8 --- /dev/null +++ b/cmd/scheduler/metrics.go @@ -0,0 +1,240 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "fmt" + "log" + "net/http" + "strings" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + + klog "k8s.io/klog/v2" +) + +// ClusterManager is an example for a system that might have been built without +// Prometheus in mind. It models a central manager of jobs running in a +// cluster. Thus, we implement a custom Collector called +// ClusterManagerCollector, which collects information from a ClusterManager +// using its provided methods and turns them into Prometheus Metrics for +// collection. +// +// An additional challenge is that multiple instances of the ClusterManager are +// run within the same binary, each in charge of a different zone. 
We need to +// make use of wrapping Registerers to be able to register each +// ClusterManagerCollector instance with Prometheus. +type ClusterManager struct { + Zone string + // Contains many more fields not listed in this example. +} + +// ClusterManagerCollector implements the Collector interface. +type ClusterManagerCollector struct { + ClusterManager *ClusterManager +} + +// Describe is implemented with DescribeByCollect. That's possible because the +// Collect method will always return the same two metrics with the same two +// descriptors. +func (cc ClusterManagerCollector) Describe(ch chan<- *prometheus.Desc) { + prometheus.DescribeByCollect(cc, ch) +} + +// Collect first triggers the ReallyExpensiveAssessmentOfTheSystemState. Then it +// creates constant metrics for each host on the fly based on the returned data. +// +// Note that Collect could be called concurrently, so we depend on +// ReallyExpensiveAssessmentOfTheSystemState to be concurrency-safe. +func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) { + klog.Info("Starting to collect metrics for scheduler") + nodevGPUMemoryLimitDesc := prometheus.NewDesc( + "GPUDeviceMemoryLimit", + "Device memory limit for a certain GPU", + []string{"nodeid", "deviceuuid", "deviceidx"}, nil, + ) + nodevGPUCoreLimitDesc := prometheus.NewDesc( + "GPUDeviceCoreLimit", + "Device memory core limit for a certain GPU", + []string{"nodeid", "deviceuuid", "deviceidx"}, nil, + ) + nodevGPUMemoryAllocatedDesc := prometheus.NewDesc( + "GPUDeviceMemoryAllocated", + "Device memory allocated for a certain GPU", + []string{"nodeid", "deviceuuid", "deviceidx", "devicecores"}, nil, + ) + nodevGPUSharedNumDesc := prometheus.NewDesc( + "GPUDeviceSharedNum", + "Number of containers sharing this GPU", + []string{"nodeid", "deviceuuid", "deviceidx"}, nil, + ) + + nodeGPUCoreAllocatedDesc := prometheus.NewDesc( + "GPUDeviceCoreAllocated", + "Device core allocated for a certain GPU", + []string{"nodeid", "deviceuuid", 
"deviceidx"}, nil, + ) + nodeGPUOverview := prometheus.NewDesc( + "nodeGPUOverview", + "GPU overview on a certain node", + []string{"nodeid", "deviceuuid", "deviceidx", "devicecores", "sharedcontainers", "devicememorylimit", "devicetype"}, nil, + ) + nodeGPUMemoryPercentage := prometheus.NewDesc( + "nodeGPUMemoryPercentage", + "GPU Memory Allocated Percentage on a certain GPU", + []string{"nodeid", "deviceuuid", "deviceidx"}, nil, + ) + nu := sher.InspectAllNodesUsage() + for nodeID, val := range *nu { + for _, devs := range val.Devices.DeviceLists { + ch <- prometheus.MustNewConstMetric( + nodevGPUMemoryLimitDesc, + prometheus.GaugeValue, + float64(devs.Device.Totalmem)*float64(1024)*float64(1024), + nodeID, devs.Device.ID, fmt.Sprint(devs.Device.Index), + ) + ch <- prometheus.MustNewConstMetric( + nodevGPUCoreLimitDesc, + prometheus.GaugeValue, + float64(devs.Device.Totalcore), + nodeID, devs.Device.ID, fmt.Sprint(devs.Device.Index), + ) + ch <- prometheus.MustNewConstMetric( + nodevGPUMemoryAllocatedDesc, + prometheus.GaugeValue, + float64(devs.Device.Usedmem)*float64(1024)*float64(1024), + nodeID, devs.Device.ID, fmt.Sprint(devs.Device.Index), fmt.Sprint(devs.Device.Usedcores), + ) + ch <- prometheus.MustNewConstMetric( + nodevGPUSharedNumDesc, + prometheus.GaugeValue, + float64(devs.Device.Used), + nodeID, devs.Device.ID, fmt.Sprint(devs.Device.Index), + ) + + ch <- prometheus.MustNewConstMetric( + nodeGPUCoreAllocatedDesc, + prometheus.GaugeValue, + float64(devs.Device.Usedcores), + nodeID, devs.Device.ID, fmt.Sprint(devs.Device.Index), + ) + ch <- prometheus.MustNewConstMetric( + nodeGPUOverview, + prometheus.GaugeValue, + float64(devs.Device.Usedmem)*float64(1024)*float64(1024), + nodeID, devs.Device.ID, fmt.Sprint(devs.Device.Index), fmt.Sprint(devs.Device.Usedcores), fmt.Sprint(devs.Device.Used), fmt.Sprint(devs.Device.Totalmem), devs.Device.Type, + ) + ch <- prometheus.MustNewConstMetric( + nodeGPUMemoryPercentage, + prometheus.GaugeValue, + 
float64(devs.Device.Usedmem)/float64(devs.Device.Totalmem), + nodeID, devs.Device.ID, fmt.Sprint(devs.Device.Index), + ) + } + } + + ctrvGPUDeviceAllocatedDesc := prometheus.NewDesc( + "vGPUPodsDeviceAllocated", + "vGPU Allocated from pods", + []string{"podnamespace", "nodename", "podname", "containeridx", "deviceuuid", "deviceusedcore"}, nil, + ) + ctrvGPUdeviceAllocatedMemoryPercentageDesc := prometheus.NewDesc( + "vGPUMemoryPercentage", + "vGPU memory percentage allocated from a container", + []string{"podnamespace", "nodename", "podname", "containeridx", "deviceuuid"}, nil, + ) + ctrvGPUdeviceAllocateCorePercentageDesc := prometheus.NewDesc( + "vGPUCorePercentage", + "vGPU core allocated from a container", + []string{"podnamespace", "nodename", "podname", "containeridx", "deviceuuid"}, nil, + ) + schedpods, _ := sher.GetScheduledPods() + for _, val := range schedpods { + for _, podSingleDevice := range val.Devices { + for ctridx, ctrdevs := range podSingleDevice { + for _, ctrdevval := range ctrdevs { + klog.Infoln("Collecting", val.Namespace, val.NodeID, val.Name, ctrdevval.UUID, ctrdevval.Usedcores, ctrdevval.Usedmem) + if len(ctrdevval.UUID) == 0 { + klog.Infof("UUID empty, omitted") + continue + } + ch <- prometheus.MustNewConstMetric( + ctrvGPUDeviceAllocatedDesc, + prometheus.GaugeValue, + float64(ctrdevval.Usedmem)*float64(1024)*float64(1024), + val.Namespace, val.NodeID, val.Name, fmt.Sprint(ctridx), ctrdevval.UUID, fmt.Sprint(ctrdevval.Usedcores)) + var totaldev int32 + found := false + for _, ni := range *nu { + for _, nodedev := range ni.Devices.DeviceLists { + //fmt.Println("uuid=", nodedev.ID, ctrdevval.UUID) + if strings.Compare(nodedev.Device.ID, ctrdevval.UUID) == 0 { + totaldev = nodedev.Device.Totalmem + found = true + break + } + } + if found { + break + } + } + if totaldev > 0 { + ch <- prometheus.MustNewConstMetric( + ctrvGPUdeviceAllocatedMemoryPercentageDesc, + prometheus.GaugeValue, + float64(ctrdevval.Usedmem)/float64(totaldev), + 
val.Namespace, val.NodeID, val.Name, fmt.Sprint(ctridx), ctrdevval.UUID) + } + ch <- prometheus.MustNewConstMetric( + ctrvGPUdeviceAllocateCorePercentageDesc, + prometheus.GaugeValue, + float64(ctrdevval.Usedcores), + val.Namespace, val.NodeID, val.Name, fmt.Sprint(ctridx), ctrdevval.UUID) + } + } + } + } +} + +// NewClusterManager first creates a Prometheus-ignorant ClusterManager +// instance. Then, it creates a ClusterManagerCollector for the just created +// ClusterManager. Finally, it registers the ClusterManagerCollector with a +// wrapping Registerer that adds the zone as a label. In this way, the metrics +// collected by different ClusterManagerCollectors do not collide. +func NewClusterManager(zone string, reg prometheus.Registerer) *ClusterManager { + c := &ClusterManager{ + Zone: zone, + } + cc := ClusterManagerCollector{ClusterManager: c} + prometheus.WrapRegistererWith(prometheus.Labels{"zone": zone}, reg).MustRegister(cc) + return c +} + +func initMetrics(bindAddress string) { + // Since we are dealing with custom Collector implementations, it might + // be a good idea to try it out with a pedantic registry. + klog.Info("Initializing metrics for scheduler") + reg := prometheus.NewRegistry() + + // Construct cluster managers. In real code, we would assign them to + // variables to then do something with them. + NewClusterManager("vGPU", reg) + + http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{})) + log.Fatal(http.ListenAndServe(bindAddress, nil)) +} diff --git a/cmd/vGPUmonitor/build.sh b/cmd/vGPUmonitor/build.sh new file mode 100644 index 000000000..c6bfa7201 --- /dev/null +++ b/cmd/vGPUmonitor/build.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +# Copyright 2024 The HAMi Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative noderpc/noderpc.proto +go build diff --git a/cmd/vGPUmonitor/feedback.go b/cmd/vGPUmonitor/feedback.go new file mode 100644 index 000000000..c55f50f8d --- /dev/null +++ b/cmd/vGPUmonitor/feedback.go @@ -0,0 +1,275 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package main + +import ( + "os" + "sort" + "strings" + "time" + + "github.com/Project-HAMi/HAMi/pkg/monitor/nvidia" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + "k8s.io/klog/v2" +) + +var cgroupDriver int + +//type hostGPUPid struct { +// hostGPUPid int +// mtime uint64 +//} + +type UtilizationPerDevice []int + +func setcGgroupDriver() int { + // 1 for cgroupfs 2 for systemd + kubeletconfig, err := os.ReadFile("/hostvar/lib/kubelet/config.yaml") + if err != nil { + return 0 + } + content := string(kubeletconfig) + pos := strings.LastIndex(content, "cgroupDriver:") + if pos < 0 { + return 0 + } + if strings.Contains(content, "systemd") { + return 2 + } + if strings.Contains(content, "cgroupfs") { + return 1 + } + return 0 +} + +func getUsedGPUPid() ([]uint, nvml.Return) { + tmp := []nvml.ProcessInfo{} + count, err := nvml.DeviceGetCount() + if err != nvml.SUCCESS { + return []uint{}, err + } + for i := 0; i < count; i++ { + device, err := nvml.DeviceGetHandleByIndex(i) + if err != nvml.SUCCESS { + return []uint{}, err + } + ids, err := device.GetComputeRunningProcesses() + if err != nvml.SUCCESS { + return []uint{}, err + } + tmp = append(tmp, ids...) 
+ } + result := make([]uint, 0) + m := make(map[uint]bool) + for _, v := range tmp { + if _, ok := m[uint(v.Pid)]; !ok { + result = append(result, uint(v.Pid)) + m[uint(v.Pid)] = true + } + } + sort.Slice(tmp, func(i, j int) bool { return tmp[i].Pid > tmp[j].Pid }) + return result, nvml.SUCCESS +} + +//func setHostPid(pod corev1.Pod, ctr corev1.ContainerStatus, sr *podusage) error { +// var pids []string +// mutex.Lock() +// defer mutex.Unlock() +// +// if cgroupDriver == 0 { +// cgroupDriver = setcGgroupDriver() +// } +// if cgroupDriver == 0 { +// return errors.New("can not identify cgroup driver") +// } +// usedGPUArray, err := getUsedGPUPid() +// if err != nvml.SUCCESS { +// return errors.New("get usedGPUID failed, ret:" + nvml.ErrorString(err)) +// } +// if len(usedGPUArray) == 0 { +// return nil +// } +// qos := strings.ToLower(string(pod.Status.QOSClass)) +// var filename string +// if cgroupDriver == 1 { +// /* Cgroupfs */ +// filename = fmt.Sprintf("/sysinfo/fs/cgroup/memory/kubepods/%s/pod%s/%s/tasks", qos, pod.UID, strings.TrimPrefix(ctr.ContainerID, "docker://")) +// } +// if cgroupDriver == 2 { +// /* Systemd */ +// cgroupuid := strings.ReplaceAll(string(pod.UID), "-", "_") +// filename = fmt.Sprintf("/sysinfo/fs/cgroup/systemd/kubepods.slice/kubepods-%s.slice/kubepods-%s-pod%s.slice/docker-%s.scope/tasks", qos, qos, cgroupuid, strings.TrimPrefix(ctr.ContainerID, "docker://")) +// } +// fmt.Println("filename=", filename) +// content, ferr := os.ReadFile(filename) +// if ferr != nil { +// return ferr +// } +// pids = strings.Split(string(content), "\n") +// hostPidArray := []hostGPUPid{} +// for _, val := range pids { +// tmp, _ := strconv.Atoi(val) +// if tmp != 0 { +// var stat os.FileInfo +// var err error +// if stat, err = os.Lstat(fmt.Sprintf("/proc/%v", tmp)); err != nil { +// return err +// } +// mtime := stat.ModTime().Unix() +// hostPidArray = append(hostPidArray, hostGPUPid{ +// hostGPUPid: tmp, +// mtime: uint64(mtime), +// }) +// } +// } 
+// usedGPUHostArray := []hostGPUPid{} +// for _, val := range usedGPUArray { +// for _, hostpid := range hostPidArray { +// if uint(hostpid.hostGPUPid) == val { +// usedGPUHostArray = append(usedGPUHostArray, hostpid) +// } +// } +// } +// //fmt.Println("usedHostGPUArray=", usedGPUHostArray) +// sort.Slice(usedGPUHostArray, func(i, j int) bool { return usedGPUHostArray[i].mtime > usedGPUHostArray[j].mtime }) +// if sr == nil || sr.sr == nil { +// return nil +// } +// for idx, val := range sr.sr.procs { +// //fmt.Println("pid=", val.pid) +// if val.pid == 0 { +// break +// } +// if idx < len(usedGPUHostArray) { +// if val.hostpid == 0 || val.hostpid != int32(usedGPUHostArray[idx].hostGPUPid) { +// fmt.Println("Assign host pid to pid instead", usedGPUHostArray[idx].hostGPUPid, val.pid, val.hostpid) +// sr.sr.procs[idx].hostpid = int32(usedGPUHostArray[idx].hostGPUPid) +// fmt.Println("val=", val.hostpid, sr.sr.procs[idx].hostpid) +// } +// } +// } +// return nil +// +//} + +func CheckBlocking(utSwitchOn map[string]UtilizationPerDevice, p int, c *nvidia.ContainerUsage) bool { + for i := 0; i < c.Info.DeviceMax(); i++ { + uuid := c.Info.DeviceUUID(i) + _, ok := utSwitchOn[uuid] + if ok { + for i := 0; i < p; i++ { + if utSwitchOn[uuid][i] > 0 { + return true + } + } + return false + } + } + return false +} + +// Check whether task with higher priority use GPU or there are other tasks with the same priority. 
+func CheckPriority(utSwitchOn map[string]UtilizationPerDevice, p int, c *nvidia.ContainerUsage) bool { + for i := 0; i < c.Info.DeviceMax(); i++ { + uuid := c.Info.DeviceUUID(i) + _, ok := utSwitchOn[uuid] + if ok { + for i := 0; i < p; i++ { + if utSwitchOn[uuid][i] > 0 { + return true + } + } + if utSwitchOn[uuid][p] > 1 { + return true + } + } + } + return false +} + +func Observe(lister *nvidia.ContainerLister) { + utSwitchOn := map[string]UtilizationPerDevice{} + containers := lister.ListContainers() + + for _, c := range containers { + recentKernel := c.Info.GetRecentKernel() + if recentKernel > 0 { + recentKernel-- + if recentKernel > 0 { + for i := 0; i < c.Info.DeviceMax(); i++ { + //for _, devuuid := range val.sr.uuids { + // Null device condition + if !c.Info.IsValidUUID(i) { + continue + } + uuid := c.Info.DeviceUUID(i) + if len(utSwitchOn[uuid]) == 0 { + utSwitchOn[uuid] = []int{0, 0} + } + utSwitchOn[uuid][c.Info.GetPriority()]++ + } + } + c.Info.SetRecentKernel(recentKernel) + } + } + for idx, c := range containers { + priority := c.Info.GetPriority() + recentKernel := c.Info.GetRecentKernel() + utilizationSwitch := c.Info.GetUtilizationSwitch() + if CheckBlocking(utSwitchOn, priority, c) { + if recentKernel >= 0 { + klog.Infof("utSwitchon=%v", utSwitchOn) + klog.Infof("Setting Blocking to on %v", idx) + c.Info.SetRecentKernel(-1) + } + } else { + if recentKernel < 0 { + klog.Infof("utSwitchon=%v", utSwitchOn) + klog.Infof("Setting Blocking to off %v", idx) + c.Info.SetRecentKernel(0) + } + } + if CheckPriority(utSwitchOn, priority, c) { + if utilizationSwitch != 1 { + klog.Infof("utSwitchon=%v", utSwitchOn) + klog.Infof("Setting UtilizationSwitch to on %v", idx) + c.Info.SetUtilizationSwitch(1) + } + } else { + if utilizationSwitch != 0 { + klog.Infof("utSwitchon=%v", utSwitchOn) + klog.Infof("Setting UtilizationSwitch to off %v", idx) + c.Info.SetUtilizationSwitch(0) + } + } + } +} + +func watchAndFeedback(lister *nvidia.ContainerLister) { + 
nvml.Init() + for { + time.Sleep(time.Second * 5) + err := lister.Update() + if err != nil { + klog.Errorf("Failed to update container list: %v", err) + continue + } + //klog.Infof("WatchAndFeedback srPodList=%v", srPodList) + Observe(lister) + } +} diff --git a/cmd/vGPUmonitor/main.go b/cmd/vGPUmonitor/main.go new file mode 100644 index 000000000..359019a8c --- /dev/null +++ b/cmd/vGPUmonitor/main.go @@ -0,0 +1,47 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "github.com/Project-HAMi/HAMi/pkg/monitor/nvidia" + + "k8s.io/klog/v2" +) + +//var addr = flag.String("listen-address", ":9394", "The address to listen on for HTTP requests.") + +//const shared_directory = "/usr/local/vgpu/shared" + +func main() { + + if err := ValidateEnvVars(); err != nil { + klog.Fatalf("Failed to validate environment variables: %v", err) + } + containerLister, err := nvidia.NewContainerLister() + if err != nil { + klog.Fatalf("Failed to create container lister: %v", err) + } + cgroupDriver = 0 + errchannel := make(chan error) + //go serveInfo(errchannel) + go initMetrics(containerLister) + go watchAndFeedback(containerLister) + for { + err := <-errchannel + klog.Errorf("failed to serve: %v", err) + } +} diff --git a/cmd/vGPUmonitor/metrics.go b/cmd/vGPUmonitor/metrics.go new file mode 100644 index 000000000..30b1b1d1c --- /dev/null +++ b/cmd/vGPUmonitor/metrics.go @@ -0,0 +1,369 @@ +/* +Copyright 2024 The HAMi Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "fmt" + "log" + "net/http" + "strings" + "time" + + "github.com/Project-HAMi/HAMi/pkg/monitor/nvidia" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + + "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/informers" + listerscorev1 "k8s.io/client-go/listers/core/v1" + "k8s.io/klog/v2" +) + +// ClusterManager is an example for a system that might have been built without +// Prometheus in mind. It models a central manager of jobs running in a +// cluster. Thus, we implement a custom Collector called +// ClusterManagerCollector, which collects information from a ClusterManager +// using its provided methods and turns them into Prometheus Metrics for +// collection. +// +// An additional challenge is that multiple instances of the ClusterManager are +// run within the same binary, each in charge of a different zone. We need to +// make use of wrapping Registerers to be able to register each +// ClusterManagerCollector instance with Prometheus. +type ClusterManager struct { + Zone string + // Contains many more fields not listed in this example. + PodLister listerscorev1.PodLister + containerLister *nvidia.ContainerLister +} + +// ReallyExpensiveAssessmentOfTheSystemState is a mock for the data gathering a +// real cluster manager would have to do. 
Since it may actually be really +// expensive, it must only be called once per collection. This implementation, +// obviously, only returns some made-up data. +func (c *ClusterManager) ReallyExpensiveAssessmentOfTheSystemState() ( + oomCountByHost map[string]int, ramUsageByHost map[string]float64, +) { + // Just example fake data. + oomCountByHost = map[string]int{ + "foo.example.org": 42, + "bar.example.org": 2001, + } + ramUsageByHost = map[string]float64{ + "foo.example.org": 6.023e23, + "bar.example.org": 3.14, + } + return +} + +// ClusterManagerCollector implements the Collector interface. +type ClusterManagerCollector struct { + ClusterManager *ClusterManager +} + +// Descriptors used by the ClusterManagerCollector below. +var ( + hostGPUdesc = prometheus.NewDesc( + "HostGPUMemoryUsage", + "GPU device memory usage", + []string{"deviceidx", "deviceuuid"}, nil, + ) + + hostGPUUtilizationdesc = prometheus.NewDesc( + "HostCoreUtilization", + "GPU core utilization", + []string{"deviceidx", "deviceuuid"}, nil, + ) + + ctrvGPUdesc = prometheus.NewDesc( + "vGPU_device_memory_usage_in_bytes", + "vGPU device usage", + []string{"podnamespace", "podname", "ctrname", "vdeviceid", "deviceuuid"}, nil, + ) + + ctrvGPUlimitdesc = prometheus.NewDesc( + "vGPU_device_memory_limit_in_bytes", + "vGPU device limit", + []string{"podnamespace", "podname", "ctrname", "vdeviceid", "deviceuuid"}, nil, + ) + ctrDeviceMemorydesc = prometheus.NewDesc( + "Device_memory_desc_of_container", + "Container device meory description", + []string{"podnamespace", "podname", "ctrname", "vdeviceid", "deviceuuid", "context", "module", "data", "offset"}, nil, + ) + ctrDeviceUtilizationdesc = prometheus.NewDesc( + "Device_utilization_desc_of_container", + "Container device utilization description", + []string{"podnamespace", "podname", "ctrname", "vdeviceid", "deviceuuid"}, nil, + ) + ctrDeviceLastKernelDesc = prometheus.NewDesc( + "Device_last_kernel_of_container", + "Container device last kernel 
description", + []string{"podnamespace", "podname", "ctrname", "vdeviceid", "deviceuuid"}, nil, + ) +) + +// Describe is implemented with DescribeByCollect. That's possible because the +// Collect method will always return the same two metrics with the same two +// descriptors. +func (cc ClusterManagerCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- hostGPUdesc + ch <- ctrvGPUdesc + ch <- ctrvGPUlimitdesc + ch <- hostGPUUtilizationdesc + //prometheus.DescribeByCollect(cc, ch) +} + +//func parseidstr(podusage string) (string, string, error) { +// tmp := strings.Split(podusage, "_") +// if len(tmp) > 1 { +// return tmp[0], tmp[1], nil +// } else { +// return "", "", errors.New("parse error") +// } +//} +// +//func gettotalusage(usage podusage, vidx int) (deviceMemory, error) { +// added := deviceMemory{ +// bufferSize: 0, +// contextSize: 0, +// moduleSize: 0, +// offset: 0, +// total: 0, +// } +// for _, val := range usage.sr.procs { +// added.bufferSize += val.used[vidx].bufferSize +// added.contextSize += val.used[vidx].contextSize +// added.moduleSize += val.used[vidx].moduleSize +// added.offset += val.used[vidx].offset +// added.total += val.used[vidx].total +// } +// return added, nil +//} +// +//func getTotalUtilization(usage podusage, vidx int) deviceUtilization { +// added := deviceUtilization{ +// decUtil: 0, +// encUtil: 0, +// smUtil: 0, +// } +// for _, val := range usage.sr.procs { +// added.decUtil += val.deviceUtil[vidx].decUtil +// added.encUtil += val.deviceUtil[vidx].encUtil +// added.smUtil += val.deviceUtil[vidx].smUtil +// } +// return added +//} + +// Collect first triggers the ReallyExpensiveAssessmentOfTheSystemState. Then it +// creates constant metrics for each host on the fly based on the returned data. +// +// Note that Collect could be called concurrently, so we depend on +// ReallyExpensiveAssessmentOfTheSystemState to be concurrency-safe. 
+func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) {
+	klog.Info("Starting to collect metrics for vGPUMonitor")
+	containerLister := cc.ClusterManager.containerLister
+	if err := containerLister.Update(); err != nil {
+		klog.Errorf("Update container error: %s", err.Error())
+	}
+
+	nvret := nvml.Init()
+	if nvret != nvml.SUCCESS {
+		klog.Error("nvml Init err=", nvml.ErrorString(nvret))
+	}
+	devnum, nvret := nvml.DeviceGetCount()
+	if nvret != nvml.SUCCESS {
+		klog.Error("nvml GetDeviceCount err=", nvml.ErrorString(nvret))
+	} else {
+		for ii := 0; ii < devnum; ii++ {
+			hdev, nvret := nvml.DeviceGetHandleByIndex(ii)
+			if nvret != nvml.SUCCESS {
+				klog.Error(nvml.ErrorString(nvret))
+			}
+			memoryUsed := 0
+			memory, ret := hdev.GetMemoryInfo()
+			if ret == nvml.SUCCESS {
+				memoryUsed = int(memory.Used)
+			} else {
+				klog.Error("nvml get memory error ret=", ret)
+			}
+
+			uuid, nvret := hdev.GetUUID()
+			if nvret != nvml.SUCCESS {
+				klog.Error(nvml.ErrorString(nvret))
+			} else {
+				ch <- prometheus.MustNewConstMetric(
+					hostGPUdesc,
+					prometheus.GaugeValue,
+					float64(memoryUsed),
+					fmt.Sprint(ii), uuid,
+				)
+			}
+			util, nvret := hdev.GetUtilizationRates()
+			if nvret != nvml.SUCCESS {
+				klog.Error(nvml.ErrorString(nvret))
+			} else {
+				ch <- prometheus.MustNewConstMetric(
+					hostGPUUtilizationdesc,
+					prometheus.GaugeValue,
+					float64(util.Gpu),
+					fmt.Sprint(ii), uuid,
+				)
+			}
+
+		}
+	}
+
+	pods, err := cc.ClusterManager.PodLister.List(labels.Everything())
+	if err != nil {
+		klog.Error("failed to list pods with err=", err.Error())
+	}
+	nowSec := time.Now().Unix()
+
+	containers := containerLister.ListContainers()
+	for _, pod := range pods {
+		for _, c := range containers {
+			//for sridx := range srPodList {
+			//	if srPodList[sridx].sr == nil {
+			//		continue
+			//	}
+			if c.Info == nil {
+				continue
+			}
+			//podUID := strings.Split(srPodList[sridx].idstr, "_")[0]
+			//ctrName := strings.Split(srPodList[sridx].idstr, "_")[1]
+			podUID := c.PodUID
+			ctrName := 
c.ContainerName + if strings.Compare(string(pod.UID), podUID) != 0 { + continue + } + fmt.Println("Pod matched!", pod.Name, pod.Namespace, pod.Labels) + for _, ctr := range pod.Spec.Containers { + if strings.Compare(ctr.Name, ctrName) != 0 { + continue + } + fmt.Println("container matched", ctr.Name) + //err := setHostPid(pod, pod.Status.ContainerStatuses[ctridx], &srPodList[sridx]) + //if err != nil { + // fmt.Println("setHostPid filed", err.Error()) + //} + //fmt.Println("sr.list=", srPodList[sridx].sr) + podlabels := make(map[string]string) + for idx, val := range pod.Labels { + idxfix := strings.ReplaceAll(idx, "-", "_") + valfix := strings.ReplaceAll(val, "-", "_") + podlabels[idxfix] = valfix + } + for i := 0; i < c.Info.DeviceNum(); i++ { + uuid := c.Info.DeviceUUID(i)[0:40] + memoryTotal := c.Info.DeviceMemoryTotal(i) + memoryLimit := c.Info.DeviceMemoryLimit(i) + memoryContextSize := c.Info.DeviceMemoryContextSize(i) + memoryModuleSize := c.Info.DeviceMemoryModuleSize(i) + memoryBufferSize := c.Info.DeviceMemoryBufferSize(i) + memoryOffset := c.Info.DeviceMemoryOffset(i) + smUtil := c.Info.DeviceSmUtil(i) + lastKernelTime := c.Info.LastKernelTime() + + //fmt.Println("uuid=", uuid, "length=", len(uuid)) + ch <- prometheus.MustNewConstMetric( + ctrvGPUdesc, + prometheus.GaugeValue, + float64(memoryTotal), + pod.Namespace, pod.Name, ctrName, fmt.Sprint(i), uuid, /*,string(sr.sr.uuids[i].uuid[:])*/ + ) + ch <- prometheus.MustNewConstMetric( + ctrvGPUlimitdesc, + prometheus.GaugeValue, + float64(memoryLimit), + pod.Namespace, pod.Name, ctrName, fmt.Sprint(i), uuid, /*,string(sr.sr.uuids[i].uuid[:])*/ + ) + ch <- prometheus.MustNewConstMetric( + ctrDeviceMemorydesc, + prometheus.CounterValue, + float64(memoryTotal), + pod.Namespace, pod.Name, ctrName, fmt.Sprint(i), uuid, + fmt.Sprint(memoryContextSize), fmt.Sprint(memoryModuleSize), fmt.Sprint(memoryBufferSize), fmt.Sprint(memoryOffset), + ) + ch <- prometheus.MustNewConstMetric( + ctrDeviceUtilizationdesc, + 
prometheus.GaugeValue, + float64(smUtil), + pod.Namespace, pod.Name, ctrName, fmt.Sprint(i), uuid, + ) + if lastKernelTime > 0 { + lastSec := nowSec - lastKernelTime + if lastSec < 0 { + lastSec = 0 + } + ch <- prometheus.MustNewConstMetric( + ctrDeviceLastKernelDesc, + prometheus.GaugeValue, + float64(lastSec), + pod.Namespace, pod.Name, ctrName, fmt.Sprint(i), uuid, + ) + } + } + } + } + } +} + +// NewClusterManager first creates a Prometheus-ignorant ClusterManager +// instance. Then, it creates a ClusterManagerCollector for the just created +// ClusterManager. Finally, it registers the ClusterManagerCollector with a +// wrapping Registerer that adds the zone as a label. In this way, the metrics +// collected by different ClusterManagerCollectors do not collide. +func NewClusterManager(zone string, reg prometheus.Registerer, containerLister *nvidia.ContainerLister) *ClusterManager { + c := &ClusterManager{ + Zone: zone, + containerLister: containerLister, + } + + informerFactory := informers.NewSharedInformerFactoryWithOptions(containerLister.Clientset(), time.Hour*1) + c.PodLister = informerFactory.Core().V1().Pods().Lister() + stopCh := make(chan struct{}) + informerFactory.Start(stopCh) + + cc := ClusterManagerCollector{ClusterManager: c} + prometheus.WrapRegistererWith(prometheus.Labels{"zone": zone}, reg).MustRegister(cc) + return c +} + +func initMetrics(containerLister *nvidia.ContainerLister) { + // Since we are dealing with custom Collector implementations, it might + // be a good idea to try it out with a pedantic registry. + klog.Info("Initializing metrics for vGPUmonitor") + reg := prometheus.NewRegistry() + //reg := prometheus.NewPedanticRegistry() + + // Construct cluster managers. In real code, we would assign them to + // variables to then do something with them. + NewClusterManager("vGPU", reg, containerLister) + //NewClusterManager("ca", reg) + + // Add the standard process and Go metrics to the custom registry. 
+ //reg.MustRegister( + // prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{}), + // prometheus.NewGoCollector(), + //) + + http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{})) + log.Fatal(http.ListenAndServe(":9394", nil)) +} diff --git a/cmd/vGPUmonitor/noderpc/noderpc.pb.go b/cmd/vGPUmonitor/noderpc/noderpc.pb.go new file mode 100644 index 000000000..4c732b88b --- /dev/null +++ b/cmd/vGPUmonitor/noderpc/noderpc.pb.go @@ -0,0 +1,518 @@ +// Copyright 2015 gRPC authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.27.1 +// protoc v3.14.0 +// source: noderpc/noderpc.proto + +package vGPUmonitor + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. 
+ _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +// The sharedProcs contains the sharedRegion +type ShrregProcSlotT struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Pid int32 `protobuf:"varint,1,opt,name=pid,proto3" json:"pid,omitempty"` + Used []uint64 `protobuf:"varint,2,rep,packed,name=used,proto3" json:"used,omitempty"` + Status int32 `protobuf:"varint,3,opt,name=status,proto3" json:"status,omitempty"` +} + +func (x *ShrregProcSlotT) Reset() { + *x = ShrregProcSlotT{} + if protoimpl.UnsafeEnabled { + mi := &file_noderpc_noderpc_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ShrregProcSlotT) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ShrregProcSlotT) ProtoMessage() {} + +func (x *ShrregProcSlotT) ProtoReflect() protoreflect.Message { + mi := &file_noderpc_noderpc_proto_msgTypes[0] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ShrregProcSlotT.ProtoReflect.Descriptor instead. 
+func (*ShrregProcSlotT) Descriptor() ([]byte, []int) { + return file_noderpc_noderpc_proto_rawDescGZIP(), []int{0} +} + +func (x *ShrregProcSlotT) GetPid() int32 { + if x != nil { + return x.Pid + } + return 0 +} + +func (x *ShrregProcSlotT) GetUsed() []uint64 { + if x != nil { + return x.Used + } + return nil +} + +func (x *ShrregProcSlotT) GetStatus() int32 { + if x != nil { + return x.Status + } + return 0 +} + +// The sharedRegionT struct is the main struct for monitoring vgpu +type SharedRegionT struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + InitializedFlag int32 `protobuf:"varint,1,opt,name=initializedFlag,proto3" json:"initializedFlag,omitempty"` + OwnerPid uint32 `protobuf:"varint,2,opt,name=ownerPid,proto3" json:"ownerPid,omitempty"` + Sem uint32 `protobuf:"varint,3,opt,name=sem,proto3" json:"sem,omitempty"` + Limit []uint64 `protobuf:"varint,4,rep,packed,name=limit,proto3" json:"limit,omitempty"` + SmLimit []uint64 `protobuf:"varint,5,rep,packed,name=sm_limit,json=smLimit,proto3" json:"sm_limit,omitempty"` + Procs []*ShrregProcSlotT `protobuf:"bytes,6,rep,name=procs,proto3" json:"procs,omitempty"` +} + +func (x *SharedRegionT) Reset() { + *x = SharedRegionT{} + if protoimpl.UnsafeEnabled { + mi := &file_noderpc_noderpc_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *SharedRegionT) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SharedRegionT) ProtoMessage() {} + +func (x *SharedRegionT) ProtoReflect() protoreflect.Message { + mi := &file_noderpc_noderpc_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SharedRegionT.ProtoReflect.Descriptor instead. 
+func (*SharedRegionT) Descriptor() ([]byte, []int) { + return file_noderpc_noderpc_proto_rawDescGZIP(), []int{1} +} + +func (x *SharedRegionT) GetInitializedFlag() int32 { + if x != nil { + return x.InitializedFlag + } + return 0 +} + +func (x *SharedRegionT) GetOwnerPid() uint32 { + if x != nil { + return x.OwnerPid + } + return 0 +} + +func (x *SharedRegionT) GetSem() uint32 { + if x != nil { + return x.Sem + } + return 0 +} + +func (x *SharedRegionT) GetLimit() []uint64 { + if x != nil { + return x.Limit + } + return nil +} + +func (x *SharedRegionT) GetSmLimit() []uint64 { + if x != nil { + return x.SmLimit + } + return nil +} + +func (x *SharedRegionT) GetProcs() []*ShrregProcSlotT { + if x != nil { + return x.Procs + } + return nil +} + +type Podusage struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Poduuid string `protobuf:"bytes,1,opt,name=poduuid,proto3" json:"poduuid,omitempty"` + Podvgpuinfo *SharedRegionT `protobuf:"bytes,2,opt,name=podvgpuinfo,proto3" json:"podvgpuinfo,omitempty"` +} + +func (x *Podusage) Reset() { + *x = Podusage{} + if protoimpl.UnsafeEnabled { + mi := &file_noderpc_noderpc_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *Podusage) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*Podusage) ProtoMessage() {} + +func (x *Podusage) ProtoReflect() protoreflect.Message { + mi := &file_noderpc_noderpc_proto_msgTypes[2] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use Podusage.ProtoReflect.Descriptor instead. 
+func (*Podusage) Descriptor() ([]byte, []int) { + return file_noderpc_noderpc_proto_rawDescGZIP(), []int{2} +} + +func (x *Podusage) GetPoduuid() string { + if x != nil { + return x.Poduuid + } + return "" +} + +func (x *Podusage) GetPodvgpuinfo() *SharedRegionT { + if x != nil { + return x.Podvgpuinfo + } + return nil +} + +// The request message containing the user's name. +type GetNodeVGPURequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Ctruuid string `protobuf:"bytes,1,opt,name=ctruuid,proto3" json:"ctruuid,omitempty"` +} + +func (x *GetNodeVGPURequest) Reset() { + *x = GetNodeVGPURequest{} + if protoimpl.UnsafeEnabled { + mi := &file_noderpc_noderpc_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetNodeVGPURequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetNodeVGPURequest) ProtoMessage() {} + +func (x *GetNodeVGPURequest) ProtoReflect() protoreflect.Message { + mi := &file_noderpc_noderpc_proto_msgTypes[3] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetNodeVGPURequest.ProtoReflect.Descriptor instead. 
+func (*GetNodeVGPURequest) Descriptor() ([]byte, []int) { + return file_noderpc_noderpc_proto_rawDescGZIP(), []int{3} +} + +func (x *GetNodeVGPURequest) GetCtruuid() string { + if x != nil { + return x.Ctruuid + } + return "" +} + +// The response message containing the greetings +type GetNodeVGPUReply struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Nodeid string `protobuf:"bytes,1,opt,name=nodeid,proto3" json:"nodeid,omitempty"` + Nodevgpuinfo []*Podusage `protobuf:"bytes,2,rep,name=nodevgpuinfo,proto3" json:"nodevgpuinfo,omitempty"` +} + +func (x *GetNodeVGPUReply) Reset() { + *x = GetNodeVGPUReply{} + if protoimpl.UnsafeEnabled { + mi := &file_noderpc_noderpc_proto_msgTypes[4] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetNodeVGPUReply) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetNodeVGPUReply) ProtoMessage() {} + +func (x *GetNodeVGPUReply) ProtoReflect() protoreflect.Message { + mi := &file_noderpc_noderpc_proto_msgTypes[4] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetNodeVGPUReply.ProtoReflect.Descriptor instead. 
+func (*GetNodeVGPUReply) Descriptor() ([]byte, []int) { + return file_noderpc_noderpc_proto_rawDescGZIP(), []int{4} +} + +func (x *GetNodeVGPUReply) GetNodeid() string { + if x != nil { + return x.Nodeid + } + return "" +} + +func (x *GetNodeVGPUReply) GetNodevgpuinfo() []*Podusage { + if x != nil { + return x.Nodevgpuinfo + } + return nil +} + +var File_noderpc_noderpc_proto protoreflect.FileDescriptor + +var file_noderpc_noderpc_proto_rawDesc = []byte{ + 0x0a, 0x15, 0x6e, 0x6f, 0x64, 0x65, 0x72, 0x70, 0x63, 0x2f, 0x6e, 0x6f, 0x64, 0x65, 0x72, 0x70, + 0x63, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x09, 0x70, 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x72, + 0x70, 0x63, 0x22, 0x4f, 0x0a, 0x0f, 0x73, 0x68, 0x72, 0x72, 0x65, 0x67, 0x50, 0x72, 0x6f, 0x63, + 0x53, 0x6c, 0x6f, 0x74, 0x54, 0x12, 0x10, 0x0a, 0x03, 0x70, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, + 0x28, 0x05, 0x52, 0x03, 0x70, 0x69, 0x64, 0x12, 0x12, 0x0a, 0x04, 0x75, 0x73, 0x65, 0x64, 0x18, + 0x02, 0x20, 0x03, 0x28, 0x04, 0x52, 0x04, 0x75, 0x73, 0x65, 0x64, 0x12, 0x16, 0x0a, 0x06, 0x73, + 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, + 0x74, 0x75, 0x73, 0x22, 0xca, 0x01, 0x0a, 0x0d, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x52, 0x65, + 0x67, 0x69, 0x6f, 0x6e, 0x54, 0x12, 0x28, 0x0a, 0x0f, 0x69, 0x6e, 0x69, 0x74, 0x69, 0x61, 0x6c, + 0x69, 0x7a, 0x65, 0x64, 0x46, 0x6c, 0x61, 0x67, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x0f, + 0x69, 0x6e, 0x69, 0x74, 0x69, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x64, 0x46, 0x6c, 0x61, 0x67, 0x12, + 0x1a, 0x0a, 0x08, 0x6f, 0x77, 0x6e, 0x65, 0x72, 0x50, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, + 0x0d, 0x52, 0x08, 0x6f, 0x77, 0x6e, 0x65, 0x72, 0x50, 0x69, 0x64, 0x12, 0x10, 0x0a, 0x03, 0x73, + 0x65, 0x6d, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x03, 0x73, 0x65, 0x6d, 0x12, 0x14, 0x0a, + 0x05, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x18, 0x04, 0x20, 0x03, 0x28, 0x04, 0x52, 0x05, 0x6c, 0x69, + 0x6d, 0x69, 0x74, 0x12, 0x19, 0x0a, 0x08, 0x73, 0x6d, 0x5f, 0x6c, 
0x69, 0x6d, 0x69, 0x74, 0x18, + 0x05, 0x20, 0x03, 0x28, 0x04, 0x52, 0x07, 0x73, 0x6d, 0x4c, 0x69, 0x6d, 0x69, 0x74, 0x12, 0x30, + 0x0a, 0x05, 0x70, 0x72, 0x6f, 0x63, 0x73, 0x18, 0x06, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x1a, 0x2e, + 0x70, 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x72, 0x70, 0x63, 0x2e, 0x73, 0x68, 0x72, 0x72, 0x65, 0x67, + 0x50, 0x72, 0x6f, 0x63, 0x53, 0x6c, 0x6f, 0x74, 0x54, 0x52, 0x05, 0x70, 0x72, 0x6f, 0x63, 0x73, + 0x22, 0x60, 0x0a, 0x08, 0x70, 0x6f, 0x64, 0x75, 0x73, 0x61, 0x67, 0x65, 0x12, 0x18, 0x0a, 0x07, + 0x70, 0x6f, 0x64, 0x75, 0x75, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x70, + 0x6f, 0x64, 0x75, 0x75, 0x69, 0x64, 0x12, 0x3a, 0x0a, 0x0b, 0x70, 0x6f, 0x64, 0x76, 0x67, 0x70, + 0x75, 0x69, 0x6e, 0x66, 0x6f, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x18, 0x2e, 0x70, 0x6c, + 0x75, 0x67, 0x69, 0x6e, 0x72, 0x70, 0x63, 0x2e, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x52, 0x65, + 0x67, 0x69, 0x6f, 0x6e, 0x54, 0x52, 0x0b, 0x70, 0x6f, 0x64, 0x76, 0x67, 0x70, 0x75, 0x69, 0x6e, + 0x66, 0x6f, 0x22, 0x2e, 0x0a, 0x12, 0x47, 0x65, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x56, 0x47, 0x50, + 0x55, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x18, 0x0a, 0x07, 0x63, 0x74, 0x72, 0x75, + 0x75, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x63, 0x74, 0x72, 0x75, 0x75, + 0x69, 0x64, 0x22, 0x63, 0x0a, 0x10, 0x47, 0x65, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x56, 0x47, 0x50, + 0x55, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x12, 0x16, 0x0a, 0x06, 0x6e, 0x6f, 0x64, 0x65, 0x69, 0x64, + 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x6e, 0x6f, 0x64, 0x65, 0x69, 0x64, 0x12, 0x37, + 0x0a, 0x0c, 0x6e, 0x6f, 0x64, 0x65, 0x76, 0x67, 0x70, 0x75, 0x69, 0x6e, 0x66, 0x6f, 0x18, 0x02, + 0x20, 0x03, 0x28, 0x0b, 0x32, 0x13, 0x2e, 0x70, 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x72, 0x70, 0x63, + 0x2e, 0x70, 0x6f, 0x64, 0x75, 0x73, 0x61, 0x67, 0x65, 0x52, 0x0c, 0x6e, 0x6f, 0x64, 0x65, 0x76, + 0x67, 0x70, 0x75, 0x69, 0x6e, 0x66, 0x6f, 0x32, 0x5b, 0x0a, 0x0c, 0x4e, 0x6f, 0x64, 0x65, 0x56, + 0x47, 
0x50, 0x55, 0x49, 0x6e, 0x66, 0x6f, 0x12, 0x4b, 0x0a, 0x0b, 0x47, 0x65, 0x74, 0x4e, 0x6f, + 0x64, 0x65, 0x56, 0x47, 0x50, 0x55, 0x12, 0x1d, 0x2e, 0x70, 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x72, + 0x70, 0x63, 0x2e, 0x47, 0x65, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x56, 0x47, 0x50, 0x55, 0x52, 0x65, + 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x1b, 0x2e, 0x70, 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x72, 0x70, + 0x63, 0x2e, 0x47, 0x65, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x56, 0x47, 0x50, 0x55, 0x52, 0x65, 0x70, + 0x6c, 0x79, 0x22, 0x00, 0x42, 0x4b, 0x0a, 0x1b, 0x69, 0x6f, 0x2e, 0x67, 0x72, 0x70, 0x63, 0x2e, + 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x73, 0x2e, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x77, 0x6f, + 0x72, 0x6c, 0x64, 0x42, 0x0f, 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x50, + 0x72, 0x6f, 0x74, 0x6f, 0x50, 0x01, 0x5a, 0x19, 0x67, 0x69, 0x74, 0x6c, 0x61, 0x62, 0x2e, 0x34, + 0x70, 0x64, 0x2e, 0x69, 0x6f, 0x2f, 0x76, 0x47, 0x50, 0x55, 0x6d, 0x6f, 0x6e, 0x69, 0x74, 0x6f, + 0x72, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, +} + +var ( + file_noderpc_noderpc_proto_rawDescOnce sync.Once + file_noderpc_noderpc_proto_rawDescData = file_noderpc_noderpc_proto_rawDesc +) + +func file_noderpc_noderpc_proto_rawDescGZIP() []byte { + file_noderpc_noderpc_proto_rawDescOnce.Do(func() { + file_noderpc_noderpc_proto_rawDescData = protoimpl.X.CompressGZIP(file_noderpc_noderpc_proto_rawDescData) + }) + return file_noderpc_noderpc_proto_rawDescData +} + +var file_noderpc_noderpc_proto_msgTypes = make([]protoimpl.MessageInfo, 5) +var file_noderpc_noderpc_proto_goTypes = []interface{}{ + (*ShrregProcSlotT)(nil), // 0: pluginrpc.shrregProcSlotT + (*SharedRegionT)(nil), // 1: pluginrpc.sharedRegionT + (*Podusage)(nil), // 2: pluginrpc.podusage + (*GetNodeVGPURequest)(nil), // 3: pluginrpc.GetNodeVGPURequest + (*GetNodeVGPUReply)(nil), // 4: pluginrpc.GetNodeVGPUReply +} +var file_noderpc_noderpc_proto_depIdxs = []int32{ + 0, // 0: pluginrpc.sharedRegionT.procs:type_name -> 
pluginrpc.shrregProcSlotT + 1, // 1: pluginrpc.podusage.podvgpuinfo:type_name -> pluginrpc.sharedRegionT + 2, // 2: pluginrpc.GetNodeVGPUReply.nodevgpuinfo:type_name -> pluginrpc.podusage + 3, // 3: pluginrpc.NodeVGPUInfo.GetNodeVGPU:input_type -> pluginrpc.GetNodeVGPURequest + 4, // 4: pluginrpc.NodeVGPUInfo.GetNodeVGPU:output_type -> pluginrpc.GetNodeVGPUReply + 4, // [4:5] is the sub-list for method output_type + 3, // [3:4] is the sub-list for method input_type + 3, // [3:3] is the sub-list for extension type_name + 3, // [3:3] is the sub-list for extension extendee + 0, // [0:3] is the sub-list for field type_name +} + +func init() { file_noderpc_noderpc_proto_init() } +func file_noderpc_noderpc_proto_init() { + if File_noderpc_noderpc_proto != nil { + return + } + if !protoimpl.UnsafeEnabled { + file_noderpc_noderpc_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*ShrregProcSlotT); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_noderpc_noderpc_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*SharedRegionT); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_noderpc_noderpc_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*Podusage); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_noderpc_noderpc_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetNodeVGPURequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_noderpc_noderpc_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetNodeVGPUReply); i { + case 0: + 
return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: file_noderpc_noderpc_proto_rawDesc, + NumEnums: 0, + NumMessages: 5, + NumExtensions: 0, + NumServices: 1, + }, + GoTypes: file_noderpc_noderpc_proto_goTypes, + DependencyIndexes: file_noderpc_noderpc_proto_depIdxs, + MessageInfos: file_noderpc_noderpc_proto_msgTypes, + }.Build() + File_noderpc_noderpc_proto = out.File + file_noderpc_noderpc_proto_rawDesc = nil + file_noderpc_noderpc_proto_goTypes = nil + file_noderpc_noderpc_proto_depIdxs = nil +} diff --git a/cmd/vGPUmonitor/noderpc/noderpc.proto b/cmd/vGPUmonitor/noderpc/noderpc.proto new file mode 100644 index 000000000..611c3ec8c --- /dev/null +++ b/cmd/vGPUmonitor/noderpc/noderpc.proto @@ -0,0 +1,61 @@ +// Copyright 2015 gRPC authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +option go_package = "gitlab.4pd.io/vGPUmonitor"; +option java_multiple_files = true; +option java_package = "io.grpc.examples.helloworld"; +option java_outer_classname = "HelloWorldProto"; + +package pluginrpc; + +// The greeting service definition. 
+service NodeVGPUInfo { + // Sends a greeting + rpc GetNodeVGPU (GetNodeVGPURequest) returns (GetNodeVGPUReply) {} +} + +// The sharedProcs contains the sharedRegion +message shrregProcSlotT { + int32 pid = 1; + repeated uint64 used = 2; + int32 status = 3; +} + +// The sharedRegionT struct is the main struct for monitoring vgpu +message sharedRegionT { + int32 initializedFlag = 1; + uint32 ownerPid = 2; + uint32 sem = 3; + repeated uint64 limit = 4; + repeated uint64 sm_limit = 5; + repeated shrregProcSlotT procs = 6; +} + +message podusage { + string poduuid = 1; + sharedRegionT podvgpuinfo = 2; +} + +// The request message containing the user's name. +message GetNodeVGPURequest { + string ctruuid = 1; +} + +// The response message containing the greetings +message GetNodeVGPUReply { + string nodeid = 1; + repeated podusage nodevgpuinfo = 2; +} diff --git a/cmd/vGPUmonitor/noderpc/noderpc_grpc.pb.go b/cmd/vGPUmonitor/noderpc/noderpc_grpc.pb.go new file mode 100644 index 000000000..346ed4ce8 --- /dev/null +++ b/cmd/vGPUmonitor/noderpc/noderpc_grpc.pb.go @@ -0,0 +1,103 @@ +// Code generated by protoc-gen-go-grpc. DO NOT EDIT. + +package vGPUmonitor + +import ( + context "context" + grpc "google.golang.org/grpc" + codes "google.golang.org/grpc/codes" + status "google.golang.org/grpc/status" +) + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the grpc package it is being compiled against. +// Requires gRPC-Go v1.32.0 or later. +const _ = grpc.SupportPackageIsVersion7 + +// NodeVGPUInfoClient is the client API for NodeVGPUInfo service. +// +// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. 
+type NodeVGPUInfoClient interface { + // Sends a greeting + GetNodeVGPU(ctx context.Context, in *GetNodeVGPURequest, opts ...grpc.CallOption) (*GetNodeVGPUReply, error) +} + +type nodeVGPUInfoClient struct { + cc grpc.ClientConnInterface +} + +func NewNodeVGPUInfoClient(cc grpc.ClientConnInterface) NodeVGPUInfoClient { + return &nodeVGPUInfoClient{cc} +} + +func (c *nodeVGPUInfoClient) GetNodeVGPU(ctx context.Context, in *GetNodeVGPURequest, opts ...grpc.CallOption) (*GetNodeVGPUReply, error) { + out := new(GetNodeVGPUReply) + err := c.cc.Invoke(ctx, "/pluginrpc.NodeVGPUInfo/GetNodeVGPU", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +// NodeVGPUInfoServer is the server API for NodeVGPUInfo service. +// All implementations must embed UnimplementedNodeVGPUInfoServer +// for forward compatibility +type NodeVGPUInfoServer interface { + // Sends a greeting + GetNodeVGPU(context.Context, *GetNodeVGPURequest) (*GetNodeVGPUReply, error) + mustEmbedUnimplementedNodeVGPUInfoServer() +} + +// UnimplementedNodeVGPUInfoServer must be embedded to have forward compatible implementations. +type UnimplementedNodeVGPUInfoServer struct { +} + +func (UnimplementedNodeVGPUInfoServer) GetNodeVGPU(context.Context, *GetNodeVGPURequest) (*GetNodeVGPUReply, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetNodeVGPU not implemented") +} +func (UnimplementedNodeVGPUInfoServer) mustEmbedUnimplementedNodeVGPUInfoServer() {} + +// UnsafeNodeVGPUInfoServer may be embedded to opt out of forward compatibility for this service. +// Use of this interface is not recommended, as added methods to NodeVGPUInfoServer will +// result in compilation errors. 
+type UnsafeNodeVGPUInfoServer interface { + mustEmbedUnimplementedNodeVGPUInfoServer() +} + +func RegisterNodeVGPUInfoServer(s grpc.ServiceRegistrar, srv NodeVGPUInfoServer) { + s.RegisterService(&NodeVGPUInfo_ServiceDesc, srv) +} + +func _NodeVGPUInfo_GetNodeVGPU_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetNodeVGPURequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(NodeVGPUInfoServer).GetNodeVGPU(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/pluginrpc.NodeVGPUInfo/GetNodeVGPU", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(NodeVGPUInfoServer).GetNodeVGPU(ctx, req.(*GetNodeVGPURequest)) + } + return interceptor(ctx, in, info, handler) +} + +// NodeVGPUInfo_ServiceDesc is the grpc.ServiceDesc for NodeVGPUInfo service. +// It's only intended for direct use with grpc.RegisterService, +// and not to be introspected or modified (even as a copy) +var NodeVGPUInfo_ServiceDesc = grpc.ServiceDesc{ + ServiceName: "pluginrpc.NodeVGPUInfo", + HandlerType: (*NodeVGPUInfoServer)(nil), + Methods: []grpc.MethodDesc{ + { + MethodName: "GetNodeVGPU", + Handler: _NodeVGPUInfo_GetNodeVGPU_Handler, + }, + }, + Streams: []grpc.StreamDesc{}, + Metadata: "noderpc/noderpc.proto", +} diff --git a/cmd/vGPUmonitor/testcollector/main.go b/cmd/vGPUmonitor/testcollector/main.go new file mode 100644 index 000000000..bf1342723 --- /dev/null +++ b/cmd/vGPUmonitor/testcollector/main.go @@ -0,0 +1,145 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "log" + "net/http" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" +) + +// ClusterManager is an example for a system that might have been built without +// Prometheus in mind. It models a central manager of jobs running in a +// cluster. Thus, we implement a custom Collector called +// ClusterManagerCollector, which collects information from a ClusterManager +// using its provided methods and turns them into Prometheus Metrics for +// collection. +// +// An additional challenge is that multiple instances of the ClusterManager are +// run within the same binary, each in charge of a different zone. We need to +// make use of wrapping Registerers to be able to register each +// ClusterManagerCollector instance with Prometheus. +type ClusterManager struct { + Zone string + // Contains many more fields not listed in this example. +} + +// ReallyExpensiveAssessmentOfTheSystemState is a mock for the data gathering a +// real cluster manager would have to do. Since it may actually be really +// expensive, it must only be called once per collection. This implementation, +// obviously, only returns some made-up data. +func (c *ClusterManager) ReallyExpensiveAssessmentOfTheSystemState() ( + oomCountByHost map[string]int, ramUsageByHost map[string]float64, +) { + // Just example fake data. 
+ oomCountByHost = map[string]int{ + "foo.example.org": 42, + "bar.example.org": 2001, + } + ramUsageByHost = map[string]float64{ + "foo.example.org": 6.023e23, + "bar.example.org": 3.14, + } + return +} + +// ClusterManagerCollector implements the Collector interface. +type ClusterManagerCollector struct { + ClusterManager *ClusterManager +} + +// Descriptors used by the ClusterManagerCollector below. +var ( + oomCountDesc = prometheus.NewDesc( + "clustermanager_oom_crashes_total", + "Number of OOM crashes.", + []string{"host"}, nil, + ) + ramUsageDesc = prometheus.NewDesc( + "clustermanager_ram_usage_bytes", + "RAM usage as reported to the cluster manager.", + []string{"host"}, nil, + ) +) + +// Describe is implemented with DescribeByCollect. That's possible because the +// Collect method will always return the same two metrics with the same two +// descriptors. +func (cc ClusterManagerCollector) Describe(ch chan<- *prometheus.Desc) { + prometheus.DescribeByCollect(cc, ch) +} + +// Collect first triggers the ReallyExpensiveAssessmentOfTheSystemState. Then it +// creates constant metrics for each host on the fly based on the returned data. +// +// Note that Collect could be called concurrently, so we depend on +// ReallyExpensiveAssessmentOfTheSystemState to be concurrency-safe. +func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) { + oomCountByHost, ramUsageByHost := cc.ClusterManager.ReallyExpensiveAssessmentOfTheSystemState() + for host, oomCount := range oomCountByHost { + ch <- prometheus.MustNewConstMetric( + oomCountDesc, + prometheus.CounterValue, + float64(oomCount), + host, + ) + } + for host, ramUsage := range ramUsageByHost { + ch <- prometheus.MustNewConstMetric( + ramUsageDesc, + prometheus.GaugeValue, + ramUsage, + host, + ) + } +} + +// NewClusterManager first creates a Prometheus-ignorant ClusterManager +// instance. Then, it creates a ClusterManagerCollector for the just created +// ClusterManager. 
Finally, it registers the ClusterManagerCollector with a +// wrapping Registerer that adds the zone as a label. In this way, the metrics +// collected by different ClusterManagerCollectors do not collide. +func NewClusterManager(zone string, reg prometheus.Registerer) *ClusterManager { + c := &ClusterManager{ + Zone: zone, + } + cc := ClusterManagerCollector{ClusterManager: c} + prometheus.WrapRegistererWith(prometheus.Labels{"zone": zone}, reg).MustRegister(cc) + return c +} + +func main() { + // Since we are dealing with custom Collector implementations, it might + // be a good idea to try it out with a pedantic registry. + reg := prometheus.NewPedanticRegistry() + + // Construct cluster managers. In real code, we would assign them to + // variables to then do something with them. + NewClusterManager("db", reg) + NewClusterManager("ca", reg) + + // Add the standard process and Go metrics to the custom registry. + reg.MustRegister( + prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{}), + prometheus.NewGoCollector(), + ) + + http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{})) + log.Fatal(http.ListenAndServe(":8080", nil)) +} diff --git a/cmd/vGPUmonitor/validation.go b/cmd/vGPUmonitor/validation.go new file mode 100644 index 000000000..fbba208b6 --- /dev/null +++ b/cmd/vGPUmonitor/validation.go @@ -0,0 +1,37 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package main + +import ( + "fmt" + "os" +) + +var requiredEnvVars = map[string]bool{ + "HOOK_PATH": true, + "OTHER_ENV_VAR": false, +} + +func ValidateEnvVars() error { + for envVar, required := range requiredEnvVars { + _, exists := os.LookupEnv(envVar) + if required && !exists { + return fmt.Errorf("required environment variable %s not set", envVar) + } + } + return nil +} diff --git a/deployments/4pd-vgpu/Chart.yaml b/deployments/4pd-vgpu/Chart.yaml deleted file mode 100644 index 7a0a9ed2b..000000000 --- a/deployments/4pd-vgpu/Chart.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v2 -name: 4pd-vgpu -version: 0.0.1 -kubeVersion: ">= 1.16.0" -description: 4paradigm vgpu for kubernetes -keywords: - - vgpu - - gpu -type: application -maintainers: - - name: 4pd - email: opensource@4paradigm.com -appVersion: 0.0.1 diff --git a/deployments/4pd-vgpu/templates/device-plugin/daemonset.yaml b/deployments/4pd-vgpu/templates/device-plugin/daemonset.yaml deleted file mode 100644 index fd3e93b5f..000000000 --- a/deployments/4pd-vgpu/templates/device-plugin/daemonset.yaml +++ /dev/null @@ -1,74 +0,0 @@ -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: {{ include "4pd-vgpu.device-plugin" . }} - labels: - app.kubernetes.io/component: 4pd-device-plugin - {{- include "4pd-vgpu.labels" . | nindent 4 }} - {{- with .Values.global.labels }} - {{- toYaml . | nindent 4 }} - {{- end }} - {{- if .Values.global.annotations }} - annotations: {{ toYaml .Values.global.annotations | nindent 4}} - {{- end }} -spec: - selector: - matchLabels: - app.kubernetes.io/component: 4pd-device-plugin - {{- include "4pd-vgpu.selectorLabels" . | nindent 6 }} - template: - metadata: - labels: - app.kubernetes.io/component: 4pd-device-plugin - 4pd.io/webhook: ignore - {{- include "4pd-vgpu.selectorLabels" . 
| nindent 8 }} - {{- if .Values.devicePlugin.podAnnotations }} - annotations: {{ toYaml .Values.devicePlugin.podAnnotations | nindent 8 }} - {{- end }} - spec: - {{- include "4pd-vgpu.imagePullSecrets" . | nindent 6}} - # serviceAccountName: - priorityClassName: system-node-critical - containers: - - name: device-plugin - image: {{ .Values.devicePlugin.image | quote }} - imagePullPolicy: {{ .Values.devicePlugin.imagePullPolicy | quote }} - command: - - device-plugin - - --resource-name={{ .Values.resourceName }} - - --scheduler-endpoint={{ printf "%s:%d" ( include "4pd-vgpu.scheduler" . ) ( int .Values.scheduler.service.grpcPort ) }} - {{- range .Values.devicePlugin.extraArgs }} - - {{ . }} - {{- end }} - env: - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - volumeMounts: - - name: device-plugin - mountPath: /var/lib/kubelet/device-plugins - - name: lib - mountPath: /usr/local/vgpu - - name: sock - mountPath: /var/lib/vgpu - volumes: - - name: device-plugin - hostPath: - path: {{ .Values.devicePlugin.pluginPath }} - - name: lib - hostPath: - path: /usr/local/vgpu - #path: {{ .Values.devicePlugin.libPath }} - - name: sock - hostPath: - path: /var/lib/vgpu - #path: {{ .Values.devicePlugin.sockPath }} - {{- if .Values.devicePlugin.nodeSelector }} - nodeSelector: {{ toYaml .Values.devicePlugin.nodeSelector | nindent 8 }} - {{- end }} - {{- if .Values.devicePlugin.tolerations }} - tolerations: {{ toYaml .Values.devicePlugin.tolerations | nindent 8 }} - {{- end }} - - diff --git a/deployments/4pd-vgpu/templates/scheduler/configmap.yaml b/deployments/4pd-vgpu/templates/scheduler/configmap.yaml deleted file mode 100644 index ad432f153..000000000 --- a/deployments/4pd-vgpu/templates/scheduler/configmap.yaml +++ /dev/null @@ -1,29 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "4pd-vgpu.scheduler" . }} - labels: - app.kubernetes.io/component: 4pd-scheduler - {{- include "4pd-vgpu.labels" . 
| nindent 4 }} -data: - config.yaml: | - apiVersion: kubescheduler.config.k8s.io/v1beta1 - kind: KubeSchedulerConfiguration - healthzBindAddress: 0.0.0.0:10251 - leaderElection: - leaderElect: false - metricsBindAddress: 0.0.0.0:10251 - profiles: - - schedulerName: {{ .Values.schedulerName }} - extenders: - - urlPrefix: "https://127.0.0.1:443" - filterVerb: filter - nodeCacheCapable: true - weight: 1 - httpTimeout: 30s - enableHTTPS: true - tlsConfig: - insecure: true - managedResources: - - name: {{ .Values.resourceName }} - ignoredByScheduler: true diff --git a/deployments/4pd-vgpu/templates/scheduler/deployment.yaml b/deployments/4pd-vgpu/templates/scheduler/deployment.yaml deleted file mode 100644 index 489c9b348..000000000 --- a/deployments/4pd-vgpu/templates/scheduler/deployment.yaml +++ /dev/null @@ -1,82 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "4pd-vgpu.scheduler" . }} - labels: - app.kubernetes.io/component: 4pd-scheduler - {{- include "4pd-vgpu.labels" . | nindent 4 }} - {{- with .Values.global.labels }} - {{- toYaml . | nindent 4 }} - {{- end }} - {{- if .Values.global.annotations }} - annotations: {{ toYaml .Values.global.annotations | nindent 4}} - {{- end }} -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/component: 4pd-scheduler - {{- include "4pd-vgpu.selectorLabels" . | nindent 6 }} - template: - metadata: - labels: - app.kubernetes.io/component: 4pd-scheduler - {{- include "4pd-vgpu.selectorLabels" . | nindent 8 }} - 4pd.io/webhook: ignore - {{- if .Values.scheduler.podAnnotations }} - annotations: {{ toYaml .Values.scheduler.podAnnotations | nindent 8 }} - {{- end }} - spec: - {{- include "4pd-vgpu.imagePullSecrets" . | nindent 6}} - serviceAccountName: {{ include "4pd-vgpu.scheduler" . 
}} - priorityClassName: system-node-critical - containers: - - name: kube-scheduler - image: {{ .Values.scheduler.kubeScheduler.image | quote }} - imagePullPolicy: {{ .Values.scheduler.kubeScheduler.imagePullPolicy | quote }} - command: - - kube-scheduler - - --config=/config/config.yaml - {{- range .Values.scheduler.kubeScheduler.extraArgs }} - - {{ . }} - {{- end }} - volumeMounts: - - name: scheduler-config - mountPath: /config - - name: vgpu-scheduler-extender - image: {{ .Values.scheduler.extender.image | quote }} - imagePullPolicy: {{ .Values.scheduler.extender.imagePullPolicy | quote }} - command: - - scheduler - - --resource-name={{ .Values.resourceName }} - - --http_bind=0.0.0.0:443 - - --grpc_bind=0.0.0.0:1080 - - --cert_file=/tls/tls.crt - - --key_file=/tls/tls.key - - --scheduler-name={{ .Values.schedulerName }} - {{- range .Values.scheduler.extender.extraArgs }} - - {{ . }} - {{- end }} - ports: - - name: http - containerPort: 443 - protocol: TCP - - name: grpc - containerPort: 1080 - protocol: TCP - volumeMounts: - - name: tls-config - mountPath: /tls - volumes: - - name: tls-config - secret: - secretName: {{ template "4pd-vgpu.scheduler.tls" . }} - - name: scheduler-config - configMap: - name: {{ template "4pd-vgpu.scheduler" . }} - {{- if .Values.scheduler.nodeSelector }} - nodeSelector: {{ toYaml .Values.scheduler.nodeSelector | nindent 8 }} - {{- end }} - {{- if .Values.scheduler.tolerations }} - tolerations: {{ toYaml .Values.scheduler.tolerations | nindent 8 }} - {{- end }} diff --git a/deployments/4pd-vgpu/templates/scheduler/service.yaml b/deployments/4pd-vgpu/templates/scheduler/service.yaml deleted file mode 100644 index 989969810..000000000 --- a/deployments/4pd-vgpu/templates/scheduler/service.yaml +++ /dev/null @@ -1,28 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: {{ include "4pd-vgpu.scheduler" . }} - labels: - app.kubernetes.io/component: 4pd-scheduler - {{- include "4pd-vgpu.labels" . 
| nindent 4 }} - {{- if .Values.scheduler.service.labels }} - {{ toYaml .Values.scheduler.service.labels | indent 4 }} - {{- end }} - {{- if .Values.scheduler.service.annotations }} - annotations: {{ toYaml .Values.scheduler.service.annotations | nindent 4 }} - {{- end }} -spec: - type: ClusterIP - ports: - - name: http - port: {{ .Values.scheduler.service.httpPort }} - targetPort: 443 - protocol: TCP - - name: grpc - port: {{ .Values.scheduler.service.grpcPort }} - targetPort: 1080 - protocol: TCP - selector: - app.kubernetes.io/component: 4pd-scheduler - {{- include "4pd-vgpu.selectorLabels" . | nindent 4 }} - diff --git a/deployments/4pd-vgpu/templates/scheduler/serviceaccount.yaml b/deployments/4pd-vgpu/templates/scheduler/serviceaccount.yaml deleted file mode 100644 index 2846e6fab..000000000 --- a/deployments/4pd-vgpu/templates/scheduler/serviceaccount.yaml +++ /dev/null @@ -1,8 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ include "4pd-vgpu.scheduler" . }} - namespace: {{ .Release.Namespace | quote }} - labels: - app.kubernetes.io/component: "4pd-scheduler" - {{- include "4pd-vgpu.labels" . | nindent 4 }} diff --git a/deployments/4pd-vgpu/templates/scheduler/webhook.yaml b/deployments/4pd-vgpu/templates/scheduler/webhook.yaml deleted file mode 100644 index d044b737c..000000000 --- a/deployments/4pd-vgpu/templates/scheduler/webhook.yaml +++ /dev/null @@ -1,41 +0,0 @@ -apiVersion: admissionregistration.k8s.io/v1 -kind: MutatingWebhookConfiguration -metadata: - name: {{ include "4pd-vgpu.scheduler.webhook" . }} -webhooks: - - admissionReviewVersions: - - v1 - clientConfig: - service: - name: {{ include "4pd-vgpu.scheduler" . 
}} - namespace: {{ .Release.Namespace }} - path: /webhook - port: {{ .Values.scheduler.service.httpPort }} - failurePolicy: Fail - matchPolicy: Equivalent - name: vgpu.4pd.io - namespaceSelector: - matchExpressions: - - key: 4pd.io/webhook - operator: NotIn - values: - - ignore - objectSelector: - matchExpressions: - - key: 4pd.io/webhook - operator: NotIn - values: - - ignore - reinvocationPolicy: Never - rules: - - apiGroups: - - "" - apiVersions: - - v1 - operations: - - CREATE - resources: - - pods - scope: '*' - sideEffects: None - timeoutSeconds: 10 diff --git a/deployments/4pd-vgpu/values.yaml b/deployments/4pd-vgpu/values.yaml deleted file mode 100644 index 4b61feffe..000000000 --- a/deployments/4pd-vgpu/values.yaml +++ /dev/null @@ -1,64 +0,0 @@ -# Default values for 4pd-vgpu. - -nameOverride: "" -fullnameOverride: "" - -imagePullSecrets: [] -# -resourceName: "nvidia.com/gpu" -schedulerName: "4pd-scheduler" - -podSecurityPolicy: - enabled: false - -global: - labels: {} - annotations: {} - -scheduler: - kubeScheduler: - image: "4pdosc/kube-scheduler:v1.20.9" - imagePullPolicy: IfNotPresent - extraArgs: - - -v=4 - extender: - image: "m7-ieg-pico-test01:5000/k8s-vgpu:master" - imagePullPolicy: Always - extraArgs: - - --debug - - -v=4 - podAnnotations: {} - nodeSelector: {} - tolerations: [] - #serviceAccountName: "4pd-vgpu-scheduler-sa" - patch: - image: docker.io/jettech/kube-webhook-certgen:v1.5.2 - imagePullPolicy: IfNotPresent - priorityClassName: "" - podAnnotations: {} - nodeSelector: {} - tolerations: [] - runAsUser: 2000 - - service: - httpPort: 443 - grpcPort: 1080 - labels: {} - annotations: {} - -devicePlugin: - image: "m7-ieg-pico-test01:5000/k8s-vgpu:master" - imagePullPolicy: Always - extraArgs: - - --device-split-count=2 - - -v=4 - - pluginPath: /var/lib/kubelet/device-plugins - #libPath: /usr/local/vgpu - #sockPath: /var/lib/vgpu - - podAnnotations: {} - nodeSelector: - gpu: "on" - tolerations: [] - diff --git a/docker/Dockerfile 
b/docker/Dockerfile index aaf921882..c922a3d79 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,29 +1,33 @@ -ARG GOLANG_IMAGE=golang:1.16.6-buster -ARG NVIDIA_IMAGE=nvidia/cuda:11.2.1-base-ubuntu20.04 -FROM $GOLANG_IMAGE AS build +ARG GOLANG_IMAGE +ARG NVIDIA_IMAGE +FROM $GOLANG_IMAGE AS build +FROM $GOLANG_IMAGE AS GOBUILD +ARG GOPROXY ADD . /k8s-vgpu -ENV GOPRIVATE="gitlab.4pd.io/*" -ARG GOPROXY=https://goproxy.cn,direct -ARG VERSION="unknown" RUN cd /k8s-vgpu && make all +FROM $NVIDIA_IMAGE AS NVBUILD +COPY ./libvgpu /libvgpu +WORKDIR /libvgpu +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get -y update; apt-get -y install cmake +RUN bash ./build.sh -FROM $NVIDIA_IMAGE - +FROM nvidia/cuda:12.4.1-base-ubuntu22.04 ENV NVIDIA_DISABLE_REQUIRE="true" ENV NVIDIA_VISIBLE_DEVICES=all ENV NVIDIA_DRIVER_CAPABILITIES=utility -ARG VERSION="unknown" +ARG VERSION LABEL version="$VERSION" LABEL maintainer="opensource@4paradigm.com" - COPY ./LICENSE /k8s-vgpu/LICENSE -COPY --from=build /k8s-vgpu/bin /k8s-vgpu/bin +COPY --from=GOBUILD /k8s-vgpu/bin /k8s-vgpu/bin COPY ./docker/entrypoint.sh /k8s-vgpu/bin/entrypoint.sh COPY ./lib /k8s-vgpu/lib +COPY --from=NVBUILD /libvgpu/build/libvgpu.so /k8s-vgpu/lib/nvidia/ ENV PATH="/k8s-vgpu/bin:${PATH}" - -ENTRYPOINT ["entrypoint.sh"] +ARG DEST_DIR +ENTRYPOINT ["/bin/bash", "-c", "entrypoint.sh $DEST_DIR"] diff --git a/docker/Dockerfile.withlib b/docker/Dockerfile.withlib new file mode 100644 index 000000000..77d5450f0 --- /dev/null +++ b/docker/Dockerfile.withlib @@ -0,0 +1,28 @@ +ARG GOLANG_IMAGE +ARG NVIDIA_IMAGE +FROM $GOLANG_IMAGE AS build + +FROM $GOLANG_IMAGE AS GOBUILD +ADD . 
/k8s-vgpu +ARG GOPROXY=https://goproxy.cn,direct +RUN cd /k8s-vgpu && make all + +FROM ubuntu:20.04 +ENV NVIDIA_DISABLE_REQUIRE="true" +ENV NVIDIA_VISIBLE_DEVICES=all +ENV NVIDIA_DRIVER_CAPABILITIES=utility + +ARG VERSION +LABEL version="$VERSION" +LABEL maintainer="opensource@4paradigm.com" +COPY ./LICENSE /k8s-vgpu/LICENSE +COPY --from=GOBUILD /k8s-vgpu/bin /k8s-vgpu/bin +COPY ./docker/entrypoint.sh /k8s-vgpu/bin/entrypoint.sh +COPY ./lib /k8s-vgpu/lib +COPY ./libvgpu.so /k8s-vgpu/lib/nvidia/ +COPY ./license /k8s-vgpu/lib/nvidia/ +COPY ./vgpuvalidator /k8s-vgpu/lib/nvidia + +ENV PATH="/k8s-vgpu/bin:${PATH}" +ARG DEST_DIR +ENTRYPOINT ["/bin/bash", "-c", "entrypoint.sh $DEST_DIR"] diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 3a7fa08cd..4b9322b11 100755 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright © 2021 peizhaoyou +# Copyright 2024 The HAMi Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,8 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# - -if [ $1 == "device-plugin" ]; then - cp -f /k8s-vgpu/lib/* /usr/local/vgpu/ -fi -exec "$@" \ No newline at end of file +# if [ $1 == "device-plugin" ]; then +# cp -f /k8s-vgpu/lib/* $DEST_DIR/vgpu +# fi +exec "$@" diff --git a/docs/CHANGELOG/CHANGELOG-0.0.0.md b/docs/CHANGELOG/CHANGELOG-0.0.0.md new file mode 100644 index 000000000..d48b69147 --- /dev/null +++ b/docs/CHANGELOG/CHANGELOG-0.0.0.md @@ -0,0 +1,29 @@ + + +**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)* + +- [v0.0.0](#v000) + - [Downloads for v0.0.0](#downloads-for-v000) + - [Changelog since v0.0.0](#changelog-since-v000) + - [Changes by Kind](#changes-by-kind) + - [Bug Fixes](#bug-fixes) + - [Others](#others) + + + +# v0.0.0 +## Downloads for v0.0.0 + +Download v0.0.0 in the [v0.0.0 release page](https://github.com/Project-HAMi/HAMi/releases/tag/v0.0.0). + +## Changelog since v0.0.0 +### Changes by Kind +#### Bug Fixes +None. + +### Deprecation +None. + +#### Others +None. + diff --git a/docs/ascend910b-support.md b/docs/ascend910b-support.md new file mode 100644 index 000000000..459a5335a --- /dev/null +++ b/docs/ascend910b-support.md @@ -0,0 +1,63 @@ +## Introduction + +**We now support huawei.com/Ascend910 by implementing most device-sharing features as nvidia-GPU**, including: + +***NPU sharing***: Each task can allocate a portion of Ascend NPU instead of a whole NLU card, thus NPU can be shared among multiple tasks. + +***Device Memory Control***: Ascend NPUs can be allocated with certain device memory size and guarantee it that it does not exceed the boundary. + +***Device Core Control***: Ascend NPUs can be allocated with certain compute cores and guarantee it that it does not exceed the boundary. 
+ + +## Prerequisites + +* Ascend device type: 910B(300T A2) +* driver version >= 24.1.rc1 +* Ascend docker runtime + +## Enabling Ascend-sharing Support + +* Install the chart using helm, See 'enabling vGPU support in kubernetes' section [here](https://github.com/Project-HAMi/HAMi#enabling-vgpu-support-in-kubernetes) + +* Tag Ascend-910B node with the following command +``` +kubectl label node {ascend-node} accelerator=huawei-Ascend910 +``` + +* Install [Ascend docker runtime](https://gitee.com/ascend/ascend-docker-runtime) + +* Download yaml for Ascend-vgpu-device-plugin from HAMi Project [here](https://github.com/Project-HAMi/ascend-device-plugin/blob/master/build/ascendplugin-910-hami.yaml), and deploy + +``` +wget https://raw.githubusercontent.com/Project-HAMi/ascend-device-plugin/master/build/ascendplugin-910-hami.yaml +kubectl apply -f ascendplugin-910-hami.yaml +``` + +## Running Ascend jobs + +Ascend 910Bs can now be requested by a container +using the `huawei.com/ascend910` and `huawei.com/ascend910-memory` resource type: + +``` +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod +spec: + containers: + - name: ubuntu-container + image: ascendhub.huawei.com/public-ascendhub/ascend-mindspore:23.0.RC3-centos7 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + huawei.com/Ascend910: 1 # requesting 1 vGPUs + huawei.com/Ascend910-memory: 2000 # requesting 2000m device memory +``` + +## Notes + +1. Currently, the Ascend 910b supports only two sharding strategies, which are 1/4 and 1/2. The memory request of the job will automatically align with the most close sharding strategy. In this example, the task will allocate 16384M device memory. + +1. Ascend-910B-sharing in init container is not supported. + +2. `huawei.com/Ascend910-memory` only work when `huawei.com/Ascend910=1`. 
\ No newline at end of file diff --git a/docs/ascend910b-support_cn.md b/docs/ascend910b-support_cn.md new file mode 100644 index 000000000..f53e15e82 --- /dev/null +++ b/docs/ascend910b-support_cn.md @@ -0,0 +1,60 @@ +## 简介 + +本组件支持复用华为升腾910B设备,并为此提供以下几种与vGPU类似的复用功能,包括: + +*** NPU 共享***: 每个任务可以只占用一部分显卡,多个任务可以共享一张显卡 + +***可限制分配的显存大小***: 你现在可以用显存值(例如3000M)来分配NPU,本组件会确保任务使用的显存不会超过分配数值 + +***可限制分配的算力大小***: 你现在可以用百分比来分配 NPU的算力,本组件会确保任务使用的算力不会超过分配数值 + +## 节点需求 + +* Ascend docker runtime +* driver version > 24.1.rc1 +* Ascend device type: 910B(300T A2) + +## 开启NPU复用 + +* 通过helm部署本组件, 参照[主文档中的开启vgpu支持章节](https://github.com/Project-HAMi/HAMi/blob/master/README_cn.md#kubernetes开启vgpu支持) + +* 使用以下指令,为Ascend 910B所在节点打上label +``` +kubectl label node {ascend-node} accelerator=huawei-Ascend910 +``` + +* 部署[Ascend docker runtime](https://gitee.com/ascend/ascend-docker-runtime) + +* 从HAMi项目中获取并安装[ascend-device-plugin](https://github.com/Project-HAMi/ascend-device-plugin/blob/master/build/ascendplugin-910-hami.yaml),并进行部署 + +``` +wget https://raw.githubusercontent.com/Project-HAMi/ascend-device-plugin/master/build/ascendplugin-910-hami.yaml +kubectl apply -f ascendplugin-910-hami.yaml +``` + + +## 运行NPU任务 + +``` +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod +spec: + containers: + - name: ubuntu-container + image: ascendhub.huawei.com/public-ascendhub/ascend-mindspore:23.0.RC3-centos7 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + huawei.com/Ascend910: 1 # requesting 1 vGPUs + huawei.com/Ascend910-memory: 2000 # requesting 2000m device memory +``` + +## 注意事项 + +1. 目前Ascend910B设备,只支持2种粒度的切分,分别是1/4卡和1/2卡,分配的显存会自动对齐到在分配额之上最近的粒度上 + +2. 在init container中无法使用NPU复用功能 + +3. 
只有申请单MLU的任务可以指定显存`Ascend910-memory`的数值,若申请的NPU数量大于1,则所有申请的NPU都会被整卡分配 diff --git a/docs/benchmark.md b/docs/benchmark.md new file mode 100644 index 000000000..91611d211 --- /dev/null +++ b/docs/benchmark.md @@ -0,0 +1,49 @@ +## Benchmarks + +Three instances from ai-benchmark have been used to evaluate vGPU-device-plugin performance as follows + +| Test Environment | description | +| ---------------- | :------------------------------------------------------: | +| Kubernetes version | v1.12.9 | +| Docker version | 18.09.1 | +| GPU Type | Tesla V100 | +| GPU Num | 2 | + +| Test instance | description | +| ------------- | :---------------------------------------------------------: | +| nvidia-device-plugin | k8s + nvidia k8s-device-plugin | +| vGPU-device-plugin | k8s + VGPU k8s-device-plugin,without virtual device memory | +| vGPU-device-plugin(virtual device memory) | k8s + VGPU k8s-device-plugin,with virtual device memory | + +Test Cases: + +| test id | case | type | params | +| ------- | :-----------: | :-------: | :---------------------: | +| 1.1 | Resnet-V2-50 | inference | batch=50,size=346*346 | +| 1.2 | Resnet-V2-50 | training | batch=20,size=346*346 | +| 2.1 | Resnet-V2-152 | inference | batch=10,size=256*256 | +| 2.2 | Resnet-V2-152 | training | batch=10,size=256*256 | +| 3.1 | VGG-16 | inference | batch=20,size=224*224 | +| 3.2 | VGG-16 | training | batch=2,size=224*224 | +| 4.1 | DeepLab | inference | batch=2,size=512*512 | +| 4.2 | DeepLab | training | batch=1,size=384*384 | +| 5.1 | LSTM | inference | batch=100,size=1024*300 | +| 5.2 | LSTM | training | batch=10,size=1024*300 | + +Test Result: ![img](../imgs/benchmark_inf.png) + +![img](../imgs/benchmark_train.png) + +To reproduce: + +1. install k8s-vGPU-scheduler,and configure properly +2. run benchmark job + +``` +$ kubectl apply -f benchmarks/ai-benchmark/ai-benchmark.yml +``` + +3. 
View the result by using kubctl logs + +``` +$ kubectl logs [pod id] \ No newline at end of file diff --git a/docs/benchmark_cn.md b/docs/benchmark_cn.md new file mode 100644 index 000000000..c1f5f1fa8 --- /dev/null +++ b/docs/benchmark_cn.md @@ -0,0 +1,50 @@ +## 性能测试 + +在测试报告中,我们一共在下面五种场景都执行了ai-benchmark 测试脚本,并汇总最终结果: + +| 测试环境 | 环境描述 | +| ---------------- | :------------------------------------------------------: | +| Kubernetes version | v1.12.9 | +| Docker version | 18.09.1 | +| GPU Type | Tesla V100 | +| GPU Num | 2 | + +| 测试名称 | 测试用例 | +| -------- | :------------------------------------------------: | +| Nvidia-device-plugin | k8s + nvidia官方k8s-device-plugin | +| vGPU-device-plugin | k8s + VGPU k8s-device-plugin,无虚拟显存 | +| vGPU-device-plugin(virtual device memory) | k8s + VGPU k8s-device-plugin,高负载,开启虚拟显存 | + +测试内容 + +| test id | 名称 | 类型 | 参数 | +| ------- | :-----------: | :-------: | :---------------------: | +| 1.1 | Resnet-V2-50 | inference | batch=50,size=346*346 | +| 1.2 | Resnet-V2-50 | training | batch=20,size=346*346 | +| 2.1 | Resnet-V2-152 | inference | batch=10,size=256*256 | +| 2.2 | Resnet-V2-152 | training | batch=10,size=256*256 | +| 3.1 | VGG-16 | inference | batch=20,size=224*224 | +| 3.2 | VGG-16 | training | batch=2,size=224*224 | +| 4.1 | DeepLab | inference | batch=2,size=512*512 | +| 4.2 | DeepLab | training | batch=1,size=384*384 | +| 5.1 | LSTM | inference | batch=100,size=1024*300 | +| 5.2 | LSTM | training | batch=10,size=1024*300 | + +测试结果: ![img](../imgs/benchmark_inf.png) + +![img](../imgs/benchmark_train.png) + +测试步骤: + +1. 安装nvidia-device-plugin,并配置相应的参数 +2. 运行benchmark任务 + +``` +$ kubectl apply -f benchmarks/ai-benchmark/ai-benchmark.yml +``` + +3. 
通过kubctl logs 查看结果 + +``` +$ kubectl logs [pod id] +``` \ No newline at end of file diff --git a/docs/cambricon-mlu-support.md b/docs/cambricon-mlu-support.md new file mode 100644 index 000000000..49ded3884 --- /dev/null +++ b/docs/cambricon-mlu-support.md @@ -0,0 +1,81 @@ +## Introduction + +**We now support cambricon.com/mlu by implementing most device-sharing features as nvidia-GPU**, including: + +***MLU sharing***: Each task can allocate a portion of MLU instead of a whole MLU card, thus MLU can be shared among multiple tasks. + +***Device Memory Control***: MLUs can be allocated with certain device memory size and guarantee it that it does not exceed the boundary. + +***Device Core Control***: MLUs can be allocated with certain compute cores and guarantee it that it does not exceed the boundary. + +***MLU Type Specification***: You can specify which type of MLU to use or to avoid for a certain task, by setting "cambricon.com/use-mlutype" or "cambricon.com/nouse-mlutype" annotations. + + +## Prerequisites + +* neuware-mlu370-driver > 5.10 +* cntoolkit > 2.5.3 + +## Enabling MLU-sharing Support + +* Install the chart using helm, See 'enabling vGPU support in kubernetes' section [here](https://github.com/Project-HAMi/HAMi#enabling-vgpu-support-in-kubernetes) + +* Activate the smlu mode for each MLUs on that node + +``` +cnmon set -c 0 -smlu on +cnmon set -c 1 -smlu on +... +``` + +* Get cambricon-device-plugin from your device provider and specify the following parameters during deployment: + +`mode=dynamic-smlu`, `min-dsmlu-unit=256` + +These two parameters represent enabling the dynamic smlu function and setting the minimum allocable memory unit to 256 MB, respectively. 
You can refer to the document from device provider for more details + +* Deploy the cambricon-device-plugin you just specified + +``` +kubectl apply -f cambricon-device-plugin-daemonset.yaml +``` + +## Running MLU jobs + +Cambricon MLUs can now be requested by a container +using the `cambricon.com/vmlu` ,`cambricon.com/mlu.smlu.vmemory` and `cambricon.com/mlu.smlu.vcore` resource type: + +``` +apiVersion: apps/v1 +kind: Deployment +metadata: + name: binpack-1 + labels: + app: binpack-1 +spec: + replicas: 1 + selector: + matchLabels: + app: binpack-1 + template: + metadata: + labels: + app: binpack-1 + spec: + containers: + - name: c-1 + image: ubuntu:18.04 + command: ["sleep"] + args: ["100000"] + resources: + limits: + cambricon.com/vmlu: "1" + cambricon.com/mlu.smlu.vmemory: "20" + cambricon.com/mlu.smlu.vcore: "10" +``` + +## Notes + +1. Mlu-sharing in init container is not supported, pods with "combricon.com/mlumem" in init container will never be scheduled. + +2. `cambricon.com/mlu.smlu.vmemory`, `cambricon.com/mlu.smlu.vcore` only work when `cambricon.com/vmlu=1`, otherwise, whole MLUs are allocated when `cambricon.com/vmlu>1` regardless of `cambricon.com/mlu.smlu.vmemory` and `cambricon.com/mlu.smlu.vcore`. 
\ No newline at end of file diff --git a/docs/cambricon-mlu-support_cn.md b/docs/cambricon-mlu-support_cn.md new file mode 100644 index 000000000..ba7acaaf2 --- /dev/null +++ b/docs/cambricon-mlu-support_cn.md @@ -0,0 +1,75 @@ +## 简介 + +本组件支持复用寒武纪MLU设备,并为此提供以下几种与vGPU类似的复用功能,包括: + +***MLU 共享***: 每个任务可以只占用一部分显卡,多个任务可以共享一张显卡 + +***可限制分配的显存大小***: 你现在可以用显存值(例如3000M)来分配MLU,本组件会确保任务使用的显存不会超过分配数值 + +***可限制分配的算力大小***: 你现在可以用百分比来分配MLU的算力,本组件会确保任务使用的算力不会超过分配数值 + +***指定MLU型号***:当前任务可以通过设置annotation("cambricon.com/use-mlutype","cambricon.com/nouse-mlutype")的方式,来选择使用或者不使用某些具体型号的MLU + +## 节点需求 + +* neuware-mlu370-driver > 5.10 +* cntoolkit > 2.5.3 + +## 开启MLU复用 + +* 通过helm部署本组件, 参照[主文档中的开启vgpu支持章节](https://github.com/Project-HAMi/HAMi/blob/master/README_cn.md#kubernetes开启vgpu支持) + +* 使用以下指令,为MLU节点打上label +``` +kubectl label node {mlu-node} mlu=on +``` + +* 从您的设备提供商处获取cambricon-device-plugin,并配置以下两个参数: + +`mode=dynamic-smlu`, `min-dsmlu-unit=256` + +它们分别代表开启MLU复用功能,与设置最小可分配的内存单元为256M,您可以参考设备提供方的文档来获取更多的配置信息。 + +* 部署配置后的`cambricon-device-plugin` + +``` +kubectl apply -f cambricon-device-plugin-daemonset.yaml +``` + + +## 运行MLU任务 + +``` +apiVersion: apps/v1 +kind: Deployment +metadata: + name: binpack-1 + labels: + app: binpack-1 +spec: + replicas: 1 + selector: + matchLabels: + app: binpack-1 + template: + metadata: + labels: + app: binpack-1 + spec: + containers: + - name: c-1 + image: ubuntu:18.04 + command: ["sleep"] + args: ["100000"] + resources: + limits: + cambricon.com/vmlu: "1" + cambricon.com/mlu.smlu.vmemory: "20" + cambricon.com/mlu.smlu.vcore: "10" +``` + +## 注意事项 + +1. 在init container中无法使用MLU复用功能,否则该任务不会被调度 + +2. 
只有申请单MLU的任务可以指定显存`mlu.smlu.vmemory`和算力`mlu.smlu.vcore`的数值,若申请的MLU数量大于1,则所有申请的MLU都会被整卡分配 diff --git a/docs/config.md b/docs/config.md new file mode 100644 index 000000000..498eb7a6d --- /dev/null +++ b/docs/config.md @@ -0,0 +1,58 @@ +# Global Config + +you can customize your vGPU support by setting the following parameters using `-set`, for example + +``` +helm install vgpu-charts/vgpu vgpu --set devicePlugin.deviceMemoryScaling=5 ... +``` + +* `devicePlugin.service.schedulerPort:` + Integer type, by default: 31998, scheduler webhook service nodePort. +* `devicePlugin.deviceMemoryScaling:` + Float type, by default: 1. The ratio for NVIDIA device memory scaling, can be greater than 1 (enable virtual device memory, experimental feature). For NVIDIA GPU with *M* memory, if we set `devicePlugin.deviceMemoryScaling` argument to *S*, vGPUs splitted by this GPU will totally get `S * M` memory in Kubernetes with our device plugin. +* `devicePlugin.deviceSplitCount:` + Integer type, by default: equals 10. Maximum tasks assigned to a simple GPU device. +* `devicePlugin.migstrategy:` + String type, "none" for ignoring MIG features or "mixed" for allocating MIG device by seperate resources. Default "none" +* `devicePlugin.disablecorelimit:` + String type, "true" for disable core limit, "false" for enable core limit, default: false +* `scheduler.defaultMem:` + Integer type, by default: 5000. The default device memory of the current task, in MB +* `scheduler.defaultCores:` + Integer type, by default: equals 0. Percentage of GPU cores reserved for the current task. If assigned to 0, it may fit in any GPU with enough device memory. If assigned to 100, it will use an entire GPU card exclusively. +* `scheduler.defaultGPUNum:` + Integer type, by default: equals 1, if configuration value is 0, then the configuration value will not take effect and will be filtered. 
when a user does not set the nvidia.com/gpu key in pod resources, the webhook should check the three keys nvidia.com/gpumem, nvidia.com/gpumem-percentage and nvidia.com/gpucores; if any one of these keys has a value, the webhook should add the nvidia.com/gpu key with this default value to the resources limits map.
+* `scheduler.defaultSchedulerPolicy.nodeSchedulerPolicy:` String type, default value is "binpack", representing the GPU node scheduling policy. "binpack" means trying to allocate tasks to the same GPU node as much as possible, while "spread" means trying to allocate tasks to different GPU nodes as much as possible.
+* `scheduler.defaultSchedulerPolicy.gpuSchedulerPolicy:` String type, default value is "spread", representing the GPU scheduling policy. "binpack" means trying to allocate tasks to the same GPU as much as possible, while "spread" means trying to allocate tasks to different GPUs as much as possible.
+* `resourceName:`
+  String type, vgpu number resource name, default: "nvidia.com/gpu"
+* `resourceMem:`
+  String type, vgpu memory size resource name, default: "nvidia.com/gpumem"
+* `resourceMemPercentage:`
+  String type, vgpu memory fraction resource name, default: "nvidia.com/gpumem-percentage"
+* `resourceCores:`
+  String type, vgpu cores resource name, default: "nvidia.com/cores"
+* `resourcePriority:`
+  String type, vgpu task priority name, default: "nvidia.com/priority"
+
+# Container config envs
+
+* `GPU_CORE_UTILIZATION_POLICY:`
+  String type, "default", "force", "disable"
+  default: "default"
+  "default" means the default utilization policy
+  "force" means the container will always limit the core utilization below "nvidia.com/gpucores"
+  "disable" means the container will ignore the utilization limitation set by "nvidia.com/gpucores" during task execution
+
+* `ACTIVE_OOM_KILLER:`
+  Bool type, "true","false"
+  default: false
+  "true" means there will be a daemon process which monitors all running tasks inside this container, and instantly kills any process which exceeds the 
limitation set by "nvidia.com/gpumem" or "nvidia.com/gpumemory" + +* `CUDA_DISABLE_CONTROL` + Bool type, "true","false" + default: false + "true" means the HAMi-core will not be used inside container, as a result, there will be no resource isolation and limitaion in that container, only for debug. + + + diff --git a/docs/config_cn.md b/docs/config_cn.md new file mode 100644 index 000000000..b94761fec --- /dev/null +++ b/docs/config_cn.md @@ -0,0 +1,49 @@ +# 全局配置 + +你可以在安装过程中,通过`-set`来修改以下的客制化参数,例如: + +``` +helm install vgpu vgpu-charts/vgpu --set devicePlugin.deviceMemoryScaling=5 ... +``` + +* `devicePlugin.deviceSplitCount:` + 整数类型,预设值是10。GPU的分割数,每一张GPU都不能分配超过其配置数目的任务。若其配置为N的话,每个GPU上最多可以同时存在N个任务。 +* `devicePlugin.deviceMemoryScaling:` + 浮点数类型,预设值是1。NVIDIA装置显存使用比例,可以大于1(启用虚拟显存,实验功能)。对于有*M*显存大小的NVIDIA GPU,如果我们配置`devicePlugin.deviceMemoryScaling`参数为*S*,在部署了我们装置插件的Kubenetes集群中,这张GPU分出的vGPU将总共包含 `S * M` 显存。 +* `devicePlugin.migStrategy:` + 字符串类型,目前支持"none“与“mixed“两种工作方式,前者忽略MIG设备,后者使用专门的资源名称指定MIG设备,使用详情请参考mix_example.yaml,默认为"none" +* `devicePlugin.disablecorelimit:` + 字符串类型,"true"为关闭算力限制,"false"为启动算力限制,默认为"false" +* `scheduler.defaultMem:` + 整数类型,预设值为5000,表示不配置显存时使用的默认显存大小,单位为MB +* `scheduler.defaultCores:` + 整数类型(0-100),默认为0,表示默认为每个任务预留的百分比算力。若设置为0,则代表任务可能会被分配到任一满足显存需求的GPU中,若设置为100,代表该任务独享整张显卡 +* `scheduler.defaultGPUNum:` + 整数类型,默认为1,如果配置为0,则配置不会生效。当用户在 pod 资源中没有设置 nvidia.com/gpu 这个 key 时,webhook 会检查 nvidia.com/gpumem、resource-mem-percentage、nvidia.com/gpucores 这三个 key 中的任何一个 key 有值,webhook 都会添加 nvidia.com/gpu 键和此默认值到 resources limit中。 +* `scheduler.defaultSchedulerPolicy.nodeSchedulerPolicy:` 字符串类型,预设值为"binpack", 表示GPU节点调度策略,"binpack"表示尽量将任务分配到同一个GPU节点上,"spread"表示尽量将任务分配到不同GPU节点上。 +* `scheduler.defaultSchedulerPolicy.gpuSchedulerPolicy:` 字符串类型,预设值为"spread", 表示GPU调度策略,"binpack"表示尽量将任务分配到同一个GPU上,"spread"表示尽量将任务分配到不同GPU上。 +* `resourceName:` + 字符串类型, 申请vgpu个数的资源名, 默认: "nvidia.com/gpu" +* `resourceMem:` + 字符串类型, 申请vgpu显存大小资源名, 默认: "nvidia.com/gpumem" +* 
`resourceMemPercentage:` + 字符串类型,申请vgpu显存比例资源名,默认: "nvidia.com/gpumem-percentage" +* `resourceCores:` + 字符串类型, 申请vgpu算力资源名, 默认: "nvidia.com/cores" +* `resourcePriority:` + 字符串类型,表示申请任务的任务优先级,默认: "nvidia.com/priority" + +# 容器配置(在容器的环境变量中指定) + +* `GPU_CORE_UTILIZATION_POLICY:` + 字符串类型,"default", "force", "disable" + 默认为"default" + 代表容器算力限制策略, "default"为默认,"force"为强制限制算力,一般用于测试算力限制的功能,"disable"为忽略算力限制 +* `ACTIVE_OOM_KILLER:` + 布尔类型,"true", "false" + 默认为false + 若设置为true,则代表监控系统将会持续监控进程的显存使用量,并主动kill掉任何用超配额的进行。 +* `CUDA_DISABLE_CONTROL` + 布尔类型,"true", "false" + 默认为false + 若设置为true,则代表屏蔽掉容器层的资源隔离机制,需要注意的是,这个参数只有在容器创建时指定才会生效,一般用于调试 \ No newline at end of file diff --git a/docs/dashboard.md b/docs/dashboard.md new file mode 100644 index 000000000..878d70b0e --- /dev/null +++ b/docs/dashboard.md @@ -0,0 +1,54 @@ +## Grafana Dashboard + +- You can load this dashboard json file [gpu-dashboard.json](./gpu-dashboard.json) + +- This dashboard also includes some NVIDIA DCGM metrics: + + [dcgm-exporter](https://github.com/NVIDIA/dcgm-exporter) deploy:`kubectl create -f https://raw.githubusercontent.com/NVIDIA/dcgm-exporter/master/dcgm-exporter.yaml` + +- use this prometheus custom metric configure: + +```yaml +- job_name: 'kubernetes-vgpu-exporter' + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_endpoints_name] + regex: vgpu-device-plugin-monitor + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_pod_node_name] + regex: (.*) + target_label: node_name + replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_pod_host_ip] + regex: (.*) + target_label: ip + replacement: $1 + action: replace +- job_name: 'kubernetes-dcgm-exporter' + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_endpoints_name] + regex: dcgm-exporter + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_pod_node_name] + regex: (.*) + target_label: node_name + 
replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_pod_host_ip] + regex: (.*) + target_label: ip + replacement: $1 + action: replace +``` + +- reload promethues: + +```bash +curl -XPOST http://{promethuesServer}:{port}/-/reload +``` diff --git a/docs/dashboard_cn.md b/docs/dashboard_cn.md new file mode 100644 index 000000000..20ee3ecb8 --- /dev/null +++ b/docs/dashboard_cn.md @@ -0,0 +1,53 @@ +## Grafana Dashboard + +- 你可以在 grafana 中导入此 [gpu-dashboard.json](./gpu-dashboard.json) +- 此 dashboard 还包括一部分 NVIDIA DCGM 监控指标: + + [dcgm-exporter](https://github.com/NVIDIA/dcgm-exporter)部署:`kubectl create -f https://raw.githubusercontent.com/NVIDIA/dcgm-exporter/master/dcgm-exporter.yaml` + +- 添加 prometheus 自定义的监控项: + +```yaml +- job_name: 'kubernetes-vgpu-exporter' + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_endpoints_name] + regex: vgpu-device-plugin-monitor + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_pod_node_name] + regex: (.*) + target_label: node_name + replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_pod_host_ip] + regex: (.*) + target_label: ip + replacement: $1 + action: replace +- job_name: 'kubernetes-dcgm-exporter' + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_endpoints_name] + regex: dcgm-exporter + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_pod_node_name] + regex: (.*) + target_label: node_name + replacement: ${1} + action: replace + - source_labels: [__meta_kubernetes_pod_host_ip] + regex: (.*) + target_label: ip + replacement: $1 + action: replace +``` + +- 加载 promethues 配置: + +```bash +curl -XPOST http://{promethuesServer}:{port}/-/reload +``` diff --git a/docs/develop/design.md b/docs/develop/design.md new file mode 100644 index 000000000..11326b53f --- /dev/null +++ b/docs/develop/design.md @@ -0,0 +1,28 @@ +# Design + + + +The architect of HAMi 
is shown in the figure above, It is organized in the form of "chart". + +- MutatingWebhook + +The MutatingWebhook checks the validity of each task, and set the "schedulerName" to "HAMi scheduler" if the resource requests have been recognized by HAMi +If Not, the MutatingWebhook does nothing and pass this task to default-scheduler. + +- Scheduler + +HAMi support default kube-scheduler and volcano-scheduler, it implements an extender and register 'Filter' and 'Score' methods to deal with sharable devices. +When a pod with sharable device request arrives, 'Filter' searches the cluster and returns a list of 'available' nodes. 'Score' scores each node 'Filter' returned, and pick the highest one to host the pod. It patches the schedule decision on corresponding pod annotations, for the detailed protocol, see protocol.md + +- DevicePlugin + +When the schedule decision is made, scheduler calls devicePlugin on that node to generate environment variables and mounts according to pod annotations. +Please note that, the DP used here is a customized version, you need to install according to README document with that device. Most officaial DP will not fit in HAMi, and will result in unexpected behaviour + +- InContainer Control + +The implementation of in-container hard limit is different for diffent devices. For example, HAMi-Core is responsible for NVIDIA devices. libvgpu-control.so is responsible for iluvatar devices, etc. HAMi needs to pass the correct environment variables in order for it to operate. + + + +In summary, The flowchart of pod is descirbed as the figure above. 
diff --git a/docs/develop/imgs/flowchart.jpeg b/docs/develop/imgs/flowchart.jpeg new file mode 100644 index 000000000..1cbe0a590 Binary files /dev/null and b/docs/develop/imgs/flowchart.jpeg differ diff --git a/docs/develop/imgs/gpu-scheduler-policy-demo.png b/docs/develop/imgs/gpu-scheduler-policy-demo.png new file mode 100644 index 000000000..fe122ec42 Binary files /dev/null and b/docs/develop/imgs/gpu-scheduler-policy-demo.png differ diff --git a/docs/develop/imgs/node-shceduler-policy-demo.png b/docs/develop/imgs/node-shceduler-policy-demo.png new file mode 100644 index 000000000..5e33fc402 Binary files /dev/null and b/docs/develop/imgs/node-shceduler-policy-demo.png differ diff --git a/docs/develop/imgs/offline_validation.png b/docs/develop/imgs/offline_validation.png new file mode 100644 index 000000000..8dec96255 Binary files /dev/null and b/docs/develop/imgs/offline_validation.png differ diff --git a/docs/develop/imgs/protocol_pod.png b/docs/develop/imgs/protocol_pod.png new file mode 100644 index 000000000..0fff3c6b6 Binary files /dev/null and b/docs/develop/imgs/protocol_pod.png differ diff --git a/docs/develop/imgs/protocol_register.png b/docs/develop/imgs/protocol_register.png new file mode 100644 index 000000000..94c2529e3 Binary files /dev/null and b/docs/develop/imgs/protocol_register.png differ diff --git a/docs/develop/imgs/scheduler-policy-story.png b/docs/develop/imgs/scheduler-policy-story.png new file mode 100644 index 000000000..91039543b Binary files /dev/null and b/docs/develop/imgs/scheduler-policy-story.png differ diff --git a/docs/develop/protocol.md b/docs/develop/protocol.md new file mode 100644 index 000000000..0473b98d5 --- /dev/null +++ b/docs/develop/protocol.md @@ -0,0 +1,67 @@ +# Protocol + +## Device Register + + + +HAMi needs to know the spec of each AI devices in the cluster in order to schedule properly. 
During device registration, device-plugin needs to keep patching the spec of each device into node annotations every 30 seconds, in the format of the following:
+
+```
+hami.io/node-handshake-{device-type}: Reported_{device_node_current_timestamp}
+hami.io/node-register-{device-type}: {Device 1}:{Device2}:...:{Device N}
+```
+
+The definition of each device is in the following format:
+```
+{Device UUID},{device split count},{device memory limit},{device core limit},{device type},{device numa},{healthy}
+```
+
+An example is shown below:
+```
+hami.io/node-handshake-nvidia: Reported 2024-01-23 04:30:04.434037031 +0000 UTC m=+1104711.777756895
+hami.io/node-handshake-mlu: Requesting_2024.01.10 04:06:57
+hami.io/node-mlu-register: MLU-45013011-2257-0000-0000-000000000000,10,23308,0,MLU-MLU370-X4,0,false:MLU-54043011-2257-0000-0000-000000000000,10,23308,0,
+hami.io/node-nvidia-register: GPU-00552014-5c87-89ac-b1a6-7b53aa24b0ec,10,32768,100,NVIDIA-Tesla V100-PCIE-32GB,0,true:GPU-0fc3eda5-e98b-a25b-5b0d-cf5c855d1448,10,32768,100,NVIDIA-Tesla V100-PCIE-32GB,0,true:
+
+```
+In this example, this node has two different AI devices, 2 Nvidia-V100 GPUs, and 2 Cambricon 370-X4 MLUs
+
+Note that a device node may become unavailable due to hardware or network failure, if a node hasn't registered in last 5 minutes, scheduler will mark that node as 'unavailable'.
+
+Since system clock on scheduler node and 'device' node may not align properly, scheduler node will patch the following device node annotations every 30s
+
+```
+hami.io/node-handshake-{device-type}: Requesting_{scheduler_node_current_timestamp}
+```
+
+If the hami.io/node-handshake annotation remains in "Requesting_xxxx" and {scheduler current timestamp} > 5 mins + {scheduler timestamp in annotations}, then this device on that node will be marked "unavailable" in scheduler. 
+ + +## Schedule Decision + + + +HAMi scheduler needs to patch schedule decisions into pod annotations, in the format of the following: + +``` +hami.io/devices-to-allocate:{ctr1 request}:{ctr2 request}:...{Last ctr request}: +hami.io/device-node: {schedule decision node} +hami.io/device-schedule-time: {timestamp} +``` + +each container request is in the following format: + +``` +{device UUID},{device type keywork},{device memory request}:{device core request} +``` + +for example: + +A pod with 2 containers, first container requests 1 GPU with 3G device Memory, second container requests 1 GPU with 5G device Memory, then the patched annotations will be like the + +``` +hami.io/devices-to-allocate: GPU-0fc3eda5-e98b-a25b-5b0d-cf5c855d1448,NVIDIA,3000,0:GPU-0fc3eda5-e98b-a25b-5b0d-cf5c855d1448,NVIDIA,5000,0: +hami.io/vgpu-node: node67-4v100 +hami.io/vgpu-time: 1705054796 +``` + diff --git a/docs/develop/roadmap.md b/docs/develop/roadmap.md new file mode 100644 index 000000000..70fd2b3f8 --- /dev/null +++ b/docs/develop/roadmap.md @@ -0,0 +1,10 @@ +# roadmap + +| feature | description | release | Example | Example expected behaviour | +|--------------------|----------------------------------------------------------------------------------------------------------------------------------------|---------------|--------------|------------| +| Kubernetes schedule layer | Support Resource Quota for vgpu-memory | v3.2.0 | "requests.nvidia.com/gpu-memory: 30000" in ResourceQuota | Pods in this namespace can allocate up to 30G device memory in this namespace | +| | Support Best-fit, idle-first, Numa-first Schedule Policy | v3.2.0 | add "scheduler policy configmap" | execute schedule policy according to configMap | +| | Support k8s 1.28 version with compatable to v1.16 | v3.1.0 | | | +| Add more Heterogeneous AI computing device | HuaWei Ascend Support | v3.1.0 | | | +| | Iluvatar GPU support | v3.1.0 | | | +| |Teco DPU Support | v3.2.0 | | | diff --git 
a/docs/develop/scheduler-policy.md b/docs/develop/scheduler-policy.md new file mode 100644 index 000000000..ed209ba0b --- /dev/null +++ b/docs/develop/scheduler-policy.md @@ -0,0 +1,167 @@ +# Scheduler Policy Design + +## Summary + +Current in a cluster with many GPU nodes, nodes are not `binpack` or `spread` when making scheduling decisions, nor are GPU cards `binpack` or `spread` when using vGPU. + +## Proposal + +We add a `node-scheduler-policy` and `gpu-scheduler-policy` to config, then scheduler to use this policy can impl node `binpack` or `spread` or GPU `binpack` or `spread`. and +use can set Pod annotation to change this default policy, use `hami.io/node-scheduler-policy` and `hami.io/gpu-scheduler-policy` to overlay scheduler config. + +### User Stories + +This is a GPU cluster, having two node, the following story takes this cluster as a prerequisite. + +![scheduler-policy-story.png](./imgs/scheduler-policy-story.png) + +#### Story 1 + +node binpack, use one node’s GPU card whenever possible, egs: +- cluster resources: + - node1: GPU having 4 GPU device + - node2: GPU having 4 GPU device + +- request: + - pod1: User 1 GPU + - pod2: User 1 GPU + +- scheduler result: + - pod1: scheduler to node1 + - pod2: scheduler to node1 + +#### Story 2 + +node spread, use GPU cards from different nodes as much as possible, egs: + +- cluster resources: + - node1: GPU having 4 GPU device + - node2: GPU having 4 GPU device + +- request: + - pod1: User 1 GPU + - pod2: User 1 GPU + +- scheduler result: + - pod1: scheduler to node1 + - pod2: scheduler to node2 + +#### Story 3 + +GPU binpack, use the same GPU card as much as possible, egs: + +- cluster resources: + - node1: GPU having 4 GPU device, they are GPU1,GPU2,GPU3,GPU4 + +- request: + - pod1: User 1 GPU, gpucore is 20%, gpumem-percentage is 20% + - pod2: User 1 GPU, gpucore is 20%, gpumem-percentage is 20% + +- scheduler result: + - pod1: scheduler to node1, select GPU1 this device + - pod2: scheduler to node1, select 
GPU1 this device + +#### Story 4 + +GPU spread, use different GPU cards when possible, egs: + +- cluster resources: + - node1: GPU having 4 GPU device, they are GPU1,GPU2,GPU3,GPU4 + +- request: + - pod1: User 1 GPU, gpucore is 20%, gpumem-percentage is 20% + - pod2: User 1 GPU, gpucore is 20%, gpumem-percentage is 20% + +- scheduler result: + - pod1: scheduler to node1, select GPU1 this device + - pod2: scheduler to node1, select GPU2 this device + +## Design Details + +### Node-scheduler-policy + +![node-shceduler-policy-demo.png](./imgs/node-shceduler-policy-demo.png) + +#### Binpack + +Binpack mainly considers node resource usage. The more full the usage, the higher the score. + +``` +score: ((request + used) / allocatable) * 10 +``` + +1. Binpack scoring information for Node 1 is as follows + +``` +Node1 score: ((1+3)/4) * 10= 10 +``` + +2. Binpack scoring information for Node 2 is as follows + +``` +Node2 score: ((1+2)/4) * 10= 7.5 +``` + +So, in `Binpack` policy we can select `Node1`. + +#### Spread + +Spread mainly considers node resource usage. The less it is used, the higher the score. + +``` +score: ((request + used) / allocatable) * 10 +``` + +1. Spread scoring information for Node 1 is as follows +``` +Node1 score: ((1+3)/4) * 10= 10 +``` + +2. Spread scoring information for Node 2 is as follows +``` +Node2 score: ((1+2)/4) * 10= 7.5 +``` + +So, in `Spread` policy we can select `Node2`. + +### GPU-scheduler-policy + +![gpu-scheduler-policy-demo.png](./imgs/gpu-scheduler-policy-demo.png) + +#### Binpack + +Binpack mainly focuses on the computing power and video memory usage of each card. The more it is used, the higher the score. +``` +score: ((request.core + used.core) / allocatable.core + (request.mem + used.mem) / allocatable.mem)) * 10 +``` + +1. Binpack scoring information for GPU 1 is as follows +``` +GPU1 Score: ((20+10)/100 + (1000+2000)/8000)) * 10 = 6.75 +``` + +2. 
Binpack scoring information for GPU 2 is as follows +``` +GPU2 Score: ((20+70)/100 + (1000+6000)/8000)) * 10 = 17.75 +``` + +So, in `Binpack` policy we can select `GPU2`. + +#### Spread + +Spread mainly focuses on the computing power and video memory usage of each card. The less it is used, the higher the score. +``` +score: ((request.core + used.core) / allocatable.core + (request.mem + used.mem) / allocatable.mem)) * 10 +``` + +1. Spread scoring information for GPU 1 is as follows +``` +GPU1 Score: ((20+10)/100 + (1000+2000)/8000)) * 10 = 6.75 +``` + +2. Spread scoring information for GPU 2 is as follows +``` +GPU2 Score: ((20+70)/100 + (1000+6000)/8000)) * 10 = 17.75 +``` + +So, in `Spread` policy we can select `GPU1`. \ No newline at end of file diff --git a/docs/develop/tasklist.md b/docs/develop/tasklist.md new file mode 100644 index 000000000..873366f98 --- /dev/null +++ b/docs/develop/tasklist.md @@ -0,0 +1,118 @@ +# Tasks + +## Support Moore threads MTT S4000 + +``` +resources: +requests: + mthreads.com/gpu: ${num} + mthreads.com/vcuda-core: ${core} + mthreads.com/vcuda-memory: ${mem} +limits: + mthreads.com/gpu: ${num} + mthreads.com/vcuda-core: ${core} + mthreads.com/vcuda-memory: ${mem} +``` + +## Support Birentech Model 110 + +``` +resources: +requests: + birentech.com/gpu: ${num} + birentech.com/vcuda-core: ${core} + birentech.com/vcuda-memory: ${mem} +limits: + birentech.com/gpu: ${num} + birentech.com/vcuda-core: ${core} + birentech.com/vcuda-memory: ${mem} +``` + +## Support iluvatar MR-V100 + +``` +resources: +requests: + iluvatar.ai/gpu: ${num} + iluvatar.ai/vcuda-core: ${core} + iluvatar.ai/vcuda-memory: ${mem} +limits: + iluvatar.ai/gpu: ${num} + iluvatar.ai/vcuda-core: ${core} + iluvatar.ai/vcuda-memory: ${mem} +``` + +## Support HuaWei Ascend 910B device + +``` +resources: + requests: + ascend.com/npu: ${num} + ascend.com/npu-core: ${core} + ascend.com/npu-mem: ${mem} + limits: + ascend.com/npu: ${num} + ascend.com/npu-core: ${core} + 
ascend.com/npu-mem: ${mem}
+```
+
+## Support resourceQuota for Kubernetes
+
+Description: ResourceQuota is frequently used in Kubernetes namespaces. Since the number of virtual devices doesn't mean anything, we need to support the limitation in deviceMemory.
+
+For example, the following resourceQuota
+```
+cat <<EOF > compute-resources.yaml
+apiVersion: v1
+kind: ResourceQuota
+metadata:
+  name: compute-resources
+spec:
+  hard:
+    requests.cpu: "1"
+    requests.memory: 1Gi
+    limits.cpu: "2"
+    limits.memory: 2Gi
+    requests.nvidia.com/gpu-memory: 30000
+EOF
+```
+
+with the following command
+```
+kubectl create -f ./compute-resources.yaml --namespace=myspace
+```
+
+will limit the maximum device memory allocated to namespace 'myspace' to 30G
+
+## Support multiple schedule policies
+
+Description: HAMi needs to support multiple schedule policies to meet the needs of complex scenarios; a pod can select a schedule policy in the annotations field.
+
+The effect of each schedule policy is shown in the table below
+
+| Schedule Policy | Effect |
+| -------- | ------- |
+| best-fit | the fewer device memory remains, the higher score |
+| idle-first | idle GPU has higher score |
+| numa-first | for multiple GPU allocations, GPUs on the same numa have higher score |
+
+
+For example, if a pod wants to select a 'best-fit' schedule policy, it can specify .metadata.annotations as the code below:
+
+```
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-pod
+  annotations:
+    nvidia.com/schedule-policy: "best-fit"
+spec:
+  containers:
+  - name: ubuntu-container
+    image: ubuntu:18.04
+    command: ["bash","-c","sleep 86400"]
+    resources:
+      limits:
+        nvidia.com/gpu: 2 # requesting 2 VGPUs
+```
+
diff --git a/docs/gpu-dashboard.json b/docs/gpu-dashboard.json
new file mode 100644
index 000000000..2f71c23ea
--- /dev/null
+++ b/docs/gpu-dashboard.json
@@ -0,0 +1,1150 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "$$hashKey": "object:192",
+        "builtIn": 1,
+        "datasource": "-- Grafana --",
+        "enable": 
true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "This dashboard is gpu metrics dashboard base on NVIDIA DCGM Exporter and HAMi/k8s-vgpu-scheduler", + "editable": true, + "gnetId": 12239, + "graphTooltip": 0, + "id": 46, + "iteration": 1694498903162, + "links": [], + "panels": [ + { + "datasource": "ALL", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 83 + }, + { + "color": "red", + "value": 87 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 14, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "7.5.17", + "targets": [ + { + "expr": "avg(DCGM_FI_DEV_GPU_TEMP{node_name=~\"${node_name}\", gpu=~\"${gpu}\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "GPU平均温度", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": "ALL", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 2400, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 1800 + }, + { + "color": "red", + "value": 2200 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 4, + "x": 4, + "y": 0 + }, + "id": 16, + "links": [], + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "sum" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + 
"showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "7.5.17", + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_POWER_USAGE{node_name=~\"${node_name}\", gpu=~\"${gpu}\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "GPU总功率", + "type": "gauge" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "ALL", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 0 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sort": "current", + "sortDesc": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "DCGM_FI_DEV_GPU_TEMP{node_name=~\"${node_name}\", gpu=~\"${gpu}\"}", + "instant": false, + "interval": "", + "legendFormat": "{{node_name}} gpu{{gpu}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPU温度", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:97", + "format": "celsius", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:98", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + 
], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "ALL", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 8, + "x": 16, + "y": 0 + }, + "hiddenSeries": false, + "id": 2, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": false, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "DCGM_FI_DEV_SM_CLOCK{node_name=~\"${node_name}\", gpu=~\"${gpu}\"} * 1000000", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{node_name}} gpu{{gpu}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPU SM时钟频率(DCGM_FI_DEV_SM_CLOCK)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:462", + "decimals": null, + "format": "hertz", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:463", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "ALL", + "fieldConfig": { + 
"defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 10 + }, + "hiddenSeries": false, + "id": 18, + "legend": { + "avg": true, + "current": false, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "DCGM_FI_DEV_FB_USED{node_name=~\"${node_name}\", gpu=~\"${gpu}\"}", + "interval": "", + "legendFormat": "{{node_name}} gpu{{gpu}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPU帧缓存(显存)使用量(DCGM_FI_DEV_FB_USED)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:618", + "format": "decmbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:619", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "ALL", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 10 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": true, + "min": true, + "rightSide": false, + 
"show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "DCGM_FI_DEV_POWER_USAGE{node_name=~\"${node_name}\", gpu=~\"${gpu}\"}", + "interval": "", + "legendFormat": "{{node_name}} gpu{{gpu}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPU功率消耗(DCGM_FI_DEV_POWER_USAGE)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:214", + "format": "watt", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:215", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "ALL", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 20 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + 
"spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "DCGM_FI_DEV_GPU_UTIL{node_name=~\"${node_name}\", gpu=~\"${gpu}\"}", + "interval": "", + "legendFormat": "{{node_name}} gpu{{gpu}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPU利用率(DCGM_FI_DEV_GPU_UTIL)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:699", + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:700", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "ALL" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 20 + }, + "hiddenSeries": false, + "id": 26, + "legend": { + "alignAsTable": false, + "avg": true, + "current": false, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.3.14", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "ALL" + }, + "exemplar": true, + "expr": "Device_utilization_desc_of_container{node_name=~\"${node_name}\"}", + "interval": "", + "legendFormat": "{{podname}}", + "refId": "A" + } + ], 
+ "thresholds": [], + "timeRegions": [], + "title": "HAMi-pod算力使用率", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:779", + "format": "percent", + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:780", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "ALL", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 20 + }, + "hiddenSeries": false, + "id": 24, + "legend": { + "alignAsTable": false, + "avg": true, + "current": false, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "Device_memory_desc_of_container{node_name=~\"${node_name}\"}", + "interval": "", + "legendFormat": "{{podname}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "HAMi-pod显存使用量(byte)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:779", + "format": "bytes", + "label": null, + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:780", + "format": "short", + "label": 
null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "ALL", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 30 + }, + "hiddenSeries": false, + "id": 22, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "HostGPUMemoryUsage{node_name=~\"${node_name}\"}", + "interval": "", + "legendFormat": "{{node_name}} gpu {{deviceid}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "HAMi-节点GPU显存使用量", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1087", + "format": "bytes", + "label": null, + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:1088", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "ALL", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + 
"gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 30 + }, + "hiddenSeries": false, + "id": 20, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "HostCoreUtilization{node_name=~\"${node_name}\"}", + "interval": "", + "legendFormat": "{{node_name}} gpu {{deviceid}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "HAMi-节点GPU算力使用率", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1243", + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1244", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": false, + "schemaVersion": 27, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "selected": false + }, + "datasource": "ALL", + "definition": "label_values({__name__=~\"DCGM_FI_DEV_FB_FREE|vGPU_device_memory_limit_in_bytes\"}, node_name)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": null, + "multi": true, + "name": "node_name", + "options": [], + "query": { + "query": 
"label_values({__name__=~\"DCGM_FI_DEV_FB_FREE|vGPU_device_memory_limit_in_bytes\"}, node_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": false, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "ALL", + "definition": "label_values(DCGM_FI_DEV_FB_FREE{node_name=\"$node_name\"},gpu)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "gpu", + "options": [], + "query": { + "query": "label_values(DCGM_FI_DEV_FB_FREE{node_name=\"$node_name\"},gpu)", + "refId": "ALL-gpu-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-12h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "k8s-vgpu-scheduler Dashboard", + "uid": "Oxed_c6Wz1", + "version": 3 +} diff --git a/docs/how-to-use-volcano-vgpu.md b/docs/how-to-use-volcano-vgpu.md new file mode 100644 index 000000000..d6edcf616 --- /dev/null +++ b/docs/how-to-use-volcano-vgpu.md @@ -0,0 +1,130 @@ +# Volcano vgpu device plugin for Kubernetes + +**Note**: + +You *DON'T* need to install HAMi when using volcano-vgpu, only use +[Volcano vgpu device-plugin](https://github.com/Project-HAMi/volcano-vgpu-device-plugin) is good enough. It can provide device-sharing mechanism for NVIDIA devices managed by volcano. + +This is based on [Nvidia Device Plugin](https://github.com/NVIDIA/k8s-device-plugin), it uses [HAMi-core](https://github.com/Project-HAMi/HAMi-core) to support hard isolation of GPU card. 
+ +Volcano vgpu is only available in volcano > 1.9 + +## Quick Start + +### Install Volcano + +helm repo add volcano-sh https://volcano-sh.github.io/helm-charts +helm install volcano volcano-sh/volcano -n volcano-system --create-namespace + +### Configure scheduler + +update the scheduler configuration: + +```shell script +kubectl edit cm -n volcano-system volcano-scheduler-configmap +``` + +```yaml +kind: ConfigMap +apiVersion: v1 +metadata: + name: volcano-scheduler-configmap + namespace: volcano-system +data: + volcano-scheduler.conf: | + actions: "enqueue, allocate, backfill" + tiers: + - plugins: + - name: priority + - name: gang + - name: conformance + - plugins: + - name: drf + - name: deviceshare + arguments: + deviceshare.VGPUEnable: true # enable vgpu + - name: predicates + - name: proportion + - name: nodeorder + - name: binpack +``` + +### Enabling GPU Support in Kubernetes + +Once you have enabled this option on *all* the GPU nodes you wish to use, +you can then enable GPU support in your cluster by deploying the following Daemonset: + +``` +$ kubectl create -f https://raw.githubusercontent.com/Project-HAMi/volcano-vgpu-device-plugin/main/volcano-vgpu-device-plugin.yml +``` + +### Verify environment is ready + +Check the node status, it is ok if `volcano.sh/vgpu-number` is included in the allocatable resources. + +```shell script +$ kubectl get node {node name} -oyaml +... 
+status: + addresses: + - address: 172.17.0.3 + type: InternalIP + - address: volcano-control-plane + type: Hostname + allocatable: + cpu: "4" + ephemeral-storage: 123722704Ki + hugepages-1Gi: "0" + hugepages-2Mi: "0" + memory: 8174332Ki + pods: "110" + volcano.sh/gpu-number: "10" # vGPU resource + capacity: + cpu: "4" + ephemeral-storage: 123722704Ki + hugepages-1Gi: "0" + hugepages-2Mi: "0" + memory: 8174332Ki + pods: "110" + volcano.sh/gpu-memory: "89424" + volcano.sh/gpu-number: "10" # vGPU resource +``` + +### Running VGPU Jobs + +VGPU can be requested by both set "volcano.sh/vgpu-number" , "volcano.sh/vgpu-cores" and "volcano.sh/vgpu-memory" in resource.limit + +```shell script +$ cat < **WARNING:** *if you don't request GPUs when using the device plugin with NVIDIA images all +> the GPUs on the machine will be exposed inside your container. +> The number of vgpu used by a container can not exceed the number of gpus on that node.* + +### Monitor + +volcano-scheduler-metrics records every GPU usage and limitation, visit the following address to get these metrics. + +``` +curl {volcano scheduler cluster ip}:8080/metrics +``` + diff --git a/docs/hygon-dcu-support.md b/docs/hygon-dcu-support.md new file mode 100644 index 000000000..fbf043e74 --- /dev/null +++ b/docs/hygon-dcu-support.md @@ -0,0 +1,79 @@ +## Introduction + +**We now support hygon.com/dcu by implementing most device-sharing features as nvidia-GPU**, including: + +***DCU sharing***: Each task can allocate a portion of DCU instead of a whole DCU card, thus DCU can be shared among multiple tasks. + +***Device Memory Control***: DCUs can be allocated with certain device memory size on certain type(i.e Z100) and have made it that it does not exceed the boundary. 
+ +***Device compute core limitation***: DCUs can be allocated with certain percentage of device core(i.e hygon.com/dcucores:60 indicate this container uses 60% compute cores of this device) + +***DCU Type Specification***: You can specify which type of DCU to use or to avoid for a certain task, by setting "hygon.com/use-dcutype" or "hygon.com/nouse-dcutype" annotations. + +## Prerequisites + +* dtk driver >= 24.04 +* hy-smi v1.6.0 + +## Enabling DCU-sharing Support + +* Deploy the dcu-vgpu-device-plugin [here](https://github.com/Project-HAMi/dcu-vgpu-device-plugin) + + +## Running DCU jobs + +Hygon DCUs can now be requested by a container +using the `hygon.com/dcunum` , `hygon.com/dcumem` and `hygon.com/dcucores` resource type: + +``` +apiVersion: v1 +kind: Pod +metadata: + name: alexnet-tf-gpu-pod-mem + labels: + purpose: demo-tf-amdgpu +spec: + containers: + - name: alexnet-tf-gpu-container + image: pytorch:resnet50 + workingDir: /root + command: ["sleep","infinity"] + resources: + limits: + hygon.com/dcunum: 1 # requesting a GPU + hygon.com/dcumem: 2000 # each dcu require 2000 MiB device memory + hygon.com/dcucores: 60 # each dcu use 60% of total compute cores + +``` + +## Enable vDCU inside container + +You need to enable vDCU inside container in order to use it. +``` +source /opt/hygondriver/env.sh +``` + +check if you have successfully enabled vDCU by using following command + +``` +hy-virtual -show-device-info +``` + +If you have an output like this, then you have successfully enabled vDCU inside container. + +``` +Device 0: + Actual Device: 0 + Compute units: 60 + Global memory: 2097152000 bytes +``` + +Launch your DCU tasks like you usually do + +## Notes + +1. DCU-sharing in init container is not supported, pods with "hygon.com/dcumem" in init container will never be scheduled. + +2. Only one vdcu can be aquired per container. 
If you want to mount multiple dcu devices, then you shouldn't set `hygon.com/dcumem` or `hygon.com/dcucores` + + \ No newline at end of file diff --git a/docs/hygon-dcu-support_cn.md b/docs/hygon-dcu-support_cn.md new file mode 100644 index 000000000..44424dabd --- /dev/null +++ b/docs/hygon-dcu-support_cn.md @@ -0,0 +1,71 @@ +## 简介 + +本组件支持复用海光DCU设备,并为此提供以下几种与vGPU类似的复用功能,包括: + +***DCU 共享***: 每个任务可以只占用一部分显卡,多个任务可以共享一张显卡 + +***可限制分配的显存大小***: 你现在可以用显存值(例如3000M)来分配DCU,本组件会确保任务使用的显存不会超过分配数值 + +***可限制计算单元数量***: 你现在可以指定任务使用的算力比例(例如60即代表使用60%算力)来分配DCU,本组件会确保任务使用的算力不会超过分配数值 + +***指定DCU型号***:当前任务可以通过设置annotation("hygon.com/use-dcutype","hygon.com/nouse-dcutype")的方式,来选择使用或者不使用某些具体型号的DCU + +## 节点需求 + +* dtk driver >= 24.04 +* hy-smi v1.6.0 + +## 开启DCU复用 + +* 部署[dcu-vgpu-device-plugin](https://github.com/Project-HAMi/dcu-vgpu-device-plugin) + +## 运行DCU任务 + +``` +apiVersion: v1 +kind: Pod +metadata: + name: alexnet-tf-gpu-pod-mem + labels: + purpose: demo-tf-amdgpu +spec: + containers: + - name: alexnet-tf-gpu-container + image: pytorch:resnet50 + workingDir: /root + command: ["sleep","infinity"] + resources: + limits: + hygon.com/dcunum: 1 # requesting a GPU + hygon.com/dcumem: 2000 # each dcu require 2000 MiB device memory + hygon.com/dcucores: 60 # each dcu use 60% of total compute cores + +``` + +## 容器内开启虚拟DCU功能 + +使用vDCU首先需要激活虚拟环境 +``` +source /opt/hygondriver/env.sh +``` + +随后,使用hdmcli指令查看虚拟设备是否已经激活 +``` +hy-virtual -show-device-info +``` + +若输出如下,则代表虚拟设备已经成功激活 +``` +Device 0: + Actual Device: 0 + Compute units: 60 + Global memory: 2097152000 bytes +``` + +接下来正常启动DCU任务即可 + +## 注意事项 + +1. 在init container中无法使用DCU复用功能,否则该任务不会被调度 + +2. 
每个容器最多只能使用一个虚拟DCU设备, 如果您希望在容器中挂载多个DCU设备,则不能使用`hygon.com/dcumem`和`hygon.com/dcucores`字段 diff --git a/docs/iluvatar-gpu-support.md b/docs/iluvatar-gpu-support.md new file mode 100644 index 000000000..77815ed12 --- /dev/null +++ b/docs/iluvatar-gpu-support.md @@ -0,0 +1,86 @@ +## Introduction + +**We now support iluvatar.ai/gpu by implementing most device-sharing features as nvidia-GPU**, including: + +***GPU sharing***: Each task can allocate a portion of GPU instead of a whole GPU card, thus GPU can be shared among multiple tasks. + +***Device Memory Control***: GPUs can be allocated with certain device memory size on certain type(i.e m100) and have made it that it does not exceed the boundary. + +***Device Core Control***: GPUs can be allocated with limited compute cores on certain type(i.e m100) and have made it that it does not exceed the boundary. + +***Very Easy to use***: You don't need to modify your task yaml to use our scheduler. All your GPU jobs will be automatically supported after installation. 
+ +## Prerequisites + +* Iluvatar gpu-manager (please consult your device provider) +* driver version > 3.1.0 + +## Enabling GPU-sharing Support + +* Deploy gpu-manager on iluvatar nodes (Please consult your device provider to aquire its package and document) + +> **NOTICE:** *Install only gpu-manager, don't install gpu-admission package.* + +* Identify the resource name about core and memory usage(i.e 'iluvatar.ai/vcuda-core', 'iluvatar.ai/vcuda-memory') + +* set the 'iluvatarResourceMem' and 'iluvatarResourceCore' parameters when install hami + +``` +helm install hami hami-charts/hami --set scheduler.kubeScheduler.imageTag={your kubernetes version} --set iluvatarResourceMem=iluvatar.ai/vcuda-memory --set iluvatarResourceCore=iluvatar.ai/vcuda-core -n kube-system +``` + +## Running Iluvatar jobs + +Iluvatar GPUs can now be requested by a container +using the `iluvatar.ai/vgpu`, `iluvatar.ai/vcuda-memory` and `iluvatar.ai/vcuda-core` resource type: + +``` +apiVersion: v1 +kind: Pod +metadata: + name: poddemo +spec: + restartPolicy: Never + containers: + - name: poddemo + image: harbor.4pd.io/vgpu/corex_transformers@sha256:36a01ec452e6ee63c7aa08bfa1fa16d469ad19cc1e6000cf120ada83e4ceec1e + command: + - bash + args: + - -c + - | + set -ex + echo "export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH">> /root/.bashrc + cp -f /usr/local/iluvatar/lib64/libcuda.* /usr/local/corex/lib64/ + cp -f /usr/local/iluvatar/lib64/libixml.* /usr/local/corex/lib64/ + source /root/.bashrc + sleep 360000 + resources: + requests: + iluvatar.ai/vgpu: 1 + iluvatar.ai/vcuda-core: 50 + iluvatar.ai/vcuda-memory: 64 + limits: + iluvatar.ai/vgpu: 1 + iluvatar.ai/vcuda-core: 50 + iluvatar.ai/vcuda-memory: 64 +``` + +> **NOTICE1:** *Each unit of vcuda-memory indicates 256M device memory* + +> **NOTICE2:** *You can find more examples in [examples/iluvatar folder](../examples/iluvatar/)* + +## Notes + +1. 
You need to set the following prestart command in order for the device-share to work properly +``` + set -ex + echo "export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH">> /root/.bashrc + cp -f /usr/local/iluvatar/lib64/libcuda.* /usr/local/corex/lib64/ + cp -f /usr/local/iluvatar/lib64/libixml.* /usr/local/corex/lib64/ + source /root/.bashrc +``` + +2. Virtualization takes effect only for containers that apply for one GPU(i.e iluvatar.ai/vgpu=1 ) + + \ No newline at end of file diff --git a/docs/iluvatar-gpu-support_cn.md b/docs/iluvatar-gpu-support_cn.md new file mode 100644 index 000000000..ef1798064 --- /dev/null +++ b/docs/iluvatar-gpu-support_cn.md @@ -0,0 +1,84 @@ +## 简介 + +本组件支持复用天数智芯GPU设备,并为此提供以下几种与vGPU类似的复用功能,包括: + +***GPU 共享***: 每个任务可以只占用一部分显卡,多个任务可以共享一张显卡 + +***可限制分配的显存大小***: 你现在可以用显存值(例如3000M)来分配MLU,本组件会确保任务使用的显存不会超过分配数值,注意只有M100型号的M150支持可配显存 + +***可限制分配的算力核组比例***: 你现在可以用算力比例(例如60%)来分配GPU,本组件会确保任务使用的显存不会超过分配数值,注意只有M100型号的M150支持可配算力比例 + +***方便易用***: 部署本组件后,只需要部署厂家提供的gpu-manager即可使用 + + +## 节点需求 + +* Iluvatar gpu-manager (please consult your device provider) +* driver version > 3.1.0 + +## 开启GPU复用 + +* 部署'gpu-manager',天数智芯的GPU共享需要配合厂家提供的'gpu-manager'一起使用,请联系设备提供方获取 + +> **注意:** *只需要安装gpu-manager,不要安装gpu-admission.* + +* 部署'gpu-manager'之后,你需要确认显存和核组对应的资源名称(例如 'iluvatar.ai/vcuda-core', 'iluvatar.ai/vcuda-memory') + +* 在安装HAMi时配置'iluvatarResourceMem'和'iluvatarResourceCore'参数 + +``` +helm install hami hami-charts/hami --set scheduler.kubeScheduler.imageTag={your kubernetes version} --set iluvatarResourceMem=iluvatar.ai/vcuda-memory --set iluvatarResourceCore=iluvatar.ai/vcuda-core -n kube-system +``` + +## 运行GPU任务 + +``` +apiVersion: v1 +kind: Pod +metadata: + name: poddemo +spec: + restartPolicy: Never + containers: + - name: poddemo + image: harbor.4pd.io/vgpu/corex_transformers@sha256:36a01ec452e6ee63c7aa08bfa1fa16d469ad19cc1e6000cf120ada83e4ceec1e + command: + - bash + args: + - -c + - | + set -ex + echo "export 
LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH">> /root/.bashrc + cp -f /usr/local/iluvatar/lib64/libcuda.* /usr/local/corex/lib64/ + cp -f /usr/local/iluvatar/lib64/libixml.* /usr/local/corex/lib64/ + source /root/.bashrc + sleep 360000 + resources: + requests: + iluvatar.ai/vgpu: 1 + iluvatar.ai/vcuda-core: 50 + iluvatar.ai/vcuda-memory: 64 + limits: + iluvatar.ai/vgpu: 1 + iluvatar.ai/vcuda-core: 50 + iluvatar.ai/vcuda-memory: 64 +``` + +> **注意1:** *每一单位的vcuda-memory代表256M的显存.* + +> **注意2:** *查看更多的[用例](../examples/iluvatar/).* + +## 注意事项 + +1. 你需要在容器中进行如下的设置才能正常的使用共享功能 +``` + set -ex + echo "export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH">> /root/.bashrc + cp -f /usr/local/iluvatar/lib64/libcuda.* /usr/local/corex/lib64/ + cp -f /usr/local/iluvatar/lib64/libixml.* /usr/local/corex/lib64/ + source /root/.bashrc +``` + +2. 共享模式只对申请一张GPU的容器生效(iluvatar.ai/vgpu=1) + + diff --git a/docs/offline-install.md b/docs/offline-install.md new file mode 100644 index 000000000..d0bca1cc5 --- /dev/null +++ b/docs/offline-install.md @@ -0,0 +1,60 @@ +# Offline-install Maunal + +For some cluster that don't have external web access, you can install HAMi by the following step: + +1. Refer to [README.md](../README.md) until step 'Install and Uninstall' + +2. pull the following images and save them into a '.tar' file, then move it into your cluster + +Image list: +``` +projecthami/hami:{HAMi version} +docker.io/jettech/kube-webhook-certgen:v1.5.2 +liangjw/kube-webhook-certgen:v1.1.1 +registry.cn-hangzhou.aliyuncs.com/google_containers/kube-scheduler:{your kubernetes version} +``` + +``` +docker pull {iamge} && docker save {image_name} -o {image_name}.tar +``` + +3. 
Load these images using docker load, tag these images with your registry, and push them into your registry
+
+```
+docker load -i {HAMi_image}.tar
+docker tag projecthami/hami:{HAMi version} {your_inner_registry}/hami:{HAMi version}
+docker push {your_inner_registry}/hami:{HAMi version}
+docker tag docker.io/jettech/kube-webhook-certgen:v1.5.2 {your_inner_registry}/kube-webhook-certgen:v1.5.2
+docker push {your_inner_registry}/kube-webhook-certgen:v1.5.2
+docker tag liangjw/kube-webhook-certgen:v1.1.1 {your_inner_registry}/kube-webhook-certgen:v1.1.1
+docker tag registry.cn-hangzhou.aliyuncs.com/google_containers/kube-scheduler:{your kubernetes version} {your_inner_registry}/kube-scheduler:{your kubernetes version}
+docker push {your_inner_registry}/kube-scheduler:{your kubernetes version}
+```
+
+4. Download the charts folder from [github](https://github.com/Project-HAMi/HAMi/tree/master/charts), place it into ${CHART_PATH} inside cluster, then edit the following fields in ${CHART_PATH}/hami/values.yaml.
+
+```
+scheduler.kubeScheduler.image
+scheduler.extender.image
+scheduler.patch.image
+scheduler.patch.imageNew
+scheduler.devicePlugin.image
+scheduler.devicePlugin.monitorimage
+```
+
+5. Execute the following command in your ${CHART_PATH} folder
+
+```
+helm install hami hami --set scheduler.kubeScheduler.imageTag={your k8s server version} -n kube-system
+```
+
+6. 
Verify your installation
+
+Execute the following command
+```
+kubectl get pods -n kube-system
+```
+
+If you can see both the 'device-plugin' and 'scheduler' running, then HAMi is installed successfully, as shown in the figure below:
+
+
diff --git a/docs/proposals/gpu_utilization.png b/docs/proposals/gpu_utilization.png
new file mode 100644
index 000000000..ff0d997f0
Binary files /dev/null and b/docs/proposals/gpu_utilization.png differ
diff --git a/docs/proposals/gpu_utilization_cn.md b/docs/proposals/gpu_utilization_cn.md
new file mode 100644
index 000000000..8071dbe31
--- /dev/null
+++ b/docs/proposals/gpu_utilization_cn.md
@@ -0,0 +1,189 @@
+---
+title: Support GPU Utilization Metrics
+authors:
+- @chaunceyjiang
+ reviewers:
+- TBD
+ approvers:
+- TBD
+
+creation-date: 2024-04-10
+
+---
+
+# Support GPU Utilization Metrics
+
+## Summary
+Currently, HAMi supports dividing a Nvidia GPU card into several vGPU cards to efficiently utilize the
+capacity of the GPU. When I assign a vGPU to a Pod, HAMi cannot provide information on the Pod's
+utilization of the vGPU. This results in users being unable to observe the usage situation of the Pod's vGPU.
+
+This KEP proposes support for monitoring vGPU utilization.
+
+
+## Motivation
+
+### Goals
+- Support for monitoring vGPU utilization
+
+### Non-Goals
+
+Does not support monitoring of GPU utilization for non-Nvidia GPUs.
+
+## Proposal
+
+### User Stories (Optional)
+
+
+#### Story 1
+
+I have partitioned a Nvidia GPU card into 4 parts and deployed 2 Pods on this card.
+Currently, I want to observe the GPU usage of these two Pods separately, in order to assess
+whether my business logic is reasonable.
+
+Currently, HAMi provides a `HostCoreUtilization` usage rate for the entire GPU card,
+but it still cannot observe the use of GPUs from each Pod's perspective.
+ +### Notes/Constraints/Caveats (Optional) + + +### Risks and Mitigations + +Because the design scheme will expand the fields of the `struct shared_region` structure, there may be potential incompatibilities. + +## Design Details + +Modify the shared_region, add a `gpu_util` field to record the current pid's GPU usage. +```c++ +typedef struct { + uint64_t dec_util; + uint64_t enc_util; + uint64_t sm_util; +} device_gpu_t; + +typedef struct { + int32_t pid; + int32_t hostpid; + device_memory_t used[CUDA_DEVICE_MAX_COUNT]; + uint64_t monitorused[CUDA_DEVICE_MAX_COUNT]; + int32_t status; + device_gpu_t gpu_util[CUDA_DEVICE_MAX_COUNT]; // new field +} shrreg_proc_slot_t; + + +int set_gpu_device_gpu_monitor(int32_t pid,int dev, unsigned int smUtil){ // new function + //LOG_WARN("set_gpu_device_memory_monitor:%d %d %lu",pid,dev,monitor); + int i; + ensure_initialized(); + lock_shrreg(); + for (i=0;iproc_num;i++){ + if (region_info.shared_region->procs[i].hostpid == pid){ + LOG_INFO("set_gpu_device_gpu_monitor:%d %d %lu->%lu",pid,dev,region_info.shared_region->procs[i].gpuUsed[dev].smUtil,smUtil); + region_info.shared_region->procs[i].gpu_util[dev].smUtil = smUtil; + break; + } + } + unlock_shrreg(); + return 1; +} +``` + +Modify the `get_used_gpu_utilization` method, to record the GPU usage rate of the current pid. + +```c++ + +int get_used_gpu_utilization(int *userutil,int *sysprocnum) { + ... + unsigned int nvmlCounts; + CHECK_NVML_API(nvmlDeviceGetCount(&nvmlCounts)); + + int devi,cudadev; + for (devi=0;devi + + \ No newline at end of file diff --git a/docs/release-process.md b/docs/release-process.md new file mode 100644 index 000000000..ed614c96e --- /dev/null +++ b/docs/release-process.md @@ -0,0 +1,27 @@ +This document documents the regular release process, including image building, chart package building, artifact publishing, changelog writing, etc. + +1. Update Changelog + +Currently, there is no automated way to generate the changelog. 
The changelog file needs to be updated manually. Its directory is `/docs/CHANGELOG`, and a new file needs to be created for each minor version. For example, all changelogs for version 1.2.x are placed in the CHANGELOG-1.2.md file. You can refer to the specific format in CHANGELOG-0.0.0.md.
+
+2. Modify Version
+
+Modify the latest version of the chart by modifying the `charts/hami/Chart.yaml` file. Please update both version and appVersion. Currently, there is a CI workflow that checks whether these two fields are consistent. If they are inconsistent, CI will report an error.
+
+Modify the version in the `charts/hami/values.yaml` file to the latest version.
+When the chart's version is updated, it will automatically trigger CI to release the chart version, automatically build the tag package, update the index file under the gh-page branch, and automatically generate a release and assets.
+
+3. Create a release branch
+After the above changes are merged into the `master` branch, based on the master branch, create a new release branch with the prefix release and only the `x` and `y` version numbers. For example, release-1.1. In version 1.1, all version releases including the z releases will be based on this branch.
+
+4. Generate a New Tag
+
+Based on the release branch, a new tag can be created. Its name must start with `v`. When a new tag is created, it will trigger the building of a new image and upload it to the `ghcr` image repository.
+
+5. Update Release Description
+
+The release description will be automatically generated in the second step, and we can add more release content. For example, link to the changelog file. e.g: See [the CHANGELOG](./CHANGELOG/CHANGELOG-0.0.0.md) for details.
+
+6. Release the z version
+Before releasing the z version, you need to ensure that all changes have been merged into the release branch, including the changelogs for the new z version release. 
All the changelogs for z releases should be put in one changelog file, then based on the latest release branch generate a new tag.
+
diff --git a/example.yaml b/example.yaml
new file mode 100644
index 000000000..a6afefd31
--- /dev/null
+++ b/example.yaml
@@ -0,0 +1,59 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  labels:
+    kubernetes.io/metadata.name: gpu-test-workloads
+    pod-security.kubernetes.io/enforce: privileged
+  name: gpu-test-workloads
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: cuda-sample-vector-add
+  namespace: gpu-test-workloads
+  labels:
+    app: cuda-sample-vector-add
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: cuda-sample-vector-add
+  template:
+    metadata:
+      labels:
+        app: cuda-sample-vector-add
+    spec:
+      containers:
+        - name: cuda-sample-vector-add
+          image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubuntu20.04
+          command:
+            - /bin/bash
+            - '-c'
+            - '--'
+          args:
+            - while true; do /cuda-samples/vectorAdd; done
+          resources:
+            limits:
+              nvidia.com/gpu: 1 # requesting 1 vGPU
+              nvidia.com/gpumem: 3000 # Each vGPU contains 3000m device memory (Optional,Integer)
+          terminationMessagePath: /dev/termination-log
+          terminationMessagePolicy: File
+          imagePullPolicy: IfNotPresent
+      restartPolicy: Always
+      terminationGracePeriodSeconds: 30
+      dnsPolicy: ClusterFirst
+      hostPID: true
+      securityContext: {}
+      schedulerName: default-scheduler
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+      priorityClassName: system-cluster-critical
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxUnavailable: 25%
+      maxSurge: 25%
+  revisionHistoryLimit: 10
+  progressDeadlineSeconds: 600
\ No newline at end of file
diff --git a/examples/hygon/default_use.yaml b/examples/hygon/default_use.yaml
new file mode 100644
index 000000000..ea911fa86
--- /dev/null
+++ b/examples/hygon/default_use.yaml
@@ -0,0 +1,17 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: alexnet-tf-gpu-pod-mem
+  labels:
+    purpose: demo-tf-amdgpu
+spec:
+  containers:
+    - name: alexnet-tf-gpu-container
+      image: pytorch:resnet50
+      workingDir: /root
+      command: ["sleep","infinity"]
+      resources:
+        limits:
+          hygon.com/dcunum: 1 # requesting a GPU
+          hygon.com/dcumem: 2000 # each dcu require 2000 MiB device memory
+          hygon.com/dcucores: 60 # each dcu use 60% of total compute cores
diff --git a/examples/hygon/specify_card_type_not_use.yaml b/examples/hygon/specify_card_type_not_use.yaml
new file mode 100644
index 000000000..7e2628b0a
--- /dev/null
+++ b/examples/hygon/specify_card_type_not_use.yaml
@@ -0,0 +1,19 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: alexnet-tf-gpu-pod-mem
+  annotations:
+    hygon.com/nouse-dcutype: "Z100L" # Specify the card type for this job, use comma to separate, will not launch job on specified card
+    #In this example, we don't want this container to run on Z100L
+    purpose: demo-tf-amdgpu
+spec:
+  containers:
+    - name: alexnet-tf-gpu-container
+      image: pytorch:resnet50
+      workingDir: /root
+      command: ["sleep","infinity"]
+      resources:
+        limits:
+          hygon.com/dcunum: 1 # requesting a GPU
+          hygon.com/dcumem: 2000
+          hygon.com/dcucores: 60
diff --git a/examples/hygon/specify_card_type_to_use.yaml b/examples/hygon/specify_card_type_to_use.yaml
new file mode 100644
index 000000000..b7bd877be
--- /dev/null
+++ b/examples/hygon/specify_card_type_to_use.yaml
@@ -0,0 +1,20 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: alexnet-tf-gpu-pod-mem
+  annotations:
+    hygon.com/use-dcutype: "Z100" # Specify the card type for this job, use comma to separate, will not launch job on non-specified card
+    #In this example, we want to run this job on Z100
+  labels:
+    purpose: demo-tf-amdgpu
+spec:
+  containers:
+    - name: alexnet-tf-gpu-container
+      image: pytorch:resnet50
+      workingDir: /root
+      command: ["sleep","infinity"]
+      resources:
+        limits:
+          hygon.com/dcunum: 1 # requesting a GPU
+          hygon.com/dcumem: 2000
+          hygon.com/dcucores: 60
diff --git a/examples/iluvatar/default_use.yaml
b/examples/iluvatar/default_use.yaml new file mode 100644 index 000000000..89ecfc719 --- /dev/null +++ b/examples/iluvatar/default_use.yaml @@ -0,0 +1,29 @@ +apiVersion: v1 +kind: Pod +metadata: + name: poddemo +spec: + restartPolicy: Never + containers: + - name: poddemo + image: harbor.4pd.io/vgpu/corex_transformers@sha256:36a01ec452e6ee63c7aa08bfa1fa16d469ad19cc1e6000cf120ada83e4ceec1e + command: + - bash + args: + - -c + - | + set -ex + echo "export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH">> /root/.bashrc + cp -f /usr/local/iluvatar/lib64/libcuda.* /usr/local/corex/lib64/ + cp -f /usr/local/iluvatar/lib64/libixml.* /usr/local/corex/lib64/ + source /root/.bashrc + sleep 360000 + resources: + requests: + iluvatar.ai/vgpu: 1 + iluvatar.ai/vcuda-core: 50 + iluvatar.ai/vcuda-memory: 64 + limits: + iluvatar.ai/vgpu: 1 + iluvatar.ai/vcuda-core: 50 + iluvatar.ai/vcuda-memory: 64 \ No newline at end of file diff --git a/examples/iluvatar/multi-containers.yaml b/examples/iluvatar/multi-containers.yaml new file mode 100644 index 000000000..49d0c6e00 --- /dev/null +++ b/examples/iluvatar/multi-containers.yaml @@ -0,0 +1,51 @@ +apiVersion: v1 +kind: Pod +metadata: + name: poddemo +spec: + restartPolicy: Never + containers: + - name: poddemo + image: harbor.4pd.io/vgpu/corex_transformers@sha256:36a01ec452e6ee63c7aa08bfa1fa16d469ad19cc1e6000cf120ada83e4ceec1e + command: + - bash + args: + - -c + - | + set -ex + echo "export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH">> /root/.bashrc + cp -f /usr/local/iluvatar/lib64/libcuda.* /usr/local/corex/lib64/ + cp -f /usr/local/iluvatar/lib64/libixml.* /usr/local/corex/lib64/ + source /root/.bashrc + sleep 360000 + resources: + requests: + iluvatar.ai/vgpu: 1 + iluvatar.ai/vcuda-core: 50 + iluvatar.ai/vcuda-memory: 64 + limits: + iluvatar.ai/vgpu: 1 + iluvatar.ai/vcuda-core: 50 + iluvatar.ai/vcuda-memory: 64 + - name: poddemo1 + image: 
harbor.4pd.io/vgpu/corex_transformers@sha256:36a01ec452e6ee63c7aa08bfa1fa16d469ad19cc1e6000cf120ada83e4ceec1e + command: + - bash + args: + - -c + - | + set -ex + echo "export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH">> /root/.bashrc + cp -f /usr/local/iluvatar/lib64/libcuda.* /usr/local/corex/lib64/ + cp -f /usr/local/iluvatar/lib64/libixml.* /usr/local/corex/lib64/ + source /root/.bashrc + sleep 360000 + resources: + requests: + iluvatar.ai/vgpu: 1 + iluvatar.ai/vcuda-core: 50 + iluvatar.ai/vcuda-memory: 64 + limits: + iluvatar.ai/vgpu: 1 + iluvatar.ai/vcuda-core: 50 + iluvatar.ai/vcuda-memory: 64 \ No newline at end of file diff --git a/examples/iluvatar/multi-devices.yaml b/examples/iluvatar/multi-devices.yaml new file mode 100644 index 000000000..2287835e0 --- /dev/null +++ b/examples/iluvatar/multi-devices.yaml @@ -0,0 +1,25 @@ +apiVersion: v1 +kind: Pod +metadata: + name: poddemo +spec: + restartPolicy: Never + containers: + - name: poddemo + image: harbor.4pd.io/vgpu/corex_transformers@sha256:36a01ec452e6ee63c7aa08bfa1fa16d469ad19cc1e6000cf120ada83e4ceec1e + command: + - bash + args: + - -c + - | + set -ex + echo "export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH">> /root/.bashrc + cp -f /usr/local/iluvatar/lib64/libcuda.* /usr/local/corex/lib64/ + cp -f /usr/local/iluvatar/lib64/libixml.* /usr/local/corex/lib64/ + source /root/.bashrc + sleep 360000 + resources: + requests: + iluvatar.ai/vgpu: 2 + limits: + iluvatar.ai/vgpu: 2 \ No newline at end of file diff --git a/examples/mlu/allocate_whole.yaml b/examples/mlu/allocate_whole.yaml new file mode 100644 index 000000000..40936cbb5 --- /dev/null +++ b/examples/mlu/allocate_whole.yaml @@ -0,0 +1,24 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: binpack-1 + labels: + app: binpack-1 +spec: + replicas: 1 + selector: + matchLabels: + app: binpack-1 + template: + metadata: + labels: + app: binpack-1 + spec: + containers: + - name: c-1 + image: ubuntu:18.04 + command: 
["sleep"] + args: ["100000"] + resources: + limits: + cambricon.com/vmlu: "1" #allocates a whole MLU \ No newline at end of file diff --git a/examples/mlu/default_use.yaml b/examples/mlu/default_use.yaml new file mode 100644 index 000000000..8501c5bdb --- /dev/null +++ b/examples/mlu/default_use.yaml @@ -0,0 +1,26 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: binpack-1 + labels: + app: binpack-1 +spec: + replicas: 1 + selector: + matchLabels: + app: binpack-1 + template: + metadata: + labels: + app: binpack-1 + spec: + containers: + - name: c-1 + image: ubuntu:18.04 + command: ["sleep"] + args: ["100000"] + resources: + limits: + cambricon.com/vmlu: "1" + cambricon.com/mlu370.smlu.vmemory: "20" + cambricon.com/mlu370.smlu.vcore: "10" \ No newline at end of file diff --git a/examples/nvidia/default_use.yaml b/examples/nvidia/default_use.yaml new file mode 100644 index 000000000..999cb9d43 --- /dev/null +++ b/examples/nvidia/default_use.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod +spec: + containers: + - name: ubuntu-container + image: ubuntu:18.04 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + nvidia.com/gpu: 2 # requesting 2 vGPUs + nvidia.com/gpumem: 3000 # Each vGPU contains 3000m device memory (Optional,Integer) + nvidia.com/gpucores: 30 # Each vGPU uses 30% of the entire GPU (Optional,Integer) diff --git a/examples/nvidia/default_use_legacy.yaml b/examples/nvidia/default_use_legacy.yaml new file mode 100644 index 000000000..0796a0101 --- /dev/null +++ b/examples/nvidia/default_use_legacy.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod +spec: + containers: + - name: ubuntu-container + image: ubuntu:18.04 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + nvidia.com/gpu: 2 # requesting 2 vGPUs diff --git a/examples/nvidia/example.yaml b/examples/nvidia/example.yaml new file mode 100644 index 000000000..0710269b4 --- /dev/null +++ 
b/examples/nvidia/example.yaml
@@ -0,0 +1,30 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-pod
+spec:
+  containers:
+    - name: ubuntu-container
+      image: ubuntu:18.04
+      command: ["bash", "-c", "sleep 86400"]
+      resources:
+        limits:
+          nvidia.com/gpu: 2 # requesting 2 vGPUs
+          #nvidia.com/gpumem: 3000 # Each vGPU contains 3000M device memory
+          nvidia.com/gpumem-percentage: 50 #Each vGPU contains 50% device memory of that GPU. Can not be used with nvidia.com/gpumem
+          #nvidia.com/gpucores: 90 # Utilization limit of this vGPU is set to 90% of total GPU utilization
+          #nvidia.com/priority: 0 # We only have two priority class, 0(high) and 1(low), default: 1
+          #The utilization of high priority task won't be limited to resourceCores unless sharing GPU node with other high priority tasks.
+          #The utilization of low priority task won't be limited to resourceCores if no other tasks sharing its GPU.
+    - name: ubuntu-container0
+      image: ubuntu:18.04
+      command: ["bash", "-c", "sleep 86400"]
+    - name: ubuntu-container1
+      image: ubuntu:18.04
+      command: ["bash", "-c", "sleep 86400"]
+      resources:
+        limits:
+          nvidia.com/gpu: 2 # requesting 2 vGPUs
+          nvidia.com/gpumem: 2000
+          #nvidia.com/gpucores: 90
+
diff --git a/examples/nvidia/mig_example.yaml b/examples/nvidia/mig_example.yaml
new file mode 100644
index 000000000..82fb2e386
--- /dev/null
+++ b/examples/nvidia/mig_example.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-pod
+spec:
+  containers:
+    - name: ubuntu-container
+      image: ubuntu:18.04
+      command: ["bash", "-c", "sleep 86400"]
+      resources:
+        limits:
+          nvidia.com/mig-3g.20gb: 1 # requesting 1 vGPU
diff --git a/examples/nvidia/specify_card_type_not_use.yaml b/examples/nvidia/specify_card_type_not_use.yaml
new file mode 100644
index 000000000..beb4e63af
--- /dev/null
+++ b/examples/nvidia/specify_card_type_not_use.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-pod
+  annotations:
+    nvidia.com/nouse-gputype: "1080,2080" #
Specify the blacklist card type for this job, use comma to separate, will not launch job on specified card
+    # In this job, we don't want our job to run on 1080(include 1080Ti) or 2080(include 2080Ti) type of card.
+spec:
+  containers:
+    - name: ubuntu-container
+      image: ubuntu:18.04
+      command: ["bash", "-c", "sleep 86400"]
+      resources:
+        limits:
+          nvidia.com/gpu: 2 # requesting 2 vGPUs
diff --git a/examples/nvidia/specify_card_type_to_use.yaml b/examples/nvidia/specify_card_type_to_use.yaml
new file mode 100644
index 000000000..df45e6e92
--- /dev/null
+++ b/examples/nvidia/specify_card_type_to_use.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-pod
+  annotations:
+    nvidia.com/use-gputype: "A100,V100" # Specify the card type for this job, use comma to separate, will launch job on specified card
+    #In this example, we want to run this job on A100 or V100
+spec:
+  containers:
+    - name: ubuntu-container
+      image: ubuntu:18.04
+      command: ["bash", "-c", "sleep 86400"]
+      resources:
+        limits:
+          nvidia.com/gpu: 2 # requesting 2 vGPUs
diff --git a/examples/nvidia/specify_scheduling_policy.yaml b/examples/nvidia/specify_scheduling_policy.yaml
new file mode 100644
index 000000000..31aed818d
--- /dev/null
+++ b/examples/nvidia/specify_scheduling_policy.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-pod
+  annotations:
+    hami.io/node-scheduler-policy: "spread" # when this parameter is set to spread, the scheduler will try to allocate the pod to different GPU nodes for execution.
+    hami.io/gpu-scheduler-policy: "binpack" # when this parameter is set to binpack, the scheduler will try to allocate the pod to the same GPU card for execution.
+spec:
+  containers:
+    - name: ubuntu-container
+      image: ubuntu:18.04
+      command: ["bash", "-c", "sleep 86400"]
+      resources:
+        limits:
+          nvidia.com/gpu: 1 # requesting 1 vGPU
diff --git a/examples/nvidia/specify_uuid_not_use.yaml b/examples/nvidia/specify_uuid_not_use.yaml
new file mode 100644
index 000000000..3255d7eb0
--- /dev/null
+++ b/examples/nvidia/specify_uuid_not_use.yaml
@@ -0,0 +1,14 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-pod
+  annotations:
+    nvidia.com/nouse-gpuuuid: "GPU-123456"
+spec:
+  containers:
+    - name: ubuntu-container
+      image: ubuntu:18.04
+      command: ["bash", "-c", "sleep 86400"]
+      resources:
+        limits:
+          nvidia.com/gpu: 2 # requesting 2 vGPUs
\ No newline at end of file
diff --git a/examples/nvidia/specify_uuid_to_use.yaml b/examples/nvidia/specify_uuid_to_use.yaml
new file mode 100644
index 000000000..c696958d6
--- /dev/null
+++ b/examples/nvidia/specify_uuid_to_use.yaml
@@ -0,0 +1,14 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-pod
+  annotations:
+    nvidia.com/use-gpuuuid: "GPU-123456"
+spec:
+  containers:
+    - name: ubuntu-container
+      image: ubuntu:18.04
+      command: ["bash", "-c", "sleep 86400"]
+      resources:
+        limits:
+          nvidia.com/gpu: 2 # requesting 2 vGPUs
diff --git a/examples/nvidia/use_as_normal.yaml b/examples/nvidia/use_as_normal.yaml
new file mode 100644
index 000000000..015a857bc
--- /dev/null
+++ b/examples/nvidia/use_as_normal.yaml
@@ -0,0 +1,26 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-pod1
+spec:
+  containers:
+    - name: ubuntu-container
+      image: ubuntu:18.04
+      command: ["bash", "-c", "sleep 86400"]
+      resources:
+        limits:
+          nvidia.com/gpu: 2 # requesting 2 vGPUs
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-pod2
+spec:
+  containers:
+    - name: ubuntu-container
+      image: ubuntu:18.04
+      command: ["bash", "-c", "sleep 86400"]
+      resources:
+        limits:
+          nvidia.com/gpu: 2 # requesting 2 vGPUs
+# gpu-pod1 and gpu-pod2 will NOT share the same GPU
\ No newline at end of file
diff --git
a/examples/nvidia/use_exclusive_card.yaml b/examples/nvidia/use_exclusive_card.yaml new file mode 100644 index 000000000..d3abf5866 --- /dev/null +++ b/examples/nvidia/use_exclusive_card.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod1 +spec: + containers: + - name: ubuntu-container + image: ubuntu:18.04 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + nvidia.com/gpu: 2 # requesting 2 vGPUs + nvidia.com/gpumem-percentage: 100 # Each vGPU contains 100% of the entire GPU device memory (Optional,Integer) + nvidia.com/gpucores: 100 # Each vGPU uses 100% of the entire GPU cores(Optional,Integer) diff --git a/examples/nvidia/use_memory_fraction.yaml b/examples/nvidia/use_memory_fraction.yaml new file mode 100644 index 000000000..c507af772 --- /dev/null +++ b/examples/nvidia/use_memory_fraction.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Pod +metadata: + name: gpu-pod +spec: + containers: + - name: ubuntu-container + image: ubuntu:18.04 + command: ["bash", "-c", "sleep 86400"] + resources: + limits: + nvidia.com/gpu: 2 # requesting 2 vGPUs + nvidia.com/gpumem-percentage: 50 # Each vGPU contains 50% device memory of that GPU (Optional,Integer) + nvidia.com/gpucores: 30 # Each vGPU uses 30% of the entire GPU (Optional,Integer) diff --git a/go.mod b/go.mod index 773a12346..d238795cb 100644 --- a/go.mod +++ b/go.mod @@ -1,52 +1,118 @@ -module 4pd.io/k8s-vgpu +module github.com/Project-HAMi/HAMi -go 1.16 +go 1.22.2 require ( - 4pd.io/k8s-vgpu/pkg/api v0.0.0 - github.com/NVIDIA/go-gpuallocator v0.2.1 - github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20210624153948-4902944b3b52 - github.com/fsnotify/fsnotify v1.4.9 + github.com/NVIDIA/go-gpuallocator v0.3.2 + github.com/NVIDIA/go-nvlib v0.2.0 + github.com/NVIDIA/go-nvml v0.12.0-3 + github.com/NVIDIA/k8s-device-plugin v0.15.0 + github.com/NVIDIA/nvidia-container-toolkit v1.15.0 + github.com/container-orchestrated-devices/container-device-interface v0.5.4-0.20230111111500-5b3b5d81179a + 
github.com/fsnotify/fsnotify v1.7.0 + github.com/google/uuid v1.6.0 github.com/julienschmidt/httprouter v1.3.0 - github.com/spf13/cobra v1.1.3 - github.com/spf13/jwalterweatherman v1.1.0 // indirect - github.com/spf13/viper v1.7.0 - golang.org/x/net v0.0.0-20210428140749-89ef3d95e781 - google.golang.org/grpc v1.39.0 - gotest.tools/v3 v3.0.3 - k8s.io/api v0.21.2 - k8s.io/apimachinery v0.21.2 - k8s.io/client-go v0.21.2 - k8s.io/klog/v2 v2.9.0 - k8s.io/kube-scheduler v0.21.2 - k8s.io/kubelet v0.21.2 - sigs.k8s.io/controller-runtime v0.9.3 + github.com/opencontainers/runtime-spec v1.2.0 + github.com/prometheus/client_golang v1.18.0 + github.com/sirupsen/logrus v1.9.3 + github.com/spf13/cobra v1.8.1 + github.com/stretchr/testify v1.9.0 + github.com/urfave/cli/v2 v2.27.1 + golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 + golang.org/x/net v0.26.0 + golang.org/x/term v0.21.0 + golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d + google.golang.org/grpc v1.63.2 + google.golang.org/protobuf v1.33.0 + gotest.tools/v3 v3.5.1 + k8s.io/api v0.29.3 + k8s.io/apimachinery v0.29.3 + k8s.io/client-go v0.29.3 + k8s.io/klog/v2 v2.120.1 + k8s.io/kube-scheduler v0.28.3 + k8s.io/kubelet v0.29.3 + sigs.k8s.io/controller-runtime v0.16.3 + tags.cncf.io/container-device-interface v0.7.1 +) + +require ( + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.2.0 // indirect + github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/emicklei/go-restful/v3 v3.11.3 // indirect + github.com/evanphx/json-patch v5.9.0+incompatible // indirect + github.com/go-logr/logr v1.4.1 // indirect + github.com/go-openapi/jsonpointer v0.20.2 // indirect + github.com/go-openapi/jsonreference v0.20.4 // indirect + github.com/go-openapi/swag v0.22.9 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect + github.com/golang/protobuf v1.5.4 // 
indirect + github.com/google/gnostic-models v0.6.8 // indirect + github.com/google/go-cmp v0.6.0 // indirect + github.com/google/gofuzz v1.2.0 // indirect + github.com/imdario/mergo v0.3.16 // indirect + github.com/inconshreveable/mousetrap v1.1.0 // indirect + github.com/josharian/intern v1.0.0 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/mailru/easyjson v0.7.7 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/opencontainers/runc v1.1.12 // indirect + github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 // indirect + github.com/pkg/errors v0.9.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/prometheus/client_model v0.6.0 // indirect + github.com/prometheus/common v0.48.0 // indirect + github.com/prometheus/procfs v0.13.0 // indirect + github.com/russross/blackfriday/v2 v2.1.0 // indirect + github.com/spf13/pflag v1.0.5 // indirect + github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 // indirect + github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect + golang.org/x/mod v0.17.0 // indirect + golang.org/x/oauth2 v0.17.0 // indirect + golang.org/x/sys v0.21.0 // indirect + golang.org/x/text v0.16.0 // indirect + golang.org/x/time v0.5.0 // indirect + gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect + google.golang.org/appengine v1.6.8 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/yaml.v2 v2.4.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/kube-openapi v0.0.0-20240227032403-f107216b40e2 // indirect + k8s.io/utils v0.0.0-20240102154912-e7106e64919e // indirect + sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect + 
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect + sigs.k8s.io/yaml v1.4.0 // indirect + tags.cncf.io/container-device-interface/specs-go v0.7.0 // indirect ) replace ( - 4pd.io/k8s-vgpu/pkg/api => ./pkg/api - k8s.io/api => k8s.io/api v0.21.2 - k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.21.2 - k8s.io/apimachinery => k8s.io/apimachinery v0.21.2 - k8s.io/apiserver => k8s.io/apiserver v0.21.2 - k8s.io/cli-runtime => k8s.io/cli-runtime v0.21.2 - k8s.io/client-go => k8s.io/client-go v0.21.2 - k8s.io/cloud-provider => k8s.io/cloud-provider v0.21.2 - k8s.io/cluster-bootstrap => k8s.io/cluster-bootstrap v0.21.2 - k8s.io/code-generator => k8s.io/code-generator v0.21.2 - k8s.io/component-base => k8s.io/component-base v0.21.2 - k8s.io/component-helpers => k8s.io/component-helpers v0.21.2 - k8s.io/controller-manager => k8s.io/controller-manager v0.21.2 - k8s.io/cri-api => k8s.io/cri-api v0.21.2 - k8s.io/csi-translation-lib => k8s.io/csi-translation-lib v0.21.2 - k8s.io/kube-aggregator => k8s.io/kube-aggregator v0.21.2 - k8s.io/kube-controller-manager => k8s.io/kube-controller-manager v0.21.2 - k8s.io/kube-proxy => k8s.io/kube-proxy v0.21.2 - k8s.io/kube-scheduler => k8s.io/kube-scheduler v0.21.2 - k8s.io/kubectl => k8s.io/kubectl v0.21.2 - k8s.io/kubelet => k8s.io/kubelet v0.21.2 - k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.21.2 - k8s.io/metrics => k8s.io/metrics v0.21.2 - k8s.io/mount-utils => k8s.io/mount-utils v0.21.2 - k8s.io/sample-apiserver => k8s.io/sample-apiserver v0.21.2 + github.com/Project-HAMi/HAMi/pkg/api => ./pkg/api + github.com/Project-HAMi/HAMi/pkg/device-plugin => ./pkg/device-plugin + k8s.io/api => k8s.io/api v0.28.3 + k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.28.3 + k8s.io/apimachinery => k8s.io/apimachinery v0.28.3 + k8s.io/apiserver => k8s.io/apiserver v0.28.3 + k8s.io/cli-runtime => k8s.io/cli-runtime v0.28.3 + k8s.io/client-go => k8s.io/client-go v0.28.3 + 
k8s.io/cloud-provider => k8s.io/cloud-provider v0.28.3 + k8s.io/cluster-bootstrap => k8s.io/cluster-bootstrap v0.28.3 + k8s.io/code-generator => k8s.io/code-generator v0.28.3 + k8s.io/component-base => k8s.io/component-base v0.28.3 + k8s.io/component-helpers => k8s.io/component-helpers v0.28.3 + k8s.io/cri-api => k8s.io/cri-api v0.28.3 + k8s.io/csi-translation-lib => k8s.io/csi-translation-lib v0.28.3 + k8s.io/kube-aggregator => k8s.io/kube-aggregator v0.28.3 + k8s.io/kube-controller-manager => k8s.io/kube-controller-manager v0.28.3 + k8s.io/kube-proxy => k8s.io/kube-proxy v0.28.3 + k8s.io/kube-scheduler => k8s.io/kube-scheduler v0.28.3 + k8s.io/kubectl => k8s.io/kubectl v0.28.3 + k8s.io/kubelet => k8s.io/kubelet v0.28.3 + k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.28.3 + k8s.io/metrics => k8s.io/metrics v0.28.3 + k8s.io/sample-apiserver => k8s.io/sample-apiserver v0.28.3 ) diff --git a/go.sum b/go.sum index 80833e95a..f2048535c 100644 --- a/go.sum +++ b/go.sum @@ -1,785 +1,274 @@ -cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= -cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= -cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= -cloud.google.com/go v0.44.1/go.mod h1:iSa0KzasP4Uvy3f1mN/7PiObzGgflwredwwASm/v6AU= -cloud.google.com/go v0.44.2/go.mod h1:60680Gw3Yr4ikxnPRS/oxxkBccT6SA1yMk63TGekxKY= -cloud.google.com/go v0.45.1/go.mod h1:RpBamKRgapWJb87xiFSdk4g1CME7QZg3uwTez+TSTjc= -cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg0= -cloud.google.com/go v0.50.0/go.mod h1:r9sluTvynVuxRIOHXQEHMFffphuXHOMZMycpNR5e6To= -cloud.google.com/go v0.52.0/go.mod h1:pXajvRH/6o3+F9jDHZWQ5PbGhn+o8w9qiu/CffaVdO4= -cloud.google.com/go v0.53.0/go.mod h1:fp/UouUEsRkN6ryDKNW/Upv/JBKnv6WDthjR6+vze6M= -cloud.google.com/go v0.54.0/go.mod h1:1rq2OEkV3YMf6n/9ZvGWI3GWw0VoqH/1x2nd8Is/bPc= -cloud.google.com/go/bigquery v1.0.1/go.mod 
h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= -cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= -cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc= -cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= -cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= -cloud.google.com/go/firestore v1.1.0/go.mod h1:ulACoGHTpvq5r8rxGJ4ddJZBZqakUQqClKRT5SZwBmk= -cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= -cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= -cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA= -cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= -cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= -cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk= -dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= -github.com/Azure/go-ansiterm v0.0.0-20170929234023-d6e3b3328b78/go.mod h1:LmzpDX56iTiv29bbRTIsUNlaFfuhWRQBWjQdVyAevI8= -github.com/Azure/go-autorest v14.2.0+incompatible/go.mod h1:r+4oMnoxhatjLLJ6zxSWATqVooLgysK6ZNox3g/xq24= -github.com/Azure/go-autorest/autorest v0.11.12/go.mod h1:eipySxLmqSyC5s5k1CLupqet0PSENBEDP93LQ9a8QYw= -github.com/Azure/go-autorest/autorest/adal v0.9.5/go.mod h1:B7KF7jKIeC9Mct5spmyCB/A8CG/sEz1vwIRGv/bbw7A= -github.com/Azure/go-autorest/autorest/date v0.3.0/go.mod h1:BI0uouVdmngYNUzGWeSYnokU+TrmwEsOqdt8Y6sso74= -github.com/Azure/go-autorest/autorest/mocks v0.4.1/go.mod h1:LTp+uSrOhSkaKrUy935gNZuuIPPVsHlr9DSOxSayd+k= -github.com/Azure/go-autorest/logger v0.2.0/go.mod h1:T9E3cAhj2VqvPOtCYAvby9aBXkZmbF5NWuPV8+WeEW8= -github.com/Azure/go-autorest/tracing v0.6.0/go.mod 
h1:+vhtPC754Xsa23ID7GlGsrdKBpUA79WCAKPPZVC2DeU= -github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= -github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= -github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= -github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y= -github.com/NVIDIA/go-gpuallocator v0.2.1 h1:8EodCWEPjDl5Q+2ONB2EdwFJpcGnmACmlgLB8RkP/W0= -github.com/NVIDIA/go-gpuallocator v0.2.1/go.mod h1:+2ke2/CGym+5xSxHfkIFi9Oof1Kj0KMSXgQDpk39ikk= -github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20210624153948-4902944b3b52 h1:GaJrAt0sWaBLk4hB6juBCYlhi9Otng/EpgD4QLqp/T4= -github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20210624153948-4902944b3b52/go.mod h1:oKPJa5eOTkWvlT4/Y4D8Nds44Fzmww5HUK+xwO+DwTA= -github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm v0.0.0-20210325210537-29b4f1784f18/go.mod h1:8qXwltEzU3idjUcVpMOv3FNgxxbDeXZPGMLyc/khWiY= -github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= -github.com/NYTimes/gziphandler v1.1.1/go.mod h1:n/CVRwUEOgIxrgPvAQhUUr9oeUtvrhMomdKFjzJNB0c= -github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= -github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= -github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= -github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= -github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= -github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= -github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod 
h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= -github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho= -github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= -github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e/go.mod h1:3U/XgcO3hCbHZ8TKRvWD2dDTCfh9M9ya+I9JpbB7O8o= -github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmVTwzkszR9V5SSuryQ31EELlFMUz1kKyl939pY= -github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8= -github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= -github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= -github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= +github.com/NVIDIA/go-gpuallocator v0.3.2 h1:gXaGgFKrtsBOvbZTZIWQ81yr7voHm5keRCXb3VNjMMU= +github.com/NVIDIA/go-gpuallocator v0.3.2/go.mod h1:OuqBvWRrs9+A783a753fK9YYP8P1BTf+T4Map+XfTUs= +github.com/NVIDIA/go-nvlib v0.2.0 h1:roq+SDstbP1fcy2XVH7wB2Gz2/Ud7Q+NGQYOcVITVrA= +github.com/NVIDIA/go-nvlib v0.2.0/go.mod h1:kFuLNTyD1tF6FbRFlk+/EdUW5BrkE+v1Y3A3/9zKSjA= +github.com/NVIDIA/go-nvml v0.12.0-3 h1:QwfjYxEqIQVRhl8327g2Y3ZvKResPydpGSKtCIIK9jE= +github.com/NVIDIA/go-nvml v0.12.0-3/go.mod h1:SOufGc5Wql+cxrIZ8RyJwVKDYxfbs4WPkHXqadcbfvA= +github.com/NVIDIA/k8s-device-plugin v0.15.0 h1:QKfAo6Xpl5M4Y9hltlYrzHjwGR+vfeAuiiNNyFN4DoE= +github.com/NVIDIA/k8s-device-plugin v0.15.0/go.mod h1:s6DHR9QG5+xAbWG7NniWTnrZI7wUojl1/hxeZClXm/U= +github.com/NVIDIA/nvidia-container-toolkit v1.15.0 h1:YmYZUKJzhz/lJSVH6k1mk5IUCHpt8HwRtwMrtBoCzhQ= +github.com/NVIDIA/nvidia-container-toolkit v1.15.0/go.mod h1:SUwxfwi+dl1LtVlpAnJEolxuZfCtAVmOKRGWhJYsiJI= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod 
h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= -github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= -github.com/bketelsen/crypt v0.0.3-0.20200106085610-5cbc8cc4026c/go.mod h1:MKsuJmJgSg28kpZDP6UIiPt0e0Oz0kqKNGyRaWEPv84= -github.com/blang/semver v3.5.1+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk= -github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= -github.com/cespare/xxhash v1.1.0 h1:a6HrQnmkObjyL+Gs60czilIUGqrzKutQD6XZog3p+ko= -github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= -github.com/cespare/xxhash/v2 v2.1.1 h1:6MnRN8NT7+YBpUIWxHtefFZOKTAPgGjpQSxqLNn0+qY= -github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= -github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= -github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= -github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= -github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= -github.com/cncf/udpa/go v0.0.0-20201120205902-5459f2c99403/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= -github.com/cncf/xds/go v0.0.0-20210312221358-fbca930ec8ed/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= -github.com/cockroachdb/datadriven v0.0.0-20190809214429-80d97fb3cbaa/go.mod h1:zn76sxSg3SzpJ0PPJaLDCu+Bu0Lg3sKTORVIj19EIF8= -github.com/coreos/bbolt v1.3.2/go.mod h1:iRUV2dpdMOn7Bo10OQBFzIJO9kkE559Wcmn+qkEiiKk= -github.com/coreos/etcd v3.3.13+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= -github.com/coreos/go-oidc v2.1.0+incompatible/go.mod h1:CgnwVTmzoESiwO9qyAFEMiHoZ1nMCKZlZ9V6mm3/LKc= 
-github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= -github.com/coreos/go-semver v0.3.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= -github.com/coreos/go-systemd v0.0.0-20180511133405-39ca1b05acc7/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= -github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= -github.com/coreos/pkg v0.0.0-20160727233714-3ac0863d7acf/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA= -github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA= -github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= -github.com/creack/pty v1.1.7/go.mod h1:lj5s0c3V2DBrqTV7llrYr5NG6My20zk30Fl46Y7DoTY= -github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= -github.com/creack/pty v1.1.11/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= +github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= +github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= +github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/container-orchestrated-devices/container-device-interface v0.5.4-0.20230111111500-5b3b5d81179a h1:sP3PcgyIkRlHqfF3Jfpe/7G8kf/qpzG4C8r94y9hLbE= +github.com/container-orchestrated-devices/container-device-interface v0.5.4-0.20230111111500-5b3b5d81179a/go.mod h1:xMRa4fJgXzSDFUCURSimOUgoSc+odohvO3uXT9xjqH0= +github.com/cpuguy83/go-md2man/v2 v2.0.4 h1:wfIWP927BUkWJb2NmU/kNDYIBTh/ziUX91+lVfRxZq4= +github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 
h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= -github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no= -github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE= -github.com/dustin/go-humanize v0.0.0-20171111073723-bb3d318650d4/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= -github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= -github.com/elazarl/goproxy v0.0.0-20180725130230-947c36da3153/go.mod h1:/Zj4wYkgs4iZTTu3o/KG3Itv/qCCa8VVMlb3i9OVuzc= -github.com/emicklei/go-restful v0.0.0-20170410110728-ff4f55a20633/go.mod h1:otzb+WCGbkyDHkqmQmT5YD2WR4BBwUdeQoFo8l/7tVs= -github.com/emicklei/go-restful v2.9.5+incompatible/go.mod h1:otzb+WCGbkyDHkqmQmT5YD2WR4BBwUdeQoFo8l/7tVs= -github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= -github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= -github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= -github.com/envoyproxy/go-control-plane v0.9.9-0.20201210154907-fd9021fe5dad/go.mod h1:cXg6YxExXjJnVBQHBLXeUAgxn2UodCpnH306RInaBQk= -github.com/envoyproxy/go-control-plane v0.9.9-0.20210512163311-63b5d3c536b0/go.mod h1:hliV/p42l8fGbc6Y9bQ70uLwIvmJyVE5k4iMKlh8wCQ= -github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= -github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ= -github.com/evanphx/json-patch v4.9.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= -github.com/evanphx/json-patch v4.11.0+incompatible 
h1:glyUF9yIYtMHzn8xaKw5rMhdWcwsYV8dZHIq5567/xs= -github.com/evanphx/json-patch v4.11.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= -github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= -github.com/form3tech-oss/jwt-go v3.2.2+incompatible/go.mod h1:pbq4aXjuKjdthFRnoDwaVPLA+WlJuPGy+QneDUgJi2k= -github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= -github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4= -github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= -github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= -github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= -github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= -github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= -github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= -github.com/go-kit/kit v0.9.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= -github.com/go-kit/log v0.1.0/go.mod h1:zbhenjAZHb184qTLMA9ZjW7ThYL0H2mk7Q6pNt4vbaY= -github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE= -github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= -github.com/go-logfmt/logfmt v0.5.0/go.mod h1:wCYkCAKZfumFQihp8CzCvQ3paCTfi41vtzG1KdI/P7A= -github.com/go-logr/logr v0.1.0/go.mod h1:ixOQHD9gLJUVQQ2ZOR7zLEifBX6tGkNJF4QyIY7sIas= -github.com/go-logr/logr v0.2.0/go.mod h1:z6/tIYblkpsD+a4lm/fGIIU9mZ+XfAiaFtq7xTgseGU= -github.com/go-logr/logr v0.4.0 h1:K7/B1jt6fIBQVd4Owv2MqGQClcgf0R266+7C/QjRcLc= -github.com/go-logr/logr v0.4.0/go.mod h1:z6/tIYblkpsD+a4lm/fGIIU9mZ+XfAiaFtq7xTgseGU= -github.com/go-logr/zapr v0.4.0 
h1:uc1uML3hRYL9/ZZPdgHS/n8Nzo+eaYL/Efxkkamf7OM= -github.com/go-logr/zapr v0.4.0/go.mod h1:tabnROwaDl0UNxkVeFRbY8bwB37GwRv0P8lg6aAiEnk= -github.com/go-openapi/jsonpointer v0.19.2/go.mod h1:3akKfEdA7DF1sugOqz1dVQHBcuDBPKZGEoHC/NkiQRg= -github.com/go-openapi/jsonpointer v0.19.3/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= -github.com/go-openapi/jsonreference v0.19.2/go.mod h1:jMjeRr2HHw6nAVajTXJ4eiUwohSTlpa0o73RUL1owJc= -github.com/go-openapi/jsonreference v0.19.3/go.mod h1:rjx6GuL8TTa9VaixXglHmQmIL98+wF9xc8zWvFonSJ8= -github.com/go-openapi/spec v0.19.3/go.mod h1:FpwSN1ksY1eteniUU7X0N/BgJ7a4WvBFVA8Lj9mJglo= -github.com/go-openapi/spec v0.19.5/go.mod h1:Hm2Jr4jv8G1ciIAo+frC/Ft+rR2kQDh8JHKHb3gWUSk= -github.com/go-openapi/swag v0.19.2/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh66Z9tfKk= -github.com/go-openapi/swag v0.19.5/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh66Z9tfKk= -github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= -github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE= -github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= -github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= +github.com/emicklei/go-restful/v3 v3.11.3 h1:yagOQz/38xJmcNeZJtrUcKjkHRltIaIFXKWeG1SkWGE= +github.com/emicklei/go-restful/v3 v3.11.3/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/evanphx/json-patch v5.9.0+incompatible h1:fBXyNpNMuTTDdquAq/uisOr2lShz4oaXpDTX2bLe7ls= +github.com/evanphx/json-patch v5.9.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= +github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= +github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= +github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= +github.com/go-logr/logr v1.4.1/go.mod 
h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/zapr v1.2.4 h1:QHVo+6stLbfJmYGkQ7uGHUCu5hnAFAj6mDe6Ea0SeOo= +github.com/go-logr/zapr v1.2.4/go.mod h1:FyHWQIzQORZ0QVE1BtVHv3cKtNLuXsbNLtpuhNapBOA= +github.com/go-openapi/jsonpointer v0.20.2 h1:mQc3nmndL8ZBzStEo3JYF8wzmeWffDH4VbXz58sAx6Q= +github.com/go-openapi/jsonpointer v0.20.2/go.mod h1:bHen+N0u1KEO3YlmqOjTT9Adn1RfD91Ar825/PuiRVs= +github.com/go-openapi/jsonreference v0.20.4 h1:bKlDxQxQJgwpUSgOENiMPzCTBVuc7vTdXSSgNeAhojU= +github.com/go-openapi/jsonreference v0.20.4/go.mod h1:5pZJyJP2MnYCpoeoMAql78cCHauHj0V9Lhc506VOpw4= +github.com/go-openapi/swag v0.22.9 h1:XX2DssF+mQKM2DHsbgZK74y/zj4mo9I99+89xUmuZCE= +github.com/go-openapi/swag v0.22.9/go.mod h1:3/OXnFfnMAwBD099SwYRk7GD3xOrr1iL7d/XNLXVVwE= +github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= +github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= -github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= -github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= -github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= -github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= -github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= -github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY= -github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod 
h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= -github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= -github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= -github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y= -github.com/golang/mock v1.4.0/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= -github.com/golang/mock v1.4.1/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= -github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= -github.com/golang/protobuf v1.3.4/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= -github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= -github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= -github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= -github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= -github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= -github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= -github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= -github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/protobuf 
v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= -github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw= github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= -github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= -github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= -github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= -github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= -github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= -github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= +github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/gofuzz v1.1.0 h1:Hsa8mG0dQ46ij8Sl2AYJDUv1oA9/d6Vk+3LG99Oe02g= -github.com/google/gofuzz v1.1.0/go.mod 
h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= -github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= -github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= -github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= -github.com/google/pprof v0.0.0-20200212024743-f11f1df84d12/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= -github.com/google/pprof v0.0.0-20200229191704-1ebb73c60ed3/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= -github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= -github.com/google/uuid v1.0.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/google/uuid v1.1.2 h1:EVhdT+1Kseyi1/pUmXKaFxYsDNy9RQYkMWRH68J/W7Y= -github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= -github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= -github.com/googleapis/gnostic v0.4.1/go.mod h1:LRhVm6pbyptWbWbuZ38d1eyptfvIytN3ir6b65WBswg= -github.com/googleapis/gnostic v0.5.5 h1:9fHAtK0uDfpveeqqo1hkEZJcFvYXAiCN3UutL8F9xHw= -github.com/googleapis/gnostic v0.5.5/go.mod h1:7+EbHbldMins07ALC74bsA81Ovc97DwqyJO1AENw9kA= -github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8= -github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= -github.com/gorilla/mux v1.7.4/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= -github.com/gorilla/websocket 
v0.0.0-20170926233335-4201258b820c/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= -github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= -github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= -github.com/grpc-ecosystem/go-grpc-middleware v1.0.0/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs= -github.com/grpc-ecosystem/go-grpc-middleware v1.0.1-0.20190118093823-f849b5445de4/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs= -github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk= -github.com/grpc-ecosystem/grpc-gateway v1.9.0/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY= -github.com/grpc-ecosystem/grpc-gateway v1.9.5/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY= -github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= -github.com/hashicorp/consul/api v1.1.0/go.mod h1:VmuI/Lkw1nC05EYQWNKwWGbkg+FbDBtguAZLlVdkD9Q= -github.com/hashicorp/consul/sdk v0.1.1/go.mod h1:VKf9jXwCTEY1QZP2MOLRhb5i/I/ssyNV1vwHyQBF0x8= +github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= +github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= +github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= +github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= -github.com/hashicorp/go-cleanhttp v0.5.1/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80= 
-github.com/hashicorp/go-immutable-radix v1.0.0/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60= -github.com/hashicorp/go-msgpack v0.5.3/go.mod h1:ahLV/dePpqEmjfWmKiqvPkv/twdG7iPBM1vqhUKIvfM= -github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHhCYQXV3UM06sGGrk= -github.com/hashicorp/go-rootcerts v1.0.0/go.mod h1:K6zTfqpRlCUIjkwsN4Z+hiSfzSTQa6eBIzfwKfwNnHU= -github.com/hashicorp/go-sockaddr v1.0.0/go.mod h1:7Xibr9yA9JjQq1JpNB2Vw7kxv8xerXegt+ozgdvDeDU= -github.com/hashicorp/go-syslog v1.0.0/go.mod h1:qPfqrKkXGihmCqbJM2mZgkZGvKG1dFdvsLplgctolz4= -github.com/hashicorp/go-uuid v1.0.0/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= -github.com/hashicorp/go-uuid v1.0.1/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= -github.com/hashicorp/go.net v0.0.1/go.mod h1:hjKkEWcCURg++eb33jQU7oqQcI9XDCnUzHA0oac0k90= -github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= -github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= -github.com/hashicorp/golang-lru v0.5.4 h1:YDjusn29QI/Das2iO9M0BHnIbxPeyuCHsjMW+lJfyTc= -github.com/hashicorp/golang-lru v0.5.4/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= -github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= -github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= -github.com/hashicorp/logutils v1.0.0/go.mod h1:QIAnNjmIWmVIIkWDTG1z5v++HQmx9WQRO+LraFDTW64= -github.com/hashicorp/mdns v1.0.0/go.mod h1:tL+uN++7HEJ6SQLQ2/p+z2pH24WQKWjBPkE0mNTz8vQ= -github.com/hashicorp/memberlist v0.1.3/go.mod h1:ajVTdAv/9Im8oMAAj5G31PhhMCZJV2pPBoIllUwCN7I= -github.com/hashicorp/serf v0.8.2/go.mod h1:6hOLApaqBFA1NXqRQAsxw9QxuDEvNxSQRwA/JwenrHc= -github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= -github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= 
-github.com/imdario/mergo v0.3.5/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= -github.com/imdario/mergo v0.3.12 h1:b6R2BslTbIEToALKP7LxUvijTsNI9TAe80pLWN2g/HU= -github.com/imdario/mergo v0.3.12/go.mod h1:jmQim1M+e3UYxmgPu/WyfjB3N3VflVyUjjjwH0dnCYA= -github.com/inconshreveable/mousetrap v1.0.0 h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NHg9XEKhtSvM= -github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= -github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI= -github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo= -github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= -github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= -github.com/json-iterator/go v1.1.7/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= -github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= -github.com/json-iterator/go v1.1.11 h1:uVUAXhF2To8cbw/3xN3pxj6kk7TYKs98NIrTqPlMWAQ= -github.com/json-iterator/go v1.1.11/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= -github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= -github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= -github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= -github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= -github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= +github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= +github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= +github.com/hashicorp/go-multierror 
v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= +github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4= +github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= +github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= +github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/julienschmidt/httprouter v1.3.0 h1:U0609e9tgbseu3rBINet9P48AI/D3oJs4dN7jwJOQ1U= github.com/julienschmidt/httprouter v1.3.0/go.mod h1:JR6WtHb+2LUe8TCKY3cZOxFyyO8IZAc4RVcycCCAKdM= -github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= -github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= -github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= -github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= -github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/pty v1.1.5/go.mod h1:9r2w37qlBe7rQ6e1fg1S/9xpWHSnaqNdHD3WcMdbPDA= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= 
+github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/magiconair/properties v1.8.1 h1:ZC2Vc7/ZFkGmsVC9KvOjumD+G5lXy2RtTKyzRKO2BQ4= -github.com/magiconair/properties v1.8.1/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= -github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= -github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= -github.com/mailru/easyjson v0.7.0/go.mod h1:KAzv3t3aY1NaHWoQz1+4F1ccyAH66Jk7yos7ldAVICs= -github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= -github.com/mattn/go-isatty v0.0.3/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4= -github.com/mattn/go-isatty v0.0.4/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4= -github.com/mattn/go-runewidth v0.0.2/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= -github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= -github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369 h1:I0XW9+e1XWDxdcEniV4rQAIOPUGDq67JSCiRCgGCZLI= -github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= -github.com/miekg/dns v1.0.14/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg= -github.com/mitchellh/cli v1.0.0/go.mod h1:hNIlj7HEI86fIcpObd7a0FcrxTWetlwJDGcceTlRvqc= -github.com/mitchellh/go-homedir v1.0.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= -github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= -github.com/mitchellh/go-testing-interface 
v1.0.0/go.mod h1:kRemZodwjscx+RGhAo8eIhFbs2+BFgRtFPeD/KE+zxI= -github.com/mitchellh/gox v0.4.0/go.mod h1:Sd9lOJ0+aimLBi73mGofS1ycjY8lL3uZM3JPS42BGNg= -github.com/mitchellh/iochan v1.0.0/go.mod h1:JwYml1nuB7xOzsp52dPpHFffvOCDupsG0QubkSMEySY= -github.com/mitchellh/mapstructure v0.0.0-20160808181253-ca63d7c062ee/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= -github.com/mitchellh/mapstructure v1.1.2 h1:fmNYVwqnSfB9mZU6OS2O6GsXM+wcskZDuKQzvN1EDeE= -github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= -github.com/moby/spdystream v0.2.0/go.mod h1:f7i0iNDQJ059oMTcWxx8MA/zKFIuD/lY+0GqbN2Wy8c= -github.com/moby/term v0.0.0-20201216013528-df9cb8a40635/go.mod h1:FBS0z0QWA44HXygs7VXDUOGoN/1TV3RuWkLO04am3wc= +github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= +github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/mndrix/tap-go v0.0.0-20171203230836-629fa407e90b/go.mod h1:pzzDgJWZ34fGzaAZGFW22KVZDfyrYW+QABMrWnJBnSs= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= -github.com/modern-go/reflect2 v1.0.1 h1:9f412s+6RmYXLWZSEzVVgPGK7C2PphHj5RJrvfx9AWI= -github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= -github.com/munnerz/goautoneg v0.0.0-20120707110453-a547fc61f48d/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/modern-go/reflect2 v1.0.2/go.mod 
h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/mrunalp/fileutils v0.5.0/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= -github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= -github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= -github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs= -github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= -github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= -github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= -github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= -github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U= -github.com/olekukonko/tablewriter v0.0.0-20170122224234-a0225b3f23b5/go.mod h1:vsDQFd/mU46D+Z4whnwzcISnGGzXWMclvtLoiIKAKIo= -github.com/onsi/ginkgo v0.0.0-20170829012221-11459a886d9c/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= -github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= -github.com/onsi/ginkgo v1.11.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= -github.com/onsi/ginkgo v1.12.1/go.mod h1:zj2OWP4+oCPe1qIXoGWkgMRwljMUYCdkwsT2108oapk= -github.com/onsi/ginkgo v1.16.2/go.mod h1:CObGmKUOKaSC0RjmoAK7tKyn4Azo5P2IWuoMnvwxz1E= -github.com/onsi/ginkgo v1.16.4 h1:29JGrr5oVBm5ulCWet69zQkzWipVXIol6ygQUe/EzNc= -github.com/onsi/ginkgo 
v1.16.4/go.mod h1:dX+/inL/fNMqNlz0e9LfyB9TswhZpCVdJM/Z6Vvnwo0= -github.com/onsi/gomega v0.0.0-20170829124025-dcabb60a477c/go.mod h1:C1qb7wdrVGGVU+Z6iS04AVkA3Q65CEZX59MT0QO5uiA= -github.com/onsi/gomega v1.7.0/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= -github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= -github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= -github.com/onsi/gomega v1.13.0 h1:7lLHu94wT9Ij0o6EWWclhu0aOh32VxhkwEJvzuWPeak= -github.com/onsi/gomega v1.13.0/go.mod h1:lRk9szgn8TxENtWd0Tp4c3wjlRfMTMH27I+3Je41yGY= -github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= -github.com/pelletier/go-toml v1.2.0 h1:T5zMGML61Wp+FlcbWjRDT7yAxhJNAiPPLOFECq181zc= -github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= -github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU= -github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/onsi/ginkgo/v2 v2.17.1 h1:V++EzdbhI4ZV4ev0UTIj0PzhzOcReJFyJaLjtSF55M8= +github.com/onsi/ginkgo/v2 v2.17.1/go.mod h1:llBI3WDLL9Z6taip6f33H76YcWtJv+7R3HigUjbIBOs= +github.com/onsi/gomega v1.32.0 h1:JRYU78fJ1LPxlckP6Txi/EYqJvjtMrDC04/MM5XRHPk= +github.com/onsi/gomega v1.32.0/go.mod h1:a4x4gW6Pz2yK1MAmvluYme5lvYTn61afQ2ETw/8n4Lg= +github.com/opencontainers/runc v1.1.12 h1:BOIssBaW1La0/qbNZHXOOa71dZfZEQOzW7dqQf3phss= +github.com/opencontainers/runc v1.1.12/go.mod h1:S+lQwSfncpBha7XTy/5lBwWgm5+y5Ma/O44Ekby9FK8= +github.com/opencontainers/runtime-spec v1.0.3-0.20220825212826-86290f6a00fb/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= +github.com/opencontainers/runtime-spec v1.2.0 h1:z97+pHb3uELt/yiAWD691HNHQIF07bE7dzrbT927iTk= +github.com/opencontainers/runtime-spec v1.2.0/go.mod 
h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= +github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 h1:DmNGcqH3WDbV5k8OJ+esPWbqUOX5rMLR2PMvziDMJi0= +github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626/go.mod h1:BRHJJd0E+cx42OybVYSgUvZmU0B8P9gZuRXlZUP7TKI= +github.com/opencontainers/selinux v1.9.1/go.mod h1:2i0OySw99QjzBBQByd1Gr9gSjvuho1lHsJxIJ3gGbJI= +github.com/opencontainers/selinux v1.11.0 h1:+5Zbo97w3Lbmb3PeqQtpmTkMwsW5nRI3YaLpt7tQ7oU= +github.com/opencontainers/selinux v1.11.0/go.mod h1:E5dMC3VPuVvVHDYmi78qvhJp8+M586T4DlDRYpFkyec= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/posener/complete v1.1.1/go.mod h1:em0nMJCgc9GFtwrmVmEMR/ZL6WyhyjMBndrE9hABlRI= -github.com/pquerna/cachecontrol v0.0.0-20171018203845-0dec1b30a021/go.mod h1:prYjPmNq4d1NPVmpShWobRqXY3q7Vp+80DqgxxUrUIA= -github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= -github.com/prometheus/client_golang v0.9.3/go.mod h1:/TN21ttK/J9q6uSwhBd54HahCDft0ttaMvbicHlPoso= -github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= -github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M= -github.com/prometheus/client_golang v1.11.0 h1:HNkLOAEQMIDv/K+04rukrLx6ch7msSRwf3/SASFAGtQ= -github.com/prometheus/client_golang v1.11.0/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0= -github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= -github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= 
-github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/prometheus/client_model v0.2.0 h1:uq5h0d+GuxiXLJLNABMgp2qUWDPiLvgCzz2dUR+/W/M= -github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/prometheus/common v0.0.0-20181113130724-41aa239b4cce/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro= -github.com/prometheus/common v0.4.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= -github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= -github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo= -github.com/prometheus/common v0.26.0 h1:iMAkS2TDoNWnKM+Kopnx/8tnEStIfpYA0ur0xQzzhMQ= -github.com/prometheus/common v0.26.0/go.mod h1:M7rCNAaPfAosfx8veZJCuw84e35h3Cfd9VFqTh1DIvc= -github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= -github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= -github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= -github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU= -github.com/prometheus/procfs v0.2.0/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU= -github.com/prometheus/procfs v0.6.0 h1:mxy4L2jP6qMonqmq+aTtOx1ifVWUgG/TAmntgbh3xv4= -github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= -github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU= -github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= -github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= -github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= 
-github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= -github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= -github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= -github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= -github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= -github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= -github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88= -github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= -github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM= -github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= -github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s= -github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= -github.com/soheilhy/cmux v0.1.4/go.mod h1:IM3LyeVVIOuxMH7sFAkER9+bJ4dT7Ms6E4xg4kGIyLM= -github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= -github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= -github.com/spf13/afero v1.2.2 h1:5jhuqJyZCZf2JRofRvN/nIFgIWNzPa3/Vz8mYylgbWc= -github.com/spf13/afero v1.2.2/go.mod h1:9ZxEEn6pIJ8Rxe320qSDBk6AsU0r9pR7Q4OcevTdifk= -github.com/spf13/cast v1.3.0 h1:oget//CVOEoFewqQxwr0Ej5yjygnqGkvggSE/gB35Q8= -github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= -github.com/spf13/cobra v0.0.3/go.mod h1:1l0Ry5zgKvJasoi3XT1TypsSe7PqH0Sj9dhYf7v3XqQ= -github.com/spf13/cobra v1.1.1/go.mod 
h1:WnodtKOvamDL/PwE2M4iKs8aMDBZ5Q5klgD3qfVJQMI= -github.com/spf13/cobra v1.1.3 h1:xghbfqPkxzxP3C/f3n5DdpAbdKLj4ZE4BWQI362l53M= -github.com/spf13/cobra v1.1.3/go.mod h1:pGADOWyqRD/YMrPZigI/zbliZ2wVD/23d+is3pSWzOo= -github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo= -github.com/spf13/jwalterweatherman v1.1.0 h1:ue6voC5bR5F8YxI5S67j9i582FU4Qvo2bmqnqMYADFk= -github.com/spf13/jwalterweatherman v1.1.0/go.mod h1:aNWZUN0dPAAO/Ljvb5BEdw96iTZ0EXowPYD95IqWIGo= -github.com/spf13/pflag v0.0.0-20170130214245-9ff6c6923cff/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= -github.com/spf13/pflag v1.0.1/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= -github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= +github.com/prometheus/client_golang v1.18.0 h1:HzFfmkOzH5Q8L8G+kSJKUx5dtG87sewO+FoDDqP5Tbk= +github.com/prometheus/client_golang v1.18.0/go.mod h1:T+GXkCk5wSJyOqMIzVgvvjFDlkOQntgjkJWKrN5txjA= +github.com/prometheus/client_model v0.6.0 h1:k1v3CzpSRUTrKMppY35TLwPvxHqBu0bYgxZzqGIgaos= +github.com/prometheus/client_model v0.6.0/go.mod h1:NTQHnmxFpouOD0DpvP4XujX3CdOAGQPoaGhyTchlyt8= +github.com/prometheus/common v0.48.0 h1:QO8U2CdOzSn1BBsmXJXduaaW+dY/5QLjfB8svtSzKKE= +github.com/prometheus/common v0.48.0/go.mod h1:0/KsvlIEfPQCQ5I2iNSAWKPZziNCvRs5EC6ILDTlAPc= +github.com/prometheus/procfs v0.13.0 h1:GqzLlQyfsPbaEHaQkO7tbDlriv/4o5Hudv6OXHGKX7o= +github.com/prometheus/procfs v0.13.0/go.mod h1:cd4PFCR54QLnGKPaKGA6l+cfuNXtht43ZKY6tow0Y1g= +github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= +github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= +github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= 
+github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= +github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/spf13/viper v1.7.0 h1:xVKxvI7ouOI5I+U9s2eeiUfMaWBVoXA3AWskkrqK0VM= -github.com/spf13/viper v1.7.0/go.mod h1:8WkrPz2fc9jxqZNCJI/76HCieCp4Q8HaLFoCha5qpdg= -github.com/stoewer/go-strcase v1.2.0/go.mod h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag1KpM8ahLw8= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= -github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= -github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/subosito/gotenv v1.2.0 h1:Slr1R9HxAlEKefgq5jn9U+DnETlIUa6HfgEzj0g5d7s= -github.com/subosito/gotenv v1.2.0/go.mod h1:N0PQaV/YGNqwC0u51sEeR/aUtSLEXKX9iv69rRypqCw= -github.com/tmc/grpc-websocket-proxy v0.0.0-20170815181823-89b8d40f7ca8/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= -github.com/tmc/grpc-websocket-proxy 
v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= -github.com/urfave/cli v1.20.0/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijbERA= -github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 h1:kdXcSzyDtseVEc4yCz2qF8ZrQvIDBJLl4S1c3GCXmoI= +github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww= +github.com/urfave/cli v1.19.1/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijbERA= +github.com/urfave/cli/v2 v2.27.1 h1:8xSQ6szndafKVRmfyeUMxkNUJQMjL1F2zmsZ+qHpfho= +github.com/urfave/cli/v2 v2.27.1/go.mod h1:8qnjx1vcq5s2/wpsqoZFndg2CE5tNFyrTvS6SinrnYQ= +github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= +github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb h1:zGWFAtiMcyryUHoUjUJX0/lt1H2+i2Ka2n+D3DImSNo= +github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= +github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 h1:EzJWgHovont7NscjpAxXsDA8S8BMYve8Y5+7cuRE7R0= +github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ= +github.com/xeipuuv/gojsonschema v1.2.0 h1:LhYJRs+L4fBtjZUfuSZIKGeVu0QRy8e5Xi7D17UxZ74= +github.com/xeipuuv/gojsonschema v1.2.0/go.mod h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y= +github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 h1:bAn7/zixMGCfxrRTfdpNzjtPYqr8smhKouy9mxVdGPU= +github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673/go.mod h1:N3UwUGtsrSj3ccvlPHLoLsHnpR27oXr4ZE984MbSER8= 
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= -go.etcd.io/bbolt v1.3.3/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= -go.etcd.io/bbolt v1.3.5/go.mod h1:G5EMThwa9y8QZGBClrRx5EY+Yw9kAhnjy3bSjsnlVTQ= -go.etcd.io/etcd v0.5.0-alpha.5.0.20200910180754-dd1b699fc489/go.mod h1:yVHk9ub3CSBatqGNg7GRmsnfLWtoW60w4eDYfh7vHDg= -go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= -go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= -go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= -go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= -go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= -go.uber.org/atomic v1.3.2/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= -go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= -go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw= -go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= -go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= -go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= -go.uber.org/multierr v1.6.0 h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4= -go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= -go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= -go.uber.org/zap v1.17.0 h1:MTjgFu6ZLKvY6Pvaqk97GlxNBuMpV4Hy/3P6tRGlI2U= -go.uber.org/zap v1.17.0/go.mod h1:MXVU+bhUf/A7Xi2HNOnopQOrmycQ5Ih87HtOu4q5SSo= -golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= -golang.org/x/crypto 
v0.0.0-20181029021203-45a5f77698d3/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +go.uber.org/zap v1.25.0 h1:4Hvk6GtkucQ790dqmj7l1eEnRdKm3k3ZUrUMS2d5+5c= +go.uber.org/zap v1.25.0/go.mod h1:JIAUzQIH94IC4fOJQm7gMmBJP5k7wQfdcnYdPoEXJYk= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20190611184440-5c40567a22f8/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.0.0-20201002170205-7f63de1d35b0/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.0.0-20210220033148-5ea612d1eb83/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I= -golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= -golang.org/x/exp v0.0.0-20190829153037-c13cbed26979/go.mod h1:86+5VVa7VpoJ4kLfm080zCjGlMRFzhUhsZKEZO7MGek= -golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY= -golang.org/x/exp v0.0.0-20191129062945-2f5052295587/go.mod 
h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= -golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= -golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= -golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= -golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= -golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= -golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= -golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= -golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= -golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= -golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= -golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= -golang.org/x/lint v0.0.0-20190909230951-414d861bb4ac/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= -golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= -golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f/go.mod h1:5qLYkcX4OjUUV8bRuDixDT3tpyyb+LUpUlRWLxfhWrs= -golang.org/x/lint v0.0.0-20200130185559-910be7a94367/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= -golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= -golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE= -golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod 
h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= -golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= -golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY= -golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= -golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 h1:LfspQV/FYTatPTr/3HzIcmiUFH7PGP+OQ6mgDYo3yuQ= +golang.org/x/exp v0.0.0-20240222234643-814bf88cf225/go.mod h1:CxmFvTBINI24O/j8iY7H1xHzx2i4OsyguNBmN/uPtqc= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.3.1-0.20200828183125-ce943fd02449/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20181023162649-9b4f9f5ad519/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20181201002055-351d144fa1fc/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20181220203305-927f97764cc3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod 
h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190501004415-9ce7a6920f09/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= -golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20190724013045-ca1201d0de80/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20190813141303-74dc4d7220e7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200222125558-5a598a2470a0/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200301022130-244492dfa37a/go.mod 
h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= -golang.org/x/net v0.0.0-20200520004742-59133d7f0dd7/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= -golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= -golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.0.0-20210224082022-3d97a244fca7/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20210428140749-89ef3d95e781 h1:DzZ89McO9/gWPsQXS/FVKAlG02ZjaQ6AlZRBimEYOd0= -golang.org/x/net v0.0.0-20210428140749-89ef3d95e781/go.mod h1:OJAsFXCWl8Ukc7SiCT/9KSuxbyM7479/AVlXFRxuMCk= -golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= -golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= -golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= -golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= -golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d h1:TzXSXBo42m9gQenoE3b9BGiEpg5IG2JkU5FkPIawgtw= -golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= -golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 
+golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= +golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= +golang.org/x/oauth2 v0.17.0 h1:6m3ZPmLEFdVxKKWnKq4VqZ60gutO35zm+zrAHVmHyDQ= +golang.org/x/oauth2 v0.17.0/go.mod h1:OzPDGQiuQMguemayvdylqddI7qcD9lnSDb+1FiwQ5HA= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20181026203630-95b1ffbd15a5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20181107165924-66b7b1311ac8/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190616124812-15dcb6c0061f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190826190057-c7b8b68b1456/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190904154756-749cb33beabd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191005200804-aed5e4c7ecf9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191120155948-bd437916bb0e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys 
v0.0.0-20200106162015-b016eb3dc98e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200113162924-86b910548bc1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200212091648-12a6c2dcc1e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200831180312-196b9ba8737a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191115151921-52ab43148777/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210112080510-489259a85091/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210426230700-d19ff857e887/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210603081109-ebe580a85c40 h1:JWgyZ1qgdTaF3N3oxC+MdTV7qvEEgHo3otj+HB5CM7Q= -golang.org/x/sys v0.0.0-20210603081109-ebe580a85c40/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= +golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.0.0-20210220032956-6a3ed077a48d h1:SZxvLBoTP5yHO3Frd4z4vrF+DBX9vMVanchswa69toE= -golang.org/x/term v0.0.0-20210220032956-6a3ed077a48d/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= +golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 
-golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.6 h1:aRYxNxv6iGQlyVaZmk6ZgYEDa+Jg18DxebPSrd6bg1M= -golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.0.0-20210220033141-f8bda1e9f3ba/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.0.0-20210611083556-38a9dc6acbc6 h1:Vv0JUPWTyeqUq42B2WJ1FeIDjjvGKoA2Ss+Ts0lAVbs= -golang.org/x/time v0.0.0-20210611083556-38a9dc6acbc6/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= +golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= +golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= +golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk= +golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= -golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod 
h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= -golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= -golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= -golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= -golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= -golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= -golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= -golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= -golang.org/x/tools v0.0.0-20190614205625-5aca471b1d59/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= -golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= -golang.org/x/tools v0.0.0-20190624222133-a101b041ded4/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= -golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= -golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191108193012-7d206e10da11/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191112195655-aa38f8e97acc/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod 
h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191130070609-6e064ea0cf2d/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191216173652-a0e659d51361/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20191227053925-7b8e75db28f4/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200117161641-43d50277825c/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200122220014-bf1340f18c4a/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200204074204-1cc6d1ef6c74/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200207183749-b753a1ba74fa/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200212150539-ea181f53ac56/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200224181240-023911ca70b2/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200304193943-95d2e580d8eb/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= -golang.org/x/tools v0.0.0-20200505023115-26f46d2f7ef8/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= -golang.org/x/tools v0.0.0-20201224043029-2b0845dc783e/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0= 
+golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -gomodules.xyz/jsonpatch/v2 v2.2.0 h1:4pT439QV83L+G9FkcCriY6EkpcK6r6bK+A5FBUMI7qY= -gomodules.xyz/jsonpatch/v2 v2.2.0/go.mod h1:WXp+iVDkoLQqPudfQ9GBlwB2eZ5DKOnjQZCYdOS8GPY= -google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= -google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M= -google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= -google.golang.org/api v0.9.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= -google.golang.org/api v0.13.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= -google.golang.org/api v0.14.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= -google.golang.org/api v0.15.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= -google.golang.org/api v0.17.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= -google.golang.org/api v0.18.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= -google.golang.org/api v0.20.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= -google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= -google.golang.org/appengine 
v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= -google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= -google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0= -google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= -google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= -google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= -google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= -google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= -google.golang.org/genproto v0.0.0-20190418145605-e7d98fc518a7/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= -google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= -google.golang.org/genproto v0.0.0-20190502173448-54afdca5d873/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= -google.golang.org/genproto v0.0.0-20190801165951-fa694d86fc64/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= -google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= -google.golang.org/genproto v0.0.0-20190911173649-1774047e7e51/go.mod h1:IbNlFCBrqXvoKpeg0TB2l7cyZUmoaFKYIwrEpbDKLA8= -google.golang.org/genproto v0.0.0-20191108220845-16a3f7862a1a/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= -google.golang.org/genproto v0.0.0-20191115194625-c23dd37a84c9/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= -google.golang.org/genproto v0.0.0-20191216164720-4f79533eabd1/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= -google.golang.org/genproto v0.0.0-20191230161307-f3c370f40bfb/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= -google.golang.org/genproto v0.0.0-20200115191322-ca5a22157cba/go.mod 
h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= -google.golang.org/genproto v0.0.0-20200122232147-0452cf42e150/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= -google.golang.org/genproto v0.0.0-20200204135345-fa8e72b47b90/go.mod h1:GmwEX6Z4W5gMy59cAlVYjN9JhxgbQH6Gn+gFDQe2lzA= -google.golang.org/genproto v0.0.0-20200212174721-66ed5ce911ce/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto v0.0.0-20200224152610-e50cd9704f63/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto v0.0.0-20200305110556-506484158171/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto v0.0.0-20200513103714-09dca8ec2884/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= -google.golang.org/genproto v0.0.0-20201019141844-1ed22bb0c154/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= -google.golang.org/genproto v0.0.0-20201110150050-8816d57aaa9a h1:pOwg4OoaRYScjmR4LlLgdtnyoHYTSAVhhqe5uPdpII8= -google.golang.org/genproto v0.0.0-20201110150050-8816d57aaa9a/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= -google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= -google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= -google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= -google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= -google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= -google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= -google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= -google.golang.org/grpc v1.27.1/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= -google.golang.org/grpc v1.33.1/go.mod 
h1:fr5YgcSWrqhRRxogOsw7RzIpsmvOZ6IcH4kBYTpR3n0= -google.golang.org/grpc v1.36.0/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU= -google.golang.org/grpc v1.39.0 h1:Klz8I9kdtkIN6EpHHUOMLCYhTn/2WAe5a0s1hcBkdTI= -google.golang.org/grpc v1.39.0/go.mod h1:PImNr+rS9TWYb2O4/emRugxiyHZ5JyHW5F+RPnDzfrE= -google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= -google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= -google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= -google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= -google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= -google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= -google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= -google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= -google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4= -google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= +gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= +gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= +google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM= +google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de h1:cZGRis4/ot9uVm639a+rHCUaG0JJHEsdyzSQTMX+suY= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de/go.mod h1:H4O17MA/PE9BsGx3w+a+W2VOLLD1Qf7oJneAoU6WktY= +google.golang.org/grpc v1.63.2 
h1:MUeiw1B2maTVZthpU5xvASfTh3LDbxHd6IJ6QQVU+xM= +google.golang.org/grpc v1.63.2/go.mod h1:WAX/8DgncnokcFUldAxq7GeB5DXHDbMF+lLvDomNkRA= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= -google.golang.org/protobuf v1.26.0 h1:bxAC2xTBsZGibn2RTntX0oH50xLsqy1OxA9tTL3p/lk= google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= +google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= +google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f h1:BLraFXnmrev5lT+xlilqcH8XK9/i0At2xKjWk4p6zsU= -gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/cheggaaa/pb.v1 v1.0.25/go.mod h1:V/YB90LKu/1FcN3WVnfiiE5oMCibMjukxqG/qStrOgw= -gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= -gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= -gopkg.in/ini.v1 v1.51.0 h1:AQvPpx3LzTDM0AjnIRlVFwFFGC+npRopjZxLJj6gdno= -gopkg.in/ini.v1 v1.51.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= 
-gopkg.in/natefinch/lumberjack.v2 v2.0.0/go.mod h1:l0ndWWf7gzL7RNwBG7wST/UCcT4T24xpD6X8LsfU/+k= -gopkg.in/resty.v1 v1.12.0/go.mod h1:mDo4pnntr5jdWRML875a/NmxYqAlA73dVijT2AXvQQo= -gopkg.in/square/go-jose.v2 v2.2.2/go.mod h1:M9dMgbHiYLoDGQrXy7OpJDJWiKiU//h+vD76mk0e1AI= -gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= -gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= -gopkg.in/yaml.v2 v2.0.0-20170812160011-eb3733d160e7/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74= -gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.2.3/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.2.5/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo= -gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gotest.tools/v3 v3.0.2/go.mod h1:3SzNCllyD9/Y+b5r9JIKQ474KzkZyqLqEfYqMsX94Bk= -gotest.tools/v3 v3.0.3 h1:4AuOwCGf4lLR9u3YOe2awrHygurzhO/HeQ6laiA6Sx0= -gotest.tools/v3 v3.0.3/go.mod h1:Z7Lb0S5l+klDB31fvDQX8ss/FlKDxtlFlw3Oa8Ymbl8= -honnef.co/go/tools 
v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= -honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= -honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= -honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= -honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= -honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= -k8s.io/api v0.21.2 h1:vz7DqmRsXTCSa6pNxXwQ1IYeAZgdIsua+DZU+o+SX3Y= -k8s.io/api v0.21.2/go.mod h1:Lv6UGJZ1rlMI1qusN8ruAp9PUBFyBwpEHAdG24vIsiU= -k8s.io/apiextensions-apiserver v0.21.2 h1:+exKMRep4pDrphEafRvpEi79wTnCFMqKf8LBtlA3yrE= -k8s.io/apiextensions-apiserver v0.21.2/go.mod h1:+Axoz5/l3AYpGLlhJDfcVQzCerVYq3K3CvDMvw6X1RA= -k8s.io/apimachinery v0.21.2 h1:vezUc/BHqWlQDnZ+XkrpXSmnANSLbpnlpwo0Lhk0gpc= -k8s.io/apimachinery v0.21.2/go.mod h1:CdTY8fU/BlvAbJ2z/8kBwimGki5Zp8/fbVuLY8gJumM= -k8s.io/apiserver v0.21.2/go.mod h1:lN4yBoGyiNT7SC1dmNk0ue6a5Wi6O3SWOIw91TsucQw= -k8s.io/client-go v0.21.2 h1:Q1j4L/iMN4pTw6Y4DWppBoUxgKO8LbffEMVEV00MUp0= -k8s.io/client-go v0.21.2/go.mod h1:HdJ9iknWpbl3vMGtib6T2PyI/VYxiZfq936WNVHBRrA= -k8s.io/code-generator v0.21.2/go.mod h1:8mXJDCB7HcRo1xiEQstcguZkbxZaqeUOrO9SsicWs3U= -k8s.io/component-base v0.21.2 h1:EsnmFFoJ86cEywC0DoIkAUiEV6fjgauNugiw1lmIjs4= -k8s.io/component-base v0.21.2/go.mod h1:9lvmIThzdlrJj5Hp8Z/TOgIkdfsNARQ1pT+3PByuiuc= -k8s.io/gengo v0.0.0-20200413195148-3a45101e95ac/go.mod h1:ezvh/TsK7cY6rbqRK0oQQ8IAqLxYwwyPxAX1Pzy0ii0= -k8s.io/gengo v0.0.0-20201214224949-b6c5ce23f027/go.mod h1:FiNAH4ZV3gBg2Kwh89tzAEV2be7d5xI0vBa/VySYy3E= -k8s.io/klog/v2 v2.0.0/go.mod h1:PBfzABfn139FHAV07az/IF9Wp1bkk3vpT2XSJ76fSDE= -k8s.io/klog/v2 v2.2.0/go.mod h1:Od+F08eJP+W3HUb4pSrPpgp9DGU4GzlpG/TmITuYh/Y= -k8s.io/klog/v2 v2.8.0/go.mod 
h1:hy9LJ/NvuK+iVyP4Ehqva4HxZG/oXyIS3n3Jmire4Ec= -k8s.io/klog/v2 v2.9.0 h1:D7HV+n1V57XeZ0m6tdRkfknthUaM06VFbWldOFh8kzM= -k8s.io/klog/v2 v2.9.0/go.mod h1:hy9LJ/NvuK+iVyP4Ehqva4HxZG/oXyIS3n3Jmire4Ec= -k8s.io/kube-openapi v0.0.0-20210305001622-591a79e4bda7 h1:vEx13qjvaZ4yfObSSXW7BrMc/KQBBT/Jyee8XtLf4x0= -k8s.io/kube-openapi v0.0.0-20210305001622-591a79e4bda7/go.mod h1:wXW5VT87nVfh/iLV8FpR2uDvrFyomxbtb1KivDbvPTE= -k8s.io/kube-scheduler v0.21.2 h1:mdNXiuxKX2WGhHHvXyrlE3PwIfIdqiS27LqAq4m87OY= -k8s.io/kube-scheduler v0.21.2/go.mod h1:uMnMNvgw2EAoujObL1tuJ5+tvj2Pnv3k7i3X069crrs= -k8s.io/kubelet v0.21.2 h1:n6PHxrm0FBlAGi7f3hs3CrNqVr+x3ssfrbb0aKqsBzo= -k8s.io/kubelet v0.21.2/go.mod h1:1EqOUgp3BqvMXuZZRIlPDNkpgT5MfbJrpEnS4Gxn/mo= -k8s.io/utils v0.0.0-20201110183641-67b214c5f920/go.mod h1:jPW/WVKK9YHAvNhRxK0md/EJ228hCsBRufyofKtW8HA= -k8s.io/utils v0.0.0-20210527160623-6fdb442a123b h1:MSqsVQ3pZvPGTqCjptfimO2WjG7A9un2zcpiHkA6M/s= -k8s.io/utils v0.0.0-20210527160623-6fdb442a123b/go.mod h1:jPW/WVKK9YHAvNhRxK0md/EJ228hCsBRufyofKtW8HA= -rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= -rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= -rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= -sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.0.19/go.mod h1:LEScyzhFmoF5pso/YSeBstl57mOzx9xlU9n85RGrDQg= -sigs.k8s.io/controller-runtime v0.9.3 h1:n075bHQ1wb8hpX7C27pNrqsb0fj8mcfCQfNX+oKTbYE= -sigs.k8s.io/controller-runtime v0.9.3/go.mod h1:TxzMCHyEUpaeuOiZx/bIdc2T81vfs/aKdvJt9wuu0zk= -sigs.k8s.io/structured-merge-diff/v4 v4.0.2/go.mod h1:bJZC9H9iH24zzfZ/41RGcq60oK1F7G282QMXDPYydCw= -sigs.k8s.io/structured-merge-diff/v4 v4.1.0 h1:C4r9BgJ98vrKnnVCjwCSXcWjWe0NKcUQkmzDXZXGwH8= -sigs.k8s.io/structured-merge-diff/v4 v4.1.0/go.mod h1:bJZC9H9iH24zzfZ/41RGcq60oK1F7G282QMXDPYydCw= -sigs.k8s.io/yaml v1.1.0/go.mod h1:UJmg0vDUVViEyp3mgSv9WPwZCDxu4rQW1olrI1uml+o= -sigs.k8s.io/yaml v1.2.0 
h1:kr/MCeFWJWTwyaHoR9c8EjH9OumOmoF9YGiZd7lFm/Q= -sigs.k8s.io/yaml v1.2.0/go.mod h1:yfXDCHCao9+ENCvLSE62v9VSji2MKu5jeNfTrofGhJc= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gotest.tools/v3 v3.5.1 h1:EENdUnS3pdur5nybKYIh2Vfgc8IUNBjxDPSjtiJcOzU= +gotest.tools/v3 v3.5.1/go.mod h1:isy3WKz7GK6uNw/sbHzfKBLvlvXwUyV06n6brMxxopU= +k8s.io/api v0.28.3 h1:Gj1HtbSdB4P08C8rs9AR94MfSGpRhJgsS+GF9V26xMM= +k8s.io/api v0.28.3/go.mod h1:MRCV/jr1dW87/qJnZ57U5Pak65LGmQVkKTzf3AtKFHc= +k8s.io/apimachinery v0.28.3 h1:B1wYx8txOaCQG0HmYF6nbpU8dg6HvA06x5tEffvOe7A= +k8s.io/apimachinery v0.28.3/go.mod h1:uQTKmIqs+rAYaq+DFaoD2X7pcjLOqbQX2AOiO0nIpb8= +k8s.io/client-go v0.28.3 h1:2OqNb72ZuTZPKCl+4gTKvqao0AMOl9f3o2ijbAj3LI4= +k8s.io/client-go v0.28.3/go.mod h1:LTykbBp9gsA7SwqirlCXBWtK0guzfhpoW4qSm7i9dxo= +k8s.io/klog/v2 v2.120.1 h1:QXU6cPEOIslTGvZaXvFWiP9VKyeet3sawzTOvdXb4Vw= +k8s.io/klog/v2 v2.120.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= +k8s.io/kube-openapi v0.0.0-20240227032403-f107216b40e2 h1:02WBxjyRwX4rJdl3XlWVjFbXT/kAKCsipoM8hQY3Dwo= +k8s.io/kube-openapi v0.0.0-20240227032403-f107216b40e2/go.mod h1:B7Huvd1LKZtTYmY+nC6rnmN8lyGYT9lifBcPD5epL6k= +k8s.io/kube-scheduler v0.28.3 h1:sCvDOzRSDGCZ4whVykNoh/HbAZbwBMhbJ9xFab4QUCI= +k8s.io/kube-scheduler v0.28.3/go.mod h1:bZ0V8rlDE2eoLl2At4mSdGBKe9k6cA9P0+AuJ6aG+Os= +k8s.io/kubelet v0.28.3 h1:bp/uIf1R5F61BlFvFtzc4PDEiK7TtFcw3wFJlc0V0LM= +k8s.io/kubelet v0.28.3/go.mod h1:E3NHYbp/v45Ao6AD0EOZnqO3L0R6Haks6Nm0+bnFwtU= +k8s.io/utils v0.0.0-20240102154912-e7106e64919e h1:eQ/4ljkx21sObifjzXwlPKpdGLrCfRziVtos3ofG/sQ= +k8s.io/utils v0.0.0-20240102154912-e7106e64919e/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +sigs.k8s.io/controller-runtime v0.16.3 h1:2TuvuokmfXvDUamSx1SuAOO3eTyye+47mJCigwG62c4= +sigs.k8s.io/controller-runtime v0.16.3/go.mod h1:j7bialYoSn142nv9sCOJmQgDXQXxnroFU4VnX/brVJ0= +sigs.k8s.io/json 
v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= +sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= +sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= +sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= +sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= +sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= +tags.cncf.io/container-device-interface v0.7.1 h1:MATNCbAD1su9U6zwQe5BrQ2vGGp1GBayD70bYaxYCNE= +tags.cncf.io/container-device-interface v0.7.1/go.mod h1:h1JVuOqTQVORp8DziaWKUCDNzAmN+zeCbqbqD30D0ZQ= +tags.cncf.io/container-device-interface/specs-go v0.7.0 h1:w/maMGVeLP6TIQJVYT5pbqTi8SCw/iHZ+n4ignuGHqg= +tags.cncf.io/container-device-interface/specs-go v0.7.0/go.mod h1:hMAwAbMZyBLdmYqWgYcKH0F/yctNpV3P35f+/088A80= diff --git a/hack/.import-aliases b/hack/.import-aliases new file mode 100644 index 000000000..bfb1e8537 --- /dev/null +++ b/hack/.import-aliases @@ -0,0 +1,51 @@ +{ + "k8s.io/api/admissionregistration/v1": "admissionregistrationv1", + "k8s.io/api/admissionregistration/v1beta1": "admissionregistrationv1beta1", + "k8s.io/api/admission/v1beta1": "admissionv1beta1", + "k8s.io/api/admission/v1": "admissionv1", + "k8s.io/api/apps/v1": "appsv1", + "k8s.io/api/apps/v1beta1": "appsv1beta1", + "k8s.io/api/apps/v1beta2": "appsv1beta2", + "k8s.io/api/authentication/v1": "authenticationv1", + "k8s.io/api/authentication/v1beta1": "authenticationv1beta1", + "k8s.io/api/authorization/v1": "authorizationv1", + "k8s.io/api/authorization/v1beta1": "authorizationv1beta1", + "k8s.io/api/autoscaling/v1": "autoscalingv1", + "k8s.io/api/autoscaling/v2": "autoscalingv2", + "k8s.io/api/batch/v1": "batchv1", + "k8s.io/api/batch/v1beta1": "batchv1beta1", + "k8s.io/api/certificates/v1beta1": "certificatesv1beta1", + "k8s.io/api/coordination/v1": 
"coordinationv1", + "k8s.io/api/coordination/v1beta1": "coordinationv1beta1", + "k8s.io/api/core/v1": "corev1", + "k8s.io/api/discovery/v1": "discoveryv1", + "k8s.io/api/events/v1": "eventsv1", + "k8s.io/api/events/v1beta1": "eventsv1beta1", + "k8s.io/api/extensions/v1beta1": "extensionsv1beta1", + "k8s.io/api/imagepolicy/v1alpha1": "imagepolicyv1alpha1", + "k8s.io/api/networking/v1": "networkingv1", + "k8s.io/api/networking/v1beta1": "networkingv1beta1", + "k8s.io/api/node/v1alpha1": "nodev1alpha1", + "k8s.io/api/node/v1beta1": "nodev1beta1", + "k8s.io/api/node/v1": "nodev1", + "k8s.io/api/policy/v1": "policyv1", + "k8s.io/api/policy/v1beta1": "policyv1beta1", + "k8s.io/api/rbac/v1": "rbacv1", + "k8s.io/api/rbac/v1alpha1": "rbacv1alpha1", + "k8s.io/api/rbac/v1beta1": "rbacv1beta1", + "k8s.io/api/scheduling/v1": "schedulingv1", + "k8s.io/api/scheduling/v1alpha1": "schedulingv1alpha1", + "k8s.io/api/scheduling/v1beta1": "schedulingv1beta1", + "k8s.io/api/storage/v1": "storagev1", + "k8s.io/api/storage/v1alpha1": "storagev1alpha1", + "k8s.io/api/storage/v1beta1": "storagev1beta1", + "k8s.io/apimachinery/pkg/api/errors": "apierrors", + "k8s.io/apimachinery/pkg/apis/meta/v1": "metav1", + "k8s.io/kubelet/apis/stats/v1alpha1": "kubeletstatsv1alpha1", + "k8s.io/kubelet/pkg/apis/deviceplugin/v1alpha": "kubeletdevicepluginv1alpha", + "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1": "kubeletdevicepluginv1beta1", + "k8s.io/kubelet/pkg/apis/pluginregistration/v1": "kubeletpluginregistrationv1", + "k8s.io/kubelet/pkg/apis/pluginregistration/v1alpha1": "kubeletpluginregistrationv1alpha1", + "k8s.io/kubelet/pkg/apis/pluginregistration/v1beta1": "kubeletpluginregistrationv1beta1", + "k8s.io/kubelet/pkg/apis/podresources/v1alpha1": "kubeletpodresourcesv1alpha1" +} diff --git a/hack/boilerplate/boilerplate.go.txt b/hack/boilerplate/boilerplate.go.txt new file mode 100644 index 000000000..b33968119 --- /dev/null +++ b/hack/boilerplate/boilerplate.go.txt @@ -0,0 +1,16 @@ +/* +Copyright 
The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + diff --git a/hack/build.sh b/hack/build.sh index b5881d427..85b7a620d 100755 --- a/hack/build.sh +++ b/hack/build.sh @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright © 2021 peizhaoyou +# Copyright © 2024 HAMi Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,8 +21,12 @@ set -e export SHORT_VERSION export COMMIT_CODE export VERSION="${SHORT_VERSION}-${COMMIT_CODE}" +export LATEST_VERSION="latest" +export GOLANG_IMAGE="golang:1.22.5-bullseye" +export NVIDIA_IMAGE="nvidia/cuda:12.2.0-devel-ubuntu20.04" +export DEST_DIR="/usr/local" -IMAGE=${IMAGE-"m7-ieg-pico-test01:5000/k8s-vgpu"} +IMAGE=${IMAGE-"projecthami/hami"} function go_build() { [[ -z "$J" ]] && J=$(nproc | awk '{print int(($0 + 1)/ 2)}') @@ -30,15 +34,17 @@ function go_build() { } function docker_build() { - docker build --build-arg VERSION="${VERSION}" -t "${IMAGE}:${VERSION}" -f docker/Dockerfile . + docker build --build-arg VERSION="${VERSION}" --build-arg GOLANG_IMAGE=${GOLANG_IMAGE} --build-arg NVIDIA_IMAGE=${NVIDIA_IMAGE} --build-arg DEST_DIR=${DEST_DIR} -t "${IMAGE}:${VERSION}" -f docker/Dockerfile . 
docker tag "${IMAGE}:${VERSION}" "${IMAGE}:${SHORT_VERSION}" + docker tag "${IMAGE}:${VERSION}" "${IMAGE}:${LATEST_VERSION}" } function docker_push() { - docker push "${IMAGE}:${VERSION}" + #docker push "${IMAGE}:${VERSION}" docker push "${IMAGE}:${SHORT_VERSION}" + docker push "${IMAGE}:${LATEST_VERSION}" } go_build docker_build -docker_push \ No newline at end of file +docker_push diff --git a/hack/kubeconfig-demo.yaml b/hack/kubeconfig-demo.yaml new file mode 100644 index 000000000..f41fa7bdc --- /dev/null +++ b/hack/kubeconfig-demo.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +clusters: +- cluster: + server: http://localhost:8080 + name: local-server +contexts: +- context: + cluster: local-server + namespace: the-right-prefix + user: myself + name: default-context +current-context: default-context +kind: Config +preferences: {} +users: +- name: myself + user: + password: secret + username: admin diff --git a/hack/tools/preferredimports/preferredimports.go b/hack/tools/preferredimports/preferredimports.go new file mode 100644 index 000000000..d2e8f2a44 --- /dev/null +++ b/hack/tools/preferredimports/preferredimports.go @@ -0,0 +1,271 @@ +/* +Copyright 2019 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// This code is directly lifted from the Kubernetes codebase in order to avoid relying on the k8s.io/kubernetes package. 
+// For reference: https://github.com/kubernetes/kubernetes/blob/release-1.22/cmd/preferredimports/preferredimports.go + +// verify that all the imports have our preferred alias(es). +package main + +import ( + "bytes" + "encoding/json" + "flag" + "fmt" + "go/ast" + "go/build" + "go/format" + "go/parser" + "go/token" + "log" + "os" + "path/filepath" + "regexp" + "sort" + "strings" + + "golang.org/x/term" +) + +var ( + importAliases = flag.String("import-aliases", "hack/.import-aliases", "json file with import aliases") + confirm = flag.Bool("confirm", false, "update file with the preferred aliases for imports") + includePathRegex = flag.String("include-path", "(test/e2e/|test/e2e_node)", "only files with paths matching this regex is touched") + excludePathRegex = flag.String("exclude-path", "(testing)", "files with paths matching this regex is ignored") + isTerminal = term.IsTerminal(int(os.Stdout.Fd())) + logPrefix = "" + aliases map[string]string +) + +type analyzer struct { + fset *token.FileSet // positions are relative to fset + ctx build.Context + failed bool + donePaths map[string]interface{} +} + +func newAnalyzer() *analyzer { + ctx := build.Default + ctx.CgoEnabled = true + + a := &analyzer{ + fset: token.NewFileSet(), + ctx: ctx, + donePaths: make(map[string]interface{}), + } + + return a +} + +// collect extracts test metadata from a file. +func (a *analyzer) collect(dir string) { + if _, ok := a.donePaths[dir]; ok { + return + } + a.donePaths[dir] = nil + + // Create the AST by parsing src. 
+ fs, err := parser.ParseDir(a.fset, dir, nil, parser.AllErrors|parser.ParseComments) + + if err != nil { + fmt.Fprintln(os.Stderr, "ERROR(syntax)", logPrefix, err) + a.failed = true + return + } + + for _, p := range fs { + // returns first error, but a.handleError deals with it + files := a.filterFiles(p.Files) + for _, file := range files { + replacements := make(map[string]string) + pathToFile := a.fset.File(file.Pos()).Name() + for _, imp := range file.Imports { + importPath := strings.Replace(imp.Path.Value, "\"", "", -1) + pathSegments := strings.Split(importPath, "/") + importName := pathSegments[len(pathSegments)-1] + if imp.Name != nil { + importName = imp.Name.Name + } + if alias, ok := aliases[importPath]; ok { + if alias != importName { + if !*confirm { + fmt.Fprintf(os.Stderr, "%sERROR wrong alias for import \"%s\" should be %s in file %s\n", logPrefix, importPath, alias, pathToFile) + a.failed = true + } + replacements[importName] = alias + if imp.Name != nil { + imp.Name.Name = alias + } else { + imp.Name = ast.NewIdent(alias) + } + } + } + } + + if len(replacements) > 0 { + if *confirm { + fmt.Printf("%sReplacing imports with aliases in file %s\n", logPrefix, pathToFile) + for key, value := range replacements { + renameImportUsages(file, key, value) + } + ast.SortImports(a.fset, file) + var buffer bytes.Buffer + if err = format.Node(&buffer, a.fset, file); err != nil { + panic(fmt.Sprintf("Error formatting ast node after rewriting import.\n%s\n", err.Error())) + } + + fileInfo, err := os.Stat(pathToFile) + if err != nil { + panic(fmt.Sprintf("Error stat'ing file: %s\n%s\n", pathToFile, err.Error())) + } + + err = os.WriteFile(pathToFile, buffer.Bytes(), fileInfo.Mode()) + if err != nil { + panic(fmt.Sprintf("Error writing file: %s\n%s\n", pathToFile, err.Error())) + } + } + } + } + } +} + +func renameImportUsages(f *ast.File, old, new string) { + // use this to avoid renaming the package declaration, eg: + // given: package foo; import foo "bar"; 
foo.Baz, rename foo->qux + // yield: package foo; import qux "bar"; qux.Baz + var pkg *ast.Ident + + // Rename top-level old to new, both unresolved names + // (probably defined in another file) and names that resolve + // to a declaration we renamed. + ast.Inspect(f, func(node ast.Node) bool { + if node == nil { + return false + } + switch id := node.(type) { + case *ast.File: + pkg = id.Name + case *ast.Ident: + if pkg != nil && id == pkg { + return false + } + if id.Name == old { + id.Name = new + } + } + return true + }) +} + +func (a *analyzer) filterFiles(fs map[string]*ast.File) []*ast.File { + var files []*ast.File + for _, f := range fs { + files = append(files, f) + } + return files +} + +type collector struct { + dirs []string + includePathRegex *regexp.Regexp + excludePathRegex *regexp.Regexp +} + +// handlePath walks the filesystem recursively, collecting directories, +// ignoring some unneeded directories (hidden/vendored) that are handled +// specially later. +func (c *collector) handlePath(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if info.IsDir() { + // Ignore hidden directories (.git, .cache, etc) + if len(path) > 1 && path[0] == '.' || + // Staging code is symlinked from vendor/k8s.io, and uses import + // paths as if it were inside of vendor/. It fails typechecking + // inside of staging/, but works when typechecked as part of vendor/. + path == "staging" || + // OS-specific vendor code tends to be imported by OS-specific + // packages. We recursively typecheck imported vendored packages for + // each OS, but don't typecheck everything for every OS. + path == "vendor" || + path == "_output" || + // This is a weird one. /testdata/ is *mostly* ignored by Go, + // and this translates to kubernetes/vendor not working. + // edit/record.go doesn't compile without gopkg.in/yaml.v2 + // in $GOSRC/$GOROOT (both typecheck and the shell script). 
+ path == "pkg/kubectl/cmd/testdata/edit" { + return filepath.SkipDir + } + if c.includePathRegex.MatchString(path) && !c.excludePathRegex.MatchString(path) { + c.dirs = append(c.dirs, path) + } + } + return nil +} + +func main() { + flag.Parse() + args := flag.Args() + + if len(args) == 0 { + args = append(args, ".") + } + + includePathRegex, err := regexp.Compile(*includePathRegex) + if err != nil { + log.Fatalf("Error compiling regex: %v", err) + } + excludePathRegex, err := regexp.Compile(*excludePathRegex) + if err != nil { + log.Fatalf("Error compiling regex: %v", err) + } + c := collector{includePathRegex: includePathRegex, excludePathRegex: excludePathRegex} + for _, arg := range args { + err := filepath.Walk(arg, c.handlePath) + if err != nil { + log.Fatalf("Error walking: %v", err) + } + } + sort.Strings(c.dirs) + + if len(*importAliases) > 0 { + bytes, err := os.ReadFile(*importAliases) + if err != nil { + log.Fatalf("Error reading import aliases: %v", err) + } + err = json.Unmarshal(bytes, &aliases) + if err != nil { + log.Fatalf("Error loading aliases: %v", err) + } + } + if isTerminal { + logPrefix = "\r" // clear status bar when printing + } + fmt.Println("checking-imports: ") + + a := newAnalyzer() + for _, dir := range c.dirs { + if isTerminal { + fmt.Printf("\r\033[0m %-80s\n", dir) + } + a.collect(dir) + } + fmt.Println() + if a.failed { + os.Exit(1) + } +} diff --git a/hack/tools/tools.go b/hack/tools/tools.go new file mode 100644 index 000000000..5a4909822 --- /dev/null +++ b/hack/tools/tools.go @@ -0,0 +1,23 @@ +//go:build tools + +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package tools + +import ( + _ "golang.org/x/tools/cmd/goimports" +) diff --git a/hack/unit-test.sh b/hack/unit-test.sh new file mode 100755 index 000000000..a60e4e031 --- /dev/null +++ b/hack/unit-test.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +# Copyright 2024 The HAMi Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -o errexit +set -o nounset +set -o pipefail + +set -x + +# init kubeconfig env +kubeconfig_path="${HOME}/.kube" +kubeconfig_file="${kubeconfig_path}/config" +kubeconfig_demo="./hack/kubeconfig-demo.yaml" + +echo "kubeconfig: ${kubeconfig_file}" + +if [ ! -f "$kubeconfig_file" ]; then + echo "Generate fake kubeconfig" + if [ ! 
-d "${kubeconfig_path}" ]; then + trap 'rm -rf "$kubeconfig_path"' EXIT + mkdir -p "${kubeconfig_path}" + cp ${kubeconfig_demo} "${kubeconfig_file}" + else + trap 'rm -f "$kubeconfig_file"' EXIT + cp ${kubeconfig_demo} "${kubeconfig_file}" + fi +else + echo "Use local kubeconfig" +fi + +tmpDir=$(mktemp -d) +mergeF="${tmpDir}/merge.out" +rm -f ${mergeF} +ls $tmpDir +cov_file="${tmpDir}/c.cover" +go test $(go list ./pkg/... | grep -v ./pkg/device-plugin/...) -short --race -count=1 -covermode=atomic -coverprofile=${cov_file} +cat $cov_file | grep -v mode: | grep -v pkg/version | grep -v fake | grep -v main.go >>${mergeF} +#merge them +echo "mode: atomic" >coverage.out +cat ${mergeF} >>coverage.out +go tool cover -func=coverage.out +rm -rf coverage.out ${tmpDir} ${mergeF} diff --git a/hack/update-generated-api.sh b/hack/update-generated-api.sh index f455acc85..c7b11cf23 100755 --- a/hack/update-generated-api.sh +++ b/hack/update-generated-api.sh @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright © 2021 peizhaoyou +# Copyright © 2024 HAMi Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/hack/util.sh b/hack/util.sh new file mode 100755 index 000000000..22c1e625f --- /dev/null +++ b/hack/util.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +# Copyright 2024 The HAMi Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -o errexit +set -o nounset +set -o pipefail + +MIN_Go_VERSION=1.21.0 + +function util::cmd_exist { + local CMD=$(command -v ${1}) + if [[ ! -x ${CMD} ]]; then + return 1 + fi + return 0 +} + +function util::verify_go_version { + local go_version + IFS=" " read -ra go_version <<<"$(GOFLAGS='' go version)" + if [[ "${MIN_Go_VERSION}" != $(echo -e "${MIN_Go_VERSION}\n${go_version[2]}" | sort -s -t. -k 1,1 -k 2,2n -k 3,3n | head -n1) && "${go_version[2]}" != "devel" ]]; then + echo "Detected go version: ${go_version[*]}." + echo "requires ${MIN_Go_VERSION} or greater." + echo "Please install ${MIN_Go_VERSION} or later." + exit 1 + fi +} + +# util::install_helm will install the helm command +function util::install_helm { + curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash +} + diff --git a/hack/verify-all.sh b/hack/verify-all.sh new file mode 100755 index 000000000..76be88d70 --- /dev/null +++ b/hack/verify-all.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +# Copyright 2024 The HAMi Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -o errexit +set -o nounset +set -o pipefail + +REPO_ROOT=$(dirname "${BASH_SOURCE[0]}")/.. + +# Show progress +set -x + +# Orders are determined by two factors: +# (1) Less Execution time item should be executed first. +# (2) More likely to fail item should be executed first. 
+ +bash "$REPO_ROOT/hack/verify-staticcheck.sh" + +bash "$REPO_ROOT/hack/verify-license.sh" + +bash "$REPO_ROOT/hack/verify-import-aliases.sh" diff --git a/hack/verify-chart-version.sh b/hack/verify-chart-version.sh new file mode 100755 index 000000000..7c6ff0068 --- /dev/null +++ b/hack/verify-chart-version.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash +# Copyright 2024 The HAMi Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -o errexit +set -o nounset +set -o pipefail + +REPO_ROOT=$(dirname "${BASH_SOURCE[0]}")/.. +cd "${REPO_ROOT}" + +source "${REPO_ROOT}"/hack/util.sh + +# install helm +echo -n "Preparing: 'helm' existence check - " +if util::cmd_exist helm; then + echo "passed" +else + echo "installing helm" + util::install_helm +fi + +APP_VERSION=$(helm show chart ./charts/hami | grep '^appVersion' |grep -E '[0-9].*.[0-9]' | awk -F ':' '{print $2}' | tr -d ' ') +VERSION=$(helm show chart ./charts/hami | grep '^version' |grep -E '[0-9].*.[0-9]' | awk -F ':' '{print $2}' | tr -d ' ') + +if [[ ${APP_VERSION} != ${VERSION} ]]; then + echo "AppVersion of HAMi is ${APP_VERSION}, but version is ${VERSION}!" + exit 1 +fi + +echo "Both appVersion and version is ${APP_VERSION}." + diff --git a/hack/verify-import-aliases.sh b/hack/verify-import-aliases.sh new file mode 100755 index 000000000..3b8a8f705 --- /dev/null +++ b/hack/verify-import-aliases.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# Copyright 2024 The HAMi Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -o errexit +set -o nounset +set -o pipefail + +SCRIPT_ROOT=$(dirname "${BASH_SOURCE[0]}")/.. +cd "${SCRIPT_ROOT}" +ROOT_PATH=$(pwd) + +IMPORT_ALIASES_PATH="${ROOT_PATH}/hack/.import-aliases" +INCLUDE_PATH="(${ROOT_PATH}/cmd|${ROOT_PATH}/pkg)" + +ret=0 +# We can't directly install preferredimports by `go install` due to the go.mod issue: +# go install k8s.io/kubernetes/cmd/preferredimports@v1.21.3: k8s.io/kubernetes@v1.21.3 +# The go.mod file for the module providing named packages contains one or +# more replace directives. It must not contain directives that would cause +# it to be interpreted differently than if it were the main module. +go run "${ROOT_PATH}/hack/tools/preferredimports/preferredimports.go" -import-aliases "${IMPORT_ALIASES_PATH}" -include-path "${INCLUDE_PATH}" "${ROOT_PATH}" || ret=$? +if [[ $ret -ne 0 ]]; then + echo "!!! Please see hack/.import-aliases for the preferred aliases for imports." >&2 + exit 1 +fi +echo "Passed import-aliases verification." diff --git a/hack/verify-license.sh b/hack/verify-license.sh new file mode 100755 index 000000000..a28cc234e --- /dev/null +++ b/hack/verify-license.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +# Copyright 2024 The HAMi Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -o errexit +set -o nounset +set -o pipefail +set -ex + +REPO_ROOT=$(dirname "${BASH_SOURCE[0]}")/.. +cd "${REPO_ROOT}" + +if [[ "$(which addlicense)" == "" ]]; then + go install github.com/google/addlicense@v1.1.1 +fi +ADDLICENSE_BIN=$(which addlicense) + +# verify presence of license headers and exit with non-zero code if missing +missing_license_header_files="$($ADDLICENSE_BIN \ + -check \ + -ignore "benchmarks/**" \ + -ignore "charts/**" \ + -ignore "docs/**" \ + -ignore "docker/**" \ + -ignore "examples/**" \ + -ignore "lib/**" \ + -ignore "libvgpu/**" \ + -ignore "third_party/**" \ + -ignore "vendor/**" \ + -ignore "_output/**" \ + -ignore ".github/**" \ + -ignore "**/*.md" \ + -ignore "**/*.yaml" \ + -ignore "**/*.yml" \ + -ignore "**/*.json" \ + -ignore ".idea/**" \ + .)" || true + +if [[ "$missing_license_header_files" ]]; then + echo "Files with no license header detected:" + echo "$missing_license_header_files" + echo "Please add all missing license headers." + exit 1 +fi + +echo "Congratulations! All files have passed license header check." diff --git a/hack/verify-staticcheck.sh b/hack/verify-staticcheck.sh new file mode 100755 index 000000000..3b3475d67 --- /dev/null +++ b/hack/verify-staticcheck.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# Copyright 2024 The HAMi Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -o errexit +set -o nounset +set -o pipefail + +REPO_ROOT=$(dirname "${BASH_SOURCE[0]}")/.. +GOLANGCI_LINT_VER="v1.57.1" + +cd "${REPO_ROOT}" +source "hack/util.sh" + +if util::cmd_exist golangci-lint; then + echo "Using golangci-lint version:" + golangci-lint version +else + echo "Installing golangci-lint ${GOLANGCI_LINT_VER}" + # https://golangci-lint.run/usage/install/#other-ci + curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/v1.57.1/install.sh | sh -s -- -b $(go env GOPATH)/bin ${GOLANGCI_LINT_VER} +fi + +if golangci-lint run; then + echo 'Congratulations! All Go source files have passed staticcheck.' +else + echo # print one empty line, separate from warning messages. + echo 'Please review the above warnings.' + echo 'If the above warnings do not make sense, feel free to file an issue.' 
+ exit 1 +fi diff --git a/imgs/arch.png b/imgs/arch.png new file mode 100644 index 000000000..e62563686 Binary files /dev/null and b/imgs/arch.png differ diff --git a/imgs/benchmark.png b/imgs/benchmark.png new file mode 100644 index 000000000..3de68fd38 Binary files /dev/null and b/imgs/benchmark.png differ diff --git a/imgs/benchmark_inf.png b/imgs/benchmark_inf.png new file mode 100644 index 000000000..ec52cb56d Binary files /dev/null and b/imgs/benchmark_inf.png differ diff --git a/imgs/benchmark_train.png b/imgs/benchmark_train.png new file mode 100644 index 000000000..78eaa9235 Binary files /dev/null and b/imgs/benchmark_train.png differ diff --git a/imgs/example.png b/imgs/example.png new file mode 100644 index 000000000..0f407f4d9 Binary files /dev/null and b/imgs/example.png differ diff --git a/imgs/hami-arch.jpg b/imgs/hami-arch.jpg new file mode 100644 index 000000000..55fdc5175 Binary files /dev/null and b/imgs/hami-arch.jpg differ diff --git a/imgs/hami-arch.png b/imgs/hami-arch.png new file mode 100644 index 000000000..308daeca0 Binary files /dev/null and b/imgs/hami-arch.png differ diff --git a/imgs/hami-arch.pptx b/imgs/hami-arch.pptx new file mode 100644 index 000000000..a4d8f6797 Binary files /dev/null and b/imgs/hami-arch.pptx differ diff --git a/imgs/hami-graph-color.png b/imgs/hami-graph-color.png new file mode 100644 index 000000000..b6bffc20d Binary files /dev/null and b/imgs/hami-graph-color.png differ diff --git a/imgs/hard_limit.jpg b/imgs/hard_limit.jpg new file mode 100644 index 000000000..554bfbb79 Binary files /dev/null and b/imgs/hard_limit.jpg differ diff --git a/lib/libvgpu.so b/lib/libvgpu.so deleted file mode 100755 index 0c436fe92..000000000 Binary files a/lib/libvgpu.so and /dev/null differ diff --git a/lib/mlu/cntopo b/lib/mlu/cntopo new file mode 100755 index 000000000..d6479b2f4 Binary files /dev/null and b/lib/mlu/cntopo differ diff --git a/lib/mlu/libcndev.so b/lib/mlu/libcndev.so new file mode 100644 index 
000000000..8a370ac11 Binary files /dev/null and b/lib/mlu/libcndev.so differ diff --git a/lib/mlu/smlu-containerd b/lib/mlu/smlu-containerd new file mode 100755 index 000000000..5c3b4c506 Binary files /dev/null and b/lib/mlu/smlu-containerd differ diff --git a/lib/ld.so.preload b/lib/nvidia/ld.so.preload similarity index 100% rename from lib/ld.so.preload rename to lib/nvidia/ld.so.preload diff --git a/libvgpu b/libvgpu new file mode 160000 index 000000000..af84bbdfd --- /dev/null +++ b/libvgpu @@ -0,0 +1 @@ +Subproject commit af84bbdfd47fd90a5e2381652c70dd547bec7d0e diff --git a/pkg/api/device_register.go b/pkg/api/device_register.go new file mode 100644 index 000000000..06d7e8b60 --- /dev/null +++ b/pkg/api/device_register.go @@ -0,0 +1,21 @@ +// Code generated by protoc-gen-gogo. DO NOT EDIT. +// source: pkg/api/device_register.proto + +package api + +// Reference imports to suppress errors if they are not otherwise used. + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the proto package it is being compiled against. +// A compilation error at this line likely means your copy of the +// proto package needs to be updated. +type DeviceInfo struct { + Index int + Id string + Count int32 + Devmem int32 + Devcore int32 + Type string + Numa int + Health bool +} diff --git a/pkg/api/device_register.pb.go b/pkg/api/device_register.pb.go deleted file mode 100644 index 2cd673df5..000000000 --- a/pkg/api/device_register.pb.go +++ /dev/null @@ -1,1512 +0,0 @@ -// Code generated by protoc-gen-gogo. DO NOT EDIT. -// source: pkg/api/device_register.proto - -package api - -import ( - context "context" - fmt "fmt" - proto "github.com/golang/protobuf/proto" - grpc "google.golang.org/grpc" - codes "google.golang.org/grpc/codes" - status "google.golang.org/grpc/status" - io "io" - math "math" - math_bits "math/bits" -) - -// Reference imports to suppress errors if they are not otherwise used. 
-var _ = proto.Marshal -var _ = fmt.Errorf -var _ = math.Inf - -// This is a compile-time assertion to ensure that this generated file -// is compatible with the proto package it is being compiled against. -// A compilation error at this line likely means your copy of the -// proto package needs to be updated. -const _ = proto.ProtoPackageIsVersion3 // please upgrade the proto package - -type DeviceInfo struct { - Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` - Count int32 `protobuf:"varint,2,opt,name=count,proto3" json:"count,omitempty"` - Health bool `protobuf:"varint,3,opt,name=health,proto3" json:"health,omitempty"` - XXX_NoUnkeyedLiteral struct{} `json:"-"` - XXX_unrecognized []byte `json:"-"` - XXX_sizecache int32 `json:"-"` -} - -func (m *DeviceInfo) Reset() { *m = DeviceInfo{} } -func (m *DeviceInfo) String() string { return proto.CompactTextString(m) } -func (*DeviceInfo) ProtoMessage() {} -func (*DeviceInfo) Descriptor() ([]byte, []int) { - return fileDescriptor_f726eb77a5b37099, []int{0} -} -func (m *DeviceInfo) XXX_Unmarshal(b []byte) error { - return m.Unmarshal(b) -} -func (m *DeviceInfo) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { - if deterministic { - return xxx_messageInfo_DeviceInfo.Marshal(b, m, deterministic) - } else { - b = b[:cap(b)] - n, err := m.MarshalToSizedBuffer(b) - if err != nil { - return nil, err - } - return b[:n], nil - } -} -func (m *DeviceInfo) XXX_Merge(src proto.Message) { - xxx_messageInfo_DeviceInfo.Merge(m, src) -} -func (m *DeviceInfo) XXX_Size() int { - return m.Size() -} -func (m *DeviceInfo) XXX_DiscardUnknown() { - xxx_messageInfo_DeviceInfo.DiscardUnknown(m) -} - -var xxx_messageInfo_DeviceInfo proto.InternalMessageInfo - -func (m *DeviceInfo) GetId() string { - if m != nil { - return m.Id - } - return "" -} - -func (m *DeviceInfo) GetCount() int32 { - if m != nil { - return m.Count - } - return 0 -} - -func (m *DeviceInfo) GetHealth() bool { - if m != nil { - return m.Health - 
} - return false -} - -type RegisterRequest struct { - Node string `protobuf:"bytes,1,opt,name=node,proto3" json:"node,omitempty"` - Devices []*DeviceInfo `protobuf:"bytes,2,rep,name=devices,proto3" json:"devices,omitempty"` - XXX_NoUnkeyedLiteral struct{} `json:"-"` - XXX_unrecognized []byte `json:"-"` - XXX_sizecache int32 `json:"-"` -} - -func (m *RegisterRequest) Reset() { *m = RegisterRequest{} } -func (m *RegisterRequest) String() string { return proto.CompactTextString(m) } -func (*RegisterRequest) ProtoMessage() {} -func (*RegisterRequest) Descriptor() ([]byte, []int) { - return fileDescriptor_f726eb77a5b37099, []int{1} -} -func (m *RegisterRequest) XXX_Unmarshal(b []byte) error { - return m.Unmarshal(b) -} -func (m *RegisterRequest) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { - if deterministic { - return xxx_messageInfo_RegisterRequest.Marshal(b, m, deterministic) - } else { - b = b[:cap(b)] - n, err := m.MarshalToSizedBuffer(b) - if err != nil { - return nil, err - } - return b[:n], nil - } -} -func (m *RegisterRequest) XXX_Merge(src proto.Message) { - xxx_messageInfo_RegisterRequest.Merge(m, src) -} -func (m *RegisterRequest) XXX_Size() int { - return m.Size() -} -func (m *RegisterRequest) XXX_DiscardUnknown() { - xxx_messageInfo_RegisterRequest.DiscardUnknown(m) -} - -var xxx_messageInfo_RegisterRequest proto.InternalMessageInfo - -func (m *RegisterRequest) GetNode() string { - if m != nil { - return m.Node - } - return "" -} - -func (m *RegisterRequest) GetDevices() []*DeviceInfo { - if m != nil { - return m.Devices - } - return nil -} - -type RegisterReply struct { - XXX_NoUnkeyedLiteral struct{} `json:"-"` - XXX_unrecognized []byte `json:"-"` - XXX_sizecache int32 `json:"-"` -} - -func (m *RegisterReply) Reset() { *m = RegisterReply{} } -func (m *RegisterReply) String() string { return proto.CompactTextString(m) } -func (*RegisterReply) ProtoMessage() {} -func (*RegisterReply) Descriptor() ([]byte, []int) { - return 
fileDescriptor_f726eb77a5b37099, []int{2} -} -func (m *RegisterReply) XXX_Unmarshal(b []byte) error { - return m.Unmarshal(b) -} -func (m *RegisterReply) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { - if deterministic { - return xxx_messageInfo_RegisterReply.Marshal(b, m, deterministic) - } else { - b = b[:cap(b)] - n, err := m.MarshalToSizedBuffer(b) - if err != nil { - return nil, err - } - return b[:n], nil - } -} -func (m *RegisterReply) XXX_Merge(src proto.Message) { - xxx_messageInfo_RegisterReply.Merge(m, src) -} -func (m *RegisterReply) XXX_Size() int { - return m.Size() -} -func (m *RegisterReply) XXX_DiscardUnknown() { - xxx_messageInfo_RegisterReply.DiscardUnknown(m) -} - -var xxx_messageInfo_RegisterReply proto.InternalMessageInfo - -type GetContainerRequest struct { - Uuid string `protobuf:"bytes,1,opt,name=uuid,proto3" json:"uuid,omitempty"` - XXX_NoUnkeyedLiteral struct{} `json:"-"` - XXX_unrecognized []byte `json:"-"` - XXX_sizecache int32 `json:"-"` -} - -func (m *GetContainerRequest) Reset() { *m = GetContainerRequest{} } -func (m *GetContainerRequest) String() string { return proto.CompactTextString(m) } -func (*GetContainerRequest) ProtoMessage() {} -func (*GetContainerRequest) Descriptor() ([]byte, []int) { - return fileDescriptor_f726eb77a5b37099, []int{3} -} -func (m *GetContainerRequest) XXX_Unmarshal(b []byte) error { - return m.Unmarshal(b) -} -func (m *GetContainerRequest) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { - if deterministic { - return xxx_messageInfo_GetContainerRequest.Marshal(b, m, deterministic) - } else { - b = b[:cap(b)] - n, err := m.MarshalToSizedBuffer(b) - if err != nil { - return nil, err - } - return b[:n], nil - } -} -func (m *GetContainerRequest) XXX_Merge(src proto.Message) { - xxx_messageInfo_GetContainerRequest.Merge(m, src) -} -func (m *GetContainerRequest) XXX_Size() int { - return m.Size() -} -func (m *GetContainerRequest) XXX_DiscardUnknown() { - 
xxx_messageInfo_GetContainerRequest.DiscardUnknown(m) -} - -var xxx_messageInfo_GetContainerRequest proto.InternalMessageInfo - -func (m *GetContainerRequest) GetUuid() string { - if m != nil { - return m.Uuid - } - return "" -} - -type GetContainerReply struct { - DevList []string `protobuf:"bytes,1,rep,name=devList,proto3" json:"devList,omitempty"` - PodUID string `protobuf:"bytes,3,opt,name=podUID,proto3" json:"podUID,omitempty"` - CtrName string `protobuf:"bytes,4,opt,name=ctrName,proto3" json:"ctrName,omitempty"` - PodNamespace string `protobuf:"bytes,5,opt,name=podNamespace,proto3" json:"podNamespace,omitempty"` - PodName string `protobuf:"bytes,6,opt,name=podName,proto3" json:"podName,omitempty"` - XXX_NoUnkeyedLiteral struct{} `json:"-"` - XXX_unrecognized []byte `json:"-"` - XXX_sizecache int32 `json:"-"` -} - -func (m *GetContainerReply) Reset() { *m = GetContainerReply{} } -func (m *GetContainerReply) String() string { return proto.CompactTextString(m) } -func (*GetContainerReply) ProtoMessage() {} -func (*GetContainerReply) Descriptor() ([]byte, []int) { - return fileDescriptor_f726eb77a5b37099, []int{4} -} -func (m *GetContainerReply) XXX_Unmarshal(b []byte) error { - return m.Unmarshal(b) -} -func (m *GetContainerReply) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { - if deterministic { - return xxx_messageInfo_GetContainerReply.Marshal(b, m, deterministic) - } else { - b = b[:cap(b)] - n, err := m.MarshalToSizedBuffer(b) - if err != nil { - return nil, err - } - return b[:n], nil - } -} -func (m *GetContainerReply) XXX_Merge(src proto.Message) { - xxx_messageInfo_GetContainerReply.Merge(m, src) -} -func (m *GetContainerReply) XXX_Size() int { - return m.Size() -} -func (m *GetContainerReply) XXX_DiscardUnknown() { - xxx_messageInfo_GetContainerReply.DiscardUnknown(m) -} - -var xxx_messageInfo_GetContainerReply proto.InternalMessageInfo - -func (m *GetContainerReply) GetDevList() []string { - if m != nil { - return m.DevList - } - return 
nil -} - -func (m *GetContainerReply) GetPodUID() string { - if m != nil { - return m.PodUID - } - return "" -} - -func (m *GetContainerReply) GetCtrName() string { - if m != nil { - return m.CtrName - } - return "" -} - -func (m *GetContainerReply) GetPodNamespace() string { - if m != nil { - return m.PodNamespace - } - return "" -} - -func (m *GetContainerReply) GetPodName() string { - if m != nil { - return m.PodName - } - return "" -} - -func init() { - proto.RegisterType((*DeviceInfo)(nil), "api.DeviceInfo") - proto.RegisterType((*RegisterRequest)(nil), "api.RegisterRequest") - proto.RegisterType((*RegisterReply)(nil), "api.RegisterReply") - proto.RegisterType((*GetContainerRequest)(nil), "api.GetContainerRequest") - proto.RegisterType((*GetContainerReply)(nil), "api.GetContainerReply") -} - -func init() { proto.RegisterFile("pkg/api/device_register.proto", fileDescriptor_f726eb77a5b37099) } - -var fileDescriptor_f726eb77a5b37099 = []byte{ - // 357 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x6c, 0x92, 0xcf, 0x4a, 0xeb, 0x40, - 0x14, 0xc6, 0xef, 0x24, 0x6d, 0x6f, 0x7b, 0x6c, 0xad, 0x8e, 0xa5, 0x0c, 0x05, 0x43, 0xc8, 0x2a, - 0xdd, 0xb4, 0x50, 0xc1, 0xb5, 0x68, 0x41, 0x2a, 0x22, 0x32, 0xe2, 0x5a, 0x62, 0x32, 0xb6, 0x83, - 0x31, 0x33, 0x26, 0x13, 0x21, 0x8f, 0xe0, 0x1b, 0xb8, 0xf1, 0x7d, 0x5c, 0xfa, 0x08, 0x52, 0x5f, - 0x44, 0x32, 0x49, 0xac, 0x2d, 0x5d, 0xe5, 0xfc, 0xe6, 0x3b, 0xff, 0xbe, 0x43, 0xe0, 0x50, 0x3e, - 0xce, 0xc7, 0x9e, 0xe4, 0xe3, 0x80, 0xbd, 0x70, 0x9f, 0xdd, 0xc5, 0x6c, 0xce, 0x13, 0xc5, 0xe2, - 0x91, 0x8c, 0x85, 0x12, 0xd8, 0xf4, 0x24, 0x77, 0x2e, 0x00, 0xa6, 0x5a, 0x9d, 0x45, 0x0f, 0x02, - 0xef, 0x82, 0xc1, 0x03, 0x82, 0x6c, 0xe4, 0xb6, 0xa8, 0xc1, 0x03, 0xdc, 0x83, 0xba, 0x2f, 0xd2, - 0x48, 0x11, 0xc3, 0x46, 0x6e, 0x9d, 0x16, 0x80, 0xfb, 0xd0, 0x58, 0x30, 0x2f, 0x54, 0x0b, 0x62, - 0xda, 0xc8, 0x6d, 0xd2, 0x92, 0x9c, 0x6b, 0xe8, 0xd2, 0x72, 0x04, 0x65, 0xcf, 0x29, 0x4b, 0x14, - 0xc6, 
0x50, 0x8b, 0x44, 0xc0, 0xca, 0x96, 0x3a, 0xc6, 0x43, 0xf8, 0x5f, 0x2c, 0x94, 0x10, 0xc3, - 0x36, 0xdd, 0x9d, 0x49, 0x77, 0xe4, 0x49, 0x3e, 0x5a, 0xad, 0x41, 0x2b, 0xdd, 0xe9, 0x42, 0x67, - 0xd5, 0x51, 0x86, 0x99, 0x33, 0x84, 0x83, 0x73, 0xa6, 0xce, 0x44, 0xa4, 0x3c, 0x1e, 0xad, 0x8d, - 0x49, 0xd3, 0xdf, 0xcd, 0x75, 0xec, 0xbc, 0x23, 0xd8, 0x5f, 0xcf, 0x95, 0x61, 0x86, 0x89, 0x1e, - 0x7e, 0xc9, 0x13, 0x45, 0x90, 0x6d, 0xba, 0x2d, 0x5a, 0x61, 0xee, 0x4a, 0x8a, 0xe0, 0x76, 0x36, - 0xd5, 0xae, 0x5a, 0xb4, 0xa4, 0xbc, 0xc2, 0x57, 0xf1, 0x95, 0xf7, 0xc4, 0x48, 0x4d, 0x0b, 0x15, - 0x62, 0x07, 0xda, 0x52, 0x04, 0x79, 0x98, 0x48, 0xcf, 0x67, 0xa4, 0xae, 0xe5, 0xb5, 0xb7, 0xbc, - 0xba, 0x64, 0xd2, 0x28, 0xaa, 0x4b, 0x9c, 0xbc, 0x22, 0xe8, 0x14, 0x9e, 0x6f, 0x58, 0x9c, 0x7f, - 0xf0, 0x31, 0x34, 0x2b, 0xb7, 0xb8, 0xa7, 0x6f, 0xb2, 0x71, 0xce, 0x01, 0xde, 0x78, 0x95, 0x61, - 0xe6, 0x22, 0x7c, 0x02, 0xed, 0xbf, 0x46, 0x31, 0xd1, 0x59, 0x5b, 0xee, 0x34, 0xe8, 0x6f, 0x51, - 0x64, 0x98, 0x9d, 0xee, 0x7d, 0x2c, 0x2d, 0xf4, 0xb9, 0xb4, 0xd0, 0xd7, 0xd2, 0x42, 0x6f, 0xdf, - 0xd6, 0xbf, 0xfb, 0x86, 0xfe, 0x47, 0x8e, 0x7e, 0x02, 0x00, 0x00, 0xff, 0xff, 0x5d, 0xc5, 0x41, - 0x30, 0x44, 0x02, 0x00, 0x00, -} - -// Reference imports to suppress errors if they are not otherwise used. -var _ context.Context -var _ grpc.ClientConn - -// This is a compile-time assertion to ensure that this generated file -// is compatible with the grpc package it is being compiled against. -const _ = grpc.SupportPackageIsVersion4 - -// DeviceServiceClient is the client API for DeviceService service. -// -// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://godoc.org/google.golang.org/grpc#ClientConn.NewStream. 
-type DeviceServiceClient interface { - Register(ctx context.Context, opts ...grpc.CallOption) (DeviceService_RegisterClient, error) - GetContainer(ctx context.Context, in *GetContainerRequest, opts ...grpc.CallOption) (*GetContainerReply, error) -} - -type deviceServiceClient struct { - cc *grpc.ClientConn -} - -func NewDeviceServiceClient(cc *grpc.ClientConn) DeviceServiceClient { - return &deviceServiceClient{cc} -} - -func (c *deviceServiceClient) Register(ctx context.Context, opts ...grpc.CallOption) (DeviceService_RegisterClient, error) { - stream, err := c.cc.NewStream(ctx, &_DeviceService_serviceDesc.Streams[0], "/api.DeviceService/Register", opts...) - if err != nil { - return nil, err - } - x := &deviceServiceRegisterClient{stream} - return x, nil -} - -type DeviceService_RegisterClient interface { - Send(*RegisterRequest) error - CloseAndRecv() (*RegisterReply, error) - grpc.ClientStream -} - -type deviceServiceRegisterClient struct { - grpc.ClientStream -} - -func (x *deviceServiceRegisterClient) Send(m *RegisterRequest) error { - return x.ClientStream.SendMsg(m) -} - -func (x *deviceServiceRegisterClient) CloseAndRecv() (*RegisterReply, error) { - if err := x.ClientStream.CloseSend(); err != nil { - return nil, err - } - m := new(RegisterReply) - if err := x.ClientStream.RecvMsg(m); err != nil { - return nil, err - } - return m, nil -} - -func (c *deviceServiceClient) GetContainer(ctx context.Context, in *GetContainerRequest, opts ...grpc.CallOption) (*GetContainerReply, error) { - out := new(GetContainerReply) - err := c.cc.Invoke(ctx, "/api.DeviceService/GetContainer", in, out, opts...) - if err != nil { - return nil, err - } - return out, nil -} - -// DeviceServiceServer is the server API for DeviceService service. 
-type DeviceServiceServer interface { - Register(DeviceService_RegisterServer) error - GetContainer(context.Context, *GetContainerRequest) (*GetContainerReply, error) -} - -// UnimplementedDeviceServiceServer can be embedded to have forward compatible implementations. -type UnimplementedDeviceServiceServer struct { -} - -func (*UnimplementedDeviceServiceServer) Register(srv DeviceService_RegisterServer) error { - return status.Errorf(codes.Unimplemented, "method Register not implemented") -} -func (*UnimplementedDeviceServiceServer) GetContainer(ctx context.Context, req *GetContainerRequest) (*GetContainerReply, error) { - return nil, status.Errorf(codes.Unimplemented, "method GetContainer not implemented") -} - -func RegisterDeviceServiceServer(s *grpc.Server, srv DeviceServiceServer) { - s.RegisterService(&_DeviceService_serviceDesc, srv) -} - -func _DeviceService_Register_Handler(srv interface{}, stream grpc.ServerStream) error { - return srv.(DeviceServiceServer).Register(&deviceServiceRegisterServer{stream}) -} - -type DeviceService_RegisterServer interface { - SendAndClose(*RegisterReply) error - Recv() (*RegisterRequest, error) - grpc.ServerStream -} - -type deviceServiceRegisterServer struct { - grpc.ServerStream -} - -func (x *deviceServiceRegisterServer) SendAndClose(m *RegisterReply) error { - return x.ServerStream.SendMsg(m) -} - -func (x *deviceServiceRegisterServer) Recv() (*RegisterRequest, error) { - m := new(RegisterRequest) - if err := x.ServerStream.RecvMsg(m); err != nil { - return nil, err - } - return m, nil -} - -func _DeviceService_GetContainer_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { - in := new(GetContainerRequest) - if err := dec(in); err != nil { - return nil, err - } - if interceptor == nil { - return srv.(DeviceServiceServer).GetContainer(ctx, in) - } - info := &grpc.UnaryServerInfo{ - Server: srv, - FullMethod: 
"/api.DeviceService/GetContainer", - } - handler := func(ctx context.Context, req interface{}) (interface{}, error) { - return srv.(DeviceServiceServer).GetContainer(ctx, req.(*GetContainerRequest)) - } - return interceptor(ctx, in, info, handler) -} - -var _DeviceService_serviceDesc = grpc.ServiceDesc{ - ServiceName: "api.DeviceService", - HandlerType: (*DeviceServiceServer)(nil), - Methods: []grpc.MethodDesc{ - { - MethodName: "GetContainer", - Handler: _DeviceService_GetContainer_Handler, - }, - }, - Streams: []grpc.StreamDesc{ - { - StreamName: "Register", - Handler: _DeviceService_Register_Handler, - ClientStreams: true, - }, - }, - Metadata: "pkg/api/device_register.proto", -} - -func (m *DeviceInfo) Marshal() (dAtA []byte, err error) { - size := m.Size() - dAtA = make([]byte, size) - n, err := m.MarshalToSizedBuffer(dAtA[:size]) - if err != nil { - return nil, err - } - return dAtA[:n], nil -} - -func (m *DeviceInfo) MarshalTo(dAtA []byte) (int, error) { - size := m.Size() - return m.MarshalToSizedBuffer(dAtA[:size]) -} - -func (m *DeviceInfo) MarshalToSizedBuffer(dAtA []byte) (int, error) { - i := len(dAtA) - _ = i - var l int - _ = l - if m.XXX_unrecognized != nil { - i -= len(m.XXX_unrecognized) - copy(dAtA[i:], m.XXX_unrecognized) - } - if m.Health { - i-- - if m.Health { - dAtA[i] = 1 - } else { - dAtA[i] = 0 - } - i-- - dAtA[i] = 0x18 - } - if m.Count != 0 { - i = encodeVarintDeviceRegister(dAtA, i, uint64(m.Count)) - i-- - dAtA[i] = 0x10 - } - if len(m.Id) > 0 { - i -= len(m.Id) - copy(dAtA[i:], m.Id) - i = encodeVarintDeviceRegister(dAtA, i, uint64(len(m.Id))) - i-- - dAtA[i] = 0xa - } - return len(dAtA) - i, nil -} - -func (m *RegisterRequest) Marshal() (dAtA []byte, err error) { - size := m.Size() - dAtA = make([]byte, size) - n, err := m.MarshalToSizedBuffer(dAtA[:size]) - if err != nil { - return nil, err - } - return dAtA[:n], nil -} - -func (m *RegisterRequest) MarshalTo(dAtA []byte) (int, error) { - size := m.Size() - return 
m.MarshalToSizedBuffer(dAtA[:size]) -} - -func (m *RegisterRequest) MarshalToSizedBuffer(dAtA []byte) (int, error) { - i := len(dAtA) - _ = i - var l int - _ = l - if m.XXX_unrecognized != nil { - i -= len(m.XXX_unrecognized) - copy(dAtA[i:], m.XXX_unrecognized) - } - if len(m.Devices) > 0 { - for iNdEx := len(m.Devices) - 1; iNdEx >= 0; iNdEx-- { - { - size, err := m.Devices[iNdEx].MarshalToSizedBuffer(dAtA[:i]) - if err != nil { - return 0, err - } - i -= size - i = encodeVarintDeviceRegister(dAtA, i, uint64(size)) - } - i-- - dAtA[i] = 0x12 - } - } - if len(m.Node) > 0 { - i -= len(m.Node) - copy(dAtA[i:], m.Node) - i = encodeVarintDeviceRegister(dAtA, i, uint64(len(m.Node))) - i-- - dAtA[i] = 0xa - } - return len(dAtA) - i, nil -} - -func (m *RegisterReply) Marshal() (dAtA []byte, err error) { - size := m.Size() - dAtA = make([]byte, size) - n, err := m.MarshalToSizedBuffer(dAtA[:size]) - if err != nil { - return nil, err - } - return dAtA[:n], nil -} - -func (m *RegisterReply) MarshalTo(dAtA []byte) (int, error) { - size := m.Size() - return m.MarshalToSizedBuffer(dAtA[:size]) -} - -func (m *RegisterReply) MarshalToSizedBuffer(dAtA []byte) (int, error) { - i := len(dAtA) - _ = i - var l int - _ = l - if m.XXX_unrecognized != nil { - i -= len(m.XXX_unrecognized) - copy(dAtA[i:], m.XXX_unrecognized) - } - return len(dAtA) - i, nil -} - -func (m *GetContainerRequest) Marshal() (dAtA []byte, err error) { - size := m.Size() - dAtA = make([]byte, size) - n, err := m.MarshalToSizedBuffer(dAtA[:size]) - if err != nil { - return nil, err - } - return dAtA[:n], nil -} - -func (m *GetContainerRequest) MarshalTo(dAtA []byte) (int, error) { - size := m.Size() - return m.MarshalToSizedBuffer(dAtA[:size]) -} - -func (m *GetContainerRequest) MarshalToSizedBuffer(dAtA []byte) (int, error) { - i := len(dAtA) - _ = i - var l int - _ = l - if m.XXX_unrecognized != nil { - i -= len(m.XXX_unrecognized) - copy(dAtA[i:], m.XXX_unrecognized) - } - if len(m.Uuid) > 0 { - i -= 
len(m.Uuid) - copy(dAtA[i:], m.Uuid) - i = encodeVarintDeviceRegister(dAtA, i, uint64(len(m.Uuid))) - i-- - dAtA[i] = 0xa - } - return len(dAtA) - i, nil -} - -func (m *GetContainerReply) Marshal() (dAtA []byte, err error) { - size := m.Size() - dAtA = make([]byte, size) - n, err := m.MarshalToSizedBuffer(dAtA[:size]) - if err != nil { - return nil, err - } - return dAtA[:n], nil -} - -func (m *GetContainerReply) MarshalTo(dAtA []byte) (int, error) { - size := m.Size() - return m.MarshalToSizedBuffer(dAtA[:size]) -} - -func (m *GetContainerReply) MarshalToSizedBuffer(dAtA []byte) (int, error) { - i := len(dAtA) - _ = i - var l int - _ = l - if m.XXX_unrecognized != nil { - i -= len(m.XXX_unrecognized) - copy(dAtA[i:], m.XXX_unrecognized) - } - if len(m.PodName) > 0 { - i -= len(m.PodName) - copy(dAtA[i:], m.PodName) - i = encodeVarintDeviceRegister(dAtA, i, uint64(len(m.PodName))) - i-- - dAtA[i] = 0x32 - } - if len(m.PodNamespace) > 0 { - i -= len(m.PodNamespace) - copy(dAtA[i:], m.PodNamespace) - i = encodeVarintDeviceRegister(dAtA, i, uint64(len(m.PodNamespace))) - i-- - dAtA[i] = 0x2a - } - if len(m.CtrName) > 0 { - i -= len(m.CtrName) - copy(dAtA[i:], m.CtrName) - i = encodeVarintDeviceRegister(dAtA, i, uint64(len(m.CtrName))) - i-- - dAtA[i] = 0x22 - } - if len(m.PodUID) > 0 { - i -= len(m.PodUID) - copy(dAtA[i:], m.PodUID) - i = encodeVarintDeviceRegister(dAtA, i, uint64(len(m.PodUID))) - i-- - dAtA[i] = 0x1a - } - if len(m.DevList) > 0 { - for iNdEx := len(m.DevList) - 1; iNdEx >= 0; iNdEx-- { - i -= len(m.DevList[iNdEx]) - copy(dAtA[i:], m.DevList[iNdEx]) - i = encodeVarintDeviceRegister(dAtA, i, uint64(len(m.DevList[iNdEx]))) - i-- - dAtA[i] = 0xa - } - } - return len(dAtA) - i, nil -} - -func encodeVarintDeviceRegister(dAtA []byte, offset int, v uint64) int { - offset -= sovDeviceRegister(v) - base := offset - for v >= 1<<7 { - dAtA[offset] = uint8(v&0x7f | 0x80) - v >>= 7 - offset++ - } - dAtA[offset] = uint8(v) - return base -} -func (m *DeviceInfo) 
Size() (n int) { - if m == nil { - return 0 - } - var l int - _ = l - l = len(m.Id) - if l > 0 { - n += 1 + l + sovDeviceRegister(uint64(l)) - } - if m.Count != 0 { - n += 1 + sovDeviceRegister(uint64(m.Count)) - } - if m.Health { - n += 2 - } - if m.XXX_unrecognized != nil { - n += len(m.XXX_unrecognized) - } - return n -} - -func (m *RegisterRequest) Size() (n int) { - if m == nil { - return 0 - } - var l int - _ = l - l = len(m.Node) - if l > 0 { - n += 1 + l + sovDeviceRegister(uint64(l)) - } - if len(m.Devices) > 0 { - for _, e := range m.Devices { - l = e.Size() - n += 1 + l + sovDeviceRegister(uint64(l)) - } - } - if m.XXX_unrecognized != nil { - n += len(m.XXX_unrecognized) - } - return n -} - -func (m *RegisterReply) Size() (n int) { - if m == nil { - return 0 - } - var l int - _ = l - if m.XXX_unrecognized != nil { - n += len(m.XXX_unrecognized) - } - return n -} - -func (m *GetContainerRequest) Size() (n int) { - if m == nil { - return 0 - } - var l int - _ = l - l = len(m.Uuid) - if l > 0 { - n += 1 + l + sovDeviceRegister(uint64(l)) - } - if m.XXX_unrecognized != nil { - n += len(m.XXX_unrecognized) - } - return n -} - -func (m *GetContainerReply) Size() (n int) { - if m == nil { - return 0 - } - var l int - _ = l - if len(m.DevList) > 0 { - for _, s := range m.DevList { - l = len(s) - n += 1 + l + sovDeviceRegister(uint64(l)) - } - } - l = len(m.PodUID) - if l > 0 { - n += 1 + l + sovDeviceRegister(uint64(l)) - } - l = len(m.CtrName) - if l > 0 { - n += 1 + l + sovDeviceRegister(uint64(l)) - } - l = len(m.PodNamespace) - if l > 0 { - n += 1 + l + sovDeviceRegister(uint64(l)) - } - l = len(m.PodName) - if l > 0 { - n += 1 + l + sovDeviceRegister(uint64(l)) - } - if m.XXX_unrecognized != nil { - n += len(m.XXX_unrecognized) - } - return n -} - -func sovDeviceRegister(x uint64) (n int) { - return (math_bits.Len64(x|1) + 6) / 7 -} -func sozDeviceRegister(x uint64) (n int) { - return sovDeviceRegister(uint64((x << 1) ^ uint64((int64(x) >> 63)))) -} -func 
(m *DeviceInfo) Unmarshal(dAtA []byte) error { - l := len(dAtA) - iNdEx := 0 - for iNdEx < l { - preIndex := iNdEx - var wire uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowDeviceRegister - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - wire |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - fieldNum := int32(wire >> 3) - wireType := int(wire & 0x7) - if wireType == 4 { - return fmt.Errorf("proto: DeviceInfo: wiretype end group for non-group") - } - if fieldNum <= 0 { - return fmt.Errorf("proto: DeviceInfo: illegal tag %d (wire type %d)", fieldNum, wire) - } - switch fieldNum { - case 1: - if wireType != 2 { - return fmt.Errorf("proto: wrong wireType = %d for field Id", wireType) - } - var stringLen uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowDeviceRegister - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - stringLen |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - intStringLen := int(stringLen) - if intStringLen < 0 { - return ErrInvalidLengthDeviceRegister - } - postIndex := iNdEx + intStringLen - if postIndex < 0 { - return ErrInvalidLengthDeviceRegister - } - if postIndex > l { - return io.ErrUnexpectedEOF - } - m.Id = string(dAtA[iNdEx:postIndex]) - iNdEx = postIndex - case 2: - if wireType != 0 { - return fmt.Errorf("proto: wrong wireType = %d for field Count", wireType) - } - m.Count = 0 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowDeviceRegister - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - m.Count |= int32(b&0x7F) << shift - if b < 0x80 { - break - } - } - case 3: - if wireType != 0 { - return fmt.Errorf("proto: wrong wireType = %d for field Health", wireType) - } - var v int - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowDeviceRegister - } - if iNdEx >= l { - return 
io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - v |= int(b&0x7F) << shift - if b < 0x80 { - break - } - } - m.Health = bool(v != 0) - default: - iNdEx = preIndex - skippy, err := skipDeviceRegister(dAtA[iNdEx:]) - if err != nil { - return err - } - if (skippy < 0) || (iNdEx+skippy) < 0 { - return ErrInvalidLengthDeviceRegister - } - if (iNdEx + skippy) > l { - return io.ErrUnexpectedEOF - } - m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) - iNdEx += skippy - } - } - - if iNdEx > l { - return io.ErrUnexpectedEOF - } - return nil -} -func (m *RegisterRequest) Unmarshal(dAtA []byte) error { - l := len(dAtA) - iNdEx := 0 - for iNdEx < l { - preIndex := iNdEx - var wire uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowDeviceRegister - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - wire |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - fieldNum := int32(wire >> 3) - wireType := int(wire & 0x7) - if wireType == 4 { - return fmt.Errorf("proto: RegisterRequest: wiretype end group for non-group") - } - if fieldNum <= 0 { - return fmt.Errorf("proto: RegisterRequest: illegal tag %d (wire type %d)", fieldNum, wire) - } - switch fieldNum { - case 1: - if wireType != 2 { - return fmt.Errorf("proto: wrong wireType = %d for field Node", wireType) - } - var stringLen uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowDeviceRegister - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - stringLen |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - intStringLen := int(stringLen) - if intStringLen < 0 { - return ErrInvalidLengthDeviceRegister - } - postIndex := iNdEx + intStringLen - if postIndex < 0 { - return ErrInvalidLengthDeviceRegister - } - if postIndex > l { - return io.ErrUnexpectedEOF - } - m.Node = string(dAtA[iNdEx:postIndex]) - iNdEx = postIndex - case 2: - if wireType != 2 { 
- return fmt.Errorf("proto: wrong wireType = %d for field Devices", wireType) - } - var msglen int - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowDeviceRegister - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - msglen |= int(b&0x7F) << shift - if b < 0x80 { - break - } - } - if msglen < 0 { - return ErrInvalidLengthDeviceRegister - } - postIndex := iNdEx + msglen - if postIndex < 0 { - return ErrInvalidLengthDeviceRegister - } - if postIndex > l { - return io.ErrUnexpectedEOF - } - m.Devices = append(m.Devices, &DeviceInfo{}) - if err := m.Devices[len(m.Devices)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { - return err - } - iNdEx = postIndex - default: - iNdEx = preIndex - skippy, err := skipDeviceRegister(dAtA[iNdEx:]) - if err != nil { - return err - } - if (skippy < 0) || (iNdEx+skippy) < 0 { - return ErrInvalidLengthDeviceRegister - } - if (iNdEx + skippy) > l { - return io.ErrUnexpectedEOF - } - m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) 
- iNdEx += skippy - } - } - - if iNdEx > l { - return io.ErrUnexpectedEOF - } - return nil -} -func (m *RegisterReply) Unmarshal(dAtA []byte) error { - l := len(dAtA) - iNdEx := 0 - for iNdEx < l { - preIndex := iNdEx - var wire uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowDeviceRegister - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - wire |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - fieldNum := int32(wire >> 3) - wireType := int(wire & 0x7) - if wireType == 4 { - return fmt.Errorf("proto: RegisterReply: wiretype end group for non-group") - } - if fieldNum <= 0 { - return fmt.Errorf("proto: RegisterReply: illegal tag %d (wire type %d)", fieldNum, wire) - } - switch fieldNum { - default: - iNdEx = preIndex - skippy, err := skipDeviceRegister(dAtA[iNdEx:]) - if err != nil { - return err - } - if (skippy < 0) || (iNdEx+skippy) < 0 { - return ErrInvalidLengthDeviceRegister - } - if (iNdEx + skippy) > l { - return io.ErrUnexpectedEOF - } - m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) 
- iNdEx += skippy - } - } - - if iNdEx > l { - return io.ErrUnexpectedEOF - } - return nil -} -func (m *GetContainerRequest) Unmarshal(dAtA []byte) error { - l := len(dAtA) - iNdEx := 0 - for iNdEx < l { - preIndex := iNdEx - var wire uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowDeviceRegister - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - wire |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - fieldNum := int32(wire >> 3) - wireType := int(wire & 0x7) - if wireType == 4 { - return fmt.Errorf("proto: GetContainerRequest: wiretype end group for non-group") - } - if fieldNum <= 0 { - return fmt.Errorf("proto: GetContainerRequest: illegal tag %d (wire type %d)", fieldNum, wire) - } - switch fieldNum { - case 1: - if wireType != 2 { - return fmt.Errorf("proto: wrong wireType = %d for field Uuid", wireType) - } - var stringLen uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowDeviceRegister - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - stringLen |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - intStringLen := int(stringLen) - if intStringLen < 0 { - return ErrInvalidLengthDeviceRegister - } - postIndex := iNdEx + intStringLen - if postIndex < 0 { - return ErrInvalidLengthDeviceRegister - } - if postIndex > l { - return io.ErrUnexpectedEOF - } - m.Uuid = string(dAtA[iNdEx:postIndex]) - iNdEx = postIndex - default: - iNdEx = preIndex - skippy, err := skipDeviceRegister(dAtA[iNdEx:]) - if err != nil { - return err - } - if (skippy < 0) || (iNdEx+skippy) < 0 { - return ErrInvalidLengthDeviceRegister - } - if (iNdEx + skippy) > l { - return io.ErrUnexpectedEOF - } - m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) 
- iNdEx += skippy - } - } - - if iNdEx > l { - return io.ErrUnexpectedEOF - } - return nil -} -func (m *GetContainerReply) Unmarshal(dAtA []byte) error { - l := len(dAtA) - iNdEx := 0 - for iNdEx < l { - preIndex := iNdEx - var wire uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowDeviceRegister - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - wire |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - fieldNum := int32(wire >> 3) - wireType := int(wire & 0x7) - if wireType == 4 { - return fmt.Errorf("proto: GetContainerReply: wiretype end group for non-group") - } - if fieldNum <= 0 { - return fmt.Errorf("proto: GetContainerReply: illegal tag %d (wire type %d)", fieldNum, wire) - } - switch fieldNum { - case 1: - if wireType != 2 { - return fmt.Errorf("proto: wrong wireType = %d for field DevList", wireType) - } - var stringLen uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowDeviceRegister - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - stringLen |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - intStringLen := int(stringLen) - if intStringLen < 0 { - return ErrInvalidLengthDeviceRegister - } - postIndex := iNdEx + intStringLen - if postIndex < 0 { - return ErrInvalidLengthDeviceRegister - } - if postIndex > l { - return io.ErrUnexpectedEOF - } - m.DevList = append(m.DevList, string(dAtA[iNdEx:postIndex])) - iNdEx = postIndex - case 3: - if wireType != 2 { - return fmt.Errorf("proto: wrong wireType = %d for field PodUID", wireType) - } - var stringLen uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowDeviceRegister - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - stringLen |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - intStringLen := int(stringLen) - if intStringLen < 0 { - return 
ErrInvalidLengthDeviceRegister - } - postIndex := iNdEx + intStringLen - if postIndex < 0 { - return ErrInvalidLengthDeviceRegister - } - if postIndex > l { - return io.ErrUnexpectedEOF - } - m.PodUID = string(dAtA[iNdEx:postIndex]) - iNdEx = postIndex - case 4: - if wireType != 2 { - return fmt.Errorf("proto: wrong wireType = %d for field CtrName", wireType) - } - var stringLen uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowDeviceRegister - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - stringLen |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - intStringLen := int(stringLen) - if intStringLen < 0 { - return ErrInvalidLengthDeviceRegister - } - postIndex := iNdEx + intStringLen - if postIndex < 0 { - return ErrInvalidLengthDeviceRegister - } - if postIndex > l { - return io.ErrUnexpectedEOF - } - m.CtrName = string(dAtA[iNdEx:postIndex]) - iNdEx = postIndex - case 5: - if wireType != 2 { - return fmt.Errorf("proto: wrong wireType = %d for field PodNamespace", wireType) - } - var stringLen uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowDeviceRegister - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - stringLen |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - intStringLen := int(stringLen) - if intStringLen < 0 { - return ErrInvalidLengthDeviceRegister - } - postIndex := iNdEx + intStringLen - if postIndex < 0 { - return ErrInvalidLengthDeviceRegister - } - if postIndex > l { - return io.ErrUnexpectedEOF - } - m.PodNamespace = string(dAtA[iNdEx:postIndex]) - iNdEx = postIndex - case 6: - if wireType != 2 { - return fmt.Errorf("proto: wrong wireType = %d for field PodName", wireType) - } - var stringLen uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowDeviceRegister - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - 
stringLen |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - intStringLen := int(stringLen) - if intStringLen < 0 { - return ErrInvalidLengthDeviceRegister - } - postIndex := iNdEx + intStringLen - if postIndex < 0 { - return ErrInvalidLengthDeviceRegister - } - if postIndex > l { - return io.ErrUnexpectedEOF - } - m.PodName = string(dAtA[iNdEx:postIndex]) - iNdEx = postIndex - default: - iNdEx = preIndex - skippy, err := skipDeviceRegister(dAtA[iNdEx:]) - if err != nil { - return err - } - if (skippy < 0) || (iNdEx+skippy) < 0 { - return ErrInvalidLengthDeviceRegister - } - if (iNdEx + skippy) > l { - return io.ErrUnexpectedEOF - } - m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) - iNdEx += skippy - } - } - - if iNdEx > l { - return io.ErrUnexpectedEOF - } - return nil -} -func skipDeviceRegister(dAtA []byte) (n int, err error) { - l := len(dAtA) - iNdEx := 0 - depth := 0 - for iNdEx < l { - var wire uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return 0, ErrIntOverflowDeviceRegister - } - if iNdEx >= l { - return 0, io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - wire |= (uint64(b) & 0x7F) << shift - if b < 0x80 { - break - } - } - wireType := int(wire & 0x7) - switch wireType { - case 0: - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return 0, ErrIntOverflowDeviceRegister - } - if iNdEx >= l { - return 0, io.ErrUnexpectedEOF - } - iNdEx++ - if dAtA[iNdEx-1] < 0x80 { - break - } - } - case 1: - iNdEx += 8 - case 2: - var length int - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return 0, ErrIntOverflowDeviceRegister - } - if iNdEx >= l { - return 0, io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - length |= (int(b) & 0x7F) << shift - if b < 0x80 { - break - } - } - if length < 0 { - return 0, ErrInvalidLengthDeviceRegister - } - iNdEx += length - case 3: - depth++ - case 4: - if depth == 0 { - return 0, ErrUnexpectedEndOfGroupDeviceRegister - } - depth-- - case 5: 
- iNdEx += 4 - default: - return 0, fmt.Errorf("proto: illegal wireType %d", wireType) - } - if iNdEx < 0 { - return 0, ErrInvalidLengthDeviceRegister - } - if depth == 0 { - return iNdEx, nil - } - } - return 0, io.ErrUnexpectedEOF -} - -var ( - ErrInvalidLengthDeviceRegister = fmt.Errorf("proto: negative length found during unmarshaling") - ErrIntOverflowDeviceRegister = fmt.Errorf("proto: integer overflow") - ErrUnexpectedEndOfGroupDeviceRegister = fmt.Errorf("proto: unexpected end of group") -) diff --git a/pkg/api/device_register.proto b/pkg/api/device_register.proto deleted file mode 100644 index 5846183a4..000000000 --- a/pkg/api/device_register.proto +++ /dev/null @@ -1,34 +0,0 @@ -syntax = "proto3"; -package api; - -service DeviceService { - rpc Register (stream RegisterRequest) returns (RegisterReply); - - rpc GetContainer(GetContainerRequest) returns (GetContainerReply); -} - -message DeviceInfo { - string id = 1; - int32 count = 2; - bool health = 3; -} - -message RegisterRequest { - string node = 1; - repeated DeviceInfo devices = 2; -} - -message RegisterReply { -} - -message GetContainerRequest { - string uuid = 1; -} - -message GetContainerReply { - repeated string devList = 1; - string podUID = 3; - string ctrName = 4; - string podNamespace = 5; - string podName = 6; -} \ No newline at end of file diff --git a/pkg/api/go.mod b/pkg/api/go.mod deleted file mode 100644 index 0bd493ad8..000000000 --- a/pkg/api/go.mod +++ /dev/null @@ -1,8 +0,0 @@ -module 4pd.io/k8s-vgpu/pkg/api - -go 1.16 - -require ( - github.com/golang/protobuf v1.5.2 - google.golang.org/grpc v1.39.0 -) diff --git a/pkg/api/go.sum b/pkg/api/go.sum deleted file mode 100644 index f3cfe9f9d..000000000 --- a/pkg/api/go.sum +++ /dev/null @@ -1,114 +0,0 @@ -cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= -cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= -github.com/BurntSushi/toml v0.3.1/go.mod 
h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= -github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= -github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= -github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= -github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= -github.com/cncf/udpa/go v0.0.0-20201120205902-5459f2c99403/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= -github.com/cncf/xds/go v0.0.0-20210312221358-fbca930ec8ed/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= -github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= -github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= -github.com/envoyproxy/go-control-plane v0.9.9-0.20201210154907-fd9021fe5dad/go.mod h1:cXg6YxExXjJnVBQHBLXeUAgxn2UodCpnH306RInaBQk= -github.com/envoyproxy/go-control-plane v0.9.9-0.20210512163311-63b5d3c536b0/go.mod h1:hliV/p42l8fGbc6Y9bQ70uLwIvmJyVE5k4iMKlh8wCQ= -github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= -github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= -github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= -github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= -github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.3/go.mod 
h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= -github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= -github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= -github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= -github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= -github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= -github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= -github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= -github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= -github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= -github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw= -github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= -github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= -github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= -github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= -github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU= -github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= -github.com/pmezard/go-difflib v1.0.0/go.mod 
h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= -go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= -golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= -golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= -golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20200822124328-c89045814202 h1:VvcQYSHwXgi7W+TpUR6A9g6Up98WAHf3f/ulnJ62IyA= -golang.org/x/net 
v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= -golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= -golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= -golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd h1:xhmwyvizuTgC2qz7ZlMluP20uW+C3Rm0FD/WLDX8884= -golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= -golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= -golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 
-golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= -golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= -google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= -google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= -google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= -google.golang.org/genproto v0.0.0-20200513103714-09dca8ec2884/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013 h1:+kGHl1aib/qcwaRi1CbqBZ1rk19r85MNUf8HaBghugY= -google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= -google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= -google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= -google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= -google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= -google.golang.org/grpc v1.33.1/go.mod h1:fr5YgcSWrqhRRxogOsw7RzIpsmvOZ6IcH4kBYTpR3n0= -google.golang.org/grpc v1.36.0/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU= -google.golang.org/grpc v1.39.0 h1:Klz8I9kdtkIN6EpHHUOMLCYhTn/2WAe5a0s1hcBkdTI= -google.golang.org/grpc v1.39.0/go.mod h1:PImNr+rS9TWYb2O4/emRugxiyHZ5JyHW5F+RPnDzfrE= -google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= -google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= -google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod 
h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= -google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= -google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= -google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= -google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= -google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= -google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= -google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= -google.golang.org/protobuf v1.26.0 h1:bxAC2xTBsZGibn2RTntX0oH50xLsqy1OxA9tTL3p/lk= -google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.2.3/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= -honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= diff --git a/pkg/api/runtime.pb.go b/pkg/api/runtime.pb.go deleted file mode 100644 index f46b44e9a..000000000 --- a/pkg/api/runtime.pb.go +++ /dev/null @@ -1,915 +0,0 @@ -// Code generated by protoc-gen-gogo. DO NOT EDIT. 
-// source: pkg/api/runtime.proto - -package api - -import ( - context "context" - fmt "fmt" - proto "github.com/golang/protobuf/proto" - grpc "google.golang.org/grpc" - codes "google.golang.org/grpc/codes" - status "google.golang.org/grpc/status" - io "io" - math "math" - math_bits "math/bits" -) - -// Reference imports to suppress errors if they are not otherwise used. -var _ = proto.Marshal -var _ = fmt.Errorf -var _ = math.Inf - -// This is a compile-time assertion to ensure that this generated file -// is compatible with the proto package it is being compiled against. -// A compilation error at this line likely means your copy of the -// proto package needs to be updated. -const _ = proto.ProtoPackageIsVersion3 // please upgrade the proto package - -type GetDeviceRequest struct { - CtrUUID string `protobuf:"bytes,1,opt,name=ctrUUID,proto3" json:"ctrUUID,omitempty"` - XXX_NoUnkeyedLiteral struct{} `json:"-"` - XXX_unrecognized []byte `json:"-"` - XXX_sizecache int32 `json:"-"` -} - -func (m *GetDeviceRequest) Reset() { *m = GetDeviceRequest{} } -func (m *GetDeviceRequest) String() string { return proto.CompactTextString(m) } -func (*GetDeviceRequest) ProtoMessage() {} -func (*GetDeviceRequest) Descriptor() ([]byte, []int) { - return fileDescriptor_1e1a7998f4db04c8, []int{0} -} -func (m *GetDeviceRequest) XXX_Unmarshal(b []byte) error { - return m.Unmarshal(b) -} -func (m *GetDeviceRequest) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { - if deterministic { - return xxx_messageInfo_GetDeviceRequest.Marshal(b, m, deterministic) - } else { - b = b[:cap(b)] - n, err := m.MarshalToSizedBuffer(b) - if err != nil { - return nil, err - } - return b[:n], nil - } -} -func (m *GetDeviceRequest) XXX_Merge(src proto.Message) { - xxx_messageInfo_GetDeviceRequest.Merge(m, src) -} -func (m *GetDeviceRequest) XXX_Size() int { - return m.Size() -} -func (m *GetDeviceRequest) XXX_DiscardUnknown() { - xxx_messageInfo_GetDeviceRequest.DiscardUnknown(m) -} - -var 
xxx_messageInfo_GetDeviceRequest proto.InternalMessageInfo - -func (m *GetDeviceRequest) GetCtrUUID() string { - if m != nil { - return m.CtrUUID - } - return "" -} - -type GetDeviceReply struct { - Envs map[string]string `protobuf:"bytes,1,rep,name=envs,proto3" json:"envs,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` - PodUID string `protobuf:"bytes,3,opt,name=podUID,proto3" json:"podUID,omitempty"` - CtrName string `protobuf:"bytes,4,opt,name=ctrName,proto3" json:"ctrName,omitempty"` - PodNamespace string `protobuf:"bytes,5,opt,name=podNamespace,proto3" json:"podNamespace,omitempty"` - PodName string `protobuf:"bytes,6,opt,name=podName,proto3" json:"podName,omitempty"` - XXX_NoUnkeyedLiteral struct{} `json:"-"` - XXX_unrecognized []byte `json:"-"` - XXX_sizecache int32 `json:"-"` -} - -func (m *GetDeviceReply) Reset() { *m = GetDeviceReply{} } -func (m *GetDeviceReply) String() string { return proto.CompactTextString(m) } -func (*GetDeviceReply) ProtoMessage() {} -func (*GetDeviceReply) Descriptor() ([]byte, []int) { - return fileDescriptor_1e1a7998f4db04c8, []int{1} -} -func (m *GetDeviceReply) XXX_Unmarshal(b []byte) error { - return m.Unmarshal(b) -} -func (m *GetDeviceReply) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { - if deterministic { - return xxx_messageInfo_GetDeviceReply.Marshal(b, m, deterministic) - } else { - b = b[:cap(b)] - n, err := m.MarshalToSizedBuffer(b) - if err != nil { - return nil, err - } - return b[:n], nil - } -} -func (m *GetDeviceReply) XXX_Merge(src proto.Message) { - xxx_messageInfo_GetDeviceReply.Merge(m, src) -} -func (m *GetDeviceReply) XXX_Size() int { - return m.Size() -} -func (m *GetDeviceReply) XXX_DiscardUnknown() { - xxx_messageInfo_GetDeviceReply.DiscardUnknown(m) -} - -var xxx_messageInfo_GetDeviceReply proto.InternalMessageInfo - -func (m *GetDeviceReply) GetEnvs() map[string]string { - if m != nil { - return m.Envs - } - return nil -} - -func (m 
*GetDeviceReply) GetPodUID() string { - if m != nil { - return m.PodUID - } - return "" -} - -func (m *GetDeviceReply) GetCtrName() string { - if m != nil { - return m.CtrName - } - return "" -} - -func (m *GetDeviceReply) GetPodNamespace() string { - if m != nil { - return m.PodNamespace - } - return "" -} - -func (m *GetDeviceReply) GetPodName() string { - if m != nil { - return m.PodName - } - return "" -} - -func init() { - proto.RegisterType((*GetDeviceRequest)(nil), "api.GetDeviceRequest") - proto.RegisterType((*GetDeviceReply)(nil), "api.GetDeviceReply") - proto.RegisterMapType((map[string]string)(nil), "api.GetDeviceReply.EnvsEntry") -} - -func init() { proto.RegisterFile("pkg/api/runtime.proto", fileDescriptor_1e1a7998f4db04c8) } - -var fileDescriptor_1e1a7998f4db04c8 = []byte{ - // 290 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0x12, 0x2d, 0xc8, 0x4e, 0xd7, - 0x4f, 0x2c, 0xc8, 0xd4, 0x2f, 0x2a, 0xcd, 0x2b, 0xc9, 0xcc, 0x4d, 0xd5, 0x2b, 0x28, 0xca, 0x2f, - 0xc9, 0x17, 0x62, 0x4e, 0x2c, 0xc8, 0x54, 0xd2, 0xe1, 0x12, 0x70, 0x4f, 0x2d, 0x71, 0x49, 0x2d, - 0xcb, 0x4c, 0x4e, 0x0d, 0x4a, 0x2d, 0x2c, 0x4d, 0x2d, 0x2e, 0x11, 0x92, 0xe0, 0x62, 0x4f, 0x2e, - 0x29, 0x0a, 0x0d, 0xf5, 0x74, 0x91, 0x60, 0x54, 0x60, 0xd4, 0xe0, 0x0c, 0x82, 0x71, 0x95, 0xde, - 0x30, 0x72, 0xf1, 0x21, 0x29, 0x2f, 0xc8, 0xa9, 0x14, 0x32, 0xe4, 0x62, 0x49, 0xcd, 0x2b, 0x2b, - 0x96, 0x60, 0x54, 0x60, 0xd6, 0xe0, 0x36, 0x92, 0xd5, 0x4b, 0x2c, 0xc8, 0xd4, 0x43, 0x55, 0xa2, - 0xe7, 0x9a, 0x57, 0x56, 0xec, 0x9a, 0x57, 0x52, 0x54, 0x19, 0x04, 0x56, 0x2a, 0x24, 0xc6, 0xc5, - 0x56, 0x90, 0x9f, 0x02, 0x32, 0x9e, 0x19, 0x6c, 0x3c, 0x94, 0x07, 0xb5, 0xd7, 0x2f, 0x31, 0x37, - 0x55, 0x82, 0x05, 0x6e, 0x2f, 0x88, 0x2b, 0xa4, 0xc4, 0xc5, 0x53, 0x90, 0x9f, 0x02, 0x62, 0x16, - 0x17, 0x24, 0x26, 0xa7, 0x4a, 0xb0, 0x82, 0xa5, 0x51, 0xc4, 0x40, 0xba, 0xa1, 0x7c, 0x09, 0x36, - 0x88, 0x6e, 0x28, 0x57, 0xca, 0x9c, 0x8b, 0x13, 0xee, 0x04, 0x21, 0x01, 
0x2e, 0xe6, 0xec, 0xd4, - 0x4a, 0xa8, 0xc7, 0x40, 0x4c, 0x21, 0x11, 0x2e, 0xd6, 0xb2, 0xc4, 0x9c, 0xd2, 0x54, 0x09, 0x26, - 0xb0, 0x18, 0x84, 0x63, 0xc5, 0x64, 0xc1, 0x68, 0xe4, 0xcf, 0x25, 0x14, 0xe6, 0x1e, 0x10, 0x1a, - 0x04, 0x09, 0xb6, 0xe0, 0xd4, 0x22, 0x90, 0x9f, 0x84, 0x2c, 0xb9, 0x38, 0xe1, 0x1e, 0x14, 0x12, - 0x45, 0xf7, 0x30, 0x38, 0x08, 0xa5, 0x84, 0xb1, 0x84, 0x83, 0x12, 0x83, 0x93, 0xc0, 0x89, 0x47, - 0x72, 0x8c, 0x17, 0x1e, 0xc9, 0x31, 0x3e, 0x78, 0x24, 0xc7, 0x38, 0xe3, 0xb1, 0x1c, 0x43, 0x12, - 0x1b, 0x38, 0x2e, 0x8c, 0x01, 0x01, 0x00, 0x00, 0xff, 0xff, 0xee, 0xa1, 0xc2, 0xa4, 0xa4, 0x01, - 0x00, 0x00, -} - -// Reference imports to suppress errors if they are not otherwise used. -var _ context.Context -var _ grpc.ClientConn - -// This is a compile-time assertion to ensure that this generated file -// is compatible with the grpc package it is being compiled against. -const _ = grpc.SupportPackageIsVersion4 - -// VGPURuntimeServiceClient is the client API for VGPURuntimeService service. -// -// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://godoc.org/google.golang.org/grpc#ClientConn.NewStream. -type VGPURuntimeServiceClient interface { - GetDevice(ctx context.Context, in *GetDeviceRequest, opts ...grpc.CallOption) (*GetDeviceReply, error) -} - -type vGPURuntimeServiceClient struct { - cc *grpc.ClientConn -} - -func NewVGPURuntimeServiceClient(cc *grpc.ClientConn) VGPURuntimeServiceClient { - return &vGPURuntimeServiceClient{cc} -} - -func (c *vGPURuntimeServiceClient) GetDevice(ctx context.Context, in *GetDeviceRequest, opts ...grpc.CallOption) (*GetDeviceReply, error) { - out := new(GetDeviceReply) - err := c.cc.Invoke(ctx, "/api.VGPURuntimeService/GetDevice", in, out, opts...) - if err != nil { - return nil, err - } - return out, nil -} - -// VGPURuntimeServiceServer is the server API for VGPURuntimeService service. 
-type VGPURuntimeServiceServer interface { - GetDevice(context.Context, *GetDeviceRequest) (*GetDeviceReply, error) -} - -// UnimplementedVGPURuntimeServiceServer can be embedded to have forward compatible implementations. -type UnimplementedVGPURuntimeServiceServer struct { -} - -func (*UnimplementedVGPURuntimeServiceServer) GetDevice(ctx context.Context, req *GetDeviceRequest) (*GetDeviceReply, error) { - return nil, status.Errorf(codes.Unimplemented, "method GetDevice not implemented") -} - -func RegisterVGPURuntimeServiceServer(s *grpc.Server, srv VGPURuntimeServiceServer) { - s.RegisterService(&_VGPURuntimeService_serviceDesc, srv) -} - -func _VGPURuntimeService_GetDevice_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { - in := new(GetDeviceRequest) - if err := dec(in); err != nil { - return nil, err - } - if interceptor == nil { - return srv.(VGPURuntimeServiceServer).GetDevice(ctx, in) - } - info := &grpc.UnaryServerInfo{ - Server: srv, - FullMethod: "/api.VGPURuntimeService/GetDevice", - } - handler := func(ctx context.Context, req interface{}) (interface{}, error) { - return srv.(VGPURuntimeServiceServer).GetDevice(ctx, req.(*GetDeviceRequest)) - } - return interceptor(ctx, in, info, handler) -} - -var _VGPURuntimeService_serviceDesc = grpc.ServiceDesc{ - ServiceName: "api.VGPURuntimeService", - HandlerType: (*VGPURuntimeServiceServer)(nil), - Methods: []grpc.MethodDesc{ - { - MethodName: "GetDevice", - Handler: _VGPURuntimeService_GetDevice_Handler, - }, - }, - Streams: []grpc.StreamDesc{}, - Metadata: "pkg/api/runtime.proto", -} - -func (m *GetDeviceRequest) Marshal() (dAtA []byte, err error) { - size := m.Size() - dAtA = make([]byte, size) - n, err := m.MarshalToSizedBuffer(dAtA[:size]) - if err != nil { - return nil, err - } - return dAtA[:n], nil -} - -func (m *GetDeviceRequest) MarshalTo(dAtA []byte) (int, error) { - size := m.Size() - return 
m.MarshalToSizedBuffer(dAtA[:size]) -} - -func (m *GetDeviceRequest) MarshalToSizedBuffer(dAtA []byte) (int, error) { - i := len(dAtA) - _ = i - var l int - _ = l - if m.XXX_unrecognized != nil { - i -= len(m.XXX_unrecognized) - copy(dAtA[i:], m.XXX_unrecognized) - } - if len(m.CtrUUID) > 0 { - i -= len(m.CtrUUID) - copy(dAtA[i:], m.CtrUUID) - i = encodeVarintRuntime(dAtA, i, uint64(len(m.CtrUUID))) - i-- - dAtA[i] = 0xa - } - return len(dAtA) - i, nil -} - -func (m *GetDeviceReply) Marshal() (dAtA []byte, err error) { - size := m.Size() - dAtA = make([]byte, size) - n, err := m.MarshalToSizedBuffer(dAtA[:size]) - if err != nil { - return nil, err - } - return dAtA[:n], nil -} - -func (m *GetDeviceReply) MarshalTo(dAtA []byte) (int, error) { - size := m.Size() - return m.MarshalToSizedBuffer(dAtA[:size]) -} - -func (m *GetDeviceReply) MarshalToSizedBuffer(dAtA []byte) (int, error) { - i := len(dAtA) - _ = i - var l int - _ = l - if m.XXX_unrecognized != nil { - i -= len(m.XXX_unrecognized) - copy(dAtA[i:], m.XXX_unrecognized) - } - if len(m.PodName) > 0 { - i -= len(m.PodName) - copy(dAtA[i:], m.PodName) - i = encodeVarintRuntime(dAtA, i, uint64(len(m.PodName))) - i-- - dAtA[i] = 0x32 - } - if len(m.PodNamespace) > 0 { - i -= len(m.PodNamespace) - copy(dAtA[i:], m.PodNamespace) - i = encodeVarintRuntime(dAtA, i, uint64(len(m.PodNamespace))) - i-- - dAtA[i] = 0x2a - } - if len(m.CtrName) > 0 { - i -= len(m.CtrName) - copy(dAtA[i:], m.CtrName) - i = encodeVarintRuntime(dAtA, i, uint64(len(m.CtrName))) - i-- - dAtA[i] = 0x22 - } - if len(m.PodUID) > 0 { - i -= len(m.PodUID) - copy(dAtA[i:], m.PodUID) - i = encodeVarintRuntime(dAtA, i, uint64(len(m.PodUID))) - i-- - dAtA[i] = 0x1a - } - if len(m.Envs) > 0 { - for k := range m.Envs { - v := m.Envs[k] - baseI := i - i -= len(v) - copy(dAtA[i:], v) - i = encodeVarintRuntime(dAtA, i, uint64(len(v))) - i-- - dAtA[i] = 0x12 - i -= len(k) - copy(dAtA[i:], k) - i = encodeVarintRuntime(dAtA, i, uint64(len(k))) - i-- - dAtA[i] = 
0xa - i = encodeVarintRuntime(dAtA, i, uint64(baseI-i)) - i-- - dAtA[i] = 0xa - } - } - return len(dAtA) - i, nil -} - -func encodeVarintRuntime(dAtA []byte, offset int, v uint64) int { - offset -= sovRuntime(v) - base := offset - for v >= 1<<7 { - dAtA[offset] = uint8(v&0x7f | 0x80) - v >>= 7 - offset++ - } - dAtA[offset] = uint8(v) - return base -} -func (m *GetDeviceRequest) Size() (n int) { - if m == nil { - return 0 - } - var l int - _ = l - l = len(m.CtrUUID) - if l > 0 { - n += 1 + l + sovRuntime(uint64(l)) - } - if m.XXX_unrecognized != nil { - n += len(m.XXX_unrecognized) - } - return n -} - -func (m *GetDeviceReply) Size() (n int) { - if m == nil { - return 0 - } - var l int - _ = l - if len(m.Envs) > 0 { - for k, v := range m.Envs { - _ = k - _ = v - mapEntrySize := 1 + len(k) + sovRuntime(uint64(len(k))) + 1 + len(v) + sovRuntime(uint64(len(v))) - n += mapEntrySize + 1 + sovRuntime(uint64(mapEntrySize)) - } - } - l = len(m.PodUID) - if l > 0 { - n += 1 + l + sovRuntime(uint64(l)) - } - l = len(m.CtrName) - if l > 0 { - n += 1 + l + sovRuntime(uint64(l)) - } - l = len(m.PodNamespace) - if l > 0 { - n += 1 + l + sovRuntime(uint64(l)) - } - l = len(m.PodName) - if l > 0 { - n += 1 + l + sovRuntime(uint64(l)) - } - if m.XXX_unrecognized != nil { - n += len(m.XXX_unrecognized) - } - return n -} - -func sovRuntime(x uint64) (n int) { - return (math_bits.Len64(x|1) + 6) / 7 -} -func sozRuntime(x uint64) (n int) { - return sovRuntime(uint64((x << 1) ^ uint64((int64(x) >> 63)))) -} -func (m *GetDeviceRequest) Unmarshal(dAtA []byte) error { - l := len(dAtA) - iNdEx := 0 - for iNdEx < l { - preIndex := iNdEx - var wire uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowRuntime - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - wire |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - fieldNum := int32(wire >> 3) - wireType := int(wire & 0x7) - if wireType == 4 { - return 
fmt.Errorf("proto: GetDeviceRequest: wiretype end group for non-group") - } - if fieldNum <= 0 { - return fmt.Errorf("proto: GetDeviceRequest: illegal tag %d (wire type %d)", fieldNum, wire) - } - switch fieldNum { - case 1: - if wireType != 2 { - return fmt.Errorf("proto: wrong wireType = %d for field CtrUUID", wireType) - } - var stringLen uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowRuntime - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - stringLen |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - intStringLen := int(stringLen) - if intStringLen < 0 { - return ErrInvalidLengthRuntime - } - postIndex := iNdEx + intStringLen - if postIndex < 0 { - return ErrInvalidLengthRuntime - } - if postIndex > l { - return io.ErrUnexpectedEOF - } - m.CtrUUID = string(dAtA[iNdEx:postIndex]) - iNdEx = postIndex - default: - iNdEx = preIndex - skippy, err := skipRuntime(dAtA[iNdEx:]) - if err != nil { - return err - } - if (skippy < 0) || (iNdEx+skippy) < 0 { - return ErrInvalidLengthRuntime - } - if (iNdEx + skippy) > l { - return io.ErrUnexpectedEOF - } - m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) 
- iNdEx += skippy - } - } - - if iNdEx > l { - return io.ErrUnexpectedEOF - } - return nil -} -func (m *GetDeviceReply) Unmarshal(dAtA []byte) error { - l := len(dAtA) - iNdEx := 0 - for iNdEx < l { - preIndex := iNdEx - var wire uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowRuntime - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - wire |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - fieldNum := int32(wire >> 3) - wireType := int(wire & 0x7) - if wireType == 4 { - return fmt.Errorf("proto: GetDeviceReply: wiretype end group for non-group") - } - if fieldNum <= 0 { - return fmt.Errorf("proto: GetDeviceReply: illegal tag %d (wire type %d)", fieldNum, wire) - } - switch fieldNum { - case 1: - if wireType != 2 { - return fmt.Errorf("proto: wrong wireType = %d for field Envs", wireType) - } - var msglen int - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowRuntime - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - msglen |= int(b&0x7F) << shift - if b < 0x80 { - break - } - } - if msglen < 0 { - return ErrInvalidLengthRuntime - } - postIndex := iNdEx + msglen - if postIndex < 0 { - return ErrInvalidLengthRuntime - } - if postIndex > l { - return io.ErrUnexpectedEOF - } - if m.Envs == nil { - m.Envs = make(map[string]string) - } - var mapkey string - var mapvalue string - for iNdEx < postIndex { - entryPreIndex := iNdEx - var wire uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowRuntime - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - wire |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - fieldNum := int32(wire >> 3) - if fieldNum == 1 { - var stringLenmapkey uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowRuntime - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - 
iNdEx++ - stringLenmapkey |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - intStringLenmapkey := int(stringLenmapkey) - if intStringLenmapkey < 0 { - return ErrInvalidLengthRuntime - } - postStringIndexmapkey := iNdEx + intStringLenmapkey - if postStringIndexmapkey < 0 { - return ErrInvalidLengthRuntime - } - if postStringIndexmapkey > l { - return io.ErrUnexpectedEOF - } - mapkey = string(dAtA[iNdEx:postStringIndexmapkey]) - iNdEx = postStringIndexmapkey - } else if fieldNum == 2 { - var stringLenmapvalue uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowRuntime - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - stringLenmapvalue |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - intStringLenmapvalue := int(stringLenmapvalue) - if intStringLenmapvalue < 0 { - return ErrInvalidLengthRuntime - } - postStringIndexmapvalue := iNdEx + intStringLenmapvalue - if postStringIndexmapvalue < 0 { - return ErrInvalidLengthRuntime - } - if postStringIndexmapvalue > l { - return io.ErrUnexpectedEOF - } - mapvalue = string(dAtA[iNdEx:postStringIndexmapvalue]) - iNdEx = postStringIndexmapvalue - } else { - iNdEx = entryPreIndex - skippy, err := skipRuntime(dAtA[iNdEx:]) - if err != nil { - return err - } - if (skippy < 0) || (iNdEx+skippy) < 0 { - return ErrInvalidLengthRuntime - } - if (iNdEx + skippy) > postIndex { - return io.ErrUnexpectedEOF - } - iNdEx += skippy - } - } - m.Envs[mapkey] = mapvalue - iNdEx = postIndex - case 3: - if wireType != 2 { - return fmt.Errorf("proto: wrong wireType = %d for field PodUID", wireType) - } - var stringLen uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowRuntime - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - stringLen |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - intStringLen := int(stringLen) - if intStringLen < 0 { - return ErrInvalidLengthRuntime - } - 
postIndex := iNdEx + intStringLen - if postIndex < 0 { - return ErrInvalidLengthRuntime - } - if postIndex > l { - return io.ErrUnexpectedEOF - } - m.PodUID = string(dAtA[iNdEx:postIndex]) - iNdEx = postIndex - case 4: - if wireType != 2 { - return fmt.Errorf("proto: wrong wireType = %d for field CtrName", wireType) - } - var stringLen uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowRuntime - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - stringLen |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - intStringLen := int(stringLen) - if intStringLen < 0 { - return ErrInvalidLengthRuntime - } - postIndex := iNdEx + intStringLen - if postIndex < 0 { - return ErrInvalidLengthRuntime - } - if postIndex > l { - return io.ErrUnexpectedEOF - } - m.CtrName = string(dAtA[iNdEx:postIndex]) - iNdEx = postIndex - case 5: - if wireType != 2 { - return fmt.Errorf("proto: wrong wireType = %d for field PodNamespace", wireType) - } - var stringLen uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowRuntime - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - stringLen |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - intStringLen := int(stringLen) - if intStringLen < 0 { - return ErrInvalidLengthRuntime - } - postIndex := iNdEx + intStringLen - if postIndex < 0 { - return ErrInvalidLengthRuntime - } - if postIndex > l { - return io.ErrUnexpectedEOF - } - m.PodNamespace = string(dAtA[iNdEx:postIndex]) - iNdEx = postIndex - case 6: - if wireType != 2 { - return fmt.Errorf("proto: wrong wireType = %d for field PodName", wireType) - } - var stringLen uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return ErrIntOverflowRuntime - } - if iNdEx >= l { - return io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - stringLen |= uint64(b&0x7F) << shift - if b < 0x80 { - break - } - } - intStringLen := 
int(stringLen) - if intStringLen < 0 { - return ErrInvalidLengthRuntime - } - postIndex := iNdEx + intStringLen - if postIndex < 0 { - return ErrInvalidLengthRuntime - } - if postIndex > l { - return io.ErrUnexpectedEOF - } - m.PodName = string(dAtA[iNdEx:postIndex]) - iNdEx = postIndex - default: - iNdEx = preIndex - skippy, err := skipRuntime(dAtA[iNdEx:]) - if err != nil { - return err - } - if (skippy < 0) || (iNdEx+skippy) < 0 { - return ErrInvalidLengthRuntime - } - if (iNdEx + skippy) > l { - return io.ErrUnexpectedEOF - } - m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) - iNdEx += skippy - } - } - - if iNdEx > l { - return io.ErrUnexpectedEOF - } - return nil -} -func skipRuntime(dAtA []byte) (n int, err error) { - l := len(dAtA) - iNdEx := 0 - depth := 0 - for iNdEx < l { - var wire uint64 - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return 0, ErrIntOverflowRuntime - } - if iNdEx >= l { - return 0, io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - wire |= (uint64(b) & 0x7F) << shift - if b < 0x80 { - break - } - } - wireType := int(wire & 0x7) - switch wireType { - case 0: - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return 0, ErrIntOverflowRuntime - } - if iNdEx >= l { - return 0, io.ErrUnexpectedEOF - } - iNdEx++ - if dAtA[iNdEx-1] < 0x80 { - break - } - } - case 1: - iNdEx += 8 - case 2: - var length int - for shift := uint(0); ; shift += 7 { - if shift >= 64 { - return 0, ErrIntOverflowRuntime - } - if iNdEx >= l { - return 0, io.ErrUnexpectedEOF - } - b := dAtA[iNdEx] - iNdEx++ - length |= (int(b) & 0x7F) << shift - if b < 0x80 { - break - } - } - if length < 0 { - return 0, ErrInvalidLengthRuntime - } - iNdEx += length - case 3: - depth++ - case 4: - if depth == 0 { - return 0, ErrUnexpectedEndOfGroupRuntime - } - depth-- - case 5: - iNdEx += 4 - default: - return 0, fmt.Errorf("proto: illegal wireType %d", wireType) - } - if iNdEx < 0 { - return 0, ErrInvalidLengthRuntime - } - if 
depth == 0 { - return iNdEx, nil - } - } - return 0, io.ErrUnexpectedEOF -} - -var ( - ErrInvalidLengthRuntime = fmt.Errorf("proto: negative length found during unmarshaling") - ErrIntOverflowRuntime = fmt.Errorf("proto: integer overflow") - ErrUnexpectedEndOfGroupRuntime = fmt.Errorf("proto: unexpected end of group") -) diff --git a/pkg/api/runtime.proto b/pkg/api/runtime.proto deleted file mode 100644 index 2bd89e23d..000000000 --- a/pkg/api/runtime.proto +++ /dev/null @@ -1,18 +0,0 @@ -syntax = "proto3"; -package api; - -service VGPURuntimeService { - rpc GetDevice (GetDeviceRequest) returns (GetDeviceReply) {} -} - -message GetDeviceRequest { - string ctrUUID = 1; -} - -message GetDeviceReply { - map envs = 1; - string podUID = 3; - string ctrName = 4; - string podNamespace = 5; - string podName = 6; -} diff --git a/pkg/api/types.go b/pkg/api/types.go index 8e2e70455..feb468555 100644 --- a/pkg/api/types.go +++ b/pkg/api/types.go @@ -1,22 +1,22 @@ /* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ package api const ( - ContainerUID = "VGPU_4PD_UUID" - PluginRuntimeSocket = "VGPU_4PD_SOCKET" + TaskPriority = "CUDA_TASK_PRIORITY" + CoreLimitSwitch = "GPU_CORE_UTILIZATION_POLICY" ) diff --git a/pkg/device-plugin/cache.go b/pkg/device-plugin/cache.go deleted file mode 100644 index 36ac6fd7b..000000000 --- a/pkg/device-plugin/cache.go +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package device_plugin - -import ( - "sync" - - pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" -) - -type DeviceCache struct { - GpuDeviceManager - - cache []*Device - stopCh chan interface{} - unhealthy chan *Device - notifyCh map[string]chan *Device - mutex sync.Mutex -} - -func NewDeviceCache() *DeviceCache { - return &DeviceCache{ - GpuDeviceManager: GpuDeviceManager{true}, - stopCh: make(chan interface{}), - unhealthy: make(chan *Device), - notifyCh: make(map[string]chan *Device), - } -} - -func (d *DeviceCache) AddNotifyChannel(name string, ch chan *Device) { - d.mutex.Lock() - defer d.mutex.Unlock() - d.notifyCh[name] = ch -} - -func (d *DeviceCache) RemoveNotifyChannel(name string) { - d.mutex.Lock() - defer d.mutex.Unlock() - delete(d.notifyCh, name) -} - -func (d *DeviceCache) Start() { - d.cache = d.Devices() - go d.CheckHealth(d.stopCh, d.cache, d.unhealthy) - go d.notify() -} - -func (d *DeviceCache) Stop() { - close(d.stopCh) -} - -func (d *DeviceCache) GetCache() []*Device { - return d.cache -} - -func (d *DeviceCache) notify() { - for { - select { - case <-d.stopCh: - return - case dev := <-d.unhealthy: - dev.Health = pluginapi.Unhealthy - d.mutex.Lock() - for _, ch := range d.notifyCh { - ch <- dev - } - d.mutex.Unlock() - } - } -} diff --git a/pkg/device-plugin/config/config.go b/pkg/device-plugin/config/config.go deleted file mode 100644 index a4f8f3c17..000000000 --- a/pkg/device-plugin/config/config.go +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package config - -var ( - DeviceSplitCount uint - DeviceMemoryScaling float64 - DeviceCoresScaling float64 - SchedulerEndpoint string - SchedulerTimeout int - NodeName string - RuntimeSocketFlag string -) diff --git a/pkg/device-plugin/nvidia.go b/pkg/device-plugin/nvidia.go deleted file mode 100644 index 3f609ae59..000000000 --- a/pkg/device-plugin/nvidia.go +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package device_plugin - -import ( - "fmt" - "log" - "os" - "strings" - - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml" - - pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" -) - -const ( - envDisableHealthChecks = "DP_DISABLE_HEALTHCHECKS" - allHealthChecks = "xids" -) - -// Device couples an underlying pluginapi.Device type with its device node paths -type Device struct { - pluginapi.Device - Paths []string - Index string - Memory uint64 -} - -// ResourceManager provides an interface for listing a set of Devices and checking health on them -type ResourceManager interface { - Devices() []*Device - CheckHealth(stop <-chan interface{}, devices []*Device, unhealthy chan<- *Device) -} - -// GpuDeviceManager implements the ResourceManager interface for full GPU devices -type GpuDeviceManager struct { - skipMigEnabledGPUs bool -} - -func check(err error) { - if err != nil { - log.Panicln("Fatal:", err) - } -} - -// NewGpuDeviceManager returns a reference to a new GpuDeviceManager -func NewGpuDeviceManager(skipMigEnabledGPUs bool) *GpuDeviceManager { - return &GpuDeviceManager{ - skipMigEnabledGPUs: skipMigEnabledGPUs, - } -} - -// Devices returns a list of devices from the GpuDeviceManager -func (g *GpuDeviceManager) Devices() []*Device { - n, err := nvml.GetDeviceCount() - check(err) - - var devs []*Device - for i := uint(0); i < n; i++ { - d, err := nvml.NewDevice(i) - check(err) - - migEnabled, err := d.IsMigEnabled() - check(err) - - if migEnabled && g.skipMigEnabledGPUs { - continue - } - - devs = append(devs, buildDevice(d, []string{d.Path}, fmt.Sprintf("%v", i))) - } - - return devs -} - -// CheckHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices -func (g *GpuDeviceManager) CheckHealth(stop <-chan interface{}, devices []*Device, unhealthy chan<- *Device) { - checkHealth(stop, devices, unhealthy) -} - -func buildDevice(d *nvml.Device, paths []string, index string) *Device { - 
dev := Device{} - dev.ID = d.UUID - dev.Health = pluginapi.Healthy - dev.Paths = paths - dev.Index = index - dev.Memory = *d.Memory - if d.CPUAffinity != nil { - dev.Topology = &pluginapi.TopologyInfo{ - Nodes: []*pluginapi.NUMANode{ - { - ID: int64(*(d.CPUAffinity)), - }, - }, - } - } - return &dev -} - -func checkHealth(stop <-chan interface{}, devices []*Device, unhealthy chan<- *Device) { - disableHealthChecks := strings.ToLower(os.Getenv(envDisableHealthChecks)) - if disableHealthChecks == "all" { - disableHealthChecks = allHealthChecks - } - if strings.Contains(disableHealthChecks, "xids") { - return - } - - eventSet := nvml.NewEventSet() - defer nvml.DeleteEventSet(eventSet) - - for _, d := range devices { - gpu, _, _, err := nvml.ParseMigDeviceUUID(d.ID) - if err != nil { - gpu = d.ID - } - - err = nvml.RegisterEventForDevice(eventSet, nvml.XidCriticalError, gpu) - if err != nil && strings.HasSuffix(err.Error(), "Not Supported") { - log.Printf("Warning: %s is too old to support healthchecking: %s. Marking it unhealthy.", d.ID, err) - unhealthy <- d - continue - } - check(err) - } - - for { - select { - case <-stop: - return - default: - } - - e, err := nvml.WaitForEvent(eventSet, 5000) - if err != nil && e.Etype != nvml.XidCriticalError { - continue - } - - // FIXME: formalize the full list and document it. 
- // http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4 - // Application errors: the GPU should still be healthy - if e.Edata == 31 || e.Edata == 43 || e.Edata == 45 { - continue - } - - if e.UUID == nil || len(*e.UUID) == 0 { - // All devices are unhealthy - log.Printf("XidCriticalError: Xid=%d, All devices will go unhealthy.", e.Edata) - for _, d := range devices { - unhealthy <- d - } - continue - } - - for _, d := range devices { - // Please see https://github.com/NVIDIA/gpu-monitoring-tools/blob/148415f505c96052cb3b7fdf443b34ac853139ec/bindings/go/nvml/nvml.h#L1424 - // for the rationale why gi and ci can be set as such when the UUID is a full GPU UUID and not a MIG device UUID. - gpu, gi, ci, err := nvml.ParseMigDeviceUUID(d.ID) - if err != nil { - gpu = d.ID - gi = 0xFFFFFFFF - ci = 0xFFFFFFFF - } - - if gpu == *e.UUID && gi == *e.GpuInstanceId && ci == *e.ComputeInstanceId { - log.Printf("XidCriticalError: Xid=%d on Device=%s, the device will go unhealthy.", e.Edata, d.ID) - unhealthy <- d - } - } - } -} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/cdi/api.go b/pkg/device-plugin/nvidiadevice/nvinternal/cdi/api.go new file mode 100644 index 000000000..53583fca8 --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/cdi/api.go @@ -0,0 +1,25 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cdi + +// Interface provides the API to the 'cdi' package +// +//go:generate moq -stub -out api_mock.go . 
Interface +type Interface interface { + CreateSpecFile() error + QualifiedName(string, string) string +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/cdi/api_mock.go b/pkg/device-plugin/nvidiadevice/nvinternal/cdi/api_mock.go new file mode 100644 index 000000000..7c72851c3 --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/cdi/api_mock.go @@ -0,0 +1,123 @@ +// Code generated by moq; DO NOT EDIT. +// github.com/matryer/moq + +package cdi + +import ( + "sync" +) + +// Ensure, that InterfaceMock does implement Interface. +// If this is not the case, regenerate this file with moq. +var _ Interface = &InterfaceMock{} + +// InterfaceMock is a mock implementation of Interface. +// +// func TestSomethingThatUsesInterface(t *testing.T) { +// +// // make and configure a mocked Interface +// mockedInterface := &InterfaceMock{ +// CreateSpecFileFunc: func() error { +// panic("mock out the CreateSpecFile method") +// }, +// QualifiedNameFunc: func(s1 string, s2 string) string { +// panic("mock out the QualifiedName method") +// }, +// } +// +// // use mockedInterface in code that requires Interface +// // and then make assertions. +// +// } +type InterfaceMock struct { + // CreateSpecFileFunc mocks the CreateSpecFile method. + CreateSpecFileFunc func() error + + // QualifiedNameFunc mocks the QualifiedName method. + QualifiedNameFunc func(s1 string, s2 string) string + + // calls tracks calls to the methods. + calls struct { + // CreateSpecFile holds details about calls to the CreateSpecFile method. + CreateSpecFile []struct { + } + // QualifiedName holds details about calls to the QualifiedName method. + QualifiedName []struct { + // S1 is the s1 argument value. + S1 string + // S2 is the s2 argument value. + S2 string + } + } + lockCreateSpecFile sync.RWMutex + lockQualifiedName sync.RWMutex +} + +// CreateSpecFile calls CreateSpecFileFunc. 
+func (mock *InterfaceMock) CreateSpecFile() error { + callInfo := struct { + }{} + mock.lockCreateSpecFile.Lock() + mock.calls.CreateSpecFile = append(mock.calls.CreateSpecFile, callInfo) + mock.lockCreateSpecFile.Unlock() + if mock.CreateSpecFileFunc == nil { + var ( + errOut error + ) + return errOut + } + return mock.CreateSpecFileFunc() +} + +// CreateSpecFileCalls gets all the calls that were made to CreateSpecFile. +// Check the length with: +// +// len(mockedInterface.CreateSpecFileCalls()) +func (mock *InterfaceMock) CreateSpecFileCalls() []struct { +} { + var calls []struct { + } + mock.lockCreateSpecFile.RLock() + calls = mock.calls.CreateSpecFile + mock.lockCreateSpecFile.RUnlock() + return calls +} + +// QualifiedName calls QualifiedNameFunc. +func (mock *InterfaceMock) QualifiedName(s1 string, s2 string) string { + callInfo := struct { + S1 string + S2 string + }{ + S1: s1, + S2: s2, + } + mock.lockQualifiedName.Lock() + mock.calls.QualifiedName = append(mock.calls.QualifiedName, callInfo) + mock.lockQualifiedName.Unlock() + if mock.QualifiedNameFunc == nil { + var ( + sOut string + ) + return sOut + } + return mock.QualifiedNameFunc(s1, s2) +} + +// QualifiedNameCalls gets all the calls that were made to QualifiedName. +// Check the length with: +// +// len(mockedInterface.QualifiedNameCalls()) +func (mock *InterfaceMock) QualifiedNameCalls() []struct { + S1 string + S2 string +} { + var calls []struct { + S1 string + S2 string + } + mock.lockQualifiedName.RLock() + calls = mock.calls.QualifiedName + mock.lockQualifiedName.RUnlock() + return calls +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/cdi/cdi.go b/pkg/device-plugin/nvidiadevice/nvinternal/cdi/cdi.go new file mode 100644 index 000000000..0e99b3c72 --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/cdi/cdi.go @@ -0,0 +1,177 @@ +/* +Copyright 2024 The HAMi Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cdi + +import ( + "fmt" + "path/filepath" + + nvdevice "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/go-nvlib/pkg/nvml" + "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi" + roottransform "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/transform/root" + "github.com/sirupsen/logrus" + cdiapi "tags.cncf.io/container-device-interface/pkg/cdi" +) + +const ( + cdiRoot = "/var/run/cdi" +) + +// cdiHandler creates CDI specs for devices assocatied with the device plugin. +type cdiHandler struct { + logger *logrus.Logger + nvml nvml.Interface + nvdevice nvdevice.Interface + driverRoot string + targetDriverRoot string + nvidiaCTKPath string + cdiRoot string + vendor string + deviceIDStrategy string + + enabled bool + gdsEnabled bool + mofedEnabled bool + + cdilibs map[string]nvcdi.Interface +} + +var _ Interface = &cdiHandler{} + +// newHandler constructs a new instance of the 'cdi' interface. 
+func newHandler(opts ...Option) (Interface, error) { + c := &cdiHandler{} + for _, opt := range opts { + opt(c) + } + + if !c.enabled { + return &null{}, nil + } + + if c.logger == nil { + c.logger = logrus.StandardLogger() + } + if c.nvml == nil { + c.nvml = nvml.New() + } + if c.nvdevice == nil { + c.nvdevice = nvdevice.New(nvdevice.WithNvml(c.nvml)) + } + if c.deviceIDStrategy == "" { + c.deviceIDStrategy = "uuid" + } + if c.driverRoot == "" { + c.driverRoot = "/" + } + if c.targetDriverRoot == "" { + c.targetDriverRoot = c.driverRoot + } + + deviceNamer, err := nvcdi.NewDeviceNamer(c.deviceIDStrategy) + if err != nil { + return nil, err + } + + c.cdilibs = make(map[string]nvcdi.Interface) + + c.cdilibs["gpu"], err = nvcdi.New( + nvcdi.WithLogger(c.logger), + nvcdi.WithNvmlLib(c.nvml), + nvcdi.WithDeviceLib(c.nvdevice), + nvcdi.WithNVIDIACTKPath(c.nvidiaCTKPath), + nvcdi.WithDriverRoot(c.driverRoot), + nvcdi.WithDeviceNamers(deviceNamer), + nvcdi.WithVendor(c.vendor), + nvcdi.WithClass("gpu"), + ) + if err != nil { + return nil, fmt.Errorf("failed to create nvcdi library: %v", err) + } + + var additionalModes []string + if c.gdsEnabled { + additionalModes = append(additionalModes, "gds") + } + if c.mofedEnabled { + additionalModes = append(additionalModes, "mofed") + } + + for _, mode := range additionalModes { + lib, err := nvcdi.New( + nvcdi.WithLogger(c.logger), + nvcdi.WithNVIDIACTKPath(c.nvidiaCTKPath), + nvcdi.WithDriverRoot(c.driverRoot), + nvcdi.WithVendor(c.vendor), + nvcdi.WithMode(mode), + ) + if err != nil { + return nil, fmt.Errorf("failed to create nvcdi library: %v", err) + } + c.cdilibs[mode] = lib + } + + return c, nil +} + +// CreateSpecFile creates a CDI spec file for the specified devices. 
+func (cdi *cdiHandler) CreateSpecFile() error { + for class, cdilib := range cdi.cdilibs { + cdi.logger.Infof("Generating CDI spec for resource: %s/%s", cdi.vendor, class) + + if class == "gpu" { + ret := cdi.nvml.Init() + if ret != nvml.SUCCESS { + return fmt.Errorf("failed to initialize NVML: %v", ret) + } + defer cdi.nvml.Shutdown() + } + + spec, err := cdilib.GetSpec() + if err != nil { + return fmt.Errorf("failed to get CDI spec: %v", err) + } + + err = roottransform.New( + roottransform.WithRoot(cdi.driverRoot), + roottransform.WithTargetRoot(cdi.targetDriverRoot), + ).Transform(spec.Raw()) + if err != nil { + return fmt.Errorf("failed to transform driver root in CDI spec: %v", err) + } + + raw := spec.Raw() + specName, err := cdiapi.GenerateNameForSpec(raw) + if err != nil { + return fmt.Errorf("failed to generate spec name: %v", err) + } + + err = spec.Save(filepath.Join(cdiRoot, specName+".json")) + if err != nil { + return fmt.Errorf("failed to save CDI spec: %v", err) + } + } + + return nil +} + +// QualifiedName constructs a CDI qualified device name for the specified resources. +// Note: This assumes that the specified id matches the device name returned by the naming strategy. +func (cdi *cdiHandler) QualifiedName(class string, id string) string { + return cdiapi.QualifiedName(cdi.vendor, class, id) +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/cdi/factory.go b/pkg/device-plugin/nvidiadevice/nvinternal/cdi/factory.go new file mode 100644 index 000000000..d35e1bf42 --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/cdi/factory.go @@ -0,0 +1,36 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cdi + +import ( + "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" + + "k8s.io/klog/v2" +) + +// New is a factory method that creates a CDI handler for creating CDI specs. +func New(opts ...Option) (Interface, error) { + infolib := info.New() + + hasNVML, _ := infolib.HasNvml() + if !hasNVML { + klog.Warning("No valid resources detected, creating a null CDI handler") + return NewNullHandler(), nil + } + + return newHandler(opts...) +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/cdi/null.go b/pkg/device-plugin/nvidiadevice/nvinternal/cdi/null.go new file mode 100644 index 000000000..3f7a508af --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/cdi/null.go @@ -0,0 +1,43 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cdi + +import ( + "k8s.io/klog/v2" +) + +type null struct{} + +var _ Interface = &null{} + +// NewNullHandler returns an instance of the 'cdi' interface that can +// be used when CDI specs are not required. 
+func NewNullHandler() Interface { + return &null{} +} + +// CreateSpecFile is a no-op for the null handler. +func (n *null) CreateSpecFile() error { + return nil +} + +// QualifiedName is a no-op for the null handler. A error message is logged +// inidicating this should never be called for the null handler. +func (n *null) QualifiedName(class string, id string) string { + klog.Error("cannot return a qualified CDI device name with the null CDI handler") + return "" +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/cdi/options.go b/pkg/device-plugin/nvidiadevice/nvinternal/cdi/options.go new file mode 100644 index 000000000..1a5a4c014 --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/cdi/options.go @@ -0,0 +1,87 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package cdi + +import ( + "github.com/NVIDIA/go-nvlib/pkg/nvml" +) + +// Option defines a function for passing options to the New() call +type Option func(*cdiHandler) + +// WithEnabled provides an Option to set the enabled flag used by the 'cdi' interface +func WithEnabled(enabled bool) Option { + return func(c *cdiHandler) { + c.enabled = enabled + } +} + +// WithDriverRoot provides an Option to set the driver root used by the 'cdi' interface +func WithDriverRoot(root string) Option { + return func(c *cdiHandler) { + c.driverRoot = root + } +} + +// WithTargetDriverRoot provides an Option to set the target driver root used by the 'cdi' interface +func WithTargetDriverRoot(root string) Option { + return func(c *cdiHandler) { + c.targetDriverRoot = root + } +} + +// WithNvidiaCTKPath provides an Option to set the nvidia-ctk path used by the 'cdi' interface +func WithNvidiaCTKPath(path string) Option { + return func(c *cdiHandler) { + c.nvidiaCTKPath = path + } +} + +// WithNvml provides an Option to set the NVML library used by the 'cdi' interface +func WithNvml(nvml nvml.Interface) Option { + return func(c *cdiHandler) { + c.nvml = nvml + } +} + +// WithDeviceIDStrategy provides an Option to set the device ID strategy used by the 'cdi' interface +func WithDeviceIDStrategy(strategy string) Option { + return func(c *cdiHandler) { + c.deviceIDStrategy = strategy + } +} + +// WithVendor provides an Option to set the vendor used by the 'cdi' interface +func WithVendor(vendor string) Option { + return func(c *cdiHandler) { + c.vendor = vendor + } +} + +// WithGdsEnabled provides and option to set whether a GDS CDI spec should be generated +func WithGdsEnabled(enabled bool) Option { + return func(c *cdiHandler) { + c.gdsEnabled = enabled + } +} + +// WithMofedEnabled provides and option to set whether a MOFED CDI spec should be generated +func WithMofedEnabled(enabled bool) Option { + return func(c *cdiHandler) { + c.mofedEnabled = enabled + } +} diff --git 
a/pkg/device-plugin/nvidiadevice/nvinternal/info/version.go b/pkg/device-plugin/nvidiadevice/nvinternal/info/version.go new file mode 100644 index 000000000..3c7c12a02 --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/info/version.go @@ -0,0 +1,43 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package info + +import "strings" + +// version must be set by go build's -X main.version= option in the Makefile. +var version = "unknown" + +// gitCommit will be the hash that the binary was built from +// and will be populated by the Makefile. +var gitCommit = "" + +// GetVersionParts returns the different version components. +func GetVersionParts() []string { + v := []string{version} + + if gitCommit != "" { + v = append(v, "commit: "+gitCommit) + } + + return v +} + +// GetVersionString returns the string representation of the version. +func GetVersionString(more ...string) string { + v := append(GetVersionParts(), more...) + return strings.Join(v, "\n") +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/mig/mig.go b/pkg/device-plugin/nvidiadevice/nvinternal/mig/mig.go new file mode 100644 index 000000000..a98ec8de0 --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/mig/mig.go @@ -0,0 +1,100 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package mig + +import ( + "bufio" + "fmt" + "os" + + "k8s.io/klog/v2" +) + +const ( + nvidiaProcDriverPath = "/proc/driver/nvidia" + nvidiaCapabilitiesPath = nvidiaProcDriverPath + "/capabilities" + + nvcapsProcDriverPath = "/proc/driver/nvidia-caps" + nvcapsMigMinorsPath = nvcapsProcDriverPath + "/mig-minors" + nvcapsDevicePath = "/dev/nvidia-caps" +) + +// GetMigCapabilityDevicePaths returns a mapping of MIG capability path to device node path. +func GetMigCapabilityDevicePaths() (map[string]string, error) { + // Open nvcapsMigMinorsPath for walking. + // If the nvcapsMigMinorsPath does not exist, then we are not on a MIG + // capable machine, so there is nothing to do. 
+ // The format of this file is discussed in: + // https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#unique_1576522674 + minorsFile, err := os.Open(nvcapsMigMinorsPath) + if os.IsNotExist(err) { + return nil, nil + } + if err != nil { + return nil, fmt.Errorf("error opening MIG minors file: %v", err) + } + defer minorsFile.Close() + + // Define a function to process each each line of nvcapsMigMinorsPath + processLine := func(line string) (string, int, error) { + var gpu, gi, ci, migMinor int + + // Look for a CI access file + n, _ := fmt.Sscanf(line, "gpu%d/gi%d/ci%d/access %d", &gpu, &gi, &ci, &migMinor) + if n == 4 { + capPath := fmt.Sprintf(nvidiaCapabilitiesPath+"/gpu%d/mig/gi%d/ci%d/access", gpu, gi, ci) + return capPath, migMinor, nil + } + + // Look for a GI access file + n, _ = fmt.Sscanf(line, "gpu%d/gi%d/access %d", &gpu, &gi, &migMinor) + if n == 3 { + capPath := fmt.Sprintf(nvidiaCapabilitiesPath+"/gpu%d/mig/gi%d/access", gpu, gi) + return capPath, migMinor, nil + } + + // Look for the MIG config file + n, _ = fmt.Sscanf(line, "config %d", &migMinor) + if n == 1 { + capPath := fmt.Sprintf(nvidiaCapabilitiesPath + "/mig/config") + return capPath, migMinor, nil + } + + // Look for the MIG monitor file + n, _ = fmt.Sscanf(line, "monitor %d", &migMinor) + if n == 1 { + capPath := fmt.Sprintf(nvidiaCapabilitiesPath + "/mig/monitor") + return capPath, migMinor, nil + } + + return "", 0, fmt.Errorf("unparsable line: %v", line) + } + + // Walk each line of nvcapsMigMinorsPath and construct a mapping of nvidia + // capabilities path to device minor for that capability + capsDevicePaths := make(map[string]string) + scanner := bufio.NewScanner(minorsFile) + for scanner.Scan() { + capPath, migMinor, err := processLine(scanner.Text()) + if err != nil { + klog.Errorf("Skipping line in MIG minors file: %v", err) + continue + } + capsDevicePaths[capPath] = fmt.Sprintf(nvcapsDevicePath+"/nvidia-cap%d", migMinor) + } + return capsDevicePaths, nil +} diff 
--git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/api.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/api.go new file mode 100644 index 000000000..705a32932 --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/api.go @@ -0,0 +1,26 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package plugin + +import "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/rm" + +// Interface defines the API for the plugin package +type Interface interface { + Devices() rm.Devices + Start() error + Stop() error +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/api.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/api.go new file mode 100644 index 000000000..51403e46f --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/api.go @@ -0,0 +1,25 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package manager + +import "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin" + +// Interface defines the API for the plugin manager package +type Interface interface { + GetPlugins() ([]plugin.Interface, error) + CreateCDISpecFile() error +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/factory.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/factory.go new file mode 100644 index 000000000..2f9fe1922 --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/factory.go @@ -0,0 +1,136 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package manager + +import ( + "fmt" + + "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" + "github.com/NVIDIA/go-nvlib/pkg/nvml" + "k8s.io/klog/v2" + + "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/cdi" + "github.com/Project-HAMi/HAMi/pkg/util" +) + +type manager struct { + migStrategy string + failOnInitError bool + nvmllib nvml.Interface + + cdiHandler cdi.Interface + cdiEnabled bool + config *util.DeviceConfig + infolib info.Interface +} + +// New creates a new plugin manager with the supplied options. 
+func New(opts ...Option) (Interface, error) { + m := &manager{} + for _, opt := range opts { + opt(m) + } + + if m.config == nil { + klog.Warning("no config provided, returning a null manager") + return &null{}, nil + } + + if m.infolib == nil { + m.infolib = info.New() + } + if m.cdiHandler == nil { + m.cdiHandler = cdi.NewNullHandler() + } + + mode, err := m.resolveMode() + if err != nil { + return nil, err + } + + if mode != "nvml" && m.cdiEnabled { + klog.Warning("CDI is not supported; disabling CDI.") + m.cdiEnabled = false + } + + switch mode { + case "nvml": + if m.nvmllib == nil { + m.nvmllib = nvml.New() + } + ret := m.nvmllib.Init() + if ret != nvml.SUCCESS { + klog.Errorf("Failed to initialize NVML: %v.", ret) + klog.Errorf("If this is a GPU node, did you set the docker default runtime to `nvidia`?") + klog.Errorf("You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites") + klog.Errorf("You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start") + klog.Errorf("If this is not a GPU node, you should set up a toleration or nodeSelector to only deploy this plugin on GPU nodes") + if m.failOnInitError { + return nil, fmt.Errorf("nvml init failed: %v", ret) + } + klog.Warningf("nvml init failed: %v", ret) + return &null{}, nil + } + defer m.nvmllib.Shutdown() + + return (*nvmlmanager)(m), nil + case "tegra": + return (*tegramanager)(m), nil + case "null": + return &null{}, nil + } + + return nil, fmt.Errorf("unknown mode: %v", mode) +} + +func (m *manager) resolveMode() (string, error) { + // logWithReason logs the output of the has* / is* checks from the info.Interface + logWithReason := func(f func() (bool, string), tag string) bool { + is, reason := f() + if !is { + tag = "non-" + tag + } + klog.Infof("Detected %v platform: %v", tag, reason) + return is + } + + hasNVML := logWithReason(m.infolib.HasNvml, "NVML") + isTegra := logWithReason(m.infolib.IsTegraSystem, "Tegra") + + if 
!hasNVML && !isTegra { + klog.Error("Incompatible platform detected") + klog.Error("If this is a GPU node, did you configure the NVIDIA Container Toolkit?") + klog.Error("You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites") + klog.Error("You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start") + klog.Error("If this is not a GPU node, you should set up a toleration or nodeSelector to only deploy this plugin on GPU nodes") + if m.failOnInitError { + return "", fmt.Errorf("platform detection failed") + } + return "null", nil + } + + // The NVIDIA container stack does not yet support the use of integrated AND discrete GPUs on the same node. + if isTegra { + if hasNVML { + klog.Warning("Disabling Tegra-based resources on NVML system") + return "nvml", nil + } + return "tegra", nil + } + + return "nvml", nil +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/null.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/null.go new file mode 100644 index 000000000..ef82a83ad --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/null.go @@ -0,0 +1,33 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package manager + +import ( + "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin" +) + +type null struct{} + +// GetPlugins returns an empty set of Plugins for the null manager +func (m *null) GetPlugins() ([]plugin.Interface, error) { + return nil, nil +} + +// CreateCDISpecFile creates the spec is a no-op for the null plugin +func (m *null) CreateCDISpecFile() error { + return nil +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/nvml.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/nvml.go new file mode 100644 index 000000000..4c234a1ce --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/nvml.go @@ -0,0 +1,45 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package manager + +import ( + "fmt" + + "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin" + "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/rm" +) + +type nvmlmanager manager + +// GetPlugins returns the plugins associated with the NVML resources available on the node +func (m *nvmlmanager) GetPlugins() ([]plugin.Interface, error) { + rms, err := rm.NewNVMLResourceManagers(m.nvmllib, m.config) + if err != nil { + return nil, fmt.Errorf("failed to construct NVML resource managers: %v", err) + } + + var plugins []plugin.Interface + for _, r := range rms { + plugins = append(plugins, plugin.NewNvidiaDevicePlugin(m.config, r, m.cdiHandler, m.cdiEnabled)) + } + return plugins, nil +} + +// CreateCDISpecFile creates forwards the request to the CDI handler +func (m *nvmlmanager) CreateCDISpecFile() error { + return m.cdiHandler.CreateSpecFile() +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/options.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/options.go new file mode 100644 index 000000000..4f1997969 --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/options.go @@ -0,0 +1,68 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package manager + +import ( + "github.com/NVIDIA/go-nvlib/pkg/nvml" + "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/cdi" + "github.com/Project-HAMi/HAMi/pkg/util" +) + +// Option is a function that configures a manager +type Option func(*manager) + +// WithCDIEnabled sets whether CDI is enabled for the manager +func WithCDIEnabled(enabled bool) Option { + return func(m *manager) { + m.cdiEnabled = enabled + } +} + +// WithCDIHandler sets the CDI handler for the manager +func WithCDIHandler(handler cdi.Interface) Option { + return func(m *manager) { + m.cdiHandler = handler + } +} + +// WithNVML sets the NVML handler for the manager +func WithNVML(nvmllib nvml.Interface) Option { + return func(m *manager) { + m.nvmllib = nvmllib + } +} + +// WithFailOnInitError sets whether the manager should fail on initialization errors +func WithFailOnInitError(failOnInitError bool) Option { + return func(m *manager) { + m.failOnInitError = failOnInitError + } +} + +// WithMigStrategy sets the MIG strategy for the manager +func WithMigStrategy(migStrategy string) Option { + return func(m *manager) { + m.migStrategy = migStrategy + } +} + +// WithConfig sets the config reference for the manager +func WithConfig(config *util.DeviceConfig) Option { + return func(m *manager) { + m.config = config + } +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/tegra.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/tegra.go new file mode 100644 index 000000000..fff4e324b --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/manager/tegra.go @@ -0,0 +1,45 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package manager
+
+import (
+	"fmt"
+
+	"github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin"
+	"github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/rm"
+)
+
+type tegramanager manager
+
+// GetPlugins returns the plugins associated with the Tegra resources available on the node
+func (m *tegramanager) GetPlugins() ([]plugin.Interface, error) {
+	rms, err := rm.NewTegraResourceManagers(m.config)
+	if err != nil {
+		return nil, fmt.Errorf("failed to construct Tegra resource managers: %v", err)
+	}
+
+	var plugins []plugin.Interface
+	for _, r := range rms {
+		plugins = append(plugins, plugin.NewNvidiaDevicePlugin(m.config, r, m.cdiHandler, m.cdiEnabled))
+	}
+	return plugins, nil
+}
+
+// CreateCDISpecFile is a no-op for the tegra manager.
+func (m *tegramanager) CreateCDISpecFile() error {
+	return nil
+}
diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go
new file mode 100644
index 000000000..3ebb68c1a
--- /dev/null
+++ b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go
@@ -0,0 +1,200 @@
+/*
+Copyright 2024 The HAMi Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package plugin + +import ( + "fmt" + "os/exec" + "strconv" + "strings" + "time" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + "k8s.io/klog/v2" + + "github.com/Project-HAMi/HAMi/pkg/api" + "github.com/Project-HAMi/HAMi/pkg/device/nvidia" + "github.com/Project-HAMi/HAMi/pkg/util" +) + +func (plugin *NvidiaDevicePlugin) getNumaInformation(idx int) (int, error) { + cmd := exec.Command("nvidia-smi", "topo", "-m") + out, err := cmd.CombinedOutput() + if err != nil { + return 0, err + } + klog.V(5).InfoS("nvidia-smi topo -m output", "result", string(out)) + return parseNvidiaNumaInfo(idx, string(out)) +} + +func parseNvidiaNumaInfo(idx int, nvidiaTopoStr string) (int, error) { + result := 0 + numaAffinityColumnIndex := 0 + for index, val := range strings.Split(nvidiaTopoStr, "\n") { + if !strings.Contains(val, "GPU") { + continue + } + // Example: GPU0 X 0-7 N/A N/A + // Many values are separated by two tabs, but this actually represents 5 values instead of 7 + // So add logic to remove multiple tabs + words := strings.Split(strings.ReplaceAll(val, "\t\t", "\t"), "\t") + klog.V(5).InfoS("parseNumaInfo", "words", words) + // get numa affinity column number + if index == 0 { + for columnIndex, headerVal := range words { + // The topology output of a single card is as follows: + // GPU0 CPU Affinity NUMA Affinity GPU NUMA ID + // GPU0 X 0-7 N/A N/A + //Legend: Other content omitted + + // The topology output in the case of multiple cards is as follows: + // GPU0 GPU1 CPU Affinity NUMA Affinity + // GPU0 X PHB 0-31 N/A + // GPU1 PHB X 0-31 N/A + // Legend: Other content omitted 
+ + // We need to get the value of the NUMA Affinity column, but their column indexes are inconsistent, + // so we need to get the index first and then get the value. + if strings.Contains(headerVal, "NUMA Affinity") { + // The header is one column less than the actual row. + numaAffinityColumnIndex = columnIndex + continue + } + } + continue + } + klog.V(5).InfoS("nvidia-smi topo -m row output", "row output", words, "length", len(words)) + if strings.Contains(words[0], fmt.Sprint(idx)) { + if words[numaAffinityColumnIndex] == "N/A" { + klog.InfoS("current card has not established numa topology", "gpu row info", words, "index", idx) + return 0, nil + } + result, err := strconv.Atoi(words[numaAffinityColumnIndex]) + if err != nil { + return result, err + } + } + } + return result, nil +} + +func (plugin *NvidiaDevicePlugin) getAPIDevices() *[]*api.DeviceInfo { + devs := plugin.Devices() + nvml.Init() + res := make([]*api.DeviceInfo, 0, len(devs)) + idx := 0 + for idx < len(devs) { + ndev, ret := nvml.DeviceGetHandleByIndex(idx) + //ndev, err := nvml.NewDevice(uint(idx)) + //klog.V(3).Infoln("ndev type=", ndev.Model) + if ret != nvml.SUCCESS { + klog.Errorln("nvml new device by index error idx=", idx, "err=", ret) + panic(0) + } + memoryTotal := 0 + memory, ret := ndev.GetMemoryInfo() + if ret == nvml.SUCCESS { + memoryTotal = int(memory.Total) + } else { + klog.Error("nvml get memory error ret=", ret) + panic(0) + } + UUID, ret := ndev.GetUUID() + if ret != nvml.SUCCESS { + klog.Error("nvml get uuid error ret=", ret) + panic(0) + } + Model, ret := ndev.GetName() + if ret != nvml.SUCCESS { + klog.Error("nvml get name error ret=", ret) + panic(0) + } + + registeredmem := int32(memoryTotal / 1024 / 1024) + if *util.DeviceMemoryScaling != 1 { + registeredmem = int32(float64(registeredmem) * *util.DeviceMemoryScaling) + } + klog.Infoln("MemoryScaling=", *util.DeviceMemoryScaling, "registeredmem=", registeredmem) + health := true + for _, val := range devs { + if 
strings.Compare(val.ID, UUID) == 0 { + // when NVIDIA-Tesla P4, the device info is : ID:GPU-e290caca-2f0c-9582-acab-67a142b61ffa,Health:Healthy,Topology:nil, + // it is more reasonable to think of healthy as case-insensitive + if strings.EqualFold(val.Health, "healthy") { + health = true + } else { + health = false + } + break + } + } + numa, err := plugin.getNumaInformation(idx) + if err != nil { + klog.ErrorS(err, "failed to get numa information", "idx", idx) + } + res = append(res, &api.DeviceInfo{ + Id: UUID, + Count: int32(*util.DeviceSplitCount), + Devmem: registeredmem, + Devcore: int32(*util.DeviceCoresScaling * 100), + Type: fmt.Sprintf("%v-%v", "NVIDIA", Model), + Numa: numa, + Health: health, + }) + idx++ + klog.Infof("nvml registered device id=%v, memory=%v, type=%v, numa=%v", idx, registeredmem, Model, numa) + } + return &res +} + +func (plugin *NvidiaDevicePlugin) RegistrInAnnotation() error { + devices := plugin.getAPIDevices() + klog.InfoS("start working on the devices", "devices", devices) + annos := make(map[string]string) + node, err := util.GetNode(util.NodeName) + if err != nil { + klog.Errorln("get node error", err.Error()) + return err + } + encodeddevices := util.EncodeNodeDevices(*devices) + annos[nvidia.HandshakeAnnos] = "Reported " + time.Now().String() + annos[nvidia.RegisterAnnos] = encodeddevices + klog.Infof("patch node with the following annos %v", fmt.Sprintf("%v", annos)) + err = util.PatchNodeAnnotations(node, annos) + + if err != nil { + klog.Errorln("patch node error", err.Error()) + } + return err +} + +func (plugin *NvidiaDevicePlugin) WatchAndRegister() { + klog.Info("Starting WatchAndRegister") + errorSleepInterval := time.Second * 5 + successSleepInterval := time.Second * 30 + for { + err := plugin.RegistrInAnnotation() + if err != nil { + klog.Errorf("Failed to register annotation: %v", err) + klog.Infof("Retrying in %v seconds...", errorSleepInterval) + time.Sleep(errorSleepInterval) + } else { + klog.Infof("Successfully 
registered annotation. Next check in %v seconds...", successSleepInterval) + time.Sleep(successSleepInterval) + } + } +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/register_test.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/register_test.go new file mode 100644 index 000000000..d8e828629 --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/register_test.go @@ -0,0 +1,68 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package plugin + +import "testing" + +func Test_parseNvidiaNumaInfo(t *testing.T) { + + tests := []struct { + name string + idx int + nvidiaTopoStr string + want int + wantErr bool + }{ + { + name: "single Tesla P4 NUMA", + idx: 0, + nvidiaTopoStr: `GPU0 CPU Affinity NUMA Affinity ... + ...`, + want: 0, + wantErr: false, + }, + { + name: "two Tesla P4 NUMA topo with index 0", + idx: 0, + nvidiaTopoStr: `GPU0 GPU1 CPU Affinity NUMA Affinity ... + ...`, + want: 0, + wantErr: false, + }, + { + name: "two Tesla P4 NUMA topo with index 1", + idx: 1, + nvidiaTopoStr: `GPU0 GPU1 CPU Affinity NUMA Affinity ... 
+ ...`, + want: 0, + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := parseNvidiaNumaInfo(tt.idx, tt.nvidiaTopoStr) + if (err != nil) != tt.wantErr { + t.Errorf("parseNvidiaNumaInfo() error = %v, wantErr %v", err, tt.wantErr) + return + } + if got != tt.want { + t.Errorf("parseNvidiaNumaInfo() got = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/server.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/server.go new file mode 100644 index 000000000..44b3f28cf --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/server.go @@ -0,0 +1,593 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package plugin + +import ( + "errors" + "fmt" + "net" + "os" + "path" + "path/filepath" + "strconv" + "strings" + "time" + + spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" + "github.com/Project-HAMi/HAMi/pkg/api" + "github.com/Project-HAMi/HAMi/pkg/device" + "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/cdi" + "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/rm" + "github.com/Project-HAMi/HAMi/pkg/device/nvidia" + "github.com/Project-HAMi/HAMi/pkg/util" + "github.com/Project-HAMi/HAMi/pkg/util/nodelock" + cdiapi "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" + + "github.com/google/uuid" + "golang.org/x/net/context" + "google.golang.org/grpc" + + "k8s.io/klog/v2" + kubeletdevicepluginv1beta1 "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" +) + +// Constants for use by the 'volume-mounts' device list strategy +const ( + deviceListAsVolumeMountsHostPath = "/dev/null" + deviceListAsVolumeMountsContainerPathRoot = "/var/run/nvidia-container-devices" + NodeLockNvidia = "hami.io/mutex.lock" +) + +var ( + hostHookPath string +) + +func init() { + hostHookPath, _ = os.LookupEnv("HOOK_PATH") +} + +// NvidiaDevicePlugin implements the Kubernetes device plugin API +type NvidiaDevicePlugin struct { + rm rm.ResourceManager + config *util.DeviceConfig + deviceListEnvvar string + deviceListStrategies spec.DeviceListStrategies + socket string + + cdiHandler cdi.Interface + cdiEnabled bool + cdiAnnotationPrefix string + + server *grpc.Server + health chan *rm.Device + stop chan interface{} +} + +// NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin +func NewNvidiaDevicePlugin(config *util.DeviceConfig, resourceManager rm.ResourceManager, cdiHandler cdi.Interface, cdiEnabled bool) *NvidiaDevicePlugin { + _, name := resourceManager.Resource().Split() + + deviceListStrategies, _ := spec.NewDeviceListStrategies(*config.Flags.Plugin.DeviceListStrategy) + + return 
&NvidiaDevicePlugin{ + rm: resourceManager, + config: config, + deviceListEnvvar: "NVIDIA_VISIBLE_DEVICES", + deviceListStrategies: deviceListStrategies, + socket: kubeletdevicepluginv1beta1.DevicePluginPath + "nvidia-" + name + ".sock", + cdiHandler: cdiHandler, + cdiEnabled: cdiEnabled, + cdiAnnotationPrefix: *config.Flags.Plugin.CDIAnnotationPrefix, + + // These will be reinitialized every + // time the plugin server is restarted. + server: nil, + health: nil, + stop: nil, + } +} + +func (plugin *NvidiaDevicePlugin) initialize() { + plugin.server = grpc.NewServer([]grpc.ServerOption{}...) + plugin.health = make(chan *rm.Device) + plugin.stop = make(chan interface{}) +} + +func (plugin *NvidiaDevicePlugin) cleanup() { + close(plugin.stop) + plugin.server = nil + plugin.health = nil + plugin.stop = nil +} + +// Devices returns the full set of devices associated with the plugin. +func (plugin *NvidiaDevicePlugin) Devices() rm.Devices { + return plugin.rm.Devices() +} + +// Start starts the gRPC server, registers the device plugin with the Kubelet, +// and starts the device healthchecks. +func (plugin *NvidiaDevicePlugin) Start() error { + plugin.initialize() + + err := plugin.Serve() + if err != nil { + klog.Infof("Could not start device plugin for '%s': %s", plugin.rm.Resource(), err) + plugin.cleanup() + return err + } + klog.Infof("Starting to serve '%s' on %s", plugin.rm.Resource(), plugin.socket) + + err = plugin.Register() + if err != nil { + klog.Infof("Could not register device plugin: %s", err) + plugin.Stop() + return err + } + klog.Infof("Registered device plugin for '%s' with Kubelet", plugin.rm.Resource()) + + go func() { + err := plugin.rm.CheckHealth(plugin.stop, plugin.health) + if err != nil { + klog.Infof("Failed to start health check: %v; continuing with health checks disabled", err) + } + }() + + go func() { + plugin.WatchAndRegister() + }() + + return nil +} + +// Stop stops the gRPC server. 
+func (plugin *NvidiaDevicePlugin) Stop() error { + if plugin == nil || plugin.server == nil { + return nil + } + klog.Infof("Stopping to serve '%s' on %s", plugin.rm.Resource(), plugin.socket) + plugin.server.Stop() + if err := os.Remove(plugin.socket); err != nil && !os.IsNotExist(err) { + return err + } + plugin.cleanup() + return nil +} + +// Serve starts the gRPC server of the device plugin. +func (plugin *NvidiaDevicePlugin) Serve() error { + os.Remove(plugin.socket) + sock, err := net.Listen("unix", plugin.socket) + if err != nil { + return err + } + + kubeletdevicepluginv1beta1.RegisterDevicePluginServer(plugin.server, plugin) + + go func() { + lastCrashTime := time.Now() + restartCount := 0 + for { + klog.Infof("Starting GRPC server for '%s'", plugin.rm.Resource()) + err := plugin.server.Serve(sock) + if err == nil { + break + } + + klog.Infof("GRPC server for '%s' crashed with error: %v", plugin.rm.Resource(), err) + + // restart if it has not been too often + // i.e. if server has crashed more than 5 times and it didn't last more than one hour each time + if restartCount > 5 { + // quit + klog.Fatalf("GRPC server for '%s' has repeatedly crashed recently. Quitting", plugin.rm.Resource()) + } + timeSinceLastCrash := time.Since(lastCrashTime).Seconds() + lastCrashTime = time.Now() + if timeSinceLastCrash > 3600 { + // it has been one hour since the last crash.. reset the count + // to reflect on the frequency + restartCount = 1 + } else { + restartCount++ + } + } + }() + + // Wait for server to start by launching a blocking connexion + conn, err := plugin.dial(plugin.socket, 5*time.Second) + if err != nil { + return err + } + conn.Close() + + return nil +} + +// Register registers the device plugin for the given resourceName with Kubelet. 
+func (plugin *NvidiaDevicePlugin) Register() error { + conn, err := plugin.dial(kubeletdevicepluginv1beta1.KubeletSocket, 5*time.Second) + if err != nil { + return err + } + defer conn.Close() + + client := kubeletdevicepluginv1beta1.NewRegistrationClient(conn) + reqt := &kubeletdevicepluginv1beta1.RegisterRequest{ + Version: kubeletdevicepluginv1beta1.Version, + Endpoint: path.Base(plugin.socket), + ResourceName: string(plugin.rm.Resource()), + Options: &kubeletdevicepluginv1beta1.DevicePluginOptions{ + GetPreferredAllocationAvailable: false, + }, + } + + _, err = client.Register(context.Background(), reqt) + if err != nil { + return err + } + return nil +} + +// GetDevicePluginOptions returns the values of the optional settings for this plugin +func (plugin *NvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *kubeletdevicepluginv1beta1.Empty) (*kubeletdevicepluginv1beta1.DevicePluginOptions, error) { + options := &kubeletdevicepluginv1beta1.DevicePluginOptions{ + GetPreferredAllocationAvailable: false, + } + return options, nil +} + +// ListAndWatch lists devices and update that list according to the health status +func (plugin *NvidiaDevicePlugin) ListAndWatch(e *kubeletdevicepluginv1beta1.Empty, s kubeletdevicepluginv1beta1.DevicePlugin_ListAndWatchServer) error { + s.Send(&kubeletdevicepluginv1beta1.ListAndWatchResponse{Devices: plugin.apiDevices()}) + + for { + select { + case <-plugin.stop: + return nil + case d := <-plugin.health: + // FIXME: there is no way to recover from the Unhealthy state. 
+ d.Health = kubeletdevicepluginv1beta1.Unhealthy + klog.Infof("'%s' device marked unhealthy: %s", plugin.rm.Resource(), d.ID) + s.Send(&kubeletdevicepluginv1beta1.ListAndWatchResponse{Devices: plugin.apiDevices()}) + } + } +} + +// GetPreferredAllocation returns the preferred allocation from the set of devices specified in the request +func (plugin *NvidiaDevicePlugin) GetPreferredAllocation(ctx context.Context, r *kubeletdevicepluginv1beta1.PreferredAllocationRequest) (*kubeletdevicepluginv1beta1.PreferredAllocationResponse, error) { + response := &kubeletdevicepluginv1beta1.PreferredAllocationResponse{} + /*for _, req := range r.ContainerRequests { + devices, err := plugin.rm.GetPreferredAllocation(req.AvailableDeviceIDs, req.MustIncludeDeviceIDs, int(req.AllocationSize)) + if err != nil { + return nil, fmt.Errorf("error getting list of preferred allocation devices: %v", err) + } + + resp := &kubeletdevicepluginv1beta1.ContainerPreferredAllocationResponse{ + DeviceIDs: devices, + } + + response.ContainerResponses = append(response.ContainerResponses, resp) + }*/ + return response, nil +} + +// Allocate which return list of devices. +func (plugin *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *kubeletdevicepluginv1beta1.AllocateRequest) (*kubeletdevicepluginv1beta1.AllocateResponse, error) { + klog.InfoS("Allocate", "request", reqs) + responses := kubeletdevicepluginv1beta1.AllocateResponse{} + nodename := os.Getenv(util.NodeNameEnvName) + current, err := util.GetPendingPod(ctx, nodename) + if err != nil { + nodelock.ReleaseNodeLock(nodename, NodeLockNvidia) + return &kubeletdevicepluginv1beta1.AllocateResponse{}, err + } + klog.V(5).Infof("allocate pod name is %s/%s, annotation is %+v", current.Namespace, current.Name, current.Annotations) + + for idx, req := range reqs.ContainerRequests { + // If the devices being allocated are replicas, then (conditionally) + // error out if more than one resource is being allocated. 
+ + if strings.Contains(req.DevicesIDs[0], "MIG") { + + if plugin.config.Sharing.TimeSlicing.FailRequestsGreaterThanOne && rm.AnnotatedIDs(req.DevicesIDs).AnyHasAnnotations() { + if len(req.DevicesIDs) > 1 { + return nil, fmt.Errorf("request for '%v: %v' too large: maximum request size for shared resources is 1", plugin.rm.Resource(), len(req.DevicesIDs)) + } + } + + for _, id := range req.DevicesIDs { + if !plugin.rm.Devices().Contains(id) { + return nil, fmt.Errorf("invalid allocation request for '%s': unknown device: %s", plugin.rm.Resource(), id) + } + } + + response, err := plugin.getAllocateResponse(req.DevicesIDs) + if err != nil { + return nil, fmt.Errorf("failed to get allocate response: %v", err) + } + responses.ContainerResponses = append(responses.ContainerResponses, response) + } else { + currentCtr, devreq, err := util.GetNextDeviceRequest(nvidia.NvidiaGPUDevice, *current) + klog.Infoln("deviceAllocateFromAnnotation=", devreq) + if err != nil { + device.PodAllocationFailed(nodename, current, NodeLockNvidia) + return &kubeletdevicepluginv1beta1.AllocateResponse{}, err + } + if len(devreq) != len(reqs.ContainerRequests[idx].DevicesIDs) { + device.PodAllocationFailed(nodename, current, NodeLockNvidia) + return &kubeletdevicepluginv1beta1.AllocateResponse{}, errors.New("device number not matched") + } + response, err := plugin.getAllocateResponse(util.GetContainerDeviceStrArray(devreq)) + if err != nil { + return nil, fmt.Errorf("failed to get allocate response: %v", err) + } + + err = util.EraseNextDeviceTypeFromAnnotation(nvidia.NvidiaGPUDevice, *current) + if err != nil { + device.PodAllocationFailed(nodename, current, NodeLockNvidia) + return &kubeletdevicepluginv1beta1.AllocateResponse{}, err + } + + for i, dev := range devreq { + limitKey := fmt.Sprintf("CUDA_DEVICE_MEMORY_LIMIT_%v", i) + response.Envs[limitKey] = fmt.Sprintf("%vm", dev.Usedmem) + + /*tmp := response.Envs["NVIDIA_VISIBLE_DEVICES"] + if i > 0 { + 
response.Envs["NVIDIA_VISIBLE_DEVICES"] = fmt.Sprintf("%v,%v", tmp, dev.UUID) + } else { + response.Envs["NVIDIA_VISIBLE_DEVICES"] = dev.UUID + }*/ + } + response.Envs["CUDA_DEVICE_SM_LIMIT"] = fmt.Sprint(devreq[0].Usedcores) + response.Envs["CUDA_DEVICE_MEMORY_SHARED_CACHE"] = fmt.Sprintf("%s/vgpu/%v.cache", hostHookPath, uuid.New().String()) + if *util.DeviceMemoryScaling > 1 { + response.Envs["CUDA_OVERSUBSCRIBE"] = "true" + } + if *util.DisableCoreLimit { + response.Envs[api.CoreLimitSwitch] = "disable" + } + cacheFileHostDirectory := fmt.Sprintf("%s/vgpu/containers/%s_%s", hostHookPath, current.UID, currentCtr.Name) + os.RemoveAll(cacheFileHostDirectory) + + os.MkdirAll(cacheFileHostDirectory, 0777) + os.Chmod(cacheFileHostDirectory, 0777) + os.MkdirAll("/tmp/vgpulock", 0777) + os.Chmod("/tmp/vgpulock", 0777) + response.Mounts = append(response.Mounts, + &kubeletdevicepluginv1beta1.Mount{ContainerPath: fmt.Sprintf("%s/vgpu/libvgpu.so", hostHookPath), + HostPath: hostHookPath + "/vgpu/libvgpu.so", + ReadOnly: true}, + &kubeletdevicepluginv1beta1.Mount{ContainerPath: fmt.Sprintf("%s/vgpu", hostHookPath), + HostPath: cacheFileHostDirectory, + ReadOnly: false}, + &kubeletdevicepluginv1beta1.Mount{ContainerPath: "/tmp/vgpulock", + HostPath: "/tmp/vgpulock", + ReadOnly: false}, + ) + found := false + for _, val := range currentCtr.Env { + if strings.Compare(val.Name, "CUDA_DISABLE_CONTROL") == 0 { + // if env existed but is set to false or can not be parsed, ignore + t, _ := strconv.ParseBool(val.Value) + if !t { + continue + } + // only env existed and set to true, we mark it "found" + found = true + break + } + } + if !found { + response.Mounts = append(response.Mounts, &kubeletdevicepluginv1beta1.Mount{ContainerPath: "/etc/ld.so.preload", + HostPath: hostHookPath + "/vgpu/ld.so.preload", + ReadOnly: true}, + ) + } + _, err = os.Stat(fmt.Sprintf("%s/vgpu/license", hostHookPath)) + if err == nil { + response.Mounts = append(response.Mounts, 
&kubeletdevicepluginv1beta1.Mount{ + ContainerPath: "/tmp/license", + HostPath: fmt.Sprintf("%s/vgpu/license", hostHookPath), + ReadOnly: true, + }) + response.Mounts = append(response.Mounts, &kubeletdevicepluginv1beta1.Mount{ + ContainerPath: "/usr/bin/vgpuvalidator", + HostPath: fmt.Sprintf("%s/vgpu/vgpuvalidator", hostHookPath), + ReadOnly: true, + }) + } + responses.ContainerResponses = append(responses.ContainerResponses, response) + } + } + klog.Infoln("Allocate Response", responses.ContainerResponses) + device.PodAllocationTrySuccess(nodename, nvidia.NvidiaGPUDevice, NodeLockNvidia, current) + return &responses, nil +} + +func (plugin *NvidiaDevicePlugin) getAllocateResponse(requestIds []string) (*kubeletdevicepluginv1beta1.ContainerAllocateResponse, error) { + deviceIDs := plugin.deviceIDsFromAnnotatedDeviceIDs(requestIds) + + responseID := uuid.New().String() + response, err := plugin.getAllocateResponseForCDI(responseID, deviceIDs) + if err != nil { + return nil, fmt.Errorf("failed to get allocate response for CDI: %v", err) + } + + response.Envs = plugin.apiEnvs(plugin.deviceListEnvvar, deviceIDs) + //if plugin.deviceListStrategies.Includes(spec.DeviceListStrategyVolumeMounts) || plugin.deviceListStrategies.Includes(spec.DeviceListStrategyEnvvar) { + // response.Envs = plugin.apiEnvs(plugin.deviceListEnvvar, deviceIDs) + //} + /* + if plugin.deviceListStrategies.Includes(spec.DeviceListStrategyVolumeMounts) { + response.Envs = plugin.apiEnvs(plugin.deviceListEnvvar, []string{deviceListAsVolumeMountsContainerPathRoot}) + response.Mounts = plugin.apiMounts(deviceIDs) + }*/ + if *plugin.config.Flags.Plugin.PassDeviceSpecs { + response.Devices = plugin.apiDeviceSpecs(*plugin.config.Flags.NvidiaDriverRoot, requestIds) + } + if *plugin.config.Flags.GDSEnabled { + response.Envs["NVIDIA_GDS"] = "enabled" + } + if *plugin.config.Flags.MOFEDEnabled { + response.Envs["NVIDIA_MOFED"] = "enabled" + } + + return &response, nil +} + +// getAllocateResponseForCDI 
returns the allocate response for the specified device IDs. +// This response contains the annotations required to trigger CDI injection in the container engine or nvidia-container-runtime. +func (plugin *NvidiaDevicePlugin) getAllocateResponseForCDI(responseID string, deviceIDs []string) (kubeletdevicepluginv1beta1.ContainerAllocateResponse, error) { + response := kubeletdevicepluginv1beta1.ContainerAllocateResponse{} + + if !plugin.cdiEnabled { + return response, nil + } + + var devices []string + for _, id := range deviceIDs { + devices = append(devices, plugin.cdiHandler.QualifiedName("gpu", id)) + } + + if *plugin.config.Flags.GDSEnabled { + devices = append(devices, plugin.cdiHandler.QualifiedName("gds", "all")) + } + if *plugin.config.Flags.MOFEDEnabled { + devices = append(devices, plugin.cdiHandler.QualifiedName("mofed", "all")) + } + + if len(devices) == 0 { + return response, nil + } + + if plugin.deviceListStrategies.Includes(spec.DeviceListStrategyCDIAnnotations) { + annotations, err := plugin.getCDIDeviceAnnotations(responseID, devices) + if err != nil { + return response, err + } + response.Annotations = annotations + } + + return response, nil +} + +func (plugin *NvidiaDevicePlugin) getCDIDeviceAnnotations(id string, devices []string) (map[string]string, error) { + annotations, err := cdiapi.UpdateAnnotations(map[string]string{}, "nvidia-device-plugin", id, devices) + if err != nil { + return nil, fmt.Errorf("failed to add CDI annotations: %v", err) + } + + if plugin.cdiAnnotationPrefix == spec.DefaultCDIAnnotationPrefix { + return annotations, nil + } + + // update annotations if a custom CDI prefix is configured + updatedAnnotations := make(map[string]string) + for k, v := range annotations { + newKey := plugin.cdiAnnotationPrefix + strings.TrimPrefix(k, spec.DefaultCDIAnnotationPrefix) + updatedAnnotations[newKey] = v + } + + return updatedAnnotations, nil +} + +// PreStartContainer is unimplemented for this plugin +func (plugin 
*NvidiaDevicePlugin) PreStartContainer(context.Context, *kubeletdevicepluginv1beta1.PreStartContainerRequest) (*kubeletdevicepluginv1beta1.PreStartContainerResponse, error) { + return &kubeletdevicepluginv1beta1.PreStartContainerResponse{}, nil +} + +// dial establishes the gRPC communication with the registered device plugin. +func (plugin *NvidiaDevicePlugin) dial(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) { + c, err := grpc.Dial(unixSocketPath, grpc.WithInsecure(), grpc.WithBlock(), + grpc.WithTimeout(timeout), + grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) { + return net.DialTimeout("unix", addr, timeout) + }), + ) + + if err != nil { + return nil, err + } + + return c, nil +} + +func (plugin *NvidiaDevicePlugin) deviceIDsFromAnnotatedDeviceIDs(ids []string) []string { + var deviceIDs []string + if *plugin.config.Flags.Plugin.DeviceIDStrategy == spec.DeviceIDStrategyUUID { + deviceIDs = rm.AnnotatedIDs(ids).GetIDs() + } + if *plugin.config.Flags.Plugin.DeviceIDStrategy == spec.DeviceIDStrategyIndex { + deviceIDs = plugin.rm.Devices().Subset(ids).GetIndices() + } + return deviceIDs +} + +func (plugin *NvidiaDevicePlugin) apiDevices() []*kubeletdevicepluginv1beta1.Device { + return plugin.rm.Devices().GetPluginDevices() +} + +func (plugin *NvidiaDevicePlugin) apiEnvs(envvar string, deviceIDs []string) map[string]string { + return map[string]string{ + envvar: strings.Join(deviceIDs, ","), + } +} + +/* +func (plugin *NvidiaDevicePlugin) apiMounts(deviceIDs []string) []*kubeletdevicepluginv1beta1.Mount { + var mounts []*kubeletdevicepluginv1beta1.Mount + + for _, id := range deviceIDs { + mount := &kubeletdevicepluginv1beta1.Mount{ + HostPath: deviceListAsVolumeMountsHostPath, + ContainerPath: filepath.Join(deviceListAsVolumeMountsContainerPathRoot, id), + } + mounts = append(mounts, mount) + } + + return mounts +}*/ + +func (plugin *NvidiaDevicePlugin) apiDeviceSpecs(driverRoot string, ids []string) 
[]*kubeletdevicepluginv1beta1.DeviceSpec { + optional := map[string]bool{ + "/dev/nvidiactl": true, + "/dev/nvidia-uvm": true, + "/dev/nvidia-uvm-tools": true, + "/dev/nvidia-modeset": true, + } + + paths := plugin.rm.GetDevicePaths(ids) + + var specs []*kubeletdevicepluginv1beta1.DeviceSpec + for _, p := range paths { + if optional[p] { + if _, err := os.Stat(p); err != nil { + continue + } + } + spec := &kubeletdevicepluginv1beta1.DeviceSpec{ + ContainerPath: p, + HostPath: filepath.Join(driverRoot, p), + Permissions: "rw", + } + specs = append(specs, spec) + } + + return specs +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/server_test.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/server_test.go new file mode 100644 index 000000000..152690fc5 --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/server_test.go @@ -0,0 +1,184 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package plugin + +import ( + "fmt" + "testing" + + v1 "github.com/NVIDIA/k8s-device-plugin/api/config/v1" + "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/cdi" + "github.com/Project-HAMi/HAMi/pkg/util" + "github.com/stretchr/testify/require" + kubeletdevicepluginv1beta1 "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" +) + +func TestCDIAllocateResponse(t *testing.T) { + testCases := []struct { + description string + deviceIds []string + deviceListStrategies []string + CDIPrefix string + CDIEnabled bool + GDSEnabled bool + MOFEDEnabled bool + expectedResponse kubeletdevicepluginv1beta1.ContainerAllocateResponse + }{ + { + description: "empty device list has empty response", + deviceListStrategies: []string{"cdi-annotations"}, + CDIPrefix: "cdi.k8s.io/", + CDIEnabled: true, + }, + { + description: "CDI disabled has empty response", + deviceIds: []string{"gpu0"}, + deviceListStrategies: []string{"cdi-annotations"}, + CDIPrefix: "cdi.k8s.io/", + CDIEnabled: false, + }, + { + description: "single device is added to annotations", + deviceIds: []string{"gpu0"}, + deviceListStrategies: []string{"cdi-annotations"}, + CDIPrefix: "cdi.k8s.io/", + CDIEnabled: true, + expectedResponse: kubeletdevicepluginv1beta1.ContainerAllocateResponse{ + Annotations: map[string]string{ + "cdi.k8s.io/nvidia-device-plugin_uuid": "nvidia.com/gpu=gpu0", + }, + }, + }, + { + description: "single device is added to annotations with custom prefix", + deviceIds: []string{"gpu0"}, + deviceListStrategies: []string{"cdi-annotations"}, + CDIPrefix: "custom.cdi.k8s.io/", + CDIEnabled: true, + expectedResponse: kubeletdevicepluginv1beta1.ContainerAllocateResponse{ + Annotations: map[string]string{ + "custom.cdi.k8s.io/nvidia-device-plugin_uuid": "nvidia.com/gpu=gpu0", + }, + }, + }, + { + description: "multiple devices are added to annotations", + deviceIds: []string{"gpu0", "gpu1"}, + deviceListStrategies: []string{"cdi-annotations"}, + CDIPrefix: "cdi.k8s.io/", + 
CDIEnabled: true, + expectedResponse: kubeletdevicepluginv1beta1.ContainerAllocateResponse{ + Annotations: map[string]string{ + "cdi.k8s.io/nvidia-device-plugin_uuid": "nvidia.com/gpu=gpu0,nvidia.com/gpu=gpu1", + }, + }, + }, + { + description: "multiple devices are added to annotations with custom prefix", + deviceIds: []string{"gpu0", "gpu1"}, + deviceListStrategies: []string{"cdi-annotations"}, + CDIPrefix: "custom.cdi.k8s.io/", + CDIEnabled: true, + expectedResponse: kubeletdevicepluginv1beta1.ContainerAllocateResponse{ + Annotations: map[string]string{ + "custom.cdi.k8s.io/nvidia-device-plugin_uuid": "nvidia.com/gpu=gpu0,nvidia.com/gpu=gpu1", + }, + }, + }, + { + description: "mofed devices are selected if configured", + deviceListStrategies: []string{"cdi-annotations"}, + CDIPrefix: "cdi.k8s.io/", + CDIEnabled: true, + MOFEDEnabled: true, + expectedResponse: kubeletdevicepluginv1beta1.ContainerAllocateResponse{ + Annotations: map[string]string{ + "cdi.k8s.io/nvidia-device-plugin_uuid": "nvidia.com/mofed=all", + }, + }, + }, + { + description: "gds devices are selected if configured", + deviceListStrategies: []string{"cdi-annotations"}, + CDIPrefix: "cdi.k8s.io/", + CDIEnabled: true, + GDSEnabled: true, + expectedResponse: kubeletdevicepluginv1beta1.ContainerAllocateResponse{ + Annotations: map[string]string{ + "cdi.k8s.io/nvidia-device-plugin_uuid": "nvidia.com/gds=all", + }, + }, + }, + { + description: "gds and mofed devices are included with device ids", + deviceIds: []string{"gpu0"}, + deviceListStrategies: []string{"cdi-annotations"}, + CDIPrefix: "cdi.k8s.io/", + CDIEnabled: true, + GDSEnabled: true, + MOFEDEnabled: true, + expectedResponse: kubeletdevicepluginv1beta1.ContainerAllocateResponse{ + Annotations: map[string]string{ + "cdi.k8s.io/nvidia-device-plugin_uuid": "nvidia.com/gpu=gpu0,nvidia.com/gds=all,nvidia.com/mofed=all", + }, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + deviceListStrategies, _ 
:= v1.NewDeviceListStrategies(tc.deviceListStrategies) + plugin := NvidiaDevicePlugin{ + config: &util.DeviceConfig{ + Config: &v1.Config{ + Flags: v1.Flags{ + CommandLineFlags: v1.CommandLineFlags{ + GDSEnabled: &tc.GDSEnabled, + MOFEDEnabled: &tc.MOFEDEnabled, + }, + }, + }, + }, + cdiHandler: &cdi.InterfaceMock{ + QualifiedNameFunc: func(c string, s string) string { + return "nvidia.com/" + c + "=" + s + }, + }, + cdiEnabled: tc.CDIEnabled, + deviceListStrategies: deviceListStrategies, + cdiAnnotationPrefix: tc.CDIPrefix, + } + + response, err := plugin.getAllocateResponseForCDI("uuid", tc.deviceIds) + + require.Nil(t, err) + require.EqualValues(t, &tc.expectedResponse, &response) + }) + } +} + +func Test_pathGeneration(t *testing.T) { + hostHookPath := "/usr/local/vgpu" + uid := "testuid" + cname := "testcname" + expected := "/usr/local/vgpu/containers/testuid_testcname" + result := fmt.Sprintf("%s/containers/%s_%s", hostHookPath, uid, cname) + + if expected != result { + t.Errorf("Expected %s, got %s", expected, result) + } +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/allocate.go b/pkg/device-plugin/nvidiadevice/nvinternal/rm/allocate.go new file mode 100644 index 000000000..6b1d8d2f1 --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/rm/allocate.go @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY Type, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package rm + +import ( + "fmt" + "sort" + + "github.com/NVIDIA/go-gpuallocator/gpuallocator" +) + +var alignedAllocationPolicy = gpuallocator.NewBestEffortPolicy() + +// getPreferredAllocation runs an allocation algorithm over the inputs. +// The algorithm chosen is based both on the incoming set of available devices and various config settings. +func (r *resourceManager) getPreferredAllocation(available, required []string, size int) ([]string, error) { + // If all of the available devices are full GPUs without replicas, then + // calculate an aligned allocation across those devices. + if r.Devices().AlignedAllocationSupported() && !AnnotatedIDs(available).AnyHasAnnotations() { + return r.alignedAlloc(available, required, size) + } + + // Otherwise, distribute them evenly across all replicated GPUs + return r.distributedAlloc(available, required, size) +} + +// alignedAlloc shells out to the alignedAllocationPolicy that is set in +// order to calculate the preferred allocation. +func (r *resourceManager) alignedAlloc(available, required []string, size int) ([]string, error) { + var devices []string + + availableDevices, err := gpuallocator.NewDevicesFrom(available) + if err != nil { + return nil, fmt.Errorf("unable to retrieve list of available devices: %v", err) + } + + requiredDevices, err := gpuallocator.NewDevicesFrom(required) + if err != nil { + return nil, fmt.Errorf("unable to retrieve list of required devices: %v", err) + } + + allocatedDevices := alignedAllocationPolicy.Allocate(availableDevices, requiredDevices, size) + + for _, device := range allocatedDevices { + devices = append(devices, device.UUID) + } + + return devices, nil +} + +// distributedAlloc returns a list of devices such that any replicated +// devices are distributed across all replicated GPUs equally. It takes into +// account already allocated replicas to ensure a proper balance across them. 
+func (r *resourceManager) distributedAlloc(available, required []string, size int) ([]string, error) { + // Get the set of candidate devices as the difference between available and required. + candidates := r.devices.Subset(available).Difference(r.devices.Subset(required)).GetIDs() + needed := size - len(required) + + if len(candidates) < needed { + return nil, fmt.Errorf("not enough available devices to satisfy allocation") + } + + // For each candidate device, build a mapping of (stripped) device ID to + // total / available replicas for that device. + replicas := make(map[string]*struct{ total, available int }) + for _, c := range candidates { + id := AnnotatedID(c).GetID() + if _, exists := replicas[id]; !exists { + replicas[id] = &struct{ total, available int }{} + } + replicas[id].available++ + } + for d := range r.devices { + id := AnnotatedID(d).GetID() + if _, exists := replicas[id]; !exists { + continue + } + replicas[id].total++ + } + + // Grab the set of 'needed' devices one-by-one from the candidates list. + // Before selecting each candidate, first sort the candidate list using the + // replicas map above. After sorting, the first element in the list will + // contain the device with the least difference between total and available + // replications (based on what's already been allocated). Add this device + // to the list of devices to allocate, remove it from the candidate list, + // down its available count in the replicas map, and repeat. 
+ var devices []string + for i := 0; i < needed; i++ { + sort.Slice(candidates, func(i, j int) bool { + iid := AnnotatedID(candidates[i]).GetID() + jid := AnnotatedID(candidates[j]).GetID() + idiff := replicas[iid].total - replicas[iid].available + jdiff := replicas[jid].total - replicas[jid].available + return idiff < jdiff + }) + id := AnnotatedID(candidates[0]).GetID() + replicas[id].available-- + devices = append(devices, candidates[0]) + candidates = candidates[1:] + } + + // Add the set of required devices to this list and return it. + devices = append(required, devices...) + + return devices, nil +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/device_map.go b/pkg/device-plugin/nvidiadevice/nvinternal/rm/device_map.go new file mode 100644 index 000000000..f33d80a8f --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/rm/device_map.go @@ -0,0 +1,318 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package rm + +import ( + "fmt" + + "github.com/Project-HAMi/HAMi/pkg/util" + + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/go-nvlib/pkg/nvml" + spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" +) + +type deviceMapBuilder struct { + device.Interface + config *util.DeviceConfig +} + +// DeviceMap stores a set of devices per resource name. 
+type DeviceMap map[spec.ResourceName]Devices + +// NewDeviceMap creates a device map for the specified NVML library and config. +func NewDeviceMap(nvmllib nvml.Interface, config *util.DeviceConfig) (DeviceMap, error) { + b := deviceMapBuilder{ + Interface: device.New(device.WithNvml(nvmllib)), + config: config, + } + return b.build() +} + +// build builds a map of resource names to devices. +func (b *deviceMapBuilder) build() (DeviceMap, error) { + devices, err := b.buildDeviceMapFromConfigResources() + if err != nil { + return nil, fmt.Errorf("error building device map from config.resources: %v", err) + } + devices, err = updateDeviceMapWithReplicas(b.config, devices) + if err != nil { + return nil, fmt.Errorf("error updating device map with replicas from config.sharing.timeSlicing.resources: %v", err) + } + return devices, nil +} + +// buildDeviceMapFromConfigResources builds a map of resource names to devices from spec.Config.Resources +func (b *deviceMapBuilder) buildDeviceMapFromConfigResources() (DeviceMap, error) { + deviceMap, err := b.buildGPUDeviceMap() + if err != nil { + return nil, fmt.Errorf("error building GPU device map: %v", err) + } + + if *b.config.Flags.MigStrategy == spec.MigStrategyNone { + return deviceMap, nil + } + + migDeviceMap, err := b.buildMigDeviceMap() + if err != nil { + return nil, fmt.Errorf("error building MIG device map: %v", err) + } + + var requireUniformMIGDevices bool + if *b.config.Flags.MigStrategy == spec.MigStrategySingle { + requireUniformMIGDevices = true + } + + err = b.assertAllMigDevicesAreValid(requireUniformMIGDevices) + if err != nil { + return nil, fmt.Errorf("invalid MIG configuration: %v", err) + } + + if requireUniformMIGDevices && !deviceMap.isEmpty() && !migDeviceMap.isEmpty() { + return nil, fmt.Errorf("all devices on the node must be configured with the same migEnabled value") + } + + deviceMap.merge(migDeviceMap) + + return deviceMap, nil +} + +// buildGPUDeviceMap builds a map of resource names to GPU 
devices
+func (b *deviceMapBuilder) buildGPUDeviceMap() (DeviceMap, error) {
+	devices := make(DeviceMap)
+	// Propagate the visitor error: a GPU that matches no resource pattern, or a
+	// failed NVML query, must surface to the caller instead of being dropped.
+	err := b.VisitDevices(func(i int, gpu device.Device) error {
+		name, ret := gpu.GetName()
+		if ret != nvml.SUCCESS {
+			return fmt.Errorf("error getting product name for GPU: %v", ret)
+		}
+		migEnabled, err := gpu.IsMigEnabled()
+		if err != nil {
+			return fmt.Errorf("error checking if MIG is enabled on GPU: %v", err)
+		}
+		if migEnabled && *b.config.Flags.MigStrategy != spec.MigStrategyNone {
+			return nil
+		}
+		for _, resource := range b.config.Resources.GPUs {
+			if resource.Pattern.Matches(name) {
+				index, info := newGPUDevice(i, gpu)
+				return devices.setEntry(resource.Name, index, info)
+			}
+		}
+		return fmt.Errorf("GPU name '%v' does not match any resource patterns", name)
+	})
+	return devices, err
+}
+
+// buildMigDeviceMap builds a map of resource names to MIG devices
+func (b *deviceMapBuilder) buildMigDeviceMap() (DeviceMap, error) {
+	devices := make(DeviceMap)
+	err := b.VisitMigDevices(func(i int, d device.Device, j int, mig device.MigDevice) error {
+		migProfile, err := mig.GetProfile()
+		if err != nil {
+			return fmt.Errorf("error getting MIG profile for MIG device at index '(%v, %v)': %v", i, j, err)
+		}
+		for _, resource := range b.config.Resources.MIGs {
+			if resource.Pattern.Matches(migProfile.String()) {
+				index, info := newMigDevice(i, j, mig)
+				return devices.setEntry(resource.Name, index, info)
+			}
+		}
+		return fmt.Errorf("MIG profile '%v' does not match any resource patterns", migProfile)
+	})
+	return devices, err
+}
+
+// assertAllMigDevicesAreValid ensures that each MIG-enabled device has at least one MIG device
+// associated with it. 
+func (b *deviceMapBuilder) assertAllMigDevicesAreValid(uniform bool) error { + err := b.VisitDevices(func(i int, d device.Device) error { + isMigEnabled, err := d.IsMigEnabled() + if err != nil { + return err + } + if !isMigEnabled { + return nil + } + migDevices, err := d.GetMigDevices() + if err != nil { + return err + } + if len(migDevices) == 0 { + i := 0 + return fmt.Errorf("device %v has an invalid MIG configuration", i) + } + return nil + }) + if err != nil { + return fmt.Errorf("at least one device with migEnabled=true was not configured correctly: %v", err) + } + + if !uniform { + return nil + } + + var previousAttributes *nvml.DeviceAttributes + return b.VisitMigDevices(func(i int, d device.Device, j int, m device.MigDevice) error { + attrs, ret := m.GetAttributes() + if ret != nvml.SUCCESS { + return fmt.Errorf("error getting device attributes: %v", ret) + } + if previousAttributes == nil { + previousAttributes = &attrs + } else if attrs != *previousAttributes { + return fmt.Errorf("more than one MIG device type present on node") + } + + return nil + }) +} + +// setEntry sets the DeviceMap entry for the specified resource. 
+func (d DeviceMap) setEntry(name spec.ResourceName, index string, info deviceInfo) error { + dev, err := BuildDevice(index, info) + if err != nil { + return fmt.Errorf("error building Device: %v", err) + } + d.insert(name, dev) + return nil +} + +// insert adds the specified device to the device map +func (d DeviceMap) insert(name spec.ResourceName, dev *Device) { + if d[name] == nil { + d[name] = make(Devices) + } + d[name][dev.ID] = dev +} + +// merge merges two devices maps +func (d DeviceMap) merge(o DeviceMap) { + for name, devices := range o { + for _, device := range devices { + d.insert(name, device) + } + } +} + +// isEmpty checks whether a device map is empty +func (d DeviceMap) isEmpty() bool { + for _, devices := range d { + if len(devices) > 0 { + return false + } + } + return true +} + +// getIDsOfDevicesToReplicate returns a list of dervice IDs that we want to replicate. +func (d DeviceMap) getIDsOfDevicesToReplicate(r *spec.ReplicatedResource) ([]string, error) { + devices, exists := d[r.Name] + if !exists { + return nil, nil + } + + // If all devices for this resource type are to be replicated. + if r.Devices.All { + return devices.GetIDs(), nil + } + + // If a specific number of devices for this resource type are to be replicated. + if r.Devices.Count > 0 { + if r.Devices.Count > len(devices) { + return nil, fmt.Errorf("requested %d devices to be replicated, but only %d devices available", r.Devices.Count, len(devices)) + } + return devices.GetIDs()[:r.Devices.Count], nil + } + + // If a specific set of devices for this resource type are to be replicated. 
+ if len(r.Devices.List) > 0 { + var ids []string + for _, ref := range r.Devices.List { + if ref.IsUUID() { + d := devices.GetByID(string(ref)) + if d == nil { + return nil, fmt.Errorf("no matching device with UUID: %v", ref) + } + ids = append(ids, d.ID) + } + if ref.IsGPUIndex() || ref.IsMigIndex() { + d := devices.GetByIndex(string(ref)) + if d == nil { + return nil, fmt.Errorf("no matching device at index: %v", ref) + } + ids = append(ids, d.ID) + } + } + return ids, nil + } + + return nil, fmt.Errorf("unexpected error") +} + +// updateDeviceMapWithReplicas returns an updated map of resource names to devices with replica information from spec.Config.Sharing.TimeSlicing.Resources +func updateDeviceMapWithReplicas(config *util.DeviceConfig, oDevices DeviceMap) (DeviceMap, error) { + devices := make(DeviceMap) + + // Begin by walking config.Sharing.TimeSlicing.Resources and building a map of just the resource names. + names := make(map[spec.ResourceName]bool) + for _, r := range config.Sharing.TimeSlicing.Resources { + names[r.Name] = true + } + + // Copy over all devices from oDevices without a resource reference in TimeSlicing.Resources. + for r, ds := range oDevices { + if !names[r] { + devices[r] = ds + } + } + + // Walk TimeSlicing.Resources and update devices in the device map as appropriate. + for _, r := range config.Sharing.TimeSlicing.Resources { + // Get the IDs of the devices we want to replicate from oDevices + ids, err := oDevices.getIDsOfDevicesToReplicate(&r) + if err != nil { + return nil, fmt.Errorf("unable to get IDs of devices to replicate for '%v' resource: %v", r.Name, err) + } + // Skip any resources not matched in oDevices + if len(ids) == 0 { + continue + } + + // Add any devices we don't want replicated directly into the device map. + for _, d := range oDevices[r.Name].Difference(oDevices[r.Name].Subset(ids)) { + devices.insert(r.Name, d) + } + + // Create replicated devices add them to the device map. 
+ // Rename the resource for replicated devices as requested. + name := r.Name + if r.Rename != "" { + name = r.Rename + } + for _, id := range ids { + for i := 0; i < r.Replicas; i++ { + annotatedID := string(NewAnnotatedID(id, i)) + replicatedDevice := *(oDevices[r.Name][id]) + replicatedDevice.ID = annotatedID + devices.insert(name, &replicatedDevice) + } + } + } + + return devices, nil +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/device_map_test.go b/pkg/device-plugin/nvidiadevice/nvinternal/rm/device_map_test.go new file mode 100644 index 000000000..29630b2dc --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/rm/device_map_test.go @@ -0,0 +1,108 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package rm + +import ( + "testing" + + spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" + "github.com/stretchr/testify/require" + kubeletdevicepluginv1beta1 "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" +) + +func TestDeviceMapInsert(t *testing.T) { + device0 := Device{Device: kubeletdevicepluginv1beta1.Device{ID: "0"}} + device0withIndex := Device{Device: kubeletdevicepluginv1beta1.Device{ID: "0"}, Index: "index"} + device1 := Device{Device: kubeletdevicepluginv1beta1.Device{ID: "1"}} + + testCases := []struct { + description string + deviceMap DeviceMap + key string + value *Device + expectedDeviceMap DeviceMap + }{ + { + description: "insert into empty map", + deviceMap: make(DeviceMap), + key: "resource", + value: &device0, + expectedDeviceMap: DeviceMap{ + "resource": Devices{ + "0": &device0, + }, + }, + }, + { + description: "add to existing resource", + deviceMap: DeviceMap{ + "resource": Devices{ + "0": &device0, + }, + }, + key: "resource", + value: &device1, + expectedDeviceMap: DeviceMap{ + "resource": Devices{ + "0": &device0, + "1": &device1, + }, + }, + }, + { + description: "add new resource", + deviceMap: DeviceMap{ + "resource": Devices{ + "0": &device0, + }, + }, + key: "resource1", + value: &device0, + expectedDeviceMap: DeviceMap{ + "resource": Devices{ + "0": &device0, + }, + "resource1": Devices{ + "0": &device0, + }, + }, + }, + { + description: "overwrite existing device", + deviceMap: DeviceMap{ + "resource": Devices{ + "0": &device0, + }, + }, + key: "resource", + value: &device0withIndex, + expectedDeviceMap: DeviceMap{ + "resource": Devices{ + "0": &device0withIndex, + }, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + tc.deviceMap.insert(spec.ResourceName(tc.key), tc.value) + + require.EqualValues(t, tc.expectedDeviceMap, tc.deviceMap) + }) + } +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/devices.go b/pkg/device-plugin/nvidiadevice/nvinternal/rm/devices.go 
new file mode 100644 index 000000000..108b9b8b0 --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/rm/devices.go @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY Type, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package rm + +import ( + "fmt" + "strconv" + "strings" + + "github.com/Project-HAMi/HAMi/pkg/util" + + kubeletdevicepluginv1beta1 "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" +) + +// Device wraps kubeletdevicepluginv1beta1.Device with extra metadata and functions. +type Device struct { + kubeletdevicepluginv1beta1.Device + Paths []string + Index string +} + +// deviceInfo defines the information the required to construct a Device +type deviceInfo interface { + GetUUID() (string, error) + GetPaths() ([]string, error) + GetNumaNode() (bool, int, error) +} + +// Devices wraps a map[string]*Device with some functions. +type Devices map[string]*Device + +// AnnotatedID represents an ID with a replica number embedded in it. +type AnnotatedID string + +// AnnotatedIDs can be used to treat a []string as a []AnnotatedID. 
+type AnnotatedIDs []string + +// BuildDevice builds an rm.Device with the specified index and deviceInfo +func BuildDevice(index string, d deviceInfo) (*Device, error) { + uuid, err := d.GetUUID() + if err != nil { + return nil, fmt.Errorf("error getting UUID device: %v", err) + } + + paths, err := d.GetPaths() + if err != nil { + return nil, fmt.Errorf("error getting device paths: %v", err) + } + + hasNuma, numa, err := d.GetNumaNode() + if err != nil { + return nil, fmt.Errorf("error getting device NUMA node: %v", err) + } + + dev := Device{} + dev.ID = uuid + dev.Index = index + dev.Paths = paths + dev.Health = kubeletdevicepluginv1beta1.Healthy + if hasNuma { + dev.Topology = &kubeletdevicepluginv1beta1.TopologyInfo{ + Nodes: []*kubeletdevicepluginv1beta1.NUMANode{ + { + ID: int64(numa), + }, + }, + } + } + + return &dev, nil +} + +// Contains checks if Devices contains devices matching all ids. +func (ds Devices) Contains(ids ...string) bool { + for _, id := range ids { + if _, exists := ds[id]; !exists { + return false + } + } + return true +} + +// GetByID returns a reference to the device matching the specified ID (nil otherwise). +func (ds Devices) GetByID(id string) *Device { + return ds[id] +} + +// GetByIndex returns a reference to the device matching the specified Index (nil otherwise). +func (ds Devices) GetByIndex(index string) *Device { + for _, d := range ds { + if d.Index == index { + return d + } + } + return nil +} + +// Subset returns the subset of devices in Devices matching the provided ids. +// If any id in ids is not in Devices, then the subset that did match will be returned. +func (ds Devices) Subset(ids []string) Devices { + res := make(Devices) + for _, id := range ids { + if ds.Contains(id) { + res[id] = ds[id] + } + } + return res +} + +// Difference returns the set of devices contained in ds but not in ods. 
+func (ds Devices) Difference(ods Devices) Devices { + res := make(Devices) + for id := range ds { + if !ods.Contains(id) { + res[id] = ds[id] + } + } + return res +} + +// GetIDs returns the ids from all devices in the Devices +func (ds Devices) GetIDs() []string { + var res []string + for _, d := range ds { + res = append(res, d.ID) + } + return res +} + +// GetPluginDevices returns the plugin Devices from all devices in the Devices +func (ds Devices) GetPluginDevices() []*kubeletdevicepluginv1beta1.Device { + var res []*kubeletdevicepluginv1beta1.Device + + if !strings.Contains(ds.GetIDs()[0], "MIG") { + for _, dev := range ds { + for i := uint(0); i < *util.DeviceSplitCount; i++ { + id := fmt.Sprintf("%v-%v", dev.ID, i) + res = append(res, &kubeletdevicepluginv1beta1.Device{ + ID: id, + Health: dev.Health, + Topology: nil, + }) + } + } + } else { + for _, d := range ds { + res = append(res, &d.Device) + } + + } + + return res +} + +// GetIndices returns the Indices from all devices in the Devices +func (ds Devices) GetIndices() []string { + var res []string + for _, d := range ds { + res = append(res, d.Index) + } + return res +} + +// GetPaths returns the Paths from all devices in the Devices +func (ds Devices) GetPaths() []string { + var res []string + for _, d := range ds { + res = append(res, d.Paths...) + } + return res +} + +// AlignedAllocationSupported checks whether all devices support an alligned allocation +func (ds Devices) AlignedAllocationSupported() bool { + for _, d := range ds { + if !d.AlignedAllocationSupported() { + return false + } + } + return true +} + +// AlignedAllocationSupported checks whether the device supports an alligned allocation +func (d Device) AlignedAllocationSupported() bool { + if d.IsMigDevice() { + return false + } + + for _, p := range d.Paths { + if p == "/dev/dxg" { + return false + } + } + + return true +} + +// IsMigDevice returns checks whether d is a MIG device or not. 
+func (d Device) IsMigDevice() bool { + return strings.Contains(d.Index, ":") +} + +// GetUUID returns the UUID for the device from the annotated ID. +func (d Device) GetUUID() string { + return AnnotatedID(d.ID).GetID() +} + +// NewAnnotatedID creates a new AnnotatedID from an ID and a replica number. +func NewAnnotatedID(id string, replica int) AnnotatedID { + return AnnotatedID(fmt.Sprintf("%s::%d", id, replica)) +} + +// HasAnnotations checks if an AnnotatedID has any annotations or not. +func (r AnnotatedID) HasAnnotations() bool { + split := strings.SplitN(string(r), "::", 2) + if len(split) != 2 { + return false + } + return true +} + +// Split splits a AnnotatedID into its ID and replica number parts. +func (r AnnotatedID) Split() (string, int) { + split := strings.SplitN(string(r), "::", 2) + if len(split) != 2 { + return string(r), 0 + } + replica, _ := strconv.ParseInt(split[1], 10, 0) + return split[0], int(replica) +} + +// GetID returns just the ID part of the replicated ID +func (r AnnotatedID) GetID() string { + id, _ := r.Split() + return id +} + +// AnyHasAnnotations checks if any ID has annotations or not. +func (rs AnnotatedIDs) AnyHasAnnotations() bool { + for _, r := range rs { + if AnnotatedID(r).HasAnnotations() { + return true + } + } + return false +} + +// GetIDs returns just the ID parts of the annotated IDs as a []string +func (rs AnnotatedIDs) GetIDs() []string { + res := make([]string, len(rs)) + for i, r := range rs { + res[i] = AnnotatedID(r).GetID() + } + return res +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/health.go b/pkg/device-plugin/nvidiadevice/nvinternal/rm/health.go new file mode 100644 index 000000000..65bd2a88e --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/rm/health.go @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY Type, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package rm + +import ( + "fmt" + "os" + "strconv" + "strings" + + "github.com/NVIDIA/go-nvlib/pkg/nvml" + "k8s.io/klog/v2" +) + +const ( + // envDisableHealthChecks defines the environment variable that is checked to determine whether healthchecks + // should be disabled. If this envvar is set to "all" or contains the string "xids", healthchecks are + // disabled entirely. If set, the envvar is treated as a comma-separated list of Xids to ignore. Note that + // this is in addition to the Application errors that are already ignored. + envDisableHealthChecks = "DP_DISABLE_HEALTHCHECKS" + allHealthChecks = "xids" + + // maxSuccessiveEventErrorCount sets the number of errors waiting for events before marking all devices as unhealthy. 
+ maxSuccessiveEventErrorCount = 3 +) + +// CheckHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices +func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devices, unhealthy chan<- *Device) error { + disableHealthChecks := strings.ToLower(os.Getenv(envDisableHealthChecks)) + if disableHealthChecks == "all" { + disableHealthChecks = allHealthChecks + } + if strings.Contains(disableHealthChecks, "xids") { + return nil + } + + ret := r.nvml.Init() + if ret != nvml.SUCCESS { + if *r.config.Flags.FailOnInitError { + return fmt.Errorf("failed to initialize NVML: %v", ret) + } + return nil + } + defer func() { + ret := r.nvml.Shutdown() + if ret != nvml.SUCCESS { + klog.Infof("Error shutting down NVML: %v", ret) + } + }() + + // FIXME: formalize the full list and document it. + // http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4 + // Application errors: the GPU should still be healthy + applicationErrorXids := []uint64{ + 13, // Graphics Engine Exception + 31, // GPU memory page fault + 43, // GPU stopped processing + 45, // Preemptive cleanup, due to previous errors + 68, // Video processor exception + } + + skippedXids := make(map[uint64]bool) + for _, id := range applicationErrorXids { + skippedXids[id] = true + } + + for _, additionalXid := range getAdditionalXids(disableHealthChecks) { + skippedXids[additionalXid] = true + } + + eventSet, ret := r.nvml.EventSetCreate() + if ret != nvml.SUCCESS { + return fmt.Errorf("failed to create event set: %v", ret) + } + defer eventSet.Free() + + parentToDeviceMap := make(map[string]*Device) + deviceIDToGiMap := make(map[string]int) + deviceIDToCiMap := make(map[string]int) + + eventMask := uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError) + for _, d := range devices { + uuid, gi, ci, err := r.getDevicePlacement(d) + if err != nil { + klog.Warningf("Could not determine device 
placement for %v: %v; Marking it unhealthy.", d.ID, err) + unhealthy <- d + continue + } + deviceIDToGiMap[d.ID] = gi + deviceIDToCiMap[d.ID] = ci + parentToDeviceMap[uuid] = d + + gpu, ret := r.nvml.DeviceGetHandleByUUID(uuid) + if ret != nvml.SUCCESS { + klog.Infof("unable to get device handle from UUID: %v; marking it as unhealthy", ret) + unhealthy <- d + continue + } + + supportedEvents, ret := gpu.GetSupportedEventTypes() + if ret != nvml.SUCCESS { + klog.Infof("Unable to determine the supported events for %v: %v; marking it as unhealthy", d.ID, ret) + unhealthy <- d + continue + } + + ret = gpu.RegisterEvents(eventMask&supportedEvents, eventSet) + if ret == nvml.ERROR_NOT_SUPPORTED { + klog.Warningf("Device %v is too old to support healthchecking.", d.ID) + } + if ret != nvml.SUCCESS { + klog.Infof("Marking device %v as unhealthy: %v", d.ID, ret) + unhealthy <- d + } + } + + for { + select { + case <-stop: + return nil + default: + } + + e, ret := eventSet.Wait(5000) + if ret == nvml.ERROR_TIMEOUT { + continue + } + if ret != nvml.SUCCESS { + klog.Infof("Error waiting for event: %v; Marking all devices as unhealthy", ret) + for _, d := range devices { + unhealthy <- d + } + continue + } + + if e.EventType != nvml.EventTypeXidCriticalError { + klog.Infof("Skipping non-nvmlEventTypeXidCriticalError event: %+v", e) + continue + } + + if skippedXids[e.EventData] { + klog.Infof("Skipping event %+v", e) + continue + } + + klog.Infof("Processing event %+v", e) + eventUUID, ret := e.Device.GetUUID() + if ret != nvml.SUCCESS { + // If we cannot reliably determine the device UUID, we mark all devices as unhealthy. 
+ klog.Infof("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy.", e, ret) + for _, d := range devices { + unhealthy <- d + } + continue + } + + d, exists := parentToDeviceMap[eventUUID] + if !exists { + klog.Infof("Ignoring event for unexpected device: %v", eventUUID) + continue + } + + if d.IsMigDevice() && e.GpuInstanceId != 0xFFFFFFFF && e.ComputeInstanceId != 0xFFFFFFFF { + gi := deviceIDToGiMap[d.ID] + ci := deviceIDToCiMap[d.ID] + if !(uint32(gi) == e.GpuInstanceId && uint32(ci) == e.ComputeInstanceId) { + continue + } + klog.Infof("Event for mig device %v (gi=%v, ci=%v)", d.ID, gi, ci) + } + + klog.Infof("XidCriticalError: Xid=%d on Device=%s; marking device as unhealthy.", e.EventData, d.ID) + unhealthy <- d + } +} + +// getAdditionalXids returns a list of additional Xids to skip from the specified string. +// The input is treaded as a comma-separated string and all valid uint64 values are considered as Xid values. Invalid values +// are ignored. +func getAdditionalXids(input string) []uint64 { + if input == "" { + return nil + } + + var additionalXids []uint64 + for _, additionalXid := range strings.Split(input, ",") { + trimmed := strings.TrimSpace(additionalXid) + if trimmed == "" { + continue + } + xid, err := strconv.ParseUint(trimmed, 10, 64) + if err != nil { + klog.Infof("Ignoring malformed Xid value %v: %v", trimmed, err) + continue + } + additionalXids = append(additionalXids, xid) + } + + return additionalXids +} + +// getDevicePlacement returns the placement of the specified device. +// For a MIG device the placement is defined by the 3-tuple +// For a full device the returned 3-tuple is the device's uuid and 0xFFFFFFFF for the other two elements. 
+func (r *nvmlResourceManager) getDevicePlacement(d *Device) (string, int, int, error) { + if !d.IsMigDevice() { + return d.GetUUID(), 0xFFFFFFFF, 0xFFFFFFFF, nil + } + return r.getMigDeviceParts(d) +} + +// getMigDeviceParts returns the parent GI and CI ids of the MIG device. +func (r *nvmlResourceManager) getMigDeviceParts(d *Device) (string, int, int, error) { + if !d.IsMigDevice() { + return "", 0, 0, fmt.Errorf("cannot get GI and CI of full device") + } + + uuid := d.GetUUID() + // For older driver versions, the call to DeviceGetHandleByUUID will fail for MIG devices. + mig, ret := r.nvml.DeviceGetHandleByUUID(uuid) + if ret == nvml.SUCCESS { + parentHandle, ret := mig.GetDeviceHandleFromMigDeviceHandle() + if ret != nvml.SUCCESS { + return "", 0, 0, fmt.Errorf("failed to get parent device handle: %v", ret) + } + + parentUUID, ret := parentHandle.GetUUID() + if ret != nvml.SUCCESS { + return "", 0, 0, fmt.Errorf("failed to get parent uuid: %v", ret) + } + gi, ret := mig.GetGpuInstanceId() + if ret != nvml.SUCCESS { + return "", 0, 0, fmt.Errorf("failed to get GPU Instance ID: %v", ret) + } + + ci, ret := mig.GetComputeInstanceId() + if ret != nvml.SUCCESS { + return "", 0, 0, fmt.Errorf("failed to get Compute Instance ID: %v", ret) + } + return parentUUID, gi, ci, nil + } + return parseMigDeviceUUID(uuid) +} + +// parseMigDeviceUUID splits the MIG device UUID into the parent device UUID and ci and gi +func parseMigDeviceUUID(mig string) (string, int, int, error) { + tokens := strings.SplitN(mig, "-", 2) + if len(tokens) != 2 || tokens[0] != "MIG" { + return "", 0, 0, fmt.Errorf("unable to parse UUID as MIG device") + } + + tokens = strings.SplitN(tokens[1], "/", 3) + if len(tokens) != 3 || !strings.HasPrefix(tokens[0], "GPU-") { + return "", 0, 0, fmt.Errorf("unable to parse UUID as MIG device") + } + + gi, err := strconv.Atoi(tokens[1]) + if err != nil { + return "", 0, 0, fmt.Errorf("unable to parse UUID as MIG device") + } + + ci, err := 
strconv.Atoi(tokens[2]) + if err != nil { + return "", 0, 0, fmt.Errorf("unable to parse UUID as MIG device") + } + + return tokens[0], gi, ci, nil +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/health_test.go b/pkg/device-plugin/nvidiadevice/nvinternal/rm/health_test.go new file mode 100644 index 000000000..edc8982db --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/rm/health_test.go @@ -0,0 +1,84 @@ +/** +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package rm + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestGetAdditionalXids(t *testing.T) { + testCases := []struct { + description string + input string + expected []uint64 + }{ + { + description: "Empty input", + }, + { + description: "Only comma", + input: ",", + }, + { + description: "Non-integer input", + input: "not-an-int", + }, + { + description: "Single integer", + input: "68", + expected: []uint64{68}, + }, + { + description: "Negative integer", + input: "-68", + }, + { + description: "Single integer with trailing spaces", + input: "68 ", + expected: []uint64{68}, + }, + { + description: "Single integer followed by comma without trailing number", + input: "68,", + expected: []uint64{68}, + }, + { + description: "Comma without preceding number followed by single integer", + input: ",68", + expected: []uint64{68}, + }, + { + description: "Two comma-separated integers", + input: "68,67", + expected: []uint64{68, 67}, + }, + { + description: "Two integers separated by non-integer", + input: "68,not-an-int,67", + expected: []uint64{68, 67}, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + xids := getAdditionalXids(tc.input) + require.EqualValues(t, tc.expected, xids) + }) + } +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/helper.go b/pkg/device-plugin/nvidiadevice/nvinternal/rm/helper.go new file mode 100644 index 000000000..0b5e4e878 --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/rm/helper.go @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY Type, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package rm + +// int8Slice wraps an []int8 with more functions. +type int8Slice []int8 + +// String turns a nil terminated int8Slice into a string +func (s int8Slice) String() string { + var b []byte + for _, c := range s { + if c == 0 { + break + } + b = append(b, byte(c)) + } + return string(b) +} + +// uintPtr returns a *uint from a uint32 +func uintPtr(c uint32) *uint { + i := uint(c) + return &i +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_devices.go b/pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_devices.go new file mode 100644 index 000000000..4ffbeb244 --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_devices.go @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY Type, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package rm + +import ( + "bytes" + "fmt" + "os" + "strconv" + "strings" + + "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/mig" + + "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" + "github.com/NVIDIA/go-nvlib/pkg/nvml" +) + +const ( + nvidiaProcDriverPath = "/proc/driver/nvidia" + nvidiaCapabilitiesPath = nvidiaProcDriverPath + "/capabilities" +) + +// nvmlDevice wraps an nvml.Device with more functions. +type nvmlDevice struct { + nvml.Device +} + +// nvmlMigDevice allows for specific functions of nvmlDevice to be overridden. +type nvmlMigDevice nvmlDevice + +var _ deviceInfo = (*nvmlDevice)(nil) +var _ deviceInfo = (*nvmlMigDevice)(nil) + +func newGPUDevice(i int, gpu nvml.Device) (string, deviceInfo) { + index := fmt.Sprintf("%v", i) + isWsl, _ := info.New().HasDXCore() + if isWsl { + return index, wslDevice{gpu} + } + + return index, nvmlDevice{gpu} +} + +func newMigDevice(i int, j int, mig nvml.Device) (string, nvmlMigDevice) { + return fmt.Sprintf("%v:%v", i, j), nvmlMigDevice{mig} +} + +// GetUUID returns the UUID of the device +func (d nvmlDevice) GetUUID() (string, error) { + uuid, ret := d.Device.GetUUID() + if ret != nvml.SUCCESS { + return "", ret + } + return uuid, nil +} + +// GetUUID returns the UUID of the device +func (d nvmlMigDevice) GetUUID() (string, error) { + return nvmlDevice(d).GetUUID() +} + +// GetPaths returns the paths for a GPU device +func (d nvmlDevice) GetPaths() ([]string, error) { + minor, ret := d.GetMinorNumber() + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("error getting GPU device minor number: %v", ret) + } + path := fmt.Sprintf("/dev/nvidia%d", minor) + + return []string{path}, nil +} + +// GetPaths returns the paths for a MIG device +func (d nvmlMigDevice) GetPaths() ([]string, error) { + capDevicePaths, err := mig.GetMigCapabilityDevicePaths() + if err != nil { + return nil, fmt.Errorf("error getting MIG capability device paths: %v", err) + } + + gi, ret := d.GetGpuInstanceId() + if ret 
!= nvml.SUCCESS {
+		return nil, fmt.Errorf("error getting GPU Instance ID: %v", ret)
+	}
+
+	ci, ret := d.GetComputeInstanceId()
+	if ret != nvml.SUCCESS {
+		return nil, fmt.Errorf("error getting Compute Instance ID: %v", ret)
+	}
+
+	parent, ret := d.GetDeviceHandleFromMigDeviceHandle()
+	if ret != nvml.SUCCESS {
+		return nil, fmt.Errorf("error getting parent device: %v", ret)
+	}
+	minor, ret := parent.GetMinorNumber()
+	if ret != nvml.SUCCESS {
+		return nil, fmt.Errorf("error getting GPU device minor number: %v", ret)
+	}
+	parentPath := fmt.Sprintf("/dev/nvidia%d", minor)
+
+	giCapPath := fmt.Sprintf(nvidiaCapabilitiesPath+"/gpu%d/mig/gi%d/access", minor, gi)
+	if _, exists := capDevicePaths[giCapPath]; !exists {
+		return nil, fmt.Errorf("missing MIG GPU instance capability path: %v", giCapPath)
+	}
+
+	ciCapPath := fmt.Sprintf(nvidiaCapabilitiesPath+"/gpu%d/mig/gi%d/ci%d/access", minor, gi, ci)
+	if _, exists := capDevicePaths[ciCapPath]; !exists {
+		// Bug fix: this error previously reported giCapPath, i.e. the GPU-instance
+		// path that DOES exist, instead of the compute-instance path that is missing.
+		return nil, fmt.Errorf("missing MIG compute instance capability path: %v", ciCapPath)
+	}
+
+	devicePaths := []string{
+		parentPath,
+		capDevicePaths[giCapPath],
+		capDevicePaths[ciCapPath],
+	}
+
+	return devicePaths, nil
+}
+
+// GetNumaNode returns the NUMA node associated with the GPU device
+func (d nvmlDevice) GetNumaNode() (bool, int, error) {
+	pciInfo, ret := d.GetPciInfo()
+	if ret != nvml.SUCCESS {
+		return false, 0, fmt.Errorf("error getting PCI Bus Info of device: %v", ret)
+	}
+
+	// Discard leading zeros. 
+	busID := strings.ToLower(strings.TrimPrefix(int8Slice(pciInfo.BusId[:]).String(), "0000"))
+
+	b, err := os.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", busID))
+	if err != nil {
+		// No numa_node entry for this device: NUMA affinity is simply unknown, not an error.
+		return false, 0, nil
+	}
+
+	node, err := strconv.Atoi(string(bytes.TrimSpace(b)))
+	if err != nil {
+		// Bug fix: error message previously read "eror parsing ...".
+		return false, 0, fmt.Errorf("error parsing value for NUMA node: %v", err)
+	}
+
+	if node < 0 {
+		// Kernel reports -1 when the device has no NUMA affinity.
+		return false, 0, nil
+	}
+
+	return true, node, nil
+}
+
+// GetNumaNode for a MIG device is the NUMA node of the parent device.
+func (d nvmlMigDevice) GetNumaNode() (bool, int, error) {
+	parent, ret := d.GetDeviceHandleFromMigDeviceHandle()
+	if ret != nvml.SUCCESS {
+		return false, 0, fmt.Errorf("error getting parent GPU device from MIG device: %v", ret)
+	}
+
+	return nvmlDevice{parent}.GetNumaNode()
+}
diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_manager.go b/pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_manager.go
new file mode 100644
index 000000000..18977ad14
--- /dev/null
+++ b/pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_manager.go
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. 
+ */ + +package rm + +import ( + "fmt" + + "github.com/Project-HAMi/HAMi/pkg/util" + + "github.com/NVIDIA/go-nvlib/pkg/nvml" + "k8s.io/klog/v2" +) + +type nvmlResourceManager struct { + resourceManager + nvml nvml.Interface +} + +var _ ResourceManager = (*nvmlResourceManager)(nil) + +// NewNVMLResourceManagers returns a set of ResourceManagers, one for each NVML resource in 'config'. +func NewNVMLResourceManagers(nvmllib nvml.Interface, config *util.DeviceConfig) ([]ResourceManager, error) { + ret := nvmllib.Init() + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("failed to initialize NVML: %v", ret) + } + defer func() { + ret := nvmllib.Shutdown() + if ret != nvml.SUCCESS { + klog.Infof("Error shutting down NVML: %v", ret) + } + }() + + deviceMap, err := NewDeviceMap(nvmllib, config) + if err != nil { + return nil, fmt.Errorf("error building device map: %v", err) + } + + var rms []ResourceManager + for resourceName, devices := range deviceMap { + if len(devices) == 0 { + continue + } + r := &nvmlResourceManager{ + resourceManager: resourceManager{ + config: config, + resource: resourceName, + devices: devices, + }, + nvml: nvmllib, + } + rms = append(rms, r) + } + + return rms, nil +} + +// GetPreferredAllocation runs an allocation algorithm over the inputs. +// The algorithm chosen is based both on the incoming set of available devices and various config settings. 
+func (r *nvmlResourceManager) GetPreferredAllocation(available, required []string, size int) ([]string, error) { + return r.getPreferredAllocation(available, required, size) +} + +// GetDevicePaths returns the required and optional device nodes for the requested resources +func (r *nvmlResourceManager) GetDevicePaths(ids []string) []string { + paths := []string{ + "/dev/nvidiactl", + "/dev/nvidia-uvm", + "/dev/nvidia-uvm-tools", + "/dev/nvidia-modeset", + } + + for _, p := range r.Devices().Subset(ids).GetPaths() { + paths = append(paths, p) + } + + return paths +} + +// CheckHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices +func (r *nvmlResourceManager) CheckHealth(stop <-chan interface{}, unhealthy chan<- *Device) error { + return r.checkHealth(stop, r.devices, unhealthy) +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/rm.go b/pkg/device-plugin/nvidiadevice/nvinternal/rm/rm.go new file mode 100644 index 000000000..afc907d68 --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/rm/rm.go @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY Type, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package rm + +import ( + "fmt" + "strings" + + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" + "github.com/NVIDIA/go-nvlib/pkg/nvml" + spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" + "github.com/Project-HAMi/HAMi/pkg/util" + "k8s.io/klog/v2" +) + +// resourceManager forms the base type for specific resource manager implementations +type resourceManager struct { + config *util.DeviceConfig + resource spec.ResourceName + devices Devices +} + +// ResourceManager provides an interface for listing a set of Devices and checking health on them +type ResourceManager interface { + Resource() spec.ResourceName + Devices() Devices + GetDevicePaths([]string) []string + GetPreferredAllocation(available, required []string, size int) ([]string, error) + CheckHealth(stop <-chan interface{}, unhealthy chan<- *Device) error +} + +// NewResourceManagers returns a []ResourceManager, one for each resource in 'config'. +func NewResourceManagers(nvmllib nvml.Interface, config *util.DeviceConfig) ([]ResourceManager, error) { + // logWithReason logs the output of the has* / is* checks from the info.Interface + logWithReason := func(f func() (bool, string), tag string) bool { + is, reason := f() + if !is { + tag = "non-" + tag + } + klog.Infof("Detected %v platform: %v", tag, reason) + return is + } + + infolib := info.New() + + hasNVML := logWithReason(infolib.HasNvml, "NVML") + isTegra := logWithReason(infolib.IsTegraSystem, "Tegra") + + if !hasNVML && !isTegra { + klog.Error("Incompatible platform detected") + klog.Error("If this is a GPU node, did you configure the NVIDIA Container Toolkit?") + klog.Error("You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites") + klog.Error("You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start") + klog.Error("If this is not a GPU node, you should set up a toleration or nodeSelector to only deploy this 
plugin on GPU nodes")
+		if *config.Flags.FailOnInitError {
+			return nil, fmt.Errorf("platform detection failed")
+		}
+		return nil, nil
+	}
+
+	// The NVIDIA container stack does not yet support the use of integrated AND discrete GPUs on the same node.
+	if hasNVML && isTegra {
+		klog.Warning("Disabling Tegra-based resources on NVML system")
+		isTegra = false
+	}
+
+	var resourceManagers []ResourceManager
+
+	if hasNVML {
+		nvmlManagers, err := NewNVMLResourceManagers(nvmllib, config)
+		if err != nil {
+			return nil, fmt.Errorf("failed to construct NVML resource managers: %v", err)
+		}
+		resourceManagers = append(resourceManagers, nvmlManagers...)
+	}
+
+	if isTegra {
+		tegraManagers, err := NewTegraResourceManagers(config)
+		if err != nil {
+			return nil, fmt.Errorf("failed to construct Tegra resource managers: %v", err)
+		}
+		resourceManagers = append(resourceManagers, tegraManagers...)
+	}
+
+	return resourceManagers, nil
+}
+
+// Resource gets the resource name associated with the ResourceManager
+func (r *resourceManager) Resource() spec.ResourceName {
+	return r.resource
+}
+
+// Devices gets the devices managed by the ResourceManager
+func (r *resourceManager) Devices() Devices {
+	return r.devices
+}
+
+// AddDefaultResourcesToConfig adds default resource matching rules to config.Resources
+func AddDefaultResourcesToConfig(config *util.DeviceConfig) error {
+	//config.Resources.AddGPUResource("*", "gpu")
+	config.Resources.GPUs = append(config.Resources.GPUs, spec.Resource{
+		Pattern: "*",
+		Name:    spec.ResourceName(*config.ResourceName),
+	})
+	// Use the file's structured logger instead of the leftover debug fmt.Println.
+	klog.Infof("default GPU resources: %v", config.Resources.GPUs)
+	switch *config.Flags.MigStrategy {
+	case spec.MigStrategySingle:
+		return config.Resources.AddMIGResource("*", "gpu")
+	case spec.MigStrategyMixed:
+		hasNVML, reason := info.New().HasNvml()
+		if !hasNVML {
+			klog.Warningf("mig-strategy=%q is only supported with NVML", spec.MigStrategyMixed)
+			klog.Warningf("NVML not detected: %v", reason)
+			return nil
+		}
+
+		nvmllib := 
nvml.New() + ret := nvmllib.Init() + if ret != nvml.SUCCESS { + if *config.Flags.FailOnInitError { + return fmt.Errorf("failed to initialize NVML: %v", ret) + } + return nil + } + defer func() { + ret := nvmllib.Shutdown() + if ret != nvml.SUCCESS { + klog.Errorf("Error shutting down NVML: %v", ret) + } + }() + + devicelib := device.New( + device.WithNvml(nvmllib), + ) + return devicelib.VisitMigProfiles(func(p device.MigProfile) error { + profileInfo := p.GetInfo() + if profileInfo.C != profileInfo.G { + return nil + } + resourceName := strings.ReplaceAll("mig-"+p.String(), "+", ".") + return config.Resources.AddMIGResource(p.String(), resourceName) + }) + } + return nil +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/tegra_devices.go b/pkg/device-plugin/nvidiadevice/nvinternal/rm/tegra_devices.go new file mode 100644 index 000000000..d7df377f7 --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/rm/tegra_devices.go @@ -0,0 +1,69 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package rm + +import ( + "fmt" + + "github.com/Project-HAMi/HAMi/pkg/util" +) + +const ( + tegraDeviceName = "tegra" +) + +// buildTegraDeviceMap creates a DeviceMap for the tegra devices in the sytesm. +// NOTE: At present only a single tegra device is expected. 
+func buildTegraDeviceMap(config *util.DeviceConfig) (DeviceMap, error) { + devices := make(DeviceMap) + + name := tegraDeviceName + i := 0 + for _, resource := range config.Resources.GPUs { + if resource.Pattern.Matches(name) { + index := fmt.Sprintf("%d", i) + err := devices.setEntry(resource.Name, index, &tegraDevice{}) + if err != nil { + return nil, err + } + i++ + } + + } + return devices, nil +} + +type tegraDevice struct{} + +var _ deviceInfo = (*tegraDevice)(nil) + +// GetUUID returns the UUID of the tegra device. +// TODO: This is currently hardcoded to `tegra` +func (d *tegraDevice) GetUUID() (string, error) { + return tegraDeviceName, nil +} + +// GetPaths returns the paths for a tegra device. +// A tegra device does not have paths associated with it. +func (d *tegraDevice) GetPaths() ([]string, error) { + return nil, nil +} + +// GetNumaNode always returns unsupported for a Tegra device +func (d *tegraDevice) GetNumaNode() (bool, int, error) { + return false, -1, nil +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/tegra_manager.go b/pkg/device-plugin/nvidiadevice/nvinternal/rm/tegra_manager.go new file mode 100644 index 000000000..053a5a22b --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/rm/tegra_manager.go @@ -0,0 +1,76 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package rm + +import ( + "fmt" + + "github.com/Project-HAMi/HAMi/pkg/util" +) + +type tegraResourceManager struct { + resourceManager +} + +var _ ResourceManager = (*tegraResourceManager)(nil) + +// NewTegraResourceManagers returns a set of ResourceManagers for tegra resources +func NewTegraResourceManagers(config *util.DeviceConfig) ([]ResourceManager, error) { + deviceMap, err := buildTegraDeviceMap(config) + if err != nil { + return nil, fmt.Errorf("error building Tegra device map: %v", err) + } + + deviceMap, err = updateDeviceMapWithReplicas(config, deviceMap) + if err != nil { + return nil, fmt.Errorf("error updating device map with replicas from config.sharing.timeSlicing.resources: %v", err) + } + + var rms []ResourceManager + for resourceName, devices := range deviceMap { + if len(devices) == 0 { + continue + } + r := &tegraResourceManager{ + resourceManager: resourceManager{ + config: config, + resource: resourceName, + devices: devices, + }, + } + if len(devices) != 0 { + rms = append(rms, r) + } + } + + return rms, nil +} + +// GetPreferredAllocation returns a standard allocation for the Tegra resource manager. +func (r *tegraResourceManager) GetPreferredAllocation(available, required []string, size int) ([]string, error) { + return r.distributedAlloc(available, required, size) +} + +// GetDevicePaths returns an empty slice for the tegraResourceManager +func (r *tegraResourceManager) GetDevicePaths(ids []string) []string { + return nil +} + +// CheckHealth is disabled for the tegraResourceManager +func (r *tegraResourceManager) CheckHealth(stop <-chan interface{}, unhealthy chan<- *Device) error { + return nil +} diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/wsl_devices.go b/pkg/device-plugin/nvidiadevice/nvinternal/rm/wsl_devices.go new file mode 100644 index 000000000..e3696163c --- /dev/null +++ b/pkg/device-plugin/nvidiadevice/nvinternal/rm/wsl_devices.go @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package rm
+
+// wslDevice wraps an nvmlDevice for Windows Subsystem for Linux, where all
+// GPU access is provided through the single /dev/dxg device node.
+type wslDevice nvmlDevice
+
+var _ deviceInfo = (*wslDevice)(nil)
+
+// GetUUID returns the UUID of the device
+func (d wslDevice) GetUUID() (string, error) {
+	return nvmlDevice(d).GetUUID()
+}
+
+// GetPaths returns the device node paths for a WSL device.
+func (d wslDevice) GetPaths() ([]string, error) {
+	return []string{"/dev/dxg"}, nil
+}
+
+// GetNumaNode returns the NUMA node associated with the GPU device
+func (d wslDevice) GetNumaNode() (bool, int, error) {
+	return nvmlDevice(d).GetNumaNode()
+}
diff --git a/pkg/device-plugin/plugin.go b/pkg/device-plugin/plugin.go
deleted file mode 100644
index 9378bee07..000000000
--- a/pkg/device-plugin/plugin.go
+++ /dev/null
@@ -1,433 +0,0 @@
-/*
- * Copyright © 2021 peizhaoyou
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. 
- */ - -package device_plugin - -import ( - "4pd.io/k8s-vgpu/pkg/api" - "4pd.io/k8s-vgpu/pkg/device-plugin/config" - "fmt" - "k8s.io/apimachinery/pkg/util/uuid" - "log" - "net" - "os" - "path" - "strconv" - "time" - - "github.com/NVIDIA/go-gpuallocator/gpuallocator" - "golang.org/x/net/context" - "google.golang.org/grpc" - pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" -) - -// Constants to represent the various device list strategies -const ( - DeviceListStrategyEnvvar = "envvar" - DeviceListStrategyVolumeMounts = "volume-mounts" -) - -// Constants to represent the various device id strategies -const ( - DeviceIDStrategyUUID = "uuid" - DeviceIDStrategyIndex = "index" -) - -// Constants for use by the 'volume-mounts' device list strategy -const ( - deviceListAsVolumeMountsHostPath = "/dev/null" - deviceListAsVolumeMountsContainerPathRoot = "/var/run/nvidia-container-devices" -) - -// NvidiaDevicePlugin implements the Kubernetes device plugin API -type NvidiaDevicePlugin struct { - //ResourceManager - //resourceManager *ResourceManager - deviceCache *DeviceCache - resourceName string - //deviceListEnvvar string - allocatePolicy gpuallocator.Policy - socket string - - server *grpc.Server - //cachedDevices []*Device - health chan *Device - stop chan interface{} - //changed chan struct{} - //devRegister *DeviceRegister - //podManager *PodManager -} - -// NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin -func NewNvidiaDevicePlugin(resourceName string, deviceCache *DeviceCache, allocatePolicy gpuallocator.Policy, socket string) *NvidiaDevicePlugin { - return &NvidiaDevicePlugin{ - deviceCache: deviceCache, - resourceName: resourceName, - allocatePolicy: allocatePolicy, - socket: socket, - - // These will be reinitialized every - // time the plugin server is restarted. - server: nil, - health: nil, - stop: nil, - } -} - -func (m *NvidiaDevicePlugin) initialize() { - var err error - m.server = grpc.NewServer([]grpc.ServerOption{}...) 
- m.health = make(chan *Device) - m.stop = make(chan interface{}) - check(err) -} - -func (m *NvidiaDevicePlugin) cleanup() { - close(m.stop) - m.server = nil - m.health = nil - m.stop = nil - //m.podManager = nil -} - -// Start starts the gRPC server, registers the device plugin with the Kubelet, -// and starts the device healthchecks. -func (m *NvidiaDevicePlugin) Start() error { - m.initialize() - - err := m.Serve() - if err != nil { - log.Printf("Could not start device plugin for '%s': %s", m.resourceName, err) - m.cleanup() - return err - } - log.Printf("Starting to serve '%s' on %s", m.resourceName, m.socket) - - err = m.Register() - if err != nil { - log.Printf("Could not register device plugin: %s", err) - m.Stop() - return err - } - log.Printf("Registered device plugin for '%s' with Kubelet", m.resourceName) - - m.deviceCache.AddNotifyChannel("plugin", m.health) - return nil -} - -// Stop stops the gRPC server. -func (m *NvidiaDevicePlugin) Stop() error { - if m == nil || m.server == nil { - return nil - } - log.Printf("Stopping to serve '%s' on %s", m.resourceName, m.socket) - m.deviceCache.RemoveNotifyChannel("plugin") - m.server.Stop() - if err := os.Remove(m.socket); err != nil && !os.IsNotExist(err) { - return err - } - m.cleanup() - return nil -} - -// Serve starts the gRPC server of the device plugin. -func (m *NvidiaDevicePlugin) Serve() error { - os.Remove(m.socket) - sock, err := net.Listen("unix", m.socket) - if err != nil { - return err - } - - pluginapi.RegisterDevicePluginServer(m.server, m) - - go func() { - lastCrashTime := time.Now() - restartCount := 0 - for { - log.Printf("Starting GRPC server for '%s'", m.resourceName) - err := m.server.Serve(sock) - if err == nil { - break - } - - log.Printf("GRPC server for '%s' crashed with error: %v", m.resourceName, err) - - // restart if it has not been too often - // i.e. 
if server has crashed more than 5 times and it didn't last more than one hour each time - if restartCount > 5 { - // quit - log.Fatalf("GRPC server for '%s' has repeatedly crashed recently. Quitting", m.resourceName) - } - timeSinceLastCrash := time.Since(lastCrashTime).Seconds() - lastCrashTime = time.Now() - if timeSinceLastCrash > 3600 { - // it has been one hour since the last crash.. reset the count - // to reflect on the frequency - restartCount = 1 - } else { - restartCount++ - } - } - }() - - // Wait for server to start by launching a blocking connexion - conn, err := m.dial(m.socket, 5*time.Second) - if err != nil { - return err - } - conn.Close() - - return nil -} - -// Register registers the device plugin for the given resourceName with Kubelet. -func (m *NvidiaDevicePlugin) Register() error { - conn, err := m.dial(pluginapi.KubeletSocket, 5*time.Second) - if err != nil { - return err - } - defer conn.Close() - - client := pluginapi.NewRegistrationClient(conn) - reqt := &pluginapi.RegisterRequest{ - Version: pluginapi.Version, - Endpoint: path.Base(m.socket), - ResourceName: m.resourceName, - Options: &pluginapi.DevicePluginOptions{ - GetPreferredAllocationAvailable: false, - }, - } - - _, err = client.Register(context.Background(), reqt) - if err != nil { - return err - } - return nil -} - -// GetDevicePluginOptions returns the values of the optional settings for this plugin -func (m *NvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) { - options := &pluginapi.DevicePluginOptions{ - GetPreferredAllocationAvailable: false, - } - return options, nil -} - -// ListAndWatch lists devices and update that list according to the health status -func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error { - _ = s.Send(&pluginapi.ListAndWatchResponse{Devices: m.apiDevices()}) - for { - select { - case <-m.stop: - return nil - case d := 
<-m.health: - // FIXME: there is no way to recover from the Unhealthy state. - //d.Health = pluginapi.Unhealthy - log.Printf("'%s' device marked unhealthy: %s", m.resourceName, d.ID) - _ = s.Send(&pluginapi.ListAndWatchResponse{Devices: m.apiDevices()}) - } - } -} - -// GetPreferredAllocation returns the preferred allocation from the set of devices specified in the request -func (m *NvidiaDevicePlugin) GetPreferredAllocation(ctx context.Context, r *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) { - - return &pluginapi.PreferredAllocationResponse{}, nil -} - -// Allocate which return list of devices. -func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) { - ////reqNums := make([]int, 0, len(reqs.ContainerRequests)) - ////for _, req := range reqs.ContainerRequests { - //// reqNums = append(reqNums, len(req.DevicesIDs)) - ////} - ////klog.V(3).Infof("allocate for device %v", reqNums) - // - ////devRequests, err := m.podManager.getDevices(reqNums) - //if err != nil { - // klog.Errorf("get device request error, %v", err) - // return nil, err - //} - //if devRequests == nil { - // err = fmt.Errorf("get device request empty") - // klog.Errorf("%v", err) - // return nil, err - //} - responses := pluginapi.AllocateResponse{} - for _, _ = range reqs.ContainerRequests { - //reqDeviceIDs := req.DevicesIDs - //devs, err := m.getDevices(reqDeviceIDs) - //if err != nil { - // return nil, err - //} - - response := pluginapi.ContainerAllocateResponse{} - - //response.Envs = m.apiEnvs(m.deviceListEnvvar, reqDeviceIDs) - ////var mapEnvs []string - //for i, dev := range devs { - // limitKey := fmt.Sprintf("CUDA_DEVICE_MEMORY_LIMIT_%v", i) - // response.Envs[limitKey] = fmt.Sprintf("%vm", config.DeviceMemoryScaling*float64(dev.Memory)/float64(config.DeviceSplitCount)) - // //mapEnvs = append(mapEnvs, fmt.Sprintf("%v:%v", i, vd.dev.ID)) - //} - response.Envs = 
make(map[string]string) - //response.Annotations = map[string]string{util.AssignedIDsAnnotations: util.EncodeContainerDevices(reqDeviceIDs)} - response.Envs["CUDA_DEVICE_SM_LIMIT"] = strconv.Itoa(int(100 * config.DeviceCoresScaling / float64(config.DeviceSplitCount))) - //response.Envs["NVIDIA_DEVICE_MAP"] = strings.Join(mapEnvs, " ") - response.Envs["CUDA_DEVICE_MEMORY_SHARED_CACHE"] = fmt.Sprintf("/tmp/%v.cache", uuid.NewUUID()) - if config.DeviceMemoryScaling > 1 { - response.Envs["CUDA_OVERSUBSCRIBE"] = "true" - } - response.Envs[api.PluginRuntimeSocket] = fmt.Sprintf("unix://%v", config.RuntimeSocketFlag) - response.Mounts = append(response.Mounts, - &pluginapi.Mount{ContainerPath: "/usr/local/vgpu/libvgpu.so", - HostPath: "/usr/local/vgpu/libvgpu.so", - ReadOnly: true}, - &pluginapi.Mount{ContainerPath: "/etc/ld.so.preload", - HostPath: "/usr/local/vgpu/ld.so.preload", - ReadOnly: true}, - ) - responses.ContainerResponses = append(responses.ContainerResponses, &response) - } - return &responses, nil -} - -// PreStartContainer is unimplemented for this plugin -func (m *NvidiaDevicePlugin) PreStartContainer(context.Context, *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) { - return &pluginapi.PreStartContainerResponse{}, nil -} - -// dial establishes the gRPC communication with the registered device plugin. 
-func (m *NvidiaDevicePlugin) dial(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) { - c, err := grpc.Dial(unixSocketPath, grpc.WithInsecure(), grpc.WithBlock(), - grpc.WithTimeout(timeout), - grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) { - return net.DialTimeout("unix", addr, timeout) - }), - ) - - if err != nil { - return nil, err - } - - return c, nil -} - -func (m *NvidiaDevicePlugin) Devices() []*Device { - return m.deviceCache.GetCache() -} - -//func (m *NvidiaDevicePlugin) deviceExists(id string) bool { -// for _, d := range m.deviceCache.GetCache() { -// if d.ID == id { -// return true -// } -// } -// return false -//} -// -//func (m *NvidiaDevicePlugin) getDevices(ids []string) ([]*Device, error) { -// var res []*Device -// for _, id := range ids { -// found := false -// for _, dev := range m.deviceCache.GetCache() { -// if id == dev.ID { -// res = append(res, dev) -// found = true -// break -// } -// } -// if !found { -// return res, fmt.Errorf("device %v not found", id) -// } -// } -// return res, nil -//} - -func (m *NvidiaDevicePlugin) apiDevices() []*pluginapi.Device { - devices := m.Devices() - var res []*pluginapi.Device - for _, dev := range devices { - for i := uint(0); i < config.DeviceSplitCount; i++ { - id := fmt.Sprintf("%v-%v", dev.ID, i) - res = append(res, &pluginapi.Device{ - ID: id, - Health: dev.Health, - Topology: nil, - }) - } - } - return res -} - -//func (m *NvidiaDevicePlugin) apiEnvs(envvar string, deviceIDs []string) map[string]string { -// return map[string]string{ -// envvar: strings.Join(deviceIDs, ","), -// } -//} -// -//func (m *NvidiaDevicePlugin) apiMounts(deviceIDs []string) []*pluginapi.Mount { -// var mounts []*pluginapi.Mount -// -// for _, id := range deviceIDs { -// mount := &pluginapi.Mount{ -// HostPath: deviceListAsVolumeMountsHostPath, -// ContainerPath: filepath.Join(deviceListAsVolumeMountsContainerPathRoot, id), -// } -// mounts = append(mounts, mount) -// 
} -// -// return mounts -//} - -//func (m *NvidiaDevicePlugin) apiDeviceSpecs(driverRoot string, uuids []string) []*pluginapi.DeviceSpec { -// var specs []*pluginapi.DeviceSpec -// -// paths := []string{ -// "/dev/nvidiactl", -// "/dev/nvidia-uvm", -// "/dev/nvidia-uvm-tools", -// "/dev/nvidia-modeset", -// } -// -// for _, p := range paths { -// if _, err := os.Stat(p); err == nil { -// spec := &pluginapi.DeviceSpec{ -// ContainerPath: p, -// HostPath: filepath.Join(driverRoot, p), -// Permissions: "rw", -// } -// specs = append(specs, spec) -// } -// } -// -// for _, d := range m.deviceCache.GetCache() { -// for _, id := range uuids { -// if d.ID == id { -// for _, p := range d.Paths { -// spec := &pluginapi.DeviceSpec{ -// ContainerPath: p, -// HostPath: filepath.Join(driverRoot, p), -// Permissions: "rw", -// } -// specs = append(specs, spec) -// } -// } -// } -// } -// -// return specs -//} diff --git a/pkg/device-plugin/register.go b/pkg/device-plugin/register.go deleted file mode 100644 index 15202b5a8..000000000 --- a/pkg/device-plugin/register.go +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package device_plugin - -import ( - "context" - "fmt" - "k8s.io/klog/v2" - "time" - - "4pd.io/k8s-vgpu/pkg/api" - "4pd.io/k8s-vgpu/pkg/device-plugin/config" - "google.golang.org/grpc" -) - -type DevListFunc func() []*Device - -type DeviceRegister struct { - deviceCache *DeviceCache - unhealthy chan *Device - stopCh chan struct{} -} - -func NewDeviceRegister(deviceCache *DeviceCache) *DeviceRegister { - return &DeviceRegister{ - deviceCache: deviceCache, - unhealthy: make(chan *Device), - stopCh: make(chan struct{}), - } -} - -func (r *DeviceRegister) Start() { - r.deviceCache.AddNotifyChannel("register", r.unhealthy) - go r.WatchAndRegister() -} - -func (r *DeviceRegister) Stop() { - close(r.stopCh) -} - -func (r *DeviceRegister) apiDevices() *[]*api.DeviceInfo { - devs := r.deviceCache.GetCache() - res := make([]*api.DeviceInfo, 0, len(devs)) - for _, dev := range devs { - res = append(res, &api.DeviceInfo{ - Id: dev.ID, - Count: int32(config.DeviceSplitCount), - Health: dev.Health == "healthy", - }) - } - return &res -} - -func (r *DeviceRegister) Register(ctx context.Context) error { - conn, err := grpc.DialContext( - ctx, - config.SchedulerEndpoint, - grpc.WithInsecure(), - grpc.WithBlock(), - //grpc.WithConnectParams(grpc.ConnectParams{MinConnectTimeout: 3}), - ) - if err != nil { - return fmt.Errorf("connect scheduler error, %v", err) - } - client := api.NewDeviceServiceClient(conn) - register, err := client.Register(ctx) - if err != nil { - klog.Errorf("register error %v", err) - err = fmt.Errorf("client register error, %v", err) - return err - } - req := api.RegisterRequest{Node: config.NodeName, Devices: *r.apiDevices()} - err = register.Send(&req) - if err != nil { - klog.Errorf("register send error, %v", err) - return err - } - klog.V(3).Infof("register info %v", req.String()) - closeCh := make(chan struct{}) - go func() { - reply := api.RegisterReply{} - err := register.RecvMsg(reply) - if err != nil { - klog.Errorf("register recv error, %v", 
err) - } else { - klog.Errorf("register recv closed") - } - closeCh <- struct{}{} - }() - for { - select { - case <-r.unhealthy: - err = register.Send(&api.RegisterRequest{ - Node: config.NodeName, - Devices: *r.apiDevices(), - }) - if err != nil { - klog.Errorf("register send error, %v", err) - return err - } - klog.V(3).Infof("register info %v", req.String()) - case <-closeCh: - klog.Infof("register server closed") - return fmt.Errorf("register server closed") - case <-r.stopCh: - return nil - } - } -} - -func (r *DeviceRegister) WatchAndRegister() { - //ctx, cancel := context.WithTimeout(context.Background(), time.Second*30) - //defer cancel() - ctx := context.Background() - for { - err := r.Register(ctx) - if err != nil { - klog.Errorf("register error, %v", err) - time.Sleep(time.Second * 5) - } else { - klog.Infof("register stopped") - break - } - } -} diff --git a/pkg/device-plugin/runtime.go b/pkg/device-plugin/runtime.go deleted file mode 100644 index df979f3ce..000000000 --- a/pkg/device-plugin/runtime.go +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package device_plugin - -import ( - "context" - "fmt" - "strings" - - "4pd.io/k8s-vgpu/pkg/api" - "4pd.io/k8s-vgpu/pkg/device-plugin/config" - "google.golang.org/grpc" -) - -type VGPURuntimeService struct { - deviceCache *DeviceCache -} - -func NewVGPURuntimeService(deviceCache *DeviceCache) *VGPURuntimeService { - return &VGPURuntimeService{deviceCache: deviceCache} -} - -func (s *VGPURuntimeService) GetDevice(ctx context.Context, req *api.GetDeviceRequest) (*api.GetDeviceReply, error) { - conn, err := grpc.DialContext( - ctx, - config.SchedulerEndpoint, - grpc.WithInsecure(), - grpc.WithBlock(), - //grpc.WithConnectParams(grpc.ConnectParams{MinConnectTimeout: 3}), - ) - if err != nil { - return nil, fmt.Errorf("connect scheduler error, %v", err) - } - client := api.NewDeviceServiceClient(conn) - sReq := api.GetContainerRequest{Uuid: req.CtrUUID} - sResp, err := client.GetContainer(ctx, &sReq) - if err != nil { - return nil, err - } - envs, err := s.containerEnvs(sResp.DevList) - if err != nil { - return nil, err - } - resp := api.GetDeviceReply{ - Envs: envs, - PodUID: sResp.PodUID, - CtrName: sResp.CtrName, - PodNamespace: sResp.PodNamespace, - PodName: sResp.PodName, - } - return &resp, nil -} - -func (s *VGPURuntimeService) containerEnvs(devIDs []string) (map[string]string, error) { - envs := make(map[string]string) - var devs []*Device - for _, id := range devIDs { - found := false - for _, d := range s.deviceCache.GetCache() { - if id == d.ID { - found = true - devs = append(devs, d) - break - } - } - if !found { - return nil, fmt.Errorf("device %v not found", id) - } - } - - envs["NVIDIA_VISIBLE_DEVICES"] = strings.Join(devIDs, ",") - for i, d := range devs { - limitKey := fmt.Sprintf("CUDA_DEVICE_MEMORY_LIMIT_%v", i) - envs[limitKey] = fmt.Sprintf("%vm", config.DeviceMemoryScaling*float64(d.Memory)/float64(config.DeviceSplitCount)) - } - return envs, nil -} diff --git a/pkg/device/ascend/ascend310p.go b/pkg/device/ascend/ascend310p.go new file mode 
100644 index 000000000..2c4a0253f --- /dev/null +++ b/pkg/device/ascend/ascend310p.go @@ -0,0 +1,248 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package ascend + +import ( + "errors" + "flag" + "fmt" + "strconv" + "strings" + "time" + + "github.com/Project-HAMi/HAMi/pkg/api" + "github.com/Project-HAMi/HAMi/pkg/util" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/klog/v2" +) + +type Ascend310P struct { +} + +const ( + Ascend310PName = "Ascend310P" + Ascend310PSelection = "huawei.com/predicate-ascend310p-idx-" + Ascend310PUseUUID = "huawei.com/use-ascend310p-uuid" + Ascend310PNoUseUUID = "huawei.com/no-use-ascend310p-uuid" + Ascend310PMaxMemory = 21 * 1024 // Just for the sake of being able to split, if it exceeds 12G, the whole card will be used. 
+ Ascend310PMemoryCapacity = 24 * 1024 +) + +var ( + Ascend310PResourceCount string + Ascend310PResourceMemory string + Ascend310PResourceCores string +) + +type virTemplate struct { + name string + aiCore int + aiCPU int + memory int64 +} + +var virAscend310PTemplates = []virTemplate{ + {"vir01", 1, 1, 3 * 1024}, + {"vir02", 2, 2, 6 * 1024}, + {"vir04", 4, 4, 12 * 1024}, +} + +func trimAscend310PMemory(m int64) (int64, string) { + for i := 0; i < len(virAscend310PTemplates); i++ { + if m <= virAscend310PTemplates[i].memory { + return virAscend310PTemplates[i].memory, virAscend310PTemplates[i].name + } + } + if m <= Ascend310PMemoryCapacity { + // use the whole card + return Ascend310PMaxMemory, "" + } + return 0, "" +} + +func InitAscend310P() *Ascend310P { + util.InRequestDevices[Ascend310PName] = "hami.io/ascend310p-devices-to-allocate" + util.SupportDevices[Ascend310PName] = "hami.io/ascend310p-devices-allocated" + return &Ascend310P{} +} + +func (dev *Ascend310P) ParseConfig(fs *flag.FlagSet) { + fs.StringVar(&Ascend310PResourceCount, "ascend310p-name", "huawei.com/Ascend310P", "Ascend310P resource count") + fs.StringVar(&Ascend310PResourceMemory, "ascend310p-memory", "huawei.com/Ascend310P-memory", "Ascend310P memory resource") +} + +func (dev *Ascend310P) MutateAdmission(ctr *corev1.Container) (bool, error) { + count, ok := ctr.Resources.Limits[corev1.ResourceName(Ascend310PResourceCount)] + if !ok { + return false, nil + } + trimMem := int64(Ascend310PMaxMemory) + memory, ok := ctr.Resources.Limits[corev1.ResourceName(Ascend310PResourceMemory)] + if ok { + trimMem, _ = trimAscend310PMemory(memory.Value()) + if trimMem <= 0 { + return false, fmt.Errorf("ascend310p memory %d is invalid", memory.Value()) + } + } + if count.Value() > 1 { + if trimMem != int64(Ascend310PMaxMemory) { + return true, errors.New("vNPU nor supported for multiple devices") + } + } + ctr.Resources.Limits[corev1.ResourceName(Ascend310PResourceMemory)] = 
resource.MustParse(fmt.Sprint(trimMem)) + ctr.Resources.Requests[corev1.ResourceName(Ascend310PResourceMemory)] = resource.MustParse(fmt.Sprint(trimMem)) + return true, nil +} + +func (dev *Ascend310P) GetNodeDevices(n corev1.Node) ([]*api.DeviceInfo, error) { + nodedevices := []*api.DeviceInfo{} + i := 0 + cards, _ := n.Status.Capacity.Name(corev1.ResourceName(Ascend310PResourceCount), resource.DecimalSI).AsInt64() + for int64(i)*10 < cards { + nodedevices = append(nodedevices, &api.DeviceInfo{ + Index: i, + Id: n.Name + "-Ascend310P-" + fmt.Sprint(i), + Count: 100, + Devmem: Ascend310PMaxMemory, + Devcore: 100, + Type: Ascend310PName, + Numa: 0, + Health: true, + }) + i++ + } + return nodedevices, nil +} + +func (dev *Ascend310P) PatchAnnotations(annoinput *map[string]string, pd util.PodDevices) map[string]string { + devlist, ok := pd[Ascend310PName] + if ok && len(devlist) > 0 { + (*annoinput)[util.InRequestDevices[Ascend310PName]] = util.EncodePodSingleDevice(devlist) + (*annoinput)[util.SupportDevices[Ascend310PName]] = util.EncodePodSingleDevice(devlist) + (*annoinput)["predicate-time"] = strconv.FormatInt(time.Now().Unix(), 10) + allocateStr := "huawei.com/Ascend310P" + for _, dp := range devlist { + value := "" + for _, val := range dp { + value = value + "Ascend310P-" + _, temp := trimAscend310PMemory(int64(val.Usedmem)) + value = value + temp + "-" + value = value + fmt.Sprint(val.Idx) + "," + } + if len(value) > 0 { + (*annoinput)[allocateStr] = strings.TrimRight(value, ",") + } + } + } + return *annoinput +} + +func (dev *Ascend310P) LockNode(n *corev1.Node, p *corev1.Pod) error { + return nil +} + +func (dev *Ascend310P) ReleaseNodeLock(n *corev1.Node, p *corev1.Pod) error { + return nil +} + +func (dev *Ascend310P) NodeCleanUp(nn string) error { + return nil +} + +func (dev *Ascend310P) CheckType(annos map[string]string, d util.DeviceUsage, n util.ContainerDeviceRequest) (bool, bool, bool) { + if strings.Compare(n.Type, Ascend310PName) == 0 { + return 
true, true, false + } + return false, false, false +} + +func (dev *Ascend310P) CheckUUID(annos map[string]string, d util.DeviceUsage) bool { + userUUID, ok := annos[Ascend310PUseUUID] + if ok { + klog.V(5).Infof("check uuid for Iluvatar user uuid [%s], device id is %s", userUUID, d.ID) + // use , symbol to connect multiple uuid + userUUIDs := strings.Split(userUUID, ",") + for _, uuid := range userUUIDs { + if d.ID == uuid { + return true + } + } + return false + } + + noUserUUID, ok := annos[Ascend310PNoUseUUID] + if ok { + klog.V(5).Infof("check uuid for Iluvatar not user uuid [%s], device id is %s", noUserUUID, d.ID) + // use , symbol to connect multiple uuid + noUserUUIDs := strings.Split(noUserUUID, ",") + for _, uuid := range noUserUUIDs { + if d.ID == uuid { + return false + } + } + return true + } + return true +} + +func (dev *Ascend310P) CheckHealth(devType string, n *corev1.Node) (bool, bool) { + return true, true +} + +func (dev *Ascend310P) GenerateResourceRequests(ctr *corev1.Container) util.ContainerDeviceRequest { + klog.Info("Counting Ascend310P devices") + ascendResourceCount := corev1.ResourceName(Ascend310PResourceCount) + ascendResourceMem := corev1.ResourceName(Ascend310PResourceMemory) + v, ok := ctr.Resources.Limits[ascendResourceCount] + if !ok { + v, ok = ctr.Resources.Requests[ascendResourceCount] + } + if ok { + if n, ok := v.AsInt64(); ok { + klog.Info("Found Ascend310P devices") + memnum := 0 + mem, ok := ctr.Resources.Limits[ascendResourceMem] + if !ok { + mem, ok = ctr.Resources.Requests[ascendResourceMem] + } + if ok { + memnums, ok := mem.AsInt64() + if ok { + m, _ := trimAscend310PMemory(memnums) + memnum = int(m) + } + } + corenum := int32(0) + + mempnum := 0 + if memnum == 0 { + mempnum = 100 + } + + return util.ContainerDeviceRequest{ + Nums: int32(n), + Type: Ascend310PName, + Memreq: int32(memnum), + MemPercentagereq: int32(mempnum), + Coresreq: corenum, + } + } + } + return util.ContainerDeviceRequest{} +} diff --git 
a/pkg/device/ascend/device.go b/pkg/device/ascend/device.go new file mode 100644 index 000000000..9671b2a5c --- /dev/null +++ b/pkg/device/ascend/device.go @@ -0,0 +1,233 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package ascend + +import ( + "errors" + "flag" + "fmt" + "strconv" + "strings" + "time" + + "github.com/Project-HAMi/HAMi/pkg/api" + "github.com/Project-HAMi/HAMi/pkg/util" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/klog/v2" +) + +type AscendDevices struct { +} + +const ( + AscendDevice = "Ascend" + AscendDeviceSelection = "huawei.com/predicate-ascend-idx-" + // IluvatarUseUUID is user can use specify Iluvatar device for set Iluvatar UUID. + AscendDeviceUseUUID = "huawei.com/use-ascenduuid" + // IluvatarNoUseUUID is user can not use specify Iluvatar device for set Iluvatar UUID. 
+ AscendNoUseUUID = "huawei.com/nouse-ascenduuid" +) + +var ( + AscendResourceCount string + AscendResourceMemory string + AscendResourceCores string +) + +func InitDevice() *AscendDevices { + util.InRequestDevices[AscendDevice] = "hami.io/ascend-devices-to-allocate" + util.SupportDevices[AscendDevice] = "hami.io/ascend-devices-allocated" + return &AscendDevices{} +} + +func (dev *AscendDevices) ParseConfig(fs *flag.FlagSet) { + fs.StringVar(&AscendResourceCount, "ascend-name", "huawei.com/Ascend910", "iluvatar resource count") + fs.StringVar(&AscendResourceMemory, "ascend-memory", "huawei.com/Ascend910-memory", "iluvatar memory resource") +} + +func (dev *AscendDevices) MutateAdmission(ctr *corev1.Container) (bool, error) { + count, ok := ctr.Resources.Limits[corev1.ResourceName(AscendResourceCount)] + if ok { + if count.Value() > 1 { + memory, ok := ctr.Resources.Limits[corev1.ResourceName(AscendResourceMemory)] + if ok && memory.Value() != 65536 { + return true, errors.New("vNPU nor supported for multiple devices") + } + return true, nil + } + if count.Value() == 1 { + memory, ok := ctr.Resources.Limits[corev1.ResourceName(AscendResourceMemory)] + if ok { + ctr.Resources.Limits[corev1.ResourceName(AscendResourceMemory)] = resource.MustParse(fmt.Sprint(trimMemory(memory.Value()))) + ctr.Resources.Requests[corev1.ResourceName(AscendResourceMemory)] = resource.MustParse(fmt.Sprint(trimMemory(memory.Value()))) + } + return true, nil + } + } + return false, nil +} + +func (dev *AscendDevices) GetNodeDevices(n corev1.Node) ([]*api.DeviceInfo, error) { + nodedevices := []*api.DeviceInfo{} + i := 0 + cards, _ := n.Status.Capacity.Name(corev1.ResourceName(AscendResourceCount), resource.DecimalSI).AsInt64() + for int64(i)*10 < cards { + nodedevices = append(nodedevices, &api.DeviceInfo{ + Index: i, + Id: n.Name + "-Ascend910-" + fmt.Sprint(i), + Count: 100, + Devmem: int32(65536), + Devcore: 100, + Type: AscendDevice, + Numa: 0, + Health: true, + }) + i++ + } + return 
nodedevices, nil +} + +func (dev *AscendDevices) PatchAnnotations(annoinput *map[string]string, pd util.PodDevices) map[string]string { + devlist, ok := pd[AscendDevice] + if ok && len(devlist) > 0 { + (*annoinput)[util.InRequestDevices[AscendDevice]] = util.EncodePodSingleDevice(devlist) + (*annoinput)[util.SupportDevices[AscendDevice]] = util.EncodePodSingleDevice(devlist) + (*annoinput)["predicate-time"] = strconv.FormatInt(time.Now().Unix(), 10) + allocateStr := "huawei.com/Ascend910" + for _, dp := range devlist { + value := "" + for _, val := range dp { + value = value + "Ascend910-" + if val.Usedmem == 16384 { + value = value + "vir05_1c_16g-" + } else if val.Usedmem == 32768 { + value = value + "vir10_3c_32g-" + } + value = value + fmt.Sprint(val.Idx) + "," + } + if len(value) > 0 { + (*annoinput)[allocateStr] = strings.TrimRight(value, ",") + } + } + } + return *annoinput +} + +func (dev *AscendDevices) LockNode(n *corev1.Node, p *corev1.Pod) error { + return nil +} + +func (dev *AscendDevices) ReleaseNodeLock(n *corev1.Node, p *corev1.Pod) error { + return nil +} + +func (dev *AscendDevices) NodeCleanUp(nn string) error { + return nil +} + +func (dev *AscendDevices) CheckType(annos map[string]string, d util.DeviceUsage, n util.ContainerDeviceRequest) (bool, bool, bool) { + if strings.Compare(n.Type, AscendDevice) == 0 { + return true, true, false + } + return false, false, false +} + +func (dev *AscendDevices) CheckUUID(annos map[string]string, d util.DeviceUsage) bool { + userUUID, ok := annos[AscendDeviceUseUUID] + if ok { + klog.V(5).Infof("check uuid for Iluvatar user uuid [%s], device id is %s", userUUID, d.ID) + // use , symbol to connect multiple uuid + userUUIDs := strings.Split(userUUID, ",") + for _, uuid := range userUUIDs { + if d.ID == uuid { + return true + } + } + return false + } + + noUserUUID, ok := annos[AscendNoUseUUID] + if ok { + klog.V(5).Infof("check uuid for Iluvatar not user uuid [%s], device id is %s", noUserUUID, d.ID) + // use 
, symbol to connect multiple uuid + noUserUUIDs := strings.Split(noUserUUID, ",") + for _, uuid := range noUserUUIDs { + if d.ID == uuid { + return false + } + } + return true + } + return true +} + +func (dev *AscendDevices) CheckHealth(devType string, n *corev1.Node) (bool, bool) { + return true, true +} + +func trimMemory(i int64) int64 { + if i <= 16384 { + return 16384 + } + if i <= 32768 { + return 32768 + } + return 0 +} + +func (dev *AscendDevices) GenerateResourceRequests(ctr *corev1.Container) util.ContainerDeviceRequest { + klog.Info("Counting ascend 910B devices") + ascendResourceCount := corev1.ResourceName(AscendResourceCount) + ascendResourceMem := corev1.ResourceName(AscendResourceMemory) + v, ok := ctr.Resources.Limits[ascendResourceCount] + if !ok { + v, ok = ctr.Resources.Requests[ascendResourceCount] + } + if ok { + if n, ok := v.AsInt64(); ok { + klog.Info("Found ascend 910B devices") + memnum := 0 + mem, ok := ctr.Resources.Limits[ascendResourceMem] + if !ok { + mem, ok = ctr.Resources.Requests[ascendResourceMem] + } + if ok { + memnums, ok := mem.AsInt64() + if ok { + memnum = int(trimMemory(memnums)) + } + } + corenum := int32(0) + + mempnum := 0 + if memnum == 0 { + mempnum = 100 + } + + return util.ContainerDeviceRequest{ + Nums: int32(n), + Type: AscendDevice, + Memreq: int32(memnum), + MemPercentagereq: int32(mempnum), + Coresreq: corenum, + } + } + } + return util.ContainerDeviceRequest{} +} diff --git a/pkg/device/cambricon/device.go b/pkg/device/cambricon/device.go new file mode 100644 index 000000000..14e9e157f --- /dev/null +++ b/pkg/device/cambricon/device.go @@ -0,0 +1,301 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cambricon + +import ( + "context" + "encoding/json" + "flag" + "fmt" + "math/rand" + "strings" + "time" + + "github.com/Project-HAMi/HAMi/pkg/api" + "github.com/Project-HAMi/HAMi/pkg/util" + "github.com/Project-HAMi/HAMi/pkg/util/client" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/klog/v2" +) + +const ( + CambriconMLUDevice = "MLU" + CambriconMLUCommonWord = "MLU" + MluMemSplitLimit = "CAMBRICON_SPLIT_MEMS" + MluMemSplitIndex = "CAMBRICON_SPLIT_VISIBLE_DEVICES" + MluMemSplitEnable = "CAMBRICON_SPLIT_ENABLE" + MLUInUse = "cambricon.com/use-mlutype" + MLUNoUse = "cambricon.com/nouse-mlutype" + // MLUUseUUID is user can use specify MLU device for set MLU UUID. + MLUUseUUID = "cambricon.com/use-gpuuuid" + // MLUNoUseUUID is user can not use specify MLU device for set MLU UUID. 
+	MLUNoUseUUID          = "cambricon.com/nouse-gpuuuid"
+	DsmluLockTime         = "cambricon.com/dsmlu.lock"
+	DsmluProfile          = "CAMBRICON_DSMLU_PROFILE"
+	DsmluResourceAssigned = "CAMBRICON_DSMLU_ASSIGHED"
+	retry                 = 5
+)
+
+var (
+	MLUResourceCount  string
+	MLUResourceMemory string
+	MLUResourceCores  string
+)
+
+type CambriconDevices struct {
+}
+
+func (dev *CambriconDevices) ParseConfig(fs *flag.FlagSet) {
+	fs.StringVar(&MLUResourceCount, "cambricon-mlu-name", "cambricon.com/mlu", "cambricon mlu resource count")
+	fs.StringVar(&MLUResourceMemory, "cambricon-mlu-memory", "cambricon.com/mlu.smlu.vmemory", "cambricon mlu memory resource")
+	fs.StringVar(&MLUResourceCores, "cambricon-mlu-cores", "cambricon.com/mlu.smlu.vcore", "cambricon mlu core resource")
+}
+
+func InitMLUDevice() *CambriconDevices {
+	util.InRequestDevices[CambriconMLUDevice] = "hami.io/cambricon-mlu-devices-to-allocate"
+	util.SupportDevices[CambriconMLUDevice] = "hami.io/cambricon-mlu-devices-allocated"
+	return &CambriconDevices{}
+}
+
+func (dev *CambriconDevices) setNodeLock(node *corev1.Node) error {
+	ctx := context.Background()
+	if _, ok := node.ObjectMeta.Annotations[DsmluLockTime]; ok {
+		return fmt.Errorf("node %s is locked", node.Name)
+	}
+
+	patchedAnnotation, err := json.Marshal(
+		map[string]interface{}{
+			"metadata": map[string]map[string]string{"annotations": {
+				DsmluLockTime: time.Now().Format(time.RFC3339),
+			}}})
+	if err != nil {
+		klog.ErrorS(err, "Failed to patch node annotation", "node", node.Name)
+		return fmt.Errorf("patch node annotation %v", err)
+	}
+
+	_, err = client.GetClient().CoreV1().Nodes().Patch(ctx, node.Name, types.StrategicMergePatchType, patchedAnnotation, metav1.PatchOptions{})
+	for i := 0; i < retry && err != nil; i++ {
+		klog.ErrorS(err, "Failed to patch node annotation", "node", node.Name, "retry", i)
+		time.Sleep(time.Duration(rand.Intn(i+1)) * 10 * time.Millisecond) // i+1: rand.Intn panics for n <= 0, which the first retry (i == 0) would hit
+		_, err = client.GetClient().CoreV1().Nodes().Patch(ctx, node.Name, types.StrategicMergePatchType, patchedAnnotation, metav1.PatchOptions{})
+	}
+	if err != nil {
+		return fmt.Errorf("setNodeLock exceeds retry count %d", retry)
+	}
+	klog.InfoS("Node lock set", "node", node.Name)
+	return nil
+}
+
+func (dev *CambriconDevices) LockNode(n *corev1.Node, p *corev1.Pod) error {
+	found := false
+	for _, val := range p.Spec.Containers {
+		if (dev.GenerateResourceRequests(&val).Nums) > 0 {
+			found = true
+			break
+		}
+	}
+	if !found {
+		return nil
+	}
+	if _, ok := n.ObjectMeta.Annotations[DsmluLockTime]; !ok {
+		return dev.setNodeLock(n)
+	}
+	lockTime, err := time.Parse(time.RFC3339, n.ObjectMeta.Annotations[DsmluLockTime])
+	if err != nil {
+		return err
+	}
+	if time.Since(lockTime) > time.Minute*2 {
+		klog.InfoS("Node lock expired", "node", n.Name, "lockTime", lockTime)
+		err = dev.ReleaseNodeLock(n, p)
+		if err != nil {
+			klog.ErrorS(err, "Failed to release node lock", "node", n.Name)
+			return err
+		}
+		return dev.setNodeLock(n)
+	}
+	return fmt.Errorf("node %s has been locked within 2 minutes", n.Name)
+}
+
+func (dev *CambriconDevices) ReleaseNodeLock(n *corev1.Node, p *corev1.Pod) error {
+	if n.Annotations == nil {
+		return nil
+	}
+	if _, ok := n.Annotations[DsmluLockTime]; !ok {
+		klog.InfoS("Node lock not set", "node", n.Name)
+		return nil
+	}
+
+	patchData := []byte(`[
+		{
+			"op": "remove",
+			"path": "/metadata/annotations/cambricon.com~1dsmlu.lock"
+		}
+	]`)
+
+	_, err := client.GetClient().CoreV1().Nodes().Patch(context.TODO(), n.Name, types.JSONPatchType, patchData, metav1.PatchOptions{})
+	for i := 0; i < retry && err != nil; i++ {
+		klog.ErrorS(err, "Failed to patch node annotation", "node", n.Name, "retry", i)
+		time.Sleep(time.Duration(rand.Intn(i+1)) * 10 * time.Millisecond) // i+1: rand.Intn panics for n <= 0, which the first retry (i == 0) would hit
+		_, err = client.GetClient().CoreV1().Nodes().Patch(context.TODO(), n.Name, types.JSONPatchType, patchData, metav1.PatchOptions{})
+	}
+	if err != nil {
+		return fmt.Errorf("releaseNodeLock exceeds retry count %d", retry)
+	}
+	klog.InfoS("Node lock released", "node", n.Name)
+	
return nil +} + +func (dev *CambriconDevices) NodeCleanUp(nn string) error { + return nil +} + +func (dev *CambriconDevices) CheckHealth(devType string, n *corev1.Node) (bool, bool) { + return true, true +} + +func (dev *CambriconDevices) GetNodeDevices(n corev1.Node) ([]*api.DeviceInfo, error) { + nodedevices := []*api.DeviceInfo{} + i := 0 + cards, _ := n.Status.Capacity.Name(corev1.ResourceName(MLUResourceCores), resource.DecimalSI).AsInt64() + memoryTotal, _ := n.Status.Capacity.Name(corev1.ResourceName(MLUResourceMemory), resource.DecimalSI).AsInt64() + for int64(i)*100 < cards { + nodedevices = append(nodedevices, &api.DeviceInfo{ + Index: i, + Id: n.Name + "-cambricon-mlu-" + fmt.Sprint(i), + Count: 100, + Devmem: int32(memoryTotal * 256 * 100 / cards), + Devcore: 100, + Type: CambriconMLUDevice, + Numa: 0, + Health: true, + }) + i++ + } + return nodedevices, nil +} + +func (dev *CambriconDevices) AssertNuma(annos map[string]string) bool { + return false +} + +func (dev *CambriconDevices) MutateAdmission(ctr *corev1.Container) (bool, error) { + _, ok := ctr.Resources.Limits[corev1.ResourceName(MLUResourceCount)] + return ok, nil +} + +func (dev *CambriconDevices) CheckType(annos map[string]string, d util.DeviceUsage, n util.ContainerDeviceRequest) (bool, bool, bool) { + if strings.Compare(n.Type, CambriconMLUDevice) == 0 { + return true, true, false + } + return false, false, false +} + +func (dev *CambriconDevices) CheckUUID(annos map[string]string, d util.DeviceUsage) bool { + userUUID, ok := annos[MLUUseUUID] + if ok { + klog.V(5).Infof("check uuid for mlu user uuid [%s], device id is %s", userUUID, d.ID) + // use , symbol to connect multiple uuid + userUUIDs := strings.Split(userUUID, ",") + for _, uuid := range userUUIDs { + if d.ID == uuid { + return true + } + } + return false + } + + noUserUUID, ok := annos[MLUNoUseUUID] + if ok { + klog.V(5).Infof("check uuid for mlu not user uuid [%s], device id is %s", noUserUUID, d.ID) + // use , symbol to 
connect multiple uuid
+		noUserUUIDs := strings.Split(noUserUUID, ",")
+		for _, uuid := range noUserUUIDs {
+			if d.ID == uuid {
+				return false
+			}
+		}
+		return true
+	}
+	return true
+}
+
+func (dev *CambriconDevices) GenerateResourceRequests(ctr *corev1.Container) util.ContainerDeviceRequest {
+	klog.Info("Counting mlu devices")
+	mluResourceCount := corev1.ResourceName(MLUResourceCount)
+	mluResourceMem := corev1.ResourceName(MLUResourceMemory)
+	mluResourceCores := corev1.ResourceName(MLUResourceCores)
+	v, ok := ctr.Resources.Limits[mluResourceCount]
+	if !ok {
+		v, ok = ctr.Resources.Requests[mluResourceCount]
+	}
+	if ok {
+		if n, ok := v.AsInt64(); ok {
+			klog.Info("Found cambricon devices") // was "Found iluvatar devices": copy-paste from the iluvatar plugin
+			memnum := 0
+			mem, ok := ctr.Resources.Limits[mluResourceMem]
+			if !ok {
+				mem, ok = ctr.Resources.Requests[mluResourceMem]
+			}
+			if ok {
+				memnums, ok := mem.AsInt64()
+				if ok {
+					memnum = int(memnums) * 256
+				}
+			}
+			corenum := int32(100)
+			core, ok := ctr.Resources.Limits[mluResourceCores]
+			if !ok {
+				core, ok = ctr.Resources.Requests[mluResourceCores]
+			}
+			if ok {
+				corenums, ok := core.AsInt64()
+				if ok {
+					corenum = int32(corenums)
+				}
+			}
+
+			mempnum := 0
+			if memnum == 0 {
+				mempnum = 100
+			}
+
+			return util.ContainerDeviceRequest{
+				Nums:             int32(n),
+				Type:             CambriconMLUDevice,
+				Memreq:           int32(memnum),
+				MemPercentagereq: int32(mempnum),
+				Coresreq:         corenum,
+			}
+		}
+	}
+	return util.ContainerDeviceRequest{
+		Nums: 0,
+	}
+}
+
+func (dev *CambriconDevices) PatchAnnotations(annoinput *map[string]string, pd util.PodDevices) map[string]string {
+	devlist, ok := pd[CambriconMLUDevice]
+	if ok && len(devlist) > 0 && len(devlist[0]) > 0 { // guard: devlist[0][0] below panics on an empty allocation list
+		(*annoinput)[DsmluResourceAssigned] = "false"
+		(*annoinput)[DsmluProfile] = fmt.Sprintf("%d_%d_%d", devlist[0][0].Idx, devlist[0][0].Usedcores, devlist[0][0].Usedmem/256)
+	}
+	return *annoinput
+}
diff --git a/pkg/device/devices.go b/pkg/device/devices.go
new file mode 100644
index 000000000..7d2359ee8
--- /dev/null
+++ b/pkg/device/devices.go
@@ -0,0 +1,136 @@
+/*
+Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package device + +import ( + "context" + "flag" + "os" + "strings" + + "github.com/Project-HAMi/HAMi/pkg/api" + "github.com/Project-HAMi/HAMi/pkg/device/ascend" + "github.com/Project-HAMi/HAMi/pkg/device/cambricon" + "github.com/Project-HAMi/HAMi/pkg/device/hygon" + "github.com/Project-HAMi/HAMi/pkg/device/iluvatar" + "github.com/Project-HAMi/HAMi/pkg/device/nvidia" + "github.com/Project-HAMi/HAMi/pkg/util" + "github.com/Project-HAMi/HAMi/pkg/util/client" + "github.com/Project-HAMi/HAMi/pkg/util/nodelock" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" +) + +type Devices interface { + MutateAdmission(ctr *corev1.Container) (bool, error) + CheckHealth(devType string, n *corev1.Node) (bool, bool) + NodeCleanUp(nn string) error + GetNodeDevices(n corev1.Node) ([]*api.DeviceInfo, error) + CheckType(annos map[string]string, d util.DeviceUsage, n util.ContainerDeviceRequest) (bool, bool, bool) + // CheckUUID is check current device id whether in GPUUseUUID or GPUNoUseUUID set, return true is check success. 
+ CheckUUID(annos map[string]string, d util.DeviceUsage) bool + LockNode(n *corev1.Node, p *corev1.Pod) error + ReleaseNodeLock(n *corev1.Node, p *corev1.Pod) error + GenerateResourceRequests(ctr *corev1.Container) util.ContainerDeviceRequest + PatchAnnotations(annoinput *map[string]string, pd util.PodDevices) map[string]string + ParseConfig(fs *flag.FlagSet) +} + +var ( + HandshakeAnnos = map[string]string{} + RegisterAnnos = map[string]string{} + DevicesToHandle []string +) + +var devices map[string]Devices +var DebugMode bool + +func GetDevices() map[string]Devices { + return devices +} + +func init() { + devices = make(map[string]Devices) + devices[cambricon.CambriconMLUDevice] = cambricon.InitMLUDevice() + devices[nvidia.NvidiaGPUDevice] = nvidia.InitNvidiaDevice() + devices[hygon.HygonDCUDevice] = hygon.InitDCUDevice() + devices[iluvatar.IluvatarGPUDevice] = iluvatar.InitIluvatarDevice() + devices[ascend.AscendDevice] = ascend.InitDevice() + devices[ascend.Ascend310PName] = ascend.InitAscend310P() + DevicesToHandle = []string{} + DevicesToHandle = append(DevicesToHandle, nvidia.NvidiaGPUCommonWord) + DevicesToHandle = append(DevicesToHandle, cambricon.CambriconMLUCommonWord) + DevicesToHandle = append(DevicesToHandle, hygon.HygonDCUCommonWord) + DevicesToHandle = append(DevicesToHandle, iluvatar.IluvatarGPUCommonWord) + DevicesToHandle = append(DevicesToHandle, ascend.AscendDevice) + DevicesToHandle = append(DevicesToHandle, ascend.Ascend310PName) +} + +func PodAllocationTrySuccess(nodeName string, devName string, lockName string, pod *corev1.Pod) { + refreshed, err := client.GetClient().CoreV1().Pods(pod.Namespace).Get(context.Background(), pod.Name, metav1.GetOptions{}) + if err != nil { + klog.Errorf("get pods %s/%s error: %+v", pod.Namespace, pod.Name, err) + return + } + annos := refreshed.Annotations[util.InRequestDevices[devName]] + klog.Infoln("TrySuccess:", annos) + for _, val := range DevicesToHandle { + if strings.Contains(annos, val) { + return + 
} + } + klog.Infoln("AllDevicesAllocateSuccess releasing lock") + PodAllocationSuccess(nodeName, pod, lockName) +} + +func PodAllocationSuccess(nodeName string, pod *corev1.Pod, lockname string) { + newannos := make(map[string]string) + newannos[util.DeviceBindPhase] = util.DeviceBindSuccess + err := util.PatchPodAnnotations(pod, newannos) + if err != nil { + klog.Errorf("patchPodAnnotations failed:%v", err.Error()) + } + err = nodelock.ReleaseNodeLock(nodeName, lockname) + if err != nil { + klog.Errorf("release lock failed:%v", err.Error()) + } +} + +func PodAllocationFailed(nodeName string, pod *corev1.Pod, lockname string) { + newannos := make(map[string]string) + newannos[util.DeviceBindPhase] = util.DeviceBindFailed + err := util.PatchPodAnnotations(pod, newannos) + if err != nil { + klog.Errorf("patchPodAnnotations failed:%v", err.Error()) + } + err = nodelock.ReleaseNodeLock(nodeName, lockname) + if err != nil { + klog.Errorf("release lock failed:%v", err.Error()) + } +} + +func GlobalFlagSet() *flag.FlagSet { + fs := flag.NewFlagSet(os.Args[0], flag.ExitOnError) + for _, val := range devices { + val.ParseConfig(fs) + } + fs.BoolVar(&DebugMode, "debug", false, "debug mode") + klog.InitFlags(fs) + return fs +} diff --git a/pkg/device/hygon/device.go b/pkg/device/hygon/device.go new file mode 100644 index 000000000..9b85e4948 --- /dev/null +++ b/pkg/device/hygon/device.go @@ -0,0 +1,235 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package hygon + +import ( + "errors" + "flag" + "strings" + + "github.com/Project-HAMi/HAMi/pkg/api" + "github.com/Project-HAMi/HAMi/pkg/util" + + corev1 "k8s.io/api/core/v1" + "k8s.io/klog/v2" +) + +type DCUDevices struct { +} + +const ( + HandshakeAnnos = "hami.io/node-handshake-dcu" + RegisterAnnos = "hami.io/node-dcu-register" + HygonDCUDevice = "DCU" + HygonDCUCommonWord = "DCU" + DCUInUse = "hygon.com/use-dcutype" + DCUNoUse = "hygon.com/nouse-dcutype" + // DCUUseUUID is user can use specify DCU device for set DCU UUID. + DCUUseUUID = "hygon.com/use-gpuuuid" + // DCUNoUseUUID is user can not use specify DCU device for set DCU UUID. + DCUNoUseUUID = "hygon.com/nouse-gpuuuid" +) + +var ( + HygonResourceCount string + HygonResourceMemory string + HygonResourceCores string +) + +func InitDCUDevice() *DCUDevices { + util.InRequestDevices[HygonDCUDevice] = "hami.io/dcu-devices-to-allocate" + util.SupportDevices[HygonDCUDevice] = "hami.io/dcu-devices-allocated" + util.HandshakeAnnos[HygonDCUDevice] = HandshakeAnnos + return &DCUDevices{} +} + +func (dev *DCUDevices) ParseConfig(fs *flag.FlagSet) { + fs.StringVar(&HygonResourceCount, "dcu-name", "hygon.com/dcunum", "dcu resource count") + fs.StringVar(&HygonResourceMemory, "dcu-memory", "hygon.com/dcumem", "dcu memory resource") + fs.StringVar(&HygonResourceCores, "dcu-cores", "hygon.com/dcucores", "dcu core resource") +} + +func (dev *DCUDevices) MutateAdmission(ctr *corev1.Container) (bool, error) { + _, ok := ctr.Resources.Limits[corev1.ResourceName(HygonResourceCount)] + return ok, nil +} + +func checkDCUtype(annos map[string]string, cardtype string) bool { + if inuse, ok := annos[DCUInUse]; ok { + if !strings.Contains(inuse, ",") { + if strings.Contains(strings.ToUpper(cardtype), strings.ToUpper(inuse)) { + return true + } + } else { + for _, val := range strings.Split(inuse, ",") { + if strings.Contains(strings.ToUpper(cardtype), strings.ToUpper(val)) { + return true + } + } + } + return false + } + if 
nouse, ok := annos[DCUNoUse]; ok { + if !strings.Contains(nouse, ",") { + if strings.Contains(strings.ToUpper(cardtype), strings.ToUpper(nouse)) { + return false + } + } else { + for _, val := range strings.Split(nouse, ",") { + if strings.Contains(strings.ToUpper(cardtype), strings.ToUpper(val)) { + return false + } + } + } + return true + } + return true +} + +func (dev *DCUDevices) LockNode(n *corev1.Node, p *corev1.Pod) error { + return nil +} + +func (dev *DCUDevices) ReleaseNodeLock(n *corev1.Node, p *corev1.Pod) error { + return nil +} + +func (dev *DCUDevices) GetNodeDevices(n corev1.Node) ([]*api.DeviceInfo, error) { + devEncoded, ok := n.Annotations[RegisterAnnos] + if !ok { + return []*api.DeviceInfo{}, errors.New("annos not found " + RegisterAnnos) + } + nodedevices, err := util.DecodeNodeDevices(devEncoded) + if err != nil { + klog.ErrorS(err, "failed to decode node devices", "node", n.Name, "device annotation", devEncoded) + return []*api.DeviceInfo{}, err + } + if len(nodedevices) == 0 { + klog.InfoS("no gpu device found", "node", n.Name, "device annotation", devEncoded) + return []*api.DeviceInfo{}, errors.New("no gpu found on node") + } + devDecoded := util.EncodeNodeDevices(nodedevices) + klog.V(5).InfoS("nodes device information", "node", n.Name, "nodedevices", devDecoded) + return nodedevices, nil +} + +func (dev *DCUDevices) NodeCleanUp(nn string) error { + return util.MarkAnnotationsToDelete(HandshakeAnnos, nn) +} + +func (dev *DCUDevices) CheckHealth(devType string, n *corev1.Node) (bool, bool) { + return util.CheckHealth(devType, n) +} + +func (dev *DCUDevices) CheckType(annos map[string]string, d util.DeviceUsage, n util.ContainerDeviceRequest) (bool, bool, bool) { + if strings.Compare(n.Type, HygonDCUDevice) == 0 { + return true, checkDCUtype(annos, d.Type), false + } + return false, false, false +} + +func (dev *DCUDevices) CheckUUID(annos map[string]string, d util.DeviceUsage) bool { + userUUID, ok := annos[DCUUseUUID] + if ok { + 
klog.V(5).Infof("check uuid for dcu user uuid [%s], device id is %s", userUUID, d.ID)
+		// use , symbol to connect multiple uuid
+		userUUIDs := strings.Split(userUUID, ",")
+		for _, uuid := range userUUIDs {
+			if d.ID == uuid {
+				return true
+			}
+		}
+		return false
+	}
+
+	noUserUUID, ok := annos[DCUNoUseUUID]
+	if ok {
+		klog.V(5).Infof("check uuid for dcu not user uuid [%s], device id is %s", noUserUUID, d.ID)
+		// use , symbol to connect multiple uuid
+		noUserUUIDs := strings.Split(noUserUUID, ",")
+		for _, uuid := range noUserUUIDs {
+			if d.ID == uuid {
+				return false
+			}
+		}
+		return true
+	}
+	return true
+}
+
+func (dev *DCUDevices) GenerateResourceRequests(ctr *corev1.Container) util.ContainerDeviceRequest {
+	klog.Info("Counting dcu devices")
+	dcuResourceCount := corev1.ResourceName(HygonResourceCount)
+	dcuResourceMem := corev1.ResourceName(HygonResourceMemory)
+	dcuResourceCores := corev1.ResourceName(HygonResourceCores)
+	v, ok := ctr.Resources.Limits[dcuResourceCount]
+	if !ok {
+		v, ok = ctr.Resources.Requests[dcuResourceCount]
+	}
+	if ok { // was `} else {`: a requests-only dcu count was fetched but never processed (dead branch); now matches the other plugins
+		if n, ok := v.AsInt64(); ok {
+			klog.Info("Found dcu devices")
+			memnum := 0
+			mem, ok := ctr.Resources.Limits[dcuResourceMem]
+			if !ok {
+				mem, ok = ctr.Resources.Requests[dcuResourceMem]
+			}
+			if ok {
+				memnums, ok := mem.AsInt64()
+				if ok {
+					memnum = int(memnums)
+				}
+			}
+			corenum := int32(100)
+			core, ok := ctr.Resources.Limits[dcuResourceCores]
+			if !ok {
+				core, ok = ctr.Resources.Requests[dcuResourceCores]
+			}
+			if ok {
+				corenums, ok := core.AsInt64()
+				if ok {
+					corenum = int32(corenums)
+				}
+			}
+			mempnum := 0
+			if memnum == 0 {
+				mempnum = 100
+			}
+
+			return util.ContainerDeviceRequest{
+				Nums:             int32(n),
+				Type:             HygonDCUDevice,
+				Memreq:           int32(memnum),
+				MemPercentagereq: int32(mempnum),
+				Coresreq:         corenum,
+			}
+		}
+	}
+	return util.ContainerDeviceRequest{}
+}
+
+func (dev *DCUDevices) PatchAnnotations(annoinput *map[string]string, pd util.PodDevices) map[string]string {
+	devlist, ok := 
pd[HygonDCUDevice] + if ok && len(devlist) > 0 { + deviceStr := util.EncodePodSingleDevice(devlist) + (*annoinput)[util.InRequestDevices[HygonDCUDevice]] = deviceStr + (*annoinput)[util.SupportDevices[HygonDCUDevice]] = deviceStr + klog.V(5).Infof("pod add notation key [%s], values is [%s]", util.InRequestDevices[HygonDCUDevice], deviceStr) + klog.V(5).Infof("pod add notation key [%s], values is [%s]", util.SupportDevices[HygonDCUDevice], deviceStr) + } + return *annoinput +} diff --git a/pkg/device/iluvatar/device.go b/pkg/device/iluvatar/device.go new file mode 100644 index 000000000..194ccfda9 --- /dev/null +++ b/pkg/device/iluvatar/device.go @@ -0,0 +1,215 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package iluvatar + +import ( + "flag" + "fmt" + "strings" + + "github.com/Project-HAMi/HAMi/pkg/api" + "github.com/Project-HAMi/HAMi/pkg/util" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/klog/v2" +) + +type IluvatarDevices struct { +} + +const ( + IluvatarGPUDevice = "Iluvatar" + IluvatarGPUCommonWord = "Iluvatar" + IluvatarDeviceSelection = "iluvatar.ai/predicate-gpu-idx-" + // IluvatarUseUUID is user can use specify Iluvatar device for set Iluvatar UUID. + IluvatarUseUUID = "iluvatar.ai/use-gpuuuid" + // IluvatarNoUseUUID is user can not use specify Iluvatar device for set Iluvatar UUID. 
+	IluvatarNoUseUUID = "iluvatar.ai/nouse-gpuuuid"
+)
+
+var (
+	IluvatarResourceCount  string
+	IluvatarResourceMemory string
+	IluvatarResourceCores  string
+)
+
+func InitIluvatarDevice() *IluvatarDevices {
+	util.InRequestDevices[IluvatarGPUDevice] = "hami.io/iluvatar-vgpu-devices-to-allocate"
+	util.SupportDevices[IluvatarGPUDevice] = "hami.io/iluvatar-vgpu-devices-allocated"
+	return &IluvatarDevices{}
+}
+
+func (dev *IluvatarDevices) ParseConfig(fs *flag.FlagSet) {
+	fs.StringVar(&IluvatarResourceCount, "iluvatar-name", "iluvatar.ai/vgpu", "iluvatar resource count")
+	fs.StringVar(&IluvatarResourceMemory, "iluvatar-memory", "iluvatar.ai/vcuda-memory", "iluvatar memory resource")
+	fs.StringVar(&IluvatarResourceCores, "iluvatar-cores", "iluvatar.ai/vcuda-core", "iluvatar core resource")
+}
+
+func (dev *IluvatarDevices) MutateAdmission(ctr *corev1.Container) (bool, error) {
+	count, ok := ctr.Resources.Limits[corev1.ResourceName(IluvatarResourceCount)]
+	if ok {
+		if count.Value() > 1 {
+			ctr.Resources.Limits[corev1.ResourceName(IluvatarResourceCores)] = *resource.NewQuantity(count.Value()*int64(100), resource.DecimalSI)
+		}
+	}
+	return ok, nil
+}
+
+func (dev *IluvatarDevices) GetNodeDevices(n corev1.Node) ([]*api.DeviceInfo, error) {
+	nodedevices := []*api.DeviceInfo{}
+	i := 0
+	cards, _ := n.Status.Capacity.Name(corev1.ResourceName(IluvatarResourceCores), resource.DecimalSI).AsInt64()
+	memoryTotal, _ := n.Status.Capacity.Name(corev1.ResourceName(IluvatarResourceMemory), resource.DecimalSI).AsInt64()
+	for int64(i)*100 < cards {
+		nodedevices = append(nodedevices, &api.DeviceInfo{
+			Index:   i,
+			Id:      n.Name + "-iluvatar-" + fmt.Sprint(i),
+			Count:   100,
+			Devmem:  int32(memoryTotal * 256 * 100 / cards),
+			Devcore: 100,
+			Type:    IluvatarGPUDevice,
+			Numa:    0,
+			Health:  true,
+		})
+		i++ // increment after append so device Index/Id are zero-based, matching the cambricon plugin's identical loop
+	}
+	return nodedevices, nil
+}
+
+func (dev *IluvatarDevices) PatchAnnotations(annoinput *map[string]string, pd util.PodDevices) map[string]string {
+	devlist, ok := 
pd[IluvatarGPUDevice] + if ok && len(devlist) > 0 { + (*annoinput)[util.InRequestDevices[IluvatarGPUDevice]] = util.EncodePodSingleDevice(devlist) + (*annoinput)[util.SupportDevices[IluvatarGPUDevice]] = util.EncodePodSingleDevice(devlist) + for idx, dp := range devlist { + annoKey := IluvatarDeviceSelection + fmt.Sprint(idx) + value := "" + for _, val := range dp { + value = value + fmt.Sprint(val.Idx) + "," + } + if len(value) > 0 { + (*annoinput)[annoKey] = strings.TrimRight(value, ",") + } + } + } + return *annoinput +} + +func (dev *IluvatarDevices) LockNode(n *corev1.Node, p *corev1.Pod) error { + return nil +} + +func (dev *IluvatarDevices) ReleaseNodeLock(n *corev1.Node, p *corev1.Pod) error { + return nil +} + +func (dev *IluvatarDevices) NodeCleanUp(nn string) error { + return nil +} + +func (dev *IluvatarDevices) CheckType(annos map[string]string, d util.DeviceUsage, n util.ContainerDeviceRequest) (bool, bool, bool) { + if strings.Compare(n.Type, IluvatarGPUDevice) == 0 { + return true, true, false + } + return false, false, false +} + +func (dev *IluvatarDevices) CheckUUID(annos map[string]string, d util.DeviceUsage) bool { + userUUID, ok := annos[IluvatarUseUUID] + if ok { + klog.V(5).Infof("check uuid for Iluvatar user uuid [%s], device id is %s", userUUID, d.ID) + // use , symbol to connect multiple uuid + userUUIDs := strings.Split(userUUID, ",") + for _, uuid := range userUUIDs { + if d.ID == uuid { + return true + } + } + return false + } + + noUserUUID, ok := annos[IluvatarNoUseUUID] + if ok { + klog.V(5).Infof("check uuid for Iluvatar not user uuid [%s], device id is %s", noUserUUID, d.ID) + // use , symbol to connect multiple uuid + noUserUUIDs := strings.Split(noUserUUID, ",") + for _, uuid := range noUserUUIDs { + if d.ID == uuid { + return false + } + } + return true + } + return true +} + +func (dev *IluvatarDevices) CheckHealth(devType string, n *corev1.Node) (bool, bool) { + return true, true +} + +func (dev *IluvatarDevices) 
GenerateResourceRequests(ctr *corev1.Container) util.ContainerDeviceRequest { + klog.Info("Counting iluvatar devices") + iluvatarResourceCount := corev1.ResourceName(IluvatarResourceCount) + iluvatarResourceMem := corev1.ResourceName(IluvatarResourceMemory) + iluvatarResourceCores := corev1.ResourceName(IluvatarResourceCores) + v, ok := ctr.Resources.Limits[iluvatarResourceCount] + if !ok { + v, ok = ctr.Resources.Requests[iluvatarResourceCount] + } + if ok { + if n, ok := v.AsInt64(); ok { + klog.Info("Found iluvatar devices") + memnum := 0 + mem, ok := ctr.Resources.Limits[iluvatarResourceMem] + if !ok { + mem, ok = ctr.Resources.Requests[iluvatarResourceMem] + } + if ok { + memnums, ok := mem.AsInt64() + if ok { + memnum = int(memnums) * 256 + } + } + corenum := int32(0) + core, ok := ctr.Resources.Limits[iluvatarResourceCores] + if !ok { + core, ok = ctr.Resources.Requests[iluvatarResourceCores] + } + if ok { + corenums, ok := core.AsInt64() + if ok { + corenum = int32(corenums) + } + } + + mempnum := 0 + if memnum == 0 { + mempnum = 100 + } + + return util.ContainerDeviceRequest{ + Nums: int32(n), + Type: IluvatarGPUDevice, + Memreq: int32(memnum), + MemPercentagereq: int32(mempnum), + Coresreq: corenum, + } + } + } + return util.ContainerDeviceRequest{} +} diff --git a/pkg/device/nvidia/device.go b/pkg/device/nvidia/device.go new file mode 100644 index 000000000..d5fabaa80 --- /dev/null +++ b/pkg/device/nvidia/device.go @@ -0,0 +1,321 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nvidia + +import ( + "errors" + "flag" + "fmt" + "strconv" + "strings" + + "github.com/Project-HAMi/HAMi/pkg/api" + "github.com/Project-HAMi/HAMi/pkg/scheduler/config" + "github.com/Project-HAMi/HAMi/pkg/util" + "github.com/Project-HAMi/HAMi/pkg/util/nodelock" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/klog/v2" +) + +const ( + HandshakeAnnos = "hami.io/node-handshake" + RegisterAnnos = "hami.io/node-nvidia-register" + NvidiaGPUDevice = "NVIDIA" + NvidiaGPUCommonWord = "GPU" + GPUInUse = "nvidia.com/use-gputype" + GPUNoUse = "nvidia.com/nouse-gputype" + NumaBind = "nvidia.com/numa-bind" + NodeLockNvidia = "hami.io/mutex.lock" + // GPUUseUUID is user can use specify GPU device for set GPU UUID. + GPUUseUUID = "nvidia.com/use-gpuuuid" + // GPUNoUseUUID is user can not use specify GPU device for set GPU UUID. + GPUNoUseUUID = "nvidia.com/nouse-gpuuuid" +) + +var ( + ResourceName string + ResourceMem string + ResourceCores string + ResourceMemPercentage string + ResourcePriority string + DebugMode bool + OverwriteEnv bool +) + +type NvidiaGPUDevices struct { +} + +func InitNvidiaDevice() *NvidiaGPUDevices { + util.InRequestDevices[NvidiaGPUDevice] = "hami.io/vgpu-devices-to-allocate" + util.SupportDevices[NvidiaGPUDevice] = "hami.io/vgpu-devices-allocated" + util.HandshakeAnnos[NvidiaGPUDevice] = HandshakeAnnos + return &NvidiaGPUDevices{} +} + +func (dev *NvidiaGPUDevices) ParseConfig(fs *flag.FlagSet) { + fs.StringVar(&ResourceName, "resource-name", "nvidia.com/gpu", "resource name") + fs.StringVar(&ResourceMem, "resource-mem", "nvidia.com/gpumem", "gpu memory to allocate") + fs.StringVar(&ResourceMemPercentage, "resource-mem-percentage", "nvidia.com/gpumem-percentage", "gpu memory fraction to allocate") + fs.StringVar(&ResourceCores, "resource-cores", "nvidia.com/gpucores", "cores percentage to use") + 
fs.StringVar(&ResourcePriority, "resource-priority", "vgputaskpriority", "vgpu task priority 0 for high and 1 for low") + fs.BoolVar(&OverwriteEnv, "overwrite-env", false, "If set NVIDIA_VISIBLE_DEVICES=none to pods with no-gpu allocation") +} + +func (dev *NvidiaGPUDevices) NodeCleanUp(nn string) error { + return util.MarkAnnotationsToDelete(HandshakeAnnos, nn) +} + +func (dev *NvidiaGPUDevices) CheckHealth(devType string, n *corev1.Node) (bool, bool) { + return util.CheckHealth(devType, n) +} + +func (dev *NvidiaGPUDevices) LockNode(n *corev1.Node, p *corev1.Pod) error { + found := false + for _, val := range p.Spec.Containers { + if (dev.GenerateResourceRequests(&val).Nums) > 0 { + found = true + break + } + } + if !found { + return nil + } + return nodelock.LockNode(n.Name, NodeLockNvidia, p) +} + +func (dev *NvidiaGPUDevices) ReleaseNodeLock(n *corev1.Node, p *corev1.Pod) error { + found := false + for _, val := range p.Spec.Containers { + if (dev.GenerateResourceRequests(&val).Nums) > 0 { + found = true + break + } + } + if !found { + return nil + } + return nodelock.ReleaseNodeLock(n.Name, NodeLockNvidia) +} + +func (dev *NvidiaGPUDevices) GetNodeDevices(n corev1.Node) ([]*api.DeviceInfo, error) { + devEncoded, ok := n.Annotations[RegisterAnnos] + if !ok { + return []*api.DeviceInfo{}, errors.New("annos not found " + RegisterAnnos) + } + nodedevices, err := util.DecodeNodeDevices(devEncoded) + if err != nil { + klog.ErrorS(err, "failed to decode node devices", "node", n.Name, "device annotation", devEncoded) + return []*api.DeviceInfo{}, err + } + if len(nodedevices) == 0 { + klog.InfoS("no gpu device found", "node", n.Name, "device annotation", devEncoded) + return []*api.DeviceInfo{}, errors.New("no gpu found on node") + } + devDecoded := util.EncodeNodeDevices(nodedevices) + klog.V(5).InfoS("nodes device information", "node", n.Name, "nodedevices", devDecoded) + return nodedevices, nil +} + +func (dev *NvidiaGPUDevices) MutateAdmission(ctr 
*corev1.Container) (bool, error) { + /*gpu related */ + priority, ok := ctr.Resources.Limits[corev1.ResourceName(ResourcePriority)] + if ok { + ctr.Env = append(ctr.Env, corev1.EnvVar{ + Name: api.TaskPriority, + Value: fmt.Sprint(priority.Value()), + }) + } + + _, resourceNameOK := ctr.Resources.Limits[corev1.ResourceName(ResourceName)] + if resourceNameOK { + return resourceNameOK, nil + } + + _, resourceCoresOK := ctr.Resources.Limits[corev1.ResourceName(ResourceCores)] + _, resourceMemOK := ctr.Resources.Limits[corev1.ResourceName(ResourceMem)] + _, resourceMemPercentageOK := ctr.Resources.Limits[corev1.ResourceName(ResourceMemPercentage)] + + if resourceCoresOK || resourceMemOK || resourceMemPercentageOK { + if config.DefaultResourceNum > 0 { + ctr.Resources.Limits[corev1.ResourceName(ResourceName)] = *resource.NewQuantity(int64(config.DefaultResourceNum), resource.BinarySI) + resourceNameOK = true + } + } + + if !resourceNameOK && OverwriteEnv { + ctr.Env = append(ctr.Env, corev1.EnvVar{ + Name: "NVIDIA_VISIBLE_DEVICES", + Value: "none", + }) + } + return resourceNameOK, nil +} + +func checkGPUtype(annos map[string]string, cardtype string) bool { + cardtype = strings.ToUpper(cardtype) + if inuse, ok := annos[GPUInUse]; ok { + useTypes := strings.Split(inuse, ",") + if !ContainsSliceFunc(useTypes, func(useType string) bool { + return strings.Contains(cardtype, strings.ToUpper(useType)) + }) { + return false + } + } + if unuse, ok := annos[GPUNoUse]; ok { + unuseTypes := strings.Split(unuse, ",") + if ContainsSliceFunc(unuseTypes, func(unuseType string) bool { + return strings.Contains(cardtype, strings.ToUpper(unuseType)) + }) { + return false + } + } + return true +} + +func ContainsSliceFunc[S ~[]E, E any](s S, match func(E) bool) bool { + for _, e := range s { + if match(e) { + return true + } + } + return false +} + +func assertNuma(annos map[string]string) bool { + numabind, ok := annos[NumaBind] + if ok { + enforce, err := strconv.ParseBool(numabind) + 
if err == nil && enforce { + return true + } + } + return false +} + +func (dev *NvidiaGPUDevices) CheckType(annos map[string]string, d util.DeviceUsage, n util.ContainerDeviceRequest) (bool, bool, bool) { + if strings.Compare(n.Type, NvidiaGPUDevice) == 0 { + return true, checkGPUtype(annos, d.Type), assertNuma(annos) + } + return false, false, false +} + +func (dev *NvidiaGPUDevices) CheckUUID(annos map[string]string, d util.DeviceUsage) bool { + userUUID, ok := annos[GPUUseUUID] + if ok { + klog.V(5).Infof("check uuid for nvidia user uuid [%s], device id is %s", userUUID, d.ID) + // use , symbol to connect multiple uuid + userUUIDs := strings.Split(userUUID, ",") + for _, uuid := range userUUIDs { + if d.ID == uuid { + return true + } + } + return false + } + + noUserUUID, ok := annos[GPUNoUseUUID] + if ok { + klog.V(5).Infof("check uuid for nvidia not user uuid [%s], device id is %s", noUserUUID, d.ID) + // use , symbol to connect multiple uuid + noUserUUIDs := strings.Split(noUserUUID, ",") + for _, uuid := range noUserUUIDs { + if d.ID == uuid { + return false + } + } + return true + } + + return true +} + +func (dev *NvidiaGPUDevices) PatchAnnotations(annoinput *map[string]string, pd util.PodDevices) map[string]string { + devlist, ok := pd[NvidiaGPUDevice] + if ok && len(devlist) > 0 { + deviceStr := util.EncodePodSingleDevice(devlist) + (*annoinput)[util.InRequestDevices[NvidiaGPUDevice]] = deviceStr + (*annoinput)[util.SupportDevices[NvidiaGPUDevice]] = deviceStr + klog.V(5).Infof("pod add notation key [%s], values is [%s]", util.InRequestDevices[NvidiaGPUDevice], deviceStr) + klog.V(5).Infof("pod add notation key [%s], values is [%s]", util.SupportDevices[NvidiaGPUDevice], deviceStr) + } + return *annoinput +} + +func (dev *NvidiaGPUDevices) GenerateResourceRequests(ctr *corev1.Container) util.ContainerDeviceRequest { + resourceName := corev1.ResourceName(ResourceName) + resourceMem := corev1.ResourceName(ResourceMem) + resourceMemPercentage := 
corev1.ResourceName(ResourceMemPercentage) + resourceCores := corev1.ResourceName(ResourceCores) + v, ok := ctr.Resources.Limits[resourceName] + if !ok { + v, ok = ctr.Resources.Requests[resourceName] + } + if ok { + if n, ok := v.AsInt64(); ok { + memnum := 0 + mem, ok := ctr.Resources.Limits[resourceMem] + if !ok { + mem, ok = ctr.Resources.Requests[resourceMem] + } + if ok { + memnums, ok := mem.AsInt64() + if ok { + memnum = int(memnums) + } + } + mempnum := int32(101) + mem, ok = ctr.Resources.Limits[resourceMemPercentage] + if !ok { + mem, ok = ctr.Resources.Requests[resourceMemPercentage] + } + if ok { + mempnums, ok := mem.AsInt64() + if ok { + mempnum = int32(mempnums) + } + } + if mempnum == 101 && memnum == 0 { + if config.DefaultMem != 0 { + memnum = int(config.DefaultMem) + } else { + mempnum = 100 + } + } + corenum := config.DefaultCores + core, ok := ctr.Resources.Limits[resourceCores] + if !ok { + core, ok = ctr.Resources.Requests[resourceCores] + } + if ok { + corenums, ok := core.AsInt64() + if ok { + corenum = int32(corenums) + } + } + return util.ContainerDeviceRequest{ + Nums: int32(n), + Type: NvidiaGPUDevice, + Memreq: int32(memnum), + MemPercentagereq: int32(mempnum), + Coresreq: int32(corenum), + } + } + } + return util.ContainerDeviceRequest{} +} diff --git a/pkg/device/nvidia/device_test.go b/pkg/device/nvidia/device_test.go new file mode 100644 index 000000000..b48fe2668 --- /dev/null +++ b/pkg/device/nvidia/device_test.go @@ -0,0 +1,277 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nvidia + +import ( + "testing" + + "github.com/Project-HAMi/HAMi/pkg/scheduler/config" + "github.com/Project-HAMi/HAMi/pkg/util" + + "gotest.tools/v3/assert" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" +) + +func Test_DefaultResourceNum(t *testing.T) { + v := *resource.NewQuantity(1, resource.BinarySI) + vv, ok := v.AsInt64() + assert.Equal(t, ok, true) + assert.Equal(t, vv, int64(1)) +} + +func Test_MutateAdmission(t *testing.T) { + ResourceName = "nvidia.com/gpu" + ResourceMem = "nvidia.com/gpumem" + ResourceMemPercentage = "nvidia.com/gpumem-percentage" + ResourceCores = "nvidia.com/gpucores" + config.DefaultResourceNum = 1 + tests := []struct { + name string + args *corev1.Container + want bool + }{ + { + name: "having ResourceName set to resource limits.", + args: &corev1.Container{ + Name: "test", + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "nvidia.com/gpu": *resource.NewQuantity(1, resource.BinarySI), + }, + }, + }, + want: true, + }, + { + name: "don't having ResourceName, but having ResourceCores set to resource limits", + args: &corev1.Container{ + Name: "test", + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "nvidia.com/gpucores": *resource.NewQuantity(1, resource.BinarySI), + }, + }, + }, + want: true, + }, + { + name: "don't having ResourceName, but having ResourceMem set to resource limits", + args: &corev1.Container{ + Name: "test", + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "nvidia.com/gpumem": *resource.NewQuantity(1, resource.BinarySI), + }, + }, + }, + want: true, + }, + { + name: "don't having ResourceName, but having ResourceMemPercentage set to resource limits", + args: &corev1.Container{ + Name: "test", + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "nvidia.com/gpumem-percentage": 
*resource.NewQuantity(1, resource.BinarySI), + }, + }, + }, + want: true, + }, + { + name: "don't having math resources.", + args: &corev1.Container{ + Name: "test", + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{}, + }, + }, + want: false, + }, + } + + gpuDevices := &NvidiaGPUDevices{} + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + got, _ := gpuDevices.MutateAdmission(test.args) + if test.want != got { + t.Fatalf("exec MutateAdmission method expect return is %+v, but got is %+v", test.want, got) + } + }) + } +} + +func Test_CheckUUID(t *testing.T) { + gpuDevices := &NvidiaGPUDevices{} + tests := []struct { + name string + args struct { + annos map[string]string + d util.DeviceUsage + } + want bool + }{ + { + name: "don't set GPUUseUUID and GPUNoUseUUID annotation", + args: struct { + annos map[string]string + d util.DeviceUsage + }{ + annos: make(map[string]string), + d: util.DeviceUsage{}, + }, + want: true, + }, + { + name: "use set GPUUseUUID don't set GPUNoUseUUID annotation,device match", + args: struct { + annos map[string]string + d util.DeviceUsage + }{ + annos: map[string]string{ + GPUUseUUID: "abc,123", + }, + d: util.DeviceUsage{ + ID: "abc", + }, + }, + want: true, + }, + { + name: "use set GPUUseUUID don't set GPUNoUseUUID annotation,device don't match", + args: struct { + annos map[string]string + d util.DeviceUsage + }{ + annos: map[string]string{ + GPUUseUUID: "abc,123", + }, + d: util.DeviceUsage{ + ID: "1abc", + }, + }, + want: false, + }, + { + name: "use don't set GPUUseUUID set GPUNoUseUUID annotation,device match", + args: struct { + annos map[string]string + d util.DeviceUsage + }{ + annos: map[string]string{ + GPUNoUseUUID: "abc,123", + }, + d: util.DeviceUsage{ + ID: "abc", + }, + }, + want: false, + }, + { + name: "use don't set GPUUseUUID set GPUNoUseUUID annotation,device don't match", + args: struct { + annos map[string]string + d util.DeviceUsage + }{ + annos: map[string]string{ + 
GPUNoUseUUID: "abc,123", + }, + d: util.DeviceUsage{ + ID: "1abc", + }, + }, + want: true, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + got := gpuDevices.CheckUUID(test.args.annos, test.args.d) + assert.Equal(t, test.want, got) + }) + } +} + +func Test_CheckType(t *testing.T) { + gpuDevices := &NvidiaGPUDevices{} + tests := []struct { + name string + args struct { + annos map[string]string + d util.DeviceUsage + } + want bool + }{ + { + name: "use set GPUInUse don't set GPUNoUse annotation,device match", + args: struct { + annos map[string]string + d util.DeviceUsage + }{ + annos: map[string]string{ + GPUInUse: "A10", + }, + d: util.DeviceUsage{ + Type: "NVIDIA A100", + }, + }, + want: true, + }, + { + name: "use set GPUInUse set GPUNoUse annotation,device don't match", + args: struct { + annos map[string]string + d util.DeviceUsage + }{ + annos: map[string]string{ + GPUInUse: "A10", + GPUNoUse: "A100", + }, + d: util.DeviceUsage{ + Type: "NVIDIA A100", + }, + }, + want: false, + }, + { + name: "use set GPUInUse set GPUNoUse annotation,device match", + args: struct { + annos map[string]string + d util.DeviceUsage + }{ + annos: map[string]string{ + GPUInUse: "A10", + GPUNoUse: "A100", + }, + d: util.DeviceUsage{ + Type: "NVIDIA A10", + }, + }, + want: true, + }, + } + req := util.ContainerDeviceRequest{ + Type: NvidiaGPUDevice, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + _, got, _ := gpuDevices.CheckType(test.args.annos, test.args.d, req) + assert.Equal(t, test.want, got) + }) + } +} diff --git a/pkg/k8sutil/client.go b/pkg/k8sutil/client.go index 86715acfc..65ca108b4 100644 --- a/pkg/k8sutil/client.go +++ b/pkg/k8sutil/client.go @@ -1,42 +1,43 @@ /* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ package k8sutil import ( - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/clientcmd" - "os" - "path/filepath" + "os" + "path/filepath" + + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" ) -// NewClient connects to an API server +// NewClient connects to an API server. 
func NewClient() (kubernetes.Interface, error) { - kubeConfig := os.Getenv("KUBECONFIG") - if kubeConfig == "" { - kubeConfig = filepath.Join(os.Getenv("HOME"), ".kube", "config") - } - config, err := rest.InClusterConfig() - if err != nil { - config, err = clientcmd.BuildConfigFromFlags("", kubeConfig) - if err != nil { - return nil, err - } - } - client, err := kubernetes.NewForConfig(config) - return client, err + kubeConfig := os.Getenv("KUBECONFIG") + if kubeConfig == "" { + kubeConfig = filepath.Join(os.Getenv("HOME"), ".kube", "config") + } + config, err := rest.InClusterConfig() + if err != nil { + config, err = clientcmd.BuildConfigFromFlags("", kubeConfig) + if err != nil { + return nil, err + } + } + client, err := kubernetes.NewForConfig(config) + return client, err } diff --git a/pkg/k8sutil/pod.go b/pkg/k8sutil/pod.go index 90bee783a..ea39c846c 100644 --- a/pkg/k8sutil/pod.go +++ b/pkg/k8sutil/pod.go @@ -1,45 +1,50 @@ /* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ package k8sutil import ( - corev1 "k8s.io/api/core/v1" + "github.com/Project-HAMi/HAMi/pkg/device" + "github.com/Project-HAMi/HAMi/pkg/util" + + corev1 "k8s.io/api/core/v1" + "k8s.io/klog/v2" ) -func ResourceNums(pod *corev1.Pod, resourceName corev1.ResourceName) (counts []int) { - counts = make([]int, len(pod.Spec.Containers)) - for i := 0; i < len(pod.Spec.Containers); i++ { - v, ok := pod.Spec.Containers[i].Resources.Limits[resourceName] - if !ok { - v, ok = pod.Spec.Containers[i].Resources.Requests[resourceName] - } - if ok { - if n, ok := v.AsInt64(); ok { - counts[i] = int(n) - } - } - } - return counts +func Resourcereqs(pod *corev1.Pod) (counts util.PodDeviceRequests) { + counts = make(util.PodDeviceRequests, len(pod.Spec.Containers)) + //Count Nvidia GPU + for i := 0; i < len(pod.Spec.Containers); i++ { + devices := device.GetDevices() + counts[i] = make(util.ContainerDeviceRequests) + for idx, val := range devices { + request := val.GenerateResourceRequests(&pod.Spec.Containers[i]) + if request.Nums > 0 { + counts[i][idx] = val.GenerateResourceRequests(&pod.Spec.Containers[i]) + } + } + } + klog.InfoS("collect requestreqs", "counts", counts) + return counts } func IsPodInTerminatedState(pod *corev1.Pod) bool { - return pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodSucceeded + return pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodSucceeded } func AllContainersCreated(pod *corev1.Pod) bool { - return len(pod.Status.ContainerStatuses) >= len(pod.Spec.Containers) + return len(pod.Status.ContainerStatuses) >= 
len(pod.Spec.Containers) } diff --git a/pkg/k8sutil/pod_test.go b/pkg/k8sutil/pod_test.go new file mode 100644 index 000000000..b89356fc4 --- /dev/null +++ b/pkg/k8sutil/pod_test.go @@ -0,0 +1,131 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package k8sutil + +import ( + "testing" + + "github.com/Project-HAMi/HAMi/pkg/device/nvidia" + "github.com/Project-HAMi/HAMi/pkg/util" + + "gotest.tools/v3/assert" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" +) + +func Test_Resourcereqs(t *testing.T) { + nvidia.ResourceName = "hami.io/gpu" + nvidia.ResourceMem = "hami.io/gpumem" + nvidia.ResourceMemPercentage = "hami.io/gpumem-percentage" + nvidia.ResourceCores = "hami.io/gpucores" + tests := []struct { + name string + args *corev1.Pod + want util.PodDeviceRequests + }{ + { + name: "don't resource", + args: &corev1.Pod{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "cpu": *resource.NewQuantity(1, resource.BinarySI), + }, + }, + }, + }, + }, + }, + want: []util.ContainerDeviceRequests{{}}, + }, + { + name: "one container use gpu", + args: &corev1.Pod{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "hami.io/gpu": *resource.NewQuantity(1, resource.BinarySI), + "hami.io/gpucores": *resource.NewQuantity(30, resource.BinarySI), + "hami.io/gpumem": 
*resource.NewQuantity(1000, resource.BinarySI), + }, + }, + }, + }, + }, + }, + want: []util.ContainerDeviceRequests{ + { + nvidia.NvidiaGPUDevice: util.ContainerDeviceRequest{ + Nums: 1, + Type: nvidia.NvidiaGPUDevice, + Memreq: 1000, + MemPercentagereq: 101, + Coresreq: 30, + }, + }, + }, + }, + { + name: "two container only one container use gpu", + args: &corev1.Pod{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "hami.io/gpu": *resource.NewQuantity(1, resource.BinarySI), + "hami.io/gpucores": *resource.NewQuantity(30, resource.BinarySI), + "hami.io/gpumem": *resource.NewQuantity(1000, resource.BinarySI), + }, + }, + }, + { + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "cpu": *resource.NewQuantity(1, resource.BinarySI), + }, + }, + }, + }, + }, + }, + want: []util.ContainerDeviceRequests{ + { + nvidia.NvidiaGPUDevice: util.ContainerDeviceRequest{ + Nums: 1, + Type: nvidia.NvidiaGPUDevice, + Memreq: 1000, + MemPercentagereq: 101, + Coresreq: 30, + }, + }, + {}, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + got := Resourcereqs(test.args) + assert.DeepEqual(t, test.want, got) + }) + } +} diff --git a/pkg/monitor/nvidia/cudevshr.go b/pkg/monitor/nvidia/cudevshr.go new file mode 100644 index 000000000..0198c6626 --- /dev/null +++ b/pkg/monitor/nvidia/cudevshr.go @@ -0,0 +1,244 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nvidia + +import ( + "context" + "errors" + "fmt" + "os" + "path/filepath" + "strings" + "sync" + "syscall" + "time" + "unsafe" + + v0 "github.com/Project-HAMi/HAMi/pkg/monitor/nvidia/v0" + v1 "github.com/Project-HAMi/HAMi/pkg/monitor/nvidia/v1" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" + "k8s.io/klog/v2" +) + +const SharedRegionMagicFlag = 19920718 + +type headerT struct { + initializedFlag int32 + majorVersion int32 + minorVersion int32 +} + +type UsageInfo interface { + DeviceMax() int + DeviceNum() int + DeviceMemoryContextSize(idx int) uint64 + DeviceMemoryModuleSize(idx int) uint64 + DeviceMemoryBufferSize(idx int) uint64 + DeviceMemoryOffset(idx int) uint64 + DeviceMemoryTotal(idx int) uint64 + DeviceSmUtil(idx int) uint64 + IsValidUUID(idx int) bool + DeviceUUID(idx int) string + DeviceMemoryLimit(idx int) uint64 + LastKernelTime() int64 + //UsedMemory(idx int) (uint64, error) + GetPriority() int + GetRecentKernel() int32 + SetRecentKernel(v int32) + GetUtilizationSwitch() int32 + SetUtilizationSwitch(v int32) +} + +type ContainerUsage struct { + PodUID string + ContainerName string + data []byte + Info UsageInfo +} + +type ContainerLister struct { + containerPath string + containers map[string]*ContainerUsage + mutex sync.Mutex + clientset *kubernetes.Clientset +} + +func NewContainerLister() (*ContainerLister, error) { + hookPath, ok := os.LookupEnv("HOOK_PATH") + if !ok { + return nil, fmt.Errorf("HOOK_PATH not set") + } + config, err := clientcmd.BuildConfigFromFlags("", os.Getenv("KUBECONFIG")) + if err != nil { + klog.Errorf("Failed to build kubeconfig: %v", err) + return nil, err + } + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + klog.Errorf("Failed to build clientset: %v", err) + return nil, err + } + 
return &ContainerLister{ + containerPath: filepath.Join(hookPath, "containers"), + containers: make(map[string]*ContainerUsage), + clientset: clientset, + }, nil +} + +func (l *ContainerLister) Lock() { + l.mutex.Lock() +} + +func (l *ContainerLister) UnLock() { + l.mutex.Unlock() +} + +func (l *ContainerLister) ListContainers() map[string]*ContainerUsage { + return l.containers +} + +func (l *ContainerLister) Clientset() *kubernetes.Clientset { + return l.clientset +} + +func (l *ContainerLister) Update() error { + pods, err := l.clientset.CoreV1().Pods("").List(context.Background(), metav1.ListOptions{}) + if err != nil { + return err + } + + l.mutex.Lock() + defer l.mutex.Unlock() + entries, err := os.ReadDir(l.containerPath) + if err != nil { + return err + } + for _, entry := range entries { + if !entry.IsDir() { + continue + } + dirName := filepath.Join(l.containerPath, entry.Name()) + if !isValidPod(entry.Name(), pods) { + dirInfo, err := os.Stat(dirName) + if err == nil && dirInfo.ModTime().Add(time.Second*300).After(time.Now()) { + continue + } + klog.Infof("Removing dirname %s in monitorpath", dirName) + if c, ok := l.containers[entry.Name()]; ok { + syscall.Munmap(c.data) + delete(l.containers, entry.Name()) + } + _ = os.RemoveAll(dirName) + continue + } + if _, ok := l.containers[entry.Name()]; ok { + continue + } + usage, err := loadCache(dirName) + if err != nil { + klog.Errorf("Failed to load cache: %s, error: %v", dirName, err) + continue + } + if usage == nil { + // no cuInit in container + continue + } + usage.PodUID = strings.Split(entry.Name(), "_")[0] + usage.ContainerName = strings.Split(entry.Name(), "_")[1] + l.containers[entry.Name()] = usage + klog.Infof("Adding ctr dirname %s in monitorpath", dirName) + } + return nil +} + +func loadCache(fpath string) (*ContainerUsage, error) { + klog.Infof("Checking path %s", fpath) + files, err := os.ReadDir(fpath) + if err != nil { + return nil, err + } + if len(files) > 2 { + return nil, 
errors.New("cache num not matched") + } + if len(files) == 0 { + return nil, nil + } + cacheFile := "" + for _, val := range files { + if strings.Contains(val.Name(), "libvgpu.so") { + continue + } + if !strings.Contains(val.Name(), ".cache") { + continue + } + cacheFile = filepath.Join(fpath, val.Name()) + break + } + if cacheFile == "" { + klog.Infof("No cache file in %s", fpath) + return nil, nil + } + info, err := os.Stat(cacheFile) + if err != nil { + klog.Errorf("Failed to stat cache file: %s, error: %v", cacheFile, err) + return nil, err + } + if info.Size() < int64(unsafe.Sizeof(headerT{})) { + return nil, fmt.Errorf("cache file size %d too small", info.Size()) + } + f, err := os.OpenFile(cacheFile, os.O_RDWR, 0666) + if err != nil { + klog.Errorf("Failed to open cache file: %s, error: %v", cacheFile, err) + return nil, err + } + defer func(f *os.File) { + _ = f.Close() + }(f) + usage := &ContainerUsage{} + usage.data, err = syscall.Mmap(int(f.Fd()), 0, int(info.Size()), syscall.PROT_WRITE|syscall.PROT_READ, syscall.MAP_SHARED) + if err != nil { + klog.Errorf("Failed to mmap cache file: %s, error: %v", cacheFile, err) + return nil, err + } + head := (*headerT)(unsafe.Pointer(&usage.data[0])) + if head.initializedFlag != SharedRegionMagicFlag { + _ = syscall.Munmap(usage.data) + return nil, fmt.Errorf("cache file magic flag not matched") + } + if info.Size() == 1197897 { + usage.Info = v0.CastSpec(usage.data) + } else if head.majorVersion == 1 { + usage.Info = v1.CastSpec(usage.data) + } else { + _ = syscall.Munmap(usage.data) + return nil, fmt.Errorf("unknown cache file size %d version %d.%d", info.Size(), head.majorVersion, head.minorVersion) + } + return usage, nil +} + +func isValidPod(name string, pods *corev1.PodList) bool { + for _, val := range pods.Items { + if strings.Contains(name, string(val.UID)) { + return true + } + } + return false +} diff --git a/pkg/monitor/nvidia/v0/spec.go b/pkg/monitor/nvidia/v0/spec.go new file mode 100644 index 
000000000..f29839cf2 --- /dev/null +++ b/pkg/monitor/nvidia/v0/spec.go @@ -0,0 +1,176 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v0 + +import "unsafe" + +const maxDevices = 16 + +type deviceMemory struct { + contextSize uint64 + moduleSize uint64 + bufferSize uint64 + offset uint64 + total uint64 +} + +type deviceUtilization struct { + decUtil uint64 + encUtil uint64 + smUtil uint64 +} + +type shrregProcSlotT struct { + pid int32 + hostpid int32 + used [16]deviceMemory + monitorused [16]uint64 + deviceUtil [16]deviceUtilization + status int32 +} + +type uuid struct { + uuid [96]byte +} + +type semT struct { + sem [32]byte +} + +type sharedRegionT struct { + initializedFlag int32 + smInitFlag int32 + ownerPid uint32 + sem semT + num uint64 + uuids [16]uuid + + limit [16]uint64 + smLimit [16]uint64 + procs [1024]shrregProcSlotT + + procnum int32 + utilizationSwitch int32 + recentKernel int32 + priority int32 +} + +type Spec struct { + sr *sharedRegionT +} + +func (s Spec) DeviceMax() int { + return maxDevices +} + +func (s Spec) DeviceNum() int { + return int(s.sr.num) +} + +func (s Spec) DeviceMemoryContextSize(idx int) uint64 { + v := uint64(0) + for _, p := range s.sr.procs { + v += p.used[idx].contextSize + } + return v +} + +func (s Spec) DeviceMemoryModuleSize(idx int) uint64 { + v := uint64(0) + for _, p := range s.sr.procs { + v += p.used[idx].moduleSize + } + return v +} + +func (s Spec) DeviceMemoryBufferSize(idx int) 
uint64 { + v := uint64(0) + for _, p := range s.sr.procs { + v += p.used[idx].bufferSize + } + return v +} + +func (s Spec) DeviceMemoryOffset(idx int) uint64 { + v := uint64(0) + for _, p := range s.sr.procs { + v += p.used[idx].offset + } + return v +} + +func (s Spec) DeviceMemoryTotal(idx int) uint64 { + v := uint64(0) + for _, p := range s.sr.procs { + v += p.used[idx].total + } + return v +} + +func (s Spec) DeviceSmUtil(idx int) uint64 { + v := uint64(0) + for _, p := range s.sr.procs { + v += p.deviceUtil[idx].smUtil + } + return v +} + +func (s Spec) IsValidUUID(idx int) bool { + return s.sr.uuids[idx].uuid[0] != 0 +} + +func (s Spec) DeviceUUID(idx int) string { + return string(s.sr.uuids[idx].uuid[:]) +} + +func (s Spec) DeviceMemoryLimit(idx int) uint64 { + return s.sr.limit[idx] +} + +func (s Spec) LastKernelTime() int64 { + return 0 +} + +func CastSpec(data []byte) Spec { + return Spec{ + sr: (*sharedRegionT)(unsafe.Pointer(&data[0])), + } +} + +// func (s *SharedRegionT) UsedMemory(idx int) (uint64, error) { +// return 0, nil +// } + +func (s Spec) GetPriority() int { + return int(s.sr.priority) +} + +func (s Spec) GetRecentKernel() int32 { + return s.sr.recentKernel +} + +func (s Spec) SetRecentKernel(v int32) { + s.sr.recentKernel = v +} + +func (s Spec) GetUtilizationSwitch() int32 { + return s.sr.utilizationSwitch +} + +func (s Spec) SetUtilizationSwitch(v int32) { + s.sr.utilizationSwitch = v +} diff --git a/pkg/monitor/nvidia/v1/spec.go b/pkg/monitor/nvidia/v1/spec.go new file mode 100644 index 000000000..079507d5e --- /dev/null +++ b/pkg/monitor/nvidia/v1/spec.go @@ -0,0 +1,183 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1 + +import "unsafe" + +const maxDevices = 16 + +type deviceMemory struct { + contextSize uint64 + moduleSize uint64 + bufferSize uint64 + offset uint64 + total uint64 + unused [3]uint64 +} + +type deviceUtilization struct { + decUtil uint64 + encUtil uint64 + smUtil uint64 + unused [3]uint64 +} + +type shrregProcSlotT struct { + pid int32 + hostpid int32 + used [16]deviceMemory + monitorused [16]uint64 + deviceUtil [16]deviceUtilization + status int32 + unused [3]uint64 +} + +type uuid struct { + uuid [96]byte +} + +type semT struct { + sem [32]byte +} + +type sharedRegionT struct { + initializedFlag int32 + majorVersion int32 + minorVersion int32 + smInitFlag int32 + ownerPid uint32 + sem semT + num uint64 + uuids [16]uuid + + limit [16]uint64 + smLimit [16]uint64 + procs [1024]shrregProcSlotT + + procnum int32 + utilizationSwitch int32 + recentKernel int32 + priority int32 + lastKernelTime int64 + unused [4]uint64 +} + +type Spec struct { + sr *sharedRegionT +} + +func (s Spec) DeviceMax() int { + return maxDevices +} + +func (s Spec) DeviceNum() int { + return int(s.sr.num) +} + +func (s Spec) DeviceMemoryContextSize(idx int) uint64 { + v := uint64(0) + for _, p := range s.sr.procs { + v += p.used[idx].contextSize + } + return v +} + +func (s Spec) DeviceMemoryModuleSize(idx int) uint64 { + v := uint64(0) + for _, p := range s.sr.procs { + v += p.used[idx].moduleSize + } + return v +} + +func (s Spec) DeviceMemoryBufferSize(idx int) uint64 { + v := uint64(0) + for _, p := range s.sr.procs { + v += p.used[idx].bufferSize + } + return v +} + +func (s 
Spec) DeviceMemoryOffset(idx int) uint64 { + v := uint64(0) + for _, p := range s.sr.procs { + v += p.used[idx].offset + } + return v +} + +func (s Spec) DeviceMemoryTotal(idx int) uint64 { + v := uint64(0) + for _, p := range s.sr.procs { + v += p.used[idx].total + } + return v +} + +func (s Spec) DeviceSmUtil(idx int) uint64 { + v := uint64(0) + for _, p := range s.sr.procs { + v += p.deviceUtil[idx].smUtil + } + return v +} + +func (s Spec) IsValidUUID(idx int) bool { + return s.sr.uuids[idx].uuid[0] != 0 +} + +func (s Spec) DeviceUUID(idx int) string { + return string(s.sr.uuids[idx].uuid[:]) +} + +func (s Spec) DeviceMemoryLimit(idx int) uint64 { + return s.sr.limit[idx] +} + +func (s Spec) LastKernelTime() int64 { + return s.sr.lastKernelTime +} + +func CastSpec(data []byte) Spec { + return Spec{ + sr: (*sharedRegionT)(unsafe.Pointer(&data[0])), + } +} + +// func (s *SharedRegionT) UsedMemory(idx int) (uint64, error) { +// return 0, nil +// } + +func (s Spec) GetPriority() int { + return int(s.sr.priority) +} + +func (s Spec) GetRecentKernel() int32 { + return s.sr.recentKernel +} + +func (s Spec) SetRecentKernel(v int32) { + s.sr.recentKernel = v +} + +func (s Spec) GetUtilizationSwitch() int32 { + return s.sr.utilizationSwitch +} + +func (s Spec) SetUtilizationSwitch(v int32) { + s.sr.utilizationSwitch = v +} diff --git a/pkg/oci/runtime.go b/pkg/oci/runtime.go new file mode 100644 index 000000000..89df5aa1d --- /dev/null +++ b/pkg/oci/runtime.go @@ -0,0 +1,23 @@ +/* +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +*/ + +package oci + +// Runtime is an interface for a runtime shim. The Exec method accepts a list +// of command line arguments, and returns an error / nil. +type Runtime interface { + Exec([]string) error +} diff --git a/pkg/oci/runtime_exec.go b/pkg/oci/runtime_exec.go new file mode 100644 index 000000000..74086c51c --- /dev/null +++ b/pkg/oci/runtime_exec.go @@ -0,0 +1,79 @@ +/* +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +*/ + +package oci + +import ( + "fmt" + "os" + "syscall" + + log "github.com/sirupsen/logrus" +) + +// SyscallExecRuntime wraps the path that a binary and defines the semantics for how to exec into it. +// This can be used to wrap an OCI-compliant low-level runtime binary, allowing it to be used through the +// Runtime internface. +type SyscallExecRuntime struct { + logger *log.Logger + path string + // exec is used for testing. 
This defaults to syscall.Exec + exec func(argv0 string, argv []string, envv []string) error +} + +var _ Runtime = (*SyscallExecRuntime)(nil) + +// NewSyscallExecRuntime creates a SyscallExecRuntime for the specified path with the standard logger. +func NewSyscallExecRuntime(path string) (Runtime, error) { + return NewSyscallExecRuntimeWithLogger(log.StandardLogger(), path) +} + +// NewSyscallExecRuntimeWithLogger creates a SyscallExecRuntime for the specified logger and path. +func NewSyscallExecRuntimeWithLogger(logger *log.Logger, path string) (Runtime, error) { + info, err := os.Stat(path) + if err != nil { + return nil, fmt.Errorf("invalid path '%v': %v", path, err) + } + if info.IsDir() || info.Mode()&0111 == 0 { + return nil, fmt.Errorf("specified path '%v' is not an executable file", path) + } + + shim := SyscallExecRuntime{ + logger: logger, + path: path, + exec: syscall.Exec, + } + + return &shim, nil +} + +// Exec exces into the binary at the path from the SyscallExecRuntime struct, passing it the supplied arguments +// after ensuring that the first argument is the path of the target binary. +func (s SyscallExecRuntime) Exec(args []string) error { + runtimeArgs := []string{s.path} + if len(args) > 1 { + runtimeArgs = append(runtimeArgs, args[1:]...) + } + + err := s.exec(s.path, runtimeArgs, os.Environ()) + if err != nil { + return fmt.Errorf("could not exec '%v': %v", s.path, err) + } + + // syscall.Exec is not expected to return. This is an error state regardless of whether + // err is nil or not. + return fmt.Errorf("unexpected return from exec '%v'", s.path) +} diff --git a/pkg/oci/runtime_exec_test.go b/pkg/oci/runtime_exec_test.go new file mode 100644 index 000000000..83ac64a2e --- /dev/null +++ b/pkg/oci/runtime_exec_test.go @@ -0,0 +1,100 @@ +/* +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +*/ +package oci + +import ( + "fmt" + "strings" + "testing" + + testlog "github.com/sirupsen/logrus/hooks/test" + "github.com/stretchr/testify/require" +) + +func TestSyscallExecConstructor(t *testing.T) { + r, err := NewSyscallExecRuntime("////an/invalid/path") + require.Error(t, err) + require.Nil(t, r) + + r, err = NewSyscallExecRuntime("/tmp") + require.Error(t, err) + require.Nil(t, r) + + r, err = NewSyscallExecRuntime("/dev/null") + require.Error(t, err) + require.Nil(t, r) + + r, err = NewSyscallExecRuntime("/bin/sh") + require.NoError(t, err) + + f, ok := r.(*SyscallExecRuntime) + require.True(t, ok) + + require.Equal(t, "/bin/sh", f.path) +} + +func TestSyscallExecForwardsArgs(t *testing.T) { + logger, _ := testlog.NewNullLogger() + f := SyscallExecRuntime{ + logger: logger, + path: "runtime", + } + + testCases := []struct { + returnError error + args []string + errorPrefix string + }{ + { + returnError: nil, + errorPrefix: "unexpected return from exec", + }, + { + returnError: fmt.Errorf("error from exec"), + errorPrefix: "could not exec", + }, + { + returnError: nil, + args: []string{"otherargv0"}, + errorPrefix: "unexpected return from exec", + }, + { + returnError: nil, + args: []string{"otherargv0", "arg1", "arg2", "arg3"}, + errorPrefix: "unexpected return from exec", + }, + } + + for i, tc := range testCases { + execMock := WithMockExec(f, tc.returnError) + + err := execMock.Exec(tc.args) + + require.Errorf(t, err, "%d: %v", i, tc) + require.Truef(t, strings.HasPrefix(err.Error(), tc.errorPrefix), "%d: %v", i, tc) + if tc.returnError != 
nil { + require.Truef(t, strings.HasSuffix(err.Error(), tc.returnError.Error()), "%d: %v", i, tc) + } + + require.Equalf(t, f.path, execMock.argv0, "%d: %v", i, tc) + require.Equalf(t, f.path, execMock.argv[0], "%d: %v", i, tc) + + require.LessOrEqualf(t, len(tc.args), len(execMock.argv), "%d: %v", i, tc) + if len(tc.args) > 1 { + require.Equalf(t, tc.args[1:], execMock.argv[1:], "%d: %v", i, tc) + } + } +} diff --git a/pkg/oci/runtime_mock.go b/pkg/oci/runtime_mock.go new file mode 100644 index 000000000..fc48efd87 --- /dev/null +++ b/pkg/oci/runtime_mock.go @@ -0,0 +1,49 @@ +/* +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +*/ + +package oci + +// MockExecRuntime wraps a SyscallExecRuntime, intercepting the exec call for testing. +type MockExecRuntime struct { + SyscallExecRuntime + execMock +} + +// WithMockExec wraps a specified SyscallExecRuntime with a mocked exec function for testing. +func WithMockExec(e SyscallExecRuntime, execResult error) *MockExecRuntime { + m := MockExecRuntime{ + SyscallExecRuntime: e, + execMock: execMock{result: execResult}, + } + // overrdie the exec function to the mocked exec function. 
+ m.SyscallExecRuntime.exec = m.execMock.exec + return &m +} + +type execMock struct { + argv0 string + argv []string + envv []string + result error +} + +func (m *execMock) exec(argv0 string, argv []string, envv []string) error { + m.argv0 = argv0 + m.argv = argv + m.envv = envv + + return m.result +} diff --git a/pkg/oci/spec.go b/pkg/oci/spec.go new file mode 100644 index 000000000..e163ee1b7 --- /dev/null +++ b/pkg/oci/spec.go @@ -0,0 +1,102 @@ +/* +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +*/ + +package oci + +import ( + "encoding/json" + "fmt" + "os" + + oci "github.com/opencontainers/runtime-spec/specs-go" +) + +// SpecModifier is a function that accepts a pointer to an OCI Srec and returns an +// error. The intention is that the function would modify the spec in-place. +type SpecModifier func(*oci.Spec) error + +// Spec defines the operations to be performed on an OCI specification. +type Spec interface { + Load() error + Flush() error + Modify(SpecModifier) error +} + +type fileSpec struct { + *oci.Spec + path string +} + +var _ Spec = (*fileSpec)(nil) + +// NewSpecFromFile creates an object that encapsulates a file-backed OCI spec. +// This can be used to read from the file, modify the spec, and write to the +// same file. 
+func NewSpecFromFile(filepath string) Spec { + oci := fileSpec{ + path: filepath, + } + + return &oci +} + +// Load reads the contents of an OCI spec from file to be referenced internally. +// The file is opened "read-only". +func (s *fileSpec) Load() error { + specFile, err := os.Open(s.path) + if err != nil { + return fmt.Errorf("error opening OCI specification file: %v", err) + } + defer specFile.Close() + + decoder := json.NewDecoder(specFile) + + var spec oci.Spec + err = decoder.Decode(&spec) + if err != nil { + return fmt.Errorf("error reading OCI specification from file: %v", err) + } + + s.Spec = &spec + return nil +} + +// Modify applies the specified SpecModifier to the stored OCI specification. +func (s *fileSpec) Modify(f SpecModifier) error { + if s.Spec == nil { + return fmt.Errorf("no spec loaded for modification") + } + return f(s.Spec) +} + +// Flush writes the stored OCI specification to the filepath specified by the path member. +// The file is truncated upon opening, overwriting any existing contents. +func (s fileSpec) Flush() error { + specFile, err := os.Create(s.path) + if err != nil { + return fmt.Errorf("error opening OCI specification file: %v", err) + } + defer specFile.Close() + + encoder := json.NewEncoder(specFile) + + err = encoder.Encode(s.Spec) + if err != nil { + return fmt.Errorf("error writing OCI specification to file: %v", err) + } + + return nil +} diff --git a/pkg/oci/spec_mock.go b/pkg/oci/spec_mock.go new file mode 100644 index 000000000..544d184c7 --- /dev/null +++ b/pkg/oci/spec_mock.go @@ -0,0 +1,70 @@ +/* +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +*/ + +package oci + +import ( + oci "github.com/opencontainers/runtime-spec/specs-go" +) + +// MockSpec provides a simple mock for an OCI spec to be used in testing. +// It also implements the SpecModifier interface. +type MockSpec struct { + *oci.Spec + MockLoad mockFunc + MockFlush mockFunc + MockModify mockFunc +} + +var _ Spec = (*MockSpec)(nil) + +// NewMockSpec constructs a MockSpec to be used in testing as a Spec. +func NewMockSpec(spec *oci.Spec, flushResult error, modifyResult error) *MockSpec { + s := MockSpec{ + Spec: spec, + MockFlush: mockFunc{result: flushResult}, + MockModify: mockFunc{result: modifyResult}, + } + + return &s +} + +// Load invokes the mocked Load function to return the predefined error / result. +func (s *MockSpec) Load() error { + return s.MockLoad.call() +} + +// Flush invokes the mocked Load function to return the predefined error / result. +func (s *MockSpec) Flush() error { + return s.MockFlush.call() +} + +// Modify applies the specified SpecModifier to the spec and invokes the +// mocked modify function to return the predefined error / result. 
+func (s *MockSpec) Modify(f SpecModifier) error { + f(s.Spec) + return s.MockModify.call() +} + +type mockFunc struct { + Callcount int + result error +} + +func (m *mockFunc) call() error { + m.Callcount++ + return m.result +} diff --git a/pkg/scheduler/config/config.go b/pkg/scheduler/config/config.go index f706d1470..4ac95f4b2 100644 --- a/pkg/scheduler/config/config.go +++ b/pkg/scheduler/config/config.go @@ -1,23 +1,36 @@ /* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ package config +import "github.com/Project-HAMi/HAMi/pkg/scheduler/policy" + var ( - HttpBind string - GrpcBind string - SchedulerName string + HTTPBind string + SchedulerName string + DefaultMem int32 + DefaultCores int32 + DefaultResourceNum int32 + MetricsBindAddress string + + // NodeSchedulerPolicy is config this scheduler node to use `binpack` or `spread`. 
default value is binpack. + NodeSchedulerPolicy = policy.NodeSchedulerPolicyBinpack.String() + // GPUSchedulerPolicy is config this scheduler GPU to use `binpack` or `spread`. default value is spread. + GPUSchedulerPolicy = policy.GPUSchedulerPolicySpread.String() + + // NodeLabelSelector is scheduler filter node by node label. + NodeLabelSelector map[string]string ) diff --git a/pkg/scheduler/event.go b/pkg/scheduler/event.go new file mode 100644 index 000000000..78ee661cb --- /dev/null +++ b/pkg/scheduler/event.go @@ -0,0 +1,78 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scheduler + +import ( + "fmt" + + "github.com/Project-HAMi/HAMi/pkg/scheduler/config" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + v1core "k8s.io/client-go/kubernetes/typed/core/v1" + "k8s.io/client-go/tools/record" +) + +// Define events for ResourceBinding, ResourceFilter objects and their associated resources. +const ( + // EventReasonFilteringFailed indicates that filtering failed. + EventReasonFilteringFailed = "FilteringFailed" + // EventReasonFilteringSucceed indicates that filtering succeed. + EventReasonFilteringSucceed = "FilteringSucceed" + + // EventReasonBindingFailed indicates that binding failed. + EventReasonBindingFailed = "BindingFailed" + // EventReasonBindingSucceed indicates that binding succeed. 
+ EventReasonBindingSucceed = "BindingSucceed" +) + +func (s *Scheduler) addAllEventHandlers() { + + eventBroadcaster := record.NewBroadcaster() + eventBroadcaster.StartStructuredLogging(0) + eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: s.kubeClient.CoreV1().Events(metav1.NamespaceAll)}) + schema := runtime.NewScheme() + + _ = clientgoscheme.AddToScheme(schema) + s.eventRecorder = eventBroadcaster.NewRecorder(schema, corev1.EventSource{Component: config.SchedulerName}) +} + +func (s *Scheduler) recordScheduleBindingResultEvent(pod *corev1.Pod, eventReason string, nodeResult []string, schedulerErr error) { + if pod == nil { + return + } + if schedulerErr == nil { + successMsg := fmt.Sprintf("Successfully binding node %v to %v/%v", nodeResult, pod.Namespace, pod.Name) + s.eventRecorder.Event(pod, corev1.EventTypeNormal, eventReason, successMsg) + } else { + s.eventRecorder.Event(pod, corev1.EventTypeWarning, eventReason, schedulerErr.Error()) + } +} + +func (s *Scheduler) recordScheduleFilterResultEvent(pod *corev1.Pod, eventReason string, nodeResult []string, schedulerErr error) { + if pod == nil { + return + } + if schedulerErr == nil { + successMsg := fmt.Sprintf("Successfully filtered to following nodes: %v for %v/%v ", nodeResult, pod.Namespace, pod.Name) + s.eventRecorder.Event(pod, corev1.EventTypeNormal, eventReason, successMsg) + } else { + s.eventRecorder.Event(pod, corev1.EventTypeWarning, eventReason, schedulerErr.Error()) + } +} diff --git a/pkg/scheduler/nodes.go b/pkg/scheduler/nodes.go index 8b1241888..dfac2a069 100644 --- a/pkg/scheduler/nodes.go +++ b/pkg/scheduler/nodes.go @@ -1,76 +1,108 @@ /* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +Copyright 2024 The HAMi Authors. -package scheduler +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at -import ( - "fmt" - "sync" -) + http://www.apache.org/licenses/LICENSE-2.0 -type DeviceInfo struct { - ID string - Count int32 - Health bool -} +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ -type NodeInfo struct { - ID string - Devices []DeviceInfo -} +package scheduler -type DeviceUsage struct { - id string - used int32 - count int32 - health bool -} +import ( + "fmt" + "strings" + "sync" + + "github.com/Project-HAMi/HAMi/pkg/scheduler/policy" + "github.com/Project-HAMi/HAMi/pkg/util" -type DeviceUsageList []*DeviceUsage + "k8s.io/klog/v2" +) type NodeUsage struct { - devices DeviceUsageList + Devices policy.DeviceUsageList } type nodeManager struct { - nodes map[string]NodeInfo - mutex sync.Mutex + nodes map[string]*util.NodeInfo + mutex sync.RWMutex } func (m *nodeManager) init() { - m.nodes = make(map[string]NodeInfo) + m.nodes = make(map[string]*util.NodeInfo) +} + +func (m *nodeManager) addNode(nodeID string, nodeInfo *util.NodeInfo) { + if nodeInfo == nil || len(nodeInfo.Devices) == 0 { + return + } + m.mutex.Lock() + defer m.mutex.Unlock() + _, ok := m.nodes[nodeID] + if ok { + tmp := make([]util.DeviceInfo, 0, len(m.nodes[nodeID].Devices)+len(nodeInfo.Devices)) + tmp = append(tmp, m.nodes[nodeID].Devices...) + tmp = append(tmp, nodeInfo.Devices...) 
+ m.nodes[nodeID].Devices = tmp + } else { + m.nodes[nodeID] = nodeInfo + } } -func (m *nodeManager) addNode(nodeID string, nodeInfo NodeInfo) { - m.mutex.Lock() - defer m.mutex.Unlock() - m.nodes[nodeID] = nodeInfo +func (m *nodeManager) rmNodeDevice(nodeID string, nodeInfo *util.NodeInfo, deviceVendor string) { + m.mutex.Lock() + defer m.mutex.Unlock() + _, ok := m.nodes[nodeID] + if ok { + if len(m.nodes[nodeID].Devices) == 0 { + delete(m.nodes, nodeID) + return + } + klog.V(5).Infoln("before rm:", m.nodes[nodeID].Devices, "needs remove", nodeInfo.Devices) + tmp := make([]util.DeviceInfo, 0, len(m.nodes[nodeID].Devices)-len(nodeInfo.Devices)) + for _, val := range m.nodes[nodeID].Devices { + if deviceVendor != val.DeviceVendor { + continue + } + found := false + for _, rmval := range nodeInfo.Devices { + if strings.Compare(val.ID, rmval.ID) == 0 { + found = true + break + } + } + if !found && len(val.ID) > 0 { + tmp = append(tmp, val) + } + } + m.nodes[nodeID].Devices = tmp + if len(m.nodes[nodeID].Devices) == 0 { + delete(m.nodes, nodeID) + return + } + klog.V(5).Infoln("Rm Devices res:", m.nodes[nodeID].Devices) + } } -func (m *nodeManager) delNode(nodeID string) { - m.mutex.Lock() - defer m.mutex.Unlock() - delete(m.nodes, nodeID) +func (m *nodeManager) GetNode(nodeID string) (*util.NodeInfo, error) { + m.mutex.RLock() + defer m.mutex.RUnlock() + if n, ok := m.nodes[nodeID]; ok { + return n, nil + } + return &util.NodeInfo{}, fmt.Errorf("node %v not found", nodeID) } -func (m *nodeManager) GetNode(nodeID string) (NodeInfo, error) { - m.mutex.Lock() - defer m.mutex.Unlock() - if n, ok := m.nodes[nodeID]; ok { - return n, nil - } - return NodeInfo{}, fmt.Errorf("node %v not found", nodeID) +func (m *nodeManager) ListNodes() (map[string]*util.NodeInfo, error) { + m.mutex.RLock() + defer m.mutex.RUnlock() + return m.nodes, nil } diff --git a/pkg/scheduler/pods.go b/pkg/scheduler/pods.go index 9211cd77b..1bd026f0d 100644 --- a/pkg/scheduler/pods.go +++ 
b/pkg/scheduler/pods.go @@ -1,114 +1,107 @@ /* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ package scheduler import ( - "4pd.io/k8s-vgpu/pkg/api" - "4pd.io/k8s-vgpu/pkg/util" - "fmt" - corev1 "k8s.io/api/core/v1" - k8stypes "k8s.io/apimachinery/pkg/types" - "k8s.io/klog/v2" - "sync" + "sync" + + "github.com/Project-HAMi/HAMi/pkg/util" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8stypes "k8s.io/apimachinery/pkg/types" + "k8s.io/klog/v2" ) type podInfo struct { - namespace string - name string - uid k8stypes.UID - nodeID string - devices util.PodDevices - ctrIDs []string + Namespace string + Name string + UID k8stypes.UID + NodeID string + Devices util.PodDevices + CtrIDs []string } -type containerInfo struct { - podUID k8stypes.UID - ctrIdx int +// PodUseDeviceStat count pod use device info. 
+type PodUseDeviceStat struct { + // count current node all running success pod num + TotalPod int + // only running success pod and use device pod can count. + UseDevicePod int } type podManager struct { - pods map[k8stypes.UID]*podInfo - containers map[string]containerInfo - mutex sync.Mutex + pods map[k8stypes.UID]*podInfo + mutex sync.RWMutex } func (m *podManager) init() { - m.pods = make(map[k8stypes.UID]*podInfo) - m.containers = make(map[string]containerInfo) + m.pods = make(map[k8stypes.UID]*podInfo) } func (m *podManager) addPod(pod *corev1.Pod, nodeID string, devices util.PodDevices) { - m.mutex.Lock() - defer m.mutex.Unlock() - pi, ok := m.pods[pod.UID] - if !ok { - pi = &podInfo{name: pod.Name, uid: pod.UID} - m.pods[pod.UID] = pi - pi.namespace = pod.Namespace - pi.name = pod.Name - pi.uid = pod.UID - pi.nodeID = nodeID - pi.devices = devices - pi.ctrIDs = make([]string, len(pod.Spec.Containers)) - for i := 0; i < len(pod.Spec.Containers); i++ { - c := &pod.Spec.Containers[i] - if i >= len(devices) { - klog.Errorf("len(device) != len(containers)") - continue - } - for _, env := range c.Env { - if env.Name == api.ContainerUID { - m.containers[env.Value] = containerInfo{ - podUID: pod.UID, - ctrIdx: i, - } - pi.ctrIDs[i] = env.Value - break - } - } - if len(pi.ctrIDs[i]) == 0 { - klog.Errorf("not found container uid in container %v/%v/%v", pod.Namespace, pod.Name, c.Name) - } - } - } + m.mutex.Lock() + defer m.mutex.Unlock() + _, ok := m.pods[pod.UID] + if !ok { + pi := &podInfo{Name: pod.Name, UID: pod.UID, Namespace: pod.Namespace, NodeID: nodeID, Devices: devices} + m.pods[pod.UID] = pi + klog.Infof("Pod added: Name: %s, UID: %s, Namespace: %s, NodeID: %s", pod.Name, pod.UID, pod.Namespace, nodeID) + } } func (m *podManager) delPod(pod *corev1.Pod) { - m.mutex.Lock() - defer m.mutex.Unlock() - pi, ok := m.pods[pod.UID] - if ok { - for _, id := range pi.ctrIDs { - delete(m.containers, id) - } - delete(m.pods, pod.UID) - } + m.mutex.Lock() + defer 
m.mutex.Unlock() + pi, ok := m.pods[pod.UID] + if ok { + klog.Infof("Deleted pod %s with node ID %s", pi.Name, pi.NodeID) + delete(m.pods, pod.UID) + } +} + +func (m *podManager) ListPodsUID() ([]*corev1.Pod, error) { + m.mutex.RLock() + defer m.mutex.RUnlock() + pods := make([]*corev1.Pod, 0) + for uid := range m.pods { + pods = append(pods, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: uid, + }, + }) + } + return pods, nil +} + +func (m *podManager) ListPodsInfo() []*podInfo { + m.mutex.RLock() + defer m.mutex.RUnlock() + pods := make([]*podInfo, 0) + for key := range m.pods { + values := m.pods[key] + pods = append(pods, values) + } + return pods } -func (m *podManager) getContainerByUUID(uuid string) (podInfo, int, error) { - m.mutex.Lock() - defer m.mutex.Unlock() - c, ok := m.containers[uuid] - if !ok { - return podInfo{}, 0, fmt.Errorf("not found container %v", uuid) - } - pi, ok := m.pods[c.podUID] - if !ok { - return podInfo{}, 0, fmt.Errorf("not found pod %v", c.podUID) - } - return *pi, c.ctrIdx, nil +func (m *podManager) GetScheduledPods() (map[k8stypes.UID]*podInfo, error) { + m.mutex.RLock() + defer m.mutex.RUnlock() + klog.Infof("Getting all scheduled pods with %d nums", len(m.pods)) + return m.pods, nil } diff --git a/pkg/scheduler/policy/constant.go b/pkg/scheduler/policy/constant.go new file mode 100644 index 000000000..451a0b2b8 --- /dev/null +++ b/pkg/scheduler/policy/constant.go @@ -0,0 +1,45 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package policy + +type SchedulerPolicyName string + +const ( + // NodeSchedulerPolicyBinpack is node use binpack scheduler policy. + NodeSchedulerPolicyBinpack SchedulerPolicyName = "binpack" + // NodeSchedulerPolicySpread is node use spread scheduler policy. + NodeSchedulerPolicySpread SchedulerPolicyName = "spread" + // GPUSchedulerPolicyBinpack is GPU use binpack scheduler. + GPUSchedulerPolicyBinpack SchedulerPolicyName = "binpack" + // GPUSchedulerPolicySpread is GPU use spread scheduler. + GPUSchedulerPolicySpread SchedulerPolicyName = "spread" +) + +func (s SchedulerPolicyName) String() string { + return string(s) +} + +const ( + // NodeSchedulerPolicyAnnotationKey is user set Pod annotation to change this default node policy. + NodeSchedulerPolicyAnnotationKey = "hami.io/node-scheduler-policy" + // GPUSchedulerPolicyAnnotationKey is user set Pod annotation to change this default GPU policy. + GPUSchedulerPolicyAnnotationKey = "hami.io/gpu-scheduler-policy" +) + +const ( + Weight int = 10 +) diff --git a/pkg/scheduler/policy/gpu_policy.go b/pkg/scheduler/policy/gpu_policy.go new file mode 100644 index 000000000..e757ca1fe --- /dev/null +++ b/pkg/scheduler/policy/gpu_policy.go @@ -0,0 +1,77 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package policy + +import ( + "github.com/Project-HAMi/HAMi/pkg/util" + + "k8s.io/klog/v2" +) + +type DeviceListsScore struct { + Device *util.DeviceUsage + // Score recode every device user/allocate score + Score float32 +} + +type DeviceUsageList struct { + DeviceLists []*DeviceListsScore + Policy string +} + +func (l DeviceUsageList) Len() int { + return len(l.DeviceLists) +} + +func (l DeviceUsageList) Swap(i, j int) { + l.DeviceLists[i], l.DeviceLists[j] = l.DeviceLists[j], l.DeviceLists[i] +} + +func (l DeviceUsageList) Less(i, j int) bool { + if l.Policy == GPUSchedulerPolicyBinpack.String() { + if l.DeviceLists[i].Device.Numa == l.DeviceLists[j].Device.Numa { + return l.DeviceLists[i].Score < l.DeviceLists[j].Score + } + return l.DeviceLists[i].Device.Numa > l.DeviceLists[j].Device.Numa + } + // default policy is spread + if l.DeviceLists[i].Device.Numa == l.DeviceLists[j].Device.Numa { + return l.DeviceLists[i].Score > l.DeviceLists[j].Score + } + return l.DeviceLists[i].Device.Numa < l.DeviceLists[j].Device.Numa +} + +func (ds *DeviceListsScore) ComputeScore(requests util.ContainerDeviceRequests) { + request, core, mem := int32(0), int32(0), int32(0) + // Here we are required to use the same type device + for _, container := range requests { + request += container.Nums + core += container.Coresreq + if container.MemPercentagereq != 0 && container.MemPercentagereq != 101 { + mem += ds.Device.Totalmem * (container.MemPercentagereq / 100.0) + continue + } + mem += container.Memreq + } + klog.V(2).Infof("device %s user %d, userCore %d, userMem %d,", ds.Device.ID, ds.Device.Used, ds.Device.Usedcores, ds.Device.Usedmem) + + usedScore := float32(request+ds.Device.Used) / float32(ds.Device.Count) + coreScore := float32(core+ds.Device.Usedcores) / float32(ds.Device.Totalcore) + memScore := float32(mem+ds.Device.Usedmem) / float32(ds.Device.Totalmem) + ds.Score = float32(Weight) * (usedScore + coreScore + memScore) + klog.V(2).Infof("device %s computer score 
is %f", ds.Device.ID, ds.Score) +} diff --git a/pkg/scheduler/policy/node_policy.go b/pkg/scheduler/policy/node_policy.go new file mode 100644 index 000000000..417990a9f --- /dev/null +++ b/pkg/scheduler/policy/node_policy.go @@ -0,0 +1,74 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package policy + +import ( + "github.com/Project-HAMi/HAMi/pkg/util" + + "k8s.io/klog/v2" +) + +type NodeScore struct { + NodeID string + Devices util.PodDevices + // Score recode every node all device user/allocate score + Score float32 +} + +type NodeScoreList struct { + NodeList []*NodeScore + Policy string +} + +func (l NodeScoreList) Len() int { + return len(l.NodeList) +} + +func (l NodeScoreList) Swap(i, j int) { + l.NodeList[i], l.NodeList[j] = l.NodeList[j], l.NodeList[i] +} + +func (l NodeScoreList) Less(i, j int) bool { + if l.Policy == NodeSchedulerPolicySpread.String() { + return l.NodeList[i].Score > l.NodeList[j].Score + } + // default policy is Binpack + return l.NodeList[i].Score < l.NodeList[j].Score +} + +func (ns *NodeScore) ComputeScore(devices DeviceUsageList) { + // current user having request resource + used, usedCore, usedMem := int32(0), int32(0), int32(0) + for _, device := range devices.DeviceLists { + used += device.Device.Used + usedCore += device.Device.Usedcores + usedMem += device.Device.Usedmem + } + klog.V(2).Infof("node %s used %d, usedCore %d, usedMem %d,", ns.NodeID, used, usedCore, usedMem) + + total, totalCore, totalMem 
:= int32(0), int32(0), int32(0) + for _, deviceLists := range devices.DeviceLists { + total += deviceLists.Device.Count + totalCore += deviceLists.Device.Totalcore + totalMem += deviceLists.Device.Totalmem + } + useScore := float32(used) / float32(total) + coreScore := float32(usedCore) / float32(totalCore) + memScore := float32(usedMem) / float32(totalMem) + ns.Score = float32(Weight) * (useScore + coreScore + memScore) + klog.V(2).Infof("node %s computer score is %f", ns.NodeID, ns.Score) +} diff --git a/pkg/scheduler/routes/route.go b/pkg/scheduler/routes/route.go index 74501ef63..42538f3e9 100644 --- a/pkg/scheduler/routes/route.go +++ b/pkg/scheduler/routes/route.go @@ -1,84 +1,140 @@ /* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ package routes import ( - "4pd.io/k8s-vgpu/pkg/scheduler" - "bytes" - "encoding/json" - "io" - "net/http" - - "github.com/julienschmidt/httprouter" - "k8s.io/klog/v2" - extenderv1 "k8s.io/kube-scheduler/extender/v1" + "bytes" + "encoding/json" + "fmt" + "io" + "net/http" + + "github.com/julienschmidt/httprouter" + "k8s.io/apimachinery/pkg/types" + "k8s.io/klog/v2" + extenderv1 "k8s.io/kube-scheduler/extender/v1" + + "github.com/Project-HAMi/HAMi/pkg/scheduler" ) func checkBody(w http.ResponseWriter, r *http.Request) { - if r.Body == nil { - http.Error(w, "Please send a request body", 400) - return - } + if r.Body == nil { + http.Error(w, "Please send a request body", 400) + return + } } func PredicateRoute(s *scheduler.Scheduler) httprouter.Handle { - return func(w http.ResponseWriter, r *http.Request, _ httprouter.Params) { - checkBody(w, r) - - var buf bytes.Buffer - body := io.TeeReader(r.Body, &buf) - - var extenderArgs extenderv1.ExtenderArgs - var extenderFilterResult *extenderv1.ExtenderFilterResult - - if err := json.NewDecoder(body).Decode(&extenderArgs); err != nil { - extenderFilterResult = &extenderv1.ExtenderFilterResult{ - Error: err.Error(), - } - } else { - extenderFilterResult, err = s.Filter(extenderArgs) - if err != nil { - klog.Errorf("pod %v filter error, %v", extenderArgs.Pod.Name, err) - extenderFilterResult = &extenderv1.ExtenderFilterResult{ - Error: err.Error(), - } - } - } - - if resultBody, err := json.Marshal(extenderFilterResult); err != nil { - klog.Errorf("Failed to marshal extenderFilterResult: %+v, %+v", - err, extenderFilterResult) - w.Header().Set("Content-Type", "application/json") - w.WriteHeader(http.StatusInternalServerError) - w.Write([]byte(err.Error())) - } else { - w.Header().Set("Content-Type", "application/json") - w.WriteHeader(http.StatusOK) - w.Write(resultBody) - } - } + klog.Infoln("Into Predicate Route outer func") + return func(w http.ResponseWriter, r *http.Request, _ httprouter.Params) { + klog.Infoln("Into 
Predicate Route inner func") + checkBody(w, r) + + var buf bytes.Buffer + body := io.TeeReader(r.Body, &buf) + + var extenderArgs extenderv1.ExtenderArgs + var extenderFilterResult *extenderv1.ExtenderFilterResult + + if err := json.NewDecoder(body).Decode(&extenderArgs); err != nil { + klog.Errorln("decode error", err.Error()) + extenderFilterResult = &extenderv1.ExtenderFilterResult{ + Error: err.Error(), + } + } else { + extenderFilterResult, err = s.Filter(extenderArgs) + if err != nil { + klog.Errorf("pod %v filter error, %v", extenderArgs.Pod.Name, err) + extenderFilterResult = &extenderv1.ExtenderFilterResult{ + Error: err.Error(), + } + } + } + + if resultBody, err := json.Marshal(extenderFilterResult); err != nil { + klog.Errorf("Failed to marshal extenderFilterResult: %+v, %+v", + err, extenderFilterResult) + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusInternalServerError) + w.Write([]byte(err.Error())) + } else { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + w.Write(resultBody) + } + } +} + +func Bind(s *scheduler.Scheduler) httprouter.Handle { + return func(w http.ResponseWriter, r *http.Request, ps httprouter.Params) { + var buf bytes.Buffer + body := io.TeeReader(r.Body, &buf) + var extenderBindingArgs extenderv1.ExtenderBindingArgs + var extenderBindingResult *extenderv1.ExtenderBindingResult + + if err := json.NewDecoder(body).Decode(&extenderBindingArgs); err != nil { + klog.ErrorS(err, "Decode extender binding args") + extenderBindingResult = &extenderv1.ExtenderBindingResult{ + Error: err.Error(), + } + } else { + extenderBindingResult, err = s.Bind(extenderBindingArgs) + } + + if response, err := json.Marshal(extenderBindingResult); err != nil { + klog.ErrorS(err, "Marshal binding result", "result", extenderBindingResult) + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusInternalServerError) + errMsg := fmt.Sprintf("{'error':'%s'}", 
err.Error()) + w.Write([]byte(errMsg)) + } else { + klog.V(5).InfoS("Return bind response", "result", extenderBindingResult) + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + w.Write(response) + } + } +} + +func bind(args extenderv1.ExtenderBindingArgs, bindFunc func(string, string, types.UID, string) error) *extenderv1.ExtenderBindingResult { + err := bindFunc(args.PodName, args.PodNamespace, args.PodUID, args.Node) + errMsg := "" + if err != nil { + klog.ErrorS(err, "Bind", "pod", args.PodName, "namespace", args.PodNamespace, "node", args.Node, "uid", args.PodUID) + errMsg = err.Error() + } + return &extenderv1.ExtenderBindingResult{ + Error: errMsg, + } } func WebHookRoute() httprouter.Handle { - h, err := scheduler.NewWebHook() - if err != nil { - klog.Fatalf("new web hook error, %v", err) - } - return func(w http.ResponseWriter, r *http.Request, _ httprouter.Params) { - h.ServeHTTP(w, r) - } + h, err := scheduler.NewWebHook() + if err != nil { + klog.Errorf("failed to create new web hook, %v", err) + } + return func(w http.ResponseWriter, r *http.Request, _ httprouter.Params) { + klog.Infof("Start to handle webhook request on %s", r.URL.Path) + h.ServeHTTP(w, r) + } +} + +func HealthzRoute() httprouter.Handle { + return func(w http.ResponseWriter, r *http.Request, _ httprouter.Params) { + w.WriteHeader(http.StatusOK) + } } diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index 455a3dd0e..6daf126d8 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -1,295 +1,511 @@ /* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ package scheduler import ( - "context" - "encoding/json" - "fmt" - "sort" - "strconv" - "time" - - "4pd.io/k8s-vgpu/pkg/api" - "4pd.io/k8s-vgpu/pkg/k8sutil" - "4pd.io/k8s-vgpu/pkg/util" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - k8stypes "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/informers" - "k8s.io/client-go/kubernetes" - listerscorev1 "k8s.io/client-go/listers/core/v1" - "k8s.io/client-go/tools/cache" - "k8s.io/klog/v2" - extenderv1 "k8s.io/kube-scheduler/extender/v1" + "context" + "fmt" + "sort" + "strconv" + "strings" + "time" + + "github.com/Project-HAMi/HAMi/pkg/device" + "github.com/Project-HAMi/HAMi/pkg/k8sutil" + "github.com/Project-HAMi/HAMi/pkg/scheduler/config" + "github.com/Project-HAMi/HAMi/pkg/scheduler/policy" + "github.com/Project-HAMi/HAMi/pkg/util" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/informers" + 
"k8s.io/client-go/kubernetes" + listerscorev1 "k8s.io/client-go/listers/core/v1" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/tools/record" + "k8s.io/klog/v2" + extenderv1 "k8s.io/kube-scheduler/extender/v1" ) type Scheduler struct { - nodeManager - podManager + nodeManager + podManager + + stopCh chan struct{} + kubeClient kubernetes.Interface + podLister listerscorev1.PodLister + nodeLister listerscorev1.NodeLister + //Node status returned by filter + cachedstatus map[string]*NodeUsage + nodeNotify chan struct{} + //Node Overview + overviewstatus map[string]*NodeUsage - stopCh chan struct{} - kubeClient kubernetes.Interface - podLister listerscorev1.PodLister - nodeLister listerscorev1.NodeLister + eventRecorder record.EventRecorder } func NewScheduler() *Scheduler { - s := &Scheduler{ - stopCh: make(chan struct{}), - } - s.nodeManager.init() - s.podManager.init() - return s + klog.Info("New Scheduler") + s := &Scheduler{ + stopCh: make(chan struct{}), + cachedstatus: make(map[string]*NodeUsage), + nodeNotify: make(chan struct{}, 1), + } + s.nodeManager.init() + s.podManager.init() + return s } func check(err error) { - if err != nil { - klog.Fatal(err) - } + if err != nil { + klog.Fatal(err) + } +} + +func (s *Scheduler) onUpdateNode(_, newObj interface{}) { + s.nodeNotify <- struct{}{} +} + +func (s *Scheduler) onDelNode(obj interface{}) { + s.nodeNotify <- struct{}{} +} + +func (s *Scheduler) onAddNode(obj interface{}) { + s.nodeNotify <- struct{}{} } func (s *Scheduler) onAddPod(obj interface{}) { - pod, ok := obj.(*corev1.Pod) - if !ok { - klog.Errorf("unknown add object type") - return - } - nodeID, ok := pod.Annotations[util.AssignedNodeAnnotations] - if !ok { - return - } - ids, ok := pod.Annotations[util.AssignedIDsAnnotations] - if !ok { - return - } - if k8sutil.IsPodInTerminatedState(pod) { - s.delPod(pod) - return - } - podDev := util.DecodePodDevices(ids) - s.addPod(pod, nodeID, podDev) + pod, ok := obj.(*corev1.Pod) + if !ok { + 
klog.Errorf("unknown add object type") + return + } + nodeID, ok := pod.Annotations[util.AssignedNodeAnnotations] + if !ok { + return + } + if k8sutil.IsPodInTerminatedState(pod) { + s.delPod(pod) + return + } + podDev, _ := util.DecodePodDevices(util.SupportDevices, pod.Annotations) + s.addPod(pod, nodeID, podDev) } func (s *Scheduler) onUpdatePod(_, newObj interface{}) { - s.onAddPod(newObj) + s.onAddPod(newObj) } func (s *Scheduler) onDelPod(obj interface{}) { - pod, ok := obj.(*corev1.Pod) - if !ok { - klog.Errorf("unknown add object type") - return - } - _, ok = pod.Annotations[util.AssignedNodeAnnotations] - if !ok { - return - } - s.delPod(pod) + pod, ok := obj.(*corev1.Pod) + if !ok { + klog.Errorf("unknown add object type") + return + } + _, ok = pod.Annotations[util.AssignedNodeAnnotations] + if !ok { + return + } + s.delPod(pod) } func (s *Scheduler) Start() { - kubeClient, err := k8sutil.NewClient() - check(err) - s.kubeClient = kubeClient - informerFactory := informers.NewSharedInformerFactoryWithOptions(s.kubeClient, time.Hour*1) - s.podLister = informerFactory.Core().V1().Pods().Lister() - s.nodeLister = informerFactory.Core().V1().Nodes().Lister() - - informer := informerFactory.Core().V1().Pods().Informer() - informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: s.onAddPod, - UpdateFunc: s.onUpdatePod, - DeleteFunc: s.onDelPod, - }) - - informerFactory.Start(s.stopCh) - informerFactory.WaitForCacheSync(s.stopCh) + kubeClient, err := k8sutil.NewClient() + check(err) + s.kubeClient = kubeClient + informerFactory := informers.NewSharedInformerFactoryWithOptions(s.kubeClient, time.Hour*1) + s.podLister = informerFactory.Core().V1().Pods().Lister() + s.nodeLister = informerFactory.Core().V1().Nodes().Lister() + + informer := informerFactory.Core().V1().Pods().Informer() + informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: s.onAddPod, + UpdateFunc: s.onUpdatePod, + DeleteFunc: s.onDelPod, + }) + 
informerFactory.Core().V1().Nodes().Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: s.onAddNode, + UpdateFunc: s.onUpdateNode, + DeleteFunc: s.onDelNode, + }) + informerFactory.Start(s.stopCh) + informerFactory.WaitForCacheSync(s.stopCh) + s.addAllEventHandlers() } func (s *Scheduler) Stop() { - close(s.stopCh) + close(s.stopCh) } -//func (s *Scheduler) assignedNode(pod *corev1.Pod) string { -// if node, ok := pod.ObjectMeta.Annotations[util.AssignedNodeAnnotations]; ok { -// return node -// } -// return "" -//} -func (s *Scheduler) Register(stream api.DeviceService_RegisterServer) error { - var nodeID string - for { - req, err := stream.Recv() - if err != nil { - s.delNode(nodeID) - klog.Infof("node %v leave, %v", nodeID, err) - _ = stream.SendAndClose(&api.RegisterReply{}) - return err - } - klog.V(3).Infof("device register %v", req.String()) - nodeID = req.GetNode() - nodeInfo := NodeInfo{} - nodeInfo.ID = nodeID - nodeInfo.Devices = make([]DeviceInfo, len(req.Devices)) - for i := 0; i < len(req.Devices); i++ { - nodeInfo.Devices[i] = DeviceInfo{ - ID: req.Devices[i].GetId(), - Count: req.Devices[i].GetCount(), - Health: req.Devices[i].GetHealth(), - } - } - s.addNode(nodeID, nodeInfo) - klog.Infof("node %v come", nodeID) - } +func (s *Scheduler) RegisterFromNodeAnnotations() { + klog.V(5).Infoln("Scheduler into RegisterFromNodeAnnotations") + ticker := time.NewTicker(time.Second * 15) + for { + select { + case <-s.nodeNotify: + case <-ticker.C: + case <-s.stopCh: + return + } + labelSelector := labels.Everything() + if config.NodeLabelSelector != nil && len(config.NodeLabelSelector) > 0 { + labelSelector = (labels.Set)(config.NodeLabelSelector).AsSelector() + } + rawNodes, err := s.nodeLister.List(labelSelector) + if err != nil { + klog.Errorln("nodes list failed", err.Error()) + continue + } + var nodeNames []string + for _, val := range rawNodes { + nodeNames = append(nodeNames, val.Name) + for devhandsk, devInstance := range 
device.GetDevices() { + health, needUpdate := devInstance.CheckHealth(devhandsk, val) + klog.V(5).InfoS("node", val.Name, "deviceVendor", devhandsk, "health", health, "needUpdate", needUpdate) + if !health { + err := devInstance.NodeCleanUp(val.Name) + // If the device is not healthy, the device is removed from the node. + // At the same time, this node needs to be removed from the cache. + if err != nil { + klog.Errorln("node cleanup failed", err.Error()) + } + info, ok := s.nodes[val.Name] + if ok { + klog.Infof("node %v device %s:%v leave, %v remaining devices:%v", val.Name, devhandsk, info, err, s.nodes[val.Name].Devices) + s.rmNodeDevice(val.Name, info, devhandsk) + continue + } + } + if !needUpdate { + continue + } + _, ok := util.HandshakeAnnos[devhandsk] + if ok { + tmppat := make(map[string]string) + tmppat[util.HandshakeAnnos[devhandsk]] = "Requesting_" + time.Now().Format("2006.01.02 15:04:05") + klog.V(4).InfoS("New timestamp", util.HandshakeAnnos[devhandsk], tmppat[util.HandshakeAnnos[devhandsk]], "nodeName", val.Name) + n, err := util.GetNode(val.Name) + if err != nil { + klog.Errorln("get node failed", err.Error()) + continue + } + util.PatchNodeAnnotations(n, tmppat) + } + + nodedevices, err := devInstance.GetNodeDevices(*val) + if err != nil { + continue + } + nodeInfo := &util.NodeInfo{} + nodeInfo.ID = val.Name + nodeInfo.Devices = make([]util.DeviceInfo, 0) + for _, deviceinfo := range nodedevices { + found := false + _, ok := s.nodes[val.Name] + if ok { + for i1, val1 := range s.nodes[val.Name].Devices { + if strings.Compare(val1.ID, deviceinfo.Id) == 0 { + found = true + s.nodes[val.Name].Devices[i1].Devmem = deviceinfo.Devmem + s.nodes[val.Name].Devices[i1].Devcore = deviceinfo.Devcore + break + } + } + } + if !found { + nodeInfo.Devices = append(nodeInfo.Devices, util.DeviceInfo{ + ID: deviceinfo.Id, + Index: uint(deviceinfo.Index), + Count: deviceinfo.Count, + Devmem: deviceinfo.Devmem, + Devcore: deviceinfo.Devcore, + Type: 
deviceinfo.Type, + Numa: deviceinfo.Numa, + Health: deviceinfo.Health, + DeviceVendor: devhandsk, + }) + } + } + s.addNode(val.Name, nodeInfo) + if s.nodes[val.Name] != nil && len(nodeInfo.Devices) > 0 { + klog.Infof("node %v device %s come node info=%v total=%v", val.Name, devhandsk, nodeInfo, s.nodes[val.Name].Devices) + } + } + } + _, _, err = s.getNodesUsage(&nodeNames, nil) + if err != nil { + klog.Errorln("get node usage failed", err.Error()) + } + } } -func (s *Scheduler) GetContainer(_ context.Context, req *api.GetContainerRequest) (*api.GetContainerReply, error) { - pi, ctrIdx, err := s.getContainerByUUID(req.Uuid) - if err != nil { - return nil, err - } - if ctrIdx >= len(pi.devices) { - return nil, fmt.Errorf("container index error") - } - pod, err := s.podLister.Pods(pi.namespace).Get(pi.name) - if err != nil { - return nil, err - } - if pod == nil || ctrIdx >= len(pi.devices) { - return nil, fmt.Errorf("container not found") - } - rep := api.GetContainerReply{ - DevList: pi.devices[ctrIdx], - PodUID: string(pod.UID), - CtrName: pod.Spec.Containers[ctrIdx].Name, - PodNamespace: pod.Namespace, - PodName: pod.Name, - } - return &rep, nil +// InspectAllNodesUsage is used by metrics monitor. 
+func (s *Scheduler) InspectAllNodesUsage() *map[string]*NodeUsage { + return &s.overviewstatus } -func (s *Scheduler) getNodesUsage(nodes *[]string) (*map[string]*NodeUsage, map[string]string, error) { - nodeMap := make(map[string]*NodeUsage) - failedNodes := make(map[string]string) - for _, nodeID := range *nodes { - node, err := s.GetNode(nodeID) - if err != nil { - klog.Errorf("get node %v device error, %v", nodeID, err) - failedNodes[nodeID] = fmt.Sprintf("node unregisterd") - continue - } - - nodeInfo := &NodeUsage{} - for _, d := range node.Devices { - nodeInfo.devices = append(nodeInfo.devices, &DeviceUsage{ - id: d.ID, - used: 0, - count: d.Count, - health: d.Health, - }) - } - nodeMap[nodeID] = nodeInfo - } - for _, p := range s.pods { - node, ok := nodeMap[p.nodeID] - if !ok { - continue - } - for _, ds := range p.devices { - for _, deviceID := range ds { - for _, d := range node.devices { - if d.id == deviceID { - d.used++ - } - } - } - } - klog.V(5).Infof("usage: pod %v assigned %v %v", p.name, p.nodeID, p.devices) - } - return &nodeMap, failedNodes, nil +// returns all nodes and its device memory usage, and we filter it with nodeSelector, taints, nodeAffinity +// unschedulerable and nodeName. 
+func (s *Scheduler) getNodesUsage(nodes *[]string, task *corev1.Pod) (*map[string]*NodeUsage, map[string]string, error) { + overallnodeMap := make(map[string]*NodeUsage) + cachenodeMap := make(map[string]*NodeUsage) + failedNodes := make(map[string]string) + //for _, nodeID := range *nodes { + allNodes, err := s.ListNodes() + if err != nil { + return &overallnodeMap, failedNodes, err + } + + for _, node := range allNodes { + nodeInfo := &NodeUsage{} + userGPUPolicy := config.GPUSchedulerPolicy + if task != nil && task.Annotations != nil { + if value, ok := task.Annotations[policy.GPUSchedulerPolicyAnnotationKey]; ok { + userGPUPolicy = value + } + } + nodeInfo.Devices = policy.DeviceUsageList{ + Policy: userGPUPolicy, + DeviceLists: make([]*policy.DeviceListsScore, 0), + } + for _, d := range node.Devices { + nodeInfo.Devices.DeviceLists = append(nodeInfo.Devices.DeviceLists, &policy.DeviceListsScore{ + Score: 0, + Device: &util.DeviceUsage{ + ID: d.ID, + Index: d.Index, + Used: 0, + Count: d.Count, + Usedmem: 0, + Totalmem: d.Devmem, + Totalcore: d.Devcore, + Usedcores: 0, + Type: d.Type, + Numa: d.Numa, + Health: d.Health, + }, + }) + } + overallnodeMap[node.ID] = nodeInfo + } + + podsInfo := s.ListPodsInfo() + for _, p := range podsInfo { + node, ok := overallnodeMap[p.NodeID] + if !ok { + continue + } + for _, podsingleds := range p.Devices { + for _, ctrdevs := range podsingleds { + for _, udevice := range ctrdevs { + for _, d := range node.Devices.DeviceLists { + if d.Device.ID == udevice.UUID { + d.Device.Used++ + d.Device.Usedmem += udevice.Usedmem + d.Device.Usedcores += udevice.Usedcores + } + } + } + } + } + klog.V(5).Infof("usage: pod %v assigned %v %v", p.Name, p.NodeID, p.Devices) + } + s.overviewstatus = overallnodeMap + for _, nodeID := range *nodes { + node, err := s.GetNode(nodeID) + if err != nil { + // The identified node does not have a gpu device, so the log here has no practical meaning,increase log priority. 
+ klog.V(5).InfoS("node unregistered", "node", nodeID, "error", err) + failedNodes[nodeID] = "node unregistered" + continue + } + cachenodeMap[node.ID] = overallnodeMap[node.ID] + } + s.cachedstatus = cachenodeMap + return &cachenodeMap, failedNodes, nil } -func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFilterResult, error) { - klog.Infof("schedule pod %v/%v[%v]", args.Pod.Namespace, args.Pod.Name, args.Pod.UID) - nums := k8sutil.ResourceNums(args.Pod, corev1.ResourceName(util.ResourceName)) - total := 0 - for _, n := range nums { - total += n - } - if total == 0 { - klog.V(1).Infof("pod %v not find resource %v", args.Pod.Name, util.ResourceName) - return &extenderv1.ExtenderFilterResult{ - NodeNames: args.NodeNames, - FailedNodes: nil, - Error: "", - }, nil - } - s.delPod(args.Pod) - nodeUsage, failedNodes, err := s.getNodesUsage(args.NodeNames) - if err != nil { - return nil, err - } - nodeScores, err := calcScore(nodeUsage, &failedNodes, nums) - if err != nil { - return nil, err - } - if len(*nodeScores) == 0 { - return &extenderv1.ExtenderFilterResult{ - FailedNodes: failedNodes, - }, nil - } - sort.Sort(nodeScores) - m := (*nodeScores)[len(*nodeScores)-1] - klog.Infof("schedule %v/%v to %v %v", args.Pod.Namespace, args.Pod.Name, m.nodeID, m.devices) - annotations := make(map[string]string) - annotations[util.AssignedNodeAnnotations] = m.nodeID - annotations[util.AssignedTimeAnnotations] = strconv.FormatInt(time.Now().Unix(), 10) - annotations[util.AssignedIDsAnnotations] = util.EncodePodDevices(m.devices) - s.addPod(args.Pod, m.nodeID, m.devices) - err = s.patchPodAnnotations(args.Pod, annotations) - if err != nil { - s.delPod(args.Pod) - return nil, err - } - res := extenderv1.ExtenderFilterResult{NodeNames: &[]string{m.nodeID}} - return &res, nil +func (s *Scheduler) getPodUsage() (map[string]PodUseDeviceStat, error) { + podUsageStat := make(map[string]PodUseDeviceStat) + pods, err := s.podLister.List(labels.NewSelector()) + if 
err != nil { + return nil, err + } + for _, pod := range pods { + if pod.Status.Phase != corev1.PodSucceeded { + continue + } + podUseDeviceNum := 0 + if v, ok := pod.Annotations[util.DeviceBindPhase]; ok && v == util.DeviceBindSuccess { + podUseDeviceNum = 1 + } + nodeName := pod.Spec.NodeName + if _, ok := podUsageStat[nodeName]; !ok { + podUsageStat[nodeName] = PodUseDeviceStat{ + TotalPod: 1, + UseDevicePod: podUseDeviceNum, + } + } else { + exist := podUsageStat[nodeName] + podUsageStat[nodeName] = PodUseDeviceStat{ + TotalPod: exist.TotalPod + 1, + UseDevicePod: exist.UseDevicePod + podUseDeviceNum, + } + } + } + return podUsageStat, nil } -func (s *Scheduler) patchPodAnnotations(pod *corev1.Pod, annotations map[string]string) error { - type patchMetadata struct { - Annotations map[string]string `json:"annotations,omitempty"` - } - type patchPod struct { - Metadata patchMetadata `json:"metadata"` - //Spec patchSpec `json:"spec,omitempty"` - } - - p := patchPod{} - p.Metadata.Annotations = annotations - - bytes, err := json.Marshal(p) - if err != nil { - return err - } - _, err = s.kubeClient.CoreV1().Pods(pod.Namespace). 
- Patch(context.Background(), pod.Name, k8stypes.StrategicMergePatchType, bytes, metav1.PatchOptions{}) - if err != nil { - klog.Infof("patch pod %v failed, %v", pod.Name, err) - } - return err +func (s *Scheduler) Bind(args extenderv1.ExtenderBindingArgs) (*extenderv1.ExtenderBindingResult, error) { + klog.InfoS("Bind", "pod", args.PodName, "namespace", args.PodNamespace, "podUID", args.PodUID, "node", args.Node) + var err error + var res *extenderv1.ExtenderBindingResult + binding := &corev1.Binding{ + ObjectMeta: metav1.ObjectMeta{Name: args.PodName, UID: args.PodUID}, + Target: corev1.ObjectReference{Kind: "Node", Name: args.Node}, + } + current, err := s.kubeClient.CoreV1().Pods(args.PodNamespace).Get(context.Background(), args.PodName, metav1.GetOptions{}) + if err != nil { + klog.ErrorS(err, "Get pod failed") + } + + node, err := s.kubeClient.CoreV1().Nodes().Get(context.Background(), args.Node, metav1.GetOptions{}) + if err != nil { + klog.ErrorS(err, "Failed to get node", "node", args.Node) + s.recordScheduleBindingResultEvent(current, EventReasonBindingFailed, []string{}, fmt.Errorf("failed to get node %v", args.Node)) + res = &extenderv1.ExtenderBindingResult{ + Error: err.Error(), + } + return res, nil + } + + tmppatch := make(map[string]string) + for _, val := range device.GetDevices() { + err = val.LockNode(node, current) + if err != nil { + goto ReleaseNodeLocks + } + } + /* + err = nodelock.LockNode(args.Node) + if err != nil { + klog.ErrorS(err, "Failed to lock node", "node", args.Node) + res = &extenderv1.ExtenderBindingResult{ + Error: err.Error(), + } + return res, nil + }*/ + //defer util.ReleaseNodeLock(args.Node) + + tmppatch[util.DeviceBindPhase] = "allocating" + tmppatch[util.BindTimeAnnotations] = strconv.FormatInt(time.Now().Unix(), 10) + + err = util.PatchPodAnnotations(current, tmppatch) + if err != nil { + klog.ErrorS(err, "patch pod annotation failed") + } + if err = 
s.kubeClient.CoreV1().Pods(args.PodNamespace).Bind(context.Background(), binding, metav1.CreateOptions{}); err != nil { + klog.ErrorS(err, "Failed to bind pod", "pod", args.PodName, "namespace", args.PodNamespace, "podUID", args.PodUID, "node", args.Node) + } + if err == nil { + s.recordScheduleBindingResultEvent(current, EventReasonBindingSucceed, []string{args.Node}, nil) + res = &extenderv1.ExtenderBindingResult{ + Error: "", + } + klog.Infoln("After Binding Process") + return res, nil + } +ReleaseNodeLocks: + klog.InfoS("bind failed", "err", err.Error()) + for _, val := range device.GetDevices() { + val.ReleaseNodeLock(node, current) + } + s.recordScheduleBindingResultEvent(current, EventReasonBindingFailed, []string{}, err) + return &extenderv1.ExtenderBindingResult{ + Error: err.Error(), + }, nil +} + +func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFilterResult, error) { + klog.InfoS("begin schedule filter", "pod", args.Pod.Name, "uuid", args.Pod.UID, "namespaces", args.Pod.Namespace) + nums := k8sutil.Resourcereqs(args.Pod) + total := 0 + for _, n := range nums { + for _, k := range n { + total += int(k.Nums) + } + } + if total == 0 { + klog.V(1).Infof("pod %v not find resource", args.Pod.Name) + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, []string{}, fmt.Errorf("does not request any resource")) + return &extenderv1.ExtenderFilterResult{ + NodeNames: args.NodeNames, + FailedNodes: nil, + Error: "", + }, nil + } + annos := args.Pod.Annotations + s.delPod(args.Pod) + nodeUsage, failedNodes, err := s.getNodesUsage(args.NodeNames, args.Pod) + if err != nil { + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, []string{}, err) + return nil, err + } + if len(failedNodes) != 0 { + klog.V(5).InfoS("getNodesUsage failed nodes", "nodes", failedNodes) + } + nodeScores, err := s.calcScore(nodeUsage, nums, annos, args.Pod) + if err != nil { + err := fmt.Errorf("calcScore failed %v for pod 
%v", err, args.Pod.Name) + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, []string{}, err) + return nil, err + } + if len((*nodeScores).NodeList) == 0 { + klog.V(4).Infof("All node scores do not meet for pod %v", args.Pod.Name) + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, []string{}, fmt.Errorf("no available node, all node scores do not meet")) + return &extenderv1.ExtenderFilterResult{ + FailedNodes: failedNodes, + }, nil + } + klog.V(4).Infoln("nodeScores_len=", len((*nodeScores).NodeList)) + sort.Sort(nodeScores) + m := (*nodeScores).NodeList[len((*nodeScores).NodeList)-1] + klog.Infof("schedule %v/%v to %v %v", args.Pod.Namespace, args.Pod.Name, m.NodeID, m.Devices) + annotations := make(map[string]string) + annotations[util.AssignedNodeAnnotations] = m.NodeID + annotations[util.AssignedTimeAnnotations] = strconv.FormatInt(time.Now().Unix(), 10) + + for _, val := range device.GetDevices() { + val.PatchAnnotations(&annotations, m.Devices) + } + + //InRequestDevices := util.EncodePodDevices(util.InRequestDevices, m.devices) + //supportDevices := util.EncodePodDevices(util.SupportDevices, m.devices) + //maps.Copy(annotations, InRequestDevices) + //maps.Copy(annotations, supportDevices) + s.addPod(args.Pod, m.NodeID, m.Devices) + err = util.PatchPodAnnotations(args.Pod, annotations) + if err != nil { + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, []string{}, err) + s.delPod(args.Pod) + return nil, err + } + s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringSucceed, []string{m.NodeID}, nil) + res := extenderv1.ExtenderFilterResult{NodeNames: &[]string{m.NodeID}} + return &res, nil } diff --git a/pkg/scheduler/scheduler_test.go b/pkg/scheduler/scheduler_test.go new file mode 100644 index 000000000..aaa623d01 --- /dev/null +++ b/pkg/scheduler/scheduler_test.go @@ -0,0 +1,485 @@ +/* +Copyright 2024 The HAMi Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scheduler + +import ( + "context" + "testing" + "time" + + "github.com/Project-HAMi/HAMi/pkg/device/nvidia" + "github.com/Project-HAMi/HAMi/pkg/scheduler/policy" + "github.com/Project-HAMi/HAMi/pkg/util" + "github.com/Project-HAMi/HAMi/pkg/util/client" + + "gotest.tools/v3/assert" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/informers" + "k8s.io/client-go/kubernetes/fake" + "k8s.io/client-go/tools/cache" + extenderv1 "k8s.io/kube-scheduler/extender/v1" +) + +func Test_getNodesUsage(t *testing.T) { + nodeMage := nodeManager{} + nodeMage.init() + nodeMage.addNode("node1", &util.NodeInfo{ + ID: "node1", + Devices: []util.DeviceInfo{ + { + ID: "GPU0", + Index: 0, + Count: 10, + Devmem: 1024, + Devcore: 100, + Numa: 1, + Health: true, + }, + { + ID: "GPU1", + Index: 1, + Count: 10, + Devmem: 1024, + Devcore: 100, + Numa: 1, + Health: true, + }, + }, + }) + podDevces := util.PodDevices{ + "NVIDIA": util.PodSingleDevice{ + []util.ContainerDevice{ + { + Idx: 0, + UUID: "GPU0", + Usedmem: 100, + Usedcores: 10, + }, + }, + }, + } + podMap := podManager{} + podMap.init() + podMap.addPod(&corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "1111", + Name: "test1", + Namespace: "default", + }, + }, "node1", podDevces) + podMap.addPod(&corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "2222", + Name: "test2", + Namespace: "default", + }, + }, 
"node1", podDevces) + s := Scheduler{ + nodeManager: nodeMage, + podManager: podMap, + } + nodes := make([]string, 0) + nodes = append(nodes, "node1") + cachenodeMap, _, err := s.getNodesUsage(&nodes, nil) + if err != nil { + t.Fatal(err) + } + assert.Equal(t, len(*cachenodeMap), 1) + v, ok := (*cachenodeMap)["node1"] + assert.Equal(t, ok, true) + assert.Equal(t, len(v.Devices.DeviceLists), 2) + assert.Equal(t, v.Devices.DeviceLists[0].Device.Used, int32(2)) + assert.Equal(t, v.Devices.DeviceLists[0].Device.Usedmem, int32(200)) + assert.Equal(t, v.Devices.DeviceLists[0].Device.Usedcores, int32(20)) +} + +// test case matrix +/** +| node policy| gpu policy| node num | per node device | pod use device | device use info | result | +|------------|------------|----------|-----------------|----------------|---------------------------|--------------| +| binpack | binpack | 2 | 2 | 1 |device1: 25%,device4: 75% | node2-device4| +| binpack | spread | 2 | 2 | 1 |device1: 25%,device4: 75% | node2-device3| +| spread | binpack | 2 | 2 | 1 |device1: 25%,device4: 75% | node1-device1| +| spread | spread | 2 | 2 | 1 |device1: 25%,device4: 75% | node1-device2| +test case matrix. 
+*/ +func Test_Filter(t *testing.T) { + s := NewScheduler() + client.KubeClient = fake.NewSimpleClientset() + s.kubeClient = client.KubeClient + informerFactory := informers.NewSharedInformerFactoryWithOptions(client.KubeClient, time.Hour*1) + s.podLister = informerFactory.Core().V1().Pods().Lister() + informer := informerFactory.Core().V1().Pods().Informer() + informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: s.onAddPod, + UpdateFunc: s.onUpdatePod, + DeleteFunc: s.onDelPod, + }) + informerFactory.Start(s.stopCh) + informerFactory.WaitForCacheSync(s.stopCh) + s.addAllEventHandlers() + + pod1 := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + UID: "uuid1", + Annotations: map[string]string{ + util.DeviceBindPhase: util.DeviceBindSuccess, + }, + }, + Spec: corev1.PodSpec{ + NodeName: "node1", + Containers: []corev1.Container{ + { + Name: "gpu-burn", + Image: "chrstnhntschl/gpu_burn", + Args: []string{"6000"}, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "hami.io/gpu": *resource.NewQuantity(1, resource.BinarySI), + "hami.io/gpucores": *resource.NewQuantity(25, resource.BinarySI), + "hami.io/gpumem": *resource.NewQuantity(2000, resource.BinarySI), + }, + }, + }, + }, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodSucceeded, + }, + } + pod2 := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod2", + UID: "uuid2", + Annotations: map[string]string{ + util.DeviceBindPhase: util.DeviceBindSuccess, + }, + }, + Spec: corev1.PodSpec{ + NodeName: "node2", + Containers: []corev1.Container{ + { + Name: "gpu-burn", + Image: "chrstnhntschl/gpu_burn", + Args: []string{"6000"}, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "hami.io/gpu": *resource.NewQuantity(1, resource.BinarySI), + "hami.io/gpucores": *resource.NewQuantity(75, resource.BinarySI), + "hami.io/gpumem": *resource.NewQuantity(6000, resource.BinarySI), + }, + }, + }, + }, + }, + Status: corev1.PodStatus{ + Phase: 
corev1.PodSucceeded, + }, + } + pod3 := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod3", + UID: "uuid3", + }, + Spec: corev1.PodSpec{ + NodeName: "node2", + Containers: []corev1.Container{ + { + Name: "gpu-burn", + Image: "chrstnhntschl/gpu_burn", + Args: []string{"6000"}, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{}, + }, + }, + }, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodSucceeded, + }, + } + client.KubeClient.CoreV1().Pods(pod1.Namespace).Create(context.Background(), pod1, metav1.CreateOptions{}) + client.KubeClient.CoreV1().Pods(pod2.Namespace).Create(context.Background(), pod2, metav1.CreateOptions{}) + client.KubeClient.CoreV1().Pods(pod3.Namespace).Create(context.Background(), pod3, metav1.CreateOptions{}) + + initNode := func() { + nodes, _ := s.ListNodes() + for index := range nodes { + node := nodes[index] + s.rmNodeDevice(node.ID, node, nvidia.NvidiaGPUDevice) + } + pods, _ := s.ListPodsUID() + for index := range pods { + s.delPod(pods[index]) + } + + s.addNode("node1", &util.NodeInfo{ + ID: "node1", + Devices: []util.DeviceInfo{ + { + ID: "device1", + Index: 0, + Count: 10, + Devmem: 8000, + Devcore: 100, + Numa: 0, + Type: nvidia.NvidiaGPUDevice, + Health: true, + DeviceVendor: nvidia.NvidiaGPUDevice, + }, + { + ID: "device2", + Index: 1, + Count: 10, + Devmem: 8000, + Devcore: 100, + Numa: 0, + Type: nvidia.NvidiaGPUDevice, + Health: true, + DeviceVendor: nvidia.NvidiaGPUDevice, + }, + }, + }) + s.addNode("node2", &util.NodeInfo{ + ID: "node2", + Devices: []util.DeviceInfo{ + { + ID: "device3", + Index: 0, + Count: 10, + Devmem: 8000, + Devcore: 100, + Numa: 0, + Type: nvidia.NvidiaGPUDevice, + Health: true, + }, + { + ID: "device4", + Index: 1, + Count: 10, + Devmem: 8000, + Devcore: 100, + Numa: 0, + Type: nvidia.NvidiaGPUDevice, + Health: true, + }, + }, + }) + s.addPod(pod1, "node1", util.PodDevices{ + nvidia.NvidiaGPUDevice: util.PodSingleDevice{ + { + { + Idx: 0, + UUID: "device1", + Type: 
nvidia.NvidiaGPUDevice, + Usedmem: 2000, + Usedcores: 25, + }, + }, + }, + }) + s.addPod(pod2, "node2", util.PodDevices{ + nvidia.NvidiaGPUDevice: util.PodSingleDevice{ + { + { + Idx: 0, + UUID: "device4", + Type: nvidia.NvidiaGPUDevice, + Usedmem: 6000, + Usedcores: 75, + }, + }, + }, + }) + } + + tests := []struct { + name string + args extenderv1.ExtenderArgs + want *extenderv1.ExtenderFilterResult + wantPodAnnotationDeviceID string + wantErr error + }{ + { + name: "node use binpack gpu use binpack policy", + args: extenderv1.ExtenderArgs{ + Pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test1", + UID: "test1-uid1", + Annotations: map[string]string{ + policy.GPUSchedulerPolicyAnnotationKey: policy.GPUSchedulerPolicyBinpack.String(), + policy.NodeSchedulerPolicyAnnotationKey: policy.NodeSchedulerPolicyBinpack.String(), + }, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "gpu-burn", + Image: "chrstnhntschl/gpu_burn", + Args: []string{"6000"}, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "hami.io/gpu": *resource.NewQuantity(1, resource.BinarySI), + "hami.io/gpucores": *resource.NewQuantity(20, resource.BinarySI), + "hami.io/gpumem": *resource.NewQuantity(1000, resource.BinarySI), + }, + }, + }, + }, + }, + }, + NodeNames: &[]string{"node1", "node2"}, + }, + wantErr: nil, + want: &extenderv1.ExtenderFilterResult{ + NodeNames: &[]string{"node2"}, + }, + wantPodAnnotationDeviceID: "device4", + }, + { + name: "node use binpack gpu use spread policy", + args: extenderv1.ExtenderArgs{ + Pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test2", + UID: "test2-uid2", + Annotations: map[string]string{ + policy.GPUSchedulerPolicyAnnotationKey: policy.GPUSchedulerPolicySpread.String(), + policy.NodeSchedulerPolicyAnnotationKey: policy.NodeSchedulerPolicyBinpack.String(), + }, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "gpu-burn", + Image: "chrstnhntschl/gpu_burn", + 
Args: []string{"6000"}, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "hami.io/gpu": *resource.NewQuantity(1, resource.BinarySI), + "hami.io/gpucores": *resource.NewQuantity(20, resource.BinarySI), + "hami.io/gpumem": *resource.NewQuantity(1000, resource.BinarySI), + }, + }, + }, + }, + }, + }, + NodeNames: &[]string{"node1", "node2"}, + }, + wantErr: nil, + want: &extenderv1.ExtenderFilterResult{ + NodeNames: &[]string{"node2"}, + }, + wantPodAnnotationDeviceID: "device3", + }, + { + name: "node use spread gpu use binpack policy", + args: extenderv1.ExtenderArgs{ + Pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test3", + UID: "test3-uid3", + Annotations: map[string]string{ + policy.GPUSchedulerPolicyAnnotationKey: policy.GPUSchedulerPolicyBinpack.String(), + policy.NodeSchedulerPolicyAnnotationKey: policy.NodeSchedulerPolicySpread.String(), + }, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "gpu-burn", + Image: "chrstnhntschl/gpu_burn", + Args: []string{"6000"}, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "hami.io/gpu": *resource.NewQuantity(1, resource.BinarySI), + "hami.io/gpucores": *resource.NewQuantity(20, resource.BinarySI), + "hami.io/gpumem": *resource.NewQuantity(1000, resource.BinarySI), + }, + }, + }, + }, + }, + }, + NodeNames: &[]string{"node1", "node2"}, + }, + wantErr: nil, + want: &extenderv1.ExtenderFilterResult{ + NodeNames: &[]string{"node1"}, + }, + wantPodAnnotationDeviceID: "device1", + }, + { + name: "node use spread gpu use spread policy", + args: extenderv1.ExtenderArgs{ + Pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test4", + UID: "test4-uid4", + Annotations: map[string]string{ + policy.GPUSchedulerPolicyAnnotationKey: policy.GPUSchedulerPolicySpread.String(), + policy.NodeSchedulerPolicyAnnotationKey: policy.NodeSchedulerPolicySpread.String(), + }, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: 
"gpu-burn", + Image: "chrstnhntschl/gpu_burn", + Args: []string{"6000"}, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "hami.io/gpu": *resource.NewQuantity(1, resource.BinarySI), + "hami.io/gpucores": *resource.NewQuantity(20, resource.BinarySI), + "hami.io/gpumem": *resource.NewQuantity(1000, resource.BinarySI), + }, + }, + }, + }, + }, + }, + NodeNames: &[]string{"node1", "node2"}, + }, + wantErr: nil, + want: &extenderv1.ExtenderFilterResult{ + NodeNames: &[]string{"node1"}, + }, + wantPodAnnotationDeviceID: "device2", + }, + } + + nvidia.ResourceName = "hami.io/gpu" + nvidia.ResourceMem = "hami.io/gpumem" + nvidia.ResourceCores = "hami.io/gpucores" + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + initNode() + client.KubeClient.CoreV1().Pods(test.args.Pod.Namespace).Create(context.Background(), test.args.Pod, metav1.CreateOptions{}) + got, gotErr := s.Filter(test.args) + assert.DeepEqual(t, test.wantErr, gotErr) + assert.DeepEqual(t, test.want, got) + getPod, _ := client.KubeClient.CoreV1().Pods(test.args.Pod.Namespace).Get(context.Background(), test.args.Pod.Name, metav1.GetOptions{}) + podDevices, _ := util.DecodePodDevices(util.SupportDevices, getPod.Annotations) + assert.DeepEqual(t, test.wantPodAnnotationDeviceID, podDevices["NVIDIA"][0][0].UUID) + }) + } +} diff --git a/pkg/scheduler/score.go b/pkg/scheduler/score.go index 7d9c63847..c04bca6ec 100644 --- a/pkg/scheduler/score.go +++ b/pkg/scheduler/score.go @@ -1,94 +1,233 @@ /* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +Copyright 2024 The HAMi Authors. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ package scheduler import ( - "4pd.io/k8s-vgpu/pkg/util" - "sort" -) + "sort" + "strings" -type NodeScore struct { - nodeID string - devices util.PodDevices - score float32 -} + "github.com/Project-HAMi/HAMi/pkg/device" + "github.com/Project-HAMi/HAMi/pkg/scheduler/config" + "github.com/Project-HAMi/HAMi/pkg/scheduler/policy" + "github.com/Project-HAMi/HAMi/pkg/util" -type NodeScoreList []*NodeScore + corev1 "k8s.io/api/core/v1" + "k8s.io/klog/v2" +) -func (l DeviceUsageList) Len() int { - return len(l) +func viewStatus(usage NodeUsage) { + klog.Info("devices status") + for _, val := range usage.Devices.DeviceLists { + klog.InfoS("device status", "device id", val.Device.ID, "device detail", val) + } } -func (l DeviceUsageList) Swap(i, j int) { - l[i], l[j] = l[j], l[i] +func checkType(annos map[string]string, d util.DeviceUsage, n util.ContainerDeviceRequest) (bool, bool) { + //General type check, NVIDIA->NVIDIA MLU->MLU + if !strings.Contains(d.Type, n.Type) { + return false, false + 
} + for _, val := range device.GetDevices() { + found, pass, numaAssert := val.CheckType(annos, d, n) + if found { + return pass, numaAssert + } + } + klog.Infof("Unrecognized device %s", n.Type) + return false, false } -func (l DeviceUsageList) Less(i, j int) bool { - return l[i].count-l[i].used < l[j].count-l[j].used +func checkUUID(annos map[string]string, d util.DeviceUsage, n util.ContainerDeviceRequest) bool { + devices, ok := device.GetDevices()[n.Type] + if !ok { + klog.Errorf("can not get device for %s type", n.Type) + return false + } + result := devices.CheckUUID(annos, d) + klog.V(2).Infof("checkUUID result is %v for %s type", result, n.Type) + return result } -func (l NodeScoreList) Len() int { - return len(l) -} +func fitInCertainDevice(node *NodeUsage, request util.ContainerDeviceRequest, annos map[string]string, pod *corev1.Pod) (bool, map[string]util.ContainerDevices) { + k := request + originReq := k.Nums + prevnuma := -1 + klog.InfoS("Allocating device for container request", "pod", klog.KObj(pod), "card request", k) + var tmpDevs map[string]util.ContainerDevices + tmpDevs = make(map[string]util.ContainerDevices) + for i := len(node.Devices.DeviceLists) - 1; i >= 0; i-- { + klog.InfoS("scoring pod", "pod", klog.KObj(pod), "Memreq", k.Memreq, "MemPercentagereq", k.MemPercentagereq, "Coresreq", k.Coresreq, "Nums", k.Nums, "device index", i, "device", node.Devices.DeviceLists[i].Device.ID) + found, numa := checkType(annos, *node.Devices.DeviceLists[i].Device, k) + if !found { + klog.InfoS("card type mismatch,continuing...", "pod", klog.KObj(pod), (node.Devices.DeviceLists[i].Device).Type, k.Type) + continue + } + if numa && prevnuma != node.Devices.DeviceLists[i].Device.Numa { + klog.InfoS("Numa not fit, resotoreing", "pod", klog.KObj(pod), "k.nums", k.Nums, "numa", numa, "prevnuma", prevnuma, "device numa", node.Devices.DeviceLists[i].Device.Numa) + k.Nums = originReq + prevnuma = node.Devices.DeviceLists[i].Device.Numa + tmpDevs = 
make(map[string]util.ContainerDevices) + } + if !checkUUID(annos, *node.Devices.DeviceLists[i].Device, k) { + klog.InfoS("card uuid mismatch,", "pod", klog.KObj(pod), "current device info is:", *node.Devices.DeviceLists[i].Device) + continue + } -func (l NodeScoreList) Swap(i, j int) { - l[i], l[j] = l[j], l[i] + memreq := int32(0) + if node.Devices.DeviceLists[i].Device.Count <= node.Devices.DeviceLists[i].Device.Used { + continue + } + if k.Coresreq > 100 { + klog.ErrorS(nil, "core limit can't exceed 100", "pod", klog.KObj(pod)) + k.Coresreq = 100 + //return false, tmpDevs + } + if k.Memreq > 0 { + memreq = k.Memreq + } + if k.MemPercentagereq != 101 && k.Memreq == 0 { + //This incurs an issue + memreq = node.Devices.DeviceLists[i].Device.Totalmem * k.MemPercentagereq / 100 + } + if node.Devices.DeviceLists[i].Device.Totalmem-node.Devices.DeviceLists[i].Device.Usedmem < memreq { + klog.V(5).InfoS("card Insufficient remaining memory", "pod", klog.KObj(pod), "device index", i, "device", node.Devices.DeviceLists[i].Device.ID, "device total memory", node.Devices.DeviceLists[i].Device.Totalmem, "device used memory", node.Devices.DeviceLists[i].Device.Usedmem, "request memory", memreq) + continue + } + if node.Devices.DeviceLists[i].Device.Totalcore-node.Devices.DeviceLists[i].Device.Usedcores < k.Coresreq { + klog.V(5).InfoS("card Insufficient remaining cores", "pod", klog.KObj(pod), "device index", i, "device", node.Devices.DeviceLists[i].Device.ID, "device total core", node.Devices.DeviceLists[i].Device.Totalcore, "device used core", node.Devices.DeviceLists[i].Device.Usedcores, "request cores", k.Coresreq) + continue + } + // Coresreq=100 indicates it want this card exclusively + if node.Devices.DeviceLists[i].Device.Totalcore == 100 && k.Coresreq == 100 && node.Devices.DeviceLists[i].Device.Used > 0 { + klog.V(5).InfoS("the container wants exclusive access to an entire card, but the card is already in use", "pod", klog.KObj(pod), "device index", i, "device", 
node.Devices.DeviceLists[i].Device.ID, "used", node.Devices.DeviceLists[i].Device.Used) + continue + } + // You can't allocate core=0 job to an already full GPU + if node.Devices.DeviceLists[i].Device.Totalcore != 0 && node.Devices.DeviceLists[i].Device.Usedcores == node.Devices.DeviceLists[i].Device.Totalcore && k.Coresreq == 0 { + klog.V(5).InfoS("can't allocate core=0 job to an already full GPU", "pod", klog.KObj(pod), "device index", i, "device", node.Devices.DeviceLists[i].Device.ID) + continue + } + if k.Nums > 0 { + klog.InfoS("first fitted", "pod", klog.KObj(pod), "device", node.Devices.DeviceLists[i].Device.ID) + k.Nums-- + tmpDevs[k.Type] = append(tmpDevs[k.Type], util.ContainerDevice{ + Idx: int(node.Devices.DeviceLists[i].Device.Index), + UUID: node.Devices.DeviceLists[i].Device.ID, + Type: k.Type, + Usedmem: memreq, + Usedcores: k.Coresreq, + }) + } + if k.Nums == 0 { + klog.InfoS("device allocate success", "pod", klog.KObj(pod), "allocate device", tmpDevs) + return true, tmpDevs + } + } + return false, tmpDevs } -func (l NodeScoreList) Less(i, j int) bool { - return l[i].score < l[j].score +func fitInDevices(node *NodeUsage, requests util.ContainerDeviceRequests, annos map[string]string, pod *corev1.Pod, devinput *util.PodDevices) (bool, float32) { + //devmap := make(map[string]util.ContainerDevices) + devs := util.ContainerDevices{} + total, totalCore, totalMem := int32(0), int32(0), int32(0) + free, freeCore, freeMem := int32(0), int32(0), int32(0) + sums := 0 + // computer all device score for one node + for index := range node.Devices.DeviceLists { + node.Devices.DeviceLists[index].ComputeScore(requests) + } + //This loop is for requests for different devices + for _, k := range requests { + sums += int(k.Nums) + if int(k.Nums) > len(node.Devices.DeviceLists) { + klog.InfoS("request devices nums cannot exceed the total number of devices on the node.", "pod", klog.KObj(pod), "request devices nums", k.Nums, "node device nums", 
len(node.Devices.DeviceLists)) + return false, 0 + } + sort.Sort(node.Devices) + fit, tmpDevs := fitInCertainDevice(node, k, annos, pod) + if fit { + for _, val := range tmpDevs[k.Type] { + total += node.Devices.DeviceLists[val.Idx].Device.Count + totalCore += node.Devices.DeviceLists[val.Idx].Device.Totalcore + totalMem += node.Devices.DeviceLists[val.Idx].Device.Totalmem + free += node.Devices.DeviceLists[val.Idx].Device.Count - node.Devices.DeviceLists[val.Idx].Device.Used + freeCore += node.Devices.DeviceLists[val.Idx].Device.Totalcore - node.Devices.DeviceLists[val.Idx].Device.Usedcores + freeMem += node.Devices.DeviceLists[val.Idx].Device.Totalmem - node.Devices.DeviceLists[val.Idx].Device.Usedmem + + node.Devices.DeviceLists[val.Idx].Device.Used++ + node.Devices.DeviceLists[val.Idx].Device.Usedcores += val.Usedcores + node.Devices.DeviceLists[val.Idx].Device.Usedmem += val.Usedmem + } + devs = append(devs, tmpDevs[k.Type]...) + } else { + return false, 0 + } + (*devinput)[k.Type] = append((*devinput)[k.Type], devs) + } + return true, 0 } -func calcScore(nodes *map[string]*NodeUsage, errMap *map[string]string, nums []int) (*NodeScoreList, error) { - res := make(NodeScoreList, 0, len(*nodes)) - for nodeID, node := range *nodes { - dn := len(node.devices) - score := NodeScore{nodeID: nodeID, score: 0} - for _, n := range nums { - if n == 0 { - score.devices = append(score.devices, []string{}) - continue - } - if n > dn { - break - } - sort.Sort(node.devices) - if node.devices[dn-n].count <= node.devices[dn-n].used { - continue - } - total := int32(0) - free := int32(0) - devs := make([]string, 0, n) - for i := len(node.devices) - 1; i >= 0; i-- { - total += node.devices[i].count - free += node.devices[i].count - node.devices[i].used - if n > 0 { - n-- - node.devices[i].used++ - devs = append(devs, node.devices[i].id) - } - } - score.devices = append(score.devices, devs) - score.score += float32(free) / float32(total) - score.score += float32(dn - n) - } - if 
len(score.devices) == len(nums) { - res = append(res, &score) - } - } - return &res, nil +func (s *Scheduler) calcScore(nodes *map[string]*NodeUsage, nums util.PodDeviceRequests, annos map[string]string, task *corev1.Pod) (*policy.NodeScoreList, error) { + userNodePolicy := config.NodeSchedulerPolicy + if annos != nil { + if value, ok := annos[policy.NodeSchedulerPolicyAnnotationKey]; ok { + userNodePolicy = value + } + } + res := policy.NodeScoreList{ + Policy: userNodePolicy, + NodeList: make([]*policy.NodeScore, 0), + } + + //func calcScore(nodes *map[string]*NodeUsage, errMap *map[string]string, nums util.PodDeviceRequests, annos map[string]string, task *corev1.Pod) (*NodeScoreList, error) { + // res := make(NodeScoreList, 0, len(*nodes)) + for nodeID, node := range *nodes { + viewStatus(*node) + score := policy.NodeScore{NodeID: nodeID, Devices: make(util.PodDevices), Score: 0} + score.ComputeScore(node.Devices) + + //This loop is for different container request + ctrfit := false + for ctrid, n := range nums { + sums := 0 + for _, k := range n { + sums += int(k.Nums) + } + + if sums == 0 { + for idx := range score.Devices { + if len(score.Devices[idx]) <= ctrid { + score.Devices[idx] = append(score.Devices[idx], util.ContainerDevices{}) + } + score.Devices[idx][ctrid] = append(score.Devices[idx][ctrid], util.ContainerDevice{}) + continue + } + } + klog.V(5).InfoS("fitInDevices", "pod", klog.KObj(task), "node", nodeID) + fit, _ := fitInDevices(node, n, annos, task, &score.Devices) + ctrfit = fit + if !fit { + klog.InfoS("calcScore:node not fit pod", "pod", klog.KObj(task), "node", nodeID) + break + } + } + + if ctrfit { + res.NodeList = append(res.NodeList, &score) + } + } + return &res, nil } diff --git a/pkg/scheduler/score_test.go b/pkg/scheduler/score_test.go new file mode 100644 index 000000000..88860066d --- /dev/null +++ b/pkg/scheduler/score_test.go @@ -0,0 +1,1110 @@ +/* +Copyright 2024 The HAMi Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scheduler + +import ( + "testing" + + "github.com/Project-HAMi/HAMi/pkg/device/nvidia" + "github.com/Project-HAMi/HAMi/pkg/scheduler/policy" + "github.com/Project-HAMi/HAMi/pkg/util" + + "gotest.tools/v3/assert" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// test case matrix +/** +| node num | per node device | pod use device | device having use | score | +|----------|-----------------|----------------|-------------------|-------| +| 1 node | 1 device | 1 device | no | 5.25 | +| 1 node | 1 device | 1 device | 50% core, 50% mem | 20.25 | +| 1 node | 2 device | 1 device | no | 2.625 | +| 1 node | 2 device | 1 device | 50% core, 50% mem | 10.125 | +| 1 node | 2 device | 2 device | no | 5.25 | +| 1 node | 2 device | 2 device | 50% core, 50% mem | 20.25 | +| 2 node | 1 device | 1 device | no | 5.25 | +| 2 node | 1 device | 1 device | node1-device1: 50% core, 50% mem, node2-device1: 0% core, 0% mem | node1: 5.25 node2: 5.25 | +| 2 node | 2 device | 1 device | no | 1,1 | +| 2 node | 1 device | 1 device | node1-device1: 50% core, 50% mem, node2-device1: 0% core, 0% mem | node1: 20.25 node2: 5.25 | +test case matrix. 
+*/ +func Test_calcScore(t *testing.T) { + tests := []struct { + name string + args struct { + nodes *map[string]*NodeUsage + nums util.PodDeviceRequests + annos map[string]string + task *corev1.Pod + } + wants struct { + want *policy.NodeScoreList + err error + } + }{ + { + name: "one node one device one pod one container use one device.", + args: struct { + nodes *map[string]*NodeUsage + nums util.PodDeviceRequests + annos map[string]string + task *corev1.Pod + }{ + nodes: &map[string]*NodeUsage{ + "node1": { + Devices: policy.DeviceUsageList{ + Policy: policy.GPUSchedulerPolicySpread.String(), + DeviceLists: []*policy.DeviceListsScore{ + { + Device: &util.DeviceUsage{ + ID: "uuid1", + Index: 0, + Used: 0, + Count: 10, + Usedmem: 0, + Totalmem: 8000, + Totalcore: 100, + Usedcores: 0, + Numa: 0, + Type: nvidia.NvidiaGPUDevice, + Health: true, + }, + Score: 0, + }, + }, + }, + }, + }, + nums: util.PodDeviceRequests{ + { + "hami.io/vgpu-devices-to-allocate": util.ContainerDeviceRequest{ + Nums: 1, + Type: nvidia.NvidiaGPUDevice, + Memreq: 1000, + Coresreq: 30, + }, + }, + }, + annos: make(map[string]string), + task: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test1", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "gpu-burn", + Image: "chrstnhntschl/gpu_burn", + Args: []string{"6000"}, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "hami.io/gpu": *resource.NewQuantity(1, resource.BinarySI), + "hami.io/gpucores": *resource.NewQuantity(30, resource.BinarySI), + "hami.io/gpumem": *resource.NewQuantity(1000, resource.BinarySI), + }, + }, + }, + }, + }, + }, + }, + wants: struct { + want *policy.NodeScoreList + err error + }{ + want: &policy.NodeScoreList{ + Policy: policy.NodeSchedulerPolicyBinpack.String(), + NodeList: []*policy.NodeScore{ + { + NodeID: "node1", + Devices: util.PodDevices{ + "NVIDIA": util.PodSingleDevice{ + { + { + Idx: 0, + UUID: "uuid1", + Type: nvidia.NvidiaGPUDevice, + Usedcores: 
30, + Usedmem: 1000, + }, + }, + }, + }, + Score: 0, + }, + }, + }, + err: nil, + }, + }, + { + name: "one node one device one pod one container use one device,but this device before having use.", + args: struct { + nodes *map[string]*NodeUsage + nums util.PodDeviceRequests + annos map[string]string + task *corev1.Pod + }{ + nodes: &map[string]*NodeUsage{ + "node1": { + Devices: policy.DeviceUsageList{ + Policy: policy.GPUSchedulerPolicySpread.String(), + DeviceLists: []*policy.DeviceListsScore{ + { + Device: &util.DeviceUsage{ + ID: "uuid1", + Index: 0, + Used: 5, + Count: 10, + Usedmem: 4000, + Totalmem: 8000, + Totalcore: 100, + Usedcores: 50, + Numa: 0, + Type: nvidia.NvidiaGPUDevice, + Health: true, + }, + Score: 0, + }, + }, + }, + }, + }, + nums: util.PodDeviceRequests{ + { + "hami.io/vgpu-devices-to-allocate": util.ContainerDeviceRequest{ + Nums: 1, + Type: nvidia.NvidiaGPUDevice, + Memreq: 1000, + Coresreq: 30, + }, + }, + }, + annos: make(map[string]string), + task: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test1", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "gpu-burn", + Image: "chrstnhntschl/gpu_burn", + Args: []string{"6000"}, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "hami.io/gpu": *resource.NewQuantity(1, resource.BinarySI), + "hami.io/gpucores": *resource.NewQuantity(30, resource.BinarySI), + "hami.io/gpumem": *resource.NewQuantity(1000, resource.BinarySI), + }, + }, + }, + }, + }, + }, + }, + wants: struct { + want *policy.NodeScoreList + err error + }{ + want: &policy.NodeScoreList{ + Policy: policy.NodeSchedulerPolicyBinpack.String(), + NodeList: []*policy.NodeScore{ + { + NodeID: "node1", + Devices: util.PodDevices{ + "NVIDIA": util.PodSingleDevice{ + { + { + Idx: 0, + UUID: "uuid1", + Type: nvidia.NvidiaGPUDevice, + Usedcores: 30, + Usedmem: 1000, + }, + }, + }, + }, + Score: 15, + }, + }, + }, + err: nil, + }, + }, + { + name: "one node two device one pod one 
container use one device", + args: struct { + nodes *map[string]*NodeUsage + nums util.PodDeviceRequests + annos map[string]string + task *corev1.Pod + }{ + nodes: &map[string]*NodeUsage{ + "node1": { + Devices: policy.DeviceUsageList{ + Policy: policy.GPUSchedulerPolicySpread.String(), + DeviceLists: []*policy.DeviceListsScore{ + { + Device: &util.DeviceUsage{ + ID: "uuid1", + Index: 0, + Used: 0, + Count: 10, + Usedmem: 0, + Totalmem: 8000, + Totalcore: 100, + Usedcores: 0, + Numa: 0, + Type: nvidia.NvidiaGPUDevice, + Health: true, + }, + Score: 0, + }, + { + Device: &util.DeviceUsage{ + ID: "uuid2", + Index: 0, + Used: 0, + Count: 10, + Usedmem: 0, + Totalmem: 8000, + Totalcore: 100, + Usedcores: 0, + Numa: 0, + Type: nvidia.NvidiaGPUDevice, + Health: true, + }, + Score: 0, + }, + }, + }, + }, + }, + nums: util.PodDeviceRequests{ + { + "hami.io/vgpu-devices-to-allocate": util.ContainerDeviceRequest{ + Nums: 1, + Type: nvidia.NvidiaGPUDevice, + Memreq: 1000, + Coresreq: 30, + }, + }, + }, + annos: make(map[string]string), + task: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test1", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "gpu-burn", + Image: "chrstnhntschl/gpu_burn", + Args: []string{"6000"}, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "hami.io/gpu": *resource.NewQuantity(1, resource.BinarySI), + "hami.io/gpucores": *resource.NewQuantity(30, resource.BinarySI), + "hami.io/gpumem": *resource.NewQuantity(1000, resource.BinarySI), + }, + }, + }, + }, + }, + }, + }, + wants: struct { + want *policy.NodeScoreList + err error + }{ + want: &policy.NodeScoreList{ + Policy: policy.NodeSchedulerPolicyBinpack.String(), + NodeList: []*policy.NodeScore{ + { + NodeID: "node1", + Devices: util.PodDevices{ + "NVIDIA": util.PodSingleDevice{ + { + { + Idx: 0, + UUID: "uuid2", + Type: nvidia.NvidiaGPUDevice, + Usedcores: 30, + Usedmem: 1000, + }, + }, + }, + }, + Score: 0, + }, + }, + }, + err: nil, + }, + 
}, + { + name: "one node two device one pod one container use one device,but having use 50%", + args: struct { + nodes *map[string]*NodeUsage + nums util.PodDeviceRequests + annos map[string]string + task *corev1.Pod + }{ + nodes: &map[string]*NodeUsage{ + "node1": { + Devices: policy.DeviceUsageList{ + Policy: policy.GPUSchedulerPolicySpread.String(), + DeviceLists: []*policy.DeviceListsScore{ + { + Device: &util.DeviceUsage{ + ID: "uuid1", + Index: 0, + Used: 0, + Count: 10, + Usedmem: 0, + Totalmem: 8000, + Totalcore: 100, + Usedcores: 0, + Numa: 0, + Type: nvidia.NvidiaGPUDevice, + Health: true, + }, + Score: 0, + }, + { + Device: &util.DeviceUsage{ + ID: "uuid2", + Index: 0, + Used: 5, + Count: 10, + Usedmem: 4000, + Totalmem: 8000, + Totalcore: 100, + Usedcores: 50, + Numa: 0, + Type: nvidia.NvidiaGPUDevice, + Health: true, + }, + Score: 0, + }, + }, + }, + }, + }, + nums: util.PodDeviceRequests{ + { + "hami.io/vgpu-devices-to-allocate": util.ContainerDeviceRequest{ + Nums: 1, + Type: nvidia.NvidiaGPUDevice, + Memreq: 1000, + Coresreq: 30, + }, + }, + }, + annos: make(map[string]string), + task: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test1", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "gpu-burn", + Image: "chrstnhntschl/gpu_burn", + Args: []string{"6000"}, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "hami.io/gpu": *resource.NewQuantity(1, resource.BinarySI), + "hami.io/gpucores": *resource.NewQuantity(30, resource.BinarySI), + "hami.io/gpumem": *resource.NewQuantity(1000, resource.BinarySI), + }, + }, + }, + }, + }, + }, + }, + wants: struct { + want *policy.NodeScoreList + err error + }{ + want: &policy.NodeScoreList{ + Policy: policy.NodeSchedulerPolicyBinpack.String(), + NodeList: []*policy.NodeScore{ + { + NodeID: "node1", + Devices: util.PodDevices{ + "NVIDIA": util.PodSingleDevice{ + { + { + Idx: 0, + UUID: "uuid1", + Type: nvidia.NvidiaGPUDevice, + Usedcores: 30, + Usedmem: 
1000, + }, + }, + }, + }, + Score: 7.5, + }, + }, + }, + err: nil, + }, + }, + { + name: "one node two device one pod one container use two device", + args: struct { + nodes *map[string]*NodeUsage + nums util.PodDeviceRequests + annos map[string]string + task *corev1.Pod + }{ + nodes: &map[string]*NodeUsage{ + "node1": { + Devices: policy.DeviceUsageList{ + Policy: policy.GPUSchedulerPolicySpread.String(), + DeviceLists: []*policy.DeviceListsScore{ + { + Device: &util.DeviceUsage{ + ID: "uuid1", + Index: 0, + Used: 0, + Count: 10, + Usedmem: 0, + Totalmem: 8000, + Totalcore: 100, + Usedcores: 0, + Numa: 0, + Type: nvidia.NvidiaGPUDevice, + Health: true, + }, + Score: 0, + }, + { + Device: &util.DeviceUsage{ + ID: "uuid2", + Index: 0, + Used: 0, + Count: 10, + Usedmem: 0, + Totalmem: 8000, + Totalcore: 100, + Usedcores: 0, + Numa: 0, + Type: nvidia.NvidiaGPUDevice, + Health: true, + }, + Score: 0, + }, + }, + }, + }, + }, + nums: util.PodDeviceRequests{ + { + "hami.io/vgpu-devices-to-allocate": util.ContainerDeviceRequest{ + Nums: 2, + Type: nvidia.NvidiaGPUDevice, + Memreq: 1000, + Coresreq: 30, + }, + }, + }, + annos: make(map[string]string), + task: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test1", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "gpu-burn", + Image: "chrstnhntschl/gpu_burn", + Args: []string{"6000"}, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "hami.io/gpu": *resource.NewQuantity(2, resource.BinarySI), + "hami.io/gpucores": *resource.NewQuantity(30, resource.BinarySI), + "hami.io/gpumem": *resource.NewQuantity(1000, resource.BinarySI), + }, + }, + }, + }, + }, + }, + }, + wants: struct { + want *policy.NodeScoreList + err error + }{ + want: &policy.NodeScoreList{ + Policy: policy.NodeSchedulerPolicyBinpack.String(), + NodeList: []*policy.NodeScore{ + { + NodeID: "node1", + Devices: util.PodDevices{ + "NVIDIA": util.PodSingleDevice{ + { + { + Idx: 0, + UUID: "uuid2", + Type: 
nvidia.NvidiaGPUDevice, + Usedcores: 30, + Usedmem: 1000, + }, + { + Idx: 0, + UUID: "uuid1", + Type: nvidia.NvidiaGPUDevice, + Usedcores: 30, + Usedmem: 1000, + }, + }, + }, + }, + Score: 0, + }, + }, + }, + err: nil, + }, + }, + { + name: "one node two device one pod one container use two device,but this two device before having use.", + args: struct { + nodes *map[string]*NodeUsage + nums util.PodDeviceRequests + annos map[string]string + task *corev1.Pod + }{ + nodes: &map[string]*NodeUsage{ + "node1": { + Devices: policy.DeviceUsageList{ + Policy: policy.GPUSchedulerPolicySpread.String(), + DeviceLists: []*policy.DeviceListsScore{ + { + Device: &util.DeviceUsage{ + ID: "uuid1", + Index: 0, + Used: 5, + Count: 10, + Usedmem: 4000, + Totalmem: 8000, + Totalcore: 100, + Usedcores: 50, + Numa: 0, + Type: nvidia.NvidiaGPUDevice, + Health: true, + }, + Score: 0, + }, + { + Device: &util.DeviceUsage{ + ID: "uuid2", + Index: 0, + Used: 5, + Count: 10, + Usedmem: 4000, + Totalmem: 8000, + Totalcore: 100, + Usedcores: 50, + Numa: 0, + Type: nvidia.NvidiaGPUDevice, + Health: true, + }, + Score: 0, + }, + }, + }, + }, + }, + nums: util.PodDeviceRequests{ + { + "hami.io/vgpu-devices-to-allocate": util.ContainerDeviceRequest{ + Nums: 2, + Type: nvidia.NvidiaGPUDevice, + Memreq: 1000, + Coresreq: 30, + }, + }, + }, + annos: make(map[string]string), + task: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test1", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "gpu-burn", + Image: "chrstnhntschl/gpu_burn", + Args: []string{"6000"}, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "hami.io/gpu": *resource.NewQuantity(2, resource.BinarySI), + "hami.io/gpucores": *resource.NewQuantity(30, resource.BinarySI), + "hami.io/gpumem": *resource.NewQuantity(1000, resource.BinarySI), + }, + }, + }, + }, + }, + }, + }, + wants: struct { + want *policy.NodeScoreList + err error + }{ + want: &policy.NodeScoreList{ + Policy: 
policy.NodeSchedulerPolicyBinpack.String(), + NodeList: []*policy.NodeScore{ + { + NodeID: "node1", + Devices: util.PodDevices{ + "NVIDIA": util.PodSingleDevice{ + { + { + Idx: 0, + UUID: "uuid2", + Type: nvidia.NvidiaGPUDevice, + Usedcores: 30, + Usedmem: 1000, + }, + { + Idx: 0, + UUID: "uuid1", + Type: nvidia.NvidiaGPUDevice, + Usedcores: 30, + Usedmem: 1000, + }, + }, + }, + }, + Score: 15, + }, + }, + }, + err: nil, + }, + }, + { + name: "two node per node having one device one pod one container use one device", + args: struct { + nodes *map[string]*NodeUsage + nums util.PodDeviceRequests + annos map[string]string + task *corev1.Pod + }{ + nodes: &map[string]*NodeUsage{ + "node1": { + Devices: policy.DeviceUsageList{ + Policy: policy.GPUSchedulerPolicySpread.String(), + DeviceLists: []*policy.DeviceListsScore{ + { + Device: &util.DeviceUsage{ + ID: "uuid1", + Index: 0, + Used: 0, + Count: 10, + Usedmem: 0, + Totalmem: 8000, + Totalcore: 100, + Usedcores: 0, + Numa: 0, + Type: nvidia.NvidiaGPUDevice, + Health: true, + }, + Score: 0, + }, + }, + }, + }, + "node2": { + Devices: policy.DeviceUsageList{ + Policy: policy.GPUSchedulerPolicySpread.String(), + DeviceLists: []*policy.DeviceListsScore{ + { + Device: &util.DeviceUsage{ + ID: "uuid2", + Index: 0, + Used: 0, + Count: 10, + Usedmem: 0, + Totalmem: 8000, + Totalcore: 100, + Usedcores: 0, + Numa: 0, + Type: nvidia.NvidiaGPUDevice, + Health: true, + }, + Score: 0, + }, + }, + }, + }, + }, + nums: util.PodDeviceRequests{ + { + "hami.io/vgpu-devices-to-allocate": util.ContainerDeviceRequest{ + Nums: 1, + Type: nvidia.NvidiaGPUDevice, + Memreq: 1000, + Coresreq: 30, + }, + }, + }, + annos: make(map[string]string), + task: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test1", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "gpu-burn", + Image: "chrstnhntschl/gpu_burn", + Args: []string{"6000"}, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + 
"hami.io/gpu": *resource.NewQuantity(1, resource.BinarySI), + "hami.io/gpucores": *resource.NewQuantity(30, resource.BinarySI), + "hami.io/gpumem": *resource.NewQuantity(1000, resource.BinarySI), + }, + }, + }, + }, + }, + }, + }, + wants: struct { + want *policy.NodeScoreList + err error + }{ + want: &policy.NodeScoreList{ + Policy: policy.NodeSchedulerPolicyBinpack.String(), + NodeList: []*policy.NodeScore{ + { + NodeID: "node1", + Devices: util.PodDevices{ + "NVIDIA": util.PodSingleDevice{ + { + { + Idx: 0, + UUID: "uuid1", + Type: nvidia.NvidiaGPUDevice, + Usedcores: 30, + Usedmem: 1000, + }, + }, + }, + }, + Score: 0, + }, + { + NodeID: "node2", + Devices: util.PodDevices{ + "NVIDIA": util.PodSingleDevice{ + { + { + Idx: 0, + UUID: "uuid2", + Type: nvidia.NvidiaGPUDevice, + Usedcores: 30, + Usedmem: 1000, + }, + }, + }, + }, + Score: 0, + }, + }, + }, + err: nil, + }, + }, + { + name: "two node per node having one device one pod one container use one device,one device having use 50%", + args: struct { + nodes *map[string]*NodeUsage + nums util.PodDeviceRequests + annos map[string]string + task *corev1.Pod + }{ + nodes: &map[string]*NodeUsage{ + "node1": { + Devices: policy.DeviceUsageList{ + Policy: policy.GPUSchedulerPolicySpread.String(), + DeviceLists: []*policy.DeviceListsScore{ + { + Device: &util.DeviceUsage{ + ID: "uuid1", + Index: 0, + Used: 5, + Count: 10, + Usedmem: 4000, + Totalmem: 8000, + Totalcore: 100, + Usedcores: 50, + Numa: 0, + Type: nvidia.NvidiaGPUDevice, + Health: true, + }, + Score: 0, + }, + }, + }, + }, + "node2": { + Devices: policy.DeviceUsageList{ + Policy: policy.GPUSchedulerPolicySpread.String(), + DeviceLists: []*policy.DeviceListsScore{ + { + Device: &util.DeviceUsage{ + ID: "uuid2", + Index: 0, + Used: 0, + Count: 10, + Usedmem: 0, + Totalmem: 8000, + Totalcore: 100, + Usedcores: 0, + Numa: 0, + Type: nvidia.NvidiaGPUDevice, + Health: true, + }, + Score: 0, + }, + }, + }, + }, + }, + nums: util.PodDeviceRequests{ + { + 
"hami.io/vgpu-devices-to-allocate": util.ContainerDeviceRequest{ + Nums: 1, + Type: nvidia.NvidiaGPUDevice, + Memreq: 1000, + Coresreq: 30, + }, + }, + }, + annos: make(map[string]string), + task: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test1", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "gpu-burn", + Image: "chrstnhntschl/gpu_burn", + Args: []string{"6000"}, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "hami.io/gpu": *resource.NewQuantity(1, resource.BinarySI), + "hami.io/gpucores": *resource.NewQuantity(30, resource.BinarySI), + "hami.io/gpumem": *resource.NewQuantity(1000, resource.BinarySI), + }, + }, + }, + }, + }, + }, + }, + wants: struct { + want *policy.NodeScoreList + err error + }{ + want: &policy.NodeScoreList{ + Policy: policy.NodeSchedulerPolicyBinpack.String(), + NodeList: []*policy.NodeScore{ + { + NodeID: "node1", + Devices: util.PodDevices{ + "NVIDIA": util.PodSingleDevice{ + { + { + Idx: 0, + UUID: "uuid1", + Type: nvidia.NvidiaGPUDevice, + Usedcores: 30, + Usedmem: 1000, + }, + }, + }, + }, + Score: 15, + }, + { + NodeID: "node2", + Devices: util.PodDevices{ + "NVIDIA": util.PodSingleDevice{ + { + { + Idx: 0, + UUID: "uuid2", + Type: nvidia.NvidiaGPUDevice, + Usedcores: 30, + Usedmem: 1000, + }, + }, + }, + }, + Score: 0, + }, + }, + }, + err: nil, + }, + }, + { + name: "one node two device one pod two container use two device", + args: struct { + nodes *map[string]*NodeUsage + nums util.PodDeviceRequests + annos map[string]string + task *corev1.Pod + }{ + nodes: &map[string]*NodeUsage{ + "node1": { + Devices: policy.DeviceUsageList{ + Policy: policy.NodeSchedulerPolicyBinpack.String(), + DeviceLists: []*policy.DeviceListsScore{ + { + Device: &util.DeviceUsage{ + ID: "uuid1", + Index: 0, + Used: 0, + Count: 10, + Usedmem: 0, + Totalmem: 8000, + Totalcore: 100, + Usedcores: 0, + Numa: 0, + Type: nvidia.NvidiaGPUDevice, + Health: true, + }, + }, + }, + }, + }, + }, + nums: 
util.PodDeviceRequests{ + { + nvidia.NvidiaGPUDevice: util.ContainerDeviceRequest{ + Nums: 1, + Type: nvidia.NvidiaGPUDevice, + Memreq: 1000, + MemPercentagereq: 101, + Coresreq: 30, + }, + }, + {}, + }, + annos: make(map[string]string), + task: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test1", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "gpu-burn1", + Image: "chrstnhntschl/gpu_burn", + Args: []string{"6000"}, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "hami.io/gpu": *resource.NewQuantity(1, resource.BinarySI), + "hami.io/gpucores": *resource.NewQuantity(30, resource.BinarySI), + "hami.io/gpumem": *resource.NewQuantity(1000, resource.BinarySI), + }, + }, + }, + { + Name: "gpu-burn2", + Image: "chrstnhntschl/gpu_burn", + Args: []string{"6000"}, + Resources: corev1.ResourceRequirements{}, + }, + }, + }, + }, + }, + wants: struct { + want *policy.NodeScoreList + err error + }{ + want: &policy.NodeScoreList{ + Policy: policy.NodeSchedulerPolicyBinpack.String(), + NodeList: []*policy.NodeScore{ + { + NodeID: "node1", + Devices: util.PodDevices{ + "NVIDIA": util.PodSingleDevice{ + { + { + Idx: 0, + UUID: "uuid1", + Type: nvidia.NvidiaGPUDevice, + Usedcores: 30, + Usedmem: 1000, + }, + }, + { + { + Idx: 0, + UUID: "", + Type: "", + Usedcores: 0, + Usedmem: 0, + }, + }, + }, + }, + Score: 0, + }, + }, + }, + err: nil, + }, + }, + } + s := NewScheduler() + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + got, gotErr := s.calcScore(test.args.nodes, test.args.nums, test.args.annos, test.args.task) + assert.DeepEqual(t, test.wants.err, gotErr) + wantMap := make(map[string]*policy.NodeScore) + for index, node := range (*(test.wants.want)).NodeList { + wantMap[node.NodeID] = (*(test.wants.want)).NodeList[index] + } + for i := 0; i < got.Len(); i++ { + gotI := (*(got)).NodeList[i] + wantI := wantMap[gotI.NodeID] + assert.DeepEqual(t, wantI.NodeID, gotI.NodeID) + assert.DeepEqual(t, 
wantI.Devices, gotI.Devices) + assert.DeepEqual(t, wantI.Score, gotI.Score) + } + }) + } +} diff --git a/pkg/scheduler/webhook.go b/pkg/scheduler/webhook.go index e32d7fcca..dd72727b9 100644 --- a/pkg/scheduler/webhook.go +++ b/pkg/scheduler/webhook.go @@ -1,91 +1,95 @@ /* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ package scheduler import ( - "context" - "encoding/json" - "fmt" - "net/http" + "context" + "encoding/json" + "net/http" - "4pd.io/k8s-vgpu/pkg/api" - "4pd.io/k8s-vgpu/pkg/k8sutil" - "4pd.io/k8s-vgpu/pkg/scheduler/config" - "4pd.io/k8s-vgpu/pkg/util" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/runtime" - clientgoscheme "k8s.io/client-go/kubernetes/scheme" - "k8s.io/klog/v2" - "k8s.io/klog/v2/klogr" - "sigs.k8s.io/controller-runtime/pkg/webhook/admission" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/klog/v2" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/webhook/admission" + + "github.com/Project-HAMi/HAMi/pkg/device" + "github.com/Project-HAMi/HAMi/pkg/scheduler/config" ) +const template = "Processing admission hook for pod %v/%v, UID: %v" + type webhook struct { - decoder *admission.Decoder + decoder *admission.Decoder } func NewWebHook() (*admission.Webhook, error) { - schema := runtime.NewScheme() - if err := clientgoscheme.AddToScheme(schema); err != nil { - return nil, err - } - decoder, err := admission.NewDecoder(schema) - if err != nil { - return nil, err - } - wh := &admission.Webhook{Handler: &webhook{decoder: decoder}} - _ = wh.InjectLogger(klogr.New()) - return wh, nil + logf.SetLogger(klog.NewKlogr()) + schema := runtime.NewScheme() + if err := clientgoscheme.AddToScheme(schema); err != nil { + return nil, err + } + decoder := admission.NewDecoder(schema) + wh := &admission.Webhook{Handler: &webhook{decoder: decoder}} + return wh, nil } func (h *webhook) Handle(_ context.Context, req admission.Request) admission.Response { - pod := &corev1.Pod{} - err := h.decoder.Decode(req, pod) - if err != nil { - return admission.Errored(http.StatusBadRequest, err) - } - if len(pod.Spec.Containers) == 0 { - return admission.Denied("pod has no containers") - } - klog.V(1).Infof("hook %v pod %v/%v", req.UID, req.Namespace, 
req.Name) - nums := k8sutil.ResourceNums(pod, corev1.ResourceName(util.ResourceName)) - total := 0 - // use request uid - uid := req.UID - for i := 0; i < len(nums); i++ { - if nums[i] == 0 { - continue - } - total += nums[i] - c := &pod.Spec.Containers[i] - c.Env = append(c.Env, corev1.EnvVar{ - Name: api.ContainerUID, - Value: fmt.Sprintf("%v/%v", uid, c.Name), - }) - } - if total == 0 { - return admission.Allowed(fmt.Sprintf("no resource %v", util.ResourceName)) - } - if len(config.SchedulerName) > 0 { - pod.Spec.SchedulerName = config.SchedulerName - } - marshaledPod, err := json.Marshal(pod) - if err != nil { - return admission.Errored(http.StatusInternalServerError, err) - } - return admission.PatchResponseFromRaw(req.Object.Raw, marshaledPod) + pod := &corev1.Pod{} + err := h.decoder.Decode(req, pod) + if err != nil { + klog.Errorf("Failed to decode request: %v", err) + return admission.Errored(http.StatusBadRequest, err) + } + if len(pod.Spec.Containers) == 0 { + klog.Warningf(template+" - Denying admission as pod has no containers", req.Namespace, req.Name, req.UID) + return admission.Denied("pod has no containers") + } + klog.Infof(template, req.Namespace, req.Name, req.UID) + hasResource := false + for idx, ctr := range pod.Spec.Containers { + c := &pod.Spec.Containers[idx] + if ctr.SecurityContext != nil { + if ctr.SecurityContext.Privileged != nil && *ctr.SecurityContext.Privileged { + klog.Warningf(template+" - Denying admission as container %s is privileged", req.Namespace, req.Name, req.UID, c.Name) + continue + } + } + for _, val := range device.GetDevices() { + found, err := val.MutateAdmission(c) + if err != nil { + klog.Errorf("validating pod failed:%s", err.Error()) + return admission.Errored(http.StatusInternalServerError, err) + } + hasResource = hasResource || found + } + } + + if !hasResource { + klog.Infof(template+" - Allowing admission for pod: no resource found", req.Namespace, req.Name, req.UID) + //return admission.Allowed("no 
resource found") + } else if len(config.SchedulerName) > 0 { + pod.Spec.SchedulerName = config.SchedulerName + } + marshaledPod, err := json.Marshal(pod) + if err != nil { + klog.Errorf(template+" - Failed to marshal pod, error: %v", req.Namespace, req.Name, req.UID, err) + return admission.Errored(http.StatusInternalServerError, err) + } + return admission.PatchResponseFromRaw(req.Object.Raw, marshaledPod) } diff --git a/pkg/scheduler/webhook_test.go b/pkg/scheduler/webhook_test.go new file mode 100644 index 000000000..46824356a --- /dev/null +++ b/pkg/scheduler/webhook_test.go @@ -0,0 +1,89 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package scheduler + +import ( + "context" + "testing" + + admissionv1 "k8s.io/api/admission/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/serializer" + "sigs.k8s.io/controller-runtime/pkg/webhook/admission" +) + +func TestHandle(t *testing.T) { + // create a Pod object + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "default", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "container1", + SecurityContext: &corev1.SecurityContext{ + Privileged: nil, + }, + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "nvidia.com/gpu": resource.MustParse("1"), + }, + }, + }, + }, + }, + } + + // encode the Pod object + scheme := runtime.NewScheme() + corev1.AddToScheme(scheme) + codec := serializer.NewCodecFactory(scheme).LegacyCodec(corev1.SchemeGroupVersion) + podBytes, err := runtime.Encode(codec, pod) + if err != nil { + t.Fatalf("Error encoding pod: %v", err) + } + + // create an AdmissionRequest object + req := admission.Request{ + AdmissionRequest: admissionv1.AdmissionRequest{ + UID: "test-uid", + Namespace: "default", + Name: "test-pod", + Object: runtime.RawExtension{ + Raw: podBytes, + }, + }, + } + + // create a WebHook object + wh, err := NewWebHook() + if err != nil { + t.Fatalf("Error creating WebHook: %v", err) + } + + // call the Handle method + resp := wh.Handle(context.Background(), req) + if !resp.Allowed { + t.Errorf("Expected allowed response, but got: %v", resp) + } + +} diff --git a/pkg/util/client/client.go b/pkg/util/client/client.go new file mode 100644 index 000000000..768cc5d64 --- /dev/null +++ b/pkg/util/client/client.go @@ -0,0 +1,64 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package client + +import ( + "os" + "path/filepath" + + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" + "k8s.io/klog/v2" +) + +var ( + KubeClient kubernetes.Interface +) + +func init() { + var err error + KubeClient, err = NewClient() + if err != nil { + panic(err) + } +} + +func GetClient() kubernetes.Interface { + return KubeClient +} + +// NewClient connects to an API server. +func NewClient() (kubernetes.Interface, error) { + kubeConfig := os.Getenv("KUBECONFIG") + if kubeConfig == "" { + kubeConfig = filepath.Join(os.Getenv("HOME"), ".kube", "config") + } + config, err := clientcmd.BuildConfigFromFlags("", kubeConfig) + if err != nil { + klog.Infof("BuildConfigFromFlags failed for file %s: %v using inClusterConfig", kubeConfig, err) + config, err = rest.InClusterConfig() + if err != nil { + klog.Errorf("InClusterConfig Failed for err:%s", err.Error()) + } + } + KubeClient, err := kubernetes.NewForConfig(config) + if err != nil { + klog.Errorf("new config error %s", err.Error()) + } + return KubeClient, err +} diff --git a/pkg/util/nodelock/nodelock.go b/pkg/util/nodelock/nodelock.go new file mode 100644 index 000000000..7e01804ea --- /dev/null +++ b/pkg/util/nodelock/nodelock.go @@ -0,0 +1,146 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nodelock + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/Project-HAMi/HAMi/pkg/util/client" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" +) + +const ( + NodeLockKey = "hami.io/mutex.lock" + MaxLockRetry = 5 + NodeLockSep = "," +) + +func SetNodeLock(nodeName string, lockname string, pods *corev1.Pod) error { + ctx := context.Background() + node, err := client.GetClient().CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return err + } + if _, ok := node.ObjectMeta.Annotations[NodeLockKey]; ok { + return fmt.Errorf("node %s is locked", nodeName) + } + newNode := node.DeepCopy() + newNode.ObjectMeta.Annotations[NodeLockKey] = GenerateNodeLockKeyByPod(pods) + _, err = client.GetClient().CoreV1().Nodes().Update(ctx, newNode, metav1.UpdateOptions{}) + for i := 0; i < MaxLockRetry && err != nil; i++ { + klog.ErrorS(err, "Failed to update node", "node", nodeName, "retry", i) + time.Sleep(100 * time.Millisecond) + node, err = client.GetClient().CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + klog.ErrorS(err, "Failed to get node when retry to update", "node", nodeName) + continue + } + newNode := node.DeepCopy() + newNode.ObjectMeta.Annotations[NodeLockKey] = GenerateNodeLockKeyByPod(pods) + _, err = client.GetClient().CoreV1().Nodes().Update(ctx, newNode, metav1.UpdateOptions{}) + } + if err != nil { + return fmt.Errorf("setNodeLock exceeds retry count %d", MaxLockRetry) + } + klog.InfoS("Node lock set", "node", nodeName) + return nil 
+} + +func ReleaseNodeLock(nodeName string, lockname string) error { + ctx := context.Background() + node, err := client.GetClient().CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return err + } + if _, ok := node.ObjectMeta.Annotations[NodeLockKey]; !ok { + klog.InfoS("Node lock not set", "node", nodeName) + return nil + } + newNode := node.DeepCopy() + delete(newNode.ObjectMeta.Annotations, NodeLockKey) + _, err = client.GetClient().CoreV1().Nodes().Update(ctx, newNode, metav1.UpdateOptions{}) + for i := 0; i < MaxLockRetry && err != nil; i++ { + klog.ErrorS(err, "Failed to update node", "node", nodeName, "retry", i) + time.Sleep(100 * time.Millisecond) + node, err = client.GetClient().CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + klog.ErrorS(err, "Failed to get node when retry to update", "node", nodeName) + continue + } + newNode := node.DeepCopy() + delete(newNode.ObjectMeta.Annotations, NodeLockKey) + _, err = client.GetClient().CoreV1().Nodes().Update(ctx, newNode, metav1.UpdateOptions{}) + } + if err != nil { + return fmt.Errorf("releaseNodeLock exceeds retry count %d", MaxLockRetry) + } + klog.InfoS("Node lock released", "node", nodeName) + return nil +} + +func LockNode(nodeName string, lockname string, pods *corev1.Pod) error { + ctx := context.Background() + node, err := client.GetClient().CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return err + } + if _, ok := node.ObjectMeta.Annotations[NodeLockKey]; !ok { + return SetNodeLock(nodeName, lockname, pods) + } + lockTime, _, _, err := ParseNodeLock(node.ObjectMeta.Annotations[NodeLockKey]) + if err != nil { + return err + } + if time.Since(lockTime) > time.Minute*5 { + klog.InfoS("Node lock expired", "node", nodeName, "lockTime", lockTime) + err = ReleaseNodeLock(nodeName, lockname) + if err != nil { + klog.ErrorS(err, "Failed to release node lock", "node", nodeName) + return err + } + return 
SetNodeLock(nodeName, lockname, pods) + } + return fmt.Errorf("node %s has been locked within 5 minutes", nodeName) +} + +func ParseNodeLock(value string) (lockTime time.Time, ns, name string, err error) { + if !strings.Contains(value, NodeLockSep) { + lockTime, err = time.Parse(time.RFC3339, value) + return lockTime, "", "", err + } + s := strings.Split(value, NodeLockSep) + if len(s) != 3 { + lockTime, err = time.Parse(time.RFC3339, value) + return lockTime, "", "", err + } + lockTime, err = time.Parse(time.RFC3339, s[0]) + return lockTime, s[1], s[2], err +} + +func GenerateNodeLockKeyByPod(pods *corev1.Pod) string { + if pods == nil { + return time.Now().Format(time.RFC3339) + } + ns, name := pods.Namespace, pods.Name + return fmt.Sprintf("%s%s%s%s%s", time.Now().Format(time.RFC3339), NodeLockSep, ns, NodeLockSep, name) +} diff --git a/pkg/util/types.go b/pkg/util/types.go index 941206f57..ce0c23a9a 100644 --- a/pkg/util/types.go +++ b/pkg/util/types.go @@ -1,45 +1,141 @@ /* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ package util +import ( + spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" +) + const ( - //ResourceName = "nvidia.com/gpu" - //ResourceName = "4pd.io/vgpu" - AssignedTimeAnnotations = "4pd.io/vgpu-time" - AssignedIDsAnnotations = "4pd.io/vgpu-ids" - AssignedNodeAnnotations = "4pd.io/vgpu-node" - - //TimeLayout = "ANSIC" - //DefaultTimeout = time.Second * 60 + //ResourceName = "nvidia.com/gpu" + //ResourceName = "hami.io/vgpu". + AssignedTimeAnnotations = "hami.io/vgpu-time" + AssignedNodeAnnotations = "hami.io/vgpu-node" + BindTimeAnnotations = "hami.io/bind-time" + DeviceBindPhase = "hami.io/bind-phase" + + DeviceBindAllocating = "allocating" + DeviceBindFailed = "failed" + DeviceBindSuccess = "success" + + //Set default mem to 5000m + //DefaultMem = 5000 + //DefaultCores = 0. + + DeviceLimit = 100 + //TimeLayout = "ANSIC" + //DefaultTimeout = time.Second * 60. + + BestEffort string = "best-effort" + Restricted string = "restricted" + Guaranteed string = "guaranteed" + + // NodeNameEnvName define env var name for use get node name. 
+ NodeNameEnvName = "NODE_NAME" ) +type DevicePluginConfigs struct { + Nodeconfig []struct { + Name string `json:"name"` + Devicememoryscaling float64 `json:"devicememoryscaling"` + Devicecorescaling float64 `json:"devicecorescaling"` + Devicesplitcount uint `json:"devicesplitcount"` + Migstrategy string `json:"migstrategy"` + } `json:"nodeconfig"` +} + +type DeviceConfig struct { + *spec.Config + + ResourceName *string + DebugMode *bool +} + var ( - ResourceName string - DebugMode bool + DebugMode bool + + DeviceSplitCount *uint + DeviceMemoryScaling *float64 + DeviceCoresScaling *float64 + NodeName string + RuntimeSocketFlag string + DisableCoreLimit *bool ) -//type ContainerDevices struct { -// Devices []string `json:"devices,omitempty"` -//} +// type ContainerDevices struct { +// Devices []string `json:"devices,omitempty"` +// } // -//type PodDevices struct { -// Containers []ContainerDevices `json:"containers,omitempty"` -//} +// type PodDevices struct { +// Containers []ContainerDevices `json:"containers,omitempty"` +// } +type ContainerDevice struct { + // TODO current Idx cannot use, because EncodeContainerDevices method not encode this filed. + Idx int + UUID string + Type string + Usedmem int32 + Usedcores int32 +} + +type ContainerDeviceRequest struct { + Nums int32 + Type string + Memreq int32 + MemPercentagereq int32 + Coresreq int32 +} + +type ContainerDevices []ContainerDevice +type ContainerDeviceRequests map[string]ContainerDeviceRequest + +// type ContainerAllDevices map[string]ContainerDevices. 
+type PodSingleDevice []ContainerDevices + +type PodDeviceRequests []ContainerDeviceRequests +type PodDevices map[string]PodSingleDevice + +type DeviceUsage struct { + ID string + Index uint + Used int32 + Count int32 + Usedmem int32 + Totalmem int32 + Totalcore int32 + Usedcores int32 + Numa int + Type string + Health bool +} -type ContainerDevices []string +type DeviceInfo struct { + ID string + Index uint + Count int32 + Devmem int32 + Devcore int32 + Type string + Numa int + Health bool + DeviceVendor string +} -type PodDevices []ContainerDevices +type NodeInfo struct { + ID string + Devices []DeviceInfo +} diff --git a/pkg/util/util.go b/pkg/util/util.go index 85d483380..1e19c04de 100644 --- a/pkg/util/util.go +++ b/pkg/util/util.go @@ -1,64 +1,403 @@ /* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ package util import ( - "flag" - "os" - "strings" + "context" + "encoding/json" + "errors" + "flag" + "fmt" + "strconv" + "strings" + "time" + + "github.com/Project-HAMi/HAMi/pkg/api" + "github.com/Project-HAMi/HAMi/pkg/util/client" + "github.com/Project-HAMi/HAMi/pkg/util/nodelock" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8stypes "k8s.io/apimachinery/pkg/types" + "k8s.io/klog/v2" +) + +const ( + // OneContainerMultiDeviceSplitSymbol this is when one container use multi device, use : symbol to join device info. + OneContainerMultiDeviceSplitSymbol = ":" - "k8s.io/klog/v2" + // OnePodMultiContainerSplitSymbol this is when one pod having multi container and more than one container use device, use ; symbol to join device info. + OnePodMultiContainerSplitSymbol = ";" ) -func GlobalFlagSet() *flag.FlagSet { - fs := flag.NewFlagSet(os.Args[0], flag.ExitOnError) - fs.StringVar(&ResourceName, "resource-name", "nvidia.com/gpu", "resource name") - fs.BoolVar(&DebugMode, "debug", false, "debug mode") - klog.InitFlags(fs) - return fs +var ( + InRequestDevices map[string]string + SupportDevices map[string]string + HandshakeAnnos map[string]string +) + +func init() { + InRequestDevices = make(map[string]string) + SupportDevices = make(map[string]string) + HandshakeAnnos = make(map[string]string) +} + +func GetNode(nodename string) (*corev1.Node, error) { + n, err := client.GetClient().CoreV1().Nodes().Get(context.Background(), nodename, metav1.GetOptions{}) + return n, err +} + +func GetPendingPod(ctx context.Context, node string) (*corev1.Pod, error) { + pod, err := GetAllocatePodByNode(ctx, node) + if err != nil { + return nil, err + } + if pod != nil { + return pod, nil + } + // filter pods for this node. 
+ selector := fmt.Sprintf("spec.nodeName=%s", node) + podListOptions := metav1.ListOptions{ + FieldSelector: selector, + } + podlist, err := client.GetClient().CoreV1().Pods("").List(ctx, podListOptions) + if err != nil { + return nil, err + } + for _, p := range podlist.Items { + if p.Status.Phase != corev1.PodPending { + continue + } + if _, ok := p.Annotations[BindTimeAnnotations]; !ok { + continue + } + if phase, ok := p.Annotations[DeviceBindPhase]; !ok { + continue + } else { + if strings.Compare(phase, DeviceBindAllocating) != 0 { + continue + } + } + if n, ok := p.Annotations[AssignedNodeAnnotations]; !ok { + continue + } else { + if strings.Compare(n, node) == 0 { + return &p, nil + } + } + } + return nil, fmt.Errorf("no binding pod found on node %s", node) +} + +func GetAllocatePodByNode(ctx context.Context, nodeName string) (*corev1.Pod, error) { + node, err := client.GetClient().CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + return nil, err + } + if value, ok := node.Annotations[nodelock.NodeLockKey]; ok { + klog.V(2).Infof("node annotation key is %s, value is %s ", nodelock.NodeLockKey, value) + _, ns, name, err := nodelock.ParseNodeLock(value) + if err != nil { + return nil, err + } + if ns == "" || name == "" { + return nil, nil + } + return client.GetClient().CoreV1().Pods(ns).Get(ctx, name, metav1.GetOptions{}) + } + return nil, nil +} + +func DecodeNodeDevices(str string) ([]*api.DeviceInfo, error) { + if !strings.Contains(str, OneContainerMultiDeviceSplitSymbol) { + return []*api.DeviceInfo{}, errors.New("node annotations not decode successfully") + } + tmp := strings.Split(str, OneContainerMultiDeviceSplitSymbol) + var retval []*api.DeviceInfo + for _, val := range tmp { + if strings.Contains(val, ",") { + items := strings.Split(val, ",") + if len(items) == 7 { + count, _ := strconv.Atoi(items[1]) + devmem, _ := strconv.Atoi(items[2]) + devcore, _ := strconv.Atoi(items[3]) + health, _ := strconv.ParseBool(items[6]) 
+ numa, _ := strconv.Atoi(items[5]) + i := api.DeviceInfo{ + Id: items[0], + Count: int32(count), + Devmem: int32(devmem), + Devcore: int32(devcore), + Type: items[4], + Numa: numa, + Health: health, + } + retval = append(retval, &i) + } else { + return []*api.DeviceInfo{}, errors.New("node annotations not decode successfully") + } + } + } + return retval, nil +} + +func EncodeNodeDevices(dlist []*api.DeviceInfo) string { + tmp := "" + for _, val := range dlist { + tmp += val.Id + "," + strconv.FormatInt(int64(val.Count), 10) + "," + strconv.Itoa(int(val.Devmem)) + "," + strconv.Itoa(int(val.Devcore)) + "," + val.Type + "," + strconv.Itoa(val.Numa) + "," + strconv.FormatBool(val.Health) + OneContainerMultiDeviceSplitSymbol + } + klog.Infof("Encoded node Devices: %s", tmp) + return tmp } func EncodeContainerDevices(cd ContainerDevices) string { - return strings.Join(cd, ",") -} - -func EncodePodDevices(pd PodDevices) string { - var ss []string - for _, cd := range pd { - ss = append(ss, EncodeContainerDevices(cd)) - } - return strings.Join(ss, ";") -} - -func DecodeContainerDevices(str string) ContainerDevices { - if len(str) == 0 { - return ContainerDevices{} - } - return strings.Split(str, ",") -} - -func DecodePodDevices(str string) PodDevices { - if len(str) == 0 { - return PodDevices{} - } - var pd PodDevices - for _, s := range strings.Split(str, ";") { - cd := DecodeContainerDevices(s) - pd = append(pd, cd) - } - return pd + tmp := "" + for _, val := range cd { + tmp += val.UUID + "," + val.Type + "," + strconv.Itoa(int(val.Usedmem)) + "," + strconv.Itoa(int(val.Usedcores)) + OneContainerMultiDeviceSplitSymbol + } + klog.Infof("Encoded container Devices: %s", tmp) + return tmp + //return strings.Join(cd, ",") +} + +func EncodeContainerDeviceType(cd ContainerDevices, t string) string { + tmp := "" + for _, val := range cd { + if strings.Compare(val.Type, t) == 0 { + tmp += val.UUID + "," + val.Type + "," + strconv.Itoa(int(val.Usedmem)) + "," + 
strconv.Itoa(int(val.Usedcores)) + } + tmp += OneContainerMultiDeviceSplitSymbol + } + klog.Infof("Encoded container Certain Device type: %s->%s", t, tmp) + return tmp +} + +func EncodePodSingleDevice(pd PodSingleDevice) string { + res := "" + for _, ctrdevs := range pd { + res = res + EncodeContainerDevices(ctrdevs) + res = res + OnePodMultiContainerSplitSymbol + } + klog.Infof("Encoded pod single devices %s", res) + return res +} + +func EncodePodDevices(checklist map[string]string, pd PodDevices) map[string]string { + res := map[string]string{} + for devType, cd := range pd { + klog.Infoln("devtype=", devType) + res[checklist[devType]] = EncodePodSingleDevice(cd) + } + klog.Infof("Encoded pod Devices %s\n", res) + return res +} + +func DecodeContainerDevices(str string) (ContainerDevices, error) { + if len(str) == 0 { + return ContainerDevices{}, nil + } + cd := strings.Split(str, OneContainerMultiDeviceSplitSymbol) + contdev := ContainerDevices{} + tmpdev := ContainerDevice{} + klog.V(5).Infof("Start to decode container device %s", str) + if len(str) == 0 { + return ContainerDevices{}, nil + } + for _, val := range cd { + if strings.Contains(val, ",") { + //fmt.Println("cd is ", val) + tmpstr := strings.Split(val, ",") + if len(tmpstr) < 4 { + return ContainerDevices{}, fmt.Errorf("pod annotation format error; information missing, please do not use nodeName field in task") + } + tmpdev.UUID = tmpstr[0] + tmpdev.Type = tmpstr[1] + devmem, _ := strconv.ParseInt(tmpstr[2], 10, 32) + tmpdev.Usedmem = int32(devmem) + devcores, _ := strconv.ParseInt(tmpstr[3], 10, 32) + tmpdev.Usedcores = int32(devcores) + contdev = append(contdev, tmpdev) + } + } + klog.V(5).Infof("Finished decoding container devices. 
Total devices: %d", len(contdev)) + return contdev, nil +} + +func DecodePodDevices(checklist map[string]string, annos map[string]string) (PodDevices, error) { + klog.V(5).Infof("checklist is [%+v], annos is [%+v]", checklist, annos) + if len(annos) == 0 { + return PodDevices{}, nil + } + pd := make(PodDevices) + for devID, devs := range checklist { + str, ok := annos[devs] + if !ok { + continue + } + pd[devID] = make(PodSingleDevice, 0) + for _, s := range strings.Split(str, OnePodMultiContainerSplitSymbol) { + cd, err := DecodeContainerDevices(s) + if err != nil { + return PodDevices{}, nil + } + if len(cd) == 0 { + continue + } + pd[devID] = append(pd[devID], cd) + } + } + klog.InfoS("Decoded pod annos", "poddevices", pd) + return pd, nil +} + +func GetNextDeviceRequest(dtype string, p corev1.Pod) (corev1.Container, ContainerDevices, error) { + pdevices, err := DecodePodDevices(InRequestDevices, p.Annotations) + if err != nil { + return corev1.Container{}, ContainerDevices{}, err + } + klog.Infof("pod annotation decode vaule is %+v", pdevices) + res := ContainerDevices{} + + pd, ok := pdevices[dtype] + if !ok { + return corev1.Container{}, res, errors.New("device request not found") + } + for ctridx, ctrDevice := range pd { + if len(ctrDevice) > 0 { + return p.Spec.Containers[ctridx], ctrDevice, nil + } + } + return corev1.Container{}, res, errors.New("device request not found") +} + +func GetContainerDeviceStrArray(c ContainerDevices) []string { + tmp := []string{} + for _, val := range c { + tmp = append(tmp, val.UUID) + } + return tmp +} + +func EraseNextDeviceTypeFromAnnotation(dtype string, p corev1.Pod) error { + pdevices, err := DecodePodDevices(InRequestDevices, p.Annotations) + if err != nil { + return err + } + res := PodSingleDevice{} + pd, ok := pdevices[dtype] + if !ok { + return errors.New("erase device annotation not found") + } + found := false + for _, val := range pd { + if found { + res = append(res, val) + } else { + if len(val) > 0 { + found 
= true + res = append(res, ContainerDevices{}) + } else { + res = append(res, val) + } + } + } + klog.Infoln("After erase res=", res) + newannos := make(map[string]string) + newannos[InRequestDevices[dtype]] = EncodePodSingleDevice(res) + return PatchPodAnnotations(&p, newannos) +} + +func PatchNodeAnnotations(node *corev1.Node, annotations map[string]string) error { + type patchMetadata struct { + Annotations map[string]string `json:"annotations,omitempty"` + } + type patchPod struct { + Metadata patchMetadata `json:"metadata"` + //Spec patchSpec `json:"spec,omitempty"` + } + + p := patchPod{} + p.Metadata.Annotations = annotations + + bytes, err := json.Marshal(p) + if err != nil { + return err + } + _, err = client.GetClient().CoreV1().Nodes(). + Patch(context.Background(), node.Name, k8stypes.StrategicMergePatchType, bytes, metav1.PatchOptions{}) + if err != nil { + klog.Infoln("annotations=", annotations) + klog.Infof("patch pod %v failed, %v", node.Name, err) + } + return err +} + +func PatchPodAnnotations(pod *corev1.Pod, annotations map[string]string) error { + type patchMetadata struct { + Annotations map[string]string `json:"annotations,omitempty"` + } + type patchPod struct { + Metadata patchMetadata `json:"metadata"` + //Spec patchSpec `json:"spec,omitempty"` + } + + p := patchPod{} + p.Metadata.Annotations = annotations + + bytes, err := json.Marshal(p) + if err != nil { + return err + } + klog.V(5).Infof("patch pod %s/%s annotation content is %s", pod.Namespace, pod.Name, string(bytes)) + _, err = client.GetClient().CoreV1().Pods(pod.Namespace). 
+ Patch(context.Background(), pod.Name, k8stypes.StrategicMergePatchType, bytes, metav1.PatchOptions{}) + if err != nil { + klog.Infof("patch pod %v failed, %v", pod.Name, err) + } + return err +} + +func InitKlogFlags() *flag.FlagSet { + // Init log flags + flagset := flag.NewFlagSet("klog", flag.ExitOnError) + klog.InitFlags(flagset) + + return flagset +} + +func CheckHealth(devType string, n *corev1.Node) (bool, bool) { + handshake := n.Annotations[HandshakeAnnos[devType]] + if strings.Contains(handshake, "Requesting") { + formertime, _ := time.Parse("2006.01.02 15:04:05", strings.Split(handshake, "_")[1]) + return time.Now().Before(formertime.Add(time.Second * 60)), false + } else if strings.Contains(handshake, "Deleted") { + return true, false + } else { + return true, true + } +} + +func MarkAnnotationsToDelete(devType string, nn string) error { + tmppat := make(map[string]string) + tmppat[devType] = "Deleted_" + time.Now().Format("2006.01.02 15:04:05") + n, err := GetNode(nn) + if err != nil { + klog.Errorln("get node failed", err.Error()) + return err + } + return PatchNodeAnnotations(n, tmppat) } diff --git a/pkg/util/util_test.go b/pkg/util/util_test.go index 6f5f6158f..c8d08628b 100644 --- a/pkg/util/util_test.go +++ b/pkg/util/util_test.go @@ -1,51 +1,173 @@ /* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +Copyright 2024 The HAMi Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ package util import ( - "fmt" - "gotest.tools/v3/assert" - "testing" + "fmt" + "testing" + + "gotest.tools/v3/assert" ) +var inRequestDevices map[string]string + +func init() { + inRequestDevices = make(map[string]string) + inRequestDevices["NVIDIA"] = "hami.io/vgpu-devices-to-allocate" +} + func TestEmptyContainerDevicesCoding(t *testing.T) { - cd1 := ContainerDevices{} - s := EncodeContainerDevices(cd1) - fmt.Println(s) - cd2 := DecodeContainerDevices(s) - assert.DeepEqual(t, cd1, cd2) + cd1 := ContainerDevices{} + s := EncodeContainerDevices(cd1) + fmt.Println(s) + cd2, _ := DecodeContainerDevices(s) + assert.DeepEqual(t, cd1, cd2) } func TestEmptyPodDeviceCoding(t *testing.T) { - pd1 := PodDevices{} - s := EncodePodDevices(pd1) - fmt.Println(s) - pd2 := DecodePodDevices(s) - assert.DeepEqual(t, pd1, pd2) + pd1 := PodDevices{} + s := EncodePodDevices(inRequestDevices, pd1) + fmt.Println(s) + pd2, _ := DecodePodDevices(inRequestDevices, s) + assert.DeepEqual(t, pd1, pd2) } func TestPodDevicesCoding(t *testing.T) { - pd1 := PodDevices{ - ContainerDevices{"1", "2"}, - ContainerDevices{}, - ContainerDevices{"3", "4"}, - } - s := EncodePodDevices(pd1) - fmt.Println(s) - pd2 := DecodePodDevices(s) - assert.DeepEqual(t, pd1, pd2) + tests := []struct { + name string + args PodDevices + }{ + { + name: "one pod one container use zero device", + args: PodDevices{ + "NVIDIA": PodSingleDevice{}, + }, + }, + { + name: "one pod one container use one 
device", + args: PodDevices{ + "NVIDIA": PodSingleDevice{ + ContainerDevices{ + ContainerDevice{0, "UUID1", "Type1", 1000, 30}, + }, + }, + }, + }, + { + name: "one pod two container, every container use one device", + args: PodDevices{ + "NVIDIA": PodSingleDevice{ + ContainerDevices{ + ContainerDevice{0, "UUID1", "Type1", 1000, 30}, + }, + ContainerDevices{ + ContainerDevice{0, "UUID1", "Type1", 1000, 30}, + }, + }, + }, + }, + { + name: "one pod one container use two devices", + args: PodDevices{ + "NVIDIA": PodSingleDevice{ + ContainerDevices{ + ContainerDevice{0, "UUID1", "Type1", 1000, 30}, + ContainerDevice{0, "UUID2", "Type1", 1000, 30}, + }, + }, + }, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := EncodePodDevices(inRequestDevices, test.args) + fmt.Println(s) + got, _ := DecodePodDevices(inRequestDevices, s) + assert.DeepEqual(t, test.args, got) + }) + } +} + +func Test_DecodePodDevices(t *testing.T) { + //DecodePodDevices(checklist map[string]string, annos map[string]string) (PodDevices, error) + InRequestDevices["NVIDIA"] = "hami.io/vgpu-devices-to-allocate" + SupportDevices["NVIDIA"] = "hami.io/vgpu-devices-allocated" + tests := []struct { + name string + args struct { + checklist map[string]string + annos map[string]string + } + want PodDevices + wantErr error + }{ + { + name: "annos len is 0", + args: struct { + checklist map[string]string + annos map[string]string + }{ + checklist: map[string]string{}, + annos: make(map[string]string), + }, + want: PodDevices{}, + wantErr: nil, + }, + { + name: "annos having two device", + args: struct { + checklist map[string]string + annos map[string]string + }{ + checklist: InRequestDevices, + annos: map[string]string{ + InRequestDevices["NVIDIA"]: "GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76,NVIDIA,500,3:;GPU-ebe7c3f7-303d-558d-435e-99a160631fe4,NVIDIA,500,3:;", + SupportDevices["NVIDIA"]: 
"GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76,NVIDIA,500,3:;GPU-ebe7c3f7-303d-558d-435e-99a160631fe4,NVIDIA,500,3:;", + }, + }, + want: PodDevices{ + "NVIDIA": { + { + { + UUID: "GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76", + Type: "NVIDIA", + Usedmem: 500, + Usedcores: 3, + }, + }, + { + { + UUID: "GPU-ebe7c3f7-303d-558d-435e-99a160631fe4", + Type: "NVIDIA", + Usedmem: 500, + Usedcores: 3, + }, + }, + }, + }, + wantErr: nil, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + got, gotErr := DecodePodDevices(test.args.checklist, test.args.annos) + assert.DeepEqual(t, test.wantErr, gotErr) + assert.DeepEqual(t, test.want, got) + }) + } } diff --git a/pkg/version/version.go b/pkg/version/version.go index 7095c2754..93b87547a 100644 --- a/pkg/version/version.go +++ b/pkg/version/version.go @@ -1,37 +1,38 @@ /* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ package version import ( - "fmt" - "github.com/spf13/cobra" + "fmt" + + "github.com/spf13/cobra" ) var ( - version string - VersionCmd = &cobra.Command{ - Use: "version", - Short: "print version", - Run: func(cmd *cobra.Command, args []string) { - fmt.Println(Version()) - }, - } + version string + VersionCmd = &cobra.Command{ + Use: "version", + Short: "print version", + Run: func(cmd *cobra.Command, args []string) { + fmt.Println(Version()) + }, + } ) func Version() string { - return version + return version } diff --git a/tools/plugin_cli.go b/tools/plugin_cli.go deleted file mode 100644 index 6035cb2c3..000000000 --- a/tools/plugin_cli.go +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright © 2021 peizhaoyou - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package main - -import ( - "context" - "fmt" - "os" - "time" - - "4pd.io/k8s-vgpu/pkg/api" - "github.com/spf13/cobra" - "google.golang.org/grpc" -) - -var ( - runtimeSocketFlag string - rootCmd = &cobra.Command{ - Use: "device plugin cli", - Short: "device plugin socket cli", - } - - getCmd = &cobra.Command{ - Use: "get devices", - Short: "get devices", - Args: cobra.ExactArgs(1), - Run: func(cmd *cobra.Command, args []string) { - getDevices(args[0]) - }, - } -) - -func init() { - rootCmd.Flags().SortFlags = false - rootCmd.PersistentFlags().SortFlags = false - rootCmd.PersistentFlags().StringVar(&runtimeSocketFlag, "socket", "/var/lib/vgpu/vgpu.sock", "device plugin socket") - - rootCmd.AddCommand(getCmd) -} - -func getDevices(uid string) { - ctx := context.Background() - ctx, cancel := context.WithTimeout(ctx, time.Second*10) - defer cancel() - conn, err := grpc.DialContext( - ctx, - fmt.Sprintf("unix://%v", runtimeSocketFlag), - grpc.WithInsecure(), - grpc.WithBlock(), - ) - if err != nil { - fmt.Printf("connect device plugin error, %v\n", err) - os.Exit(1) - } - client := api.NewVGPURuntimeServiceClient(conn) - req := api.GetDeviceRequest{CtrUUID: uid} - resp, err := client.GetDevice(ctx, &req) - if err != nil { - fmt.Printf("get device failed, %v\n", err) - os.Exit(1) - } - fmt.Printf("%v\n", resp.String()) -} - -func main() { - if err := rootCmd.Execute(); err != nil { - fmt.Print(err) - os.Exit(1) - } -} diff --git a/version.mk b/version.mk new file mode 100644 index 000000000..3f7ce0d01 --- /dev/null +++ b/version.mk @@ -0,0 +1,13 @@ +GO=go +GO111MODULE=on +CMDS=scheduler vGPUmonitor +DEVICES=nvidia +OUTPUT_DIR=bin +TARGET_ARCH=amd64 +GOLANG_IMAGE=golang:1.22.5-bullseye +NVIDIA_IMAGE=nvidia/cuda:12.3.2-devel-ubuntu22.04 +DEST_DIR=/usr/local/vgpu/ + +VERSION = v0.0.1 +IMG_NAME =hami +IMG_TAG="${IMG_NAME}:${VERSION}" \ No newline at end of file