diff --git a/.github/workflows/preview-docs-build.yml b/.github/workflows/preview-docs-build.yml index 7551c7f6b6c4..546e05f6aa58 100644 --- a/.github/workflows/preview-docs-build.yml +++ b/.github/workflows/preview-docs-build.yml @@ -5,18 +5,9 @@ on: paths: - 'document/**' types: [opened, synchronize, reopened] - pull_request_target: - paths: - - 'document/**' - types: [opened, synchronize, reopened] jobs: build-docs-image: - # 内部和外部贡献者都可以触发构建 - if: | - (github.event_name == 'pull_request') || - (github.event_name == 'pull_request_target') - permissions: contents: read pull-requests: write @@ -26,20 +17,11 @@ jobs: steps: - name: Checkout PR code uses: actions/checkout@v4 - with: - # 对于 pull_request_target,检出 PR 的代码 - ref: ${{ github.event_name == 'pull_request_target' && github.event.pull_request.head.sha || github.sha }} - name: Get current datetime id: datetime run: echo "datetime=$(date +'%Y%m%d%H%M%S')" >> $GITHUB_OUTPUT - - name: Save PR metadata - run: | - mkdir -p /tmp/pr-metadata - echo "${{ github.event.pull_request.number }}" > /tmp/pr-metadata/pr-number.txt - echo "${{ github.event.pull_request.head.sha }}" > /tmp/pr-metadata/pr-sha.txt - - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -64,21 +46,21 @@ jobs: path: /tmp/fastgpt-docs-${{ steps.datetime.outputs.datetime }}.tar retention-days: 1 - - name: Upload PR metadata - uses: actions/upload-artifact@v4 - with: - name: pr-metadata-docs-${{ steps.datetime.outputs.datetime }} - path: /tmp/pr-metadata/ - retention-days: 1 + outputs: + datetime: ${{ steps.datetime.outputs.datetime }} call-push-workflow: needs: build-docs-image + permissions: + contents: read + packages: write + attestations: write + id-token: write + pull-requests: write + issues: write uses: ./.github/workflows/preview-docs-push.yml secrets: inherit with: - pr_number: ${{ github.event.pull_request.number }} + pr_number: ${{ format('{0}', github.event.pull_request.number) }} datetime: ${{ 
needs.build-docs-image.outputs.datetime }} - run_id: ${{ github.run_id }} - - outputs: - datetime: ${{ steps.datetime.outputs.datetime }} + run_id: ${{ format('{0}', github.run_id) }} diff --git a/.github/workflows/preview-docs-push.yml b/.github/workflows/preview-docs-push.yml index 9495044f4866..5cdf44d2679d 100644 --- a/.github/workflows/preview-docs-push.yml +++ b/.github/workflows/preview-docs-push.yml @@ -26,14 +26,6 @@ jobs: runs-on: ubuntu-24.04 steps: - - name: Download PR metadata - uses: actions/download-artifact@v4 - with: - name: pr-metadata-docs-${{ inputs.datetime }} - path: /tmp/pr-metadata/ - run-id: ${{ inputs.run_id }} - github-token: ${{ secrets.GITHUB_TOKEN }} - - name: Read PR information id: pr run: | diff --git a/.github/workflows/preview-fastgpt-build.yml b/.github/workflows/preview-fastgpt-build.yml index 77424ec69bae..508289925d91 100644 --- a/.github/workflows/preview-fastgpt-build.yml +++ b/.github/workflows/preview-fastgpt-build.yml @@ -2,19 +2,10 @@ name: Preview fastgpt build on: pull_request: - # 支持所有分支 - types: [opened, synchronize, reopened] - pull_request_target: - # 外部贡献者也支持自动构建 types: [opened, synchronize, reopened] jobs: build-preview-images: - # 内部和外部贡献者都可以触发构建 - if: | - (github.event_name == 'pull_request') || - (github.event_name == 'pull_request_target') - permissions: contents: read pull-requests: write @@ -29,16 +20,8 @@ jobs: - name: Checkout PR code uses: actions/checkout@v4 with: - # 对于 pull_request_target,检出 PR 的代码 - ref: ${{ github.event_name == 'pull_request_target' && github.event.pull_request.head.sha || github.sha }} fetch-depth: 0 - - name: Save PR metadata - run: | - mkdir -p /tmp/pr-metadata - echo "${{ github.event.pull_request.number }}" > /tmp/pr-metadata/pr-number.txt - echo "${{ github.event.pull_request.head.sha }}" > /tmp/pr-metadata/pr-sha.txt - - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 with: @@ -95,23 +78,23 @@ jobs: path: /tmp/${{ steps.config.outputs.IMAGE_NAME }}-${{ 
github.sha }}.tar retention-days: 1 - - name: Upload PR metadata - uses: actions/upload-artifact@v4 - with: - name: pr-metadata-${{ matrix.image }}-${{ github.sha }} - path: /tmp/pr-metadata/ - retention-days: 1 - call-push-workflow: needs: build-preview-images strategy: matrix: image: [fastgpt, sandbox, mcp_server] fail-fast: false + permissions: + contents: read + packages: write + attestations: write + id-token: write + pull-requests: write + issues: write uses: ./.github/workflows/preview-fastgpt-push.yml secrets: inherit with: - pr_number: ${{ github.event.pull_request.number }} + pr_number: ${{ format('{0}', github.event.pull_request.number) }} pr_sha: ${{ github.sha }} - run_id: ${{ github.run_id }} + run_id: ${{ format('{0}', github.run_id) }} image: ${{ matrix.image }} diff --git a/.github/workflows/preview-fastgpt-push.yml b/.github/workflows/preview-fastgpt-push.yml index 603291c7b646..b64dc04a4874 100644 --- a/.github/workflows/preview-fastgpt-push.yml +++ b/.github/workflows/preview-fastgpt-push.yml @@ -16,15 +16,16 @@ on: required: true type: string +permissions: + contents: read + packages: write + attestations: write + id-token: write + pull-requests: write + issues: write + jobs: push-preview-images: - permissions: - contents: read - packages: write - attestations: write - id-token: write - pull-requests: write - issues: write # Required for issue-comment (PR comments use Issues API) runs-on: ubuntu-24.04 diff --git a/deploy/args.json b/deploy/args.json index 4e205f3d886a..47fddd72ab8d 100644 --- a/deploy/args.json +++ b/deploy/args.json @@ -14,7 +14,10 @@ "milvus-etcd": "v3.5.5", "milvus-standalone": "v2.4.3", "oceanbase": "4.3.5-lts", - "seekdb": "1.0.1.0-100000392025122619" + "seekdb": "1.0.1.0-100000392025122619", + "opensandbox-server": "v0.1.7", + "opensandbox-execd": "v1.0.7", + "opensandbox-egress": "v1.0.1" }, "images": { "cn": { @@ -32,7 +35,10 @@ "milvus-etcd": "quay.io/coreos/etcd", "milvus-standalone": "milvusdb/milvus", "oceanbase": 
"oceanbase/oceanbase-ce", - "seekdb": "oceanbase/seekdb" + "seekdb": "oceanbase/seekdb", + "opensandbox-server": "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/server", + "opensandbox-execd": "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd", + "opensandbox-egress": "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/egress" }, "global": { "fastgpt": "ghcr.io/labring/fastgpt", @@ -49,7 +55,10 @@ "milvus-etcd": "quay.io/coreos/etcd", "milvus-standalone": "milvusdb/milvus", "oceanbase": "oceanbase/oceanbase-ce", - "seekdb": "oceanbase/seekdb" + "seekdb": "oceanbase/seekdb", + "opensandbox-server": "opensandbox/server", + "opensandbox-execd": "opensandbox/execd", + "opensandbox-egress": "opensandbox/egress" } } } diff --git a/deploy/docker/cn/docker-compose.milvus.yml b/deploy/docker/cn/docker-compose.milvus.yml index 9ad432f9ba9f..b8c09c731e63 100644 --- a/deploy/docker/cn/docker-compose.milvus.yml +++ b/deploy/docker/cn/docker-compose.milvus.yml @@ -183,6 +183,27 @@ services: timeout: 20s retries: 3 + opensandbox-server: + image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/server:v0.1.7 + container_name: opensandbox-server + restart: always + networks: + - fastgpt + extra_hosts: + - 'host.docker.internal:host-gateway' # Enable access to host machine + volumes: + - /var/run/docker.sock:/var/run/docker.sock + configs: + - source: opensandbox-config + target: /etc/opensandbox/config.toml + environment: + - SANDBOX_CONFIG_PATH=/etc/opensandbox/config.toml + healthcheck: + test: ['CMD', 'curl', '-f', 'http://localhost:8090/health'] + interval: 10s + timeout: 5s + retries: 5 + fastgpt: container_name: fastgpt image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.14.8 # git @@ -190,10 +211,13 @@ services: - 3000:3000 networks: - fastgpt + extra_hosts: + - 'host.docker.internal:host-gateway' # Enable access to host machine depends_on: - mongo - sandbox - vectorDB + - opensandbox-server restart: always 
environment: <<: [*x-share-db-config, *x-vec-config, *x-log-config] @@ -215,6 +239,8 @@ services: PLUGIN_TOKEN: *x-plugin-auth-token # sandbox 地址 CODE_SANDBOX_URL: http://sandbox:3000 + # opensandbox server 地址 + OPENSANDBOX_SERVER_URL: http://opensandbox-server:8090 # AI Proxy 的地址,如果配了该地址,优先使用 AIPROXY_API_ENDPOINT: http://aiproxy:3000 # AI Proxy 的 Admin Token,与 AI Proxy 中的环境变量 ADMIN_KEY @@ -335,3 +361,28 @@ networks: aiproxy: vector: +configs: + # opensandbox config + opensandbox-config: + content: | + [server] + host = "0.0.0.0" + port = 8090 + log_level = "INFO" + + [runtime] + type = "docker" + execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.7" + + [egress] + image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/egress:v1.0.1" + + [docker] + network_mode = "bridge" + host_ip = "host.docker.internal" + drop_capabilities = ["AUDIT_WRITE", "MKNOD", "NET_ADMIN", "NET_RAW", "SYS_ADMIN", "SYS_MODULE", "SYS_PTRACE", "SYS_TIME", "SYS_TTY_CONFIG"] + no_new_privileges = true + pids_limit = 512 + + [ingress] + mode = "direct" diff --git a/deploy/docker/cn/docker-compose.oceanbase.yml b/deploy/docker/cn/docker-compose.oceanbase.yml index e39c2748780d..93041b15ee6a 100644 --- a/deploy/docker/cn/docker-compose.oceanbase.yml +++ b/deploy/docker/cn/docker-compose.oceanbase.yml @@ -160,6 +160,27 @@ services: timeout: 20s retries: 3 + opensandbox-server: + image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/server:v0.1.7 + container_name: opensandbox-server + restart: always + networks: + - fastgpt + extra_hosts: + - 'host.docker.internal:host-gateway' # Enable access to host machine + volumes: + - /var/run/docker.sock:/var/run/docker.sock + configs: + - source: opensandbox-config + target: /etc/opensandbox/config.toml + environment: + - SANDBOX_CONFIG_PATH=/etc/opensandbox/config.toml + healthcheck: + test: ['CMD', 'curl', '-f', 'http://localhost:8090/health'] + interval: 10s + timeout: 5s + retries: 5 + 
fastgpt: container_name: fastgpt image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.14.8 # git @@ -167,10 +188,13 @@ services: - 3000:3000 networks: - fastgpt + extra_hosts: + - 'host.docker.internal:host-gateway' # Enable access to host machine depends_on: - mongo - sandbox - vectorDB + - opensandbox-server restart: always environment: <<: [*x-share-db-config, *x-vec-config, *x-log-config] @@ -192,6 +216,8 @@ services: PLUGIN_TOKEN: *x-plugin-auth-token # sandbox 地址 CODE_SANDBOX_URL: http://sandbox:3000 + # opensandbox server 地址 + OPENSANDBOX_SERVER_URL: http://opensandbox-server:8090 # AI Proxy 的地址,如果配了该地址,优先使用 AIPROXY_API_ENDPOINT: http://aiproxy:3000 # AI Proxy 的 Admin Token,与 AI Proxy 中的环境变量 ADMIN_KEY @@ -316,4 +342,28 @@ configs: name: init_sql content: | ALTER SYSTEM SET ob_vector_memory_limit_percentage = 30; - + # opensandbox config + opensandbox-config: + content: | + [server] + host = "0.0.0.0" + port = 8090 + log_level = "INFO" + + [runtime] + type = "docker" + execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.7" + + [egress] + image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/egress:v1.0.1" + + [docker] + network_mode = "bridge" + host_ip = "host.docker.internal" + drop_capabilities = ["AUDIT_WRITE", "MKNOD", "NET_ADMIN", "NET_RAW", "SYS_ADMIN", "SYS_MODULE", "SYS_PTRACE", "SYS_TIME", "SYS_TTY_CONFIG"] + no_new_privileges = true + pids_limit = 512 + + [ingress] + mode = "direct" + diff --git a/deploy/docker/cn/docker-compose.pg.yml b/deploy/docker/cn/docker-compose.pg.yml index 77aad451949d..dd79531fa9f1 100644 --- a/deploy/docker/cn/docker-compose.pg.yml +++ b/deploy/docker/cn/docker-compose.pg.yml @@ -141,6 +141,27 @@ services: timeout: 20s retries: 3 + opensandbox-server: + image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/server:v0.1.7 + container_name: opensandbox-server + restart: always + networks: + - fastgpt + extra_hosts: + - 'host.docker.internal:host-gateway' # 
Enable access to host machine + volumes: + - /var/run/docker.sock:/var/run/docker.sock + configs: + - source: opensandbox-config + target: /etc/opensandbox/config.toml + environment: + - SANDBOX_CONFIG_PATH=/etc/opensandbox/config.toml + healthcheck: + test: ['CMD', 'curl', '-f', 'http://localhost:8090/health'] + interval: 10s + timeout: 5s + retries: 5 + fastgpt: container_name: fastgpt image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.14.8 # git @@ -148,10 +169,13 @@ services: - 3000:3000 networks: - fastgpt + extra_hosts: + - 'host.docker.internal:host-gateway' # Enable access to host machine depends_on: - mongo - sandbox - vectorDB + - opensandbox-server restart: always environment: <<: [*x-share-db-config, *x-vec-config, *x-log-config] @@ -173,6 +197,8 @@ services: PLUGIN_TOKEN: *x-plugin-auth-token # sandbox 地址 CODE_SANDBOX_URL: http://sandbox:3000 + # opensandbox server 地址 + OPENSANDBOX_SERVER_URL: http://opensandbox-server:8090 # AI Proxy 的地址,如果配了该地址,优先使用 AIPROXY_API_ENDPOINT: http://aiproxy:3000 # AI Proxy 的 Admin Token,与 AI Proxy 中的环境变量 ADMIN_KEY @@ -293,3 +319,28 @@ networks: aiproxy: vector: +configs: + # opensandbox config + opensandbox-config: + content: | + [server] + host = "0.0.0.0" + port = 8090 + log_level = "INFO" + + [runtime] + type = "docker" + execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.7" + + [egress] + image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/egress:v1.0.1" + + [docker] + network_mode = "bridge" + host_ip = "host.docker.internal" + drop_capabilities = ["AUDIT_WRITE", "MKNOD", "NET_ADMIN", "NET_RAW", "SYS_ADMIN", "SYS_MODULE", "SYS_PTRACE", "SYS_TIME", "SYS_TTY_CONFIG"] + no_new_privileges = true + pids_limit = 512 + + [ingress] + mode = "direct" diff --git a/deploy/docker/cn/docker-compose.seekdb.yml b/deploy/docker/cn/docker-compose.seekdb.yml index fdc65c85804c..e7024462f309 100644 --- a/deploy/docker/cn/docker-compose.seekdb.yml +++ 
b/deploy/docker/cn/docker-compose.seekdb.yml @@ -147,6 +147,27 @@ services: timeout: 20s retries: 3 + opensandbox-server: + image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/server:v0.1.7 + container_name: opensandbox-server + restart: always + networks: + - fastgpt + extra_hosts: + - 'host.docker.internal:host-gateway' # Enable access to host machine + volumes: + - /var/run/docker.sock:/var/run/docker.sock + configs: + - source: opensandbox-config + target: /etc/opensandbox/config.toml + environment: + - SANDBOX_CONFIG_PATH=/etc/opensandbox/config.toml + healthcheck: + test: ['CMD', 'curl', '-f', 'http://localhost:8090/health'] + interval: 10s + timeout: 5s + retries: 5 + fastgpt: container_name: fastgpt image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.14.8 # git @@ -154,10 +175,13 @@ services: - 3000:3000 networks: - fastgpt + extra_hosts: + - 'host.docker.internal:host-gateway' # Enable access to host machine depends_on: - mongo - sandbox - vectorDB + - opensandbox-server restart: always environment: <<: [*x-share-db-config, *x-vec-config, *x-log-config] @@ -179,6 +203,8 @@ services: PLUGIN_TOKEN: *x-plugin-auth-token # sandbox 地址 CODE_SANDBOX_URL: http://sandbox:3000 + # opensandbox server 地址 + OPENSANDBOX_SERVER_URL: http://opensandbox-server:8090 # AI Proxy 的地址,如果配了该地址,优先使用 AIPROXY_API_ENDPOINT: http://aiproxy:3000 # AI Proxy 的 Admin Token,与 AI Proxy 中的环境变量 ADMIN_KEY @@ -299,3 +325,28 @@ networks: aiproxy: vector: +configs: + # opensandbox config + opensandbox-config: + content: | + [server] + host = "0.0.0.0" + port = 8090 + log_level = "INFO" + + [runtime] + type = "docker" + execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.7" + + [egress] + image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/egress:v1.0.1" + + [docker] + network_mode = "bridge" + host_ip = "host.docker.internal" + drop_capabilities = ["AUDIT_WRITE", "MKNOD", "NET_ADMIN", "NET_RAW", "SYS_ADMIN", "SYS_MODULE", 
"SYS_PTRACE", "SYS_TIME", "SYS_TTY_CONFIG"] + no_new_privileges = true + pids_limit = 512 + + [ingress] + mode = "direct" diff --git a/deploy/docker/cn/docker-compose.zilliz.yml b/deploy/docker/cn/docker-compose.zilliz.yml index d7334f73edd0..d3fe255d6abc 100644 --- a/deploy/docker/cn/docker-compose.zilliz.yml +++ b/deploy/docker/cn/docker-compose.zilliz.yml @@ -125,6 +125,27 @@ services: timeout: 20s retries: 3 + opensandbox-server: + image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/server:v0.1.7 + container_name: opensandbox-server + restart: always + networks: + - fastgpt + extra_hosts: + - 'host.docker.internal:host-gateway' # Enable access to host machine + volumes: + - /var/run/docker.sock:/var/run/docker.sock + configs: + - source: opensandbox-config + target: /etc/opensandbox/config.toml + environment: + - SANDBOX_CONFIG_PATH=/etc/opensandbox/config.toml + healthcheck: + test: ['CMD', 'curl', '-f', 'http://localhost:8090/health'] + interval: 10s + timeout: 5s + retries: 5 + fastgpt: container_name: fastgpt image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.14.8 # git @@ -132,10 +153,13 @@ services: - 3000:3000 networks: - fastgpt + extra_hosts: + - 'host.docker.internal:host-gateway' # Enable access to host machine depends_on: - mongo - sandbox - vectorDB + - opensandbox-server restart: always environment: <<: [*x-share-db-config, *x-vec-config, *x-log-config] @@ -157,6 +181,8 @@ services: PLUGIN_TOKEN: *x-plugin-auth-token # sandbox 地址 CODE_SANDBOX_URL: http://sandbox:3000 + # opensandbox server 地址 + OPENSANDBOX_SERVER_URL: http://opensandbox-server:8090 # AI Proxy 的地址,如果配了该地址,优先使用 AIPROXY_API_ENDPOINT: http://aiproxy:3000 # AI Proxy 的 Admin Token,与 AI Proxy 中的环境变量 ADMIN_KEY @@ -277,3 +303,28 @@ networks: aiproxy: vector: +configs: + # opensandbox config + opensandbox-config: + content: | + [server] + host = "0.0.0.0" + port = 8090 + log_level = "INFO" + + [runtime] + type = "docker" + execd_image = 
"sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.7" + + [egress] + image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/egress:v1.0.1" + + [docker] + network_mode = "bridge" + host_ip = "host.docker.internal" + drop_capabilities = ["AUDIT_WRITE", "MKNOD", "NET_ADMIN", "NET_RAW", "SYS_ADMIN", "SYS_MODULE", "SYS_PTRACE", "SYS_TIME", "SYS_TTY_CONFIG"] + no_new_privileges = true + pids_limit = 512 + + [ingress] + mode = "direct" diff --git a/deploy/docker/global/docker-compose.milvus.yml b/deploy/docker/global/docker-compose.milvus.yml index 3b9ba6f5ea85..a3c5e06e9943 100644 --- a/deploy/docker/global/docker-compose.milvus.yml +++ b/deploy/docker/global/docker-compose.milvus.yml @@ -183,6 +183,27 @@ services: timeout: 20s retries: 3 + opensandbox-server: + image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/server:v0.1.7 + container_name: opensandbox-server + restart: always + networks: + - fastgpt + extra_hosts: + - 'host.docker.internal:host-gateway' # Enable access to host machine + volumes: + - /var/run/docker.sock:/var/run/docker.sock + configs: + - source: opensandbox-config + target: /etc/opensandbox/config.toml + environment: + - SANDBOX_CONFIG_PATH=/etc/opensandbox/config.toml + healthcheck: + test: ['CMD', 'curl', '-f', 'http://localhost:8090/health'] + interval: 10s + timeout: 5s + retries: 5 + fastgpt: container_name: fastgpt image: ghcr.io/labring/fastgpt:v4.14.8 # git @@ -190,10 +211,13 @@ services: - 3000:3000 networks: - fastgpt + extra_hosts: + - 'host.docker.internal:host-gateway' # Enable access to host machine depends_on: - mongo - sandbox - vectorDB + - opensandbox-server restart: always environment: <<: [*x-share-db-config, *x-vec-config, *x-log-config] @@ -215,6 +239,8 @@ services: PLUGIN_TOKEN: *x-plugin-auth-token # sandbox 地址 CODE_SANDBOX_URL: http://sandbox:3000 + # opensandbox server 地址 + OPENSANDBOX_SERVER_URL: http://opensandbox-server:8090 # AI Proxy 的地址,如果配了该地址,优先使用 
AIPROXY_API_ENDPOINT: http://aiproxy:3000 # AI Proxy 的 Admin Token,与 AI Proxy 中的环境变量 ADMIN_KEY @@ -335,3 +361,28 @@ networks: aiproxy: vector: +configs: + # opensandbox config + opensandbox-config: + content: | + [server] + host = "0.0.0.0" + port = 8090 + log_level = "INFO" + + [runtime] + type = "docker" + execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.7" + + [egress] + image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/egress:v1.0.1" + + [docker] + network_mode = "bridge" + host_ip = "host.docker.internal" + drop_capabilities = ["AUDIT_WRITE", "MKNOD", "NET_ADMIN", "NET_RAW", "SYS_ADMIN", "SYS_MODULE", "SYS_PTRACE", "SYS_TIME", "SYS_TTY_CONFIG"] + no_new_privileges = true + pids_limit = 512 + + [ingress] + mode = "direct" diff --git a/deploy/docker/global/docker-compose.oceanbase.yml b/deploy/docker/global/docker-compose.oceanbase.yml index 26cc22ea6294..faef7729ca36 100644 --- a/deploy/docker/global/docker-compose.oceanbase.yml +++ b/deploy/docker/global/docker-compose.oceanbase.yml @@ -160,6 +160,27 @@ services: timeout: 20s retries: 3 + opensandbox-server: + image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/server:v0.1.7 + container_name: opensandbox-server + restart: always + networks: + - fastgpt + extra_hosts: + - 'host.docker.internal:host-gateway' # Enable access to host machine + volumes: + - /var/run/docker.sock:/var/run/docker.sock + configs: + - source: opensandbox-config + target: /etc/opensandbox/config.toml + environment: + - SANDBOX_CONFIG_PATH=/etc/opensandbox/config.toml + healthcheck: + test: ['CMD', 'curl', '-f', 'http://localhost:8090/health'] + interval: 10s + timeout: 5s + retries: 5 + fastgpt: container_name: fastgpt image: ghcr.io/labring/fastgpt:v4.14.8 # git @@ -167,10 +188,13 @@ services: - 3000:3000 networks: - fastgpt + extra_hosts: + - 'host.docker.internal:host-gateway' # Enable access to host machine depends_on: - mongo - sandbox - vectorDB + - 
opensandbox-server restart: always environment: <<: [*x-share-db-config, *x-vec-config, *x-log-config] @@ -192,6 +216,8 @@ services: PLUGIN_TOKEN: *x-plugin-auth-token # sandbox 地址 CODE_SANDBOX_URL: http://sandbox:3000 + # opensandbox server 地址 + OPENSANDBOX_SERVER_URL: http://opensandbox-server:8090 # AI Proxy 的地址,如果配了该地址,优先使用 AIPROXY_API_ENDPOINT: http://aiproxy:3000 # AI Proxy 的 Admin Token,与 AI Proxy 中的环境变量 ADMIN_KEY @@ -316,4 +342,28 @@ configs: name: init_sql content: | ALTER SYSTEM SET ob_vector_memory_limit_percentage = 30; - + # opensandbox config + opensandbox-config: + content: | + [server] + host = "0.0.0.0" + port = 8090 + log_level = "INFO" + + [runtime] + type = "docker" + execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.7" + + [egress] + image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/egress:v1.0.1" + + [docker] + network_mode = "bridge" + host_ip = "host.docker.internal" + drop_capabilities = ["AUDIT_WRITE", "MKNOD", "NET_ADMIN", "NET_RAW", "SYS_ADMIN", "SYS_MODULE", "SYS_PTRACE", "SYS_TIME", "SYS_TTY_CONFIG"] + no_new_privileges = true + pids_limit = 512 + + [ingress] + mode = "direct" + diff --git a/deploy/docker/global/docker-compose.pg.yml b/deploy/docker/global/docker-compose.pg.yml index 88d194aafc1d..6b9a0ce1d97e 100644 --- a/deploy/docker/global/docker-compose.pg.yml +++ b/deploy/docker/global/docker-compose.pg.yml @@ -141,6 +141,27 @@ services: timeout: 20s retries: 3 + opensandbox-server: + image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/server:v0.1.7 + container_name: opensandbox-server + restart: always + networks: + - fastgpt + extra_hosts: + - 'host.docker.internal:host-gateway' # Enable access to host machine + volumes: + - /var/run/docker.sock:/var/run/docker.sock + configs: + - source: opensandbox-config + target: /etc/opensandbox/config.toml + environment: + - SANDBOX_CONFIG_PATH=/etc/opensandbox/config.toml + healthcheck: + test: ['CMD', 'curl', '-f', 
'http://localhost:8090/health'] + interval: 10s + timeout: 5s + retries: 5 + fastgpt: container_name: fastgpt image: ghcr.io/labring/fastgpt:v4.14.8 # git @@ -148,10 +169,13 @@ services: - 3000:3000 networks: - fastgpt + extra_hosts: + - 'host.docker.internal:host-gateway' # Enable access to host machine depends_on: - mongo - sandbox - vectorDB + - opensandbox-server restart: always environment: <<: [*x-share-db-config, *x-vec-config, *x-log-config] @@ -173,6 +197,8 @@ services: PLUGIN_TOKEN: *x-plugin-auth-token # sandbox 地址 CODE_SANDBOX_URL: http://sandbox:3000 + # opensandbox server 地址 + OPENSANDBOX_SERVER_URL: http://opensandbox-server:8090 # AI Proxy 的地址,如果配了该地址,优先使用 AIPROXY_API_ENDPOINT: http://aiproxy:3000 # AI Proxy 的 Admin Token,与 AI Proxy 中的环境变量 ADMIN_KEY @@ -293,3 +319,28 @@ networks: aiproxy: vector: +configs: + # opensandbox config + opensandbox-config: + content: | + [server] + host = "0.0.0.0" + port = 8090 + log_level = "INFO" + + [runtime] + type = "docker" + execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.7" + + [egress] + image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/egress:v1.0.1" + + [docker] + network_mode = "bridge" + host_ip = "host.docker.internal" + drop_capabilities = ["AUDIT_WRITE", "MKNOD", "NET_ADMIN", "NET_RAW", "SYS_ADMIN", "SYS_MODULE", "SYS_PTRACE", "SYS_TIME", "SYS_TTY_CONFIG"] + no_new_privileges = true + pids_limit = 512 + + [ingress] + mode = "direct" diff --git a/deploy/docker/global/docker-compose.seekdb.yml b/deploy/docker/global/docker-compose.seekdb.yml index eca3330c1582..c13a2acbd3be 100644 --- a/deploy/docker/global/docker-compose.seekdb.yml +++ b/deploy/docker/global/docker-compose.seekdb.yml @@ -147,6 +147,27 @@ services: timeout: 20s retries: 3 + opensandbox-server: + image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/server:v0.1.7 + container_name: opensandbox-server + restart: always + networks: + - fastgpt + extra_hosts: + - 
'host.docker.internal:host-gateway' # Enable access to host machine + volumes: + - /var/run/docker.sock:/var/run/docker.sock + configs: + - source: opensandbox-config + target: /etc/opensandbox/config.toml + environment: + - SANDBOX_CONFIG_PATH=/etc/opensandbox/config.toml + healthcheck: + test: ['CMD', 'curl', '-f', 'http://localhost:8090/health'] + interval: 10s + timeout: 5s + retries: 5 + fastgpt: container_name: fastgpt image: ghcr.io/labring/fastgpt:v4.14.8 # git @@ -154,10 +175,13 @@ services: - 3000:3000 networks: - fastgpt + extra_hosts: + - 'host.docker.internal:host-gateway' # Enable access to host machine depends_on: - mongo - sandbox - vectorDB + - opensandbox-server restart: always environment: <<: [*x-share-db-config, *x-vec-config, *x-log-config] @@ -179,6 +203,8 @@ services: PLUGIN_TOKEN: *x-plugin-auth-token # sandbox 地址 CODE_SANDBOX_URL: http://sandbox:3000 + # opensandbox server 地址 + OPENSANDBOX_SERVER_URL: http://opensandbox-server:8090 # AI Proxy 的地址,如果配了该地址,优先使用 AIPROXY_API_ENDPOINT: http://aiproxy:3000 # AI Proxy 的 Admin Token,与 AI Proxy 中的环境变量 ADMIN_KEY @@ -299,3 +325,28 @@ networks: aiproxy: vector: +configs: + # opensandbox config + opensandbox-config: + content: | + [server] + host = "0.0.0.0" + port = 8090 + log_level = "INFO" + + [runtime] + type = "docker" + execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.7" + + [egress] + image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/egress:v1.0.1" + + [docker] + network_mode = "bridge" + host_ip = "host.docker.internal" + drop_capabilities = ["AUDIT_WRITE", "MKNOD", "NET_ADMIN", "NET_RAW", "SYS_ADMIN", "SYS_MODULE", "SYS_PTRACE", "SYS_TIME", "SYS_TTY_CONFIG"] + no_new_privileges = true + pids_limit = 512 + + [ingress] + mode = "direct" diff --git a/deploy/docker/global/docker-compose.ziliiz.yml b/deploy/docker/global/docker-compose.ziliiz.yml index f86081f85c7f..1397a3227c1a 100644 --- a/deploy/docker/global/docker-compose.ziliiz.yml +++ 
b/deploy/docker/global/docker-compose.ziliiz.yml @@ -125,6 +125,27 @@ services: timeout: 20s retries: 3 + opensandbox-server: + image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/server:v0.1.7 + container_name: opensandbox-server + restart: always + networks: + - fastgpt + extra_hosts: + - 'host.docker.internal:host-gateway' # Enable access to host machine + volumes: + - /var/run/docker.sock:/var/run/docker.sock + configs: + - source: opensandbox-config + target: /etc/opensandbox/config.toml + environment: + - SANDBOX_CONFIG_PATH=/etc/opensandbox/config.toml + healthcheck: + test: ['CMD', 'curl', '-f', 'http://localhost:8090/health'] + interval: 10s + timeout: 5s + retries: 5 + fastgpt: container_name: fastgpt image: ghcr.io/labring/fastgpt:v4.14.8 # git @@ -132,10 +153,13 @@ services: - 3000:3000 networks: - fastgpt + extra_hosts: + - 'host.docker.internal:host-gateway' # Enable access to host machine depends_on: - mongo - sandbox - vectorDB + - opensandbox-server restart: always environment: <<: [*x-share-db-config, *x-vec-config, *x-log-config] @@ -157,6 +181,8 @@ services: PLUGIN_TOKEN: *x-plugin-auth-token # sandbox 地址 CODE_SANDBOX_URL: http://sandbox:3000 + # opensandbox server 地址 + OPENSANDBOX_SERVER_URL: http://opensandbox-server:8090 # AI Proxy 的地址,如果配了该地址,优先使用 AIPROXY_API_ENDPOINT: http://aiproxy:3000 # AI Proxy 的 Admin Token,与 AI Proxy 中的环境变量 ADMIN_KEY @@ -277,3 +303,28 @@ networks: aiproxy: vector: +configs: + # opensandbox config + opensandbox-config: + content: | + [server] + host = "0.0.0.0" + port = 8090 + log_level = "INFO" + + [runtime] + type = "docker" + execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.7" + + [egress] + image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/egress:v1.0.1" + + [docker] + network_mode = "bridge" + host_ip = "host.docker.internal" + drop_capabilities = ["AUDIT_WRITE", "MKNOD", "NET_ADMIN", "NET_RAW", "SYS_ADMIN", "SYS_MODULE", "SYS_PTRACE", 
"SYS_TIME", "SYS_TTY_CONFIG"] + no_new_privileges = true + pids_limit = 512 + + [ingress] + mode = "direct" diff --git a/deploy/helm/opensandbox/.helmignore b/deploy/helm/opensandbox/.helmignore new file mode 100644 index 000000000000..0e8a0eb36f4c --- /dev/null +++ b/deploy/helm/opensandbox/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/deploy/helm/opensandbox/CHANGELOG.md b/deploy/helm/opensandbox/CHANGELOG.md new file mode 100644 index 000000000000..15565bc11753 --- /dev/null +++ b/deploy/helm/opensandbox/CHANGELOG.md @@ -0,0 +1,101 @@ +# Changelog + +All notable changes to the OpenSandbox Helm Chart will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ +## [0.1.0] - Initial Release + +### Added + +#### Core Features +- OpenSandbox Kubernetes Controller Helm chart +- **OpenSandbox Server deployment with FastAPI control plane for SDK integration** +- Support for deploying controller with configurable replicas and resources +- BatchSandbox and Pool CRD definitions +- RBAC resources (ClusterRole, ClusterRoleBinding, ServiceAccount) +- Leader election configuration for high availability + +#### Server Features +- Server Deployment with configurable replicas and resources +- Server Service with ClusterIP/NodePort/LoadBalancer support +- ConfigMap-based configuration management +- Optional Ingress support for external access +- Health probes for liveness and readiness checks +- In-cluster Kubernetes configuration +- API key authentication support (optional) +- SDK-compatible REST API on port 8080 + +#### Pool Management +- Default agent-pool with execd and task-executor sidecar +- Pool template support for creating pre-warmed Pod pools +- Configurable Pool capacity (bufferMin, bufferMax, poolMin, poolMax) +- SDK-compatible Pool configuration with execd on port 44772 + +#### Multiple Values Files +- `values.yaml` - Default configuration with agent-pool enabled +- `values-e2e.yaml` - End-to-end testing with minimal resources (2-5 pods) +- Use `--set` or custom values files for production/development overrides + +#### Templates +- `deployment.yaml` - Controller manager deployment +- **`server-deployment.yaml` - Server deployment** +- **`server-service.yaml` - Server service** +- **`server-configmap.yaml` - Server configuration** +- **`server-ingress.yaml` - Server ingress (optional)** +- `pools.yaml` - Dynamic Pool resource generation from values +- `serviceaccount.yaml` - Service account for controller +- `clusterrole.yaml` - RBAC cluster role +- `clusterrolebinding.yaml` - RBAC cluster role binding +- `leader-election-role.yaml` - Leader election RBAC +- `leader-election-rolebinding.yaml` - Leader election binding +- 
`metrics-service.yaml` - Metrics service endpoint +- `metrics-rbac.yaml` - Metrics RBAC resources +- `servicemonitor.yaml` - Prometheus ServiceMonitor (optional) +- `extra-roles.yaml` - User management roles (viewer, editor) +- `poddisruptionbudget.yaml` - High availability Pod disruption budget +- `NOTES.txt` - Post-installation guidance +- `_helpers.tpl` - Template helper functions + +#### Scripts +- `scripts/install.sh` - Interactive installation wizard with environment selection +- `scripts/uninstall.sh` - Safe uninstallation with resource cleanup +- `scripts/e2e-test.sh` - End-to-end validation (Install → Server → Pool → SDK → Uninstall) +- `scripts/README.md` - Comprehensive script documentation and troubleshooting guide + +#### Configuration Options +- `nameOverride` and `fullnameOverride` for custom resource naming +- **`server.enabled` - Enable/disable server deployment (default: true)** +- **`server.service.type` - Service type (ClusterIP/NodePort/LoadBalancer)** +- **`server.service.nodePort` - NodePort value (optional)** +- **`server.ingress.enabled` - Enable Ingress for external access** +- **`server.config.server.apiKey` - Optional API key authentication** +- `healthProbePort` - Configurable health check port (default: 8081) +- `healthProbes.liveness` - Liveness probe timing configuration +- `healthProbes.readiness` - Readiness probe timing configuration +- `podDisruptionBudget.enabled` - Optional PDB for HA deployments +- `namespaceOverride` - Custom namespace (default: opensandbox) + +#### Documentation +- Comprehensive README.md with installation and configuration guide +- examples/README.md with usage scenarios and best practices +- examples/pool-agent-production.yaml with production-ready Pool configuration +- examples/DIRECTORY_STRUCTURE.md explaining file organization +- Example YAML files for Pool and BatchSandbox resources + +### Configuration Defaults +- Controller image: `opensandbox/controller:dev` +- **Server image: 
`opensandbox/server:v0.1.0`** +- Task executor image: `opensandbox/task-executor:dev` +- Image pull policy: `Never` (for local development) +- Namespace: `opensandbox` +- Controller replicas: 1 (3 in production values) +- **Server replicas: 1** +- **Server enabled: true (required for SDK usage)** +- Default Pool enabled: `agent-pool` with 2-5 pods (E2E) or 10-100 pods (default) + +### Notes +- This is the initial release of the Helm chart +- All templates have been tested with `helm lint` and E2E validation +- Chart supports Kubernetes 1.19+ diff --git a/deploy/helm/opensandbox/Chart.yaml b/deploy/helm/opensandbox/Chart.yaml new file mode 100644 index 000000000000..e292bebcc3f0 --- /dev/null +++ b/deploy/helm/opensandbox/Chart.yaml @@ -0,0 +1,17 @@ +apiVersion: v2 +name: opensandbox-controller +description: A Helm chart for deploying OpenSandbox Kubernetes Controller +type: application +version: 0.1.0 +appVersion: "0.0.1" +keywords: + - opensandbox + - sandbox + - kubernetes + - operator + - controller +home: https://github.com/alibaba/OpenSandbox +maintainers: + - name: OpenSandbox Team + url: https://github.com/alibaba/OpenSandbox +icon: https://avatars.githubusercontent.com/u/1961952 diff --git a/deploy/helm/opensandbox/Makefile b/deploy/helm/opensandbox/Makefile new file mode 100644 index 000000000000..272db3cf31b4 --- /dev/null +++ b/deploy/helm/opensandbox/Makefile @@ -0,0 +1,134 @@ +# Helm Chart Makefile for OpenSandbox Controller + +CHART_NAME := opensandbox-controller +RELEASE_NAME := opensandbox-controller +NAMESPACE := opensandbox +VALUES_FILE := values.yaml + +# Helm commands +HELM := helm +KUBECTL := kubectl + +.PHONY: help +help: ## Display this help + @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n\nTargets:\n"} /^[a-zA-Z_-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 }' $(MAKEFILE_LIST) + +.PHONY: lint +lint: ## Lint the Helm chart + $(HELM) lint . 
+ +.PHONY: template +template: ## Render chart templates to stdout + $(HELM) template $(RELEASE_NAME) . -f $(VALUES_FILE) + +.PHONY: template-debug +template-debug: ## Render chart templates with debug output + $(HELM) template $(RELEASE_NAME) . -f $(VALUES_FILE) --debug + +.PHONY: dry-run +dry-run: ## Perform a dry-run installation + $(HELM) install $(RELEASE_NAME) . \ + -f $(VALUES_FILE) \ + --namespace $(NAMESPACE) \ + --create-namespace \ + --dry-run --debug + +.PHONY: install +install: ## Install the chart + $(HELM) install $(RELEASE_NAME) . \ + -f $(VALUES_FILE) \ + --namespace $(NAMESPACE) \ + --create-namespace + +.PHONY: install-e2e +install-e2e: ## Install the chart with e2e test values + $(HELM) install $(RELEASE_NAME) . \ + -f values-e2e.yaml \ + --namespace $(NAMESPACE) \ + --create-namespace + +.PHONY: upgrade +upgrade: ## Upgrade the chart + $(HELM) upgrade $(RELEASE_NAME) . \ + -f $(VALUES_FILE) \ + --namespace $(NAMESPACE) + +.PHONY: upgrade-e2e +upgrade-e2e: ## Upgrade with e2e test values + $(HELM) upgrade $(RELEASE_NAME) . \ + -f values-e2e.yaml \ + --namespace $(NAMESPACE) + +.PHONY: uninstall +uninstall: ## Uninstall the chart + $(HELM) uninstall $(RELEASE_NAME) --namespace $(NAMESPACE) + +.PHONY: status +status: ## Show release status + $(HELM) status $(RELEASE_NAME) --namespace $(NAMESPACE) + +.PHONY: list +list: ## List all releases + $(HELM) list --namespace $(NAMESPACE) + +.PHONY: get-values +get-values: ## Get values for the release + $(HELM) get values $(RELEASE_NAME) --namespace $(NAMESPACE) + +.PHONY: get-all +get-all: ## Get all information about the release + $(HELM) get all $(RELEASE_NAME) --namespace $(NAMESPACE) + +.PHONY: package +package: ## Package the chart into an archive + $(HELM) package . + +.PHONY: verify-install +verify-install: ## Verify the installation + @echo "Checking deployment..." + $(KUBECTL) get deployment -n $(NAMESPACE) + @echo "\nChecking pods..." 
+ $(KUBECTL) get pods -n $(NAMESPACE) + @echo "\nChecking CRDs..." + $(KUBECTL) get crds | grep sandbox.opensandbox.io + +.PHONY: logs +logs: ## Show controller logs + $(KUBECTL) logs -n $(NAMESPACE) -l control-plane=controller-manager -f + +.PHONY: clean-crds +clean-crds: ## Delete CRDs (use with caution!) + $(KUBECTL) delete crd batchsandboxes.sandbox.opensandbox.io + $(KUBECTL) delete crd pools.sandbox.opensandbox.io + +.PHONY: test-connection +test-connection: ## Test if kubectl can connect to the cluster + $(KUBECTL) cluster-info + $(KUBECTL) get nodes + +.PHONY: create-namespace +create-namespace: ## Create the namespace + $(KUBECTL) create namespace $(NAMESPACE) --dry-run=client -o yaml | $(KUBECTL) apply -f - + +.PHONY: delete-namespace +delete-namespace: ## Delete the namespace + $(KUBECTL) delete namespace $(NAMESPACE) + +# Advanced targets +.PHONY: diff +diff: ## Show diff between current release and chart + @command -v helm-diff >/dev/null 2>&1 || { echo "helm-diff plugin required. Install: helm plugin install https://github.com/databus23/helm-diff"; exit 1; } + $(HELM) diff upgrade $(RELEASE_NAME) . 
-f $(VALUES_FILE) --namespace $(NAMESPACE) + +.PHONY: history +history: ## Show release history + $(HELM) history $(RELEASE_NAME) --namespace $(NAMESPACE) + +.PHONY: rollback +rollback: ## Rollback to previous release + $(HELM) rollback $(RELEASE_NAME) --namespace $(NAMESPACE) + +.PHONY: rollback-to +rollback-to: ## Rollback to specific revision (usage: make rollback-to REVISION=2) + @if [ -z "$(REVISION)" ]; then echo "Please specify REVISION=<revision>"; exit 1; fi + $(HELM) rollback $(RELEASE_NAME) $(REVISION) --namespace $(NAMESPACE) diff --git a/deploy/helm/opensandbox/README.md b/deploy/helm/opensandbox/README.md new file mode 100644 index 000000000000..6c441020cf86 --- /dev/null +++ b/deploy/helm/opensandbox/README.md @@ -0,0 +1,524 @@ +# OpenSandbox Controller Helm Chart + +This Helm chart deploys the OpenSandbox Kubernetes Controller, which manages sandbox environments through custom resources. + +## Prerequisites + +- Kubernetes 1.19+ +- Helm 3.0+ +- Container runtime (Docker, containerd, etc.) +- **Three container images required**: + 1. **Controller image**: The main controller manager + 2. **Server image**: FastAPI control plane for SDK usage + 3. **Task Executor image**: Sidecar container for task execution (optional but required for task features) + +## Important: Image Requirements + +OpenSandbox requires **three separate images**: + +### 1. Controller Image +The main controller that manages BatchSandbox and Pool resources. + +```bash +# Build controller image +make docker-build IMG=your-registry/opensandbox-controller:v1.0.0 +docker push your-registry/opensandbox-controller:v1.0.0 +``` + +### 2. Server Image +FastAPI control plane that exposes REST API for SDK usage. **This is the entry point for SDK clients**. + +```bash +# Build server image (from server directory) +cd ../../../server +TAG=v1.0.0 ./build.sh +# Or manually: +docker build -t your-registry/opensandbox-server:v1.0.0 . 
+docker push your-registry/opensandbox-server:v1.0.0 +``` + +**Note**: The server is **required for SDK usage**. If you only use `kubectl` to manage CRDs directly, you can disable it by setting `server.enabled=false`. + +### 3. Task Executor Image +A sidecar container injected into Pool pods for task execution. **This is not deployed as a separate Deployment**, but configured in Pool resources. + +```bash +# Build task-executor image +make docker-build-task-executor TASK_EXECUTOR_IMG=your-registry/opensandbox-task-executor:v1.0.0 +docker push your-registry/opensandbox-task-executor:v1.0.0 +``` + +**Note**: The task-executor image is only needed if you want to use task execution features. For basic sandbox management without tasks, only the controller and server images are required. + +## Features + +- **SDK Control Plane**: FastAPI server for Python SDK integration +- **Batch Sandbox Management**: Create and manage multiple identical sandbox environments +- **Resource Pooling**: Maintain pre-warmed resource pools for rapid provisioning +- **Task Orchestration**: Optional integrated task execution engine +- **High Availability**: Leader election support for multiple replicas +- **Metrics & Monitoring**: Prometheus metrics endpoint with optional ServiceMonitor +- **Flexible Access**: ClusterIP, NodePort, or Ingress support for server access + +## Installation + +### Quick Start + +```bash +# Add the chart repository (if published) +helm repo add opensandbox https://charts.opensandbox.io +helm repo update + +# Install the chart with all images +helm install opensandbox-controller opensandbox/opensandbox-controller \ + --set controllerManager.image.repository=your-registry/opensandbox-controller \ + --set controllerManager.image.tag=v1.0.0 \ + --set server.image.repository=your-registry/opensandbox-server \ + --set server.image.tag=v1.0.0 \ + --set taskExecutor.image.repository=your-registry/opensandbox-task-executor \ + --set taskExecutor.image.tag=v1.0.0 + +# Or 
install from local directory +helm install opensandbox-controller ./opensandbox-controller \ + --set controllerManager.image.repository=your-registry/opensandbox-controller \ + --set controllerManager.image.tag=v1.0.0 \ + --set server.image.repository=your-registry/opensandbox-server \ + --set server.image.tag=v1.0.0 \ + --set taskExecutor.image.repository=your-registry/opensandbox-task-executor \ + --set taskExecutor.image.tag=v1.0.0 +``` + +### Custom Installation + +```bash +# Install with custom values +helm install opensandbox-controller ./opensandbox-controller \ + --set controllerManager.image.repository=your-registry/sandbox-controller \ + --set controllerManager.image.tag=v1.0.0 \ + --namespace opensandbox \ + --create-namespace + +# Install with values file +helm install opensandbox-controller ./opensandbox-controller \ + -f custom-values.yaml +``` + +## Configuration + +The following table lists the configurable parameters of the chart and their default values. + +### Controller Manager Configuration + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `controllerManager.image.repository` | Controller image repository | `opensandbox/controller` | +| `controllerManager.image.tag` | Controller image tag | `dev` | +| `controllerManager.image.pullPolicy` | Image pull policy | `Never` | +| `controllerManager.replicas` | Number of controller replicas | `1` | +| `controllerManager.resources.limits.cpu` | CPU limit | `500m` | +| `controllerManager.resources.limits.memory` | Memory limit | `128Mi` | +| `controllerManager.resources.requests.cpu` | CPU request | `10m` | +| `controllerManager.resources.requests.memory` | Memory request | `64Mi` | +| `controllerManager.leaderElect` | Enable leader election | `true` | +| `controllerManager.logLevel` | Log verbosity level | `3` | + +### Task Executor Configuration + +**Important**: The task-executor is not deployed as a separate service. 
It is configured as a sidecar container in Pool resources. These settings provide the default image and resource configurations for reference when creating Pools. + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `taskExecutor.image.repository` | Task Executor image repository | `opensandbox/task-executor` | +| `taskExecutor.image.tag` | Task Executor image tag | `dev` | +| `taskExecutor.image.pullPolicy` | Image pull policy | `Never` | +| `taskExecutor.resources.limits.cpu` | Recommended CPU limit for sidecar | `500m` | +| `taskExecutor.resources.limits.memory` | Recommended memory limit for sidecar | `256Mi` | +| `taskExecutor.resources.requests.cpu` | Recommended CPU request for sidecar | `100m` | +| `taskExecutor.resources.requests.memory` | Recommended memory request for sidecar | `128Mi` | + +### Server Configuration + +**Important**: The server is a FastAPI control plane that exposes REST API for SDK usage. It is **required for SDK integration** but can be disabled if you only use `kubectl` to manage CRDs. 
+ +| Parameter | Description | Default | +|-----------|-------------|---------| +| `server.enabled` | Enable server deployment | `true` | +| `server.image.repository` | Server image repository | `opensandbox/server` | +| `server.image.tag` | Server image tag | `v0.1.0` | +| `server.image.pullPolicy` | Image pull policy | `Never` | +| `server.replicas` | Number of server replicas | `1` | +| `server.resources.limits.cpu` | CPU limit | `1` | +| `server.resources.limits.memory` | Memory limit | `512Mi` | +| `server.resources.requests.cpu` | CPU request | `100m` | +| `server.resources.requests.memory` | Memory request | `256Mi` | +| `server.config.server.host` | Server listen host | `0.0.0.0` | +| `server.config.server.port` | Server listen port | `8080` | +| `server.config.server.logLevel` | Log level (INFO/DEBUG/WARNING/ERROR) | `INFO` | +| `server.config.server.apiKey` | Optional API key for authentication | `""` | +| `server.config.runtime.type` | Runtime type (kubernetes/docker) | `kubernetes` | +| `server.config.runtime.execdImage` | execd image for non-pool mode | `opensandbox/execd:v1.0.5` | +| `server.config.kubernetes.workloadProvider` | Workload provider type | `batchsandbox` | +| `server.service.type` | Service type (ClusterIP/NodePort/LoadBalancer) | `ClusterIP` | +| `server.service.port` | Service port | `8080` | +| `server.service.nodePort` | NodePort (when type=NodePort) | `""` | +| `server.ingress.enabled` | Enable Ingress | `false` | +| `server.ingress.className` | Ingress class name | `""` | +| `server.ingress.hosts` | Ingress host configuration | `[]` | + +### Namespace Configuration + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `namespaceOverride` | Override the default namespace name | `"opensandbox"` | + +**Note**: Both the controller, server, and user resources (Pool, BatchSandbox) use the same namespace for simplicity. 
+ +The server automatically uses in-cluster Kubernetes configuration and reads the namespace from the Helm chart configuration. + +### Accessing the Server + +#### Option 1: Port Forward (Development) + +```bash +# Forward local port to server +kubectl port-forward -n opensandbox svc/opensandbox-controller-server 8080:8080 + +# Test connection +curl http://localhost:8080/health +``` + +#### Option 2: NodePort (Local Development) + +```bash +# Install with NodePort +helm install opensandbox-controller ./opensandbox-controller \ + --set server.service.type=NodePort \ + --set server.service.nodePort=30080 + +# Access via node IP +curl http://:30080/health +``` + +#### Option 3: Ingress (Production) + +```bash +# Install with Ingress +helm install opensandbox-controller ./opensandbox-controller \ + --set server.ingress.enabled=true \ + --set server.ingress.className=nginx \ + --set server.ingress.hosts[0].host=opensandbox.example.com \ + --set server.ingress.hosts[0].paths[0].path=/ \ + --set server.ingress.hosts[0].paths[0].pathType=Prefix + +# Access via domain +curl https://opensandbox.example.com/health +``` + +### RBAC Configuration + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `rbac.create` | Create RBAC resources | `true` | +| `rbac.serviceAccount.create` | Create ServiceAccount | `true` | +| `rbac.serviceAccount.name` | ServiceAccount name (if not created) | `""` | + +### Metrics Configuration + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `metrics.enabled` | Enable metrics service | `true` | +| `metrics.service.type` | Metrics service type | `ClusterIP` | +| `metrics.service.port` | Metrics service port | `8443` | +| `metrics.serviceMonitor.enabled` | Create ServiceMonitor (Prometheus Operator) | `false` | +| `metrics.serviceMonitor.interval` | Scrape interval | `30s` | + +### CRD Configuration + +| Parameter | Description | Default | +|-----------|-------------|---------| +| 
`crds.install` | Install CRDs | `true` | + +### Extra Roles Configuration + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `extraRoles.batchsandboxEditor.enabled` | Create BatchSandbox editor role | `true` | +| `extraRoles.batchsandboxViewer.enabled` | Create BatchSandbox viewer role | `true` | +| `extraRoles.poolEditor.enabled` | Create Pool editor role | `true` | +| `extraRoles.poolViewer.enabled` | Create Pool viewer role | `true` | + +## Usage Examples + +### Example 1: Install with Custom Image + +```bash +helm install opensandbox-controller ./opensandbox-controller \ + --set controllerManager.image.repository=myregistry.com/sandbox-controller \ + --set controllerManager.image.tag=latest +``` + +### Example 2: Install with High Availability + +```bash +helm install opensandbox-controller ./opensandbox-controller \ + --set controllerManager.replicas=3 \ + --set controllerManager.resources.requests.cpu=100m \ + --set controllerManager.resources.requests.memory=256Mi +``` + +### Example 3: Install with Prometheus Monitoring + +```bash +helm install opensandbox-controller ./opensandbox-controller \ + --set metrics.serviceMonitor.enabled=true +``` + +### Example 4: Install without CRDs (for upgrades) + +```bash +helm upgrade opensandbox-controller ./opensandbox-controller \ + --set crds.install=false +``` + +## Creating Resources + +After installation, you can create OpenSandbox resources: + +### Create a Pool + +```yaml +apiVersion: sandbox.opensandbox.io/v1alpha1 +kind: Pool +metadata: + name: example-pool +spec: + minBufferSize: 2 + maxBufferSize: 5 + capacity: 10 + sandboxTemplate: + spec: + image: ubuntu:latest + command: ["sleep", "infinity"] +``` + +### Create a BatchSandbox + +```yaml +apiVersion: sandbox.opensandbox.io/v1alpha1 +kind: BatchSandbox +metadata: + name: example-batchsandbox +spec: + replicas: 3 + ttlSecondsAfterFinished: 3600 + sandboxTemplate: + spec: + image: ubuntu:latest + command: ["sleep", "infinity"] 
+``` + +## Using with SDK + +The OpenSandbox Python SDK connects to the server to manage sandboxes. The server must be accessible from where you run the SDK. + +### Access Methods + +#### 1. Port Forward (Recommended for Development) + +```bash +# Forward local port to server +kubectl port-forward -n opensandbox svc/opensandbox-controller-server 8080:8080 +``` + +Then use SDK with `localhost:8080`: + +```python +from opensandbox import Sandbox +from opensandbox.config import ConnectionConfig + +sandbox = await Sandbox.create( + "ubuntu:latest", + entrypoint=["sleep", "infinity"], + connection_config=ConnectionConfig(domain="localhost:8080"), + extensions={"poolRef": "agent-pool"} +) +``` + +#### 2. In-Cluster Access + +If running SDK inside the same Kubernetes cluster: + +```python +sandbox = await Sandbox.create( + "ubuntu:latest", + entrypoint=["sleep", "infinity"], + connection_config=ConnectionConfig( + domain="opensandbox-controller-server.opensandbox.svc.cluster.local:8080" + ), + extensions={"poolRef": "agent-pool"} +) +``` + +#### 3. NodePort / LoadBalancer / Ingress + +For external access, configure the service type accordingly and use the appropriate domain. + +### SDK Usage Examples + +The OpenSandbox Python SDK supports two creation modes: + +### Pooled Mode (Recommended) + +Fast creation using pre-warmed pools. **Image must match the Pool's configuration**: + +```python +from opensandbox import Sandbox +from opensandbox.config import ConnectionConfig + +sandbox = await Sandbox.create( + "ubuntu:latest", # Must match Pool's image + entrypoint=["sleep", "infinity"], + connection_config=ConnectionConfig(domain="localhost:8080"), # Server address + extensions={"poolRef": "agent-pool"} # Reference to Pool name +) +``` + +**Important**: When using `poolRef`, the SDK's `image` parameter will be **ignored** - the Pool's pre-configured image is used instead. Only `entrypoint` and `env` can be customized. 
+ +### Non-pooled Mode + +Direct creation with custom image and resources: + +```python +sandbox = await Sandbox.create( + "python:3.11", # Any image + resource={"cpu": "1", "memory": "500Mi"}, + connection_config=ConnectionConfig(domain="localhost:8080") + # No poolRef specified +) +``` + +### Connect to Existing Sandbox + +```python +# List all sandboxes +from opensandbox import SandboxManager +manager = SandboxManager(connection_config=ConnectionConfig(domain="localhost:8080")) +sandboxes = await manager.list_sandbox_infos(SandboxFilter()) + +# Connect to existing +sandbox = await Sandbox.connect( + sandbox_id="", + connection_config=ConnectionConfig(domain="localhost:8080") +) +``` + +For detailed SDK integration guide including troubleshooting, see [examples/README.md](examples/README.md) + +## Upgrading + +```bash +# Upgrade to a new version +helm upgrade opensandbox-controller ./opensandbox-controller \ + --set controllerManager.image.tag=v1.1.0 + +# Upgrade with new values +helm upgrade opensandbox-controller ./opensandbox-controller \ + -f new-values.yaml +``` + +## Uninstalling + +```bash +# Uninstall the release +helm uninstall opensandbox-controller + +# Note: CRDs are not automatically deleted. 
To remove them: +kubectl delete crd batchsandboxes.sandbox.opensandbox.io +kubectl delete crd pools.sandbox.opensandbox.io +``` + +## Troubleshooting + +### Check Controller Status + +```bash +# Check deployment +kubectl get deployment -n opensandbox + +# Check pods +kubectl get pods -n opensandbox + +# Check logs +kubectl logs -n opensandbox -l control-plane=controller-manager +``` + +### Verify CRDs + +```bash +# List CRDs +kubectl get crds | grep sandbox.opensandbox.io + +# Describe CRD +kubectl describe crd batchsandboxes.sandbox.opensandbox.io +``` + +### Check RBAC + +```bash +# Check ServiceAccount +kubectl get sa -n opensandbox + +# Check ClusterRoles +kubectl get clusterrole | grep sandbox-k8s + +# Check ClusterRoleBindings +kubectl get clusterrolebinding | grep sandbox-k8s +``` + +## Development + +### Quick Start Scripts + +The chart includes utility scripts in the `scripts/` directory: + +- **`scripts/install.sh`** - Interactive installation wizard +- **`scripts/uninstall.sh`** - Safe uninstallation with cleanup +- **`scripts/e2e-test.sh`** - End-to-end validation + +See [scripts/README.md](scripts/README.md) for detailed documentation. + +### Linting the Chart + +```bash +helm lint ./opensandbox-controller +``` + +### Testing the Chart + +```bash +# Dry run +helm install opensandbox-controller ./opensandbox-controller --dry-run --debug + +# Template rendering +helm template opensandbox-controller ./opensandbox-controller +``` + +### Package the Chart + +```bash +helm package ./opensandbox-controller +``` + +## Contributing + +Please refer to the main [OpenSandbox repository](https://github.com/alibaba/OpenSandbox) for contribution guidelines. 
+ +## License + +Apache License 2.0 + +## Support + +- Documentation: https://github.com/alibaba/OpenSandbox +- Issues: https://github.com/alibaba/OpenSandbox/issues diff --git a/deploy/helm/opensandbox/crds/sandbox.opensandbox.io_batchsandboxes.yaml b/deploy/helm/opensandbox/crds/sandbox.opensandbox.io_batchsandboxes.yaml new file mode 100644 index 000000000000..72c43bdda008 --- /dev/null +++ b/deploy/helm/opensandbox/crds/sandbox.opensandbox.io_batchsandboxes.yaml @@ -0,0 +1,189 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.18.0 + name: batchsandboxes.sandbox.opensandbox.io +spec: + group: sandbox.opensandbox.io + names: + kind: BatchSandbox + listKind: BatchSandboxList + plural: batchsandboxes + shortNames: + - bsbx + singular: batchsandbox + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: The desired number of pods. + jsonPath: .spec.replicas + name: DESIRED + type: integer + - description: The number of currently all pods. + jsonPath: .status.replicas + name: TOTAL + type: integer + - description: The number of currently all allocated pods. + jsonPath: .status.allocated + name: ALLOCATED + type: integer + - description: The number of currently all ready pods. + jsonPath: .status.ready + name: Ready + type: integer + - description: The number of currently all running tasks. + jsonPath: .status.taskRunning + name: TASK_RUNNING + priority: 1 + type: integer + - description: The number of currently all succeed tasks. + jsonPath: .status.taskSucceed + name: TASK_SUCCEED + priority: 1 + type: integer + - description: The number of currently all failed tasks. + jsonPath: .status.taskFailed + name: TASK_FAILED + priority: 1 + type: integer + - description: The number of currently all unknown tasks. 
+ jsonPath: .status.taskUnknown + name: TASK_UNKNOWN + priority: 1 + type: integer + - description: sandbox expire time + jsonPath: .spec.expireTime + name: EXPIRE + type: string + - description: CreationTimestamp is a timestamp representing the server time when + this object was created. It is not guaranteed to be set in happens-before + order across separate operations. Clients may not set this value. It is represented + in RFC3339 form and is in UTC. + jsonPath: .metadata.creationTimestamp + name: AGE + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: BatchSandbox is the Schema for the batchsandboxes API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: BatchSandboxSpec defines the desired state of BatchSandbox. + properties: + expireTime: + description: |- + ExpireTime - Absolute time when the batch-sandbox is deleted. + If a time in the past is provided, the batch-sandbox will be deleted immediately. + format: date-time + type: string + poolRef: + description: |- + PoolRef references the Pool resource name for pooled sandbox creation. + Mutually exclusive with Template - use PoolRef for pool-based allocation or Template for direct sandbox creation. + type: string + replicas: + default: 1 + description: Replicas is the number of desired replicas. 
+ format: int32 + minimum: 0 + type: integer + shardPatches: + description: ShardPatches indicates patching to the Template for BatchSandbox. + x-kubernetes-preserve-unknown-fields: true + shardTaskPatches: + description: ShardTaskPatches indicates patching to the TaskTemplate + for individual Task. + x-kubernetes-preserve-unknown-fields: true + taskResourcePolicyWhenCompleted: + default: Retain + description: |- + TaskResourcePolicyWhenCompleted specifies how resources should be handled once a task reaches a completed state (SUCCEEDED or FAILED). + - Retain: Keep the resources until the BatchSandbox is deleted. + - Release: Free the resources immediately when the task completes. + type: string + taskTemplate: + description: |- + Task is a custom task spec that is automatically dispatched after the sandbox is successfully created. + The Sandbox is responsible for managing the lifecycle of the task. + x-kubernetes-preserve-unknown-fields: true + template: + description: Template describes the pods that will be created. + x-kubernetes-preserve-unknown-fields: true + required: + - replicas + type: object + status: + description: BatchSandboxStatus defines the observed state of BatchSandbox. + properties: + allocated: + description: "\tAllocated is the number of actual scheduled Pod" + format: int32 + type: integer + observedGeneration: + description: |- + ObservedGeneration is the most recent generation observed for this BatchSandbox. It corresponds to the + BatchSandbox's generation, which is updated on mutation by the API Server. 
+ format: int64 + type: integer + ready: + description: "\tReady is the number of actual Ready Pod" + format: int32 + type: integer + replicas: + description: Replicas is the number of actual Pods + format: int32 + type: integer + taskFailed: + description: TaskFailed is the number of Failed task + format: int32 + type: integer + taskPending: + description: TaskPending is the number of Pending task which is unassigned + format: int32 + type: integer + taskRunning: + description: TaskRunning is the number of Running task + format: int32 + type: integer + taskSucceed: + description: TaskSucceed is the number of Succeed task + format: int32 + type: integer + taskUnknown: + description: TaskUnknown is the number of Unknown task + format: int32 + type: integer + required: + - allocated + - ready + - replicas + - taskFailed + - taskPending + - taskRunning + - taskSucceed + - taskUnknown + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/deploy/helm/opensandbox/crds/sandbox.opensandbox.io_pools.yaml b/deploy/helm/opensandbox/crds/sandbox.opensandbox.io_pools.yaml new file mode 100644 index 000000000000..8b987cadad53 --- /dev/null +++ b/deploy/helm/opensandbox/crds/sandbox.opensandbox.io_pools.yaml @@ -0,0 +1,129 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.18.0 + name: pools.sandbox.opensandbox.io +spec: + group: sandbox.opensandbox.io + names: + kind: Pool + listKind: PoolList + plural: pools + singular: pool + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: The number of all nodes in pool. + jsonPath: .status.total + name: TOTAL + type: integer + - description: The number of allocated nodes in pool. + jsonPath: .status.allocated + name: ALLOCATED + type: integer + - description: The number of available nodes in pool. 
+ jsonPath: .status.available + name: AVAILABLE + type: integer + name: v1alpha1 + schema: + openAPIV3Schema: + description: Pool is the Schema for the pools API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: PoolSpec defines the desired state of Pool. + properties: + capacitySpec: + description: CapacitySpec controls the size of the resource pool. + properties: + bufferMax: + description: BufferMax is the maximum number of nodes kept in + the warm buffer. + format: int32 + minimum: 0 + type: integer + bufferMin: + description: BufferMin is the minimum number of nodes that must + remain in the buffer. + format: int32 + minimum: 0 + type: integer + poolMax: + description: PoolMax is the maximum total number of nodes allowed + in the entire pool. + format: int32 + minimum: 0 + type: integer + poolMin: + description: PoolMin is the minimum total size of the pool. + format: int32 + minimum: 0 + type: integer + required: + - bufferMax + - bufferMin + - poolMax + - poolMin + type: object + template: + description: Pod Template used to create pre-warmed nodes in the pool. + x-kubernetes-preserve-unknown-fields: true + required: + - capacitySpec + type: object + status: + description: PoolStatus defines the observed state of Pool. 
+ properties: + allocated: + description: Allocated is the number of nodes currently allocated + to sandboxes. + format: int32 + type: integer + available: + description: Available is the number of nodes currently available + in the pool. + format: int32 + type: integer + observedGeneration: + description: |- + ObservedGeneration is the most recent generation observed for this BatchSandbox. It corresponds to the + BatchSandbox's generation, which is updated on mutation by the API Server. + format: int64 + type: integer + revision: + description: Revision is the latest version of pool + type: string + total: + description: Total is the total number of nodes in the pool. + format: int32 + type: integer + required: + - allocated + - available + - revision + - total + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/deploy/helm/opensandbox/examples/DIRECTORY_STRUCTURE.md b/deploy/helm/opensandbox/examples/DIRECTORY_STRUCTURE.md new file mode 100644 index 000000000000..f75503856b29 --- /dev/null +++ b/deploy/helm/opensandbox/examples/DIRECTORY_STRUCTURE.md @@ -0,0 +1,195 @@ +# Examples 目录结构 + +本文档说明 `examples/` 目录的组织结构和设计原则。 + +## 📂 目录结构 + +``` +examples/ +├── README.md # 主文档:快速开始、最佳实践 +├── DIRECTORY_STRUCTURE.md # 本文档:目录结构说明 +├── pool-examples.md # 详细的 Pool 配置说明 +│ +├── pool-agent-production.yaml # 🌟 生产级 Agent Pool(推荐) +├── pool-sdk-compatible.yaml # SDK 基础 Pool(execd only) +├── pool-sdk-with-tasks.yaml # SDK 完整 Pool(execd + task-executor) +│ +├── batchsandbox-basic.yaml # Non-pooled 模式示例 +└── batchsandbox-with-tasks.yaml # Pooled 批量任务示例 +``` + +## 📝 文件分类 + +### Pool 配置文件(3 个) + +| 文件 | 类型 | SDK 支持 | 自定义 entrypoint | 推荐度 | +|------|------|----------|-----------------|--------| +| `pool-agent-production.yaml` | 生产级 | ✅ | ✅ | ⭐⭐⭐⭐⭐ | +| `pool-sdk-with-tasks.yaml` | 完整功能 | ✅ | ✅ | ⭐⭐⭐⭐ | +| `pool-sdk-compatible.yaml` | 基础功能 | ✅ | ❌ | ⭐⭐⭐ | + +**选择建议**: +- **生产环境**:使用 `pool-agent-production.yaml`(包含详细注释和最佳实践) +- 
**快速测试**:使用 `pool-sdk-with-tasks.yaml`(更简洁) +- **特殊需求**:如果不需要自定义 entrypoint,使用 `pool-sdk-compatible.yaml` + +### BatchSandbox 配置文件(2 个) + +| 文件 | 模式 | 依赖 | 用途 | +|------|------|------|------| +| `batchsandbox-basic.yaml` | Non-pooled | 无 | 演示直接创建 Pod | +| `batchsandbox-with-tasks.yaml` | Pooled | Pool | 演示批量异构任务 | + +**使用说明**: +- **SDK 场景**:通常不需要手动创建 BatchSandbox(SDK 自动创建) +- **kubectl 场景**:用于批量任务执行(RL 训练、压力测试等) + +### 文档文件(3 个) + +| 文件 | 内容 | 面向用户 | +|------|------|---------| +| `README.md` | 快速开始、最佳实践、故障排查 | 所有用户 | +| `pool-examples.md` | Pool 详细配置说明 | 高级用户 | +| `DIRECTORY_STRUCTURE.md` | 目录结构说明 | 开发者 | + +## 🎯 设计原则 + +### 1. 简化选择 + +**问题**:之前有太多相似的示例(pool-basic, pool-with-execd, pool-with-task-executor...),用户不知道选哪个。 + +**解决**: +- 保留 3 个 Pool 示例,明确分类和推荐度 +- 删除误导性示例(pool-basic 无 execd,pool-with-task-executor 无 execd) +- 突出推荐 `pool-agent-production.yaml` + +### 2. 面向实际场景 + +**问题**:示例文件缺乏实际使用场景说明。 + +**解决**: +- 明确标注适用场景(Agent 服务、RL 训练、压力测试等) +- 提供完整的 SDK 使用代码示例 +- 包含部署、验证、监控的完整流程 + +### 3. 最佳实践优先 + +**问题**:缺少生产级配置参考。 + +**解决**: +- 创建 `pool-agent-production.yaml` 包含: + - 详细的配置注释 + - 容量规划建议 + - 安全最佳实践 + - 监控和调试指南 + +### 4. 
纠正常见误区 + +**问题**:用户容易误解 Pool 的使用方式。 + +**解决**: +- 在 README.md 中突出"常见误区"章节 +- 明确说明: + - Pool 是 Pod 池,不是 Sandbox 池 + - 不需要预创建 BatchSandbox + - SDK 每次 create() 创建新 BatchSandbox + +## 🔄 文件变更历史 + +### 删除的文件(4 个) + +| 文件 | 删除原因 | +|------|---------| +| `pool-basic.yaml` | 无 execd,SDK 无法使用,误导性 | +| `pool-with-task-executor.yaml` | 只有 task-executor 无 execd,不完整 | +| `pool-with-execd.yaml` | execd 启动方式不标准,已被 pool-sdk-compatible.yaml 替代 | +| `batchsandbox-pooled.yaml` | 依赖不存在的 basic-pool,无效 | + +### 新增的文件(1 个) + +| 文件 | 内容 | +|------|------| +| `pool-agent-production.yaml` | 生产级 Agent Pool 配置,包含详细注释和最佳实践 | + +### 修改的文件(3 个) + +| 文件 | 修改内容 | +|------|---------| +| `README.md` | 重写,添加场景分类、常见误区、容量规划等 | +| `batchsandbox-with-tasks.yaml` | 修改 poolRef 为 agent-pool,添加注释 | +| `batchsandbox-basic.yaml` | 添加详细注释说明 | + +## 📊 使用场景映射 + +### 场景 A:多 Agent 并发使用 + +``` +用户需求: Agent 服务、Code Interpreter、动态工作流 + ↓ +推荐配置: pool-agent-production.yaml + ↓ + 使用方式: SDK 动态创建 sandbox + ↓ + 流程: Helm 部署 Pool → SDK.create() → SDK.kill() +``` + +### 场景 B:批量任务执行 + +``` +用户需求: RL 训练、压力测试、批量数据处理 + ↓ +推荐配置: pool-agent-production.yaml + batchsandbox-with-tasks.yaml + ↓ + 使用方式: kubectl 创建 BatchSandbox + ↓ + 流程: kubectl apply pool → kubectl apply batchsandbox → 自动清理 +``` + +### 场景 C:测试和开发 + +``` +用户需求: 测试特定镜像、验证功能 + ↓ +推荐配置: batchsandbox-basic.yaml (non-pooled) + ↓ + 使用方式: kubectl 直接创建 + ↓ + 流程: kubectl apply → kubectl delete +``` + +## 🔗 相关资源 + +- **深度分析**:`/data/home/cz/sandbox-test/pool-analysis/` + - Pool 使用指南 + - 架构流程图 + - 验证测试脚本 + +- **Helm 配置**:`/data/home/cz/OpenSandbox/kubernetes/helm-chart/` + - `values.yaml` - 默认配置 + - `values-e2e.yaml` - E2E 测试配置 + - 生产/开发环境配置:使用 `--set` 或自定义 values 文件 + +- **主文档**:`/data/home/cz/OpenSandbox/kubernetes/README.md` + - Kubernetes 部署完整指南 + +## 💡 维护建议 + +### 添加新示例时 + +1. **明确场景**:每个示例应对应明确的使用场景 +2. **完整注释**:包含配置说明、使用方式、注意事项 +3. **验证测试**:确保示例可以正常运行 +4. **更新文档**:同步更新 README.md 和本文档 + +### 修改现有示例时 + +1. **保持兼容**:避免破坏性变更 +2. **版本说明**:在注释中说明版本要求 +3. **测试验证**:修改后进行完整测试 +4. 
**文档同步**:更新相关文档 + +### 删除示例时 + +1. **评估影响**:确认没有外部依赖 +2. **提供替代**:在文档中说明替代方案 +3. **迁移指南**:如果有用户使用,提供迁移步骤 diff --git a/deploy/helm/opensandbox/examples/README.md b/deploy/helm/opensandbox/examples/README.md new file mode 100644 index 000000000000..1bb53cb8d324 --- /dev/null +++ b/deploy/helm/opensandbox/examples/README.md @@ -0,0 +1,427 @@ +# OpenSandbox Examples + +This directory contains various usage examples and best practices for the OpenSandbox Kubernetes Controller. + +## 🎉 Important Update + +**Starting from version v0.2.0, the Helm chart deploys an agent-pool by default** without manual creation! + +```bash +# Default installation automatically creates agent-pool +helm install opensandbox opensandbox-controller + +# View the automatically created Pool +kubectl get pools -n opensandbox +``` + +If you don't need the Pool, you can disable it: +```bash +helm install opensandbox opensandbox-controller --set pools[0].enabled=false +``` + +## 📁 File List + +### Pool Examples + +| File | SDK Compatible | Custom Entrypoint | Purpose | +|------|----------------|-------------------|---------| +| `pool-sdk-compatible.yaml` | ✅ Supported | ❌ Not Supported | SDK Basic Mode (execd only)| +| `pool-sdk-with-tasks.yaml` | ✅ Supported | ✅ Supported | SDK Complete Mode (execd + task-executor)| +| **`pool-agent-production.yaml`** | ✅ Supported | ✅ Supported | **🌟 Production-Grade Agent Pool (Recommended)** | + +### BatchSandbox Examples + +| File | Mode | Purpose | +|------|------|---------| +| `batchsandbox-basic.yaml` | Non-pooled | Direct Pod creation without using Pool | +| `batchsandbox-with-tasks.yaml` | Pooled | Batch heterogeneous task example | + +### Documentation + +| File | Content | +|------|---------| +| `README.md` | This document | +| `pool-examples.md` | Detailed Pool configuration guide | + +## 🎯 Core Concepts + +### ❌ Common Misconceptions + +> **Misconception 1**: Pool is a pre-created pool of Sandboxes that Agents can reuse +> **Correct**: Pool is a **Pod 
pool**, not a Sandbox pool + +> **Misconception 2**: Need to pre-create BatchSandbox during Helm deployment for Agent use +> **Correct**: SDK creates a **new BatchSandbox** with each create() call, no pre-creation needed + +> **Misconception 3**: Pool without execd can work with SDK +> **Correct**: SDK **requires** Pool to contain execd (port 44772) + +### ✅ Correct Understanding + +``` +During Helm deployment: +└─> Only create Pool (long-running, maintains pre-warmed Pods) + +During SDK runtime: +├─> Agent-1: SDK.create() → Creates BatchSandbox-1 (allocates Pod-1) +├─> Agent-2: SDK.create() → Creates BatchSandbox-2 (allocates Pod-2) +└─> Agent-1: SDK.kill() → Deletes BatchSandbox-1 (Pod-1 returns to Pool) + +Next request: +└─> Agent-3: SDK.create() → Creates BatchSandbox-3 (reuses Pod-1) ← Fast! +``` + +**Key Points**: +- ✅ Pool maintains **Pods** (pre-warmed containers) +- ✅ Each SDK.create() creates a **new BatchSandbox** +- ✅ Pods are reused, BatchSandboxes are not +- ❌ Don't pre-create BatchSandboxes + +## 🚀 Quick Start + +### Scenario A: Multi-Agent Concurrent Usage (Recommended) + +**Use Cases**: Agent services, Code Interpreter, dynamic workflows + +```bash +# 1. Install Helm chart (automatically creates agent-pool) +helm install opensandbox opensandbox-controller + +# 2. Verify deployment +kubectl get deployment -n opensandbox +kubectl get pool -n opensandbox + +# 3. Check Pool status +kubectl get pool agent-pool -n opensandbox -o jsonpath='{.status}' | jq +# Example output: +# { +# "total": 10, # Total Pods +# "allocated": 0, # Allocated +# "available": 10 # Available +# } + +# 4. 
View Pool Pods +kubectl get pods -l pool=agent-pool -n opensandbox +``` + +**SDK Usage**: + +```python +from opensandbox import Sandbox +from opensandbox.config import ConnectionConfig +from datetime import timedelta + +async def handle_agent_request(agent_id: str, task: str): + """Create a new sandbox for each Agent request""" + # SDK.create() will allocate a Pod from agent-pool + sandbox = await Sandbox.create( + "nginx:latest", # Will be ignored, uses image from Pool + entrypoint=["/bin/sh", "-c", "sleep infinity"], + env={"AGENT_ID": agent_id}, + timeout=timedelta(hours=1), + connection_config=ConnectionConfig(domain=":8088"), + extensions={"poolRef": "agent-pool"} # Use default agent-pool + ) + + try: + # Use sandbox + result = await sandbox.commands.run(task) + return result + finally: + # Delete BatchSandbox, Pod returns to Pool + await sandbox.kill() +``` + +### Scenario B: Custom Pool Capacity (High Concurrency Scenarios) + +If you need higher concurrency capacity, override default capacity parameters using `--set`: + +```bash +# Use configuration optimized for multiple Agents (bufferMin: 50, poolMax: 300) +helm install opensandbox opensandbox-controller \ + --set pools[0].capacitySpec.bufferMin=50 \ + --set pools[0].capacitySpec.bufferMax=100 \ + --set pools[0].capacitySpec.poolMin=50 \ + --set pools[0].capacitySpec.poolMax=300 +``` + +### Scenario C: Batch Task Execution (kubectl) + +**Use Cases**: RL training, stress testing, batch data processing + +```bash +# 1. Ensure Pool is deployed (automatically created by default) +kubectl get pool agent-pool -n opensandbox + +# 2. Create BatchSandbox to execute batch tasks +kubectl apply -f batchsandbox-with-tasks.yaml + +# 3. View task execution status +kubectl get batchsandbox task-batch-sandbox -n opensandbox -o wide + +# 4. 
View task logs +POD_NAME=$(kubectl get pods -l batchsandbox=task-batch-sandbox -n opensandbox -o jsonpath='{.items[0].metadata.name}') +kubectl logs $POD_NAME -c sandbox-container -n opensandbox +kubectl logs $POD_NAME -c task-executor -n opensandbox + +# 5. Automatic cleanup after task completion (ttlSecondsAfterFinished) +``` + +### Scenario D: Non-pooled Direct Creation + +**Use Cases**: Testing environments, special image requirements + +```bash +# Directly create BatchSandbox (without using Pool) +kubectl apply -f batchsandbox-basic.yaml + +# View created Pods +kubectl get pods -l batchsandbox=basic-batch-sandbox -n opensandbox +``` + +## 📊 Pool Configuration Type Comparison + +### Type 1: SDK Pool (Basic) - pool-sdk-compatible.yaml + +```yaml +# ✅ SDK compatible - contains execd only +# ❌ Does not support custom entrypoint +initContainers: +- name: execd-installer + image: opensandbox/execd:v1.0.5 +containers: +- name: sandbox-container + command: ["/opt/opensandbox/bin/bootstrap.sh", "nginx", "-g", "daemon off;"] + ports: + - containerPort: 44772 + name: execd +``` + +**SDK Usage**: +```python +sandbox = await Sandbox.create( + "nginx:latest", + # ❌ Cannot pass entrypoint + env={"VAR": "value"}, # ✅ Can pass environment variables + extensions={"poolRef": "sdk-pool"} +) +``` + +### Type 2: Task Pool (Complete) - pool-agent-production.yaml (Recommended) + +```yaml +# ✅ SDK compatible - contains execd + task-executor +# ✅ Supports custom entrypoint +spec: + shareProcessNamespace: true # Required by task-executor + initContainers: + - name: execd-installer + image: opensandbox/execd:v1.0.5 + containers: + - name: sandbox-container + command: ["/opt/opensandbox/bin/bootstrap.sh", "sleep", "infinity"] + ports: + - containerPort: 44772 + name: execd + - name: task-executor # Add task-executor sidecar + image: opensandbox/task-executor:dev + securityContext: + capabilities: + add: ["SYS_PTRACE"] +``` + +**SDK Usage** (with custom entrypoint): +```python +sandbox = 
await Sandbox.create( + "nginx:latest", + entrypoint=["/bin/sh", "-c", "custom command"], # ✅ Can customize + env={"VAR": "value"}, + extensions={"poolRef": "agent-pool"} +) +``` + +## 🔍 Monitoring and Debugging + +### Monitor Pool Utilization + +```bash +# Real-time monitoring +watch kubectl get pool agent-pool -o jsonpath='{.status}' | jq + +# View detailed information +kubectl describe pool agent-pool + +# View Pool Pod list +kubectl get pods -l pool=agent-pool -o wide +``` + +**Optimization Recommendations**: +- If `available` is frequently 0 → Increase `bufferMax` +- If `available` is always close to `total` → Decrease `bufferMin` +- If `total` frequently reaches `poolMax` → Increase `poolMax` or optimize Agent usage + +### Verify Pool Configuration + +```bash +# Check if Pod contains execd +kubectl exec -it -c sandbox-container -- ps aux | grep execd + +# Check execd port +kubectl exec -it -c sandbox-container -- nc -zv localhost 44772 + +# Check task-executor (if present) +kubectl get pods -l pool=agent-pool -o jsonpath='{.items[0].spec.containers[*].name}' +# Output should include: sandbox-container task-executor +``` + +### View BatchSandbox Status + +```bash +# List all BatchSandboxes +kubectl get batchsandboxes + +# View detailed status +kubectl describe batchsandbox + +# View task execution statistics +kubectl get batchsandbox -o custom-columns=\ +NAME:.metadata.name,\ +REPLICAS:.spec.replicas,\ +RUNNING:.status.taskRunning,\ +SUCCEED:.status.taskSucceed,\ +FAILED:.status.taskFailed +``` + +## 🛠️ Troubleshooting + +### Pool Pod Fails to Start + +```bash +# View Pod events +kubectl describe pod + +# View container logs +kubectl logs -c sandbox-container +kubectl logs -c task-executor # If present + +# Check image pull +kubectl describe pod | grep -A 5 Events +``` + +### SDK Sandbox Creation Timeout + +**Symptom**: SDK error `Health check timeout` + +**Possible Causes**: +1. Pool doesn't have execd → Use `pool-agent-production.yaml` +2. 
execd not started → Check execd process in Pod +3. Network issues → Check network connectivity between Server and Pod + +**Troubleshooting Steps**: +```bash +# 1. Confirm Pool contains execd +kubectl get pool agent-pool -o yaml | grep -A 10 initContainers + +# 2. Check execd process +kubectl exec -it -c sandbox-container -- ps aux | grep execd + +# 3. Check execd port +kubectl exec -it -c sandbox-container -- nc -zv localhost 44772 + +# 4. View Server logs +kubectl logs -l app=opensandbox-server -n opensandbox +``` + +### task-executor Permission Issues + +```bash +# Check security context +kubectl get pod -o yaml | grep -A 10 securityContext + +# Should contain: +# capabilities: +# add: ["SYS_PTRACE"] + +# Check process namespace sharing +kubectl get pod -o jsonpath='{.spec.shareProcessNamespace}' +# Should output: true +``` + +## 📦 Capacity Planning Recommendations + +Plan Pool capacity based on concurrent Agent count: + +| Concurrent Agents | bufferMin | bufferMax | poolMin | poolMax | Description | +|------------------|-----------|-----------|---------|---------|-------------| +| 1-10 | 2 | 5 | 2 | 20 | Small-scale testing | +| 10-50 | 10 | 20 | 10 | 100 | Small to medium applications | +| 50-200 | 50 | 100 | 50 | 300 | Medium to large applications | +| 200+ | 100 | 200 | 100 | 500 | Large-scale production | + +**Parameter Descriptions**: +- `bufferMin`: Minimum buffer, ensures fast response +- `bufferMax`: Maximum buffer, controls pre-warming cost +- `poolMin`: Minimum capacity during low traffic periods +- `poolMax`: Maximum capacity during peak periods + +**Cost Optimization**: +- Low traffic periods: Pool scales down to `poolMin`, saving resources +- Peak periods: Pool expands to `poolMax`, ensuring response speed +- Buffer zone: `bufferMin` ensures fast response, `bufferMax` avoids excessive pre-warming + +## 📚 Related Documentation + +### In-depth Analysis Documents + +- **Pool Usage Guide**: 
`/data/home/cz/sandbox-test/pool-analysis/opensandbox_pool_usage_guide.md` +- **Architecture Diagrams**: `/data/home/cz/sandbox-test/pool-analysis/pool_architecture.txt` +- **Verification Test Scripts**: `/data/home/cz/sandbox-test/pool-analysis/test_pool_behavior.py` + +### Helm Deployment Configuration + +- **Main values configuration**: `/data/home/cz/OpenSandbox/kubernetes/helm-chart/values.yaml` +- **E2E test configuration**: `/data/home/cz/OpenSandbox/kubernetes/helm-chart/values-e2e.yaml` +- **Main README**: `/data/home/cz/OpenSandbox/kubernetes/README.md` + +### API Reference + +```bash +# View Pool CRD definition +kubectl explain pool +kubectl explain pool.spec +kubectl explain pool.spec.capacitySpec + +# View BatchSandbox CRD definition +kubectl explain batchsandbox +kubectl explain batchsandbox.spec +kubectl explain batchsandbox.spec.taskTemplate +``` + +## 🧹 Resource Cleanup + +```bash +# Delete BatchSandbox +kubectl delete batchsandbox --all -n default + +# Delete Pool (automatically cleans up related Pods) +kubectl delete pool --all -n default + +# Delete all resources in namespace +kubectl delete all --all -n default +``` + +## 💡 Best Practices Summary + +1. **Pool is a Pod pool, not a Sandbox pool** +2. **SDK creates a new BatchSandbox with each create() call** +3. **No need to pre-create BatchSandboxes for reuse** +4. **Only create Pool during Helm deployment** +5. **Dynamically create/delete BatchSandboxes at runtime** +6. **Configure Pool capacity parameters appropriately to optimize cost and performance** +7. 
**Use `pool-agent-production.yaml` as production environment template** + +## 🔗 More Examples + +- **Detailed Pool configuration guide**: See `pool-examples.md` +- **SDK integration examples**: See "Quick Start" section in this document +- **Helm Chart configuration**: See `values.yaml` and `values-e2e.yaml` diff --git a/deploy/helm/opensandbox/examples/batchsandbox-basic.yaml b/deploy/helm/opensandbox/examples/batchsandbox-basic.yaml new file mode 100644 index 000000000000..b32f0fd60573 --- /dev/null +++ b/deploy/helm/opensandbox/examples/batchsandbox-basic.yaml @@ -0,0 +1,55 @@ +# ============================================================================== +# BatchSandbox - Non-pooled 模式(直接创建 Pod) +# ============================================================================== +# +# 用途:演示不使用 Pool 直接创建 BatchSandbox +# +# 使用场景: +# - 需要使用特定镜像或资源配置(Pool 无法满足) +# - 一次性任务,不需要预热优化 +# - 测试和开发环境 +# +# 特点: +# - 可自由指定镜像、资源、配置 +# - 创建速度较慢(需要拉取镜像、启动 Pod) +# - 不复用预热资源 +# +# 注意: +# - SDK 使用场景通常不需要预创建 BatchSandbox(动态创建即可) +# - 此示例主要用于演示 kubectl 直接创建 +# +# ============================================================================== + +apiVersion: sandbox.opensandbox.io/v1alpha1 +kind: BatchSandbox +metadata: + name: basic-batch-sandbox + namespace: default + labels: + mode: non-pooled + annotations: + description: "Non-pooled 模式示例" +spec: + # 创建3个沙箱副本 + replicas: 3 + + # 不使用资源池,直接创建Pod + # poolRef: "" # 留空表示 non-pooled 模式 + + # TTL:3600秒(1小时)后自动清理 + ttlSecondsAfterFinished: 3600 + + # 沙箱模板 + template: + spec: + containers: + - name: sandbox-container + image: ubuntu:22.04 + command: ["sleep", "infinity"] + resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "500m" + memory: "256Mi" diff --git a/deploy/helm/opensandbox/examples/batchsandbox-with-tasks.yaml b/deploy/helm/opensandbox/examples/batchsandbox-with-tasks.yaml new file mode 100644 index 000000000000..265e8c0967a5 --- /dev/null +++ b/deploy/helm/opensandbox/examples/batchsandbox-with-tasks.yaml @@ 
-0,0 +1,86 @@ +# ============================================================================== +# BatchSandbox with Heterogeneous Tasks(带异构任务的批量沙箱) +# ============================================================================== +# +# 用途:演示如何使用 Pool 批量执行异构任务(每个 sandbox 执行不同任务) +# +# 使用场景: +# - RL 训练:批量创建训练环境,每个环境执行不同策略 +# - 压力测试:批量执行不同的测试用例 +# - 数据处理:并行处理多个数据分片 +# +# 注意: +# - 此示例主要用于 kubectl 直接创建的批量任务场景 +# - SDK 使用场景通常不需要预创建 BatchSandbox(动态创建即可) +# - 需要先创建 pool-agent-production.yaml +# +# 相关文档: +# - /data/home/cz/sandbox-test/pool-analysis/opensandbox_pool_usage_guide.md +# +# ============================================================================== + +apiVersion: sandbox.opensandbox.io/v1alpha1 +kind: BatchSandbox +metadata: + name: task-batch-sandbox + namespace: default + labels: + use-case: batch-training + annotations: + description: "批量异构任务示例" +spec: + # 创建3个沙箱副本,每个执行不同的任务 + replicas: 3 + + # 使用包含 execd + task-executor 的 Pool + # 注意:需要先创建 pool-agent-production.yaml + poolRef: agent-pool + + # TTL:3600秒(1小时)后自动清理 + ttlSecondsAfterFinished: 3600 + + # 默认任务模板(如果shardTaskPatches没有覆盖,则使用此模板) + taskTemplate: + spec: + process: + command: ["echo", "Default task message"] + + # 异构任务:为每个沙箱自定义不同的任务 + shardTaskPatches: + # 第1个沙箱的任务 + - spec: + process: + command: ["bash", "-c"] + args: + - | + echo "Task 1: Running Python script" + python3 -c " + import time + print('Task 1 started') + time.sleep(2) + print('Task 1 completed') + " + + # 第2个沙箱的任务 + - spec: + process: + command: ["bash", "-c"] + args: + - | + echo "Task 2: Running shell commands" + date + uname -a + sleep 1 + echo "Task 2 completed" + + # 第3个沙箱的任务 + - spec: + process: + command: ["bash", "-c"] + args: + - | + echo "Task 3: System info check" + cat /etc/os-release + df -h + free -h + echo "Task 3 completed" diff --git a/deploy/helm/opensandbox/examples/pool-agent-production.yaml b/deploy/helm/opensandbox/examples/pool-agent-production.yaml new file mode 100644 index 000000000000..5c636acb9763 --- 
/dev/null +++ b/deploy/helm/opensandbox/examples/pool-agent-production.yaml @@ -0,0 +1,342 @@ +# ============================================================================== +# OpenSandbox Agent Pool - 生产级配置 +# ============================================================================== +# +# 用途:为多 Agent 并发场景提供预热的沙箱资源池(SDK 模式) +# +# 注意:此配置适用于 SDK 场景(需要 execd) +# 如果只使用 kubectl 管理 BatchSandbox,可以移除 execd 相关配置 +# +# 架构说明: +# Pool 维护预热的 Pod 池(不是 Sandbox 池) +# - 每个 Pod 包含 execd(SDK 通信)+ task-executor(任务注入) +# - SDK.create() 从 Pool 分配 Pod,创建新的 BatchSandbox +# - SDK.kill() 删除 BatchSandbox,Pod 返回 Pool +# +# 使用模式: +# 1. Helm 部署时创建此 Pool(长期运行) +# 2. Agent 运行时通过 SDK 动态创建/删除 BatchSandbox +# 3. 不需要预创建 BatchSandbox +# +# 容量规划建议: +# | 并发 Agent 数 | bufferMin | bufferMax | poolMin | poolMax | +# |--------------|-----------|-----------|---------|---------| +# | 1-10 | 2 | 5 | 2 | 20 | +# | 10-50 | 10 | 20 | 10 | 100 | +# | 50-200 | 50 | 100 | 50 | 300 | +# | 200+ | 100 | 200 | 100 | 500 | +# +# 相关文档: +# - /data/home/cz/sandbox-test/pool-analysis/opensandbox_pool_usage_guide.md +# - /data/home/cz/OpenSandbox/kubernetes/helm-chart/values.yaml (查看 pools 配置) +# +# ============================================================================== + +apiVersion: sandbox.opensandbox.io/v1alpha1 +kind: Pool +metadata: + name: agent-pool + namespace: default # 使用与 Helm chart namespaceOverride 一致的命名空间 + labels: + app: opensandbox + component: agent-pool + annotations: + description: "生产级 Agent Pool,支持 SDK 动态创建 sandbox" +spec: + template: + metadata: + labels: + pool: agent-pool + sdk-compatible: "true" + spec: + # ======================================== + # 必需:共享进程命名空间 + # ======================================== + # task-executor 需要访问 sandbox 容器的进程树 + shareProcessNamespace: true + + # ======================================== + # Init Container:安装 execd(仅 SDK 场景需要) + # ======================================== + # 注意:如果只使用 kubectl 管理 BatchSandbox(不使用 SDK), + # 可以移除此 init container 和相关的 execd 
配置。 + # + # execd 的作用: + # - 提供 SDK 与 Pod 的通信接口(44772 端口) + # - 执行 SDK 发送的命令(commands.run(), files.read_file() 等) + # + # 使用场景: + # ✅ 需要:SDK 动态创建 sandbox(Agent、Code Interpreter 等) + # ❌ 不需要:纯 kubectl 批量任务(RL 训练、压力测试等) + initContainers: + - name: execd-installer + image: opensandbox/execd:v1.0.5 + imagePullPolicy: IfNotPresent + command: ["/bin/sh", "-c"] + args: + - | + # 复制 execd 二进制和启动脚本 + cp ./execd /opt/opensandbox/bin/execd && \ + cp ./bootstrap.sh /opt/opensandbox/bin/bootstrap.sh && \ + chmod +x /opt/opensandbox/bin/execd && \ + chmod +x /opt/opensandbox/bin/bootstrap.sh && \ + echo "execd installed successfully" + volumeMounts: + - name: opensandbox-bin + mountPath: /opt/opensandbox/bin + + # ======================================== + # 主容器:Sandbox 环境 + # ======================================== + containers: + - name: sandbox-container + # 镜像说明: + # - Pool 中的镜像由 Pool 预定义,SDK 指定的镜像会被忽略 + # - 根据 Agent 需求选择合适的基础镜像(nginx, ubuntu, python, etc.) + # - 确保镜像包含 /bin/sh 用于执行 bootstrap.sh + image: nginx:latest + imagePullPolicy: IfNotPresent + + # 启动命令:使用 bootstrap.sh 启动 execd + # bootstrap.sh 会: + # 1. 后台启动 execd(监听 44772 端口) + # 2. 
执行用户指定的命令(这里是 sleep infinity) + command: ["/opt/opensandbox/bin/bootstrap.sh", "sleep", "infinity"] + + # 环境变量 + env: + - name: EXECD + value: /opt/opensandbox/bin/execd + # 可添加其他环境变量: + # - name: CUSTOM_VAR + # value: "custom-value" + + # 端口配置 + ports: + - containerPort: 44772 + name: execd + protocol: TCP + # 可暴露其他端口(如应用端口): + # - containerPort: 8080 + # name: app + # protocol: TCP + + # 资源配置 + # 根据 Agent 任务复杂度调整 + resources: + requests: + cpu: "100m" # 最小 CPU(保证调度) + memory: "128Mi" # 最小内存 + limits: + cpu: "500m" # 最大 CPU(防止资源抢占) + memory: "256Mi" # 最大内存 + + # 卷挂载 + volumeMounts: + - name: opensandbox-bin + mountPath: /opt/opensandbox/bin + readOnly: true + + # 健康检查(可选) + # livenessProbe: + # tcpSocket: + # port: execd + # initialDelaySeconds: 10 + # periodSeconds: 30 + # readinessProbe: + # tcpSocket: + # port: execd + # initialDelaySeconds: 5 + # periodSeconds: 10 + + # ======================================== + # Sidecar:Task Executor + # ======================================== + - name: task-executor + # 镜像说明: + # - 使用 Helm chart 中配置的 task-executor 镜像 + # - 版本应与 controller 版本保持一致 + # - 开发环境可用:opensandbox/task-executor:dev + # - 生产环境建议:your-registry/opensandbox-task-executor:v1.0.0 + image: opensandbox/task-executor:dev + imagePullPolicy: Never # 开发环境使用本地镜像;生产环境改为 IfNotPresent + + # 端口配置 + ports: + - containerPort: 5758 + name: task-executor + protocol: TCP + + # 资源配置 + # task-executor 需要更多 CPU 用于进程注入 + resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "500m" # 可能需要更多 CPU + memory: "256Mi" + + # 安全上下文 + # 必需:SYS_PTRACE 权限用于注入进程到 sandbox 容器 + securityContext: + capabilities: + add: ["SYS_PTRACE"] + # 生产环境建议添加其他安全设置: + # runAsNonRoot: true + # runAsUser: 1000 + # allowPrivilegeEscalation: false + + # ======================================== + # 卷配置 + # ======================================== + volumes: + - name: opensandbox-bin + emptyDir: {} + # 可添加其他卷(如配置文件、数据持久化): + # - name: config + # configMap: + # name: agent-config + # - name: 
data + # persistentVolumeClaim: + # claimName: agent-data-pvc + + # ======================================== + # 调度配置(可选) + # ======================================== + # 节点选择器 + # nodeSelector: + # workload-type: agent + # zone: production + + # 容忍度(允许调度到特定污点的节点) + # tolerations: + # - key: "workload" + # operator: "Equal" + # value: "agent" + # effect: "NoSchedule" + + # 亲和性(控制 Pod 分布) + # affinity: + # podAntiAffinity: + # preferredDuringSchedulingIgnoredDuringExecution: + # - weight: 100 + # podAffinityTerm: + # labelSelector: + # matchExpressions: + # - key: pool + # operator: In + # values: + # - agent-pool + # topologyKey: kubernetes.io/hostname + + # ======================================== + # Pool 容量配置 + # ======================================== + capacitySpec: + # bufferMin: 最小缓冲 - Pool 保证至少有这么多可用 Pod + # - 保证快速响应,避免 Agent 等待 + # - 根据并发 Agent 峰值设置 + bufferMin: 10 + + # bufferMax: 最大缓冲 - Pool 最多预热这么多 Pod + # - 控制预热成本,避免资源浪费 + # - 通常设为 bufferMin 的 2-5 倍 + bufferMax: 50 + + # poolMin: Pool 最小容量 - 即使没人用,也保持这么多 Pod + # - 低峰期保底容量,避免冷启动 + # - 通常与 bufferMin 相同或稍大 + poolMin: 10 + + # poolMax: Pool 最大容量 - 高峰期最多这么多 Pod + # - 限制最大资源使用,避免集群资源耗尽 + # - 根据集群资源和业务峰值设置 + poolMax: 200 + + # 容量规划示例: + # 场景:200 个并发 Agent,平均每个 Agent 会话 5 分钟 + # bufferMin: 50 (保证 50 个 Agent 立即可用) + # bufferMax: 100 (预热 100 个,覆盖短期突发) + # poolMin: 50 (低峰期保持 50 个) + # poolMax: 300 (高峰期最多 300 个) + +# ============================================================================== +# SDK 使用示例 +# ============================================================================== +# +# Python SDK 使用此 Pool: +# +# ```python +# import asyncio +# from datetime import timedelta +# from opensandbox import Sandbox +# from opensandbox.config import ConnectionConfig +# +# async def create_agent_sandbox(agent_id: str): +# """为 Agent 创建 sandbox""" +# sandbox = await Sandbox.create( +# "nginx:latest", # 镜像会被忽略,使用 Pool 中的镜像 +# entrypoint=["/bin/sh", "-c", "sleep infinity"], # 可自定义 +# env={"AGENT_ID": agent_id}, # 
可传递环境变量 +# timeout=timedelta(hours=1), +# connection_config=ConnectionConfig(domain=":8088"), +# extensions={"poolRef": "agent-pool"} # 指定 Pool 名称 +# ) +# return sandbox +# +# async def handle_agent_request(agent_id: str, task: str): +# """处理单个 Agent 请求""" +# sandbox = await create_agent_sandbox(agent_id) +# try: +# result = await sandbox.commands.run(task) +# return result +# finally: +# await sandbox.kill() # Pod 返回 Pool +# ``` +# +# ============================================================================== +# 部署和监控 +# ============================================================================== +# +# 1. 部署 Pool: +# kubectl apply -f pool-agent-production.yaml +# +# 2. 验证 Pool 状态: +# kubectl get pool agent-pool -n default +# kubectl get pool agent-pool -n default -o jsonpath='{.status}' | jq +# +# 3. 查看 Pool 的 Pod: +# kubectl get pods -l pool=agent-pool -n default +# +# 4. 监控 Pool 使用率: +# watch kubectl get pool agent-pool -n default -o jsonpath='{.status}' +# # 输出示例: +# # { +# # "total": 50, # 总 Pod 数 +# # "allocated": 30, # 已分配 +# # "available": 20 # 可用 +# # } +# +# 5. 
优化建议: +# - 如果 available 经常为 0 → 增加 bufferMax +# - 如果 available 总是接近 total → 减少 bufferMin +# - 如果 total 经常达到 poolMax → 增加 poolMax 或优化 Agent 使用 +# +# ============================================================================== +# 故障排查 +# ============================================================================== +# +# Pool Pod 无法启动: +# kubectl describe pod -l pool=agent-pool -n default +# kubectl logs -l pool=agent-pool -n default -c sandbox-container +# kubectl logs -l pool=agent-pool -n default -c task-executor +# +# execd 连接失败: +# kubectl exec -it -n default -c sandbox-container -- ps aux | grep execd +# kubectl exec -it -n default -c sandbox-container -- nc -zv localhost 44772 +# +# task-executor 权限问题: +# kubectl get pod -n default -o yaml | grep -A 10 securityContext +# +# ============================================================================== diff --git a/deploy/helm/opensandbox/examples/pool-examples.md b/deploy/helm/opensandbox/examples/pool-examples.md new file mode 100644 index 000000000000..b89c3eeb9ecd --- /dev/null +++ b/deploy/helm/opensandbox/examples/pool-examples.md @@ -0,0 +1,241 @@ +# Pool示例 - 包含Task Executor Sidecar + +## 基本Pool(不包含任务执行) + +```yaml +apiVersion: sandbox.opensandbox.io/v1alpha1 +kind: Pool +metadata: + name: basic-pool + namespace: default +spec: + template: + spec: + containers: + - name: sandbox-container + image: ubuntu:22.04 + command: ["sleep", "infinity"] + resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "500m" + memory: "256Mi" + capacitySpec: + bufferMax: 10 + bufferMin: 2 + poolMax: 20 + poolMin: 5 +``` + +## Pool with Task Executor(支持任务执行) + +**重要提示**: +- Task Executor作为sidecar容器运行在Pool的Pod中 +- 必须启用`shareProcessNamespace: true`以共享进程命名空间 +- Task Executor需要`SYS_PTRACE`权限来注入进程 + +```yaml +apiVersion: sandbox.opensandbox.io/v1alpha1 +kind: Pool +metadata: + name: task-enabled-pool + namespace: default +spec: + template: + spec: + # 必需:共享进程命名空间,允许task-executor访问sandbox容器的进程 + 
shareProcessNamespace: true + containers: + # 主容器:沙箱环境 + - name: sandbox-container + image: ubuntu:22.04 + command: ["sleep", "infinity"] + resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "500m" + memory: "256Mi" + + # Sidecar:Task Executor(用于任务注入) + - name: task-executor + # 使用Helm values中配置的镜像 + # {{ .Values.taskExecutor.image.repository }}:{{ .Values.taskExecutor.image.tag }} + image: opensandbox.io/task-executor:v0.0.1 + imagePullPolicy: IfNotPresent + resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "500m" + memory: "256Mi" + securityContext: + # 必需:需要ptrace权限来注入进程到sandbox容器 + capabilities: + add: ["SYS_PTRACE"] + + capacitySpec: + bufferMax: 10 + bufferMin: 2 + poolMax: 20 + poolMin: 5 +``` + +## BatchSandbox with Tasks(使用Pool执行任务) + +创建使用上述Pool的BatchSandbox,并执行异构任务: + +```yaml +apiVersion: sandbox.opensandbox.io/v1alpha1 +kind: BatchSandbox +metadata: + name: task-batch-sandbox + namespace: default +spec: + # 副本数量 + replicas: 3 + + # 引用包含task-executor的Pool + poolRef: task-enabled-pool + + # TTL:3600秒后自动清理 + ttlSecondsAfterFinished: 3600 + + # 默认任务模板(所有沙箱共享) + taskTemplate: + spec: + process: + command: ["echo", "Default task"] + + # 异构任务:为每个沙箱自定义不同的任务 + shardTaskPatches: + - spec: + process: + command: ["python3", "-c", "print('Task for sandbox 0')"] + - spec: + process: + command: ["bash", "-c", "echo 'Task for sandbox 1' && sleep 5"] + - spec: + process: + command: ["node", "-e", "console.log('Task for sandbox 2')"] +``` + +## 镜像配置说明 + +### 方式1:使用Helm Values配置 + +在`values.yaml`中配置task-executor镜像: + +```yaml +taskExecutor: + image: + repository: your-registry/opensandbox-task-executor + tag: "v1.0.0" + pullPolicy: IfNotPresent +``` + +然后在Pool YAML中引用: + +```yaml +image: your-registry/opensandbox-task-executor:v1.0.0 +``` + +### 方式2:使用环境变量(ConfigMap) + +创建ConfigMap存储镜像信息: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: opensandbox-images + namespace: default +data: + taskExecutorImage: 
"your-registry/opensandbox-task-executor:v1.0.0" +``` + +在应用层读取ConfigMap并创建Pool。 + +### 方式3:使用Kustomize替换 + +使用Kustomize的镜像替换功能: + +```yaml +# kustomization.yaml +images: +- name: opensandbox.io/task-executor + newName: your-registry/opensandbox-task-executor + newTag: v1.0.0 +``` + +## 验证Task Executor + +创建资源后,验证task-executor是否正常运行: + +```bash +# 查看Pool状态 +kubectl get pools task-enabled-pool + +# 查看Pool创建的Pod +kubectl get pods -l pool=task-enabled-pool + +# 检查Pod中是否有task-executor容器 +kubectl get pods -l pool=task-enabled-pool -o jsonpath='{.items[0].spec.containers[*].name}' +# 输出应包含: sandbox-container task-executor + +# 查看task-executor日志 +kubectl logs <pod-name> -c task-executor + +# 查看BatchSandbox任务状态 +kubectl get batchsandbox task-batch-sandbox -o wide +# 应显示: TASK_RUNNING, TASK_SUCCEED, TASK_FAILED 等状态 +``` + +## 故障排查 + +### Task Executor无法启动 + +```bash +# 检查容器状态 +kubectl describe pod <pod-name> + +# 检查权限问题 +kubectl get pod <pod-name> -o jsonpath='{.spec.containers[1].securityContext}' +# 应显示: {"capabilities":{"add":["SYS_PTRACE"]}} + +# 检查进程命名空间共享 +kubectl get pod <pod-name> -o jsonpath='{.spec.shareProcessNamespace}' +# 应显示: true +``` + +### 任务执行失败 + +```bash +# 查看任务状态 +kubectl describe batchsandbox task-batch-sandbox + +# 查看task-executor日志 +kubectl logs <pod-name> -c task-executor -f + +# 查看sandbox容器日志 +kubectl logs <pod-name> -c sandbox-container +``` + +## 性能考虑 + +- **资源配置**:根据任务复杂度调整task-executor的资源限制 +- **并发控制**:Pool的`bufferMax`和`poolMax`控制并发沙箱数量 +- **任务超时**:在taskTemplate中配置超时时间防止任务卡死 +- **清理策略**:使用`ttlSecondsAfterFinished`自动清理完成的沙箱 + +## 最佳实践 + +1. **镜像版本管理**:controller和task-executor镜像版本保持一致 +2. **资源限制**:task-executor通常需要更多CPU用于进程注入 +3. **安全配置**:只在需要时启用`SYS_PTRACE`权限 +4. **任务设计**:将长时间运行的任务拆分为多个短任务 +5. 
**监控告警**:监控任务失败率和执行时间 diff --git a/deploy/helm/opensandbox/examples/pool-kubectl-only.yaml b/deploy/helm/opensandbox/examples/pool-kubectl-only.yaml new file mode 100644 index 000000000000..48368adfbff8 --- /dev/null +++ b/deploy/helm/opensandbox/examples/pool-kubectl-only.yaml @@ -0,0 +1,94 @@ +# ============================================================================== +# Pool for kubectl-only scenarios(纯 kubectl 场景 Pool) +# ============================================================================== +# +# 用途:纯 kubectl 场景,不使用 SDK +# +# 适用场景: +# - RL 训练批量任务 +# - 压力测试 +# - 批量数据处理 +# - 不需要 SDK 动态交互的场景 +# +# 特点: +# - 没有 execd(不支持 SDK) +# - 包含 task-executor(支持 taskTemplate) +# - 配置简洁,资源占用少 +# +# ============================================================================== + +apiVersion: sandbox.opensandbox.io/v1alpha1 +kind: Pool +metadata: + name: kubectl-pool + namespace: default + labels: + app: opensandbox + component: kubectl-pool + annotations: + description: "纯 kubectl 场景 Pool,不包含 execd" +spec: + template: + metadata: + labels: + pool: kubectl-pool + sdk-compatible: "false" + spec: + # ======================================== + # 必需:共享进程命名空间(task-executor 需要) + # ======================================== + shareProcessNamespace: true + + # ======================================== + # 主容器:Sandbox 环境 + # ======================================== + containers: + - name: sandbox-container + image: nginx:latest + imagePullPolicy: IfNotPresent + + # 直接运行业务命令,不需要 bootstrap.sh + command: ["sleep", "infinity"] + + # 资源配置 + resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "500m" + memory: "256Mi" + + # ======================================== + # Sidecar:Task Executor + # ======================================== + - name: task-executor + image: opensandbox/task-executor:dev + imagePullPolicy: Never + + ports: + - containerPort: 5758 + name: task-executor + protocol: TCP + + resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + 
cpu: "500m" + memory: "256Mi" + + # 安全上下文 + securityContext: + capabilities: + add: ["SYS_PTRACE"] + + # ======================================== + # Pool 容量配置 + # ======================================== + capacitySpec: + bufferMin: 2 + bufferMax: 5 + poolMin: 2 + poolMax: 10 diff --git a/deploy/helm/opensandbox/examples/pool-sdk-compatible.yaml b/deploy/helm/opensandbox/examples/pool-sdk-compatible.yaml new file mode 100644 index 000000000000..389f3c4d365f --- /dev/null +++ b/deploy/helm/opensandbox/examples/pool-sdk-compatible.yaml @@ -0,0 +1,62 @@ +apiVersion: sandbox.opensandbox.io/v1alpha1 +kind: Pool +metadata: + name: sdk-pool + namespace: default +spec: + template: + spec: + # Init container: 安装execd + initContainers: + - name: execd-installer + image: opensandbox/execd:v1.0.5 + command: ["/bin/sh", "-c"] + args: + - | + cp ./execd /opt/opensandbox/bin/execd && \ + cp ./bootstrap.sh /opt/opensandbox/bin/bootstrap.sh && \ + chmod +x /opt/opensandbox/bin/execd && \ + chmod +x /opt/opensandbox/bin/bootstrap.sh + volumeMounts: + - name: opensandbox-bin + mountPath: /opt/opensandbox/bin + + # 主容器:带execd + containers: + - name: sandbox-container + image: nginx:latest + command: + - /opt/opensandbox/bin/bootstrap.sh + - nginx + - -g + - daemon off; # nginx前台运行 + env: + - name: EXECD + value: /opt/opensandbox/bin/execd + ports: + - containerPort: 80 + name: http + - containerPort: 44772 + name: execd + protocol: TCP + resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "500m" + memory: "256Mi" + volumeMounts: + - name: opensandbox-bin + mountPath: /opt/opensandbox/bin + + # 共享卷 + volumes: + - name: opensandbox-bin + emptyDir: {} + + capacitySpec: + bufferMax: 10 + bufferMin: 2 + poolMax: 20 + poolMin: 5 diff --git a/deploy/helm/opensandbox/examples/pool-sdk-with-tasks.yaml b/deploy/helm/opensandbox/examples/pool-sdk-with-tasks.yaml new file mode 100644 index 000000000000..58be128b2eb4 --- /dev/null +++ 
b/deploy/helm/opensandbox/examples/pool-sdk-with-tasks.yaml @@ -0,0 +1,75 @@ +apiVersion: sandbox.opensandbox.io/v1alpha1 +kind: Pool +metadata: + name: sdk-pool-with-tasks + namespace: default +spec: + template: + spec: + shareProcessNamespace: true # task-executor需要 + + # Init container: 安装execd + initContainers: + - name: execd-installer + image: opensandbox/execd:v1.0.5 + command: ["/bin/sh", "-c"] + args: + - | + cp ./execd /opt/opensandbox/bin/execd && \ + cp ./bootstrap.sh /opt/opensandbox/bin/bootstrap.sh && \ + chmod +x /opt/opensandbox/bin/execd && \ + chmod +x /opt/opensandbox/bin/bootstrap.sh + volumeMounts: + - name: opensandbox-bin + mountPath: /opt/opensandbox/bin + + containers: + # 主容器:带execd + - name: sandbox-container + image: nginx:latest + command: + - /opt/opensandbox/bin/bootstrap.sh + - sleep + - infinity + env: + - name: EXECD + value: /opt/opensandbox/bin/execd + ports: + - containerPort: 44772 + name: execd + protocol: TCP + resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "500m" + memory: "256Mi" + volumeMounts: + - name: opensandbox-bin + mountPath: /opt/opensandbox/bin + + # task-executor sidecar: 支持自定义entrypoint + - name: task-executor + image: opensandbox/task-executor:dev + imagePullPolicy: Never # 使用本地镜像 + securityContext: + capabilities: + add: ["SYS_PTRACE"] + resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "500m" + memory: "256Mi" + + volumes: + - name: opensandbox-bin + emptyDir: {} + + capacitySpec: + bufferMax: 10 + bufferMin: 2 + poolMax: 20 + poolMin: 5 diff --git a/deploy/helm/opensandbox/scripts/README.md b/deploy/helm/opensandbox/scripts/README.md new file mode 100644 index 000000000000..b7b9d789fa86 --- /dev/null +++ b/deploy/helm/opensandbox/scripts/README.md @@ -0,0 +1,230 @@ +# OpenSandbox Helm Chart Scripts + +This directory contains utility scripts for OpenSandbox Controller deployment and testing. + +## Script List + +### 1. 
install.sh - Installation Script + +Interactive installation of OpenSandbox Controller to Kubernetes cluster. + +**Features:** +- Automatic detection of sudo privilege requirements +- Validation of dependency tools (helm, kubectl) +- Cluster connection verification +- Support for multiple deployment environments: + - Default configuration (values.yaml) + - E2E testing (values-e2e.yaml) + - Custom configuration (via --set or custom values file) +- Helm Chart validation +- Display verification commands after deployment + +**Usage:** +```bash +cd scripts +./install.sh +``` + +**Environment Variables:** +- `IMAGE_REPO` - Override controller image repository +- `IMAGE_TAG` - Override controller image tag +- `SERVER_IMAGE_REPO` - Override server image repository +- `SERVER_IMAGE_TAG` - Override server image tag + +**Example:** +```bash +# Using custom images +IMAGE_REPO=myregistry.com/controller \ +IMAGE_TAG=v1.0.0 \ +SERVER_IMAGE_REPO=myregistry.com/server \ +SERVER_IMAGE_TAG=v0.1.0 \ +./install.sh +``` + +### 2. uninstall.sh - Uninstallation Script + +Uninstall OpenSandbox Controller and clean up related resources. + +**Features:** +- Check running BatchSandbox and Pool resources +- Display Controller and Server deployment status +- Optional CRD deletion +- Optional namespace deletion +- Post-uninstall cleanup verification + +**Usage:** +```bash +cd scripts +./uninstall.sh +``` + +**Environment Variables:** +- `RELEASE_NAME` - Release name (default: opensandbox-controller) +- `NAMESPACE` - Namespace (default: opensandbox) + +**Example:** +```bash +# Uninstall specific release +RELEASE_NAME=my-release NAMESPACE=my-namespace ./uninstall.sh +``` + +### 3. e2e-test.sh - End-to-End Test Script + +Execute complete end-to-end test workflow. + +**Test Workflow:** +1. Helm Install (using values-e2e.yaml) +2. Verify Controller and Server deployment +3. Verify Pool deployment +4. Verify SDK calls +5. 
Helm Uninstall + +**Features:** +- Automatic Server port-forward setup +- Server API health check validation +- Pool Pod execd process verification +- SDK integration test execution +- Automatic resource cleanup (including port-forward processes) + +**Usage:** +```bash +cd scripts +./e2e-test.sh [VALUES_FILE] + +# Using default values-e2e.yaml +./e2e-test.sh + +# Using custom values file +./e2e-test.sh custom-values.yaml +``` + +**Prerequisites:** +- Required Docker images must be loaded: + - opensandbox/controller:dev + - opensandbox/server:v0.1.0 + - opensandbox/task-executor:dev + - opensandbox/execd:v1.0.5 + - nginx:latest +- Python SDK installed (using uv) +- Cluster has sufficient resources to run test Pods + +## General Instructions + +### Sudo Privileges + +All scripts automatically detect whether sudo privileges are required to execute kubectl and helm commands. + +### Script Paths + +Scripts use relative paths to locate the Chart directory and can be invoked from any location: +```bash +# From chart root directory +./scripts/install.sh + +# From scripts directory +cd scripts +./install.sh + +# From other directory +/path/to/opensandbox-controller/scripts/install.sh +``` + +### Colored Output + +Scripts use ANSI color codes to enhance readability: +- 🟢 Green - Success messages +- 🟡 Yellow - Warnings and step titles +- 🔴 Red - Error messages + +### Error Handling + +All scripts use `set -e`, exiting immediately on errors. The e2e-test.sh uses trap to ensure cleanup functions execute on exit. 
+ +## Troubleshooting + +### install.sh + +**Issue: Cannot connect to Kubernetes cluster** +```bash +# Check kubeconfig +kubectl cluster-info + +# Check context +kubectl config current-context +``` + +**Issue: Chart validation fails** +```bash +# Manual validation +helm lint ../ +``` + +### uninstall.sh + +**Issue: Resources are still running** +```bash +# View all BatchSandbox +kubectl get batchsandboxes -A + +# View all Pool +kubectl get pools -A + +# Delete all resources +kubectl delete batchsandboxes --all -A +kubectl delete pools --all -A +``` + +### e2e-test.sh + +**Issue: Port-forward fails** +```bash +# Check if any process is using port 8080 +lsof -i :8080 + +# Manual port-forward test +kubectl port-forward -n opensandbox svc/opensandbox-controller-server 8080:8080 +``` + +**Issue: SDK test fails** +```bash +# Check Server logs +kubectl logs -n opensandbox -l app.kubernetes.io/component=server + +# Check Pool Pod logs +kubectl logs -n opensandbox -l pool=agent-pool + +# Test Server API +curl http://localhost:8080/health +``` + +**Issue: Image not found** +```bash +# Check if images are loaded +docker images | grep opensandbox + +# Reload images +docker load -i /path/to/image.tar +``` + +## Development Guide + +### Modifying Scripts + +After modifying scripts, ensure: +1. Maintain consistent error handling +2. Update related documentation +3. Use meaningful colored output +4. 
Add appropriate validation steps + +### Adding New Scripts + +New scripts should follow these conventions: +- Use `#!/bin/bash` shebang +- Use `set -e` to enable exit-on-error +- Implement automatic sudo detection +- Add colored output for readability +- Add documentation in this README + +## License + +Apache License 2.0 diff --git a/deploy/helm/opensandbox/scripts/e2e-test.sh b/deploy/helm/opensandbox/scripts/e2e-test.sh new file mode 100755 index 000000000000..48556d78fb15 --- /dev/null +++ b/deploy/helm/opensandbox/scripts/e2e-test.sh @@ -0,0 +1,546 @@ +#!/bin/bash +# OpenSandbox Helm Chart End-to-End Test Script +set -x +set -e + +# Get the parent directory of the script's directory (chart root directory) +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CHART_DIR="$(dirname "$SCRIPT_DIR")" +NAMESPACE="opensandbox" +RELEASE_NAME="opensandbox-e2e-test" +VALUES_FILE="${1:-values-e2e.yaml}" + +# Cleanup function: ensure temporary resources are cleaned up +cleanup() { + # Clean up any test resources that might be left over + if command -v kubectl &> /dev/null || command -v minikube &> /dev/null; then + kubectl delete configmap sdk-test-script -n "$NAMESPACE" --ignore-not-found=true > /dev/null 2>&1 || true + kubectl delete job sdk-test-job -n "$NAMESPACE" --ignore-not-found=true > /dev/null 2>&1 || true + kubectl delete pod server-health-check -n "$NAMESPACE" --ignore-not-found=true > /dev/null 2>&1 || true + fi +} + +# Register cleanup function to ensure execution on script exit +trap cleanup EXIT INT TERM + +# Check kubectl availability and detect minikube +USE_SUDO=false +USE_MINIKUBE=false + +# First check if kubectl is available directly +if kubectl get nodes &> /dev/null 2>&1; then + echo "Detected kubectl access to Kubernetes cluster" +elif sudo kubectl get nodes &> /dev/null 2>&1; then + echo "Detected sudo privileges required, will use sudo for commands" + USE_SUDO=true +# If kubectl is not available, check for minikube +elif command -v 
minikube &> /dev/null && minikube kubectl -- get nodes &> /dev/null 2>&1; then + echo "Detected minikube cluster, will use 'minikube kubectl --' for commands" + USE_MINIKUBE=true +elif command -v minikube &> /dev/null && sudo minikube kubectl -- get nodes &> /dev/null 2>&1; then + echo "Detected minikube cluster with sudo, will use 'sudo minikube kubectl --' for commands" + USE_MINIKUBE=true + USE_SUDO=true +else + echo "Error: Unable to access Kubernetes cluster" + echo "Please ensure kubectl or minikube is properly configured" + exit 1 +fi + +# Define command functions +kubectl_cmd() { + if [ "$USE_MINIKUBE" = true ]; then + if [ "$USE_SUDO" = true ]; then + sudo minikube kubectl -- "$@" + else + minikube kubectl -- "$@" + fi + else + if [ "$USE_SUDO" = true ]; then + sudo kubectl "$@" + else + kubectl "$@" + fi + fi +} + +helm_cmd() { + if [ "$USE_SUDO" = true ]; then + sudo helm "$@" + else + helm "$@" + fi +} + +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' + +echo -e "${GREEN}==========================================${NC}" +echo -e "${GREEN}OpenSandbox Helm Chart E2E Validation${NC}" +echo -e "${GREEN}==========================================${NC}" +echo "" +echo "Test Coverage:" +echo " 1. Helm Install (using ${VALUES_FILE})" +echo " 2. Server Deployment Verification" +echo " 3. Pool Deployment Verification" +echo " 4. SDK Integration Verification" +echo " 5. 
Helm Uninstall" +echo "" +echo "Environment Info:" +echo " Chart: ${CHART_DIR}" +echo " Values: ${VALUES_FILE}" +echo " Release: ${RELEASE_NAME}" +echo " Namespace: ${NAMESPACE}" +echo "" + +# ========================================== +# Stage 1: Helm Install +# ========================================== +echo -e "${GREEN}==========================================${NC}" +echo -e "${GREEN}Stage 1: Helm Install${NC}" +echo -e "${GREEN}==========================================${NC}" +echo "" + +echo -e "${YELLOW}[1.1] Checking for existing release...${NC}" +if helm_cmd list -n "$NAMESPACE" 2>/dev/null | grep -q "$RELEASE_NAME"; then + echo " Release already exists, uninstalling first..." + helm_cmd uninstall "$RELEASE_NAME" -n "$NAMESPACE" 2>/dev/null || true + sleep 5 +fi +echo -e "${GREEN}✓ Check completed${NC}" +echo "" + +echo -e "${YELLOW}[1.2] Installing Helm chart (using ${VALUES_FILE})...${NC}" +helm_cmd install "$RELEASE_NAME" "$CHART_DIR" \ + --values "$CHART_DIR/$VALUES_FILE" \ + --namespace "$NAMESPACE" \ + --create-namespace \ + --wait \ + --timeout 3m 2>&1 | tail -5 +echo -e "${GREEN}✓ Helm chart installed successfully${NC}" +echo "" + +echo -e "${YELLOW}[1.3] Waiting for Controller to be ready...${NC}" +kubectl_cmd wait --for=condition=available \ + deployment/opensandbox-controller-manager \ + -n "$NAMESPACE" \ + --timeout=120s 2>/dev/null +echo -e "${GREEN}✓ Controller is ready${NC}" +echo "" + +echo -e "${YELLOW}[1.4] Checking deployment status...${NC}" +kubectl_cmd get deployment -n "$NAMESPACE" +echo "" + +echo -e "${GREEN}✅ Stage 1 Complete: Helm Install Successful${NC}" +echo "" + +# ========================================== +# Stage 2: Server Deployment Verification +# ========================================== +echo -e "${GREEN}==========================================${NC}" +echo -e "${GREEN}Stage 2: Server Deployment Verification${NC}" +echo -e "${GREEN}==========================================${NC}" +echo "" + +echo -e "${YELLOW}[2.1] 
Checking Server Service...${NC}" +SERVER_SERVICE_NAME=$(kubectl_cmd get svc -n "$NAMESPACE" -l app.kubernetes.io/component=server -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) +if [ -z "$SERVER_SERVICE_NAME" ]; then + echo -e "${RED}❌ Server Service does not exist${NC}" + exit 1 +fi +echo "  Server Service: ${SERVER_SERVICE_NAME}" +kubectl_cmd get svc "$SERVER_SERVICE_NAME" -n "$NAMESPACE" +echo "" + +echo -e "${YELLOW}[2.2] Waiting for Server Pod to be ready...${NC}" +SERVER_DEPLOYMENT_NAME=$(kubectl_cmd get deployment -n "$NAMESPACE" -l app.kubernetes.io/component=server -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) +if [ -z "$SERVER_DEPLOYMENT_NAME" ]; then + echo -e "${RED}❌ Server Deployment does not exist${NC}" + exit 1 +fi +echo "  Server Deployment: ${SERVER_DEPLOYMENT_NAME}" +kubectl_cmd wait --for=condition=available \ + deployment/"$SERVER_DEPLOYMENT_NAME" \ + -n "$NAMESPACE" \ + --timeout=120s 2>/dev/null +echo -e "${GREEN}✓ Server Deployment is ready${NC}" +echo "" + +echo -e "${YELLOW}[2.3] Checking Server Pod status...${NC}" +kubectl_cmd get pods -n "$NAMESPACE" -l app.kubernetes.io/component=server +echo "" + +echo -e "${YELLOW}[2.4] Testing Server API (from within cluster)...${NC}" +# Create a simple test pod to check server health from inside the cluster +cat <<EOF | kubectl_cmd apply -f - > /dev/null +apiVersion: v1 +kind: Pod +metadata: + name: server-health-check + namespace: $NAMESPACE +spec: + restartPolicy: Never + containers: + - name: curl + image: curlimages/curl:latest + command: ['curl', '-s', 'http://${SERVER_SERVICE_NAME}:8080/health'] +EOF + +# Wait for pod to complete +kubectl_cmd wait --for=condition=Ready pod/server-health-check -n "$NAMESPACE" --timeout=30s 2>/dev/null || true +sleep 2 +HEALTH_RESPONSE=$(kubectl_cmd logs server-health-check -n "$NAMESPACE" 2>/dev/null || echo "") +kubectl_cmd delete pod server-health-check -n "$NAMESPACE" --ignore-not-found=true > /dev/null 2>&1 + +if [ -n "$HEALTH_RESPONSE" ]; then + echo -e "${GREEN}✓ Server API 
responding normally: $HEALTH_RESPONSE${NC}" +else + echo -e "${RED}❌ Server API not responding${NC}" + exit 1 +fi +echo "" + +echo -e "${GREEN}✅ Stage 2 Complete: Server Deployment Verified${NC}" +echo "" + +# ========================================== +# Stage 3: Pool Deployment Verification +# ========================================== +echo -e "${GREEN}==========================================${NC}" +echo -e "${GREEN}Stage 3: Pool Deployment Verification${NC}" +echo -e "${GREEN}==========================================${NC}" +echo "" + +echo -e "${YELLOW}[3.1] Checking Pool resources...${NC}" +POOL_COUNT=$(kubectl_cmd get pool -n "$NAMESPACE" --no-headers 2>/dev/null | wc -l) +echo " Pool count: ${POOL_COUNT}" +if [ "$POOL_COUNT" -eq 0 ]; then + echo -e "${RED}❌ No Pool resources found${NC}" + exit 1 +fi +kubectl_cmd get pool -n "$NAMESPACE" +echo "" + +echo -e "${YELLOW}[3.2] Checking agent-pool status...${NC}" +if ! kubectl_cmd get pool agent-pool -n "$NAMESPACE" --no-headers 2>/dev/null | grep -q agent-pool; then + echo -e "${RED}❌ agent-pool does not exist${NC}" + exit 1 +fi +echo -e "${GREEN}✓ agent-pool exists${NC}" +echo "" + +echo -e "${YELLOW}[3.3] Viewing Pool detailed status...${NC}" +kubectl_cmd get pool agent-pool -n "$NAMESPACE" -o jsonpath='{.status}' 2>/dev/null | jq '.' 2>/dev/null || echo " (jq not installed, skipping JSON formatting)" +echo "" + +echo -e "${YELLOW}[3.4] Waiting for Pool Pods to be ready (up to 180 seconds)...${NC}" +TIMEOUT=180 +ELAPSED=0 +READY=false +while [ $ELAPSED -lt $TIMEOUT ]; do + AVAILABLE=$(kubectl_cmd get pool agent-pool -n "$NAMESPACE" -o jsonpath='{.status.available}' 2>/dev/null || echo "") + if [ -n "$AVAILABLE" ] && [ "$AVAILABLE" -gt 0 ]; then + echo -e "${GREEN}✓ Pool has ${AVAILABLE} available Pods${NC}" + READY=true + break + fi + echo " Waiting... 
(${ELAPSED}s/${TIMEOUT}s)" + sleep 5 + ELAPSED=$((ELAPSED + 5)) +done +if [ "$READY" = false ]; then + echo -e "${RED}❌ Pool Pods not ready, timed out${NC}" + echo "" + echo "Viewing Pool events:" + kubectl_cmd describe pool agent-pool -n "$NAMESPACE" | tail -20 + exit 1 +fi +echo "" + +echo -e "${YELLOW}[3.5] Viewing Pool Pods...${NC}" +kubectl_cmd get pods -l pool=agent-pool -n "$NAMESPACE" +echo "" + +echo -e "${YELLOW}[3.6] Checking execd process in Pod...${NC}" +POD_NAME=$(kubectl_cmd get pods -l pool=agent-pool -n "$NAMESPACE" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) +if [ -n "$POD_NAME" ]; then + echo " Checking Pod: ${POD_NAME}" + sleep 3 + if kubectl_cmd exec -n "$NAMESPACE" "$POD_NAME" -c sandbox-container -- pgrep -f execd > /dev/null 2>&1; then + EXECD_PID=$(kubectl_cmd exec -n "$NAMESPACE" "$POD_NAME" -c sandbox-container -- pgrep -f execd 2>/dev/null) + echo -e "${GREEN}✓ execd process is running (PID: ${EXECD_PID})${NC}" + else + echo -e "${YELLOW}⚠️ execd process not found, checking container logs...${NC}" + kubectl_cmd logs -n "$NAMESPACE" "$POD_NAME" -c sandbox-container --tail=20 2>/dev/null || true + fi +else + echo -e "${YELLOW}⚠️ No agent-pool Pod found${NC}" +fi +echo "" + +echo -e "${GREEN}✅ Stage 3 Complete: Pool Deployment Verified${NC}" +echo "" + +# ========================================== +# Stage 4: SDK Integration Verification +# ========================================== +echo -e "${GREEN}==========================================${NC}" +echo -e "${GREEN}Stage 4: SDK Integration Verification${NC}" +echo -e "${GREEN}==========================================${NC}" +echo "" + +echo -e "${YELLOW}[4.1] Creating SDK test script ConfigMap...${NC}" +SDK_TEST_SCRIPT=$(cat <<'EOF' +import asyncio +from datetime import timedelta +from opensandbox import Sandbox +from opensandbox.config import ConnectionConfig + +async def main(): + print("=" * 60) + print("SDK End-to-End Test") + print("=" * 60) + + # Use internal service name 
for cluster-internal communication + config = ConnectionConfig(domain="SERVER_SERVICE_PLACEHOLDER:8080") + + print("\n[Test 1] Creating sandbox (using agent-pool)...") + try: + sandbox = await Sandbox.create( + "nginx:latest", + entrypoint=["/bin/sh", "-c", "sleep infinity"], + env={"TEST": "e2e"}, + timeout=timedelta(minutes=10), + ready_timeout=timedelta(minutes=5), + connection_config=config, + extensions={"poolRef": "agent-pool"} + ) + print(f"✅ Sandbox created successfully: {sandbox.id}") + except Exception as e: + print(f"❌ Sandbox creation failed: {e}") + import traceback + traceback.print_exc() + return False + + try: + print("\n[Test 2] Executing command...") + execution = await sandbox.commands.run("echo 'Hello from E2E test'") + if execution.logs.stdout: + print(f"✅ Command executed successfully: {execution.logs.stdout[0].text}") + else: + print("⚠️ Command executed successfully but no output") + + print("\n[Test 3] File operations...") + from opensandbox.models import WriteEntry + await sandbox.files.write_files([ + WriteEntry(path="/tmp/e2e.txt", data="E2E Test", mode=644) + ]) + print("✅ File written successfully") + + content = await sandbox.files.read_file("/tmp/e2e.txt") + print(f"✅ File read successfully: {content}") + + print("\n[Test 4] Cleaning up sandbox...") + await sandbox.kill() + print("✅ Sandbox cleaned up successfully") + + print("\n" + "=" * 60) + print("✅ All SDK end-to-end tests passed!") + print("=" * 60) + return True + + except Exception as e: + print(f"❌ Test failed: {e}") + import traceback + traceback.print_exc() + try: + await sandbox.kill() + except: + pass + return False + +if __name__ == "__main__": + success = asyncio.run(main()) + exit(0 if success else 1) +EOF +) + +# Replace placeholder with actual service name +SDK_TEST_SCRIPT="${SDK_TEST_SCRIPT//SERVER_SERVICE_PLACEHOLDER/${SERVER_SERVICE_NAME}}" + +# Create ConfigMap with test script +cat < /dev/null +apiVersion: v1 +kind: ConfigMap +metadata: + name: sdk-test-script 
+ namespace: $NAMESPACE +data: + test.py: | +$(echo "$SDK_TEST_SCRIPT" | sed 's/^/ /') +EOF + +echo -e "${GREEN}✓ SDK test script ConfigMap created${NC}" +echo "" + +echo -e "${YELLOW}[4.2] Creating SDK test Job...${NC}" +cat < /dev/null +apiVersion: batch/v1 +kind: Job +metadata: + name: sdk-test-job + namespace: $NAMESPACE +spec: + ttlSecondsAfterFinished: 300 + backoffLimit: 0 + template: + spec: + restartPolicy: Never + containers: + - name: sdk-test + image: ghcr.io/astral-sh/uv:python3.12-bookworm + command: + - /bin/bash + - -c + - | + set -e + echo "Installing opensandbox SDK..." + uv pip install --system opensandbox + echo "" + echo "Running SDK tests..." + python /test/test.py + volumeMounts: + - name: test-script + mountPath: /test + volumes: + - name: test-script + configMap: + name: sdk-test-script +EOF + +echo -e "${GREEN}✓ SDK test Job created${NC}" +echo "" + +echo -e "${YELLOW}[4.3] Waiting for SDK test Job to complete (timeout: 5 minutes)...${NC}" +# Wait for job to start +sleep 5 + +# Get pod name +SDK_TEST_POD="" +for i in $(seq 1 30); do + SDK_TEST_POD=$(kubectl_cmd get pods -n "$NAMESPACE" -l job-name=sdk-test-job -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + if [ -n "$SDK_TEST_POD" ]; then + echo " SDK test pod: $SDK_TEST_POD" + break + fi + sleep 2 +done + +if [ -z "$SDK_TEST_POD" ]; then + echo -e "${RED}❌ SDK test pod not found${NC}" + kubectl_cmd get jobs -n "$NAMESPACE" sdk-test-job + exit 1 +fi + +# Stream logs and wait for completion +echo "" +echo " Streaming test logs:" +echo " ---" +kubectl_cmd logs -f "$SDK_TEST_POD" -n "$NAMESPACE" 2>/dev/null || true +echo " ---" +echo "" + +# Wait a bit for Job to update status after pod completion +echo " Waiting for Job status to update..." 
+sleep 5 + +# Wait for job to complete (up to 60 seconds) +for i in $(seq 1 60); do + JOB_STATUS=$(kubectl_cmd get job sdk-test-job -n "$NAMESPACE" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' 2>/dev/null || echo "") + JOB_FAILED=$(kubectl_cmd get job sdk-test-job -n "$NAMESPACE" -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' 2>/dev/null || echo "") + + if [ "$JOB_STATUS" = "True" ] || [ "$JOB_FAILED" = "True" ]; then + break + fi + sleep 1 +done + +# Check job status +JOB_STATUS=$(kubectl_cmd get job sdk-test-job -n "$NAMESPACE" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' 2>/dev/null || echo "") +JOB_FAILED=$(kubectl_cmd get job sdk-test-job -n "$NAMESPACE" -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' 2>/dev/null || echo "") + +if [ "$JOB_STATUS" = "True" ]; then + echo -e "${GREEN}✅ Stage 4 Complete: SDK Integration Verified${NC}" +elif [ "$JOB_FAILED" = "True" ]; then + echo -e "${RED}❌ Stage 4 Failed: SDK Integration Failed${NC}" + echo "" + echo -e "${YELLOW}Diagnostic Information:${NC}" + kubectl_cmd describe job sdk-test-job -n "$NAMESPACE" | tail -20 + exit 1 +else + echo -e "${RED}❌ Stage 4 Failed: SDK test job did not complete${NC}" + kubectl_cmd get job sdk-test-job -n "$NAMESPACE" + exit 1 +fi +echo "" + +# ========================================== +# Stage 5: Helm Uninstall +# ========================================== +echo -e "${GREEN}==========================================${NC}" +echo -e "${GREEN}Stage 5: Helm Uninstall${NC}" +echo -e "${GREEN}==========================================${NC}" +echo "" + +echo -e "${YELLOW}[5.1] Cleaning up test resources...${NC}" +kubectl_cmd delete configmap sdk-test-script -n "$NAMESPACE" --ignore-not-found=true > /dev/null 2>&1 +kubectl_cmd delete job sdk-test-job -n "$NAMESPACE" --ignore-not-found=true > /dev/null 2>&1 +echo -e "${GREEN}✓ Test resources cleaned up${NC}" +echo "" + +echo -e "${YELLOW}[5.2] Uninstalling Helm 
release...${NC}" +helm_cmd uninstall "$RELEASE_NAME" -n "$NAMESPACE" +echo -e "${GREEN}✓ Helm release uninstalled${NC}" +echo "" + +echo "Waiting for resource cleanup..." +sleep 10 + +echo -e "${YELLOW}[5.3] Verifying resources are cleaned up...${NC}" +REMAINING_PODS=$(kubectl_cmd get pods -n "$NAMESPACE" --no-headers 2>/dev/null | wc -l) +REMAINING_POOLS=$(kubectl_cmd get pools -n "$NAMESPACE" --no-headers 2>/dev/null | wc -l) +echo " Remaining Pods: ${REMAINING_PODS}" +echo " Remaining Pools: ${REMAINING_POOLS}" + +if [ "$REMAINING_PODS" -eq 0 ] && [ "$REMAINING_POOLS" -eq 0 ]; then + echo -e "${GREEN}✓ All resources cleaned up${NC}" +else + echo -e "${YELLOW}⚠️ Resources still remaining (Terminating)${NC}" + if [ "$REMAINING_PODS" -gt 0 ]; then + kubectl_cmd get pods -n "$NAMESPACE" 2>/dev/null || true + fi +fi +echo "" + +echo -e "${GREEN}✅ Stage 5 Complete: Helm Uninstall Successful${NC}" +echo "" + +# ========================================== +# Test Summary +# ========================================== +echo -e "${GREEN}==========================================${NC}" +echo -e "${GREEN}End-to-End Test Complete!${NC}" +echo -e "${GREEN}==========================================${NC}" +echo "" +echo "Test Results:" +echo -e " ${GREEN}✅ Stage 1: Helm Install - Success${NC}" +echo -e " ${GREEN}✅ Stage 2: Server Deployment Verification - Success${NC}" +echo -e " ${GREEN}✅ Stage 3: Pool Deployment Verification - Success${NC}" +echo -e " ${GREEN}✅ Stage 4: SDK Integration Verification - Success${NC}" +echo -e " ${GREEN}✅ Stage 5: Helm Uninstall - Success${NC}" +echo "" +echo -e "${GREEN}🎉 All tests passed!${NC}" +echo "" diff --git a/deploy/helm/opensandbox/scripts/install.sh b/deploy/helm/opensandbox/scripts/install.sh new file mode 100755 index 000000000000..f6016338ff13 --- /dev/null +++ b/deploy/helm/opensandbox/scripts/install.sh @@ -0,0 +1,209 @@ +#!/bin/bash +# OpenSandbox Controller Deployment Script + +set -e + +# Check if sudo is required +USE_SUDO=false 
+if ! kubectl get nodes &> /dev/null 2>&1; then + if sudo kubectl get nodes &> /dev/null 2>&1; then + echo "Detected that sudo permissions are required, will use sudo to execute commands" + USE_SUDO=true + else + echo "Error: Unable to access Kubernetes cluster" + exit 1 + fi +fi + +# Define command functions +kubectl_cmd() { + if [ "$USE_SUDO" = true ]; then + sudo kubectl "$@" + else + kubectl "$@" + fi +} + +helm_cmd() { + if [ "$USE_SUDO" = true ]; then + sudo helm "$@" + else + helm "$@" + fi +} + +# Color output +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +echo -e "${GREEN}======================================${NC}" +echo -e "${GREEN}OpenSandbox Controller Helm Deployment${NC}" +echo -e "${GREEN}======================================${NC}" +echo "" + +# Check dependencies +echo -e "${YELLOW}[1/6] Checking dependencies...${NC}" +if ! command -v helm &> /dev/null; then + echo -e "${RED}Error: helm command not found${NC}" + echo "Please install Helm 3.0+: https://helm.sh/docs/intro/install/" + exit 1 +fi + +if ! command -v kubectl &> /dev/null; then + echo -e "${RED}Error: kubectl command not found${NC}" + echo "Please install kubectl: https://kubernetes.io/docs/tasks/tools/" + exit 1 +fi + +echo -e "${GREEN}✓ Helm version: $(helm version --short)${NC}" +echo -e "${GREEN}✓ Kubectl version: $(kubectl version --client --short 2>/dev/null || kubectl version --client)${NC}" +echo "" + +# Check cluster connection +echo -e "${YELLOW}[2/6] Checking Kubernetes cluster connection...${NC}" +if ! 
kubectl_cmd cluster-info &> /dev/null; then + echo -e "${RED}Error: Unable to connect to Kubernetes cluster${NC}" + echo "Please check your ~/.kube/config configuration" + exit 1 +fi +echo -e "${GREEN}✓ Cluster connection successful${NC}" +kubectl_cmd cluster-info | head -2 +echo "" + +# Configuration parameters +# Get the parent directory of the script directory (chart root directory) +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CHART_DIR="$(dirname "$SCRIPT_DIR")" +RELEASE_NAME="opensandbox-controller" +NAMESPACE="opensandbox" + +echo -e "${YELLOW}[3/6] Configuration parameters${NC}" +echo "Chart directory: $CHART_DIR" +echo "Release name: $RELEASE_NAME" +echo "Namespace: $NAMESPACE" +if [ -n "$IMAGE_REPO" ] || [ -n "$IMAGE_TAG" ]; then + echo "Image override: ${IMAGE_REPO:-}:${IMAGE_TAG:-}" +else + echo "Image configuration: Using configuration from values file" +fi +echo "" + +# Select deployment environment +echo -e "${YELLOW}[4/6] Select deployment environment${NC}" +echo "1) Default configuration (values.yaml)" +echo "2) End-to-end testing (values-e2e.yaml)" +echo "3) Custom (custom values)" +read -p "Please select [1-3]: " env_choice + +case $env_choice in + 1) + VALUES_FILE="$CHART_DIR/values.yaml" + echo -e "${GREEN}✓ Using default configuration${NC}" + ;; + 2) + VALUES_FILE="$CHART_DIR/values-e2e.yaml" + echo -e "${GREEN}✓ E2E test configuration selected${NC}" + ;; + 3) + read -p "Please enter values file path: " custom_values + VALUES_FILE="$custom_values" + echo -e "${GREEN}✓ Using custom configuration: $VALUES_FILE${NC}" + ;; + *) + echo -e "${RED}Invalid selection, using default configuration${NC}" + VALUES_FILE="$CHART_DIR/values.yaml" + ;; +esac +echo "" + +# Validate Chart +echo -e "${YELLOW}[5/6] Validating Helm Chart...${NC}" +if ! 
helm lint "$CHART_DIR" &> /dev/null; then + echo -e "${RED}Error: Chart validation failed${NC}" + helm lint "$CHART_DIR" + exit 1 +fi +echo -e "${GREEN}✓ Chart validation passed${NC}" +echo "" + +# Confirm deployment +echo -e "${YELLOW}[6/6] Preparing for deployment${NC}" +echo "The following operations will be performed:" +echo " - Create namespace: $NAMESPACE" +echo " - Install CRDs: BatchSandbox, Pool" +echo " - Deploy Controller Manager" +echo " - Deploy Server (FastAPI control plane)" +echo " - Deploy RBAC resources" +echo " - Deploy Metrics service" +echo " - Create default Pool (agent-pool)" +echo "" +read -p "Confirm deployment? [y/N]: " confirm + +if [[ ! $confirm =~ ^[Yy]$ ]]; then + echo -e "${YELLOW}Deployment cancelled${NC}" + exit 0 +fi + +# Execute deployment +echo "" +echo -e "${GREEN}Starting deployment...${NC}" +echo "" + +# If environment variables are set, override image configuration in values file +EXTRA_ARGS="" +if [ -n "$IMAGE_REPO" ]; then + EXTRA_ARGS="$EXTRA_ARGS --set controllerManager.image.repository=$IMAGE_REPO" +fi +if [ -n "$IMAGE_TAG" ]; then + EXTRA_ARGS="$EXTRA_ARGS --set controllerManager.image.tag=$IMAGE_TAG" +fi +if [ -n "$SERVER_IMAGE_REPO" ]; then + EXTRA_ARGS="$EXTRA_ARGS --set server.image.repository=$SERVER_IMAGE_REPO" +fi +if [ -n "$SERVER_IMAGE_TAG" ]; then + EXTRA_ARGS="$EXTRA_ARGS --set server.image.tag=$SERVER_IMAGE_TAG" +fi + +helm_cmd upgrade --install "$RELEASE_NAME" "$CHART_DIR" \ + --namespace "$NAMESPACE" \ + --create-namespace \ + -f "$VALUES_FILE" \ + $EXTRA_ARGS \ + --wait \ + --timeout 5m + +echo "" +echo -e "${GREEN}======================================${NC}" +echo -e "${GREEN}✓ Deployment completed!${NC}" +echo -e "${GREEN}======================================${NC}" +echo "" + +# Display deployment information +echo -e "${YELLOW}Deployment information:${NC}" +helm_cmd status "$RELEASE_NAME" -n "$NAMESPACE" + +echo "" +echo -e "${YELLOW}Verify deployment:${NC}" +echo "1. 
Check Pod status:" +echo " $([ "$USE_SUDO" = true ] && echo "sudo ")kubectl get pods -n $NAMESPACE" +echo "" +echo "2. View Controller logs:" +echo " $([ "$USE_SUDO" = true ] && echo "sudo ")kubectl logs -n $NAMESPACE -l control-plane=controller-manager -f" +echo "" +echo "3. View Server logs:" +echo " $([ "$USE_SUDO" = true ] && echo "sudo ")kubectl logs -n $NAMESPACE -l app.kubernetes.io/component=server -f" +echo "" +echo "4. Access Server API (Port Forward):" +echo " $([ "$USE_SUDO" = true ] && echo "sudo ")kubectl port-forward -n $NAMESPACE svc/$RELEASE_NAME-server 8080:8080" +echo " curl http://localhost:8080/health" +echo "" +echo "5. View CRDs:" +echo " $([ "$USE_SUDO" = true ] && echo "sudo ")kubectl get crds | grep sandbox.opensandbox.io" +echo "" +echo "6. Check Pool status:" +echo " $([ "$USE_SUDO" = true ] && echo "sudo ")kubectl get pools -n $NAMESPACE" +echo "" + +echo -e "${GREEN}Thank you for using OpenSandbox Controller!${NC}" diff --git a/deploy/helm/opensandbox/scripts/uninstall.sh b/deploy/helm/opensandbox/scripts/uninstall.sh new file mode 100755 index 000000000000..a0295381b79f --- /dev/null +++ b/deploy/helm/opensandbox/scripts/uninstall.sh @@ -0,0 +1,154 @@ +#!/bin/bash +# OpenSandbox Controller Uninstall Script + +set -e + +# Check if sudo is required +USE_SUDO=false +if ! 
kubectl get nodes &> /dev/null 2>&1; then + if sudo kubectl get nodes &> /dev/null 2>&1; then + echo "Detected sudo privileges required, will use sudo to execute commands" + USE_SUDO=true + else + echo "Error: Unable to access Kubernetes cluster" + exit 1 + fi +fi + +# Define command functions +kubectl_cmd() { + if [ "$USE_SUDO" = true ]; then + sudo kubectl "$@" + else + kubectl "$@" + fi +} + +helm_cmd() { + if [ "$USE_SUDO" = true ]; then + sudo helm "$@" + else + helm "$@" + fi +} + +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' + +echo -e "${YELLOW}======================================${NC}" +echo -e "${YELLOW}OpenSandbox Controller Uninstall${NC}" +echo -e "${YELLOW}======================================${NC}" +echo "" + +RELEASE_NAME="${RELEASE_NAME:-opensandbox-controller}" +NAMESPACE="${NAMESPACE:-opensandbox}" + +# Check if already installed +if ! helm_cmd list -n "$NAMESPACE" | grep -q "$RELEASE_NAME"; then + echo -e "${RED}Release not found: $RELEASE_NAME${NC}" + echo "Currently installed releases:" + helm_cmd list -A + exit 1 +fi + +echo "About to uninstall:" +echo " Release: $RELEASE_NAME" +echo " Namespace: $NAMESPACE" +echo "" + +# Show current resources +echo -e "${YELLOW}Current resources:${NC}" +echo "Controller:" +kubectl_cmd get deployment -n "$NAMESPACE" -l control-plane=controller-manager 2>/dev/null || echo " Controller not found" +echo "" +echo "Server:" +kubectl_cmd get deployment -n "$NAMESPACE" -l app.kubernetes.io/component=server 2>/dev/null || echo " Server not found" +echo "" + +# Check if there are running resources +echo -e "${YELLOW}Checking custom resources...${NC}" +BATCHSANDBOXES=$(kubectl_cmd get batchsandboxes -A --no-headers 2>/dev/null | wc -l) +POOLS=$(kubectl_cmd get pools -A --no-headers 2>/dev/null | wc -l) + +if [ "$BATCHSANDBOXES" -gt 0 ] || [ "$POOLS" -gt 0 ]; then + echo -e "${RED}Warning: Running resources detected!${NC}" + echo " BatchSandboxes: $BATCHSANDBOXES" + echo " Pools: $POOLS" + 
echo "" + + # Show Pool details + if [ "$POOLS" -gt 0 ]; then + echo "Pool details:" + kubectl_cmd get pools -A + echo "" + fi + + echo "Recommended to delete these resources first:" + echo " $([ "$USE_SUDO" = true ] && echo "sudo ")kubectl delete batchsandboxes --all -A" + echo " $([ "$USE_SUDO" = true ] && echo "sudo ")kubectl delete pools --all -A" + echo "" + read -p "Continue with uninstall? [y/N]: " force_continue + if [[ ! $force_continue =~ ^[Yy]$ ]]; then + echo -e "${YELLOW}Uninstall cancelled${NC}" + exit 0 + fi +fi + +# Confirm uninstall +read -p "Confirm uninstall of $RELEASE_NAME (including Controller and Server)? [y/N]: " confirm +if [[ ! $confirm =~ ^[Yy]$ ]]; then + echo -e "${YELLOW}Uninstall cancelled${NC}" + exit 0 +fi + +echo "" +echo -e "${GREEN}Starting uninstall...${NC}" + +# Uninstall Helm release +helm_cmd uninstall "$RELEASE_NAME" -n "$NAMESPACE" + +echo -e "${GREEN}✓ Helm release uninstalled${NC}" +echo "" + +# Wait for Pod termination +echo "Waiting for Pods to terminate..." +sleep 5 + +# Ask whether to delete CRDs +read -p "Delete CRDs? (This will delete all BatchSandbox and Pool resources) [y/N]: " delete_crds +if [[ $delete_crds =~ ^[Yy]$ ]]; then + echo "Deleting CRDs..." + kubectl_cmd delete crd batchsandboxes.sandbox.opensandbox.io 2>/dev/null || echo " CRD batchsandboxes does not exist" + kubectl_cmd delete crd pools.sandbox.opensandbox.io 2>/dev/null || echo " CRD pools does not exist" + echo -e "${GREEN}✓ CRDs deleted${NC}" +else + echo -e "${YELLOW}⊗ CRDs retained${NC}" +fi +echo "" + +# Ask whether to delete namespace +read -p "Delete namespace $NAMESPACE? [y/N]: " delete_ns +if [[ $delete_ns =~ ^[Yy]$ ]]; then + echo "Deleting namespace..." 
+ kubectl_cmd delete namespace "$NAMESPACE" 2>/dev/null || echo " Namespace does not exist" + echo -e "${GREEN}✓ Namespace deleted${NC}" +else + echo -e "${YELLOW}⊗ Namespace retained${NC}" +fi + +echo "" +echo -e "${GREEN}======================================${NC}" +echo -e "${GREEN}✓ Uninstall completed${NC}" +echo -e "${GREEN}======================================${NC}" +echo "" + +# Verify uninstall +echo -e "${YELLOW}Verifying uninstall:${NC}" +echo "Checking Helm releases:" +helm_cmd list -n "$NAMESPACE" 2>/dev/null || echo " Namespace does not exist" +echo "" +echo "Checking CRDs:" +kubectl_cmd get crds | grep sandbox.opensandbox.io || echo " No OpenSandbox CRDs found" +echo "" diff --git a/deploy/helm/opensandbox/templates/NOTES.txt b/deploy/helm/opensandbox/templates/NOTES.txt new file mode 100644 index 000000000000..07f05c007eae --- /dev/null +++ b/deploy/helm/opensandbox/templates/NOTES.txt @@ -0,0 +1,159 @@ +Thank you for installing {{ .Chart.Name }}! + +Your release is named {{ .Release.Name }}. + +To learn more about the release, try: + + $ helm status {{ .Release.Name }} + $ helm get all {{ .Release.Name }} + +=============================================================================== + +OpenSandbox Kubernetes Controller has been deployed to namespace: {{ include "opensandbox-controller.namespace" . }} + +Note: Both the controller and user resources (Pool, BatchSandbox) use the same namespace. + +1. Check the controller status: + + kubectl get deployment -n {{ include "opensandbox-controller.namespace" . }} {{ .Values.namePrefix }}controller-manager + +2. View controller logs: + + kubectl logs -n {{ include "opensandbox-controller.namespace" . }} -l control-plane=controller-manager -f + +3. Verify CRDs are installed: + + kubectl get crds | grep sandbox.opensandbox.io + + You should see: + - batchsandboxes.sandbox.opensandbox.io + - pools.sandbox.opensandbox.io + +4. 
Pool Resources: +{{- if .Values.pools }} +{{- $enabledPools := list }} +{{- range .Values.pools }} +{{- if .enabled }} +{{- $enabledPools = append $enabledPools .name }} +{{- end }} +{{- end }} +{{- if $enabledPools }} + + ✅ The following Pools have been deployed: +{{- range $enabledPools }} + - {{ . }} +{{- end }} + + Check Pool status: + kubectl get pools -n {{ include "opensandbox-controller.namespace" . }} + + Check Pool details: + kubectl describe pool agent-pool -n {{ include "opensandbox-controller.namespace" . }} + + Monitor Pool capacity: + kubectl get pool agent-pool -n {{ include "opensandbox-controller.namespace" . }} -o jsonpath='{.status}' +{{- else }} + + ⚠️ All Pools are disabled in values.yaml. + To enable the default agent-pool: + helm upgrade {{ .Release.Name }} opensandbox-controller --set pools[0].enabled=true +{{- end }} +{{- else }} + + No Pools configured. +{{- end }} + + To disable Pool auto-deployment: + helm upgrade {{ .Release.Name }} opensandbox-controller --set pools[0].enabled=false + + Create additional Pool manually: + + # Basic Pool without task executor + cat <