From 95b23e55ecf0c806e7d2f0455525508bc1c5824a Mon Sep 17 00:00:00 2001 From: dima Date: Tue, 13 May 2025 15:30:21 -0700 Subject: [PATCH 01/48] kubernetes manifests for the genesis node --- test/genesis-k3s/api-deployment.yaml | 64 +++++++++++++++++ test/genesis-k3s/api-service.yaml | 23 ++++++ test/genesis-k3s/config.yaml | 7 ++ .../genesis-overrides-configmap.yaml | 52 ++++++++++++++ test/genesis-k3s/inference-deployment.yaml | 46 ++++++++++++ test/genesis-k3s/inference-service.yaml | 14 ++++ test/genesis-k3s/node-config-configmap.yaml | 23 ++++++ test/genesis-k3s/node-service.yaml | 23 ++++++ test/genesis-k3s/node-statefulset.yaml | 70 +++++++++++++++++++ test/genesis-k3s/tmkms-deployment.yaml | 32 +++++++++ 10 files changed, 354 insertions(+) create mode 100644 test/genesis-k3s/api-deployment.yaml create mode 100644 test/genesis-k3s/api-service.yaml create mode 100644 test/genesis-k3s/config.yaml create mode 100644 test/genesis-k3s/genesis-overrides-configmap.yaml create mode 100644 test/genesis-k3s/inference-deployment.yaml create mode 100644 test/genesis-k3s/inference-service.yaml create mode 100644 test/genesis-k3s/node-config-configmap.yaml create mode 100644 test/genesis-k3s/node-service.yaml create mode 100644 test/genesis-k3s/node-statefulset.yaml create mode 100644 test/genesis-k3s/tmkms-deployment.yaml diff --git a/test/genesis-k3s/api-deployment.yaml b/test/genesis-k3s/api-deployment.yaml new file mode 100644 index 0000000..facea66 --- /dev/null +++ b/test/genesis-k3s/api-deployment.yaml @@ -0,0 +1,64 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api + labels: + app: api +spec: + replicas: 1 + selector: + matchLabels: + app: api + template: + metadata: + labels: + app: api + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-1 + containers: + - name: api + image: ghcr.io/product-science/api:0.1.2 + ports: + - containerPort: 9000 + name: public + - containerPort: 9100 + name: ml-server + - containerPort: 9200 + name: admin + env: 
+ - name: KEY_NAME + value: "genesis" + - name: DAPI_API__POC_CALLBACK_URL + value: "http://api:9100" + - name: DAPI_API__PUBLIC_URL + valueFrom: + configMapKeyRef: + name: config + key: DAPI_API_PUBLIC_URL + - name: DAPI_CHAIN_NODE__IS_GENESIS + value: "true" + - name: DAPI_CHAIN_NODE__URL + value: "http://node:26657" # Assumes 'node' service DNS resolves + - name: NODE_CONFIG_PATH + value: "/root/node_config.json" + - name: DAPI_API__PUBLIC_SERVER_PORT + value: "9000" + - name: DAPI_API__ML_SERVER_PORT + value: "9100" + - name: DAPI_API__ADMIN_SERVER_PORT + value: "9200" + volumeMounts: + - name: data + mountPath: /root/.inference + - name: node-config + mountPath: /root/node_config.json + subPath: node_config.json + volumes: + - name: data + hostPath: + path: /srv/dai/inference + type: DirectoryOrCreate + - name: node-config + configMap: + name: node-config-cm diff --git a/test/genesis-k3s/api-service.yaml b/test/genesis-k3s/api-service.yaml new file mode 100644 index 0000000..b39bd0d --- /dev/null +++ b/test/genesis-k3s/api-service.yaml @@ -0,0 +1,23 @@ +apiVersion: v1 +kind: Service +metadata: + name: api + labels: + app: api +spec: + type: NodePort + selector: + app: api + ports: + - name: public + port: 9000 + targetPort: 9000 + nodePort: 30002 # Mapped from 8000 in Docker Compose + - name: ml-server + port: 9100 + targetPort: 9100 + nodePort: 30003 # Mapped from 9100 in Docker Compose + - name: admin + port: 9200 + targetPort: 9200 + nodePort: 30004 # Mapped from 9200 in Docker Compose diff --git a/test/genesis-k3s/config.yaml b/test/genesis-k3s/config.yaml new file mode 100644 index 0000000..9a93e42 --- /dev/null +++ b/test/genesis-k3s/config.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: config +data: + DAPI_API_PUBLIC_URL: "http://34.9.136.116:30002" + P2P_EXTERNAL_ADDRESS: "tcp://34.9.136.116:30001" \ No newline at end of file diff --git a/test/genesis-k3s/genesis-overrides-configmap.yaml 
b/test/genesis-k3s/genesis-overrides-configmap.yaml new file mode 100644 index 0000000..e2ad629 --- /dev/null +++ b/test/genesis-k3s/genesis-overrides-configmap.yaml @@ -0,0 +1,52 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: genesis-overrides-cm +data: + genesis_overrides.json: | + { + "app_state": { + "gov": { + "params": { + "voting_period": "24h0m0s", + "min_deposit": [ + { + "denom": "nicoin", + "amount": "25_000_000" + } + ], + "expedited_voting_period": "1h0m0s", + "expedited_min_deposit": [ + { + "denom": "nicoin", + "amount": "50_000_000" + } + ], + "quorum": "0.01000000000000000" + } + }, + "inference": { + "params": { + "epochParams": { + "epoch_length": "250", + "epoch_multiplier": "1", + "epoch_shift": "200", + "poc_stage_duration": "60", + "poc_exchange_duration": "5", + "poc_validation_delay": "5", + "poc_validation_duration": "20", + "default_unit_of_compute_price": "100" + } + }, + "genesis_only_params": { + "total_supply": "1000", + "originator_supply": "160", + "top_reward_amount": "120", + "standard_reward_amount": "600", + "pre_programmed_sale_amount": "120", + "top_rewards": 3, + "supply_denom": "mcicoin" + } + } + } + } diff --git a/test/genesis-k3s/inference-deployment.yaml b/test/genesis-k3s/inference-deployment.yaml new file mode 100644 index 0000000..aedfac7 --- /dev/null +++ b/test/genesis-k3s/inference-deployment.yaml @@ -0,0 +1,46 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: inference + labels: + app: inference +spec: + replicas: 1 + selector: + matchLabels: + app: inference + template: + metadata: + labels: + app: inference + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-1 + hostIPC: true + containers: + - name: inference + image: ghcr.io/product-science/mlnode:3.0.4-alpha2 + command: + - uvicorn + - api.app:app + - --host=0.0.0.0 + - --port=8080 + ports: + - containerPort: 8080 + name: inference-api + env: + - name: HF_HOME + value: "/root/.cache" + - name: VLLM_ATTENTION_BACKEND + value: 
"FLASHINFER" + resources: + limits: + nvidia.com/gpu: "1" + volumeMounts: + - name: cache + mountPath: /root/.cache + volumes: + - name: cache + hostPath: + path: /mnt/shared + type: DirectoryOrCreate \ No newline at end of file diff --git a/test/genesis-k3s/inference-service.yaml b/test/genesis-k3s/inference-service.yaml new file mode 100644 index 0000000..bd3d142 --- /dev/null +++ b/test/genesis-k3s/inference-service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: inference + labels: + app: inference +spec: + type: ClusterIP # Default, but explicit for clarity + selector: + app: inference + ports: + - name: inference-api + port: 8080 + targetPort: 8080 \ No newline at end of file diff --git a/test/genesis-k3s/node-config-configmap.yaml b/test/genesis-k3s/node-config-configmap.yaml new file mode 100644 index 0000000..6c79760 --- /dev/null +++ b/test/genesis-k3s/node-config-configmap.yaml @@ -0,0 +1,23 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: node-config-cm +data: + node_config.json: | + [ + { + "id": "mlnode1", + "host": "inference", + "inference_port": 5000, + "poc_port": 8080, + "max_concurrent": 500, + "models": { + "Qwen/Qwen2.5-7B-Instruct": { + "args": [ + "--quantization", + "fp8" + ] + } + } + } + ] \ No newline at end of file diff --git a/test/genesis-k3s/node-service.yaml b/test/genesis-k3s/node-service.yaml new file mode 100644 index 0000000..92cecfc --- /dev/null +++ b/test/genesis-k3s/node-service.yaml @@ -0,0 +1,23 @@ +apiVersion: v1 +kind: Service +metadata: + name: node + labels: + app: node +spec: + type: NodePort + selector: + app: node + ports: + - name: rpc + port: 26656 + targetPort: 26656 + nodePort: 30000 # Mapped from 5000 in Docker Compose, NodePort range is 30000-32767 + - name: p2p + port: 26657 + targetPort: 26657 + nodePort: 30001 # Mapped from 26657 in Docker Compose + - name: tmkms + port: 26658 + targetPort: 26658 + nodePort: 30005 # For tmkms communication \ No newline at end of file diff --git 
a/test/genesis-k3s/node-statefulset.yaml b/test/genesis-k3s/node-statefulset.yaml new file mode 100644 index 0000000..4a8fa91 --- /dev/null +++ b/test/genesis-k3s/node-statefulset.yaml @@ -0,0 +1,70 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: node + labels: + app: node +spec: + serviceName: "node" + replicas: 1 + selector: + matchLabels: + app: node + template: + metadata: + labels: + app: node + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-1 + containers: + - name: node + image: ghcr.io/product-science/inferenced:0.1.2 + command: ["sh", "./init-docker-genesis.sh"] + ports: + - containerPort: 26656 + name: rpc + - containerPort: 26657 + name: p2p + - containerPort: 26658 + name: tmkms + env: + - name: KEY_NAME + value: "genesis" + - name: SNAPSHOT_INTERVAL + value: "1000" + - name: SNAPSHOT_KEEP_RECENT + value: "5" + - name: NODE_CONFIG_PATH + value: "/root/node_config.json" + - name: P2P_EXTERNAL_ADDRESS + valueFrom: + configMapKeyRef: + name: config + key: P2P_EXTERNAL_ADDRESS + - name: CONFIG_p2p__allow_duplicate_ip + value: "true" + - name: CONFIG_p2p__handshake_timeout + value: "30s" + - name: CONFIG_p2p__dial_timeout + value: "30s" + - name: INIT_TGBOT + value: "true" + - name: TGBOT_PRIVATE_KEY_PASS + value: "defaultpassword" + - name: TMKMS_PORT + value: "26658" + volumeMounts: + - name: data + mountPath: /root/.inference + - name: genesis-overrides + mountPath: /root/genesis_overrides.json + subPath: genesis_overrides.json + volumes: + - name: data + hostPath: + path: /srv/dai/inference + type: DirectoryOrCreate + - name: genesis-overrides + configMap: + name: genesis-overrides-cm diff --git a/test/genesis-k3s/tmkms-deployment.yaml b/test/genesis-k3s/tmkms-deployment.yaml new file mode 100644 index 0000000..c52584a --- /dev/null +++ b/test/genesis-k3s/tmkms-deployment.yaml @@ -0,0 +1,32 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tmkms + labels: + app: tmkms +spec: + replicas: 1 + selector: + matchLabels: + 
app: tmkms + template: + metadata: + labels: + app: tmkms + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-1 + containers: + - name: tmkms + image: ghcr.io/product-science/tmkms-softsign-with-keygen:0.1.2 + env: + - name: VALIDATOR_LISTEN_ADDRESS + value: "tcp://node:26658" # Assumes 'node' service DNS resolves to the node StatefulSet + volumeMounts: + - name: tmkms-data + mountPath: /root/.tmkms + volumes: + - name: tmkms-data + hostPath: + path: /srv/dai/tmkms_data # Path on k8s-worker-1 for tmkms data + type: DirectoryOrCreate From af3a876ab815b2f90de7fff1ef077122cd57e904 Mon Sep 17 00:00:00 2001 From: dima Date: Tue, 13 May 2025 15:32:23 -0700 Subject: [PATCH 02/48] Formatting fixes --- test/genesis-k3s/api-service.yaml | 6 +++--- test/genesis-k3s/config.yaml | 2 +- test/genesis-k3s/inference-deployment.yaml | 2 +- test/genesis-k3s/inference-service.yaml | 2 +- test/genesis-k3s/node-config-configmap.yaml | 2 +- test/genesis-k3s/node-service.yaml | 4 ++-- test/genesis-k3s/tmkms-deployment.yaml | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/test/genesis-k3s/api-service.yaml b/test/genesis-k3s/api-service.yaml index b39bd0d..afda42b 100644 --- a/test/genesis-k3s/api-service.yaml +++ b/test/genesis-k3s/api-service.yaml @@ -12,12 +12,12 @@ spec: - name: public port: 9000 targetPort: 9000 - nodePort: 30002 # Mapped from 8000 in Docker Compose + nodePort: 30002 - name: ml-server port: 9100 targetPort: 9100 - nodePort: 30003 # Mapped from 9100 in Docker Compose + nodePort: 30003 - name: admin port: 9200 targetPort: 9200 - nodePort: 30004 # Mapped from 9200 in Docker Compose + nodePort: 30004 diff --git a/test/genesis-k3s/config.yaml b/test/genesis-k3s/config.yaml index 9a93e42..00f5ec8 100644 --- a/test/genesis-k3s/config.yaml +++ b/test/genesis-k3s/config.yaml @@ -4,4 +4,4 @@ metadata: name: config data: DAPI_API_PUBLIC_URL: "http://34.9.136.116:30002" - P2P_EXTERNAL_ADDRESS: "tcp://34.9.136.116:30001" \ No newline at end of file + 
P2P_EXTERNAL_ADDRESS: "tcp://34.9.136.116:30001" diff --git a/test/genesis-k3s/inference-deployment.yaml b/test/genesis-k3s/inference-deployment.yaml index aedfac7..4ba1139 100644 --- a/test/genesis-k3s/inference-deployment.yaml +++ b/test/genesis-k3s/inference-deployment.yaml @@ -43,4 +43,4 @@ spec: - name: cache hostPath: path: /mnt/shared - type: DirectoryOrCreate \ No newline at end of file + type: DirectoryOrCreate diff --git a/test/genesis-k3s/inference-service.yaml b/test/genesis-k3s/inference-service.yaml index bd3d142..9b3db2a 100644 --- a/test/genesis-k3s/inference-service.yaml +++ b/test/genesis-k3s/inference-service.yaml @@ -11,4 +11,4 @@ spec: ports: - name: inference-api port: 8080 - targetPort: 8080 \ No newline at end of file + targetPort: 8080 diff --git a/test/genesis-k3s/node-config-configmap.yaml b/test/genesis-k3s/node-config-configmap.yaml index 6c79760..0f2d0b2 100644 --- a/test/genesis-k3s/node-config-configmap.yaml +++ b/test/genesis-k3s/node-config-configmap.yaml @@ -20,4 +20,4 @@ data: } } } - ] \ No newline at end of file + ] diff --git a/test/genesis-k3s/node-service.yaml b/test/genesis-k3s/node-service.yaml index 92cecfc..ce3c0d8 100644 --- a/test/genesis-k3s/node-service.yaml +++ b/test/genesis-k3s/node-service.yaml @@ -12,11 +12,11 @@ spec: - name: rpc port: 26656 targetPort: 26656 - nodePort: 30000 # Mapped from 5000 in Docker Compose, NodePort range is 30000-32767 + nodePort: 30000 - name: p2p port: 26657 targetPort: 26657 - nodePort: 30001 # Mapped from 26657 in Docker Compose + nodePort: 30001 - name: tmkms port: 26658 targetPort: 26658 diff --git a/test/genesis-k3s/tmkms-deployment.yaml b/test/genesis-k3s/tmkms-deployment.yaml index c52584a..0bd3336 100644 --- a/test/genesis-k3s/tmkms-deployment.yaml +++ b/test/genesis-k3s/tmkms-deployment.yaml @@ -29,4 +29,4 @@ spec: - name: tmkms-data hostPath: path: /srv/dai/tmkms_data # Path on k8s-worker-1 for tmkms data - type: DirectoryOrCreate + type: DirectoryOrCreate From 
a3b76d053c11990048a013e3467ae9314b358f9d Mon Sep 17 00:00:00 2001 From: dima Date: Tue, 13 May 2025 16:11:13 -0700 Subject: [PATCH 03/48] k3s instructions for genesis node --- test/genesis-k3s/README.md | 90 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 test/genesis-k3s/README.md diff --git a/test/genesis-k3s/README.md b/test/genesis-k3s/README.md new file mode 100644 index 0000000..21937db --- /dev/null +++ b/test/genesis-k3s/README.md @@ -0,0 +1,90 @@ +# Genesis Node K3s Deployment + +This directory contains Kubernetes manifests for deploying the Genesis Node on a k3s cluster. + +## Prerequisites + +- A running k3s cluster with at least one worker node that has GPU support +- `kubectl` configured to access your cluster +- `stern` (optional, for improved log viewing) + +## Deployment + +1. **Configure kubectl** either by: + - Copying `/etc/rancher/k3s/k3s.yaml` from control-plane to `~/.kube/config` locally, or + - Setting up an SSH tunnel (see Appendix) + +2. **Deploy the Genesis Node**: + ```bash + kubectl apply -f . + ``` + +3. **Verify deployment**: + ```bash + kubectl get pods + ``` + Wait until all pods (`node-0`, `api-*`, `tmkms-*`, `inference-*`) show `Running` status. 
+ +## Managing the Deployment + +### View Logs + +**Using kubectl** (individual components): +```bash +kubectl logs -f node-0 # Node logs +kubectl logs -f $(kubectl get pod -l app=api -o name) # API logs +kubectl logs -f $(kubectl get pod -l app=tmkms -o name) # TMKMS logs +kubectl logs -f $(kubectl get pod -l app=inference -o name) # Inference logs +``` + +**Using stern** (all components): +```bash +stern 'node|api|tmkms|inference' --exclude-container=POD +``` + +### Restart Components + +```bash +kubectl rollout restart statefulset/node # Restart node +kubectl rollout restart deployment/api # Restart API +kubectl rollout restart deployment/tmkms # Restart TMKMS +kubectl rollout restart deployment/inference # Restart inference +``` + +### Update Configuration + +1. Edit the ConfigMap: + ```bash + kubectl edit configmap config + ``` + +2. Restart affected components: + ```bash + kubectl rollout restart statefulset/node deployment/api + ``` + +### Delete/Stop Everything + +```bash +kubectl delete -f . 
+``` + +## Appendix: SSH Tunnel Setup + +If accessing the cluster remotely, set up an SSH tunnel: + +```bash +# Start tunnel +gcloud compute ssh k8s-control-plane \ + --project=YOUR_GCP_PROJECT_ID \ + --zone=YOUR_GCE_INSTANCE_ZONE \ + -- -L 6443:127.0.0.1:6443 -N -f + +# Check tunnel status +pgrep -f 'ssh.*-L 6443:127.0.0.1:6443' > /dev/null && echo "Tunnel ACTIVE" || echo "Tunnel NOT ACTIVE" + +# Kill tunnel +pkill -f 'ssh.*-L 6443:127.0.0.1:6443' +``` + +Update your kubeconfig's server field to: `https://127.0.0.1:6443` \ No newline at end of file From 80cf802b14afe5ffd4362a88e8404b4d462a10f8 Mon Sep 17 00:00:00 2001 From: dima Date: Tue, 13 May 2025 16:22:34 -0700 Subject: [PATCH 04/48] Use secret to pull --- test/genesis-k3s/README.md | 2 +- test/genesis-k3s/api-deployment.yaml | 2 ++ test/genesis-k3s/inference-deployment.yaml | 2 ++ test/genesis-k3s/node-statefulset.yaml | 2 ++ test/genesis-k3s/tmkms-deployment.yaml | 2 ++ 5 files changed, 9 insertions(+), 1 deletion(-) diff --git a/test/genesis-k3s/README.md b/test/genesis-k3s/README.md index 21937db..7991e6d 100644 --- a/test/genesis-k3s/README.md +++ b/test/genesis-k3s/README.md @@ -87,4 +87,4 @@ pgrep -f 'ssh.*-L 6443:127.0.0.1:6443' > /dev/null && echo "Tunnel ACTIVE" || ec pkill -f 'ssh.*-L 6443:127.0.0.1:6443' ``` -Update your kubeconfig's server field to: `https://127.0.0.1:6443` \ No newline at end of file +Update your kubeconfig's server field to: `https://127.0.0.1:6443` diff --git a/test/genesis-k3s/api-deployment.yaml b/test/genesis-k3s/api-deployment.yaml index facea66..b0e44ab 100644 --- a/test/genesis-k3s/api-deployment.yaml +++ b/test/genesis-k3s/api-deployment.yaml @@ -16,6 +16,8 @@ spec: spec: nodeSelector: kubernetes.io/hostname: k8s-worker-1 + imagePullSecrets: + - name: ghcr-credentials containers: - name: api image: ghcr.io/product-science/api:0.1.2 diff --git a/test/genesis-k3s/inference-deployment.yaml b/test/genesis-k3s/inference-deployment.yaml index 4ba1139..acc6093 100644 --- 
a/test/genesis-k3s/inference-deployment.yaml +++ b/test/genesis-k3s/inference-deployment.yaml @@ -17,6 +17,8 @@ spec: nodeSelector: kubernetes.io/hostname: k8s-worker-1 hostIPC: true + imagePullSecrets: + - name: ghcr-credentials containers: - name: inference image: ghcr.io/product-science/mlnode:3.0.4-alpha2 diff --git a/test/genesis-k3s/node-statefulset.yaml b/test/genesis-k3s/node-statefulset.yaml index 4a8fa91..42a8d89 100644 --- a/test/genesis-k3s/node-statefulset.yaml +++ b/test/genesis-k3s/node-statefulset.yaml @@ -17,6 +17,8 @@ spec: spec: nodeSelector: kubernetes.io/hostname: k8s-worker-1 + imagePullSecrets: + - name: ghcr-credentials containers: - name: node image: ghcr.io/product-science/inferenced:0.1.2 diff --git a/test/genesis-k3s/tmkms-deployment.yaml b/test/genesis-k3s/tmkms-deployment.yaml index 0bd3336..737dc4c 100644 --- a/test/genesis-k3s/tmkms-deployment.yaml +++ b/test/genesis-k3s/tmkms-deployment.yaml @@ -16,6 +16,8 @@ spec: spec: nodeSelector: kubernetes.io/hostname: k8s-worker-1 + imagePullSecrets: + - name: ghcr-credentials containers: - name: tmkms image: ghcr.io/product-science/tmkms-softsign-with-keygen:0.1.2 From 37e01bdacde58264cc31dd2b40b2c61a76db17c4 Mon Sep 17 00:00:00 2001 From: dima Date: Tue, 13 May 2025 16:23:46 -0700 Subject: [PATCH 05/48] Add GitHub auth instruction --- test/genesis-k3s/README.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/test/genesis-k3s/README.md b/test/genesis-k3s/README.md index 7991e6d..75e3f97 100644 --- a/test/genesis-k3s/README.md +++ b/test/genesis-k3s/README.md @@ -14,12 +14,21 @@ This directory contains Kubernetes manifests for deploying the Genesis Node on a - Copying `/etc/rancher/k3s/k3s.yaml` from control-plane to `~/.kube/config` locally, or - Setting up an SSH tunnel (see Appendix) -2. **Deploy the Genesis Node**: +2. 
**Set up GitHub Container Registry authentication**: + ```bash + kubectl create secret docker-registry ghcr-credentials \ + --docker-server=ghcr.io \ + --docker-username=YOUR_GITHUB_USERNAME \ + --docker-password=YOUR_GITHUB_TOKEN + ``` + Replace `YOUR_GITHUB_USERNAME` with your GitHub username and `YOUR_GITHUB_TOKEN` with a Personal Access Token that has `read:packages` permission. + +3. **Deploy the Genesis Node**: ```bash kubectl apply -f . ``` -3. **Verify deployment**: +4. **Verify deployment**: ```bash kubectl get pods ``` From 28500e769c2b820bb6de9748a72008e52b67f909 Mon Sep 17 00:00:00 2001 From: dima Date: Tue, 13 May 2025 16:31:21 -0700 Subject: [PATCH 06/48] Try to fix tmkms-deployment.yaml --- test/genesis-k3s/tmkms-deployment.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/genesis-k3s/tmkms-deployment.yaml b/test/genesis-k3s/tmkms-deployment.yaml index 737dc4c..9b7ce99 100644 --- a/test/genesis-k3s/tmkms-deployment.yaml +++ b/test/genesis-k3s/tmkms-deployment.yaml @@ -18,6 +18,13 @@ spec: kubernetes.io/hostname: k8s-worker-1 imagePullSecrets: - name: ghcr-credentials + initContainers: + - name: init-permissions-tmkms + image: busybox + command: ['sh', '-c', 'chmod -R 777 /data_for_tmkms || true'] + volumeMounts: + - name: tmkms-data + mountPath: /data_for_tmkms containers: - name: tmkms image: ghcr.io/product-science/tmkms-softsign-with-keygen:0.1.2 From d1f182f186d1b09afbd42bf2f936c6810c387397 Mon Sep 17 00:00:00 2001 From: dima Date: Tue, 13 May 2025 16:44:51 -0700 Subject: [PATCH 07/48] More kuber changes --- test/genesis-k3s/README.md | 61 ++++++++++++++++++--- test/genesis-k3s/clear-state-job.yaml | 75 ++++++++++++++++++++++++++ test/genesis-k3s/tmkms-deployment.yaml | 2 +- 3 files changed, 131 insertions(+), 7 deletions(-) create mode 100644 test/genesis-k3s/clear-state-job.yaml diff --git a/test/genesis-k3s/README.md b/test/genesis-k3s/README.md index 75e3f97..95f8e6d 100644 --- a/test/genesis-k3s/README.md +++ 
b/test/genesis-k3s/README.md @@ -4,14 +4,15 @@ This directory contains Kubernetes manifests for deploying the Genesis Node on a ## Prerequisites -- A running k3s cluster with at least one worker node that has GPU support +- A running k3s cluster with at least one worker node that has GPU support (`k8s-worker-1`) - `kubectl` configured to access your cluster +- SSH access from your management machine (or `k8s-control-plane`) to `k8s-worker-1` for state cleaning. - `stern` (optional, for improved log viewing) ## Deployment 1. **Configure kubectl** either by: - - Copying `/etc/rancher/k3s/k3s.yaml` from control-plane to `~/.kube/config` locally, or + - Copying `/etc/rancher/k3s/k3s.yaml` from `k8s-control-plane` to `~/.kube/config` locally, or - Setting up an SSH tunnel (see Appendix) 2. **Set up GitHub Container Registry authentication**: @@ -21,7 +22,7 @@ This directory contains Kubernetes manifests for deploying the Genesis Node on a --docker-username=YOUR_GITHUB_USERNAME \ --docker-password=YOUR_GITHUB_TOKEN ``` - Replace `YOUR_GITHUB_USERNAME` with your GitHub username and `YOUR_GITHUB_TOKEN` with a Personal Access Token that has `read:packages` permission. + Replace `YOUR_GITHUB_USERNAME` with your GitHub username and `YOUR_GITHUB_TOKEN` with a Personal Access Token that has `read:packages` permission. (If the secret already exists, this command will fail, which is fine.) 3. **Deploy the Genesis Node**: ```bash @@ -30,7 +31,7 @@ This directory contains Kubernetes manifests for deploying the Genesis Node on a 4. **Verify deployment**: ```bash - kubectl get pods + kubectl get pods -w ``` Wait until all pods (`node-0`, `api-*`, `tmkms-*`, `inference-*`) show `Running` status. 
@@ -72,15 +73,63 @@ kubectl rollout restart deployment/inference # Restart inference kubectl rollout restart statefulset/node deployment/api ``` -### Delete/Stop Everything +### Stop Deployment (Delete Kubernetes Resources) + +This stops the application but leaves data on the `hostPath` volumes intact. ```bash kubectl delete -f . ``` +### Clean Restart (Delete Kubernetes Resources and Clear State) + +This performs a full reset, deleting Kubernetes resources and clearing persisted data from `hostPath` volumes on `k8s-worker-1`, `k8s-worker-2`, and `k8s-worker-3`. + +**1. Delete Existing Kubernetes Application Resources:** + Run this from where your `kubectl` is configured (e.g., your local machine or `k8s-control-plane`): + ```bash + kubectl delete -f . --ignore-not-found=true # Deletes app resources, ignores if clear-state-job.yaml is not found or vice-versa + kubectl delete job clear-worker-state-job --ignore-not-found=true # Ensure previous job is cleaned up + ``` + Wait for all resources to be terminated. + +**2. Clear HostPath Volume Data using a Kubernetes Job:** + Apply the `clear-state-job.yaml` manifest. This job will run pods on `k8s-worker-1`, `k8s-worker-2`, and `k8s-worker-3` to delete the contents of the specified host directories. + ```bash + kubectl apply -f clear-state-job.yaml + ``` + +**3. Monitor the State Clearing Job:** + Check the status of the job: + ```bash + kubectl get job clear-worker-state-job -w + ``` + Wait for the job to show `COMPLETIONS` as `3/3`. + + View logs from the job's pods to confirm successful clearance on each node: + ```bash + kubectl logs -l app=clear-worker-state --tail=-1 # Shows all logs from all pods of the job + ``` + +**4. Delete the State Clearing Job (Important):** + Once the job is complete, delete it to avoid re-running it accidentally and to clean up the completed pods. + ```bash + kubectl delete job clear-worker-state-job + ``` + +**5. 
Re-deploy Application:** + Follow steps 2-4 from the main [Deployment](#deployment) section (create GHCR secret if needed, then `kubectl apply -f .` excluding `clear-state-job.yaml` if you re-applied everything from the directory). + + A safer re-deploy command after cleanup: + ```bash + kubectl apply -f api-deployment.yaml -f api-service.yaml -f config.yaml -f genesis-overrides-configmap.yaml -f inference-deployment.yaml -f inference-service.yaml -f node-config-configmap.yaml -f node-service.yaml -f node-statefulset.yaml -f tmkms-deployment.yaml + ``` + + *Note: The `initContainer` in `tmkms-deployment.yaml` should handle permissions for its directory. If permission issues arise for `/srv/dai/inference` (used by `node` and `api`), consider adding similar `initContainers` to their respective manifests.* + ## Appendix: SSH Tunnel Setup -If accessing the cluster remotely, set up an SSH tunnel: +If accessing the cluster remotely from your local machine, set up an SSH tunnel: ```bash # Start tunnel diff --git a/test/genesis-k3s/clear-state-job.yaml b/test/genesis-k3s/clear-state-job.yaml new file mode 100644 index 0000000..3977773 --- /dev/null +++ b/test/genesis-k3s/clear-state-job.yaml @@ -0,0 +1,75 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: clear-worker-state-job + labels: + app: clear-worker-state +spec: + parallelism: 3 + completions: 3 + backoffLimit: 2 # Number of retries before marking job as failed + template: + metadata: + labels: + app: clear-worker-state # For log selection + spec: + restartPolicy: Never # Or OnFailure + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - k8s-worker-1 + - k8s-worker-2 + - k8s-worker-3 + containers: + - name: state-clearer + image: busybox + command: + - sh + - -c + - > + echo "Running state clearance on node: $(NODE_NAME)"; + rm -rf /mnt_host/srv_dai_inference/* && \ + rm -rf 
/mnt_host/srv_dai_inference/.* 2>/dev/null && \ + rm -rf /mnt_host/srv_dai_tmkms_data/* && \ + rm -rf /mnt_host/srv_dai_tmkms_data/.* 2>/dev/null && \ + # Uncomment below to clear /mnt/shared as well + # rm -rf /mnt_host/mnt_shared/* && \ + # rm -rf /mnt_host/mnt_shared/.* 2>/dev/null && \ + echo "Successfully cleared state directories on $(NODE_NAME)"; + sleep 5 # Keep pod alive for a few seconds for log visibility + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + securityContext: + runAsUser: 0 # Run as root to have permissions for rm -rf on host paths + # If runAsUser:0 is not sufficient, uncomment below (less secure): + # privileged: true + volumeMounts: + - name: host-srv-dai-inference + mountPath: /mnt_host/srv_dai_inference + - name: host-srv-dai-tmkms-data + mountPath: /mnt_host/srv_dai_tmkms_data + # Uncomment below if clearing /mnt/shared + # - name: host-mnt-shared + # mountPath: /mnt_host/mnt_shared + volumes: + - name: host-srv-dai-inference + hostPath: + path: /srv/dai/inference + type: DirectoryOrCreate + - name: host-srv-dai-tmkms-data + hostPath: + path: /srv/dai/tmkms_data + type: DirectoryOrCreate + # Uncomment below if clearing /mnt/shared + # - name: host-mnt-shared + # hostPath: + # path: /mnt/shared + # type: DirectoryOrCreate \ No newline at end of file diff --git a/test/genesis-k3s/tmkms-deployment.yaml b/test/genesis-k3s/tmkms-deployment.yaml index 9b7ce99..c88f560 100644 --- a/test/genesis-k3s/tmkms-deployment.yaml +++ b/test/genesis-k3s/tmkms-deployment.yaml @@ -21,7 +21,7 @@ spec: initContainers: - name: init-permissions-tmkms image: busybox - command: ['sh', '-c', 'chmod -R 777 /data_for_tmkms || true'] + command: ['sh', '-c', 'chmod -R 777 /data_for_tmkms && ls -la /data_for_tmkms && echo "Permissions set and listed" || ls -la /data_for_tmkms && echo "Chmod failed, listing permissions anyway" && exit 1'] volumeMounts: - name: tmkms-data mountPath: /data_for_tmkms From 
fa213456ca1ebe59811e0a69e0cdfbb0ca017f5d Mon Sep 17 00:00:00 2001 From: dima Date: Tue, 13 May 2025 16:48:33 -0700 Subject: [PATCH 08/48] Refactor: move --- {test/genesis-k3s => k3s/genesis}/README.md | 0 {test/genesis-k3s => k3s/genesis}/api-deployment.yaml | 0 {test/genesis-k3s => k3s/genesis}/api-service.yaml | 0 {test/genesis-k3s => k3s/genesis}/config.yaml | 0 .../genesis-k3s => k3s/genesis}/genesis-overrides-configmap.yaml | 0 {test/genesis-k3s => k3s/genesis}/inference-deployment.yaml | 0 {test/genesis-k3s => k3s/genesis}/inference-service.yaml | 0 {test/genesis-k3s => k3s/genesis}/node-config-configmap.yaml | 0 {test/genesis-k3s => k3s/genesis}/node-service.yaml | 0 {test/genesis-k3s => k3s/genesis}/node-statefulset.yaml | 0 {test/genesis-k3s => k3s/genesis}/tmkms-deployment.yaml | 0 {test/genesis-k3s => k3s/jobs}/clear-state-job.yaml | 0 12 files changed, 0 insertions(+), 0 deletions(-) rename {test/genesis-k3s => k3s/genesis}/README.md (100%) rename {test/genesis-k3s => k3s/genesis}/api-deployment.yaml (100%) rename {test/genesis-k3s => k3s/genesis}/api-service.yaml (100%) rename {test/genesis-k3s => k3s/genesis}/config.yaml (100%) rename {test/genesis-k3s => k3s/genesis}/genesis-overrides-configmap.yaml (100%) rename {test/genesis-k3s => k3s/genesis}/inference-deployment.yaml (100%) rename {test/genesis-k3s => k3s/genesis}/inference-service.yaml (100%) rename {test/genesis-k3s => k3s/genesis}/node-config-configmap.yaml (100%) rename {test/genesis-k3s => k3s/genesis}/node-service.yaml (100%) rename {test/genesis-k3s => k3s/genesis}/node-statefulset.yaml (100%) rename {test/genesis-k3s => k3s/genesis}/tmkms-deployment.yaml (100%) rename {test/genesis-k3s => k3s/jobs}/clear-state-job.yaml (100%) diff --git a/test/genesis-k3s/README.md b/k3s/genesis/README.md similarity index 100% rename from test/genesis-k3s/README.md rename to k3s/genesis/README.md diff --git a/test/genesis-k3s/api-deployment.yaml b/k3s/genesis/api-deployment.yaml similarity index 100% 
rename from test/genesis-k3s/api-deployment.yaml rename to k3s/genesis/api-deployment.yaml diff --git a/test/genesis-k3s/api-service.yaml b/k3s/genesis/api-service.yaml similarity index 100% rename from test/genesis-k3s/api-service.yaml rename to k3s/genesis/api-service.yaml diff --git a/test/genesis-k3s/config.yaml b/k3s/genesis/config.yaml similarity index 100% rename from test/genesis-k3s/config.yaml rename to k3s/genesis/config.yaml diff --git a/test/genesis-k3s/genesis-overrides-configmap.yaml b/k3s/genesis/genesis-overrides-configmap.yaml similarity index 100% rename from test/genesis-k3s/genesis-overrides-configmap.yaml rename to k3s/genesis/genesis-overrides-configmap.yaml diff --git a/test/genesis-k3s/inference-deployment.yaml b/k3s/genesis/inference-deployment.yaml similarity index 100% rename from test/genesis-k3s/inference-deployment.yaml rename to k3s/genesis/inference-deployment.yaml diff --git a/test/genesis-k3s/inference-service.yaml b/k3s/genesis/inference-service.yaml similarity index 100% rename from test/genesis-k3s/inference-service.yaml rename to k3s/genesis/inference-service.yaml diff --git a/test/genesis-k3s/node-config-configmap.yaml b/k3s/genesis/node-config-configmap.yaml similarity index 100% rename from test/genesis-k3s/node-config-configmap.yaml rename to k3s/genesis/node-config-configmap.yaml diff --git a/test/genesis-k3s/node-service.yaml b/k3s/genesis/node-service.yaml similarity index 100% rename from test/genesis-k3s/node-service.yaml rename to k3s/genesis/node-service.yaml diff --git a/test/genesis-k3s/node-statefulset.yaml b/k3s/genesis/node-statefulset.yaml similarity index 100% rename from test/genesis-k3s/node-statefulset.yaml rename to k3s/genesis/node-statefulset.yaml diff --git a/test/genesis-k3s/tmkms-deployment.yaml b/k3s/genesis/tmkms-deployment.yaml similarity index 100% rename from test/genesis-k3s/tmkms-deployment.yaml rename to k3s/genesis/tmkms-deployment.yaml diff --git a/test/genesis-k3s/clear-state-job.yaml 
b/k3s/jobs/clear-state-job.yaml similarity index 100% rename from test/genesis-k3s/clear-state-job.yaml rename to k3s/jobs/clear-state-job.yaml From 3baff3af35d8b3d3ae57731fb8a38540ef04b6c7 Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 00:06:28 -0700 Subject: [PATCH 09/48] Add entrypoint to inference-deployment.yaml --- k3s/genesis/inference-deployment.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/k3s/genesis/inference-deployment.yaml b/k3s/genesis/inference-deployment.yaml index acc6093..d9df0bf 100644 --- a/k3s/genesis/inference-deployment.yaml +++ b/k3s/genesis/inference-deployment.yaml @@ -23,6 +23,7 @@ spec: - name: inference image: ghcr.io/product-science/mlnode:3.0.4-alpha2 command: + - /app/entrypoint.sh - uvicorn - api.app:app - --host=0.0.0.0 From 60c2cb7ccf3261630caa768e492b459252816275 Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 00:14:21 -0700 Subject: [PATCH 10/48] Tweak again --- k3s/genesis/tmkms-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k3s/genesis/tmkms-deployment.yaml b/k3s/genesis/tmkms-deployment.yaml index c88f560..ccbfac5 100644 --- a/k3s/genesis/tmkms-deployment.yaml +++ b/k3s/genesis/tmkms-deployment.yaml @@ -21,7 +21,7 @@ spec: initContainers: - name: init-permissions-tmkms image: busybox - command: ['sh', '-c', 'chmod -R 777 /data_for_tmkms && ls -la /data_for_tmkms && echo "Permissions set and listed" || ls -la /data_for_tmkms && echo "Chmod failed, listing permissions anyway" && exit 1'] + command: ['sh', '-c', 'set -e; chmod -R 777 /data_for_tmkms; echo "Attempted chmod. 
Current permissions for /data_for_tmkms:"; ls -la /data_for_tmkms; echo "Listing contents of /data_for_tmkms:"; ls -la /data_for_tmkms/* /data_for_tmkms/.* 2>/dev/null || true; echo "Init container completed."'] volumeMounts: - name: tmkms-data mountPath: /data_for_tmkms From 44a1edf0e1125b26e2e7dc32ba5d13983c6cbdad Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 00:17:33 -0700 Subject: [PATCH 11/48] Add command to tmkms --- k3s/genesis/tmkms-deployment.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/k3s/genesis/tmkms-deployment.yaml b/k3s/genesis/tmkms-deployment.yaml index ccbfac5..791235f 100644 --- a/k3s/genesis/tmkms-deployment.yaml +++ b/k3s/genesis/tmkms-deployment.yaml @@ -28,6 +28,8 @@ spec: containers: - name: tmkms image: ghcr.io/product-science/tmkms-softsign-with-keygen:0.1.2 + command: + - /root/init.sh env: - name: VALIDATOR_LISTEN_ADDRESS value: "tcp://node:26658" # Assumes 'node' service DNS resolves to the node StatefulSet From cffa4a637cd045fec483d1fafefb93c78b96843c Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 00:43:04 -0700 Subject: [PATCH 12/48] tmkms changes --- k3s/genesis/tmkms-deployment.yaml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/k3s/genesis/tmkms-deployment.yaml b/k3s/genesis/tmkms-deployment.yaml index 791235f..e999c7d 100644 --- a/k3s/genesis/tmkms-deployment.yaml +++ b/k3s/genesis/tmkms-deployment.yaml @@ -18,18 +18,11 @@ spec: kubernetes.io/hostname: k8s-worker-1 imagePullSecrets: - name: ghcr-credentials - initContainers: - - name: init-permissions-tmkms - image: busybox - command: ['sh', '-c', 'set -e; chmod -R 777 /data_for_tmkms; echo "Attempted chmod. 
Current permissions for /data_for_tmkms:"; ls -la /data_for_tmkms; echo "Listing contents of /data_for_tmkms:"; ls -la /data_for_tmkms/* /data_for_tmkms/.* 2>/dev/null || true; echo "Init container completed."'] - volumeMounts: - - name: tmkms-data - mountPath: /data_for_tmkms containers: - name: tmkms image: ghcr.io/product-science/tmkms-softsign-with-keygen:0.1.2 command: - - /root/init.sh + - /root/init.sh env: - name: VALIDATOR_LISTEN_ADDRESS value: "tcp://node:26658" # Assumes 'node' service DNS resolves to the node StatefulSet From a511db5b4806999fdc82681499a6a2836dd824a9 Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 00:47:38 -0700 Subject: [PATCH 13/48] tweak ports --- k3s/genesis/inference-deployment.yaml | 2 ++ k3s/genesis/inference-service.yaml | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/k3s/genesis/inference-deployment.yaml b/k3s/genesis/inference-deployment.yaml index d9df0bf..0409e0e 100644 --- a/k3s/genesis/inference-deployment.yaml +++ b/k3s/genesis/inference-deployment.yaml @@ -30,6 +30,8 @@ spec: - --port=8080 ports: - containerPort: 8080 + name: poc-api + - containerPort: 5000 name: inference-api env: - name: HF_HOME diff --git a/k3s/genesis/inference-service.yaml b/k3s/genesis/inference-service.yaml index 9b3db2a..9605bd6 100644 --- a/k3s/genesis/inference-service.yaml +++ b/k3s/genesis/inference-service.yaml @@ -9,6 +9,9 @@ spec: selector: app: inference ports: - - name: inference-api + - name: poc-api port: 8080 targetPort: 8080 + - name: inference-api + port: 5000 + targetPort: 5000 From 24a6803bd07f901c08a653a3c151f7c7f6b4c006 Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 01:47:35 -0700 Subject: [PATCH 14/48] no tmkms for genesis machine! 
--- k3s/genesis/node-service.yaml | 4 -- k3s/genesis/node-statefulset.yaml | 4 -- k3s/genesis/tmkms-deployment.yaml | 36 ---------------- k3s/join/node-service.yaml | 23 ++++++++++ k3s/join/node-statefulset.yaml | 72 +++++++++++++++++++++++++++++++ k3s/join/tmkms-deployment.yaml | 69 +++++++++++++++++++++++++++++ 6 files changed, 164 insertions(+), 44 deletions(-) delete mode 100644 k3s/genesis/tmkms-deployment.yaml create mode 100644 k3s/join/node-service.yaml create mode 100644 k3s/join/node-statefulset.yaml create mode 100644 k3s/join/tmkms-deployment.yaml diff --git a/k3s/genesis/node-service.yaml b/k3s/genesis/node-service.yaml index ce3c0d8..3a0ffa9 100644 --- a/k3s/genesis/node-service.yaml +++ b/k3s/genesis/node-service.yaml @@ -17,7 +17,3 @@ spec: port: 26657 targetPort: 26657 nodePort: 30001 - - name: tmkms - port: 26658 - targetPort: 26658 - nodePort: 30005 # For tmkms communication \ No newline at end of file diff --git a/k3s/genesis/node-statefulset.yaml b/k3s/genesis/node-statefulset.yaml index 42a8d89..a59967f 100644 --- a/k3s/genesis/node-statefulset.yaml +++ b/k3s/genesis/node-statefulset.yaml @@ -28,8 +28,6 @@ spec: name: rpc - containerPort: 26657 name: p2p - - containerPort: 26658 - name: tmkms env: - name: KEY_NAME value: "genesis" @@ -54,8 +52,6 @@ spec: value: "true" - name: TGBOT_PRIVATE_KEY_PASS value: "defaultpassword" - - name: TMKMS_PORT - value: "26658" volumeMounts: - name: data mountPath: /root/.inference diff --git a/k3s/genesis/tmkms-deployment.yaml b/k3s/genesis/tmkms-deployment.yaml deleted file mode 100644 index e999c7d..0000000 --- a/k3s/genesis/tmkms-deployment.yaml +++ /dev/null @@ -1,36 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: tmkms - labels: - app: tmkms -spec: - replicas: 1 - selector: - matchLabels: - app: tmkms - template: - metadata: - labels: - app: tmkms - spec: - nodeSelector: - kubernetes.io/hostname: k8s-worker-1 - imagePullSecrets: - - name: ghcr-credentials - containers: - - name: tmkms - 
image: ghcr.io/product-science/tmkms-softsign-with-keygen:0.1.2 - command: - - /root/init.sh - env: - - name: VALIDATOR_LISTEN_ADDRESS - value: "tcp://node:26658" # Assumes 'node' service DNS resolves to the node StatefulSet - volumeMounts: - - name: tmkms-data - mountPath: /root/.tmkms - volumes: - - name: tmkms-data - hostPath: - path: /srv/dai/tmkms_data # Path on k8s-worker-1 for tmkms data - type: DirectoryOrCreate diff --git a/k3s/join/node-service.yaml b/k3s/join/node-service.yaml new file mode 100644 index 0000000..ce3c0d8 --- /dev/null +++ b/k3s/join/node-service.yaml @@ -0,0 +1,23 @@ +apiVersion: v1 +kind: Service +metadata: + name: node + labels: + app: node +spec: + type: NodePort + selector: + app: node + ports: + - name: rpc + port: 26656 + targetPort: 26656 + nodePort: 30000 + - name: p2p + port: 26657 + targetPort: 26657 + nodePort: 30001 + - name: tmkms + port: 26658 + targetPort: 26658 + nodePort: 30005 # For tmkms communication \ No newline at end of file diff --git a/k3s/join/node-statefulset.yaml b/k3s/join/node-statefulset.yaml new file mode 100644 index 0000000..42a8d89 --- /dev/null +++ b/k3s/join/node-statefulset.yaml @@ -0,0 +1,72 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: node + labels: + app: node +spec: + serviceName: "node" + replicas: 1 + selector: + matchLabels: + app: node + template: + metadata: + labels: + app: node + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-1 + imagePullSecrets: + - name: ghcr-credentials + containers: + - name: node + image: ghcr.io/product-science/inferenced:0.1.2 + command: ["sh", "./init-docker-genesis.sh"] + ports: + - containerPort: 26656 + name: rpc + - containerPort: 26657 + name: p2p + - containerPort: 26658 + name: tmkms + env: + - name: KEY_NAME + value: "genesis" + - name: SNAPSHOT_INTERVAL + value: "1000" + - name: SNAPSHOT_KEEP_RECENT + value: "5" + - name: NODE_CONFIG_PATH + value: "/root/node_config.json" + - name: P2P_EXTERNAL_ADDRESS + valueFrom: + 
configMapKeyRef: + name: config + key: P2P_EXTERNAL_ADDRESS + - name: CONFIG_p2p__allow_duplicate_ip + value: "true" + - name: CONFIG_p2p__handshake_timeout + value: "30s" + - name: CONFIG_p2p__dial_timeout + value: "30s" + - name: INIT_TGBOT + value: "true" + - name: TGBOT_PRIVATE_KEY_PASS + value: "defaultpassword" + - name: TMKMS_PORT + value: "26658" + volumeMounts: + - name: data + mountPath: /root/.inference + - name: genesis-overrides + mountPath: /root/genesis_overrides.json + subPath: genesis_overrides.json + volumes: + - name: data + hostPath: + path: /srv/dai/inference + type: DirectoryOrCreate + - name: genesis-overrides + configMap: + name: genesis-overrides-cm diff --git a/k3s/join/tmkms-deployment.yaml b/k3s/join/tmkms-deployment.yaml new file mode 100644 index 0000000..867754e --- /dev/null +++ b/k3s/join/tmkms-deployment.yaml @@ -0,0 +1,69 @@ +# 1️⃣ Storage: one-gig PVC that binds to the default StorageClass +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: tmkms-pvc +spec: + accessModes: ["ReadWriteOnce"] # one writer, many readers on same node + resources: + requests: + storage: 1Gi + # storageClassName: "" # omit → use cluster default + # Optional: let the PV survive even if someone deletes the PVC + # volumeMode: Filesystem +--- +# 2️⃣ Workload: Deployment with an idempotent init-container +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tmkms + labels: + app: tmkms +spec: + replicas: 1 + selector: + matchLabels: + app: tmkms + template: + metadata: + labels: + app: tmkms + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-1 + imagePullSecrets: + - name: ghcr-credentials + initContainers: + - name: seed-tmkms + image: ghcr.io/product-science/tmkms-softsign-with-keygen:0.1.2 + command: + - /bin/sh + - -c + - | + set -e + if [ -f /data/config.toml ]; then + echo "PVC already initialised – skipping copy" + else + echo "Initialising PVC with bundled keys" + cp -a /root/.tmkms/. 
/data/ + fi + volumeMounts: + - name: tmkms-data + mountPath: /data + + containers: + - name: tmkms + image: ghcr.io/product-science/tmkms-softsign-with-keygen:0.1.2 + command: + - /root/init.sh + env: + - name: VALIDATOR_LISTEN_ADDRESS + value: "tcp://node:26658" + volumeMounts: + - name: tmkms-data + mountPath: /root/.tmkms + + volumes: + - name: tmkms-data + persistentVolumeClaim: + claimName: tmkms-pvc From 9298e42274c7322c26ed314a5194f942de2801cb Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 12:15:52 -0700 Subject: [PATCH 15/48] enableServiceLinks: false --- k3s/genesis/api-deployment.yaml | 1 + k3s/genesis/inference-deployment.yaml | 1 + k3s/genesis/node-statefulset.yaml | 1 + k3s/join/node-statefulset.yaml | 3 ++- k3s/join/tmkms-deployment.yaml | 3 ++- 5 files changed, 7 insertions(+), 2 deletions(-) diff --git a/k3s/genesis/api-deployment.yaml b/k3s/genesis/api-deployment.yaml index b0e44ab..1c69596 100644 --- a/k3s/genesis/api-deployment.yaml +++ b/k3s/genesis/api-deployment.yaml @@ -14,6 +14,7 @@ spec: labels: app: api spec: + enableServiceLinks: false # Prevent k8s from propagating vars like {SERVICE_NAME}_PORT inside the container nodeSelector: kubernetes.io/hostname: k8s-worker-1 imagePullSecrets: diff --git a/k3s/genesis/inference-deployment.yaml b/k3s/genesis/inference-deployment.yaml index 0409e0e..1f731dc 100644 --- a/k3s/genesis/inference-deployment.yaml +++ b/k3s/genesis/inference-deployment.yaml @@ -14,6 +14,7 @@ spec: labels: app: inference spec: + enableServiceLinks: false # Prevent k8s from propagating vars like {SERVICE_NAME}_PORT inside the container nodeSelector: kubernetes.io/hostname: k8s-worker-1 hostIPC: true diff --git a/k3s/genesis/node-statefulset.yaml b/k3s/genesis/node-statefulset.yaml index a59967f..b608e64 100644 --- a/k3s/genesis/node-statefulset.yaml +++ b/k3s/genesis/node-statefulset.yaml @@ -15,6 +15,7 @@ spec: labels: app: node spec: + enableServiceLinks: false # Prevent k8s from propagating vars like 
{SERVICE_NAME}_PORT inside the container nodeSelector: kubernetes.io/hostname: k8s-worker-1 imagePullSecrets: diff --git a/k3s/join/node-statefulset.yaml b/k3s/join/node-statefulset.yaml index 42a8d89..3e456c0 100644 --- a/k3s/join/node-statefulset.yaml +++ b/k3s/join/node-statefulset.yaml @@ -15,8 +15,9 @@ spec: labels: app: node spec: + enableServiceLinks: false # Prevent k8s from propagating vars like {SERVICE_NAME}_PORT inside the container nodeSelector: - kubernetes.io/hostname: k8s-worker-1 + kubernetes.io/hostname: k8s-worker-1 # TODO: change! imagePullSecrets: - name: ghcr-credentials containers: diff --git a/k3s/join/tmkms-deployment.yaml b/k3s/join/tmkms-deployment.yaml index 867754e..ead4c68 100644 --- a/k3s/join/tmkms-deployment.yaml +++ b/k3s/join/tmkms-deployment.yaml @@ -29,8 +29,9 @@ spec: labels: app: tmkms spec: + enableServiceLinks: false # Prevent k8s from propagating vars like {SERVICE_NAME}_PORT inside the container nodeSelector: - kubernetes.io/hostname: k8s-worker-1 + kubernetes.io/hostname: k8s-worker-1 # TODO: change! 
imagePullSecrets: - name: ghcr-credentials initContainers: From e6538bfed6c4304dc06fba492cf61cc9658bc036 Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 12:38:55 -0700 Subject: [PATCH 16/48] Tweak ports a bit --- k3s/genesis/api-service.yaml | 10 +--------- k3s/genesis/config.yaml | 2 +- k3s/genesis/node-service.yaml | 8 ++++---- k3s/genesis/node-statefulset.yaml | 4 ++-- k3s/join/node-service.yaml | 8 ++++---- k3s/join/node-statefulset.yaml | 4 ++-- 6 files changed, 14 insertions(+), 22 deletions(-) diff --git a/k3s/genesis/api-service.yaml b/k3s/genesis/api-service.yaml index afda42b..66a3320 100644 --- a/k3s/genesis/api-service.yaml +++ b/k3s/genesis/api-service.yaml @@ -12,12 +12,4 @@ spec: - name: public port: 9000 targetPort: 9000 - nodePort: 30002 - - name: ml-server - port: 9100 - targetPort: 9100 - nodePort: 30003 - - name: admin - port: 9200 - targetPort: 9200 - nodePort: 30004 + nodePort: 30000 diff --git a/k3s/genesis/config.yaml b/k3s/genesis/config.yaml index 00f5ec8..dfca7df 100644 --- a/k3s/genesis/config.yaml +++ b/k3s/genesis/config.yaml @@ -3,5 +3,5 @@ kind: ConfigMap metadata: name: config data: - DAPI_API_PUBLIC_URL: "http://34.9.136.116:30002" + DAPI_API_PUBLIC_URL: "http://34.9.136.116:30000" P2P_EXTERNAL_ADDRESS: "tcp://34.9.136.116:30001" diff --git a/k3s/genesis/node-service.yaml b/k3s/genesis/node-service.yaml index 3a0ffa9..50f4ad8 100644 --- a/k3s/genesis/node-service.yaml +++ b/k3s/genesis/node-service.yaml @@ -9,11 +9,11 @@ spec: selector: app: node ports: - - name: rpc + - name: tendermint-p2p port: 26656 targetPort: 26656 - nodePort: 30000 - - name: p2p + nodePort: 30001 + - name: tendermint-rpc port: 26657 targetPort: 26657 - nodePort: 30001 + nodePort: 30002 diff --git a/k3s/genesis/node-statefulset.yaml b/k3s/genesis/node-statefulset.yaml index b608e64..4bd5eb3 100644 --- a/k3s/genesis/node-statefulset.yaml +++ b/k3s/genesis/node-statefulset.yaml @@ -26,9 +26,9 @@ spec: command: ["sh", "./init-docker-genesis.sh"] 
ports: - containerPort: 26656 - name: rpc + name: tendermint-p2p - containerPort: 26657 - name: p2p + name: tendermint-rpc env: - name: KEY_NAME value: "genesis" diff --git a/k3s/join/node-service.yaml b/k3s/join/node-service.yaml index ce3c0d8..0286809 100644 --- a/k3s/join/node-service.yaml +++ b/k3s/join/node-service.yaml @@ -9,14 +9,14 @@ spec: selector: app: node ports: - - name: rpc + - name: tendermint-p2p port: 26656 targetPort: 26656 - nodePort: 30000 - - name: p2p + nodePort: 30001 + - name: tendermint-rpc port: 26657 targetPort: 26657 - nodePort: 30001 + nodePort: 30002 - name: tmkms port: 26658 targetPort: 26658 diff --git a/k3s/join/node-statefulset.yaml b/k3s/join/node-statefulset.yaml index 3e456c0..b4b3501 100644 --- a/k3s/join/node-statefulset.yaml +++ b/k3s/join/node-statefulset.yaml @@ -26,9 +26,9 @@ spec: command: ["sh", "./init-docker-genesis.sh"] ports: - containerPort: 26656 - name: rpc + name: tendermint-p2p - containerPort: 26657 - name: p2p + name: tendermint-rpc - containerPort: 26658 name: tmkms env: From 24e9a2228a2bea834ec566a978bd2a531f6e653c Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 15:50:59 -0700 Subject: [PATCH 17/48] Tweak ports a bit again --- k3s/genesis/api-service.yaml | 8 +++++++- k3s/genesis/node-service.yaml | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/k3s/genesis/api-service.yaml b/k3s/genesis/api-service.yaml index 66a3320..6abd0e3 100644 --- a/k3s/genesis/api-service.yaml +++ b/k3s/genesis/api-service.yaml @@ -12,4 +12,10 @@ spec: - name: public port: 9000 targetPort: 9000 - nodePort: 30000 + nodePort: 30002 # Expose to the outer world + - name: ml-server + port: 9100 + targetPort: 9100 + - name: admin + port: 9200 + targetPort: 9200 diff --git a/k3s/genesis/node-service.yaml b/k3s/genesis/node-service.yaml index 50f4ad8..bff97ce 100644 --- a/k3s/genesis/node-service.yaml +++ b/k3s/genesis/node-service.yaml @@ -12,8 +12,8 @@ spec: - name: tendermint-p2p port: 26656 targetPort: 
26656 - nodePort: 30001 + nodePort: 30001 # Expose to the outer world - name: tendermint-rpc port: 26657 targetPort: 26657 - nodePort: 30002 + nodePort: 30002 # Expose to the outer world From d0cf3eaf212e142a2435557b455dc46524553025 Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 16:12:14 -0700 Subject: [PATCH 18/48] First draft of join manifests --- k3s/README.md | 12 +++ k3s/join/api-deployment.yaml | 55 ++++++++++++++ k3s/join/api-service.yaml | 21 ++++++ k3s/join/inference-data-pvc.yaml | 13 ++++ k3s/join/inference-deployment.yaml | 58 ++++++++++++++ k3s/join/inference-service.yaml | 18 +++++ .../join-node-config-configmap.template.yaml | 24 ++++++ .../join-overrides-configmap.template.yaml | 45 +++++++++++ k3s/join/node-service.yaml | 24 +++--- k3s/join/node-statefulset.yaml | 75 +++++++------------ k3s/join/tmkms-deployment.yaml | 42 ++++------- k3s/join/tmkms-pvc.yaml | 12 +++ 12 files changed, 316 insertions(+), 83 deletions(-) create mode 100644 k3s/README.md create mode 100644 k3s/join/api-deployment.yaml create mode 100644 k3s/join/api-service.yaml create mode 100644 k3s/join/inference-data-pvc.yaml create mode 100644 k3s/join/inference-deployment.yaml create mode 100644 k3s/join/inference-service.yaml create mode 100644 k3s/join/join-node-config-configmap.template.yaml create mode 100644 k3s/join/join-overrides-configmap.template.yaml create mode 100644 k3s/join/tmkms-pvc.yaml diff --git a/k3s/README.md b/k3s/README.md new file mode 100644 index 0000000..6ebeadc --- /dev/null +++ b/k3s/README.md @@ -0,0 +1,12 @@ +```bash +NAMESPACE=join-worker2 +kubectl create namespace $NAMESPACE +kubectl apply -f worker2-node-config.yaml -n $NAMESPACE +kubectl apply -f worker2-overrides.yaml -n $NAMESPACE +# Assuming nodeSelector in k3s/join/*-deployment/statefulset.yaml is set for k8s-worker-2 +kubectl apply -f k3s/join/tmkms-pvc.yaml -n $NAMESPACE +kubectl apply -f k3s/join/inference-data-pvc.yaml -n $NAMESPACE +kubectl apply -f 
k3s/join/tmkms-deployment.yaml -n $NAMESPACE +kubectl apply -f k3s/join/node-statefulset.yaml -n $NAMESPACE # and so on for all manifests +kubectl apply -f k3s/join/ -n $NAMESPACE # Or apply all at once after configmaps and PVCs +``` diff --git a/k3s/join/api-deployment.yaml b/k3s/join/api-deployment.yaml new file mode 100644 index 0000000..d4431fe --- /dev/null +++ b/k3s/join/api-deployment.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api + # namespace: + labels: + app: api +spec: + replicas: 1 + selector: + matchLabels: + app: api + template: + metadata: + labels: + app: api + spec: + enableServiceLinks: false + # imagePullSecrets: # Add if your image is in a private registry + # - name: ghcr-credentials + # nodeSelector: # Optional: Pin to a specific worker if needed + # kubernetes.io/hostname: + containers: + - name: api + image: ghcr.io/product-science/api:0.1.2 + imagePullPolicy: IfNotPresent + ports: + - containerPort: 9000 + name: public-api + - containerPort: 9100 + name: ml-api + - containerPort: 9200 + name: admin-api + envFrom: + - configMapRef: + name: join-node-overrides # General env vars + env: + - name: NODE_CONFIG_PATH # Override specifically for the API pod to point to the mounted config + value: "/etc/node-config/node-config.json" + volumeMounts: + - name: inference-data + mountPath: /root/.inference # Shared data with node + - name: node-config-volume # Mount the node-config.json from the CM + mountPath: /etc/node-config # Mount directory + readOnly: true + volumes: + - name: inference-data + persistentVolumeClaim: + claimName: inference-data-pvc + - name: node-config-volume + configMap: + name: join-node-settings # Contains node-config.json + items: + - key: node-config.json + path: node-config.json diff --git a/k3s/join/api-service.yaml b/k3s/join/api-service.yaml new file mode 100644 index 0000000..93c27f7 --- /dev/null +++ b/k3s/join/api-service.yaml @@ -0,0 +1,21 @@ +apiVersion: v1 +kind: Service 
+metadata: + name: api + # namespace: + labels: + app: api +spec: + selector: + app: api + ports: + - name: public-api + port: 9000 # Service port for public API + targetPort: public-api + - name: ml-api + port: 9100 # Service port for ML API / poc_callback + targetPort: ml-api + - name: admin-api + port: 9200 # Service port for Admin API + targetPort: admin-api + # type: ClusterIP # Default. For external access to public-api (9000), consider LoadBalancer or Ingress. diff --git a/k3s/join/inference-data-pvc.yaml b/k3s/join/inference-data-pvc.yaml new file mode 100644 index 0000000..7e170e5 --- /dev/null +++ b/k3s/join/inference-data-pvc.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: inference-data-pvc + # namespace: # Will be applied with -n flag +spec: + accessModes: + - ReadWriteOnce # Assumes node and api (if both write) can manage this with RWO. + # If concurrent writes from different nodes are needed, consider RWX (more complex). + resources: + requests: + storage: 5Gi # Adjust size as needed for .inference data + # storageClassName: # Optional: specify if you have a particular storage class diff --git a/k3s/join/inference-deployment.yaml b/k3s/join/inference-deployment.yaml new file mode 100644 index 0000000..1d507ba --- /dev/null +++ b/k3s/join/inference-deployment.yaml @@ -0,0 +1,58 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: inference + # namespace: + labels: + app: inference +spec: + replicas: 1 + selector: + matchLabels: + app: inference + template: + metadata: + labels: + app: inference + spec: + enableServiceLinks: false + nodeSelector: + # kubernetes.io/hostname: # e.g., k8s-worker-2 - UNCOMMENT AND SET THIS + hostIPC: true # Matches genesis + # imagePullSecrets: # Add if your image is in a private registry + # - name: ghcr-credentials + containers: + - name: inference + image: ghcr.io/product-science/mlnode:3.0.4-alpha2 + imagePullPolicy: IfNotPresent + command: + - "/app/entrypoint.sh" + - 
"uvicorn" + - "api.app:app" + - "--host=0.0.0.0" + - "--port=8080" + ports: + - containerPort: 8080 + name: inference-api + # Port 5000 was also in genesis inference deployment, let's include it for consistency. + # If not used by join node's inference, it can be removed. + - containerPort: 5000 + name: poc-api # Matches genesis deployment, role might be different for join node config + envFrom: + - configMapRef: + name: join-node-overrides # For HF_HOME, HF_HUB_ENABLE_HF_TRANSFER + env: + # VLLM_ATTENTION_BACKEND was in genesis, add it here if still applicable + # - name: VLLM_ATTENTION_BACKEND + # value: "FLASHINFER" + resources: + limits: + nvidia.com/gpu: "1" # Request 1 GPU + volumeMounts: + - name: cache + mountPath: /root/.cache # HF_HOME points to /mnt/shared, which is mounted here. + volumes: + - name: cache + hostPath: + path: /mnt/shared # Matches genesis; ensure this path exists on worker nodes + type: DirectoryOrCreate diff --git a/k3s/join/inference-service.yaml b/k3s/join/inference-service.yaml new file mode 100644 index 0000000..f994731 --- /dev/null +++ b/k3s/join/inference-service.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Service +metadata: + name: inference + # namespace: + labels: + app: inference +spec: + selector: + app: inference + ports: + - name: inference-api + port: 8080 # Main uvicorn port + targetPort: inference-api + - name: poc-api # Matches genesis deployment and node-config.json reference + port: 5000 + targetPort: poc-api + # type: ClusterIP # Default diff --git a/k3s/join/join-node-config-configmap.template.yaml b/k3s/join/join-node-config-configmap.template.yaml new file mode 100644 index 0000000..fa75778 --- /dev/null +++ b/k3s/join/join-node-config-configmap.template.yaml @@ -0,0 +1,24 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: join-node-settings # This name will be used to mount the config + # namespace: # Will be applied with -n flag +data: + node-config.json: | + [ + { + "id": "node1", + "host": "inference", + 
"inference_port": 5000, + "poc_port": 8080, + "max_concurrent": 500, + "models": { + "Qwen/Qwen2.5-7B-Instruct": { + "args": [ + "--quantization", + "fp8" + ] + } + } + } + ] diff --git a/k3s/join/join-overrides-configmap.template.yaml b/k3s/join/join-overrides-configmap.template.yaml new file mode 100644 index 0000000..1a33f6a --- /dev/null +++ b/k3s/join/join-overrides-configmap.template.yaml @@ -0,0 +1,45 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: join-node-overrides # This name will be used by pods + # namespace: # Will be applied with -n flag +data: + # Replace <...> placeholders with actual values for each worker + # From config.env.template + KEY_NAME: "" + API_PORT: "" # e.g., "8001" for worker 2, "8002" for worker 3 + PUBLIC_URL: "http://:" + P2P_EXTERNAL_ADDRESS: "tcp://:" # e.g., port 5000 + NODE_CONFIG: "/etc/node-config/node-config.json" # Path where node-config.json will be mounted from the other ConfigMap + HF_HOME: "/mnt/shared" # Matches genesis and docker-compose intent + + # These seem like global values, but can be overridden per worker if needed + SEED_API_URL: "http://36.189.234.237:19250" + SEED_NODE_RPC_URL: "http://172.18.114.125:26657" + SEED_NODE_P2P_URL: "tcp://36.189.234.237:19249" # Make sure genesis node p2p is reachable at this address + DAPI_API__POC_CALLBACK_URL: "http://api:9100" # Internal service communication + DAPI_CHAIN_NODE__URL: "http://node:26657" # Internal service communication + DAPI_CHAIN_NODE__P2P_URL: "http://node:26656" # Internal service communication + RPC_SERVER_URL_1: "http://172.18.114.125:26657" + RPC_SERVER_URL_2: "http://172.18.114.125:26657" + + # Variables from docker-compose for node + TRUSTED_BLOCK_PERIOD: "2000" # Default from docker-compose + CONFIG_p2p__allow_duplicate_ip: "true" + CONFIG_p2p__handshake_timeout: "30s" + CONFIG_p2p__dial_timeout: "30s" + TKMS_PORT: "26658" # Port node listens on for TMKMS + + # Additional for API service from docker-compose (if not already covered) + # 
DAPI_API__PUBLIC_URL is covered by PUBLIC_URL + # DAPI_CHAIN_NODE__SEED_API_URL is covered by SEED_API_URL + # DAPI_CHAIN_NODE__URL is DAPI_CHAIN_NODE__URL + # DAPI_CHAIN_NODE__P2P_URL is DAPI_CHAIN_NODE__P2P_URL + # NODE_CONFIG_PATH is covered by NODE_CONFIG + DAPI_API__PUBLIC_SERVER_PORT: "9000" # Internal port for API service, API_PORT is for host exposure + DAPI_API__ML_SERVER_PORT: "9100" + DAPI_API__ADMIN_SERVER_PORT: "9200" + + # Additional for Inference service from docker-compose + # HF_HOME is already defined + HF_HUB_ENABLE_HF_TRANSFER: "true" diff --git a/k3s/join/node-service.yaml b/k3s/join/node-service.yaml index 0286809..8591cc2 100644 --- a/k3s/join/node-service.yaml +++ b/k3s/join/node-service.yaml @@ -2,22 +2,26 @@ apiVersion: v1 kind: Service metadata: name: node + # namespace: labels: app: node spec: - type: NodePort selector: app: node ports: - - name: tendermint-p2p + - name: p2p port: 26656 - targetPort: 26656 - nodePort: 30001 - - name: tendermint-rpc + targetPort: p2p + # protocol: TCP # Default + - name: rpc port: 26657 - targetPort: 26657 - nodePort: 30002 + targetPort: rpc + # protocol: TCP # Default - name: tmkms - port: 26658 - targetPort: 26658 - nodePort: 30005 # For tmkms communication \ No newline at end of file + port: 26658 # Port that TMKMS connects to on the node pod + targetPort: tmkms + # protocol: TCP # Default + # type: ClusterIP # Default, suitable for internal communication (e.g., TMKMS to node) + # If P2P_EXTERNAL_ADDRESS or RPC needs to be externally exposed via LoadBalancer or NodePort directly from this service, + # you might change the type and define specific external ports. + # However, ingress or specific LoadBalancer services are often preferred for external access. 
diff --git a/k3s/join/node-statefulset.yaml b/k3s/join/node-statefulset.yaml index b4b3501..bcfed9a 100644 --- a/k3s/join/node-statefulset.yaml +++ b/k3s/join/node-statefulset.yaml @@ -2,11 +2,12 @@ apiVersion: apps/v1 kind: StatefulSet metadata: name: node + # namespace: labels: app: node spec: serviceName: "node" - replicas: 1 + replicas: 1 # Typically 1 for a specific join node identity selector: matchLabels: app: node @@ -15,59 +16,41 @@ spec: labels: app: node spec: - enableServiceLinks: false # Prevent k8s from propagating vars like {SERVICE_NAME}_PORT inside the container + enableServiceLinks: false nodeSelector: - kubernetes.io/hostname: k8s-worker-1 # TODO: change! - imagePullSecrets: - - name: ghcr-credentials + # kubernetes.io/hostname: # e.g., k8s-worker-2 - UNCOMMENT AND SET THIS + # imagePullSecrets: # Add if your image is in a private registry + # - name: ghcr-credentials containers: - name: node image: ghcr.io/product-science/inferenced:0.1.2 - command: ["sh", "./init-docker-genesis.sh"] + imagePullPolicy: IfNotPresent + command: + - "sh" + - "-c" + - "./init-docker.sh" # Assuming this script is in the image's WORKDIR/PATH ports: - containerPort: 26656 - name: tendermint-p2p + name: p2p - containerPort: 26657 - name: tendermint-rpc + name: rpc - containerPort: 26658 - name: tmkms - env: - - name: KEY_NAME - value: "genesis" - - name: SNAPSHOT_INTERVAL - value: "1000" - - name: SNAPSHOT_KEEP_RECENT - value: "5" - - name: NODE_CONFIG_PATH - value: "/root/node_config.json" - - name: P2P_EXTERNAL_ADDRESS - valueFrom: - configMapKeyRef: - name: config - key: P2P_EXTERNAL_ADDRESS - - name: CONFIG_p2p__allow_duplicate_ip - value: "true" - - name: CONFIG_p2p__handshake_timeout - value: "30s" - - name: CONFIG_p2p__dial_timeout - value: "30s" - - name: INIT_TGBOT - value: "true" - - name: TGBOT_PRIVATE_KEY_PASS - value: "defaultpassword" - - name: TMKMS_PORT - value: "26658" + name: tmkms # Port the node listens on for TMKMS connections + envFrom: + - 
configMapRef: + name: join-node-overrides # From join-overrides-configmap.template.yaml volumeMounts: - - name: data - mountPath: /root/.inference - - name: genesis-overrides - mountPath: /root/genesis_overrides.json - subPath: genesis_overrides.json + - name: inference-data + mountPath: /root/.inference # Shared data with API + - name: node-config-volume + mountPath: /etc/node-config # Mount dir for node-config.json volumes: - - name: data - hostPath: - path: /srv/dai/inference - type: DirectoryOrCreate - - name: genesis-overrides + - name: inference-data # This volume is defined by the PVC + persistentVolumeClaim: + claimName: inference-data-pvc + - name: node-config-volume configMap: - name: genesis-overrides-cm + name: join-node-settings # From join-node-config-configmap.template.yaml + items: + - key: node-config.json + path: node-config.json diff --git a/k3s/join/tmkms-deployment.yaml b/k3s/join/tmkms-deployment.yaml index ead4c68..02c4691 100644 --- a/k3s/join/tmkms-deployment.yaml +++ b/k3s/join/tmkms-deployment.yaml @@ -17,6 +17,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: tmkms + # namespace: # Will be applied with -n flag labels: app: tmkms spec: @@ -29,42 +30,29 @@ spec: labels: app: tmkms spec: - enableServiceLinks: false # Prevent k8s from propagating vars like {SERVICE_NAME}_PORT inside the container - nodeSelector: - kubernetes.io/hostname: k8s-worker-1 # TODO: change! - imagePullSecrets: - - name: ghcr-credentials - initContainers: - - name: seed-tmkms - image: ghcr.io/product-science/tmkms-softsign-with-keygen:0.1.2 - command: - - /bin/sh - - -c - - | - set -e - if [ -f /data/config.toml ]; then - echo "PVC already initialised – skipping copy" - else - echo "Initialising PVC with bundled keys" - cp -a /root/.tmkms/. 
/data/ - fi - volumeMounts: - - name: tmkms-data - mountPath: /data - + enableServiceLinks: false + # imagePullSecrets: # Add if your image is in a private registry + # - name: ghcr-credentials containers: - name: tmkms image: ghcr.io/product-science/tmkms-softsign-with-keygen:0.1.2 - command: - - /root/init.sh + imagePullPolicy: IfNotPresent env: - name: VALIDATOR_LISTEN_ADDRESS - value: "tcp://node:26658" + value: "tcp://node:26658" # Assumes node service is named 'node' and exposes port 26658 for tmkms + # Add other TMKMS specific environment variables if needed from a ConfigMap + # envFrom: + # - configMapRef: + # name: join-node-overrides + ports: + - containerPort: 26658 # Though tmkms connects out, its good practice to declare if it also listens on a port, even if not exposed via service + # The docker-compose doesn't explicitly expose a port for tmkms itself. volumeMounts: - name: tmkms-data mountPath: /root/.tmkms - volumes: - name: tmkms-data persistentVolumeClaim: claimName: tmkms-pvc + # nodeSelector: # Usually TMKMS runs alongside its node. Add if you want to pin it. 
+ # kubernetes.io/hostname: # e.g., k8s-worker-2 diff --git a/k3s/join/tmkms-pvc.yaml b/k3s/join/tmkms-pvc.yaml new file mode 100644 index 0000000..b39f11b --- /dev/null +++ b/k3s/join/tmkms-pvc.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: tmkms-data-pvc + # namespace: # Will be applied with -n flag +spec: + accessModes: + - ReadWriteOnce # Suitable for a single TMKMS pod + resources: + requests: + storage: 1Gi # Adjust size as needed for TMKMS data + # storageClassName: # Optional: specify if you have a particular storage class From 42069781298febf0cd1e9059055315ceec40b720 Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 16:30:48 -0700 Subject: [PATCH 19/48] Delete inference-data PVC --- k3s/join/api-deployment.yaml | 7 ++++--- k3s/join/inference-data-pvc.yaml | 13 ------------- k3s/join/node-statefulset.yaml | 7 ++++--- 3 files changed, 8 insertions(+), 19 deletions(-) delete mode 100644 k3s/join/inference-data-pvc.yaml diff --git a/k3s/join/api-deployment.yaml b/k3s/join/api-deployment.yaml index d4431fe..4a4ea85 100644 --- a/k3s/join/api-deployment.yaml +++ b/k3s/join/api-deployment.yaml @@ -44,9 +44,10 @@ spec: mountPath: /etc/node-config # Mount directory readOnly: true volumes: - - name: inference-data - persistentVolumeClaim: - claimName: inference-data-pvc + - name: inference-data # This volume is now a hostPath + hostPath: + path: /srv/dai/inference # Matches genesis setup + type: DirectoryOrCreate - name: node-config-volume configMap: name: join-node-settings # Contains node-config.json diff --git a/k3s/join/inference-data-pvc.yaml b/k3s/join/inference-data-pvc.yaml deleted file mode 100644 index 7e170e5..0000000 --- a/k3s/join/inference-data-pvc.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: inference-data-pvc - # namespace: # Will be applied with -n flag -spec: - accessModes: - - ReadWriteOnce # Assumes node and api (if both write) can manage this 
with RWO. - # If concurrent writes from different nodes are needed, consider RWX (more complex). - resources: - requests: - storage: 5Gi # Adjust size as needed for .inference data - # storageClassName: # Optional: specify if you have a particular storage class diff --git a/k3s/join/node-statefulset.yaml b/k3s/join/node-statefulset.yaml index bcfed9a..cf4be44 100644 --- a/k3s/join/node-statefulset.yaml +++ b/k3s/join/node-statefulset.yaml @@ -45,9 +45,10 @@ spec: - name: node-config-volume mountPath: /etc/node-config # Mount dir for node-config.json volumes: - - name: inference-data # This volume is defined by the PVC - persistentVolumeClaim: - claimName: inference-data-pvc + - name: inference-data # This volume is now a hostPath + hostPath: + path: /srv/dai/inference # Matches genesis setup + type: DirectoryOrCreate - name: node-config-volume configMap: name: join-node-settings # From join-node-config-configmap.template.yaml From 5fc39749c056c55ef52982886e561c3f281fd859 Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 16:47:47 -0700 Subject: [PATCH 20/48] Tweaks of join manifests --- k3s/join/inference-deployment.yaml | 18 +++--- k3s/join/inference-service.yaml | 13 ++--- .../join-overrides-configmap.template.yaml | 6 +- k3s/join/kustomization.yaml | 17 ++++++ k3s/overlays/worker2/kustomization.yaml | 57 +++++++++++++++++++ 5 files changed, 89 insertions(+), 22 deletions(-) create mode 100644 k3s/join/kustomization.yaml create mode 100644 k3s/overlays/worker2/kustomization.yaml diff --git a/k3s/join/inference-deployment.yaml b/k3s/join/inference-deployment.yaml index 1d507ba..0b737ca 100644 --- a/k3s/join/inference-deployment.yaml +++ b/k3s/join/inference-deployment.yaml @@ -33,18 +33,16 @@ spec: - "--port=8080" ports: - containerPort: 8080 - name: inference-api - # Port 5000 was also in genesis inference deployment, let's include it for consistency. - # If not used by join node's inference, it can be removed. 
+ name: poc-api - containerPort: 5000 - name: poc-api # Matches genesis deployment, role might be different for join node config - envFrom: - - configMapRef: - name: join-node-overrides # For HF_HOME, HF_HUB_ENABLE_HF_TRANSFER + name: inference-api env: - # VLLM_ATTENTION_BACKEND was in genesis, add it here if still applicable - # - name: VLLM_ATTENTION_BACKEND - # value: "FLASHINFER" + - name: HF_HOME + value: "/root/.cache" + - name: HF_HUB_ENABLE_HF_TRANSFER + value: "true" + - name: VLLM_ATTENTION_BACKEND + value: "FLASHINFER" resources: limits: nvidia.com/gpu: "1" # Request 1 GPU diff --git a/k3s/join/inference-service.yaml b/k3s/join/inference-service.yaml index f994731..9605bd6 100644 --- a/k3s/join/inference-service.yaml +++ b/k3s/join/inference-service.yaml @@ -2,17 +2,16 @@ apiVersion: v1 kind: Service metadata: name: inference - # namespace: labels: app: inference spec: + type: ClusterIP # Default, but explicit for clarity selector: app: inference ports: + - name: poc-api + port: 8080 + targetPort: 8080 - name: inference-api - port: 8080 # Main uvicorn port - targetPort: inference-api - - name: poc-api # Matches genesis deployment and node-config.json reference - port: 5000 - targetPort: poc-api - # type: ClusterIP # Default + port: 5000 + targetPort: 5000 diff --git a/k3s/join/join-overrides-configmap.template.yaml b/k3s/join/join-overrides-configmap.template.yaml index 1a33f6a..7de8e84 100644 --- a/k3s/join/join-overrides-configmap.template.yaml +++ b/k3s/join/join-overrides-configmap.template.yaml @@ -13,7 +13,7 @@ data: NODE_CONFIG: "/etc/node-config/node-config.json" # Path where node-config.json will be mounted from the other ConfigMap HF_HOME: "/mnt/shared" # Matches genesis and docker-compose intent - # These seem like global values, but can be overridden per worker if needed + # Identical for all join workers SEED_API_URL: "http://36.189.234.237:19250" SEED_NODE_RPC_URL: "http://172.18.114.125:26657" SEED_NODE_P2P_URL: 
"tcp://36.189.234.237:19249" # Make sure genesis node p2p is reachable at this address @@ -39,7 +39,3 @@ data: DAPI_API__PUBLIC_SERVER_PORT: "9000" # Internal port for API service, API_PORT is for host exposure DAPI_API__ML_SERVER_PORT: "9100" DAPI_API__ADMIN_SERVER_PORT: "9200" - - # Additional for Inference service from docker-compose - # HF_HOME is already defined - HF_HUB_ENABLE_HF_TRANSFER: "true" diff --git a/k3s/join/kustomization.yaml b/k3s/join/kustomization.yaml new file mode 100644 index 0000000..9951710 --- /dev/null +++ b/k3s/join/kustomization.yaml @@ -0,0 +1,17 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - tmkms-pvc.yaml + # inference-data-pvc.yaml was removed, using hostPath instead + - tmkms-deployment.yaml + - node-statefulset.yaml + - node-service.yaml + - api-deployment.yaml + - api-service.yaml + - inference-deployment.yaml + - inference-service.yaml + +# The actual ConfigMaps (join-node-settings and join-node-overrides) +# will be provided by the overlays, not from the base templates. +# The .template.yaml files in this directory are for user reference only. \ No newline at end of file diff --git a/k3s/overlays/worker2/kustomization.yaml b/k3s/overlays/worker2/kustomization.yaml new file mode 100644 index 0000000..3282d1d6 --- /dev/null +++ b/k3s/overlays/worker2/kustomization.yaml @@ -0,0 +1,57 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +bases: + - ../../join # Points to the k3s/join directory as the base + +# Worker-specific ConfigMaps that you will create in this directory +# by copying and filling the templates from the base. 
+resources: + - worker2-node-config.yaml # You will create this file + - worker2-overrides.yaml # You will create this file + +patchesStrategicMerge: + - |- # Patch for Node StatefulSet + apiVersion: apps/v1 + kind: StatefulSet + metadata: + name: node + spec: + template: + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-2 + + - |- # Patch for Inference Deployment + apiVersion: apps/v1 + kind: Deployment + metadata: + name: inference + spec: + template: + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-2 + +# To also pin TMKMS and API deployments to k8s-worker-2, add similar patches: +# - |- +# apiVersion: apps/v1 +# kind: Deployment +# metadata: +# name: tmkms +# spec: +# template: +# spec: +# nodeSelector: +# kubernetes.io/hostname: k8s-worker-2 +# +# - |- +# apiVersion: apps/v1 +# kind: Deployment +# metadata: +# name: api +# spec: +# template: +# spec: +# nodeSelector: +# kubernetes.io/hostname: k8s-worker-2 \ No newline at end of file From c5461739f9afaefa1c787340d6b79eeeb3bfcb1a Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 16:56:33 -0700 Subject: [PATCH 21/48] Uniform node config --- k3s/join/join-overrides-configmap.template.yaml | 3 +-- k3s/join/kustomization.yaml | 3 +-- ...p.template.yaml => node-config-configmap.template.yaml} | 7 +++---- 3 files changed, 5 insertions(+), 8 deletions(-) rename k3s/join/{join-node-config-configmap.template.yaml => node-config-configmap.template.yaml} (66%) diff --git a/k3s/join/join-overrides-configmap.template.yaml b/k3s/join/join-overrides-configmap.template.yaml index 7de8e84..30db156 100644 --- a/k3s/join/join-overrides-configmap.template.yaml +++ b/k3s/join/join-overrides-configmap.template.yaml @@ -1,8 +1,7 @@ apiVersion: v1 kind: ConfigMap metadata: - name: join-node-overrides # This name will be used by pods - # namespace: # Will be applied with -n flag + name: join-node-overrides data: # Replace <...> placeholders with actual values for each worker # From config.env.template 
diff --git a/k3s/join/kustomization.yaml b/k3s/join/kustomization.yaml index 9951710..3955b6b 100644 --- a/k3s/join/kustomization.yaml +++ b/k3s/join/kustomization.yaml @@ -3,7 +3,6 @@ kind: Kustomization resources: - tmkms-pvc.yaml - # inference-data-pvc.yaml was removed, using hostPath instead - tmkms-deployment.yaml - node-statefulset.yaml - node-service.yaml @@ -14,4 +13,4 @@ resources: # The actual ConfigMaps (join-node-settings and join-node-overrides) # will be provided by the overlays, not from the base templates. -# The .template.yaml files in this directory are for user reference only. \ No newline at end of file +# The .template.yaml files in this directory are for user reference only. diff --git a/k3s/join/join-node-config-configmap.template.yaml b/k3s/join/node-config-configmap.template.yaml similarity index 66% rename from k3s/join/join-node-config-configmap.template.yaml rename to k3s/join/node-config-configmap.template.yaml index fa75778..0f2d0b2 100644 --- a/k3s/join/join-node-config-configmap.template.yaml +++ b/k3s/join/node-config-configmap.template.yaml @@ -1,13 +1,12 @@ apiVersion: v1 kind: ConfigMap metadata: - name: join-node-settings # This name will be used to mount the config - # namespace: # Will be applied with -n flag + name: node-config-cm data: - node-config.json: | + node_config.json: | [ { - "id": "node1", + "id": "mlnode1", "host": "inference", "inference_port": 5000, "poc_port": 8080, From e00a987a8a3b3b79ef838b4847306c45a52576b1 Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 17:01:56 -0700 Subject: [PATCH 22/48] move api-service.yaml --- k3s/{join => common}/api-service.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename k3s/{join => common}/api-service.yaml (82%) diff --git a/k3s/join/api-service.yaml b/k3s/common/api-service.yaml similarity index 82% rename from k3s/join/api-service.yaml rename to k3s/common/api-service.yaml index 93c27f7..d959f32 100644 --- a/k3s/join/api-service.yaml +++ 
b/k3s/common/api-service.yaml @@ -2,7 +2,7 @@ apiVersion: v1 kind: Service metadata: name: api - # namespace: + # namespace: will be applied by kubectl -n or overlay labels: app: api spec: @@ -18,4 +18,4 @@ spec: - name: admin-api port: 9200 # Service port for Admin API targetPort: admin-api - # type: ClusterIP # Default. For external access to public-api (9000), consider LoadBalancer or Ingress. + # type: ClusterIP # Default. For external access to public-api (9000), consider LoadBalancer or Ingress. \ No newline at end of file From 2ac0c81f916fbeeb0a0671ae8af1d9be16e78f7b Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 17:06:10 -0700 Subject: [PATCH 23/48] Move identical yml files to /common --- k3s/common/api-service.yaml | 22 +++++++-------- .../inference-service.yaml | 0 k3s/{genesis => common}/node-service.yaml | 0 k3s/common/tmkms-service.yaml | 14 ++++++++++ k3s/genesis/api-service.yaml | 21 --------------- k3s/genesis/kustomization.yaml | 22 +++++++++++++++ k3s/join/inference-service.yaml | 17 ------------ k3s/join/kustomization.yaml | 7 ++--- k3s/join/node-service.yaml | 27 ------------------- k3s/join/node-statefulset.yaml | 6 ++--- 10 files changed, 54 insertions(+), 82 deletions(-) rename k3s/{genesis => common}/inference-service.yaml (100%) rename k3s/{genesis => common}/node-service.yaml (100%) create mode 100644 k3s/common/tmkms-service.yaml delete mode 100644 k3s/genesis/api-service.yaml create mode 100644 k3s/genesis/kustomization.yaml delete mode 100644 k3s/join/inference-service.yaml delete mode 100644 k3s/join/node-service.yaml diff --git a/k3s/common/api-service.yaml b/k3s/common/api-service.yaml index d959f32..6abd0e3 100644 --- a/k3s/common/api-service.yaml +++ b/k3s/common/api-service.yaml @@ -2,20 +2,20 @@ apiVersion: v1 kind: Service metadata: name: api - # namespace: will be applied by kubectl -n or overlay labels: app: api spec: + type: NodePort selector: app: api ports: - - name: public-api - port: 9000 # Service port for 
public API - targetPort: public-api - - name: ml-api - port: 9100 # Service port for ML API / poc_callback - targetPort: ml-api - - name: admin-api - port: 9200 # Service port for Admin API - targetPort: admin-api - # type: ClusterIP # Default. For external access to public-api (9000), consider LoadBalancer or Ingress. \ No newline at end of file + - name: public + port: 9000 + targetPort: 9000 + nodePort: 30002 # Expose to the outer world + - name: ml-server + port: 9100 + targetPort: 9100 + - name: admin + port: 9200 + targetPort: 9200 diff --git a/k3s/genesis/inference-service.yaml b/k3s/common/inference-service.yaml similarity index 100% rename from k3s/genesis/inference-service.yaml rename to k3s/common/inference-service.yaml diff --git a/k3s/genesis/node-service.yaml b/k3s/common/node-service.yaml similarity index 100% rename from k3s/genesis/node-service.yaml rename to k3s/common/node-service.yaml diff --git a/k3s/common/tmkms-service.yaml b/k3s/common/tmkms-service.yaml new file mode 100644 index 0000000..aef2770 --- /dev/null +++ b/k3s/common/tmkms-service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: tmkms + labels: + app: tmkms +spec: + type: ClusterIP # Internal service, not exposed externally by default + selector: + app: tmkms + ports: + - name: tmkms-port + port: 26658 + targetPort: 26658 # Assumes the tmkms container exposes port 26658 \ No newline at end of file diff --git a/k3s/genesis/api-service.yaml b/k3s/genesis/api-service.yaml deleted file mode 100644 index 6abd0e3..0000000 --- a/k3s/genesis/api-service.yaml +++ /dev/null @@ -1,21 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: api - labels: - app: api -spec: - type: NodePort - selector: - app: api - ports: - - name: public - port: 9000 - targetPort: 9000 - nodePort: 30002 # Expose to the outer world - - name: ml-server - port: 9100 - targetPort: 9100 - - name: admin - port: 9200 - targetPort: 9200 diff --git a/k3s/genesis/kustomization.yaml 
b/k3s/genesis/kustomization.yaml new file mode 100644 index 0000000..f956ed7 --- /dev/null +++ b/k3s/genesis/kustomization.yaml @@ -0,0 +1,22 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + # ConfigMaps specific to Genesis + - config.yaml + - node-config-configmap.yaml + - genesis-overrides-configmap.yaml + + # Workloads specific to Genesis + - node-statefulset.yaml + - api-deployment.yaml + - inference-deployment.yaml + + # Common Services + - ../common/api-service.yaml + - ../common/node-service.yaml # Assuming you've moved/created this in common + - ../common/inference-service.yaml # Assuming you've moved/created this in common + +# Note: If your genesis deployments/statefulsets have different labels +# than what the common services select, you might need patches here +# to align them, or ensure labels are consistent. \ No newline at end of file diff --git a/k3s/join/inference-service.yaml b/k3s/join/inference-service.yaml deleted file mode 100644 index 9605bd6..0000000 --- a/k3s/join/inference-service.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: inference - labels: - app: inference -spec: - type: ClusterIP # Default, but explicit for clarity - selector: - app: inference - ports: - - name: poc-api - port: 8080 - targetPort: 8080 - - name: inference-api - port: 5000 - targetPort: 5000 diff --git a/k3s/join/kustomization.yaml b/k3s/join/kustomization.yaml index 3955b6b..08327af 100644 --- a/k3s/join/kustomization.yaml +++ b/k3s/join/kustomization.yaml @@ -5,11 +5,12 @@ resources: - tmkms-pvc.yaml - tmkms-deployment.yaml - node-statefulset.yaml - - node-service.yaml + - ../common/node-service.yaml - api-deployment.yaml - - api-service.yaml + - ../common/api-service.yaml - inference-deployment.yaml - - inference-service.yaml + - ../common/inference-service.yaml + - ../common/tmkms-service.yaml # The actual ConfigMaps (join-node-settings and join-node-overrides) # will be provided by the 
overlays, not from the base templates. diff --git a/k3s/join/node-service.yaml b/k3s/join/node-service.yaml deleted file mode 100644 index 8591cc2..0000000 --- a/k3s/join/node-service.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: node - # namespace: - labels: - app: node -spec: - selector: - app: node - ports: - - name: p2p - port: 26656 - targetPort: p2p - # protocol: TCP # Default - - name: rpc - port: 26657 - targetPort: rpc - # protocol: TCP # Default - - name: tmkms - port: 26658 # Port that TMKMS connects to on the node pod - targetPort: tmkms - # protocol: TCP # Default - # type: ClusterIP # Default, suitable for internal communication (e.g., TMKMS to node) - # If P2P_EXTERNAL_ADDRESS or RPC needs to be externally exposed via LoadBalancer or NodePort directly from this service, - # you might change the type and define specific external ports. - # However, ingress or specific LoadBalancer services are often preferred for external access. diff --git a/k3s/join/node-statefulset.yaml b/k3s/join/node-statefulset.yaml index cf4be44..2583408 100644 --- a/k3s/join/node-statefulset.yaml +++ b/k3s/join/node-statefulset.yaml @@ -31,11 +31,11 @@ spec: - "./init-docker.sh" # Assuming this script is in the image's WORKDIR/PATH ports: - containerPort: 26656 - name: p2p + name: tendermint-p2p - containerPort: 26657 - name: rpc + name: tendermint-rpc - containerPort: 26658 - name: tmkms # Port the node listens on for TMKMS connections + name: tmkms envFrom: - configMapRef: name: join-node-overrides # From join-overrides-configmap.template.yaml From d6cb5d3bd1b965b1517ee5eee68fc8a8fab4ed0d Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 17:22:31 -0700 Subject: [PATCH 24/48] Make sure every deployment uses credentials --- k3s/join/api-deployment.yaml | 7 ++---- k3s/join/inference-deployment.yaml | 9 +++----- .../join-overrides-configmap.template.yaml | 22 +++++++------------ k3s/join/node-statefulset.yaml | 6 ++--- 
k3s/join/tmkms-deployment.yaml | 5 ++--- 5 files changed, 18 insertions(+), 31 deletions(-) diff --git a/k3s/join/api-deployment.yaml b/k3s/join/api-deployment.yaml index 4a4ea85..5d509c3 100644 --- a/k3s/join/api-deployment.yaml +++ b/k3s/join/api-deployment.yaml @@ -2,7 +2,6 @@ apiVersion: apps/v1 kind: Deployment metadata: name: api - # namespace: labels: app: api spec: @@ -16,10 +15,8 @@ spec: app: api spec: enableServiceLinks: false - # imagePullSecrets: # Add if your image is in a private registry - # - name: ghcr-credentials - # nodeSelector: # Optional: Pin to a specific worker if needed - # kubernetes.io/hostname: + imagePullSecrets: + - name: ghcr-credentials containers: - name: api image: ghcr.io/product-science/api:0.1.2 diff --git a/k3s/join/inference-deployment.yaml b/k3s/join/inference-deployment.yaml index 0b737ca..8641d3b 100644 --- a/k3s/join/inference-deployment.yaml +++ b/k3s/join/inference-deployment.yaml @@ -2,7 +2,6 @@ apiVersion: apps/v1 kind: Deployment metadata: name: inference - # namespace: labels: app: inference spec: @@ -16,11 +15,9 @@ spec: app: inference spec: enableServiceLinks: false - nodeSelector: - # kubernetes.io/hostname: # e.g., k8s-worker-2 - UNCOMMENT AND SET THIS - hostIPC: true # Matches genesis - # imagePullSecrets: # Add if your image is in a private registry - # - name: ghcr-credentials + hostIPC: true + imagePullSecrets: + - name: ghcr-credentials containers: - name: inference image: ghcr.io/product-science/mlnode:3.0.4-alpha2 diff --git a/k3s/join/join-overrides-configmap.template.yaml b/k3s/join/join-overrides-configmap.template.yaml index 30db156..d545835 100644 --- a/k3s/join/join-overrides-configmap.template.yaml +++ b/k3s/join/join-overrides-configmap.template.yaml @@ -10,17 +10,16 @@ data: PUBLIC_URL: "http://:" P2P_EXTERNAL_ADDRESS: "tcp://:" # e.g., port 5000 NODE_CONFIG: "/etc/node-config/node-config.json" # Path where node-config.json will be mounted from the other ConfigMap - HF_HOME: "/mnt/shared" # 
Matches genesis and docker-compose intent # Identical for all join workers - SEED_API_URL: "http://36.189.234.237:19250" - SEED_NODE_RPC_URL: "http://172.18.114.125:26657" - SEED_NODE_P2P_URL: "tcp://36.189.234.237:19249" # Make sure genesis node p2p is reachable at this address - DAPI_API__POC_CALLBACK_URL: "http://api:9100" # Internal service communication - DAPI_CHAIN_NODE__URL: "http://node:26657" # Internal service communication - DAPI_CHAIN_NODE__P2P_URL: "http://node:26656" # Internal service communication - RPC_SERVER_URL_1: "http://172.18.114.125:26657" - RPC_SERVER_URL_2: "http://172.18.114.125:26657" + SEED_API_URL: "http://34.9.136.116:30000" + SEED_NODE_RPC_URL: "http://34.9.136.116:30002" + SEED_NODE_P2P_URL: "tcp://34.9.136.116:30001" + DAPI_API__POC_CALLBACK_URL: "http://api:9100" + DAPI_CHAIN_NODE__URL: "http://node:26657" + DAPI_CHAIN_NODE__P2P_URL: "http://node:26656" + RPC_SERVER_URL_1: "http://34.9.136.116:30002" + RPC_SERVER_URL_2: "http://34.9.136.116:30002" # Variables from docker-compose for node TRUSTED_BLOCK_PERIOD: "2000" # Default from docker-compose @@ -30,11 +29,6 @@ data: TKMS_PORT: "26658" # Port node listens on for TMKMS # Additional for API service from docker-compose (if not already covered) - # DAPI_API__PUBLIC_URL is covered by PUBLIC_URL - # DAPI_CHAIN_NODE__SEED_API_URL is covered by SEED_API_URL - # DAPI_CHAIN_NODE__URL is DAPI_CHAIN_NODE__URL - # DAPI_CHAIN_NODE__P2P_URL is DAPI_CHAIN_NODE__P2P_URL - # NODE_CONFIG_PATH is covered by NODE_CONFIG DAPI_API__PUBLIC_SERVER_PORT: "9000" # Internal port for API service, API_PORT is for host exposure DAPI_API__ML_SERVER_PORT: "9100" DAPI_API__ADMIN_SERVER_PORT: "9200" diff --git a/k3s/join/node-statefulset.yaml b/k3s/join/node-statefulset.yaml index 2583408..b1b9ea2 100644 --- a/k3s/join/node-statefulset.yaml +++ b/k3s/join/node-statefulset.yaml @@ -18,9 +18,9 @@ spec: spec: enableServiceLinks: false nodeSelector: - # kubernetes.io/hostname: # e.g., k8s-worker-2 - UNCOMMENT AND SET 
THIS - # imagePullSecrets: # Add if your image is in a private registry - # - name: ghcr-credentials + kubernetes.io/hostname: # e.g., k8s-worker-2 - UNCOMMENT AND SET THIS + imagePullSecrets: # Add if your image is in a private registry + - name: ghcr-credentials containers: - name: node image: ghcr.io/product-science/inferenced:0.1.2 diff --git a/k3s/join/tmkms-deployment.yaml b/k3s/join/tmkms-deployment.yaml index 02c4691..7e278ac 100644 --- a/k3s/join/tmkms-deployment.yaml +++ b/k3s/join/tmkms-deployment.yaml @@ -17,7 +17,6 @@ apiVersion: apps/v1 kind: Deployment metadata: name: tmkms - # namespace: # Will be applied with -n flag labels: app: tmkms spec: @@ -31,8 +30,8 @@ spec: app: tmkms spec: enableServiceLinks: false - # imagePullSecrets: # Add if your image is in a private registry - # - name: ghcr-credentials + imagePullSecrets: + - name: ghcr-credentials containers: - name: tmkms image: ghcr.io/product-science/tmkms-softsign-with-keygen:0.1.2 From e91e74ed9fd03c558c9c0f0482214cc7ad482333 Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 17:29:47 -0700 Subject: [PATCH 25/48] Split API into two services --- k3s/common/api-external-service.yaml | 15 +++++++++++++++ k3s/common/api-internal-service.yaml | 17 +++++++++++++++++ k3s/common/api-service.yaml | 21 --------------------- k3s/genesis/kustomization.yaml | 8 +++++--- k3s/join/kustomization.yaml | 3 ++- 5 files changed, 39 insertions(+), 25 deletions(-) create mode 100644 k3s/common/api-external-service.yaml create mode 100644 k3s/common/api-internal-service.yaml delete mode 100644 k3s/common/api-service.yaml diff --git a/k3s/common/api-external-service.yaml b/k3s/common/api-external-service.yaml new file mode 100644 index 0000000..a17979e --- /dev/null +++ b/k3s/common/api-external-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: api-external # Clear name for the external-facing service + labels: + app: api +spec: + type: NodePort + selector: + app: api + ports: + 
- name: public # Matches the name from your modified common api-service + port: 9000 + targetPort: public-api # Assumes container port in api-deployment is named 'public-api' + nodePort: 30000 # Your specified NodePort for external access \ No newline at end of file diff --git a/k3s/common/api-internal-service.yaml b/k3s/common/api-internal-service.yaml new file mode 100644 index 0000000..eba62e7 --- /dev/null +++ b/k3s/common/api-internal-service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: api # Or api-internal, depending on your preferred internal DNS name + labels: + app: api +spec: + type: ClusterIP # Default, but explicit for clarity + selector: + app: api + ports: + - name: ml-server # Matches the name from your modified common api-service + port: 9100 + targetPort: ml-api # Assumes container port in api-deployment is named 'ml-api' + - name: admin # Matches the name from your modified common api-service + port: 9200 + targetPort: admin-api # Assumes container port in api-deployment is named 'admin-api' \ No newline at end of file diff --git a/k3s/common/api-service.yaml b/k3s/common/api-service.yaml deleted file mode 100644 index 6abd0e3..0000000 --- a/k3s/common/api-service.yaml +++ /dev/null @@ -1,21 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: api - labels: - app: api -spec: - type: NodePort - selector: - app: api - ports: - - name: public - port: 9000 - targetPort: 9000 - nodePort: 30002 # Expose to the outer world - - name: ml-server - port: 9100 - targetPort: 9100 - - name: admin - port: 9200 - targetPort: 9200 diff --git a/k3s/genesis/kustomization.yaml b/k3s/genesis/kustomization.yaml index f956ed7..40f07a2 100644 --- a/k3s/genesis/kustomization.yaml +++ b/k3s/genesis/kustomization.yaml @@ -13,9 +13,11 @@ resources: - inference-deployment.yaml # Common Services - - ../common/api-service.yaml - - ../common/node-service.yaml # Assuming you've moved/created this in common - - ../common/inference-service.yaml # 
Assuming you've moved/created this in common + - ../common/api-internal-service.yaml + - ../common/api-external-service.yaml + - ../common/node-service.yaml + - ../common/inference-service.yaml + - ../common/tmkms-service.yaml # Note: If your genesis deployments/statefulsets have different labels # than what the common services select, you might need patches here diff --git a/k3s/join/kustomization.yaml b/k3s/join/kustomization.yaml index 08327af..40bad4d 100644 --- a/k3s/join/kustomization.yaml +++ b/k3s/join/kustomization.yaml @@ -7,7 +7,8 @@ resources: - node-statefulset.yaml - ../common/node-service.yaml - api-deployment.yaml - - ../common/api-service.yaml + - ../common/api-internal-service.yaml + - ../common/api-external-service.yaml - inference-deployment.yaml - ../common/inference-service.yaml - ../common/tmkms-service.yaml From 3ec711764d45e8f67b96d75bb8602c1b22eb774c Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 17:37:11 -0700 Subject: [PATCH 26/48] Rename API service --- k3s/common/api-external-service.yaml | 15 --------------- k3s/common/api-internal-service.yaml | 17 ----------------- k3s/common/api-private-service.yaml | 17 +++++++++++++++++ k3s/common/api-public-service.yaml | 15 +++++++++++++++ k3s/genesis/api-deployment.yaml | 2 +- k3s/genesis/kustomization.yaml | 4 ++-- k3s/join/join-overrides-configmap.template.yaml | 2 +- k3s/join/kustomization.yaml | 8 ++++---- k3s/join/tmkms-deployment.yaml | 15 --------------- 9 files changed, 40 insertions(+), 55 deletions(-) delete mode 100644 k3s/common/api-external-service.yaml delete mode 100644 k3s/common/api-internal-service.yaml create mode 100644 k3s/common/api-private-service.yaml create mode 100644 k3s/common/api-public-service.yaml diff --git a/k3s/common/api-external-service.yaml b/k3s/common/api-external-service.yaml deleted file mode 100644 index a17979e..0000000 --- a/k3s/common/api-external-service.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - 
name: api-external # Clear name for the external-facing service - labels: - app: api -spec: - type: NodePort - selector: - app: api - ports: - - name: public # Matches the name from your modified common api-service - port: 9000 - targetPort: public-api # Assumes container port in api-deployment is named 'public-api' - nodePort: 30000 # Your specified NodePort for external access \ No newline at end of file diff --git a/k3s/common/api-internal-service.yaml b/k3s/common/api-internal-service.yaml deleted file mode 100644 index eba62e7..0000000 --- a/k3s/common/api-internal-service.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: api # Or api-internal, depending on your preferred internal DNS name - labels: - app: api -spec: - type: ClusterIP # Default, but explicit for clarity - selector: - app: api - ports: - - name: ml-server # Matches the name from your modified common api-service - port: 9100 - targetPort: ml-api # Assumes container port in api-deployment is named 'ml-api' - - name: admin # Matches the name from your modified common api-service - port: 9200 - targetPort: admin-api # Assumes container port in api-deployment is named 'admin-api' \ No newline at end of file diff --git a/k3s/common/api-private-service.yaml b/k3s/common/api-private-service.yaml new file mode 100644 index 0000000..742317d --- /dev/null +++ b/k3s/common/api-private-service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: api-private # RENAMED from api-internal + labels: + app: api +spec: + type: ClusterIP + selector: + app: api + ports: + - name: ml-server + port: 9100 + targetPort: 9100 # Assuming api-deployment.yaml containerPort is 9100 (or named ml-api which resolves to 9100) + - name: admin + port: 9200 + targetPort: 9200 # Assuming api-deployment.yaml containerPort is 9200 (or named admin-api which resolves to 9200) \ No newline at end of file diff --git a/k3s/common/api-public-service.yaml 
b/k3s/common/api-public-service.yaml new file mode 100644 index 0000000..fef2703 --- /dev/null +++ b/k3s/common/api-public-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: api-public + labels: + app: api +spec: + type: NodePort + selector: + app: api + ports: + - name: public + port: 9000 + targetPort: 9000 + nodePort: 30000 \ No newline at end of file diff --git a/k3s/genesis/api-deployment.yaml b/k3s/genesis/api-deployment.yaml index 1c69596..92827df 100644 --- a/k3s/genesis/api-deployment.yaml +++ b/k3s/genesis/api-deployment.yaml @@ -33,7 +33,7 @@ spec: - name: KEY_NAME value: "genesis" - name: DAPI_API__POC_CALLBACK_URL - value: "http://api:9100" + value: "http://api-private:9100" - name: DAPI_API__PUBLIC_URL valueFrom: configMapKeyRef: diff --git a/k3s/genesis/kustomization.yaml b/k3s/genesis/kustomization.yaml index 40f07a2..9c572b9 100644 --- a/k3s/genesis/kustomization.yaml +++ b/k3s/genesis/kustomization.yaml @@ -13,8 +13,8 @@ resources: - inference-deployment.yaml # Common Services - - ../common/api-internal-service.yaml - - ../common/api-external-service.yaml + - ../common/api-private-service.yaml + - ../common/api-public-service.yaml - ../common/node-service.yaml - ../common/inference-service.yaml - ../common/tmkms-service.yaml diff --git a/k3s/join/join-overrides-configmap.template.yaml b/k3s/join/join-overrides-configmap.template.yaml index d545835..3c53751 100644 --- a/k3s/join/join-overrides-configmap.template.yaml +++ b/k3s/join/join-overrides-configmap.template.yaml @@ -15,7 +15,7 @@ data: SEED_API_URL: "http://34.9.136.116:30000" SEED_NODE_RPC_URL: "http://34.9.136.116:30002" SEED_NODE_P2P_URL: "tcp://34.9.136.116:30001" - DAPI_API__POC_CALLBACK_URL: "http://api:9100" + DAPI_API__POC_CALLBACK_URL: "http://api-private:9100" DAPI_CHAIN_NODE__URL: "http://node:26657" DAPI_CHAIN_NODE__P2P_URL: "http://node:26656" RPC_SERVER_URL_1: "http://34.9.136.116:30002" diff --git a/k3s/join/kustomization.yaml 
b/k3s/join/kustomization.yaml index 40bad4d..2b7147d 100644 --- a/k3s/join/kustomization.yaml +++ b/k3s/join/kustomization.yaml @@ -2,15 +2,15 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - tmkms-pvc.yaml - - tmkms-deployment.yaml - node-statefulset.yaml - ../common/node-service.yaml - api-deployment.yaml - - ../common/api-internal-service.yaml - - ../common/api-external-service.yaml + - ../common/api-private-service.yaml + - ../common/api-public-service.yaml - inference-deployment.yaml - ../common/inference-service.yaml + - tmkms-deployment.yaml + - tmkms-pvc.yaml - ../common/tmkms-service.yaml # The actual ConfigMaps (join-node-settings and join-node-overrides) diff --git a/k3s/join/tmkms-deployment.yaml b/k3s/join/tmkms-deployment.yaml index 7e278ac..820e8b5 100644 --- a/k3s/join/tmkms-deployment.yaml +++ b/k3s/join/tmkms-deployment.yaml @@ -1,18 +1,3 @@ -# 1️⃣ Storage: one-gig PVC that binds to the default StorageClass -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: tmkms-pvc -spec: - accessModes: ["ReadWriteOnce"] # one writer, many readers on same node - resources: - requests: - storage: 1Gi - # storageClassName: "" # omit → use cluster default - # Optional: let the PV survive even if someone deletes the PVC - # volumeMode: Filesystem ---- -# 2️⃣ Workload: Deployment with an idempotent init-container apiVersion: apps/v1 kind: Deployment metadata: From ac9fe4c1b5937c6b8b18c0799707deec605ffebe Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 17:39:58 -0700 Subject: [PATCH 27/48] move configmap --- .../node-configs}/node-config-configmap.yaml | 2 +- k3s/genesis/kustomization.yaml | 6 ++--- k3s/join/api-deployment.yaml | 13 ++++------- k3s/join/kustomization.yaml | 3 +++ k3s/join/node-config-configmap.template.yaml | 23 ------------------- k3s/join/node-statefulset.yaml | 8 ------- 6 files changed, 11 insertions(+), 44 deletions(-) rename k3s/{genesis => common/node-configs}/node-config-configmap.yaml 
(85%) delete mode 100644 k3s/join/node-config-configmap.template.yaml diff --git a/k3s/genesis/node-config-configmap.yaml b/k3s/common/node-configs/node-config-configmap.yaml similarity index 85% rename from k3s/genesis/node-config-configmap.yaml rename to k3s/common/node-configs/node-config-configmap.yaml index 0f2d0b2..0b3d241 100644 --- a/k3s/genesis/node-config-configmap.yaml +++ b/k3s/common/node-configs/node-config-configmap.yaml @@ -1,7 +1,7 @@ apiVersion: v1 kind: ConfigMap metadata: - name: node-config-cm + name: node-config-cm # This name is referenced by genesis workloads data: node_config.json: | [ diff --git a/k3s/genesis/kustomization.yaml b/k3s/genesis/kustomization.yaml index 9c572b9..aaacc14 100644 --- a/k3s/genesis/kustomization.yaml +++ b/k3s/genesis/kustomization.yaml @@ -4,7 +4,6 @@ kind: Kustomization resources: # ConfigMaps specific to Genesis - config.yaml - - node-config-configmap.yaml - genesis-overrides-configmap.yaml # Workloads specific to Genesis @@ -19,6 +18,5 @@ resources: - ../common/inference-service.yaml - ../common/tmkms-service.yaml -# Note: If your genesis deployments/statefulsets have different labels -# than what the common services select, you might need patches here -# to align them, or ensure labels are consistent. 
\ No newline at end of file + # ML node config + - ../common/node-configs/node-config-configmap.yaml diff --git a/k3s/join/api-deployment.yaml b/k3s/join/api-deployment.yaml index 5d509c3..c86431b 100644 --- a/k3s/join/api-deployment.yaml +++ b/k3s/join/api-deployment.yaml @@ -36,18 +36,15 @@ spec: value: "/etc/node-config/node-config.json" volumeMounts: - name: inference-data - mountPath: /root/.inference # Shared data with node - - name: node-config-volume # Mount the node-config.json from the CM - mountPath: /etc/node-config # Mount directory + mountPath: /root/.inference + - name: node-config + mountPath: /etc/node-config readOnly: true volumes: - name: inference-data # This volume is now a hostPath hostPath: path: /srv/dai/inference # Matches genesis setup type: DirectoryOrCreate - - name: node-config-volume + - name: node-config configMap: - name: join-node-settings # Contains node-config.json - items: - - key: node-config.json - path: node-config.json + name: node-config-cm diff --git a/k3s/join/kustomization.yaml b/k3s/join/kustomization.yaml index 2b7147d..ace0918 100644 --- a/k3s/join/kustomization.yaml +++ b/k3s/join/kustomization.yaml @@ -13,6 +13,9 @@ resources: - tmkms-pvc.yaml - ../common/tmkms-service.yaml + # ML node config + - ../common/node-configs/node-config-configmap.yaml + # The actual ConfigMaps (join-node-settings and join-node-overrides) # will be provided by the overlays, not from the base templates. # The .template.yaml files in this directory are for user reference only. 
diff --git a/k3s/join/node-config-configmap.template.yaml b/k3s/join/node-config-configmap.template.yaml deleted file mode 100644 index 0f2d0b2..0000000 --- a/k3s/join/node-config-configmap.template.yaml +++ /dev/null @@ -1,23 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: node-config-cm -data: - node_config.json: | - [ - { - "id": "mlnode1", - "host": "inference", - "inference_port": 5000, - "poc_port": 8080, - "max_concurrent": 500, - "models": { - "Qwen/Qwen2.5-7B-Instruct": { - "args": [ - "--quantization", - "fp8" - ] - } - } - } - ] diff --git a/k3s/join/node-statefulset.yaml b/k3s/join/node-statefulset.yaml index b1b9ea2..96fd38d 100644 --- a/k3s/join/node-statefulset.yaml +++ b/k3s/join/node-statefulset.yaml @@ -42,16 +42,8 @@ spec: volumeMounts: - name: inference-data mountPath: /root/.inference # Shared data with API - - name: node-config-volume - mountPath: /etc/node-config # Mount dir for node-config.json volumes: - name: inference-data # This volume is now a hostPath hostPath: path: /srv/dai/inference # Matches genesis setup type: DirectoryOrCreate - - name: node-config-volume - configMap: - name: join-node-settings # From join-node-config-configmap.template.yaml - items: - - key: node-config.json - path: node-config.json From b89e5775e15be2ac7b6c25d7873de0d16abd4179 Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 17:55:53 -0700 Subject: [PATCH 28/48] Common base --- k3s/common/kustomization.yaml | 15 +++++++++++++++ k3s/genesis/kustomization.yaml | 18 ++++++++---------- k3s/join/kustomization.yaml | 28 +++++++++++++++------------- 3 files changed, 38 insertions(+), 23 deletions(-) create mode 100644 k3s/common/kustomization.yaml diff --git a/k3s/common/kustomization.yaml b/k3s/common/kustomization.yaml new file mode 100644 index 0000000..0818081 --- /dev/null +++ b/k3s/common/kustomization.yaml @@ -0,0 +1,15 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + # Common services + - 
api-public-service.yaml + - api-private-service.yaml + - node-service.yaml + - inference-service.yaml + - tmkms-service.yaml + + # Common ConfigMaps (example, adjust path if needed) + - node-configs/node-config-configmap.yaml # This is the one genesis uses from common + # If you have other common configmap templates or files that are NOT templates + # and should always be included, list them here. \ No newline at end of file diff --git a/k3s/genesis/kustomization.yaml b/k3s/genesis/kustomization.yaml index aaacc14..512c013 100644 --- a/k3s/genesis/kustomization.yaml +++ b/k3s/genesis/kustomization.yaml @@ -1,22 +1,20 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization +bases: + - ../common # Include all common resources from k3s/common/kustomization.yaml + resources: - # ConfigMaps specific to Genesis + # ConfigMaps specific to Genesis (that are NOT in common/kustomization.yaml) - config.yaml - genesis-overrides-configmap.yaml + # The 'node-config-configmap.yaml' for genesis is now included via the ../common base # Workloads specific to Genesis - node-statefulset.yaml - api-deployment.yaml - inference-deployment.yaml - # Common Services - - ../common/api-private-service.yaml - - ../common/api-public-service.yaml - - ../common/node-service.yaml - - ../common/inference-service.yaml - - ../common/tmkms-service.yaml - - # ML node config - - ../common/node-configs/node-config-configmap.yaml +# Note: Patches for genesis-specific overrides (e.g., different image tags +# or resource limits than a common base, if you had one for workloads) +# would go here. 
diff --git a/k3s/join/kustomization.yaml b/k3s/join/kustomization.yaml index ace0918..7edc626 100644 --- a/k3s/join/kustomization.yaml +++ b/k3s/join/kustomization.yaml @@ -1,21 +1,23 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization +bases: + - ../common # Include all common resources from k3s/common/kustomization.yaml + resources: + # Base join workloads (will be further customized by overlays) + - tmkms-pvc.yaml + - tmkms-deployment.yaml - node-statefulset.yaml - - ../common/node-service.yaml + # node-service.yaml is now in common - api-deployment.yaml - - ../common/api-private-service.yaml - - ../common/api-public-service.yaml + # api-services are now in common - inference-deployment.yaml - - ../common/inference-service.yaml - - tmkms-deployment.yaml - - tmkms-pvc.yaml - - ../common/tmkms-service.yaml - - # ML node config - - ../common/node-configs/node-config-configmap.yaml + # inference-service.yaml is now in common + # tmkms-service.yaml is now in common -# The actual ConfigMaps (join-node-settings and join-node-overrides) -# will be provided by the overlays, not from the base templates. -# The .template.yaml files in this directory are for user reference only. +# Note: The actual ConfigMaps for join nodes (join-node-settings, join-node-overrides) +# are NOT listed here. They are provided by the overlays (e.g., k3s/overlays/worker2/) +# as they are worker-specific. +# The ConfigMap templates in k3s/common/node-configs/ (like join-node-config-configmap.template.yaml) +# are for user reference to create the actual ConfigMaps in overlays. From 120ad16dbcae0ae7bfd956480b90876f7e4c962d Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 18:00:00 -0700 Subject: [PATCH 29/48] .. 
--- k3s/join/node-statefulset.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/k3s/join/node-statefulset.yaml b/k3s/join/node-statefulset.yaml index 96fd38d..b0df267 100644 --- a/k3s/join/node-statefulset.yaml +++ b/k3s/join/node-statefulset.yaml @@ -2,7 +2,6 @@ apiVersion: apps/v1 kind: StatefulSet metadata: name: node - # namespace: labels: app: node spec: From 76ded17217b9aebdf78af53af62de535d7f1a6bc Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 21:18:47 -0700 Subject: [PATCH 30/48] Genesis seems to work --- k3s/README.md | 11 ++++ k3s/common/kustomization.yaml | 8 +-- k3s/genesis/kustomization.yaml | 12 +--- ...es-configmap.template.yaml => config.yaml} | 3 +- k3s/join/kustomization.yaml | 16 +----- k3s/join/node-statefulset.yaml | 2 - k3s/join/tmkms-deployment.yaml | 8 +-- k3s/{common => join}/tmkms-service.yaml | 0 k3s/overlays/worker2/kustomization.yaml | 8 +-- k3s/overlays/worker3/kustomization.yaml | 55 +++++++++++++++++++ 10 files changed, 78 insertions(+), 45 deletions(-) rename k3s/join/{join-overrides-configmap.template.yaml => config.yaml} (94%) rename k3s/{common => join}/tmkms-service.yaml (100%) create mode 100644 k3s/overlays/worker3/kustomization.yaml diff --git a/k3s/README.md b/k3s/README.md index 6ebeadc..6a977aa 100644 --- a/k3s/README.md +++ b/k3s/README.md @@ -1,3 +1,14 @@ +Run genesis node + +```bash +kubectl apply -k k3s/genesis -n genesis +``` + +Stop genesis node +```bash +kubectl delete all --all -n genesis +``` + ```bash NAMESPACE=join-worker2 kubectl create namespace $NAMESPACE diff --git a/k3s/common/kustomization.yaml b/k3s/common/kustomization.yaml index 0818081..da37aa7 100644 --- a/k3s/common/kustomization.yaml +++ b/k3s/common/kustomization.yaml @@ -2,14 +2,10 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - # Common services - api-public-service.yaml - api-private-service.yaml - node-service.yaml - inference-service.yaml - - tmkms-service.yaml - # Common ConfigMaps (example, 
adjust path if needed) - - node-configs/node-config-configmap.yaml # This is the one genesis uses from common - # If you have other common configmap templates or files that are NOT templates - # and should always be included, list them here. \ No newline at end of file + # Common ConfigMaps + - node-configs/node-config-configmap.yaml diff --git a/k3s/genesis/kustomization.yaml b/k3s/genesis/kustomization.yaml index 512c013..5f3a868 100644 --- a/k3s/genesis/kustomization.yaml +++ b/k3s/genesis/kustomization.yaml @@ -1,20 +1,14 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization -bases: - - ../common # Include all common resources from k3s/common/kustomization.yaml - +# Changed 'bases:' to 'resources:' to include the common kustomization resources: - # ConfigMaps specific to Genesis (that are NOT in common/kustomization.yaml) + - ../common # This now correctly lists the common kustomization under resources + - config.yaml - genesis-overrides-configmap.yaml - # The 'node-config-configmap.yaml' for genesis is now included via the ../common base # Workloads specific to Genesis - node-statefulset.yaml - api-deployment.yaml - inference-deployment.yaml - -# Note: Patches for genesis-specific overrides (e.g., different image tags -# or resource limits than a common base, if you had one for workloads) -# would go here. 
diff --git a/k3s/join/join-overrides-configmap.template.yaml b/k3s/join/config.yaml similarity index 94% rename from k3s/join/join-overrides-configmap.template.yaml rename to k3s/join/config.yaml index 3c53751..50317f4 100644 --- a/k3s/join/join-overrides-configmap.template.yaml +++ b/k3s/join/config.yaml @@ -6,12 +6,11 @@ data: # Replace <...> placeholders with actual values for each worker # From config.env.template KEY_NAME: "" - API_PORT: "" # e.g., "8001" for worker 2, "8002" for worker 3 PUBLIC_URL: "http://:" P2P_EXTERNAL_ADDRESS: "tcp://:" # e.g., port 5000 - NODE_CONFIG: "/etc/node-config/node-config.json" # Path where node-config.json will be mounted from the other ConfigMap # Identical for all join workers + NODE_CONFIG: "/etc/node-config/node-config.json" # Path where node-config.json will be mounted from the other ConfigMap SEED_API_URL: "http://34.9.136.116:30000" SEED_NODE_RPC_URL: "http://34.9.136.116:30002" SEED_NODE_P2P_URL: "tcp://34.9.136.116:30001" diff --git a/k3s/join/kustomization.yaml b/k3s/join/kustomization.yaml index 7edc626..753cb41 100644 --- a/k3s/join/kustomization.yaml +++ b/k3s/join/kustomization.yaml @@ -1,23 +1,11 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization -bases: - - ../common # Include all common resources from k3s/common/kustomization.yaml - resources: - # Base join workloads (will be further customized by overlays) + - ../common + - tmkms-pvc.yaml - tmkms-deployment.yaml - node-statefulset.yaml - # node-service.yaml is now in common - api-deployment.yaml - # api-services are now in common - inference-deployment.yaml - # inference-service.yaml is now in common - # tmkms-service.yaml is now in common - -# Note: The actual ConfigMaps for join nodes (join-node-settings, join-node-overrides) -# are NOT listed here. They are provided by the overlays (e.g., k3s/overlays/worker2/) -# as they are worker-specific. 
-# The ConfigMap templates in k3s/common/node-configs/ (like join-node-config-configmap.template.yaml) -# are for user reference to create the actual ConfigMaps in overlays. diff --git a/k3s/join/node-statefulset.yaml b/k3s/join/node-statefulset.yaml index b0df267..8a991ba 100644 --- a/k3s/join/node-statefulset.yaml +++ b/k3s/join/node-statefulset.yaml @@ -16,8 +16,6 @@ spec: app: node spec: enableServiceLinks: false - nodeSelector: - kubernetes.io/hostname: # e.g., k8s-worker-2 - UNCOMMENT AND SET THIS imagePullSecrets: # Add if your image is in a private registry - name: ghcr-credentials containers: diff --git a/k3s/join/tmkms-deployment.yaml b/k3s/join/tmkms-deployment.yaml index 820e8b5..1684898 100644 --- a/k3s/join/tmkms-deployment.yaml +++ b/k3s/join/tmkms-deployment.yaml @@ -23,11 +23,7 @@ spec: imagePullPolicy: IfNotPresent env: - name: VALIDATOR_LISTEN_ADDRESS - value: "tcp://node:26658" # Assumes node service is named 'node' and exposes port 26658 for tmkms - # Add other TMKMS specific environment variables if needed from a ConfigMap - # envFrom: - # - configMapRef: - # name: join-node-overrides + value: "tcp://node:26658" ports: - containerPort: 26658 # Though tmkms connects out, its good practice to declare if it also listens on a port, even if not exposed via service # The docker-compose doesn't explicitly expose a port for tmkms itself. @@ -38,5 +34,3 @@ spec: - name: tmkms-data persistentVolumeClaim: claimName: tmkms-pvc - # nodeSelector: # Usually TMKMS runs alongside its node. Add if you want to pin it. 
- # kubernetes.io/hostname: # e.g., k8s-worker-2 diff --git a/k3s/common/tmkms-service.yaml b/k3s/join/tmkms-service.yaml similarity index 100% rename from k3s/common/tmkms-service.yaml rename to k3s/join/tmkms-service.yaml diff --git a/k3s/overlays/worker2/kustomization.yaml b/k3s/overlays/worker2/kustomization.yaml index 3282d1d6..521fac4 100644 --- a/k3s/overlays/worker2/kustomization.yaml +++ b/k3s/overlays/worker2/kustomization.yaml @@ -1,12 +1,8 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization -bases: - - ../../join # Points to the k3s/join directory as the base - -# Worker-specific ConfigMaps that you will create in this directory -# by copying and filling the templates from the base. resources: + - ../../join - worker2-node-config.yaml # You will create this file - worker2-overrides.yaml # You will create this file @@ -33,6 +29,8 @@ patchesStrategicMerge: nodeSelector: kubernetes.io/hostname: k8s-worker-2 +# Patches for tmkms and api deployments if needed for this worker + # To also pin TMKMS and API deployments to k8s-worker-2, add similar patches: # - |- # apiVersion: apps/v1 diff --git a/k3s/overlays/worker3/kustomization.yaml b/k3s/overlays/worker3/kustomization.yaml new file mode 100644 index 0000000..22fb843 --- /dev/null +++ b/k3s/overlays/worker3/kustomization.yaml @@ -0,0 +1,55 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../join + - worker3-node-config.yaml + - worker3-overrides.yaml + +patchesStrategicMerge: + - |- # Patch for Node StatefulSet + apiVersion: apps/v1 + kind: StatefulSet + metadata: + name: node + spec: + template: + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-2 + + - |- # Patch for Inference Deployment + apiVersion: apps/v1 + kind: Deployment + metadata: + name: inference + spec: + template: + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-2 + +# Patches for tmkms and api deployments if needed for this worker + +# To also pin TMKMS and API 
deployments to k8s-worker-2, add similar patches: +# - |- +# apiVersion: apps/v1 +# kind: Deployment +# metadata: +# name: tmkms +# spec: +# template: +# spec: +# nodeSelector: +# kubernetes.io/hostname: k8s-worker-2 +# +# - |- +# apiVersion: apps/v1 +# kind: Deployment +# metadata: +# name: api +# spec: +# template: +# spec: +# nodeSelector: +# kubernetes.io/hostname: k8s-worker-2 \ No newline at end of file From beffe728748f0ad90675c26e119a1337a17d3d91 Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 22:12:52 -0700 Subject: [PATCH 31/48] patches --- k3s/README.md | 32 +++++++---- .../api-deployment-patch.yaml | 9 +++ .../join-k8s-worker-2/config-patch.yaml | 8 +++ .../inference-deployment-patch.yaml | 9 +++ .../join-k8s-worker-2/kustomization.yaml | 12 ++++ .../node-statefulset-patch.yaml | 9 +++ .../tmkms-deployment-patch.yaml | 9 +++ .../api-deployment-patch.yaml | 9 +++ .../join-k8s-worker-3/config-patch.yaml | 8 +++ .../inference-deployment-patch.yaml | 9 +++ .../join-k8s-worker-3/kustomization.yaml | 12 ++++ .../node-statefulset-patch.yaml | 9 +++ .../tmkms-deployment-patch.yaml | 9 +++ k3s/overlays/worker2/kustomization.yaml | 55 ------------------- k3s/overlays/worker3/kustomization.yaml | 55 ------------------- 15 files changed, 134 insertions(+), 120 deletions(-) create mode 100644 k3s/overlays/join-k8s-worker-2/api-deployment-patch.yaml create mode 100644 k3s/overlays/join-k8s-worker-2/config-patch.yaml create mode 100644 k3s/overlays/join-k8s-worker-2/inference-deployment-patch.yaml create mode 100644 k3s/overlays/join-k8s-worker-2/kustomization.yaml create mode 100644 k3s/overlays/join-k8s-worker-2/node-statefulset-patch.yaml create mode 100644 k3s/overlays/join-k8s-worker-2/tmkms-deployment-patch.yaml create mode 100644 k3s/overlays/join-k8s-worker-3/api-deployment-patch.yaml create mode 100644 k3s/overlays/join-k8s-worker-3/config-patch.yaml create mode 100644 k3s/overlays/join-k8s-worker-3/inference-deployment-patch.yaml create mode 
100644 k3s/overlays/join-k8s-worker-3/kustomization.yaml create mode 100644 k3s/overlays/join-k8s-worker-3/node-statefulset-patch.yaml create mode 100644 k3s/overlays/join-k8s-worker-3/tmkms-deployment-patch.yaml delete mode 100644 k3s/overlays/worker2/kustomization.yaml delete mode 100644 k3s/overlays/worker3/kustomization.yaml diff --git a/k3s/README.md b/k3s/README.md index 6a977aa..bedf397 100644 --- a/k3s/README.md +++ b/k3s/README.md @@ -1,6 +1,7 @@ Run genesis node ```bash +kubectl create namespace genesis # if not already created kubectl apply -k k3s/genesis -n genesis ``` @@ -9,15 +10,26 @@ Stop genesis node kubectl delete all --all -n genesis ``` +Run join-worker-2 + +```bash +kubectl create namespace join-k8s-worker-2 # if not already created +kubectl apply -k k3s/overlays/join-k8s-worker-2 -n join-k8s-worker-2 +``` + +Stop join-worker-2 +```bash +kubectl delete all --all -n join-k8s-worker-2 +``` + +Run join-worker-3 + +```bash +kubectl create namespace join-k8s-worker-3 # if not already created +kubectl apply -k k3s/overlays/join-k8s-worker-3 -n join-k8s-worker-3 +``` + +Stop join-worker-3 ```bash -NAMESPACE=join-worker2 -kubectl create namespace $NAMESPACE -kubectl apply -f worker2-node-config.yaml -n $NAMESPACE -kubectl apply -f worker2-overrides.yaml -n $NAMESPACE -# Assuming nodeSelector in k3s/join/*-deployment/statefulset.yaml is set for k8s-worker-2 -kubectl apply -f k3s/join/tmkms-pvc.yaml -n $NAMESPACE -kubectl apply -f k3s/join/inference-data-pvc.yaml -n $NAMESPACE -kubectl apply -f k3s/join/tmkms-deployment.yaml -n $NAMESPACE -kubectl apply -f k3s/join/node-statefulset.yaml -n $NAMESPACE # and so on for all manifests -kubectl apply -f k3s/join/ -n $NAMESPACE # Or apply all at once after configmaps and PVCs +kubectl delete all --all -n join-k8s-worker-3 ``` diff --git a/k3s/overlays/join-k8s-worker-2/api-deployment-patch.yaml b/k3s/overlays/join-k8s-worker-2/api-deployment-patch.yaml new file mode 100644 index 0000000..c2c660e --- /dev/null 
+++ b/k3s/overlays/join-k8s-worker-2/api-deployment-patch.yaml @@ -0,0 +1,9 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api +spec: + template: + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-2 \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-2/config-patch.yaml b/k3s/overlays/join-k8s-worker-2/config-patch.yaml new file mode 100644 index 0000000..3487dff --- /dev/null +++ b/k3s/overlays/join-k8s-worker-2/config-patch.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: config +data: + KEY_NAME: "join-k8s-worker-2" + PUBLIC_URL: "http://35.192.7.224:30000" + P2P_EXTERNAL_ADDRESS: "tcp://35.192.7.224:30001" \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-2/inference-deployment-patch.yaml b/k3s/overlays/join-k8s-worker-2/inference-deployment-patch.yaml new file mode 100644 index 0000000..f28cae6 --- /dev/null +++ b/k3s/overlays/join-k8s-worker-2/inference-deployment-patch.yaml @@ -0,0 +1,9 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: inference +spec: + template: + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-2 \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-2/kustomization.yaml b/k3s/overlays/join-k8s-worker-2/kustomization.yaml new file mode 100644 index 0000000..c6e826d --- /dev/null +++ b/k3s/overlays/join-k8s-worker-2/kustomization.yaml @@ -0,0 +1,12 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../join + +patchesStrategicMerge: + - tmkms-deployment-patch.yaml + - node-statefulset-patch.yaml + - api-deployment-patch.yaml + - inference-deployment-patch.yaml + - config-patch.yaml diff --git a/k3s/overlays/join-k8s-worker-2/node-statefulset-patch.yaml b/k3s/overlays/join-k8s-worker-2/node-statefulset-patch.yaml new file mode 100644 index 0000000..389ed99 --- /dev/null +++ b/k3s/overlays/join-k8s-worker-2/node-statefulset-patch.yaml @@ -0,0 +1,9 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + 
name: node +spec: + template: + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-2 \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-2/tmkms-deployment-patch.yaml b/k3s/overlays/join-k8s-worker-2/tmkms-deployment-patch.yaml new file mode 100644 index 0000000..30660c9 --- /dev/null +++ b/k3s/overlays/join-k8s-worker-2/tmkms-deployment-patch.yaml @@ -0,0 +1,9 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tmkms +spec: + template: + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-2 \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-3/api-deployment-patch.yaml b/k3s/overlays/join-k8s-worker-3/api-deployment-patch.yaml new file mode 100644 index 0000000..a728ed7 --- /dev/null +++ b/k3s/overlays/join-k8s-worker-3/api-deployment-patch.yaml @@ -0,0 +1,9 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api +spec: + template: + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-3 \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-3/config-patch.yaml b/k3s/overlays/join-k8s-worker-3/config-patch.yaml new file mode 100644 index 0000000..e3f4c44 --- /dev/null +++ b/k3s/overlays/join-k8s-worker-3/config-patch.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: config +data: + KEY_NAME: "join-k8s-worker-3" + PUBLIC_URL: "http://34.9.17.182:30000" + P2P_EXTERNAL_ADDRESS: "tcp://34.9.17.182:30001" \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-3/inference-deployment-patch.yaml b/k3s/overlays/join-k8s-worker-3/inference-deployment-patch.yaml new file mode 100644 index 0000000..fe57e37 --- /dev/null +++ b/k3s/overlays/join-k8s-worker-3/inference-deployment-patch.yaml @@ -0,0 +1,9 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: inference +spec: + template: + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-3 \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-3/kustomization.yaml 
b/k3s/overlays/join-k8s-worker-3/kustomization.yaml new file mode 100644 index 0000000..c6e826d --- /dev/null +++ b/k3s/overlays/join-k8s-worker-3/kustomization.yaml @@ -0,0 +1,12 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../join + +patchesStrategicMerge: + - tmkms-deployment-patch.yaml + - node-statefulset-patch.yaml + - api-deployment-patch.yaml + - inference-deployment-patch.yaml + - config-patch.yaml diff --git a/k3s/overlays/join-k8s-worker-3/node-statefulset-patch.yaml b/k3s/overlays/join-k8s-worker-3/node-statefulset-patch.yaml new file mode 100644 index 0000000..ff157cc --- /dev/null +++ b/k3s/overlays/join-k8s-worker-3/node-statefulset-patch.yaml @@ -0,0 +1,9 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: node +spec: + template: + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-3 \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-3/tmkms-deployment-patch.yaml b/k3s/overlays/join-k8s-worker-3/tmkms-deployment-patch.yaml new file mode 100644 index 0000000..ed89191 --- /dev/null +++ b/k3s/overlays/join-k8s-worker-3/tmkms-deployment-patch.yaml @@ -0,0 +1,9 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tmkms +spec: + template: + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-3 \ No newline at end of file diff --git a/k3s/overlays/worker2/kustomization.yaml b/k3s/overlays/worker2/kustomization.yaml deleted file mode 100644 index 521fac4..0000000 --- a/k3s/overlays/worker2/kustomization.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -resources: - - ../../join - - worker2-node-config.yaml # You will create this file - - worker2-overrides.yaml # You will create this file - -patchesStrategicMerge: - - |- # Patch for Node StatefulSet - apiVersion: apps/v1 - kind: StatefulSet - metadata: - name: node - spec: - template: - spec: - nodeSelector: - kubernetes.io/hostname: k8s-worker-2 - - - |- # Patch 
for Inference Deployment - apiVersion: apps/v1 - kind: Deployment - metadata: - name: inference - spec: - template: - spec: - nodeSelector: - kubernetes.io/hostname: k8s-worker-2 - -# Patches for tmkms and api deployments if needed for this worker - -# To also pin TMKMS and API deployments to k8s-worker-2, add similar patches: -# - |- -# apiVersion: apps/v1 -# kind: Deployment -# metadata: -# name: tmkms -# spec: -# template: -# spec: -# nodeSelector: -# kubernetes.io/hostname: k8s-worker-2 -# -# - |- -# apiVersion: apps/v1 -# kind: Deployment -# metadata: -# name: api -# spec: -# template: -# spec: -# nodeSelector: -# kubernetes.io/hostname: k8s-worker-2 \ No newline at end of file diff --git a/k3s/overlays/worker3/kustomization.yaml b/k3s/overlays/worker3/kustomization.yaml deleted file mode 100644 index 22fb843..0000000 --- a/k3s/overlays/worker3/kustomization.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -resources: - - ../../join - - worker3-node-config.yaml - - worker3-overrides.yaml - -patchesStrategicMerge: - - |- # Patch for Node StatefulSet - apiVersion: apps/v1 - kind: StatefulSet - metadata: - name: node - spec: - template: - spec: - nodeSelector: - kubernetes.io/hostname: k8s-worker-2 - - - |- # Patch for Inference Deployment - apiVersion: apps/v1 - kind: Deployment - metadata: - name: inference - spec: - template: - spec: - nodeSelector: - kubernetes.io/hostname: k8s-worker-2 - -# Patches for tmkms and api deployments if needed for this worker - -# To also pin TMKMS and API deployments to k8s-worker-2, add similar patches: -# - |- -# apiVersion: apps/v1 -# kind: Deployment -# metadata: -# name: tmkms -# spec: -# template: -# spec: -# nodeSelector: -# kubernetes.io/hostname: k8s-worker-2 -# -# - |- -# apiVersion: apps/v1 -# kind: Deployment -# metadata: -# name: api -# spec: -# template: -# spec: -# nodeSelector: -# kubernetes.io/hostname: k8s-worker-2 \ No newline at end of file From 
96ef1a5ab53a4d8f16870ce92743ca3d1f27c7f7 Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 22:19:18 -0700 Subject: [PATCH 32/48] renames --- k3s/join/api-deployment.yaml | 2 +- k3s/join/config.yaml | 2 +- k3s/join/kustomization.yaml | 1 + k3s/join/node-statefulset.yaml | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/k3s/join/api-deployment.yaml b/k3s/join/api-deployment.yaml index c86431b..58b3dd8 100644 --- a/k3s/join/api-deployment.yaml +++ b/k3s/join/api-deployment.yaml @@ -30,7 +30,7 @@ spec: name: admin-api envFrom: - configMapRef: - name: join-node-overrides # General env vars + name: config env: - name: NODE_CONFIG_PATH # Override specifically for the API pod to point to the mounted config value: "/etc/node-config/node-config.json" diff --git a/k3s/join/config.yaml b/k3s/join/config.yaml index 50317f4..ae40179 100644 --- a/k3s/join/config.yaml +++ b/k3s/join/config.yaml @@ -1,7 +1,7 @@ apiVersion: v1 kind: ConfigMap metadata: - name: join-node-overrides + name: config data: # Replace <...> placeholders with actual values for each worker # From config.env.template diff --git a/k3s/join/kustomization.yaml b/k3s/join/kustomization.yaml index 753cb41..04db6c4 100644 --- a/k3s/join/kustomization.yaml +++ b/k3s/join/kustomization.yaml @@ -9,3 +9,4 @@ resources: - node-statefulset.yaml - api-deployment.yaml - inference-deployment.yaml + - config.yaml diff --git a/k3s/join/node-statefulset.yaml b/k3s/join/node-statefulset.yaml index 8a991ba..2534e22 100644 --- a/k3s/join/node-statefulset.yaml +++ b/k3s/join/node-statefulset.yaml @@ -35,7 +35,7 @@ spec: name: tmkms envFrom: - configMapRef: - name: join-node-overrides # From join-overrides-configmap.template.yaml + name: config volumeMounts: - name: inference-data mountPath: /root/.inference # Shared data with API From 376873229a8ca9e53e2a7e3c83334b687c53e517 Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 22:42:14 -0700 Subject: [PATCH 33/48] Fix port exposure --- 
k3s/common/api-public-service.yaml | 5 +++-- k3s/common/node-service.yaml | 6 +++--- .../join-k8s-worker-2/api-public-service-patch.yaml | 7 +++++++ k3s/overlays/join-k8s-worker-2/kustomization.yaml | 2 ++ k3s/overlays/join-k8s-worker-2/node-service-patch.yaml | 7 +++++++ .../join-k8s-worker-3/api-public-service-patch.yaml | 7 +++++++ k3s/overlays/join-k8s-worker-3/kustomization.yaml | 2 ++ k3s/overlays/join-k8s-worker-3/node-service-patch.yaml | 7 +++++++ 8 files changed, 38 insertions(+), 5 deletions(-) create mode 100644 k3s/overlays/join-k8s-worker-2/api-public-service-patch.yaml create mode 100644 k3s/overlays/join-k8s-worker-2/node-service-patch.yaml create mode 100644 k3s/overlays/join-k8s-worker-3/api-public-service-patch.yaml create mode 100644 k3s/overlays/join-k8s-worker-3/node-service-patch.yaml diff --git a/k3s/common/api-public-service.yaml b/k3s/common/api-public-service.yaml index fef2703..11527d1 100644 --- a/k3s/common/api-public-service.yaml +++ b/k3s/common/api-public-service.yaml @@ -5,11 +5,12 @@ metadata: labels: app: api spec: - type: NodePort + type: ClusterIP selector: app: api ports: - name: public port: 9000 targetPort: 9000 - nodePort: 30000 \ No newline at end of file + externalIPs: + - "34.9.136.116" diff --git a/k3s/common/node-service.yaml b/k3s/common/node-service.yaml index bff97ce..74e4e30 100644 --- a/k3s/common/node-service.yaml +++ b/k3s/common/node-service.yaml @@ -5,15 +5,15 @@ metadata: labels: app: node spec: - type: NodePort + type: ClusterIP selector: app: node ports: - name: tendermint-p2p port: 26656 targetPort: 26656 - nodePort: 30001 # Expose to the outer world - name: tendermint-rpc port: 26657 targetPort: 26657 - nodePort: 30002 # Expose to the outer world + externalIPs: + - "34.9.136.116" diff --git a/k3s/overlays/join-k8s-worker-2/api-public-service-patch.yaml b/k3s/overlays/join-k8s-worker-2/api-public-service-patch.yaml new file mode 100644 index 0000000..fae799f --- /dev/null +++ 
b/k3s/overlays/join-k8s-worker-2/api-public-service-patch.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Service +metadata: + name: api-public # This must match the name of the service in k3s/common +spec: + externalIPs: + - "35.192.7.224" \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-2/kustomization.yaml b/k3s/overlays/join-k8s-worker-2/kustomization.yaml index c6e826d..c29fb5a 100644 --- a/k3s/overlays/join-k8s-worker-2/kustomization.yaml +++ b/k3s/overlays/join-k8s-worker-2/kustomization.yaml @@ -10,3 +10,5 @@ patchesStrategicMerge: - api-deployment-patch.yaml - inference-deployment-patch.yaml - config-patch.yaml + - api-public-service-patch.yaml + - node-service-patch.yaml diff --git a/k3s/overlays/join-k8s-worker-2/node-service-patch.yaml b/k3s/overlays/join-k8s-worker-2/node-service-patch.yaml new file mode 100644 index 0000000..0c2815a --- /dev/null +++ b/k3s/overlays/join-k8s-worker-2/node-service-patch.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Service +metadata: + name: node # This must match the name of the service in k3s/common +spec: + externalIPs: + - "35.192.7.224" \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-3/api-public-service-patch.yaml b/k3s/overlays/join-k8s-worker-3/api-public-service-patch.yaml new file mode 100644 index 0000000..9b1cb60 --- /dev/null +++ b/k3s/overlays/join-k8s-worker-3/api-public-service-patch.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Service +metadata: + name: api-public # This must match the name of the service in k3s/common +spec: + externalIPs: + - "34.9.17.182" \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-3/kustomization.yaml b/k3s/overlays/join-k8s-worker-3/kustomization.yaml index c6e826d..c29fb5a 100644 --- a/k3s/overlays/join-k8s-worker-3/kustomization.yaml +++ b/k3s/overlays/join-k8s-worker-3/kustomization.yaml @@ -10,3 +10,5 @@ patchesStrategicMerge: - api-deployment-patch.yaml - inference-deployment-patch.yaml - config-patch.yaml + - 
api-public-service-patch.yaml + - node-service-patch.yaml diff --git a/k3s/overlays/join-k8s-worker-3/node-service-patch.yaml b/k3s/overlays/join-k8s-worker-3/node-service-patch.yaml new file mode 100644 index 0000000..db253bf --- /dev/null +++ b/k3s/overlays/join-k8s-worker-3/node-service-patch.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Service +metadata: + name: node # This must match the name of the service in k3s/common +spec: + externalIPs: + - "34.9.17.182" \ No newline at end of file From ac9a3c76a88064eeb0e986c9f60e296e29001b75 Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 23:08:29 -0700 Subject: [PATCH 34/48] x --- k3s/common/api-public-service.yaml | 5 ++--- k3s/common/node-service.yaml | 6 +++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/k3s/common/api-public-service.yaml b/k3s/common/api-public-service.yaml index 11527d1..50b51a1 100644 --- a/k3s/common/api-public-service.yaml +++ b/k3s/common/api-public-service.yaml @@ -5,12 +5,11 @@ metadata: labels: app: api spec: - type: ClusterIP + type: NodePort selector: app: api ports: - name: public port: 9000 targetPort: 9000 - externalIPs: - - "34.9.136.116" + nodePort: 30000 diff --git a/k3s/common/node-service.yaml b/k3s/common/node-service.yaml index 74e4e30..50f4ad8 100644 --- a/k3s/common/node-service.yaml +++ b/k3s/common/node-service.yaml @@ -5,15 +5,15 @@ metadata: labels: app: node spec: - type: ClusterIP + type: NodePort selector: app: node ports: - name: tendermint-p2p port: 26656 targetPort: 26656 + nodePort: 30001 - name: tendermint-rpc port: 26657 targetPort: 26657 - externalIPs: - - "34.9.136.116" + nodePort: 30002 From 79129b4e4405f5d89da0d4621028f82bfc0fc6f9 Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 23:21:11 -0700 Subject: [PATCH 35/48] move back to NodePort --- k3s/common/kustomization.yaml | 2 -- k3s/{common => genesis}/api-public-service.yaml | 0 k3s/{common => genesis}/node-service.yaml | 0 3 files changed, 2 deletions(-) rename 
k3s/{common => genesis}/api-public-service.yaml (100%) rename k3s/{common => genesis}/node-service.yaml (100%) diff --git a/k3s/common/kustomization.yaml b/k3s/common/kustomization.yaml index da37aa7..a8b0d80 100644 --- a/k3s/common/kustomization.yaml +++ b/k3s/common/kustomization.yaml @@ -2,9 +2,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - api-public-service.yaml - api-private-service.yaml - - node-service.yaml - inference-service.yaml # Common ConfigMaps diff --git a/k3s/common/api-public-service.yaml b/k3s/genesis/api-public-service.yaml similarity index 100% rename from k3s/common/api-public-service.yaml rename to k3s/genesis/api-public-service.yaml diff --git a/k3s/common/node-service.yaml b/k3s/genesis/node-service.yaml similarity index 100% rename from k3s/common/node-service.yaml rename to k3s/genesis/node-service.yaml From f31989948b4fbcb209985793bc62823d04c4e757 Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 23:39:44 -0700 Subject: [PATCH 36/48] join overlays --- k3s/genesis/kustomization.yaml | 2 ++ .../api-public-service-patch.yaml | 7 ------- .../join-k8s-worker-2/api-public-service.yaml | 15 +++++++++++++++ .../join-k8s-worker-2/config-patch.yaml | 4 ++-- .../join-k8s-worker-2/kustomization.yaml | 6 +++--- .../join-k8s-worker-2/node-service-patch.yaml | 7 ------- .../join-k8s-worker-2/node-service.yaml | 19 +++++++++++++++++++ .../api-public-service-patch.yaml | 7 ------- .../join-k8s-worker-3/api-public-service.yaml | 15 +++++++++++++++ .../join-k8s-worker-3/config-patch.yaml | 4 ++-- .../join-k8s-worker-3/kustomization.yaml | 6 +++--- .../join-k8s-worker-3/node-service-patch.yaml | 7 ------- .../join-k8s-worker-3/node-service.yaml | 19 +++++++++++++++++++ 13 files changed, 80 insertions(+), 38 deletions(-) delete mode 100644 k3s/overlays/join-k8s-worker-2/api-public-service-patch.yaml create mode 100644 k3s/overlays/join-k8s-worker-2/api-public-service.yaml delete mode 100644 
k3s/overlays/join-k8s-worker-2/node-service-patch.yaml create mode 100644 k3s/overlays/join-k8s-worker-2/node-service.yaml delete mode 100644 k3s/overlays/join-k8s-worker-3/api-public-service-patch.yaml create mode 100644 k3s/overlays/join-k8s-worker-3/api-public-service.yaml delete mode 100644 k3s/overlays/join-k8s-worker-3/node-service-patch.yaml create mode 100644 k3s/overlays/join-k8s-worker-3/node-service.yaml diff --git a/k3s/genesis/kustomization.yaml b/k3s/genesis/kustomization.yaml index 5f3a868..9ca7fc6 100644 --- a/k3s/genesis/kustomization.yaml +++ b/k3s/genesis/kustomization.yaml @@ -12,3 +12,5 @@ resources: - node-statefulset.yaml - api-deployment.yaml - inference-deployment.yaml + - api-public-service.yaml + - node-service.yaml diff --git a/k3s/overlays/join-k8s-worker-2/api-public-service-patch.yaml b/k3s/overlays/join-k8s-worker-2/api-public-service-patch.yaml deleted file mode 100644 index fae799f..0000000 --- a/k3s/overlays/join-k8s-worker-2/api-public-service-patch.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: api-public # This must match the name of the service in k3s/common -spec: - externalIPs: - - "35.192.7.224" \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-2/api-public-service.yaml b/k3s/overlays/join-k8s-worker-2/api-public-service.yaml new file mode 100644 index 0000000..cea37d9 --- /dev/null +++ b/k3s/overlays/join-k8s-worker-2/api-public-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: api-public + labels: + app: api +spec: + type: NodePort + selector: + app: api + ports: + - name: public + port: 9000 + targetPort: 9000 + nodePort: 30010 diff --git a/k3s/overlays/join-k8s-worker-2/config-patch.yaml b/k3s/overlays/join-k8s-worker-2/config-patch.yaml index 3487dff..6054b33 100644 --- a/k3s/overlays/join-k8s-worker-2/config-patch.yaml +++ b/k3s/overlays/join-k8s-worker-2/config-patch.yaml @@ -4,5 +4,5 @@ metadata: name: config data: KEY_NAME: 
"join-k8s-worker-2" - PUBLIC_URL: "35.192.7.224:3000" - P2P_EXTERNAL_ADDRESS: "35.192.7.224:30001" \ No newline at end of file + PUBLIC_URL: "35.192.7.224:30010" + P2P_EXTERNAL_ADDRESS: "35.192.7.224:30011" diff --git a/k3s/overlays/join-k8s-worker-2/kustomization.yaml b/k3s/overlays/join-k8s-worker-2/kustomization.yaml index c29fb5a..ef452c4 100644 --- a/k3s/overlays/join-k8s-worker-2/kustomization.yaml +++ b/k3s/overlays/join-k8s-worker-2/kustomization.yaml @@ -1,8 +1,10 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization -bases: +resources: - ../../join + - api-public-service.yaml + - node-service.yaml patchesStrategicMerge: - tmkms-deployment-patch.yaml @@ -10,5 +12,3 @@ patchesStrategicMerge: - api-deployment-patch.yaml - inference-deployment-patch.yaml - config-patch.yaml - - api-public-service-patch.yaml - - node-service-patch.yaml diff --git a/k3s/overlays/join-k8s-worker-2/node-service-patch.yaml b/k3s/overlays/join-k8s-worker-2/node-service-patch.yaml deleted file mode 100644 index 0c2815a..0000000 --- a/k3s/overlays/join-k8s-worker-2/node-service-patch.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: node # This must match the name of the service in k3s/common -spec: - externalIPs: - - "35.192.7.224" \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-2/node-service.yaml b/k3s/overlays/join-k8s-worker-2/node-service.yaml new file mode 100644 index 0000000..cde528c --- /dev/null +++ b/k3s/overlays/join-k8s-worker-2/node-service.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: Service +metadata: + name: node + labels: + app: node +spec: + type: NodePort + selector: + app: node + ports: + - name: tendermint-p2p + port: 26656 + targetPort: 26656 + nodePort: 30011 + - name: tendermint-rpc + port: 26657 + targetPort: 26657 + nodePort: 30012 diff --git a/k3s/overlays/join-k8s-worker-3/api-public-service-patch.yaml b/k3s/overlays/join-k8s-worker-3/api-public-service-patch.yaml deleted file mode 
100644 index 9b1cb60..0000000 --- a/k3s/overlays/join-k8s-worker-3/api-public-service-patch.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: api-public # This must match the name of the service in k3s/common -spec: - externalIPs: - - "34.9.17.182" \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-3/api-public-service.yaml b/k3s/overlays/join-k8s-worker-3/api-public-service.yaml new file mode 100644 index 0000000..0105a1a --- /dev/null +++ b/k3s/overlays/join-k8s-worker-3/api-public-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: api-public + labels: + app: api +spec: + type: NodePort + selector: + app: api + ports: + - name: public + port: 9000 + targetPort: 9000 + nodePort: 30020 diff --git a/k3s/overlays/join-k8s-worker-3/config-patch.yaml b/k3s/overlays/join-k8s-worker-3/config-patch.yaml index e3f4c44..fff3bf5 100644 --- a/k3s/overlays/join-k8s-worker-3/config-patch.yaml +++ b/k3s/overlays/join-k8s-worker-3/config-patch.yaml @@ -4,5 +4,5 @@ metadata: name: config data: KEY_NAME: "join-k8s-worker-3" - PUBLIC_URL: "34.9.17.182:3000" - P2P_EXTERNAL_ADDRESS: "34.9.17.182:30001" \ No newline at end of file + PUBLIC_URL: "34.9.17.182:30020" + P2P_EXTERNAL_ADDRESS: "34.9.17.182:30021" diff --git a/k3s/overlays/join-k8s-worker-3/kustomization.yaml b/k3s/overlays/join-k8s-worker-3/kustomization.yaml index c29fb5a..ef452c4 100644 --- a/k3s/overlays/join-k8s-worker-3/kustomization.yaml +++ b/k3s/overlays/join-k8s-worker-3/kustomization.yaml @@ -1,8 +1,10 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization -bases: +resources: - ../../join + - api-public-service.yaml + - node-service.yaml patchesStrategicMerge: - tmkms-deployment-patch.yaml @@ -10,5 +12,3 @@ patchesStrategicMerge: - api-deployment-patch.yaml - inference-deployment-patch.yaml - config-patch.yaml - - api-public-service-patch.yaml - - node-service-patch.yaml diff --git 
a/k3s/overlays/join-k8s-worker-3/node-service-patch.yaml b/k3s/overlays/join-k8s-worker-3/node-service-patch.yaml deleted file mode 100644 index db253bf..0000000 --- a/k3s/overlays/join-k8s-worker-3/node-service-patch.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: node # This must match the name of the service in k3s/common -spec: - externalIPs: - - "34.9.17.182" \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-3/node-service.yaml b/k3s/overlays/join-k8s-worker-3/node-service.yaml new file mode 100644 index 0000000..fc1e556 --- /dev/null +++ b/k3s/overlays/join-k8s-worker-3/node-service.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: Service +metadata: + name: node + labels: + app: node +spec: + type: NodePort + selector: + app: node + ports: + - name: tendermint-p2p + port: 26656 + targetPort: 26656 + nodePort: 30021 + - name: tendermint-rpc + port: 26657 + targetPort: 26657 + nodePort: 30022 From 3a44f674939b6a627eb4f9cfb27a12933458d6c8 Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 23:53:43 -0700 Subject: [PATCH 37/48] more fixes --- k3s/README.md | 6 ++++++ k3s/join/config.yaml | 2 +- k3s/join/tmkms-deployment.yaml | 2 +- k3s/join/tmkms-pvc.yaml | 1 - k3s/overlays/join-k8s-worker-2/config-patch.yaml | 2 +- k3s/overlays/join-k8s-worker-3/config-patch.yaml | 2 +- 6 files changed, 10 insertions(+), 5 deletions(-) diff --git a/k3s/README.md b/k3s/README.md index bedf397..a25be69 100644 --- a/k3s/README.md +++ b/k3s/README.md @@ -20,6 +20,9 @@ kubectl apply -k k3s/overlays/join-k8s-worker-2 -n join-k8s-worker-2 Stop join-worker-2 ```bash kubectl delete all --all -n join-k8s-worker-2 + +# To delete pvc +kubectl delete pvc tmkms-data-pvc -n join-k8s-worker-2 ``` Run join-worker-3 @@ -32,4 +35,7 @@ kubectl apply -k k3s/overlays/join-k8s-worker-3 -n join-k8s-worker-3 Stop join-worker-3 ```bash kubectl delete all --all -n join-k8s-worker-3 + +# To delete pvc +kubectl delete pvc tmkms-data-pvc -n 
join-k8s-worker-3 ``` diff --git a/k3s/join/config.yaml b/k3s/join/config.yaml index ae40179..3319ef6 100644 --- a/k3s/join/config.yaml +++ b/k3s/join/config.yaml @@ -6,7 +6,7 @@ data: # Replace <...> placeholders with actual values for each worker # From config.env.template KEY_NAME: "" - PUBLIC_URL: "http://:" + DAPI_API_PUBLIC_URL: "http://:" P2P_EXTERNAL_ADDRESS: "tcp://:" # e.g., port 5000 # Identical for all join workers diff --git a/k3s/join/tmkms-deployment.yaml b/k3s/join/tmkms-deployment.yaml index 1684898..8eaaaf3 100644 --- a/k3s/join/tmkms-deployment.yaml +++ b/k3s/join/tmkms-deployment.yaml @@ -33,4 +33,4 @@ spec: volumes: - name: tmkms-data persistentVolumeClaim: - claimName: tmkms-pvc + claimName: tmkms-data-pvc diff --git a/k3s/join/tmkms-pvc.yaml b/k3s/join/tmkms-pvc.yaml index b39f11b..7959e4f 100644 --- a/k3s/join/tmkms-pvc.yaml +++ b/k3s/join/tmkms-pvc.yaml @@ -2,7 +2,6 @@ apiVersion: v1 kind: PersistentVolumeClaim metadata: name: tmkms-data-pvc - # namespace: # Will be applied with -n flag spec: accessModes: - ReadWriteOnce # Suitable for a single TMKMS pod diff --git a/k3s/overlays/join-k8s-worker-2/config-patch.yaml b/k3s/overlays/join-k8s-worker-2/config-patch.yaml index 6054b33..31d1764 100644 --- a/k3s/overlays/join-k8s-worker-2/config-patch.yaml +++ b/k3s/overlays/join-k8s-worker-2/config-patch.yaml @@ -4,5 +4,5 @@ metadata: name: config data: KEY_NAME: "join-k8s-worker-2" - PUBLIC_URL: "35.192.7.224:30010" + DAPI_API_PUBLIC_URL: "35.192.7.224:30010" P2P_EXTERNAL_ADDRESS: "35.192.7.224:30011" diff --git a/k3s/overlays/join-k8s-worker-3/config-patch.yaml b/k3s/overlays/join-k8s-worker-3/config-patch.yaml index fff3bf5..93d0e75 100644 --- a/k3s/overlays/join-k8s-worker-3/config-patch.yaml +++ b/k3s/overlays/join-k8s-worker-3/config-patch.yaml @@ -4,5 +4,5 @@ metadata: name: config data: KEY_NAME: "join-k8s-worker-3" - PUBLIC_URL: "34.9.17.182:30020" + DAPI_API_PUBLIC_URL: "34.9.17.182:30020" P2P_EXTERNAL_ADDRESS: "34.9.17.182:30021" From 
896455274635592afeef01d787e4f3862a7c9623 Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 23:57:51 -0700 Subject: [PATCH 38/48] expose tmkms port --- k3s/join/config.yaml | 1 + k3s/overlays/join-k8s-worker-2/node-service.yaml | 3 +++ k3s/overlays/join-k8s-worker-3/node-service.yaml | 3 +++ 3 files changed, 7 insertions(+) diff --git a/k3s/join/config.yaml b/k3s/join/config.yaml index 3319ef6..60004d8 100644 --- a/k3s/join/config.yaml +++ b/k3s/join/config.yaml @@ -19,6 +19,7 @@ data: DAPI_CHAIN_NODE__P2P_URL: "http://node:26656" RPC_SERVER_URL_1: "http://34.9.136.116:30002" RPC_SERVER_URL_2: "http://34.9.136.116:30002" + TMKMS_PORT: "26658" # FIXME: is it really needed as an env var? # Variables from docker-compose for node TRUSTED_BLOCK_PERIOD: "2000" # Default from docker-compose diff --git a/k3s/overlays/join-k8s-worker-2/node-service.yaml b/k3s/overlays/join-k8s-worker-2/node-service.yaml index cde528c..ba46099 100644 --- a/k3s/overlays/join-k8s-worker-2/node-service.yaml +++ b/k3s/overlays/join-k8s-worker-2/node-service.yaml @@ -17,3 +17,6 @@ spec: port: 26657 targetPort: 26657 nodePort: 30012 + - name: tmkms + port: 26658 + targetPort: 26658 diff --git a/k3s/overlays/join-k8s-worker-3/node-service.yaml b/k3s/overlays/join-k8s-worker-3/node-service.yaml index fc1e556..9ed2fce 100644 --- a/k3s/overlays/join-k8s-worker-3/node-service.yaml +++ b/k3s/overlays/join-k8s-worker-3/node-service.yaml @@ -17,3 +17,6 @@ spec: port: 26657 targetPort: 26657 nodePort: 30022 + - name: tmkms + port: 26658 + targetPort: 26658 From dcf73aeb6defbc32fe13b017d8a845c561a4cfb1 Mon Sep 17 00:00:00 2001 From: dima Date: Wed, 14 May 2025 23:59:15 -0700 Subject: [PATCH 39/48] fixes --- k3s/README.md | 1 + k3s/overlays/join-k8s-worker-2/config-patch.yaml | 2 +- k3s/overlays/join-k8s-worker-3/config-patch.yaml | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/k3s/README.md b/k3s/README.md index a25be69..bcd2c4f 100644 --- a/k3s/README.md +++ b/k3s/README.md @@ 
-14,6 +14,7 @@ Run join-worker-2 ```bash kubectl create namespace join-k8s-worker-2 # if not already created + kubectl apply -k k3s/overlays/join-k8s-worker-2 -n join-k8s-worker-2 ``` diff --git a/k3s/overlays/join-k8s-worker-2/config-patch.yaml b/k3s/overlays/join-k8s-worker-2/config-patch.yaml index 31d1764..226cf74 100644 --- a/k3s/overlays/join-k8s-worker-2/config-patch.yaml +++ b/k3s/overlays/join-k8s-worker-2/config-patch.yaml @@ -4,5 +4,5 @@ metadata: name: config data: KEY_NAME: "join-k8s-worker-2" - DAPI_API_PUBLIC_URL: "35.192.7.224:30010" + DAPI_API__PUBLIC_URL: "35.192.7.224:30010" P2P_EXTERNAL_ADDRESS: "35.192.7.224:30011" diff --git a/k3s/overlays/join-k8s-worker-3/config-patch.yaml b/k3s/overlays/join-k8s-worker-3/config-patch.yaml index 93d0e75..af3419c 100644 --- a/k3s/overlays/join-k8s-worker-3/config-patch.yaml +++ b/k3s/overlays/join-k8s-worker-3/config-patch.yaml @@ -4,5 +4,5 @@ metadata: name: config data: KEY_NAME: "join-k8s-worker-3" - DAPI_API_PUBLIC_URL: "34.9.17.182:30020" + DAPI_API__PUBLIC_URL: "34.9.17.182:30020" P2P_EXTERNAL_ADDRESS: "34.9.17.182:30021" From b8700ce2cbcb2882293d589ddfed2f47c4ce6a84 Mon Sep 17 00:00:00 2001 From: dima Date: Thu, 15 May 2025 00:08:50 -0700 Subject: [PATCH 40/48] fix node config path --- k3s/join/api-deployment.yaml | 8 ++++---- k3s/join/config.yaml | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/k3s/join/api-deployment.yaml b/k3s/join/api-deployment.yaml index 58b3dd8..b85b8d8 100644 --- a/k3s/join/api-deployment.yaml +++ b/k3s/join/api-deployment.yaml @@ -32,14 +32,14 @@ spec: - configMapRef: name: config env: - - name: NODE_CONFIG_PATH # Override specifically for the API pod to point to the mounted config - value: "/etc/node-config/node-config.json" + - name: NODE_CONFIG_PATH + value: "/root/node_config.json" volumeMounts: - name: inference-data mountPath: /root/.inference - name: node-config - mountPath: /etc/node-config - readOnly: true + mountPath: /root/node_config.json + 
subPath: node_config.json volumes: - name: inference-data # This volume is now a hostPath hostPath: diff --git a/k3s/join/config.yaml b/k3s/join/config.yaml index 60004d8..93897c8 100644 --- a/k3s/join/config.yaml +++ b/k3s/join/config.yaml @@ -19,14 +19,14 @@ data: DAPI_CHAIN_NODE__P2P_URL: "http://node:26656" RPC_SERVER_URL_1: "http://34.9.136.116:30002" RPC_SERVER_URL_2: "http://34.9.136.116:30002" - TMKMS_PORT: "26658" # FIXME: is it really needed as an env var? + # TMKMS_PORT: "26658" # FIXME: is it really needed as an env var? # Variables from docker-compose for node TRUSTED_BLOCK_PERIOD: "2000" # Default from docker-compose CONFIG_p2p__allow_duplicate_ip: "true" CONFIG_p2p__handshake_timeout: "30s" CONFIG_p2p__dial_timeout: "30s" - TKMS_PORT: "26658" # Port node listens on for TMKMS + # TKMS_PORT: "26658" # Port node listens on for TMKMS # Additional for API service from docker-compose (if not already covered) DAPI_API__PUBLIC_SERVER_PORT: "9000" # Internal port for API service, API_PORT is for host exposure From 7270ade834e5bc6df2266d0167e6a37bc8cf63cf Mon Sep 17 00:00:00 2001 From: dima Date: Thu, 15 May 2025 00:09:17 -0700 Subject: [PATCH 41/48] comment tkms ports --- k3s/join/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k3s/join/config.yaml b/k3s/join/config.yaml index 93897c8..2918b60 100644 --- a/k3s/join/config.yaml +++ b/k3s/join/config.yaml @@ -19,7 +19,7 @@ data: DAPI_CHAIN_NODE__P2P_URL: "http://node:26656" RPC_SERVER_URL_1: "http://34.9.136.116:30002" RPC_SERVER_URL_2: "http://34.9.136.116:30002" - # TMKMS_PORT: "26658" # FIXME: is it really needed as an env var? 
+ # TMKMS_PORT: "26658" # Variables from docker-compose for node TRUSTED_BLOCK_PERIOD: "2000" # Default from docker-compose From 844167d4fbc4a4a16805fd393b28ab8ffc258e5b Mon Sep 17 00:00:00 2001 From: dima Date: Thu, 15 May 2025 10:49:35 -0700 Subject: [PATCH 42/48] Fix DAPI_CHAIN_NODE__SEED_API_URL --- k3s/README.md | 2 ++ k3s/join/config.yaml | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/k3s/README.md b/k3s/README.md index bcd2c4f..727d003 100644 --- a/k3s/README.md +++ b/k3s/README.md @@ -14,7 +14,9 @@ Run join-worker-2 ```bash kubectl create namespace join-k8s-worker-2 # if not already created +``` +```bash kubectl apply -k k3s/overlays/join-k8s-worker-2 -n join-k8s-worker-2 ``` diff --git a/k3s/join/config.yaml b/k3s/join/config.yaml index 2918b60..5f98436 100644 --- a/k3s/join/config.yaml +++ b/k3s/join/config.yaml @@ -10,8 +10,7 @@ data: P2P_EXTERNAL_ADDRESS: "tcp://:" # e.g., port 5000 # Identical for all join workers - NODE_CONFIG: "/etc/node-config/node-config.json" # Path where node-config.json will be mounted from the other ConfigMap - SEED_API_URL: "http://34.9.136.116:30000" + DAPI_CHAIN_NODE__SEED_API_URL: "http://34.9.136.116:30000" SEED_NODE_RPC_URL: "http://34.9.136.116:30002" SEED_NODE_P2P_URL: "tcp://34.9.136.116:30001" DAPI_API__POC_CALLBACK_URL: "http://api-private:9100" From 6aa8136e07e798ab218da0c85b91cd253e0a3ae4 Mon Sep 17 00:00:00 2001 From: dima Date: Thu, 15 May 2025 11:22:10 -0700 Subject: [PATCH 43/48] TMKMS works --- k3s/README.md | 7 +++++++ k3s/join/config.yaml | 4 ++-- k3s/join/tmkms-deployment.yaml | 7 +++++++ k3s/join/tmkms-service.yaml | 14 -------------- 4 files changed, 16 insertions(+), 16 deletions(-) delete mode 100644 k3s/join/tmkms-service.yaml diff --git a/k3s/README.md b/k3s/README.md index 727d003..787f558 100644 --- a/k3s/README.md +++ b/k3s/README.md @@ -42,3 +42,10 @@ kubectl delete all --all -n join-k8s-worker-3 # To delete pvc kubectl delete pvc tmkms-data-pvc -n join-k8s-worker-3 ``` + 
+Clean state +```bash +gcloud compute ssh k8s-worker-1 --zone us-central1-a --command "sudo rm -rf /srv/dai" +gcloud compute ssh k8s-worker-2 --zone us-central1-a --command "sudo rm -rf /srv/dai" +gcloud compute ssh k8s-worker-3 --zone us-central1-a --command "sudo rm -rf /srv/dai" +``` \ No newline at end of file diff --git a/k3s/join/config.yaml b/k3s/join/config.yaml index 5f98436..3e84217 100644 --- a/k3s/join/config.yaml +++ b/k3s/join/config.yaml @@ -18,14 +18,14 @@ data: DAPI_CHAIN_NODE__P2P_URL: "http://node:26656" RPC_SERVER_URL_1: "http://34.9.136.116:30002" RPC_SERVER_URL_2: "http://34.9.136.116:30002" - # TMKMS_PORT: "26658" + TMKMS_PORT: "26658" # Variables from docker-compose for node TRUSTED_BLOCK_PERIOD: "2000" # Default from docker-compose CONFIG_p2p__allow_duplicate_ip: "true" CONFIG_p2p__handshake_timeout: "30s" CONFIG_p2p__dial_timeout: "30s" - # TKMS_PORT: "26658" # Port node listens on for TMKMS + TKMS_PORT: "26658" # Port node listens on for TMKMS # Additional for API service from docker-compose (if not already covered) DAPI_API__PUBLIC_SERVER_PORT: "9000" # Internal port for API service, API_PORT is for host exposure diff --git a/k3s/join/tmkms-deployment.yaml b/k3s/join/tmkms-deployment.yaml index 8eaaaf3..fcd1a7e 100644 --- a/k3s/join/tmkms-deployment.yaml +++ b/k3s/join/tmkms-deployment.yaml @@ -17,6 +17,13 @@ spec: enableServiceLinks: false imagePullSecrets: - name: ghcr-credentials + initContainers: + - name: init-tmkms-data + image: ghcr.io/product-science/tmkms-softsign-with-keygen:0.1.2 + command: ["sh", "-c", "if [ ! -f /root/stage-dir/config.toml ]; then echo 'Initializing tmkms data volume...'; cp -r /root/.tmkms/. 
/root/stage-dir/; else echo 'tmkms data volume already initialized.'; fi"] + volumeMounts: + - name: tmkms-data + mountPath: /root/stage-dir containers: - name: tmkms image: ghcr.io/product-science/tmkms-softsign-with-keygen:0.1.2 diff --git a/k3s/join/tmkms-service.yaml b/k3s/join/tmkms-service.yaml deleted file mode 100644 index aef2770..0000000 --- a/k3s/join/tmkms-service.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: tmkms - labels: - app: tmkms -spec: - type: ClusterIP # Internal service, not exposed externally by default - selector: - app: tmkms - ports: - - name: tmkms-port - port: 26658 - targetPort: 26658 # Assumes the tmkms container exposes port 26658 \ No newline at end of file From 94ad72ca4536244859119ed436fe1a912676ccb9 Mon Sep 17 00:00:00 2001 From: dima Date: Thu, 15 May 2025 17:27:17 -0700 Subject: [PATCH 44/48] move some variable around --- k3s/join/api-deployment.yaml | 14 ++++++++++++++ k3s/join/config.yaml | 9 --------- k3s/join/tmkms-deployment.yaml | 9 +-------- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/k3s/join/api-deployment.yaml b/k3s/join/api-deployment.yaml index b85b8d8..f230214 100644 --- a/k3s/join/api-deployment.yaml +++ b/k3s/join/api-deployment.yaml @@ -34,6 +34,20 @@ spec: env: - name: NODE_CONFIG_PATH value: "/root/node_config.json" + - name: DAPI_API__PUBLIC_SERVER_PORT + value: "9000" + - name: DAPI_API__ML_SERVER_PORT + value: "9100" + - name: DAPI_API__ADMIN_SERVER_PORT + value: "9200" + - name: DAPI_API__POC_CALLBACK_URL + value: "http://api-private:9100" + - name: DAPI_CHAIN_NODE__URL + value: "http://node:26657" + - name: DAPI_CHAIN_NODE__SEED_API_URL + value: "http://34.9.136.116:30000" + - name: DAPI_CHAIN_NODE__P2P_URL + value: "http://node:26656" volumeMounts: - name: inference-data mountPath: /root/.inference diff --git a/k3s/join/config.yaml b/k3s/join/config.yaml index 3e84217..c136d43 100644 --- a/k3s/join/config.yaml +++ b/k3s/join/config.yaml @@ 
-10,12 +10,8 @@ data: P2P_EXTERNAL_ADDRESS: "tcp://:" # e.g., port 5000 # Identical for all join workers - DAPI_CHAIN_NODE__SEED_API_URL: "http://34.9.136.116:30000" SEED_NODE_RPC_URL: "http://34.9.136.116:30002" SEED_NODE_P2P_URL: "tcp://34.9.136.116:30001" - DAPI_API__POC_CALLBACK_URL: "http://api-private:9100" - DAPI_CHAIN_NODE__URL: "http://node:26657" - DAPI_CHAIN_NODE__P2P_URL: "http://node:26656" RPC_SERVER_URL_1: "http://34.9.136.116:30002" RPC_SERVER_URL_2: "http://34.9.136.116:30002" TMKMS_PORT: "26658" @@ -26,8 +22,3 @@ data: CONFIG_p2p__handshake_timeout: "30s" CONFIG_p2p__dial_timeout: "30s" TKMS_PORT: "26658" # Port node listens on for TMKMS - - # Additional for API service from docker-compose (if not already covered) - DAPI_API__PUBLIC_SERVER_PORT: "9000" # Internal port for API service, API_PORT is for host exposure - DAPI_API__ML_SERVER_PORT: "9100" - DAPI_API__ADMIN_SERVER_PORT: "9200" diff --git a/k3s/join/tmkms-deployment.yaml b/k3s/join/tmkms-deployment.yaml index fcd1a7e..e7290e1 100644 --- a/k3s/join/tmkms-deployment.yaml +++ b/k3s/join/tmkms-deployment.yaml @@ -17,16 +17,9 @@ spec: enableServiceLinks: false imagePullSecrets: - name: ghcr-credentials - initContainers: - - name: init-tmkms-data - image: ghcr.io/product-science/tmkms-softsign-with-keygen:0.1.2 - command: ["sh", "-c", "if [ ! -f /root/stage-dir/config.toml ]; then echo 'Initializing tmkms data volume...'; cp -r /root/.tmkms/. 
/root/stage-dir/; else echo 'tmkms data volume already initialized.'; fi"] - volumeMounts: - - name: tmkms-data - mountPath: /root/stage-dir containers: - name: tmkms - image: ghcr.io/product-science/tmkms-softsign-with-keygen:0.1.2 + image: ghcr.io/product-science/tmkms-softsign-with-keygen:0.1.3 imagePullPolicy: IfNotPresent env: - name: VALIDATOR_LISTEN_ADDRESS From 8ac77054c1ca8bdd6b3b387efd4d4c63c2242cb5 Mon Sep 17 00:00:00 2001 From: dima Date: Thu, 15 May 2025 17:27:47 -0700 Subject: [PATCH 45/48] Remove redundant comments --- k3s/join/api-deployment.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/k3s/join/api-deployment.yaml b/k3s/join/api-deployment.yaml index f230214..862d260 100644 --- a/k3s/join/api-deployment.yaml +++ b/k3s/join/api-deployment.yaml @@ -55,9 +55,9 @@ spec: mountPath: /root/node_config.json subPath: node_config.json volumes: - - name: inference-data # This volume is now a hostPath + - name: inference-data hostPath: - path: /srv/dai/inference # Matches genesis setup + path: /srv/dai/inference type: DirectoryOrCreate - name: node-config configMap: From e00c94bef9ba3e657f8f7bb05261b3acf3e42ce8 Mon Sep 17 00:00:00 2001 From: dima Date: Thu, 15 May 2025 17:33:53 -0700 Subject: [PATCH 46/48] Move variables --- k3s/join/config.yaml | 10 ---------- k3s/join/node-statefulset.yaml | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/k3s/join/config.yaml b/k3s/join/config.yaml index c136d43..4653b27 100644 --- a/k3s/join/config.yaml +++ b/k3s/join/config.yaml @@ -10,15 +10,5 @@ data: P2P_EXTERNAL_ADDRESS: "tcp://:" # e.g., port 5000 # Identical for all join workers - SEED_NODE_RPC_URL: "http://34.9.136.116:30002" - SEED_NODE_P2P_URL: "tcp://34.9.136.116:30001" - RPC_SERVER_URL_1: "http://34.9.136.116:30002" - RPC_SERVER_URL_2: "http://34.9.136.116:30002" TMKMS_PORT: "26658" - - # Variables from docker-compose for node - TRUSTED_BLOCK_PERIOD: "2000" # Default from docker-compose - 
CONFIG_p2p__allow_duplicate_ip: "true" - CONFIG_p2p__handshake_timeout: "30s" - CONFIG_p2p__dial_timeout: "30s" TKMS_PORT: "26658" # Port node listens on for TMKMS diff --git a/k3s/join/node-statefulset.yaml b/k3s/join/node-statefulset.yaml index 2534e22..f108b12 100644 --- a/k3s/join/node-statefulset.yaml +++ b/k3s/join/node-statefulset.yaml @@ -36,6 +36,27 @@ spec: envFrom: - configMapRef: name: config + env: + - name: SEED_NODE_RPC_URL + value: "http://34.9.136.116:30002" + - name: SEED_NODE_P2P_URL + value: "tcp://34.9.136.116:30001" + - name: RPC_SERVER_URL_1 + value: "http://34.9.136.116:30002" + - name: RPC_SERVER_URL_2 + value: "http://34.9.136.116:30002" + - name: SNAPSHOT_INTERVAL + value: "1000" + - name: SNAPSHOT_KEEP_RECENT + value: "5" + - name: TRUSTED_BLOCK_PERIOD + value: "2000" + - name: CONFIG_p2p__allow_duplicate_ip + value: "true" + - name: CONFIG_p2p__handshake_timeout + value: "30s" + - name: CONFIG_p2p__dial_timeout + value: "30s" volumeMounts: - name: inference-data mountPath: /root/.inference # Shared data with API From b6280f1bb7b1a3d9b7df51149201c5271f4f96d8 Mon Sep 17 00:00:00 2001 From: dima Date: Thu, 15 May 2025 17:36:36 -0700 Subject: [PATCH 47/48] change tmkms image version --- k3s/README.md | 15 ++++++++++++++- k3s/join/tmkms-deployment.yaml | 2 +- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/k3s/README.md b/k3s/README.md index 787f558..57e14c2 100644 --- a/k3s/README.md +++ b/k3s/README.md @@ -48,4 +48,17 @@ Clean state gcloud compute ssh k8s-worker-1 --zone us-central1-a --command "sudo rm -rf /srv/dai" gcloud compute ssh k8s-worker-2 --zone us-central1-a --command "sudo rm -rf /srv/dai" gcloud compute ssh k8s-worker-3 --zone us-central1-a --command "sudo rm -rf /srv/dai" -``` \ No newline at end of file +``` + +Stop all +```bash +kubectl delete all --all -n genesis +kubectl delete all --all -n join-k8s-worker-2 +kubectl delete pvc tmkms-data-pvc -n join-k8s-worker-2 +kubectl delete all --all -n 
join-k8s-worker-3 +kubectl delete pvc tmkms-data-pvc -n join-k8s-worker-3 + +gcloud compute ssh k8s-worker-1 --zone us-central1-a --command "sudo rm -rf /srv/dai" +gcloud compute ssh k8s-worker-2 --zone us-central1-a --command "sudo rm -rf /srv/dai" +gcloud compute ssh k8s-worker-3 --zone us-central1-a --command "sudo rm -rf /srv/dai" +``` diff --git a/k3s/join/tmkms-deployment.yaml b/k3s/join/tmkms-deployment.yaml index e7290e1..a4210a0 100644 --- a/k3s/join/tmkms-deployment.yaml +++ b/k3s/join/tmkms-deployment.yaml @@ -19,7 +19,7 @@ spec: - name: ghcr-credentials containers: - name: tmkms - image: ghcr.io/product-science/tmkms-softsign-with-keygen:0.1.3 + image: ghcr.io/product-science/tmkms-softsign-with-keygen:32165a2 imagePullPolicy: IfNotPresent env: - name: VALIDATOR_LISTEN_ADDRESS From 9a840541938f1b732360463a3000cee5b26367a4 Mon Sep 17 00:00:00 2001 From: dima Date: Thu, 15 May 2025 17:44:23 -0700 Subject: [PATCH 48/48] Move TMKMS_PORT var into the deployment file --- k3s/join/config.yaml | 4 ---- k3s/join/node-statefulset.yaml | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/k3s/join/config.yaml b/k3s/join/config.yaml index 4653b27..51e533d 100644 --- a/k3s/join/config.yaml +++ b/k3s/join/config.yaml @@ -8,7 +8,3 @@ data: KEY_NAME: "" DAPI_API_PUBLIC_URL: "http://:" P2P_EXTERNAL_ADDRESS: "tcp://:" # e.g., port 5000 - - # Identical for all join workers - TMKMS_PORT: "26658" - TKMS_PORT: "26658" # Port node listens on for TMKMS diff --git a/k3s/join/node-statefulset.yaml b/k3s/join/node-statefulset.yaml index f108b12..4123303 100644 --- a/k3s/join/node-statefulset.yaml +++ b/k3s/join/node-statefulset.yaml @@ -57,6 +57,10 @@ spec: value: "30s" - name: CONFIG_p2p__dial_timeout value: "30s" + - name: TKMS_PORT + value: "26658" + - name: TMKMS_PORT + value: "26658" volumeMounts: - name: inference-data mountPath: /root/.inference # Shared data with API