diff --git a/k3s/README.md b/k3s/README.md new file mode 100644 index 0000000..57e14c2 --- /dev/null +++ b/k3s/README.md @@ -0,0 +1,64 @@ +Run genesis node + +```bash +kubectl create namespace genesis # if not already created +kubectl apply -k k3s/genesis -n genesis +``` + +Stop genesis node +```bash +kubectl delete all --all -n genesis +``` + +Run join-worker-2 + +```bash +kubectl create namespace join-k8s-worker-2 # if not already created +``` + +```bash +kubectl apply -k k3s/overlays/join-k8s-worker-2 -n join-k8s-worker-2 +``` + +Stop join-worker-2 +```bash +kubectl delete all --all -n join-k8s-worker-2 + +# To delete pvc +kubectl delete pvc tmkms-data-pvc -n join-k8s-worker-2 +``` + +Run join-worker-3 + +```bash +kubectl create namespace join-k8s-worker-3 # if not already created +kubectl apply -k k3s/overlays/join-k8s-worker-3 -n join-k8s-worker-3 +``` + +Stop join-worker-3 +```bash +kubectl delete all --all -n join-k8s-worker-3 + +# To delete pvc +kubectl delete pvc tmkms-data-pvc -n join-k8s-worker-3 +``` + +Clean state +```bash +gcloud compute ssh k8s-worker-1 --zone us-central1-a --command "sudo rm -rf /srv/dai" +gcloud compute ssh k8s-worker-2 --zone us-central1-a --command "sudo rm -rf /srv/dai" +gcloud compute ssh k8s-worker-3 --zone us-central1-a --command "sudo rm -rf /srv/dai" +``` + +Stop all +```bash +kubectl delete all --all -n genesis +kubectl delete all --all -n join-k8s-worker-2 +kubectl delete pvc tmkms-data-pvc -n join-k8s-worker-2 +kubectl delete all --all -n join-k8s-worker-3 +kubectl delete pvc tmkms-data-pvc -n join-k8s-worker-3 + +gcloud compute ssh k8s-worker-1 --zone us-central1-a --command "sudo rm -rf /srv/dai" +gcloud compute ssh k8s-worker-2 --zone us-central1-a --command "sudo rm -rf /srv/dai" +gcloud compute ssh k8s-worker-3 --zone us-central1-a --command "sudo rm -rf /srv/dai" +``` diff --git a/k3s/common/api-private-service.yaml b/k3s/common/api-private-service.yaml new file mode 100644 index 0000000..742317d --- /dev/null +++ 
b/k3s/common/api-private-service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: api-private # RENAMED from api-internal + labels: + app: api +spec: + type: ClusterIP + selector: + app: api + ports: + - name: ml-server + port: 9100 + targetPort: 9100 # Assuming api-deployment.yaml containerPort is 9100 (or named ml-api which resolves to 9100) + - name: admin + port: 9200 + targetPort: 9200 # Assuming api-deployment.yaml containerPort is 9200 (or named admin-api which resolves to 9200) \ No newline at end of file diff --git a/k3s/common/inference-service.yaml b/k3s/common/inference-service.yaml new file mode 100644 index 0000000..9605bd6 --- /dev/null +++ b/k3s/common/inference-service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: inference + labels: + app: inference +spec: + type: ClusterIP # Default, but explicit for clarity + selector: + app: inference + ports: + - name: poc-api + port: 8080 + targetPort: 8080 + - name: inference-api + port: 5000 + targetPort: 5000 diff --git a/k3s/common/kustomization.yaml b/k3s/common/kustomization.yaml new file mode 100644 index 0000000..a8b0d80 --- /dev/null +++ b/k3s/common/kustomization.yaml @@ -0,0 +1,9 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - api-private-service.yaml + - inference-service.yaml + + # Common ConfigMaps + - node-configs/node-config-configmap.yaml diff --git a/k3s/common/node-configs/node-config-configmap.yaml b/k3s/common/node-configs/node-config-configmap.yaml new file mode 100644 index 0000000..0b3d241 --- /dev/null +++ b/k3s/common/node-configs/node-config-configmap.yaml @@ -0,0 +1,23 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: node-config-cm # This name is referenced by genesis workloads +data: + node_config.json: | + [ + { + "id": "mlnode1", + "host": "inference", + "inference_port": 5000, + "poc_port": 8080, + "max_concurrent": 500, + "models": { + "Qwen/Qwen2.5-7B-Instruct": { + "args": [ + 
"--quantization", + "fp8" + ] + } + } + } + ] diff --git a/k3s/genesis/README.md b/k3s/genesis/README.md new file mode 100644 index 0000000..95f8e6d --- /dev/null +++ b/k3s/genesis/README.md @@ -0,0 +1,148 @@ +# Genesis Node K3s Deployment + +This directory contains Kubernetes manifests for deploying the Genesis Node on a k3s cluster. + +## Prerequisites + +- A running k3s cluster with at least one worker node that has GPU support (`k8s-worker-1`) +- `kubectl` configured to access your cluster +- SSH access from your management machine (or `k8s-control-plane`) to `k8s-worker-1` for state cleaning. +- `stern` (optional, for improved log viewing) + +## Deployment + +1. **Configure kubectl** either by: + - Copying `/etc/rancher/k3s/k3s.yaml` from `k8s-control-plane` to `~/.kube/config` locally, or + - Setting up an SSH tunnel (see Appendix) + +2. **Set up GitHub Container Registry authentication**: + ```bash + kubectl create secret docker-registry ghcr-credentials \ + --docker-server=ghcr.io \ + --docker-username=YOUR_GITHUB_USERNAME \ + --docker-password=YOUR_GITHUB_TOKEN + ``` + Replace `YOUR_GITHUB_USERNAME` with your GitHub username and `YOUR_GITHUB_TOKEN` with a Personal Access Token that has `read:packages` permission. (If the secret already exists, this command will fail, which is fine.) + +3. **Deploy the Genesis Node**: + ```bash + kubectl apply -f . + ``` + +4. **Verify deployment**: + ```bash + kubectl get pods -w + ``` + Wait until all pods (`node-0`, `api-*`, `tmkms-*`, `inference-*`) show `Running` status. 
+ +## Managing the Deployment + +### View Logs + +**Using kubectl** (individual components): +```bash +kubectl logs -f node-0 # Node logs +kubectl logs -f $(kubectl get pod -l app=api -o name) # API logs +kubectl logs -f $(kubectl get pod -l app=tmkms -o name) # TMKMS logs +kubectl logs -f $(kubectl get pod -l app=inference -o name) # Inference logs +``` + +**Using stern** (all components): +```bash +stern 'node|api|tmkms|inference' --exclude-container=POD +``` + +### Restart Components + +```bash +kubectl rollout restart statefulset/node # Restart node +kubectl rollout restart deployment/api # Restart API +kubectl rollout restart deployment/tmkms # Restart TMKMS +kubectl rollout restart deployment/inference # Restart inference +``` + +### Update Configuration + +1. Edit the ConfigMap: + ```bash + kubectl edit configmap config + ``` + +2. Restart affected components: + ```bash + kubectl rollout restart statefulset/node deployment/api + ``` + +### Stop Deployment (Delete Kubernetes Resources) + +This stops the application but leaves data on the `hostPath` volumes intact. + +```bash +kubectl delete -f . +``` + +### Clean Restart (Delete Kubernetes Resources and Clear State) + +This performs a full reset, deleting Kubernetes resources and clearing persisted data from `hostPath` volumes on `k8s-worker-1`, `k8s-worker-2`, and `k8s-worker-3`. + +**1. Delete Existing Kubernetes Application Resources:** + Run this from where your `kubectl` is configured (e.g., your local machine or `k8s-control-plane`): + ```bash + kubectl delete -f . --ignore-not-found=true # Deletes app resources; resources that no longer exist in the cluster are skipped instead of raising an error + kubectl delete job clear-worker-state-job --ignore-not-found=true # Ensure previous job is cleaned up + ``` + Wait for all resources to be terminated. + +**2. Clear HostPath Volume Data using a Kubernetes Job:** + Apply the `clear-state-job.yaml` manifest. 
This job will run pods on `k8s-worker-1`, `k8s-worker-2`, and `k8s-worker-3` to delete the contents of the specified host directories. + ```bash + kubectl apply -f clear-state-job.yaml + ``` + +**3. Monitor the State Clearing Job:** + Check the status of the job: + ```bash + kubectl get job clear-worker-state-job -w + ``` + Wait for the job to show `COMPLETIONS` as `3/3`. + + View logs from the job's pods to confirm successful clearance on each node: + ```bash + kubectl logs -l app=clear-worker-state --tail=-1 # Shows all logs from all pods of the job + ``` + +**4. Delete the State Clearing Job (Important):** + Once the job is complete, delete it to avoid re-running it accidentally and to clean up the completed pods. + ```bash + kubectl delete job clear-worker-state-job + ``` + +**5. Re-deploy Application:** + Follow steps 2-4 from the main [Deployment](#deployment) section (create GHCR secret if needed, then `kubectl apply -f .` excluding `clear-state-job.yaml` if you re-applied everything from the directory). + + A safer re-deploy command after cleanup: + ```bash + kubectl apply -f api-deployment.yaml -f api-service.yaml -f config.yaml -f genesis-overrides-configmap.yaml -f inference-deployment.yaml -f inference-service.yaml -f node-config-configmap.yaml -f node-service.yaml -f node-statefulset.yaml -f tmkms-deployment.yaml + ``` + + *Note: The `initContainer` in `tmkms-deployment.yaml` should handle permissions for its directory. 
If permission issues arise for `/srv/dai/inference` (used by `node` and `api`), consider adding similar `initContainers` to their respective manifests.* + +## Appendix: SSH Tunnel Setup + +If accessing the cluster remotely from your local machine, set up an SSH tunnel: + +```bash +# Start tunnel +gcloud compute ssh k8s-control-plane \ + --project=YOUR_GCP_PROJECT_ID \ + --zone=YOUR_GCE_INSTANCE_ZONE \ + -- -L 6443:127.0.0.1:6443 -N -f + +# Check tunnel status +pgrep -f 'ssh.*-L 6443:127.0.0.1:6443' > /dev/null && echo "Tunnel ACTIVE" || echo "Tunnel NOT ACTIVE" + +# Kill tunnel +pkill -f 'ssh.*-L 6443:127.0.0.1:6443' +``` + +Update your kubeconfig's server field to: `https://127.0.0.1:6443` diff --git a/k3s/genesis/api-deployment.yaml b/k3s/genesis/api-deployment.yaml new file mode 100644 index 0000000..92827df --- /dev/null +++ b/k3s/genesis/api-deployment.yaml @@ -0,0 +1,67 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api + labels: + app: api +spec: + replicas: 1 + selector: + matchLabels: + app: api + template: + metadata: + labels: + app: api + spec: + enableServiceLinks: false # Prevent k8s from propagating vars like {SERVICE_NAME}_PORT inside the container + nodeSelector: + kubernetes.io/hostname: k8s-worker-1 + imagePullSecrets: + - name: ghcr-credentials + containers: + - name: api + image: ghcr.io/product-science/api:0.1.2 + ports: + - containerPort: 9000 + name: public + - containerPort: 9100 + name: ml-server + - containerPort: 9200 + name: admin + env: + - name: KEY_NAME + value: "genesis" + - name: DAPI_API__POC_CALLBACK_URL + value: "http://api-private:9100" + - name: DAPI_API__PUBLIC_URL + valueFrom: + configMapKeyRef: + name: config + key: DAPI_API_PUBLIC_URL + - name: DAPI_CHAIN_NODE__IS_GENESIS + value: "true" + - name: DAPI_CHAIN_NODE__URL + value: "http://node:26657" # Assumes 'node' service DNS resolves + - name: NODE_CONFIG_PATH + value: "/root/node_config.json" + - name: DAPI_API__PUBLIC_SERVER_PORT + value: "9000" + - name: 
DAPI_API__ML_SERVER_PORT + value: "9100" + - name: DAPI_API__ADMIN_SERVER_PORT + value: "9200" + volumeMounts: + - name: data + mountPath: /root/.inference + - name: node-config + mountPath: /root/node_config.json + subPath: node_config.json + volumes: + - name: data + hostPath: + path: /srv/dai/inference + type: DirectoryOrCreate + - name: node-config + configMap: + name: node-config-cm diff --git a/k3s/genesis/api-public-service.yaml b/k3s/genesis/api-public-service.yaml new file mode 100644 index 0000000..50b51a1 --- /dev/null +++ b/k3s/genesis/api-public-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: api-public + labels: + app: api +spec: + type: NodePort + selector: + app: api + ports: + - name: public + port: 9000 + targetPort: 9000 + nodePort: 30000 diff --git a/k3s/genesis/config.yaml b/k3s/genesis/config.yaml new file mode 100644 index 0000000..dfca7df --- /dev/null +++ b/k3s/genesis/config.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: config +data: + DAPI_API_PUBLIC_URL: "http://34.9.136.116:30000" + P2P_EXTERNAL_ADDRESS: "tcp://34.9.136.116:30001" diff --git a/k3s/genesis/genesis-overrides-configmap.yaml b/k3s/genesis/genesis-overrides-configmap.yaml new file mode 100644 index 0000000..e2ad629 --- /dev/null +++ b/k3s/genesis/genesis-overrides-configmap.yaml @@ -0,0 +1,52 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: genesis-overrides-cm +data: + genesis_overrides.json: | + { + "app_state": { + "gov": { + "params": { + "voting_period": "24h0m0s", + "min_deposit": [ + { + "denom": "nicoin", + "amount": "25_000_000" + } + ], + "expedited_voting_period": "1h0m0s", + "expedited_min_deposit": [ + { + "denom": "nicoin", + "amount": "50_000_000" + } + ], + "quorum": "0.01000000000000000" + } + }, + "inference": { + "params": { + "epochParams": { + "epoch_length": "250", + "epoch_multiplier": "1", + "epoch_shift": "200", + "poc_stage_duration": "60", + "poc_exchange_duration": "5", + 
"poc_validation_delay": "5", + "poc_validation_duration": "20", + "default_unit_of_compute_price": "100" + } + }, + "genesis_only_params": { + "total_supply": "1000", + "originator_supply": "160", + "top_reward_amount": "120", + "standard_reward_amount": "600", + "pre_programmed_sale_amount": "120", + "top_rewards": 3, + "supply_denom": "mcicoin" + } + } + } + } diff --git a/k3s/genesis/inference-deployment.yaml b/k3s/genesis/inference-deployment.yaml new file mode 100644 index 0000000..1f731dc --- /dev/null +++ b/k3s/genesis/inference-deployment.yaml @@ -0,0 +1,52 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: inference + labels: + app: inference +spec: + replicas: 1 + selector: + matchLabels: + app: inference + template: + metadata: + labels: + app: inference + spec: + enableServiceLinks: false # Prevent k8s from propagating vars like {SERVICE_NAME}_PORT inside the container + nodeSelector: + kubernetes.io/hostname: k8s-worker-1 + hostIPC: true + imagePullSecrets: + - name: ghcr-credentials + containers: + - name: inference + image: ghcr.io/product-science/mlnode:3.0.4-alpha2 + command: + - /app/entrypoint.sh + - uvicorn + - api.app:app + - --host=0.0.0.0 + - --port=8080 + ports: + - containerPort: 8080 + name: poc-api + - containerPort: 5000 + name: inference-api + env: + - name: HF_HOME + value: "/root/.cache" + - name: VLLM_ATTENTION_BACKEND + value: "FLASHINFER" + resources: + limits: + nvidia.com/gpu: "1" + volumeMounts: + - name: cache + mountPath: /root/.cache + volumes: + - name: cache + hostPath: + path: /mnt/shared + type: DirectoryOrCreate diff --git a/k3s/genesis/kustomization.yaml b/k3s/genesis/kustomization.yaml new file mode 100644 index 0000000..9ca7fc6 --- /dev/null +++ b/k3s/genesis/kustomization.yaml @@ -0,0 +1,16 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# Changed 'bases:' to 'resources:' to include the common kustomization +resources: + - ../common # This now correctly lists the common kustomization 
under resources + + - config.yaml + - genesis-overrides-configmap.yaml + + # Workloads specific to Genesis + - node-statefulset.yaml + - api-deployment.yaml + - inference-deployment.yaml + - api-public-service.yaml + - node-service.yaml diff --git a/k3s/genesis/node-service.yaml b/k3s/genesis/node-service.yaml new file mode 100644 index 0000000..50f4ad8 --- /dev/null +++ b/k3s/genesis/node-service.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: Service +metadata: + name: node + labels: + app: node +spec: + type: NodePort + selector: + app: node + ports: + - name: tendermint-p2p + port: 26656 + targetPort: 26656 + nodePort: 30001 + - name: tendermint-rpc + port: 26657 + targetPort: 26657 + nodePort: 30002 diff --git a/k3s/genesis/node-statefulset.yaml b/k3s/genesis/node-statefulset.yaml new file mode 100644 index 0000000..4bd5eb3 --- /dev/null +++ b/k3s/genesis/node-statefulset.yaml @@ -0,0 +1,69 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: node + labels: + app: node +spec: + serviceName: "node" + replicas: 1 + selector: + matchLabels: + app: node + template: + metadata: + labels: + app: node + spec: + enableServiceLinks: false # Prevent k8s from propagating vars like {SERVICE_NAME}_PORT inside the container + nodeSelector: + kubernetes.io/hostname: k8s-worker-1 + imagePullSecrets: + - name: ghcr-credentials + containers: + - name: node + image: ghcr.io/product-science/inferenced:0.1.2 + command: ["sh", "./init-docker-genesis.sh"] + ports: + - containerPort: 26656 + name: tendermint-p2p + - containerPort: 26657 + name: tendermint-rpc + env: + - name: KEY_NAME + value: "genesis" + - name: SNAPSHOT_INTERVAL + value: "1000" + - name: SNAPSHOT_KEEP_RECENT + value: "5" + - name: NODE_CONFIG_PATH + value: "/root/node_config.json" + - name: P2P_EXTERNAL_ADDRESS + valueFrom: + configMapKeyRef: + name: config + key: P2P_EXTERNAL_ADDRESS + - name: CONFIG_p2p__allow_duplicate_ip + value: "true" + - name: CONFIG_p2p__handshake_timeout + value: "30s" + - name: 
CONFIG_p2p__dial_timeout
+              value: "30s"
+            - name: INIT_TGBOT
+              value: "true"
+            - name: TGBOT_PRIVATE_KEY_PASS
+              value: "defaultpassword"
+          volumeMounts:
+            - name: data
+              mountPath: /root/.inference
+            - name: genesis-overrides
+              mountPath: /root/genesis_overrides.json
+              subPath: genesis_overrides.json
+      volumes:
+        - name: data
+          hostPath:
+            path: /srv/dai/inference
+            type: DirectoryOrCreate
+        - name: genesis-overrides
+          configMap:
+            name: genesis-overrides-cm
diff --git a/k3s/jobs/clear-state-job.yaml b/k3s/jobs/clear-state-job.yaml
new file mode 100644
index 0000000..3977773
--- /dev/null
+++ b/k3s/jobs/clear-state-job.yaml
@@ -0,0 +1,81 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: clear-worker-state-job
+  labels:
+    app: clear-worker-state
+spec:
+  parallelism: 3 # Three pods run at once, one intended for each listed worker
+  completions: 3
+  backoffLimit: 2 # Number of retries before marking job as failed
+  template:
+    metadata:
+      labels:
+        app: clear-worker-state # For log selection and for the anti-affinity rule below
+    spec:
+      restartPolicy: Never # Or OnFailure
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/hostname
+                    operator: In
+                    values:
+                      - k8s-worker-1
+                      - k8s-worker-2
+                      - k8s-worker-3
+        podAntiAffinity: # Never co-locate two running clearing pods, so the three parallel pods land on three different workers
+          requiredDuringSchedulingIgnoredDuringExecution:
+            - labelSelector:
+                matchLabels:
+                  app: clear-worker-state
+              topologyKey: kubernetes.io/hostname
+      containers:
+        - name: state-clearer
+          image: busybox
+          command:
+            - sh
+            - -c
+            - |
+              # Literal block (|): one command per line. A folded block (>) would join the lines, turning "\" and "#" into part of the command.
+              set -eu
+              echo "Running state clearance on node: $(NODE_NAME)"
+              # -mindepth 1 keeps the mounted directory itself and removes everything inside it, hidden entries included.
+              find /mnt_host/srv_dai_inference -mindepth 1 -maxdepth 1 -exec rm -rf {} \;
+              find /mnt_host/srv_dai_tmkms_data -mindepth 1 -maxdepth 1 -exec rm -rf {} \;
+              # Uncomment below to clear /mnt/shared as well
+              # find /mnt_host/mnt_shared -mindepth 1 -maxdepth 1 -exec rm -rf {} \;
+              echo "Successfully cleared state directories on $(NODE_NAME)"
+              sleep 5 # Keep pod alive for a few seconds for log visibility
+          env:
+            - name: NODE_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+          securityContext:
+            runAsUser: 0 # Run as root to have permissions for rm -rf on
host paths + # If runAsUser:0 is not sufficient, uncomment below (less secure): + # privileged: true + volumeMounts: + - name: host-srv-dai-inference + mountPath: /mnt_host/srv_dai_inference + - name: host-srv-dai-tmkms-data + mountPath: /mnt_host/srv_dai_tmkms_data + # Uncomment below if clearing /mnt/shared + # - name: host-mnt-shared + # mountPath: /mnt_host/mnt_shared + volumes: + - name: host-srv-dai-inference + hostPath: + path: /srv/dai/inference + type: DirectoryOrCreate + - name: host-srv-dai-tmkms-data + hostPath: + path: /srv/dai/tmkms_data + type: DirectoryOrCreate + # Uncomment below if clearing /mnt/shared + # - name: host-mnt-shared + # hostPath: + # path: /mnt/shared + # type: DirectoryOrCreate \ No newline at end of file diff --git a/k3s/join/api-deployment.yaml b/k3s/join/api-deployment.yaml new file mode 100644 index 0000000..862d260 --- /dev/null +++ b/k3s/join/api-deployment.yaml @@ -0,0 +1,64 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api + labels: + app: api +spec: + replicas: 1 + selector: + matchLabels: + app: api + template: + metadata: + labels: + app: api + spec: + enableServiceLinks: false + imagePullSecrets: + - name: ghcr-credentials + containers: + - name: api + image: ghcr.io/product-science/api:0.1.2 + imagePullPolicy: IfNotPresent + ports: + - containerPort: 9000 + name: public-api + - containerPort: 9100 + name: ml-api + - containerPort: 9200 + name: admin-api + envFrom: + - configMapRef: + name: config + env: + - name: NODE_CONFIG_PATH + value: "/root/node_config.json" + - name: DAPI_API__PUBLIC_SERVER_PORT + value: "9000" + - name: DAPI_API__ML_SERVER_PORT + value: "9100" + - name: DAPI_API__ADMIN_SERVER_PORT + value: "9200" + - name: DAPI_API__POC_CALLBACK_URL + value: "http://api-private:9100" + - name: DAPI_CHAIN_NODE__URL + value: "http://node:26657" + - name: DAPI_CHAIN_NODE__SEED_API_URL + value: "http://34.9.136.116:30000" + - name: DAPI_CHAIN_NODE__P2P_URL + value: "http://node:26656" + volumeMounts: + 
- name: inference-data + mountPath: /root/.inference + - name: node-config + mountPath: /root/node_config.json + subPath: node_config.json + volumes: + - name: inference-data + hostPath: + path: /srv/dai/inference + type: DirectoryOrCreate + - name: node-config + configMap: + name: node-config-cm diff --git a/k3s/join/config.yaml b/k3s/join/config.yaml new file mode 100644 index 0000000..51e533d --- /dev/null +++ b/k3s/join/config.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: config +data: + # Replace <...> placeholders with actual values for each worker + # From config.env.template + KEY_NAME: "" + DAPI_API_PUBLIC_URL: "http://:" + P2P_EXTERNAL_ADDRESS: "tcp://:" # e.g., port 5000 diff --git a/k3s/join/inference-deployment.yaml b/k3s/join/inference-deployment.yaml new file mode 100644 index 0000000..8641d3b --- /dev/null +++ b/k3s/join/inference-deployment.yaml @@ -0,0 +1,53 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: inference + labels: + app: inference +spec: + replicas: 1 + selector: + matchLabels: + app: inference + template: + metadata: + labels: + app: inference + spec: + enableServiceLinks: false + hostIPC: true + imagePullSecrets: + - name: ghcr-credentials + containers: + - name: inference + image: ghcr.io/product-science/mlnode:3.0.4-alpha2 + imagePullPolicy: IfNotPresent + command: + - "/app/entrypoint.sh" + - "uvicorn" + - "api.app:app" + - "--host=0.0.0.0" + - "--port=8080" + ports: + - containerPort: 8080 + name: poc-api + - containerPort: 5000 + name: inference-api + env: + - name: HF_HOME + value: "/root/.cache" + - name: HF_HUB_ENABLE_HF_TRANSFER + value: "true" + - name: VLLM_ATTENTION_BACKEND + value: "FLASHINFER" + resources: + limits: + nvidia.com/gpu: "1" # Request 1 GPU + volumeMounts: + - name: cache + mountPath: /root/.cache # HF_HOME points here; this mount is backed by the hostPath /mnt/shared. 
+ volumes: + - name: cache + hostPath: + path: /mnt/shared # Matches genesis; ensure this path exists on worker nodes + type: DirectoryOrCreate diff --git a/k3s/join/kustomization.yaml b/k3s/join/kustomization.yaml new file mode 100644 index 0000000..04db6c4 --- /dev/null +++ b/k3s/join/kustomization.yaml @@ -0,0 +1,12 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../common + + - tmkms-pvc.yaml + - tmkms-deployment.yaml + - node-statefulset.yaml + - api-deployment.yaml + - inference-deployment.yaml + - config.yaml diff --git a/k3s/join/node-statefulset.yaml b/k3s/join/node-statefulset.yaml new file mode 100644 index 0000000..4123303 --- /dev/null +++ b/k3s/join/node-statefulset.yaml @@ -0,0 +1,71 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: node + labels: + app: node +spec: + serviceName: "node" + replicas: 1 # Typically 1 for a specific join node identity + selector: + matchLabels: + app: node + template: + metadata: + labels: + app: node + spec: + enableServiceLinks: false + imagePullSecrets: # Add if your image is in a private registry + - name: ghcr-credentials + containers: + - name: node + image: ghcr.io/product-science/inferenced:0.1.2 + imagePullPolicy: IfNotPresent + command: + - "sh" + - "-c" + - "./init-docker.sh" # Assuming this script is in the image's WORKDIR/PATH + ports: + - containerPort: 26656 + name: tendermint-p2p + - containerPort: 26657 + name: tendermint-rpc + - containerPort: 26658 + name: tmkms + envFrom: + - configMapRef: + name: config + env: + - name: SEED_NODE_RPC_URL + value: "http://34.9.136.116:30002" + - name: SEED_NODE_P2P_URL + value: "tcp://34.9.136.116:30001" + - name: RPC_SERVER_URL_1 + value: "http://34.9.136.116:30002" + - name: RPC_SERVER_URL_2 + value: "http://34.9.136.116:30002" + - name: SNAPSHOT_INTERVAL + value: "1000" + - name: SNAPSHOT_KEEP_RECENT + value: "5" + - name: TRUSTED_BLOCK_PERIOD + value: "2000" + - name: CONFIG_p2p__allow_duplicate_ip + value: 
"true" + - name: CONFIG_p2p__handshake_timeout + value: "30s" + - name: CONFIG_p2p__dial_timeout + value: "30s" + - name: TKMS_PORT + value: "26658" + - name: TMKMS_PORT + value: "26658" + volumeMounts: + - name: inference-data + mountPath: /root/.inference # Shared data with API + volumes: + - name: inference-data # This volume is now a hostPath + hostPath: + path: /srv/dai/inference # Matches genesis setup + type: DirectoryOrCreate diff --git a/k3s/join/tmkms-deployment.yaml b/k3s/join/tmkms-deployment.yaml new file mode 100644 index 0000000..a4210a0 --- /dev/null +++ b/k3s/join/tmkms-deployment.yaml @@ -0,0 +1,36 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tmkms + labels: + app: tmkms +spec: + replicas: 1 + selector: + matchLabels: + app: tmkms + template: + metadata: + labels: + app: tmkms + spec: + enableServiceLinks: false + imagePullSecrets: + - name: ghcr-credentials + containers: + - name: tmkms + image: ghcr.io/product-science/tmkms-softsign-with-keygen:32165a2 + imagePullPolicy: IfNotPresent + env: + - name: VALIDATOR_LISTEN_ADDRESS + value: "tcp://node:26658" + ports: + - containerPort: 26658 # Though tmkms connects out, its good practice to declare if it also listens on a port, even if not exposed via service + # The docker-compose doesn't explicitly expose a port for tmkms itself. 
+ volumeMounts: + - name: tmkms-data + mountPath: /root/.tmkms + volumes: + - name: tmkms-data + persistentVolumeClaim: + claimName: tmkms-data-pvc diff --git a/k3s/join/tmkms-pvc.yaml b/k3s/join/tmkms-pvc.yaml new file mode 100644 index 0000000..7959e4f --- /dev/null +++ b/k3s/join/tmkms-pvc.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: tmkms-data-pvc +spec: + accessModes: + - ReadWriteOnce # Suitable for a single TMKMS pod + resources: + requests: + storage: 1Gi # Adjust size as needed for TMKMS data + # storageClassName: # Optional: specify if you have a particular storage class diff --git a/k3s/overlays/join-k8s-worker-2/api-deployment-patch.yaml b/k3s/overlays/join-k8s-worker-2/api-deployment-patch.yaml new file mode 100644 index 0000000..c2c660e --- /dev/null +++ b/k3s/overlays/join-k8s-worker-2/api-deployment-patch.yaml @@ -0,0 +1,9 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api +spec: + template: + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-2 \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-2/api-public-service.yaml b/k3s/overlays/join-k8s-worker-2/api-public-service.yaml new file mode 100644 index 0000000..cea37d9 --- /dev/null +++ b/k3s/overlays/join-k8s-worker-2/api-public-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: api-public + labels: + app: api +spec: + type: NodePort + selector: + app: api + ports: + - name: public + port: 9000 + targetPort: 9000 + nodePort: 30010 diff --git a/k3s/overlays/join-k8s-worker-2/config-patch.yaml b/k3s/overlays/join-k8s-worker-2/config-patch.yaml new file mode 100644 index 0000000..226cf74 --- /dev/null +++ b/k3s/overlays/join-k8s-worker-2/config-patch.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: config +data: + KEY_NAME: "join-k8s-worker-2" + DAPI_API__PUBLIC_URL: "35.192.7.224:30010" + P2P_EXTERNAL_ADDRESS: "35.192.7.224:30011" diff --git 
a/k3s/overlays/join-k8s-worker-2/inference-deployment-patch.yaml b/k3s/overlays/join-k8s-worker-2/inference-deployment-patch.yaml new file mode 100644 index 0000000..f28cae6 --- /dev/null +++ b/k3s/overlays/join-k8s-worker-2/inference-deployment-patch.yaml @@ -0,0 +1,9 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: inference +spec: + template: + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-2 \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-2/kustomization.yaml b/k3s/overlays/join-k8s-worker-2/kustomization.yaml new file mode 100644 index 0000000..ef452c4 --- /dev/null +++ b/k3s/overlays/join-k8s-worker-2/kustomization.yaml @@ -0,0 +1,14 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../join + - api-public-service.yaml + - node-service.yaml + +patchesStrategicMerge: + - tmkms-deployment-patch.yaml + - node-statefulset-patch.yaml + - api-deployment-patch.yaml + - inference-deployment-patch.yaml + - config-patch.yaml diff --git a/k3s/overlays/join-k8s-worker-2/node-service.yaml b/k3s/overlays/join-k8s-worker-2/node-service.yaml new file mode 100644 index 0000000..ba46099 --- /dev/null +++ b/k3s/overlays/join-k8s-worker-2/node-service.yaml @@ -0,0 +1,22 @@ +apiVersion: v1 +kind: Service +metadata: + name: node + labels: + app: node +spec: + type: NodePort + selector: + app: node + ports: + - name: tendermint-p2p + port: 26656 + targetPort: 26656 + nodePort: 30011 + - name: tendermint-rpc + port: 26657 + targetPort: 26657 + nodePort: 30012 + - name: tmkms + port: 26658 + targetPort: 26658 diff --git a/k3s/overlays/join-k8s-worker-2/node-statefulset-patch.yaml b/k3s/overlays/join-k8s-worker-2/node-statefulset-patch.yaml new file mode 100644 index 0000000..389ed99 --- /dev/null +++ b/k3s/overlays/join-k8s-worker-2/node-statefulset-patch.yaml @@ -0,0 +1,9 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: node +spec: + template: + spec: + nodeSelector: + 
kubernetes.io/hostname: k8s-worker-2 \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-2/tmkms-deployment-patch.yaml b/k3s/overlays/join-k8s-worker-2/tmkms-deployment-patch.yaml new file mode 100644 index 0000000..30660c9 --- /dev/null +++ b/k3s/overlays/join-k8s-worker-2/tmkms-deployment-patch.yaml @@ -0,0 +1,9 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tmkms +spec: + template: + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-2 \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-3/api-deployment-patch.yaml b/k3s/overlays/join-k8s-worker-3/api-deployment-patch.yaml new file mode 100644 index 0000000..a728ed7 --- /dev/null +++ b/k3s/overlays/join-k8s-worker-3/api-deployment-patch.yaml @@ -0,0 +1,9 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api +spec: + template: + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-3 \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-3/api-public-service.yaml b/k3s/overlays/join-k8s-worker-3/api-public-service.yaml new file mode 100644 index 0000000..0105a1a --- /dev/null +++ b/k3s/overlays/join-k8s-worker-3/api-public-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: api-public + labels: + app: api +spec: + type: NodePort + selector: + app: api + ports: + - name: public + port: 9000 + targetPort: 9000 + nodePort: 30020 diff --git a/k3s/overlays/join-k8s-worker-3/config-patch.yaml b/k3s/overlays/join-k8s-worker-3/config-patch.yaml new file mode 100644 index 0000000..af3419c --- /dev/null +++ b/k3s/overlays/join-k8s-worker-3/config-patch.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: config +data: + KEY_NAME: "join-k8s-worker-3" + DAPI_API__PUBLIC_URL: "34.9.17.182:30020" + P2P_EXTERNAL_ADDRESS: "34.9.17.182:30021" diff --git a/k3s/overlays/join-k8s-worker-3/inference-deployment-patch.yaml b/k3s/overlays/join-k8s-worker-3/inference-deployment-patch.yaml new file mode 
100644 index 0000000..fe57e37 --- /dev/null +++ b/k3s/overlays/join-k8s-worker-3/inference-deployment-patch.yaml @@ -0,0 +1,9 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: inference +spec: + template: + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-3 \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-3/kustomization.yaml b/k3s/overlays/join-k8s-worker-3/kustomization.yaml new file mode 100644 index 0000000..ef452c4 --- /dev/null +++ b/k3s/overlays/join-k8s-worker-3/kustomization.yaml @@ -0,0 +1,14 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../join + - api-public-service.yaml + - node-service.yaml + +patchesStrategicMerge: + - tmkms-deployment-patch.yaml + - node-statefulset-patch.yaml + - api-deployment-patch.yaml + - inference-deployment-patch.yaml + - config-patch.yaml diff --git a/k3s/overlays/join-k8s-worker-3/node-service.yaml b/k3s/overlays/join-k8s-worker-3/node-service.yaml new file mode 100644 index 0000000..9ed2fce --- /dev/null +++ b/k3s/overlays/join-k8s-worker-3/node-service.yaml @@ -0,0 +1,22 @@ +apiVersion: v1 +kind: Service +metadata: + name: node + labels: + app: node +spec: + type: NodePort + selector: + app: node + ports: + - name: tendermint-p2p + port: 26656 + targetPort: 26656 + nodePort: 30021 + - name: tendermint-rpc + port: 26657 + targetPort: 26657 + nodePort: 30022 + - name: tmkms + port: 26658 + targetPort: 26658 diff --git a/k3s/overlays/join-k8s-worker-3/node-statefulset-patch.yaml b/k3s/overlays/join-k8s-worker-3/node-statefulset-patch.yaml new file mode 100644 index 0000000..ff157cc --- /dev/null +++ b/k3s/overlays/join-k8s-worker-3/node-statefulset-patch.yaml @@ -0,0 +1,9 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: node +spec: + template: + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-3 \ No newline at end of file diff --git a/k3s/overlays/join-k8s-worker-3/tmkms-deployment-patch.yaml 
b/k3s/overlays/join-k8s-worker-3/tmkms-deployment-patch.yaml new file mode 100644 index 0000000..ed89191 --- /dev/null +++ b/k3s/overlays/join-k8s-worker-3/tmkms-deployment-patch.yaml @@ -0,0 +1,9 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tmkms +spec: + template: + spec: + nodeSelector: + kubernetes.io/hostname: k8s-worker-3 \ No newline at end of file