K8s #15 (Draft)

48 commits
All commits are by DimaOrekhovPS.

- `95b23e5` kubernetes manifests for the genesis node (May 13, 2025)
- `af3a876` Formatting fixes (May 13, 2025)
- `a3b76d0` k3s instructions for genesis node (May 13, 2025)
- `80cf802` Use secret to pull (May 13, 2025)
- `37e01bd` Add GitHub auth instruction (May 13, 2025)
- `28500e7` Try to fix tmkms-deployment.yaml (May 13, 2025)
- `d1f182f` More kuber changes (May 13, 2025)
- `fa21345` Refactor: move (May 13, 2025)
- `3baff3a` Add entrypoint to inference-deployment.yaml (May 14, 2025)
- `60c2cb7` Tweak again (May 14, 2025)
- `44a1edf` Add command to tmkms (May 14, 2025)
- `cffa4a6` tmkms changes (May 14, 2025)
- `a511db5` tweak ports (May 14, 2025)
- `24a6803` no tmkms for genesis machine! (May 14, 2025)
- `9298e42` enableServiceLinks: false (May 14, 2025)
- `e6538bf` Tweak ports a bit (May 14, 2025)
- `24e9a22` Tweak ports a bit again (May 14, 2025)
- `d0cf3ea` First draft of join manifests (May 14, 2025)
- `4206978` Delete inference-data PVC (May 14, 2025)
- `5fc3974` Tweaks of join manifests (May 14, 2025)
- `c546173` Uniform node config (May 14, 2025)
- `e00a987` move api-service.yaml (May 15, 2025)
- `2ac0c81` Move identical yml files to /common (May 15, 2025)
- `d6cb5d3` Make sure every deployment uses credentials (May 15, 2025)
- `e91e74e` Split API into two services (May 15, 2025)
- `3ec7117` Rename API service (May 15, 2025)
- `ac9fe4c` move configmap (May 15, 2025)
- `b89e577` Common base (May 15, 2025)
- `120ad16` .. (May 15, 2025)
- `76ded17` Genesis seems to work (May 15, 2025)
- `beffe72` patches (May 15, 2025)
- `96ef1a5` renames (May 15, 2025)
- `3768732` Fix port exposure (May 15, 2025)
- `ac9a3c7` x (May 15, 2025)
- `79129b4` move back to NodePort (May 15, 2025)
- `f319899` join overlays (May 15, 2025)
- `3a44f67` more fixes (May 15, 2025)
- `8964552` expose tmkms port (May 15, 2025)
- `dcf73ae` fixes (May 15, 2025)
- `b8700ce` fix node config path (May 15, 2025)
- `7270ade` comment tkms ports (May 15, 2025)
- `844167d` Fix DAPI_CHAIN_NODE__SEED_API_URL (May 15, 2025)
- `6aa8136` TMKMS works (May 15, 2025)
- `94ad72c` move some variable around (May 16, 2025)
- `8ac7705` Remove redundant comments (May 16, 2025)
- `e00c94b` Move variables (May 16, 2025)
- `b6280f1` change tmkms image version (May 16, 2025)
- `9a84054` Move TMKMS_PORT var into the deployment file (May 16, 2025)

64 changes: 64 additions & 0 deletions k3s/README.md
Run genesis node

```bash
kubectl create namespace genesis # if not already created
kubectl apply -k k3s/genesis -n genesis
```
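
To confirm the genesis workloads came up, you can watch the pods in that namespace:

```bash
kubectl get pods -n genesis -w
```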

Stop genesis node
```bash
kubectl delete all --all -n genesis
```

Run join-worker-2

```bash
kubectl create namespace join-k8s-worker-2 # if not already created
```

```bash
kubectl apply -k k3s/overlays/join-k8s-worker-2 -n join-k8s-worker-2
```

Stop join-worker-2
```bash
kubectl delete all --all -n join-k8s-worker-2

# To delete pvc
kubectl delete pvc tmkms-data-pvc -n join-k8s-worker-2
```

Run join-worker-3

```bash
kubectl create namespace join-k8s-worker-3 # if not already created
kubectl apply -k k3s/overlays/join-k8s-worker-3 -n join-k8s-worker-3
```

Stop join-worker-3
```bash
kubectl delete all --all -n join-k8s-worker-3

# To delete pvc
kubectl delete pvc tmkms-data-pvc -n join-k8s-worker-3
```

Clean state
```bash
gcloud compute ssh k8s-worker-1 --zone us-central1-a --command "sudo rm -rf /srv/dai"
gcloud compute ssh k8s-worker-2 --zone us-central1-a --command "sudo rm -rf /srv/dai"
gcloud compute ssh k8s-worker-3 --zone us-central1-a --command "sudo rm -rf /srv/dai"
```

Stop all
```bash
kubectl delete all --all -n genesis
kubectl delete all --all -n join-k8s-worker-2
kubectl delete pvc tmkms-data-pvc -n join-k8s-worker-2
kubectl delete all --all -n join-k8s-worker-3
kubectl delete pvc tmkms-data-pvc -n join-k8s-worker-3

gcloud compute ssh k8s-worker-1 --zone us-central1-a --command "sudo rm -rf /srv/dai"
gcloud compute ssh k8s-worker-2 --zone us-central1-a --command "sudo rm -rf /srv/dai"
gcloud compute ssh k8s-worker-3 --zone us-central1-a --command "sudo rm -rf /srv/dai"
```
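
The same teardown can also be expressed as a loop; a minimal sketch, assuming the namespace and worker-host naming conventions used above:

```bash
# Tear down the genesis and join namespaces
kubectl delete all --all -n genesis
for ns in join-k8s-worker-2 join-k8s-worker-3; do
  kubectl delete all --all -n "$ns"
  kubectl delete pvc tmkms-data-pvc -n "$ns"
done

# Clear persisted state on each worker host
for host in k8s-worker-1 k8s-worker-2 k8s-worker-3; do
  gcloud compute ssh "$host" --zone us-central1-a --command "sudo rm -rf /srv/dai"
done
```
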
17 changes: 17 additions & 0 deletions k3s/common/api-private-service.yaml
apiVersion: v1
kind: Service
metadata:
  name: api-private # RENAMED from api-internal
  labels:
    app: api
spec:
  type: ClusterIP
  selector:
    app: api
  ports:
    - name: ml-server
      port: 9100
      targetPort: 9100 # Assuming api-deployment.yaml containerPort is 9100 (or named ml-api which resolves to 9100)
    - name: admin
      port: 9200
      targetPort: 9200 # Assuming api-deployment.yaml containerPort is 9200 (or named admin-api which resolves to 9200)
17 changes: 17 additions & 0 deletions k3s/common/inference-service.yaml
apiVersion: v1
kind: Service
metadata:
  name: inference
  labels:
    app: inference
spec:
  type: ClusterIP # Default, but explicit for clarity
  selector:
    app: inference
  ports:
    - name: poc-api
      port: 8080
      targetPort: 8080
    - name: inference-api
      port: 5000
      targetPort: 5000
9 changes: 9 additions & 0 deletions k3s/common/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
  - api-private-service.yaml
  - inference-service.yaml

  # Common ConfigMaps
  - node-configs/node-config-configmap.yaml
23 changes: 23 additions & 0 deletions k3s/common/node-configs/node-config-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: node-config-cm # This name is referenced by genesis workloads
data:
  node_config.json: |
    [
      {
        "id": "mlnode1",
        "host": "inference",
        "inference_port": 5000,
        "poc_port": 8080,
        "max_concurrent": 500,
        "models": {
          "Qwen/Qwen2.5-7B-Instruct": {
            "args": [
              "--quantization",
              "fp8"
            ]
          }
        }
      }
    ]
148 changes: 148 additions & 0 deletions k3s/genesis/README.md
# Genesis Node K3s Deployment

This directory contains Kubernetes manifests for deploying the Genesis Node on a k3s cluster.

## Prerequisites

- A running k3s cluster with at least one worker node that has GPU support (`k8s-worker-1`)
- `kubectl` configured to access your cluster
- SSH access from your management machine (or `k8s-control-plane`) to `k8s-worker-1` for state cleaning.
- `stern` (optional, for improved log viewing)

## Deployment

1. **Configure kubectl** either by:
- Copying `/etc/rancher/k3s/k3s.yaml` from `k8s-control-plane` to `~/.kube/config` locally, or
- Setting up an SSH tunnel (see Appendix)

2. **Set up GitHub Container Registry authentication**:
```bash
kubectl create secret docker-registry ghcr-credentials \
--docker-server=ghcr.io \
--docker-username=YOUR_GITHUB_USERNAME \
--docker-password=YOUR_GITHUB_TOKEN
```
   Replace `YOUR_GITHUB_USERNAME` with your GitHub username and `YOUR_GITHUB_TOKEN` with a Personal Access Token that has the `read:packages` scope. An idempotent variant that is safe to re-run is sketched after step 4.

3. **Deploy the Genesis Node**:
```bash
kubectl apply -f .
```

4. **Verify deployment**:
```bash
kubectl get pods -w
```
   Wait until all pods (`node-0`, `api-*`, `tmkms-*`, `inference-*`) show `Running` status; the `kubectl wait` sketch below can block until they are Ready.
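
If you need to script steps 2-4, here is a minimal sketch. It assumes the secret name, placeholders, and default namespace used above; the `--dry-run=client | kubectl apply` form makes the secret creation safe to re-run, and `kubectl wait` blocks until the pods are Ready.

```bash
# Recreate the pull secret without failing if it already exists
kubectl create secret docker-registry ghcr-credentials \
  --docker-server=ghcr.io \
  --docker-username=YOUR_GITHUB_USERNAME \
  --docker-password=YOUR_GITHUB_TOKEN \
  --dry-run=client -o yaml | kubectl apply -f -

# Block until every pod in the current namespace is Ready (10-minute timeout)
kubectl wait --for=condition=Ready pod --all --timeout=600s
```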

## Managing the Deployment

### View Logs

**Using kubectl** (individual components):
```bash
kubectl logs -f node-0 # Node logs
kubectl logs -f $(kubectl get pod -l app=api -o name) # API logs
kubectl logs -f $(kubectl get pod -l app=tmkms -o name) # TMKMS logs
kubectl logs -f $(kubectl get pod -l app=inference -o name) # Inference logs
```

**Using stern** (all components):
```bash
stern 'node|api|tmkms|inference' --exclude-container=POD
```

### Restart Components

```bash
kubectl rollout restart statefulset/node # Restart node
kubectl rollout restart deployment/api # Restart API
kubectl rollout restart deployment/tmkms # Restart TMKMS
kubectl rollout restart deployment/inference # Restart inference
```

### Update Configuration

1. Edit the ConfigMap:
```bash
kubectl edit configmap config
```

2. Restart affected components:
```bash
kubectl rollout restart statefulset/node deployment/api
```
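
If you prefer a non-interactive edit (e.g. in a script), a hedged sketch using `kubectl patch` is shown below. `NEW_EXTERNAL_IP` is a placeholder, and the key name matches the `config` ConfigMap defined in `config.yaml`.

```bash
# Hypothetical example: repoint the public URL, then restart the consumers
kubectl patch configmap config --type merge \
  -p '{"data":{"DAPI_API_PUBLIC_URL":"http://NEW_EXTERNAL_IP:30000"}}'
kubectl rollout restart statefulset/node deployment/api
```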

### Stop Deployment (Delete Kubernetes Resources)

This stops the application but leaves data on the `hostPath` volumes intact.

```bash
kubectl delete -f .
```

### Clean Restart (Delete Kubernetes Resources and Clear State)

This performs a full reset, deleting Kubernetes resources and clearing persisted data from `hostPath` volumes on `k8s-worker-1`, `k8s-worker-2`, and `k8s-worker-3`.

**1. Delete Existing Kubernetes Application Resources:**
Run this from where your `kubectl` is configured (e.g., your local machine or `k8s-control-plane`):
```bash
kubectl delete -f . --ignore-not-found=true # Deletes app resources; anything already gone (e.g. clear-state-job.yaml if never applied) is ignored
kubectl delete job clear-worker-state-job --ignore-not-found=true # Ensure previous job is cleaned up
```
Wait for all resources to be terminated.

**2. Clear HostPath Volume Data using a Kubernetes Job:**
Apply the `clear-state-job.yaml` manifest. This job will run pods on `k8s-worker-1`, `k8s-worker-2`, and `k8s-worker-3` to delete the contents of the specified host directories.
```bash
kubectl apply -f clear-state-job.yaml
```

**3. Monitor the State Clearing Job:**
Check the status of the job:
```bash
kubectl get job clear-worker-state-job -w
```
Wait for the job to show `COMPLETIONS` as `3/3`.

View logs from the job's pods to confirm successful clearance on each node:
```bash
kubectl logs -l app=clear-worker-state --tail=-1 # Shows all logs from all pods of the job
```
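
As an alternative to watching the job, you can block until it finishes; this sketch assumes the job defined in `clear-state-job.yaml` keeps the name `clear-worker-state-job`.

```bash
# Wait for the cleanup job to complete (10-minute timeout)
kubectl wait --for=condition=complete job/clear-worker-state-job --timeout=600s
```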

**4. Delete the State Clearing Job (Important):**
Once the job is complete, delete it to avoid re-running it accidentally and to clean up the completed pods.
```bash
kubectl delete job clear-worker-state-job
```

**5. Re-deploy Application:**
Follow steps 2-4 from the main [Deployment](#deployment) section: create the GHCR secret if needed, then apply the manifests. Avoid a blanket `kubectl apply -f .` here, since it would also re-apply `clear-state-job.yaml` and wipe the freshly deployed state.

A safer re-deploy command after cleanup:
```bash
kubectl apply -f api-deployment.yaml -f api-service.yaml -f config.yaml -f genesis-overrides-configmap.yaml -f inference-deployment.yaml -f inference-service.yaml -f node-config-configmap.yaml -f node-service.yaml -f node-statefulset.yaml -f tmkms-deployment.yaml
```

*Note: The `initContainer` in `tmkms-deployment.yaml` should handle permissions for its directory. If permission issues arise for `/srv/dai/inference` (used by `node` and `api`), consider adding similar `initContainers` to their respective manifests.*

## Appendix: SSH Tunnel Setup

If accessing the cluster remotely from your local machine, set up an SSH tunnel:

```bash
# Start tunnel
gcloud compute ssh k8s-control-plane \
--project=YOUR_GCP_PROJECT_ID \
--zone=YOUR_GCE_INSTANCE_ZONE \
-- -L 6443:127.0.0.1:6443 -N -f

# Check tunnel status
pgrep -f 'ssh.*-L 6443:127.0.0.1:6443' > /dev/null && echo "Tunnel ACTIVE" || echo "Tunnel NOT ACTIVE"

# Kill tunnel
pkill -f 'ssh.*-L 6443:127.0.0.1:6443'
```

Update your kubeconfig's server field to: `https://127.0.0.1:6443`
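
One way to do this without hand-editing the file, assuming the cluster entry in your kubeconfig is named `default` (as in a stock k3s kubeconfig):

```bash
kubectl config set-cluster default --server=https://127.0.0.1:6443
```
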
67 changes: 67 additions & 0 deletions k3s/genesis/api-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: api
  labels:
    app: api
spec:
  replicas: 1
  selector:
    matchLabels:
      app: api
  template:
    metadata:
      labels:
        app: api
    spec:
      enableServiceLinks: false # Prevent k8s from propagating vars like {SERVICE_NAME}_PORT inside the container
      nodeSelector:
        kubernetes.io/hostname: k8s-worker-1
      imagePullSecrets:
        - name: ghcr-credentials
      containers:
        - name: api
          image: ghcr.io/product-science/api:0.1.2
          ports:
            - containerPort: 9000
              name: public
            - containerPort: 9100
              name: ml-server
            - containerPort: 9200
              name: admin
          env:
            - name: KEY_NAME
              value: "genesis"
            - name: DAPI_API__POC_CALLBACK_URL
              value: "http://api-private:9100"
            - name: DAPI_API__PUBLIC_URL
              valueFrom:
                configMapKeyRef:
                  name: config
                  key: DAPI_API_PUBLIC_URL
            - name: DAPI_CHAIN_NODE__IS_GENESIS
              value: "true"
            - name: DAPI_CHAIN_NODE__URL
              value: "http://node:26657" # Assumes 'node' service DNS resolves
            - name: NODE_CONFIG_PATH
              value: "/root/node_config.json"
            - name: DAPI_API__PUBLIC_SERVER_PORT
              value: "9000"
            - name: DAPI_API__ML_SERVER_PORT
              value: "9100"
            - name: DAPI_API__ADMIN_SERVER_PORT
              value: "9200"
          volumeMounts:
            - name: data
              mountPath: /root/.inference
            - name: node-config
              mountPath: /root/node_config.json
              subPath: node_config.json
      volumes:
        - name: data
          hostPath:
            path: /srv/dai/inference
            type: DirectoryOrCreate
        - name: node-config
          configMap:
            name: node-config-cm
15 changes: 15 additions & 0 deletions k3s/genesis/api-public-service.yaml
apiVersion: v1
kind: Service
metadata:
  name: api-public
  labels:
    app: api
spec:
  type: NodePort
  selector:
    app: api
  ports:
    - name: public
      port: 9000
      targetPort: 9000
      nodePort: 30000
7 changes: 7 additions & 0 deletions k3s/genesis/config.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: config
data:
  DAPI_API_PUBLIC_URL: "http://34.9.136.116:30000"
  P2P_EXTERNAL_ADDRESS: "tcp://34.9.136.116:30001"