Commit 6a74908

Updated llama-stack blueprint and docs. (#108)
* Updated llama-stack blueprint and docs.
* Updated CPU inference PAR to 2027 exp.
* Update llamastack docs for fallback testing.
* Anchored llamastack version.
* Versioned all llama-stack containers.
1 parent d7179de · commit 6a74908

File tree: 5 files changed (+52 / -32 lines changed)


docs/sample_blueprints/model_serving/cpu-inference/cpu-inference-gemma.json

Lines changed: 1 addition & 1 deletion
```diff
@@ -6,7 +6,7 @@
   "recipe_node_shape": "BM.Standard.E5.192",
   "input_object_storage": [
     {
-      "par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/3qaZRZ0A38V-k0A0eYPqx8XPB06V2WLTj6zOYXKYK97k--yNzEqcV3qsa0MdUcr3/n/iduyx1qnmway/b/ollama-models/o/",
+      "par": "https://iduyx1qnmway.objectstorage.us-ashburn-1.oci.customer-oci.com/p/ActTC68_vMHU92rTYGp-XUiGQrE_P6Jl22b5OPIlcTHMzMjSS99_TAgSVsk_8zmQ/n/iduyx1qnmway/b/ollama-models/o/",
       "mount_location": "/models",
       "volume_size_in_gbs": 20
     }
```

docs/sample_blueprints/model_serving/cpu-inference/cpu-inference-mistral-bm.json

Lines changed: 1 addition & 1 deletion
```diff
@@ -6,7 +6,7 @@
   "recipe_node_shape": "BM.Standard.E4.128",
   "input_object_storage": [
     {
-      "par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/3qaZRZ0A38V-k0A0eYPqx8XPB06V2WLTj6zOYXKYK97k--yNzEqcV3qsa0MdUcr3/n/iduyx1qnmway/b/ollama-models/o/",
+      "par": "https://iduyx1qnmway.objectstorage.us-ashburn-1.oci.customer-oci.com/p/ActTC68_vMHU92rTYGp-XUiGQrE_P6Jl22b5OPIlcTHMzMjSS99_TAgSVsk_8zmQ/n/iduyx1qnmway/b/ollama-models/o/",
       "mount_location": "/models",
       "volume_size_in_gbs": 20
     }
```

docs/sample_blueprints/model_serving/cpu-inference/cpu-inference-mistral-vm.json

Lines changed: 1 addition & 1 deletion
```diff
@@ -8,7 +8,7 @@
   "recipe_flex_shape_memory_size_in_gbs": 64,
   "input_object_storage": [
     {
-      "par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/3qaZRZ0A38V-k0A0eYPqx8XPB06V2WLTj6zOYXKYK97k--yNzEqcV3qsa0MdUcr3/n/iduyx1qnmway/b/ollama-models/o/",
+      "par": "https://iduyx1qnmway.objectstorage.us-ashburn-1.oci.customer-oci.com/p/ActTC68_vMHU92rTYGp-XUiGQrE_P6Jl22b5OPIlcTHMzMjSS99_TAgSVsk_8zmQ/n/iduyx1qnmway/b/ollama-models/o/",
       "mount_location": "/models",
       "volume_size_in_gbs": 20
     }
```
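All three CPU-inference blueprints swap the legacy `objectstorage.us-ashburn-1.oraclecloud.com` PAR for a new one on the dedicated `oci.customer-oci.com` endpoint (per the commit message, refreshed to a 2027 expiration). Before deploying, the new PAR can be smoke-tested; a minimal sketch, assuming the bucket-level PAR permits object listing:

```bash
# PAR URL copied from the diffs above; a bucket-level PAR that allows
# listing returns a JSON object index on GET. -f fails on HTTP errors.
PAR="https://iduyx1qnmway.objectstorage.us-ashburn-1.oci.customer-oci.com/p/ActTC68_vMHU92rTYGp-XUiGQrE_P6Jl22b5OPIlcTHMzMjSS99_TAgSVsk_8zmQ/n/iduyx1qnmway/b/ollama-models/o/"
curl -sf "$PAR" | head -c 400   # expect the start of a JSON listing of ollama-models objects
```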

docs/sample_blueprints/partner_blueprints/llama-stack/README.md

Lines changed: 23 additions & 4 deletions
````diff
@@ -50,20 +50,39 @@ To test your llama stack implementation please follow the steps below.
 
 2. Install uv command line interface tool via the steps [here](https://docs.astral.sh/uv/getting-started/installation/)
 
-3. Clone the following repo: [https://github.com/meta-llama/llama-stack-evals](https://github.com/meta-llama/llama-stack-evals)
+3. Clone the following repo: [https://github.com/meta-llama/llama-verifications](https://github.com/meta-llama/llama-verifications)
 
 4. Go to your llama-stack deployment and grab the `Public Endpoint` (ex: `llamastack-app7.129-213-194-241.nip.io`)
 
 5. Run the following curl command to test the model list feature: `curl http://<llama_stack_deployment_endpoint>/v1/openai/v1/models`
 
 6. You can use llama-stack-evals repo (which you previously cloned) to run verifications / benchmark evaluations against this llama stack deployments’s OpenAI endpoint. Note: If you are using the blueprint unmodified (aka using the NousResearch/Meta-Llama-3.1-8B-Instruct model, some of the tests will fail on purpose since this tests multi-modal inputs which this model does not support)
 
+**Note**: It is possible for this test to fail if the self-signed certificate hasn't finished generating yet. The errors indicate this like:
 ```
-cd llama-stack-evals # make sure you are in the llama-stack-evals repo
+E httpx.ConnectError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate (_ssl.c:1010)
+```
+If you see this message, it does not mean that llama-stack isn't working, just that these tests won't succeed until certs are generated.
+
+```
+cd llama-verifications # make sure you are in the llama-verifications repo
 
-uvx llama-stack-evals run-tests --openai-compat-endpoint http://<llama_stack_deployment_endpoint>/v1/openai/v1 --model "<MODEL_YOU_USED_IN_VLLM_DEPLOYMENT>"
+export OPENAI_API_KEY="t" # dummy key
+uvx llama-verifications run-tests --openai-compat-endpoint http://<llama_stack_deployment_endpoint>/v1/openai/v1 --model "<MODEL_YOU_USED_IN_VLLM_DEPLOYMENT>"
 
-# ex: uvx llama-stack-evals run-tests --openai-compat-endpoint http://llamastack-app7.129-213-194-241.nip.io/v1/openai/v1 --model "NousResearch/Meta-Llama-3.1-8B-Instruct"
+# ex: uvx llama-verifications run-tests --openai-compat-endpoint http://llamastack-app7.129-213-194-241.nip.io/v1/openai/v1 --model "Meta-Llama-3.1-8B-Instruct"
+```
+An additional way to test with `curl` if the certs have not finished (-k allows insecure):
+```bash
+curl -Lk -X POST http://<llama_stack_deployment_endpoint>/v1/openai/v1/chat/completions -H "Content-Type: application/json" -d '{
+  "model": "Meta-Llama-3.1-8B-Instruct",
+  "messages": [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "Hello! Can you tell me a fun fact about GPUs?"}
+  ],
+  "max_tokens": 100,
+  "temperature": 0.7
+}'
 ```
 
 ## How to Use It
````
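Because the verification suite fails with `CERTIFICATE_VERIFY_FAILED` until certificates are generated, it can help to poll the endpoint before running the `uvx` suite. A minimal sketch, assuming the README's example hostname stands in for your `Public Endpoint` (`-Lk` mirrors the fallback curl above):

```bash
ENDPOINT="llamastack-app7.129-213-194-241.nip.io"   # replace with your Public Endpoint

# Retry the models route until the deployment answers; -L follows redirects,
# -k tolerates the self-signed certificate while it is still being issued.
until curl -sfLk "http://${ENDPOINT}/v1/openai/v1/models" >/dev/null; do
  echo "llama-stack not answering yet; retrying in 10s..."
  sleep 10
done
echo "Endpoint is up; run the uvx llama-verifications suite next."
```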

docs/sample_blueprints/partner_blueprints/llama-stack/llama_stack_basic.json

Lines changed: 26 additions & 25 deletions
Original file line numberDiff line numberDiff line change
```diff
@@ -14,7 +14,7 @@
   "recipe_flex_shape_memory_size_in_gbs": 16,
   "recipe_node_boot_volume_size_in_gbs": 200,
   "recipe_ephemeral_storage_size": 100,
-  "recipe_image_uri": "docker.io/library/postgres:latest",
+  "recipe_image_uri": "docker.io/library/postgres:13",
   "recipe_container_port": "5432",
   "recipe_host_port": "5432",
   "recipe_container_env": [
@@ -47,7 +47,7 @@
   "recipe_flex_shape_memory_size_in_gbs": 16,
   "recipe_node_boot_volume_size_in_gbs": 200,
   "recipe_ephemeral_storage_size": 100,
-  "recipe_image_uri": "docker.io/chromadb/chroma:latest",
+  "recipe_image_uri": "docker.io/chromadb/chroma:1.0.20",
   "recipe_container_port": "8000",
   "recipe_host_port": "8000",
   "recipe_container_env": [
@@ -77,7 +77,7 @@
   "recipe_id": "llm_inference_nvidia",
   "deployment_name": "vllm",
   "recipe_mode": "service",
-  "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.6.pos1",
+  "recipe_image_uri": "docker.io/vllm/vllm-openai:v0.9.1",
   "recipe_node_shape": "VM.GPU.A10.2",
   "input_object_storage": [
     {
```
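These hunks replace floating `:latest` tags with pinned versions ("Versioned all llama-stack containers", per the commit message), so redeployments pull a known image. A quick pre-deploy check that the pinned tags resolve; a minimal sketch, assuming local Docker CLI access to docker.io:

```bash
# Tags taken from the diff; docker pull fails fast if a tag does not resolve.
for image in docker.io/library/postgres:13 \
             docker.io/chromadb/chroma:1.0.20 \
             docker.io/vllm/vllm-openai:v0.9.1; do
  docker pull --quiet "$image" || echo "unresolvable tag: $image"
done
```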
```diff
@@ -87,33 +87,30 @@
       "include": ["NousResearch/Meta-Llama-3.1-8B-Instruct"]
     }
   ],
-  "recipe_container_env": [
-    {
-      "key": "tensor_parallel_size",
-      "value": "2"
-    },
-    {
-      "key": "model_name",
-      "value": "NousResearch/Meta-Llama-3.1-8B-Instruct"
-    },
-    {
-      "key": "Model_Path",
-      "value": "/models/NousResearch/Meta-Llama-3.1-8B-Instruct"
-    }
-  ],
   "recipe_replica_count": 1,
   "recipe_container_port": "8000",
   "recipe_nvidia_gpu_count": 2,
   "recipe_node_pool_size": 1,
   "recipe_node_boot_volume_size_in_gbs": 200,
   "recipe_container_command_args": [
     "--model",
-    "$(Model_Path)",
+    "/models/NousResearch/Meta-Llama-3.1-8B-Instruct",
     "--tensor-parallel-size",
-    "$(tensor_parallel_size)"
+    "2",
+    "--served-model-name",
+    "Meta-Llama-3.1-8B-Instruct"
   ],
   "recipe_ephemeral_storage_size": 100,
-  "recipe_shared_memory_volume_size_limit_in_mb": 200
+  "recipe_shared_memory_volume_size_limit_in_mb": 200,
+  "recipe_readiness_probe_params": {
+    "endpoint_path": "/health",
+    "port": 8000,
+    "scheme": "HTTP",
+    "initial_delay_seconds": 20,
+    "period_seconds": 30,
+    "success_threshold": 1,
+    "timeout_seconds": 10
+  }
 },
 "exports": ["internal_dns_name"]
 },
```
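This hunk drops the `$(...)` env-var indirection in favor of literal vLLM arguments, adds `--served-model-name` so clients address the model as `Meta-Llama-3.1-8B-Instruct` rather than its `/models/...` path, and gates traffic behind a `/health` readiness probe. Both changes can be verified once the pod is ready; a minimal sketch, assuming a hypothetical host standing in for the exported `internal_dns_name`:

```bash
# Hypothetical host; substitute the vllm deployment's exported internal_dns_name.
VLLM="http://vllm.internal.example:8000"

curl -sf "$VLLM/health"      # the readiness probe's endpoint; returns 200 once vLLM is up
curl -sf "$VLLM/v1/models"   # should list "Meta-Llama-3.1-8B-Instruct", not the filesystem path
```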
```diff
@@ -129,7 +126,7 @@
   "recipe_flex_shape_memory_size_in_gbs": 16,
   "recipe_node_boot_volume_size_in_gbs": 200,
   "recipe_ephemeral_storage_size": 100,
-  "recipe_image_uri": "docker.io/jaegertracing/jaeger:latest",
+  "recipe_image_uri": "docker.io/jaegertracing/jaeger:2.9.0",
   "recipe_container_port": "16686",
   "recipe_additional_ingress_ports": [
     {
@@ -154,12 +151,12 @@
   "recipe_flex_shape_memory_size_in_gbs": 16,
   "recipe_node_boot_volume_size_in_gbs": 200,
   "recipe_ephemeral_storage_size": 100,
-  "recipe_image_uri": "docker.io/llamastack/distribution-postgres-demo:latest",
+  "recipe_image_uri": "docker.io/llamastack/distribution-postgres-demo:0.2.18",
   "recipe_container_port": "8321",
   "recipe_container_env": [
     {
       "key": "INFERENCE_MODEL",
-      "value": "/models/NousResearch/Meta-Llama-3.1-8B-Instruct"
+      "value": "Meta-Llama-3.1-8B-Instruct"
     },
     {
       "key": "VLLM_URL",
@@ -173,6 +170,10 @@
       "key": "CHROMADB_URL",
       "value": "http://${chroma.internal_dns_name}:8000"
     },
+    {
+      "key": "ENABLE_POSTGRES",
+      "value": "1"
+    },
     {
       "key": "POSTGRES_HOST",
       "value": "${postgres.internal_dns_name}"
@@ -198,8 +199,8 @@
       "value": "console,otel_trace"
     },
     {
-      "key": "OTEL_TRACE_ENDPOINT",
-      "value": "http://${jaeger.internal_dns_name}/jaeger/v1/traces"
+      "key": "OTEL_EXPORTER_OTLP_ENDPOINT",
+      "value": "http://${jaeger.internal_dns_name}/jaeger/"
     }
   ],
   "output_object_storage": [
```
