Commit cd5062a

Minor last min fixes

1 parent b7f7d94

3 files changed: 80 additions, 32 deletions

3 files changed

+80
-32
lines changed

shuffle-ai/1.0.0/Dockerfile
8 additions, 4 deletions
@@ -17,7 +17,14 @@ RUN apt install -y file openssl bash tini libpng-dev aspell-en
 RUN apt install -y git clang g++ make automake autoconf libtool cmake
 RUN apt install -y autoconf-archive wget
 RUN mkdir -p /models
-RUN wget https://huggingface.co/QuantFactory/Llama-3.2-3B-GGUF/resolve/main/Llama-3.2-3B.Q8_0.gguf?download=true -O /models/Llama-3.2-3B.Q8_0.gguf
+
+# Larger model
+RUN wget https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF/resolve/main/DeepSeek-R1-Distill-Llama-8B-Q2_K.gguf
+ENV MODEL_PATH="/models/DeepSeek-R1-Distill-Llama-8B-Q2_K.gguf"
+
+# https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF/resolve/main/DeepSeek-R1-Distill-Llama-8B-Q8_0.gguf
+#RUN wget https://huggingface.co/QuantFactory/Llama-3.2-3B-GGUF/resolve/main/Llama-3.2-3B.Q2_K.gguf?download=true -O /models/Llama-3.2-3B.Q8_0.gguf
+#RUN wget https://huggingface.co/QuantFactory/Llama-3.2-3B-GGUF/resolve/main/Llama-3.2-3B.Q2_K.gguf?download=true -O /models/Llama-3.2-3B.Q8_0.gguf

 # Install all of our pip packages in a single directory that we can copy to our base image later
 RUN mkdir /install
@@ -72,9 +79,6 @@ ENV SHUFFLE_APP_SDK_TIMEOUT=300
 #ENV LD_LIBRARY_PATH=/usr/local/lib/python3.10/site-packages/ctransformers/lib/basic/libctransformers.so
 #RUN chmod 755 /usr/local/lib/python3.10/site-packages/ctransformers/lib/basic/libctransformers.so

-#RUN apt install -y libffi-dev
-
-
 COPY src /app
 WORKDIR /app
 CMD ["python", "app.py", "--log-level", "DEBUG"]

shuffle-ai/1.0.0/api.yaml
4 additions, 4 deletions
@@ -12,7 +12,7 @@ contact_info:
 
 actions:
   - name: run_llm
-    description: "Runs a local LLM based on ollama with any of their models from https://github.com/ollama/ollama?tab=readme-ov-file#model-library"
+    description: "Runs a local LLM, with a GPU or CPU (slow). Default model is set up in Dockerfile"
     parameters:
       - name: question
         description: "The input question to the model"
@@ -21,11 +21,11 @@ actions:
         example: ""
         schema:
           type: string
-      - name: model
-        description: "The model to run"
+      - name: system_message
+        description: "The system message use, if any"
         required: false
         multiline: false
-        example: "deepseek-v3"
+        example: ""
         schema:
           type: string

shuffle-ai/1.0.0/src/app.py
68 additions, 24 deletions
@@ -25,6 +25,50 @@
 
 from shuffle_sdk import AppBase
 
+#model = "/models/Llama-3.2-3B.Q8_0.gguf" # Larger
+#model = "/models/Llama-3.2-3B.Q2_K.gguf" # Smol
+
+#model = "/models/DeepSeek-R1-Distill-Llama-8B-Q8_0.gguf" # Larger 8-bit
+model = "/models/DeepSeek-R1-Distill-Llama-8B-Q2_K.gguf" # Smaller
+if os.getenv("MODEL_PATH"):
+    model = os.getenv("MODEL_PATH")
+
+def load_llm_model(model):
+    if not os.path.exists(model):
+        model_name = model.split("/")[-1]
+        # Check $HOME/downloads/{model}
+
+        home_path = os.path.expanduser("~")
+        print(home_path)
+
+        if os.path.exists(f"{home_path}/downloads/{model_name}"):
+            model = f"{home_path}/downloads/{model_name}"
+        else:
+            return {
+                "success": False,
+                "reason": "Model not found at path %s" % model,
+                "details": "Ensure the model path is correct"
+            }
+
+    # Check for GPU layers
+    llm = None
+    gpu_layers = os.getenv("GPU_LAYERS")
+    if gpu_layers:
+        gpu_layers = int(gpu_layers)
+        if gpu_layers > 0:
+            print("GPU Layers: %s" % gpu_layers)
+            llm = llama_cpp.Llama(model_path=model, n_gpu_layers=gpu_layers)
+        else:
+            llm = llama_cpp.Llama(model_path=model)
+    else:
+        # Check if GPU available
+        #print("No GPU layers set.")
+        llm = llama_cpp.Llama(model_path=model)
+
+    return llm
+
+llm = load_llm_model(model)
+
 class Tools(AppBase):
     __version__ = "1.0.0"
     app_name = "Shuffle AI"
@@ -34,47 +78,47 @@ def __init__(self, redis, logger, console_logger=None):
 
     #def run_llm(self, question, model="llama3.2"):
     #def run_llm(self, question, model="deepseek-v3"):
-    def run_llm(self, question, model="/models/Llama-3.2-3B.Q8_0.gguf"):
-        self.logger.info("[DEBUG] Running LLM with model '%s'" % model)
+    def run_llm(self, question, system_message=""):
+        global llm
+        global model
 
-        if not os.path.exists(model):
-            return {
-                "success": False,
-                "reason": "Model not found at path %s" % model,
-                "details": "Ensure the model path is correct"
-            }
+        if not system_message:
+            system_message = "Be a friendly assistant",
 
-        llm = llama_cpp.Llama(model_path=model)
+        self.logger.info("[DEBUG] Running LLM with model '%s'. To overwrite path, use environment variable MODEL_PATH=<path>" % model)
 
         # https://github.com/abetlen/llama-cpp-python
         output = llm.create_chat_completion(
+            max_tokens=100,
             messages = [
-                {"role": "system", "content": "You are an assistant who outputs in JSON format.."},
+                {
+                    "role": "system",
+                    "content": system_message,
+                },
                 {
                     "role": "user",
                     "content": question,
                 }
             ]
         )
 
-        return output
-
+        self.logger.info("[DEBUG] LLM output: %s" % output)
 
-        #model = ctransformers.AutoModelForCausalLM.from_pretrained(
-        #    model_path_or_repo_id=model,
-        #    #model_type="deepseek-v3"
-        #)
+        new_message = ""
+        if "choices" in output and len(output["choices"]) > 0:
+            new_message = output["choices"][0]["message"]["content"]
 
-        #resp = model(full_question)
-        #return resp
+        parsed_output = {
+            "success": True,
+            "model": output["model"],
+            "tokens": output["tokens"],
+            "output": new_message,
+        }
 
-        #response = ollama.chat(model=model, messages=[
-        #    {
-        #        "role": "user", "content": question,
-        #    }
-        #])
+        if not os.getenv("GPU_LAYERS"):
+            parsed_output["debug"] = "GPU_LAYERS not set. Running on CPU. Set GPU_LAYERS to the number of GPU layers to use (e.g. 8)."
 
-        #return response["message"]["content"]
+        return output
 
     def security_assistant(self):
         # Currently testing outside the Shuffle environment
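
Outside Shuffle, the llama-cpp-python call that the new run_llm wraps can be exercised directly. A minimal sketch, assuming llama-cpp-python is installed and a GGUF model exists at the path used in this commit:

import llama_cpp

# Any local GGUF model works here; this path matches the Dockerfile default
llm = llama_cpp.Llama(model_path="/models/DeepSeek-R1-Distill-Llama-8B-Q2_K.gguf")

output = llm.create_chat_completion(
    max_tokens=100,
    messages=[
        {"role": "system", "content": "Be a friendly assistant"},
        {"role": "user", "content": "What does Shuffle do?"},
    ],
)

# create_chat_completion returns an OpenAI-style dict; the reply text sits in choices[0]
print(output["choices"][0]["message"]["content"])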
