Commit cd5062a

Minor last min fixes

1 parent b7f7d94

3 files changed: 80 additions, 32 deletions

3 files changed

+80
-32
lines changed

shuffle-ai/1.0.0/Dockerfile
8 additions, 4 deletions
@@ -17,7 +17,14 @@ RUN apt install -y file openssl bash tini libpng-dev aspell-en
 RUN apt install -y git clang g++ make automake autoconf libtool cmake
 RUN apt install -y autoconf-archive wget
 RUN mkdir -p /models
-RUN wget https://huggingface.co/QuantFactory/Llama-3.2-3B-GGUF/resolve/main/Llama-3.2-3B.Q8_0.gguf?download=true -O /models/Llama-3.2-3B.Q8_0.gguf
+
+# Larger model
+RUN wget https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF/resolve/main/DeepSeek-R1-Distill-Llama-8B-Q2_K.gguf
+ENV MODEL_PATH="/models/DeepSeek-R1-Distill-Llama-8B-Q2_K.gguf"
+
+# https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF/resolve/main/DeepSeek-R1-Distill-Llama-8B-Q8_0.gguf
+#RUN wget https://huggingface.co/QuantFactory/Llama-3.2-3B-GGUF/resolve/main/Llama-3.2-3B.Q2_K.gguf?download=true -O /models/Llama-3.2-3B.Q8_0.gguf
+#RUN wget https://huggingface.co/QuantFactory/Llama-3.2-3B-GGUF/resolve/main/Llama-3.2-3B.Q2_K.gguf?download=true -O /models/Llama-3.2-3B.Q8_0.gguf

 # Install all of our pip packages in a single directory that we can copy to our base image later
 RUN mkdir /install
@@ -72,9 +79,6 @@ ENV SHUFFLE_APP_SDK_TIMEOUT=300
 #ENV LD_LIBRARY_PATH=/usr/local/lib/python3.10/site-packages/ctransformers/lib/basic/libctransformers.so
 #RUN chmod 755 /usr/local/lib/python3.10/site-packages/ctransformers/lib/basic/libctransformers.so

-#RUN apt install -y libffi-dev
-
-
 COPY src /app
 WORKDIR /app
 CMD ["python", "app.py", "--log-level", "DEBUG"]

shuffle-ai/1.0.0/api.yaml
4 additions, 4 deletions
@@ -12,7 +12,7 @@ contact_info:
 
 actions:
   - name: run_llm
-    description: "Runs a local LLM based on ollama with any of their models from https://github.com/ollama/ollama?tab=readme-ov-file#model-library"
+    description: "Runs a local LLM, with a GPU or CPU (slow). Default model is set up in Dockerfile"
     parameters:
       - name: question
         description: "The input question to the model"
@@ -21,11 +21,11 @@ actions:
         example: ""
         schema:
           type: string
-      - name: model
-        description: "The model to run"
+      - name: system_message
+        description: "The system message use, if any"
         required: false
         multiline: false
-        example: "deepseek-v3"
+        example: ""
         schema:
           type: string

shuffle-ai/1.0.0/src/app.py
68 additions, 24 deletions
@@ -25,6 +25,50 @@
 
 from shuffle_sdk import AppBase
 
+#model = "/models/Llama-3.2-3B.Q8_0.gguf" # Larger
+#model = "/models/Llama-3.2-3B.Q2_K.gguf" # Smol
+
+#model = "/models/DeepSeek-R1-Distill-Llama-8B-Q8_0.gguf" # Larger 8-bit
+model = "/models/DeepSeek-R1-Distill-Llama-8B-Q2_K.gguf" # Smaller
+if os.getenv("MODEL_PATH"):
+    model = os.getenv("MODEL_PATH")
+
+def load_llm_model(model):
+    if not os.path.exists(model):
+        model_name = model.split("/")[-1]
+        # Check $HOME/downloads/{model}
+
+        home_path = os.path.expanduser("~")
+        print(home_path)
+
+        if os.path.exists(f"{home_path}/downloads/{model_name}"):
+            model = f"{home_path}/downloads/{model_name}"
+        else:
+            return {
+                "success": False,
+                "reason": "Model not found at path %s" % model,
+                "details": "Ensure the model path is correct"
+            }
+
+    # Check for GPU layers
+    llm = None
+    gpu_layers = os.getenv("GPU_LAYERS")
+    if gpu_layers:
+        gpu_layers = int(gpu_layers)
+        if gpu_layers > 0:
+            print("GPU Layers: %s" % gpu_layers)
+            llm = llama_cpp.Llama(model_path=model, n_gpu_layers=gpu_layers)
+        else:
+            llm = llama_cpp.Llama(model_path=model)
+    else:
+        # Check if GPU available
+        #print("No GPU layers set.")
+        llm = llama_cpp.Llama(model_path=model)
+
+    return llm
+
+llm = load_llm_model(model)
+
 class Tools(AppBase):
     __version__ = "1.0.0"
     app_name = "Shuffle AI"
@@ -34,47 +78,47 @@ def __init__(self, redis, logger, console_logger=None):
 
     #def run_llm(self, question, model="llama3.2"):
     #def run_llm(self, question, model="deepseek-v3"):
-    def run_llm(self, question, model="/models/Llama-3.2-3B.Q8_0.gguf"):
-        self.logger.info("[DEBUG] Running LLM with model '%s'" % model)
+    def run_llm(self, question, system_message=""):
+        global llm
+        global model
 
-        if not os.path.exists(model):
-            return {
-                "success": False,
-                "reason": "Model not found at path %s" % model,
-                "details": "Ensure the model path is correct"
-            }
+        if not system_message:
+            system_message = "Be a friendly assistant",
 
-        llm = llama_cpp.Llama(model_path=model)
+        self.logger.info("[DEBUG] Running LLM with model '%s'. To overwrite path, use environment variable MODEL_PATH=<path>" % model)
 
         # https://github.com/abetlen/llama-cpp-python
         output = llm.create_chat_completion(
+            max_tokens=100,
             messages = [
-                {"role": "system", "content": "You are an assistant who outputs in JSON format.."},
+                {
+                    "role": "system",
+                    "content": system_message,
+                },
                 {
                     "role": "user",
                     "content": question,
                 }
             ]
         )
 
-        return output
-
+        self.logger.info("[DEBUG] LLM output: %s" % output)
 
-        #model = ctransformers.AutoModelForCausalLM.from_pretrained(
-        #    model_path_or_repo_id=model,
-        #    #model_type="deepseek-v3"
-        #)
+        new_message = ""
+        if "choices" in output and len(output["choices"]) > 0:
+            new_message = output["choices"][0]["message"]["content"]
 
-        #resp = model(full_question)
-        #return resp
+        parsed_output = {
+            "success": True,
+            "model": output["model"],
+            "tokens": output["tokens"],
+            "output": new_message,
+        }
 
-        #response = ollama.chat(model=model, messages=[
-        #    {
-        #        "role": "user", "content": question,
-        #    }
-        #])
+        if not os.getenv("GPU_LAYERS"):
+            parsed_output["debug"] = "GPU_LAYERS not set. Running on CPU. Set GPU_LAYERS to the number of GPU layers to use (e.g. 8)."
 
-        #return response["message"]["content"]
+        return output
 
     def security_assistant(self):
         # Currently testing outside the Shuffle environment
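
Outside Shuffle, the llama-cpp-python call that the new run_llm wraps can be exercised directly. A minimal sketch, assuming llama-cpp-python is installed and a GGUF model exists at the path used in this commit:

import llama_cpp

# Any local GGUF model works here; this path matches the Dockerfile default
llm = llama_cpp.Llama(model_path="/models/DeepSeek-R1-Distill-Llama-8B-Q2_K.gguf")

output = llm.create_chat_completion(
    max_tokens=100,
    messages=[
        {"role": "system", "content": "Be a friendly assistant"},
        {"role": "user", "content": "What does Shuffle do?"},
    ],
)

# create_chat_completion returns an OpenAI-style dict; the reply text sits in choices[0]
print(output["choices"][0]["message"]["content"])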
