@@ -25,6 +25,50 @@
 
 from shuffle_sdk import AppBase
 
+#model = "/models/Llama-3.2-3B.Q8_0.gguf" # Larger
+#model = "/models/Llama-3.2-3B.Q2_K.gguf" # Smol
+
+#model = "/models/DeepSeek-R1-Distill-Llama-8B-Q8_0.gguf" # Larger 8-bit
+model = "/models/DeepSeek-R1-Distill-Llama-8B-Q2_K.gguf" # Smaller
+if os.getenv("MODEL_PATH"):
+    model = os.getenv("MODEL_PATH")
+
+def load_llm_model(model):
+    if not os.path.exists(model):
+        model_name = model.split("/")[-1]
+        # Check $HOME/downloads/{model}
+
+        home_path = os.path.expanduser("~")
+        print(home_path)
+
+        if os.path.exists(f"{home_path}/downloads/{model_name}"):
+            model = f"{home_path}/downloads/{model_name}"
+        else:
+            return {
+                "success": False,
+                "reason": "Model not found at path %s" % model,
+                "details": "Ensure the model path is correct"
+            }
+
+    # Check for GPU layers
+    llm = None
+    gpu_layers = os.getenv("GPU_LAYERS")
+    if gpu_layers:
+        gpu_layers = int(gpu_layers)
+        if gpu_layers > 0:
+            print("GPU Layers: %s" % gpu_layers)
+            llm = llama_cpp.Llama(model_path=model, n_gpu_layers=gpu_layers)
+        else:
+            llm = llama_cpp.Llama(model_path=model)
+    else:
+        # Check if GPU available
+        #print("No GPU layers set.")
+        llm = llama_cpp.Llama(model_path=model)
+
+    return llm
+
+llm = load_llm_model(model)
+
 class Tools(AppBase):
     __version__ = "1.0.0"
     app_name = "Shuffle AI"
@@ -34,47 +78,47 @@ def __init__(self, redis, logger, console_logger=None):
 
     #def run_llm(self, question, model="llama3.2"):
     #def run_llm(self, question, model="deepseek-v3"):
-    def run_llm(self, question, model="/models/Llama-3.2-3B.Q8_0.gguf"):
-        self.logger.info("[DEBUG] Running LLM with model '%s'" % model)
+    def run_llm(self, question, system_message=""):
+        global llm
+        global model
 
-        if not os.path.exists(model):
-            return {
-                "success": False,
-                "reason": "Model not found at path %s" % model,
-                "details": "Ensure the model path is correct"
-            }
+        if not system_message:
+            system_message = "Be a friendly assistant"
 
-        llm = llama_cpp.Llama(model_path=model)
+        self.logger.info("[DEBUG] Running LLM with model '%s'. To overwrite path, use environment variable MODEL_PATH=<path>" % model)
 
         # https://github.com/abetlen/llama-cpp-python
         output = llm.create_chat_completion(
+            max_tokens=100,
             messages=[
-                {"role": "system", "content": "You are an assistant who outputs in JSON format.."},
+                {
+                    "role": "system",
+                    "content": system_message,
+                },
                 {
                     "role": "user",
                     "content": question,
                 }
             ]
         )
 
-        return output
-
+        self.logger.info("[DEBUG] LLM output: %s" % output)
 
-        #model = ctransformers.AutoModelForCausalLM.from_pretrained(
-        #    model_path_or_repo_id=model,
-        #    #model_type="deepseek-v3"
-        #)
+        new_message = ""
+        if "choices" in output and len(output["choices"]) > 0:
+            new_message = output["choices"][0]["message"]["content"]
 
-        #resp = model(full_question)
-        #return resp
+        parsed_output = {
+            "success": True,
+            "model": output["model"],
114+ "tokens" : output ["tokens" ],
115+ "output" : new_message ,
116+ }
 
-        #response = ollama.chat(model=model, messages=[
-        #    {
-        #        "role": "user", "content": question,
-        #    }
-        #])
+        if not os.getenv("GPU_LAYERS"):
+            parsed_output["debug"] = "GPU_LAYERS not set. Running on CPU. Set GPU_LAYERS to the number of GPU layers to use (e.g. 8)."
 
-        # return response["message"]["content"]
+        return parsed_output
 
     def security_assistant(self):
         # Currently testing outside the Shuffle environment
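
For reference, a minimal standalone sketch of the flow this commit introduces: a GGUF model is loaded once through llama-cpp-python, with MODEL_PATH overriding the default model file and GPU_LAYERS controlling GPU offload, and chat completions are then requested with a system and a user message. This is illustrative only and not part of the commit; the ask() helper and the sample question are assumptions, and it presumes llama-cpp-python is installed and the model file exists at the given path.

import os
import llama_cpp

# Same defaults as the diff: MODEL_PATH overrides the model file, GPU_LAYERS enables GPU offload.
model_path = os.getenv("MODEL_PATH", "/models/DeepSeek-R1-Distill-Llama-8B-Q2_K.gguf")
gpu_layers = int(os.getenv("GPU_LAYERS", "0"))

llm = llama_cpp.Llama(model_path=model_path, n_gpu_layers=gpu_layers)

def ask(question, system_message="Be a friendly assistant"):
    # OpenAI-style chat call, mirroring run_llm() above.
    output = llm.create_chat_completion(
        max_tokens=100,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": question},
        ],
    )
    return output["choices"][0]["message"]["content"]

if __name__ == "__main__":
    print(ask("Summarize what a SOAR platform does."))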