 except Exception as e:
     print("Skipping pdf2image import: %s" % e)

+
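+# NOTE (assumption): imports are wrapped in try/except so the app still starts
+# when an optional dependency such as pdf2image or llama_cpp is missing.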
 try:
     import llama_cpp
 except Exception as e:
     print("Skipping llama_cpp import: %s" % e)

+print("LD Library: '%s'" % os.environ.get("LD_LIBRARY_PATH", ""))
+
 from shuffle_sdk import AppBase

 #model = "/models/Llama-3.2-3B.Q8_0.gguf" # Larger
 #model = "/models/Llama-3.2-3B.Q2_K.gguf" # Smol

-#model = "/models/DeepSeek-R1-Distill-Llama-8B-Q8_0.gguf" # Larger 8-bit
-model = "/models/DeepSeek-R1-Distill-Llama-8B-Q2_K.gguf" # Smaller
+#model = "/models/DeepSeek-R1-Distill-Llama-8B-Q2_K.gguf" # Smaller
+#model = "/models/Meta-Llama-3-8B.Q6_K.gguf"
+model = "/models/DeepSeek-R1-Distill-Llama.gguf"
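+# NOTE (assumption): the default path expects the .gguf weights to be baked
+# into the image under /models; MODEL_PATH below overrides it at runtime.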
 if os.getenv("MODEL_PATH"):
     model = os.getenv("MODEL_PATH")

 def load_llm_model(model):
+    print("Using model path '%s'" % model)
     if not os.path.exists(model):
+        print("Could not find model at path %s" % model)
         model_name = model.split("/")[-1]
         # Check $HOME/downloads/{model}

@@ -54,20 +60,34 @@ def load_llm_model(model):
     innerllm = None
     gpu_layers = os.getenv("GPU_LAYERS")
     if gpu_layers:
+        print("GPU Layers: %s" % gpu_layers)
+
         gpu_layers = int(gpu_layers)
         if gpu_layers > 0:
-            print("GPU Layers: %s" % gpu_layers)
             innerllm = llama_cpp.Llama(model_path=model, n_gpu_layers=gpu_layers)
         else:
-            innerllm = llama_cpp.Llama(model_path=model)
+            innerllm = llama_cpp.Llama(model_path=model, n_gpu_layers=8)
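+            # When GPU_LAYERS is set to 0 or a negative value, this branch
+            # still offloads 8 layers rather than running CPU-only.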
     else:
         # Check if GPU available
-        #print("No GPU layers set.")
-        innerllm = llama_cpp.Llama(model_path=model)
+        print("No GPU layers set.")
+        #innerllm = llama_cpp.Llama(model_path=model)
+
+        return {
+            "success": False,
+            "reason": "GPU layers not set",
+            "details": "Set GPU_LAYERS environment variable to the number of GPU layers to use (e.g. 8)."
+        }
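+        # Callers now receive this error dict instead of a Llama instance;
+        # run_llm() below type-checks for it before use.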

     return innerllm

-llm = load_llm_model(model)
+try:
+    llm = load_llm_model(model)
+except Exception as e:
+    print("[ERROR] Failed to load LLM model: %s" % e)
+    llm = {
+        "success": False,
+        "reason": "Failed to load LLM model %s" % model,
+    }
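+# At this point llm is either a llama_cpp.Llama instance or an error dict;
+# run_llm() handles both cases.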

 class Tools(AppBase):
     __version__ = "1.0.0"
@@ -80,13 +100,35 @@ def run_llm(self, input, system_message=""):
         global llm
         global model

+        self.logger.info("[DEBUG] LD Library: '%s'. If this is empty, GPUs may not work." % os.environ.get("LD_LIBRARY_PATH", ""))
+
         if not system_message:
-            system_message = "Be a friendly assistant ",
+            system_message = "Answer their question directly. Don't use HTML or Markdown"

         self.logger.info("[DEBUG] Running LLM with model '%s'. To overwrite path, use environment variable MODEL_PATH=<path>" % model)

+        # Check whether llm failed to load: load_llm_model() may have returned
+        # an error dict ("success"/"reason") instead of a Llama instance.
+        if not llm:
+            return {
+                "success": False,
+                "reason": "LLM model not loaded",
+                "details": "Ensure the LLM model is loaded",
+                "gpu_layers": os.getenv("GPU_LAYERS"),
+            }
+
+        if isinstance(llm, dict):
+            if "success" in llm and not llm["success"]:
+                # List the files in the /models folder to help debugging
+                llm["folder"] = os.listdir("/models")
+                llm["gpu_layers"] = os.getenv("GPU_LAYERS")
+                return llm
+
+        self.logger.info("[DEBUG] Running LLM with input '%s' and system message '%s'. GPU Layers: %s" % (input, system_message, os.getenv("GPU_LAYERS")))
+
         # https://github.com/abetlen/llama-cpp-python
         try:
+            print("LLM: ", llm)
+
             self.logger.info("[DEBUG] LLM: %s" % llm)
             output = llm.create_chat_completion(
                 max_tokens=100,
@@ -117,14 +159,19 @@ def run_llm(self, input, system_message=""):
         parsed_output = {
             "success": True,
             "model": output["model"],
-            "tokens": output["tokens"],
             "output": new_message,
         }

+        if "tokens" in output:
+            parsed_output["tokens"] = output["tokens"]
+
+        if "usage" in output:
+            parsed_output["tokens"] = output["usage"]
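+            # llama-cpp-python returns an OpenAI-compatible response, e.g.
+            # {"model": ..., "choices": [...], "usage": {"prompt_tokens": ...,
+            #  "completion_tokens": ..., "total_tokens": ...}}, so "usage"
+            # (when present) wins as the token report.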
+
         if not os.getenv("GPU_LAYERS"):
             parsed_output["debug"] = "GPU_LAYERS not set. Running on CPU. Set GPU_LAYERS to the number of GPU layers to use (e.g. 8)."

-        return output
+        return parsed_output

     def security_assistant(self):
         # Currently testing outside the Shuffle environment