
Commit b24603b

feat: added model downloads chapter and new available GGUF and quantized bitsandbytes models
1 parent 1e8b9dc commit b24603b

File tree

1 file changed: +234 -46 lines changed

agentic_rag/local_rag_agent.py

Lines changed: 234 additions & 46 deletions
@@ -8,6 +8,8 @@
 import os
 import logging
 import time
+import json
+from pathlib import Path

 # Configure logging
 logging.basicConfig(
@@ -41,60 +43,246 @@ def __init__(self, content):

         return Response(result.strip())

+class GGUFModelHandler:
+    """Handler for GGUF models using llama-cpp-python"""
+    def __init__(self, model_path_or_repo_id: str):
+        """Initialize GGUF model handler
+
+        Args:
+            model_path_or_repo_id: Local path to GGUF model or HuggingFace repo ID
+        """
+        self.model_path_or_repo_id = model_path_or_repo_id
+        self.model = None
+        self._load_model()
+
+    def _load_model(self):
+        """Load GGUF model using llama-cpp-python"""
+        try:
+            from llama_cpp import Llama
+
+            # Check if model_path is a local file or HuggingFace repo ID
+            if os.path.exists(self.model_path_or_repo_id):
+                model_path = self.model_path_or_repo_id
+            else:
+                # Download from HuggingFace
+                from huggingface_hub import hf_hub_download
+
+                # Try to load HuggingFace token from config
+                try:
+                    with open('config.yaml', 'r') as f:
+                        config = yaml.safe_load(f)
+                        token = config.get('HUGGING_FACE_HUB_TOKEN')
+                except Exception:
+                    token = None
+
+                # Extract repo_id and filename
+                parts = self.model_path_or_repo_id.split('/')
+                if len(parts) < 2:
+                    raise ValueError(f"Invalid HuggingFace repo ID: {self.model_path_or_repo_id}")
+
+                repo_id = '/'.join(parts[:2])
+
+                # Find the GGUF file in the repo
+                from huggingface_hub import list_repo_files
+                files = list_repo_files(repo_id, token=token)
+                gguf_files = [f for f in files if f.endswith('.gguf')]
+
+                if not gguf_files:
+                    raise ValueError(f"No GGUF files found in repo: {repo_id}")
+
+                # Use the first GGUF file or try to find a specific one if specified
+                if len(parts) > 2:
+                    # Try to find a specific file if specified in the path
+                    specified_file = '/'.join(parts[2:])
+                    matching_files = [f for f in gguf_files if specified_file in f]
+                    if matching_files:
+                        filename = matching_files[0]
+                    else:
+                        filename = gguf_files[0]
+                else:
+                    filename = gguf_files[0]
+
+                print(f"Downloading GGUF model: {filename} from {repo_id}")
+                model_path = hf_hub_download(
+                    repo_id=repo_id,
+                    filename=filename,
+                    token=token
+                )
+
+            # Determine optimal n_gpu_layers based on available VRAM
+            n_gpu_layers = 0
+            if torch.cuda.is_available():
+                # Get available VRAM in GB
+                vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
+
+                # Simple heuristic for n_gpu_layers based on VRAM
+                if vram_gb > 24:
+                    n_gpu_layers = -1  # Use all layers
+                elif vram_gb > 16:
+                    n_gpu_layers = 32
+                elif vram_gb > 8:
+                    n_gpu_layers = 24
+                elif vram_gb > 4:
+                    n_gpu_layers = 16
+                else:
+                    n_gpu_layers = 8
+
+                print(f"CUDA available with {vram_gb:.1f}GB VRAM. Using {n_gpu_layers} GPU layers.")
+            else:
+                print("CUDA not available. Using CPU only.")
+
+            # Load the model
+            self.model = Llama(
+                model_path=model_path,
+                n_ctx=4096,  # Context window size
+                n_gpu_layers=n_gpu_layers,
+                verbose=False
+            )
+
+            print(f"✓ GGUF model loaded successfully: {os.path.basename(model_path)}")
+
+        except ImportError as e:
+            raise ImportError(f"Failed to import llama_cpp. Please install with: pip install llama-cpp-python. Error: {str(e)}")
+        except Exception as e:
+            raise Exception(f"Failed to load GGUF model: {str(e)}")
+
+    def __call__(self, prompt, max_new_tokens=512, temperature=0.1, top_p=0.95, **kwargs):
+        """Generate text using the GGUF model"""
+        if not self.model:
+            raise ValueError("Model not loaded")
+
+        # Generate text
+        result = self.model(
+            prompt,
+            max_tokens=max_new_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            echo=False
+        )
+
+        # Format result to match transformers pipeline output
+        formatted_result = [{
+            "generated_text": result["choices"][0]["text"]
+        }]
+
+        return formatted_result
+
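Before the LocalRAGAgent changes below, a minimal usage sketch of the GGUFModelHandler added above (illustrative, not part of the commit): it assumes llama-cpp-python is installed and uses an example GGUF repo id. The handler mirrors the transformers text-generation pipeline's output shape, so callers read result[0]["generated_text"].

# Sketch only: drive the new GGUFModelHandler directly; the repo id is illustrative.
handler = GGUFModelHandler("TheBloke/Mistral-7B-Instruct-v0.2-GGUF")

# __call__ returns a list with one dict holding "generated_text",
# matching the transformers pipeline output format used elsewhere in the agent.
result = handler("Question: What is retrieval-augmented generation?\nAnswer:", max_new_tokens=128)
print(result[0]["generated_text"])

Note that _load_model() picks n_gpu_layers from total VRAM: a 12 GB GPU falls into the ">8" bucket and offloads 24 layers, while anything above 24 GB offloads every layer (n_gpu_layers=-1).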
 class LocalRAGAgent:
-    def __init__(self, vector_store: VectorStore, model_name: str = "mistralai/Mistral-7B-Instruct-v0.2", use_cot: bool = False, collection: str = None, skip_analysis: bool = False):
-        """Initialize local RAG agent with vector store and local LLM"""
+    def __init__(self, vector_store: VectorStore, model_name: str = "mistralai/Mistral-7B-Instruct-v0.2",
+                 use_cot: bool = False, collection: str = None, skip_analysis: bool = False,
+                 quantization: str = None):
+        """Initialize local RAG agent with vector store and local LLM
+
+        Args:
+            vector_store: Vector store for retrieving context
+            model_name: HuggingFace model name/path or GGUF model path/repo
+            use_cot: Whether to use Chain of Thought reasoning
+            collection: Collection to search in (PDF, Repository, or General Knowledge)
+            skip_analysis: Whether to skip query analysis (kept for backward compatibility)
+            quantization: Quantization method to use (None, '4bit', '8bit')
+        """
         self.vector_store = vector_store
         self.use_cot = use_cot
         self.collection = collection
+        self.quantization = quantization
+        self.model_name = model_name
         # skip_analysis parameter kept for backward compatibility but no longer used

-        # Load HuggingFace token from config
-        try:
-            with open('config.yaml', 'r') as f:
-                config = yaml.safe_load(f)
-                token = config.get('HUGGING_FACE_HUB_TOKEN')
-                if not token:
-                    raise ValueError("HUGGING_FACE_HUB_TOKEN not found in config.yaml")
-        except Exception as e:
-            raise Exception(f"Failed to load HuggingFace token from config.yaml: {str(e)}")
-
-        # Load model and tokenizer
-        print("\nLoading model and tokenizer...")
-        print("Note: Initial loading and inference with Mistral-7B can take 1-5 minutes depending on your hardware.")
-        print("Subsequent queries will be faster but may still take 30-60 seconds per response.")
+        # Check if this is a GGUF model
+        self.is_gguf = model_name.endswith('.gguf') or 'GGUF' in model_name

-        # Check if CUDA is available and set appropriate dtype
-        if torch.cuda.is_available():
-            print("CUDA is available. Using GPU acceleration.")
-            dtype = torch.float16
+        if self.is_gguf:
+            # Load GGUF model
+            print("\nLoading GGUF model...")
+            print(f"Model: {model_name}")
+            print("Note: Initial loading and inference can take 1-5 minutes depending on your hardware.")
+
+            # Initialize GGUF model handler
+            self.gguf_handler = GGUFModelHandler(model_name)
+
+            # Create pipeline-like interface
+            self.pipeline = self.gguf_handler
+
         else:
-            print("CUDA is not available. Using CPU only (this will be slow).")
-            dtype = torch.float32
-
-        self.model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            torch_dtype=dtype,
-            device_map="auto",
-            token=token,
-            # Add optimization flags
-            low_cpu_mem_usage=True,
-            offload_folder="offload"
-        )
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
-
-        # Create text generation pipeline with optimized settings
-        self.pipeline = pipeline(
-            "text-generation",
-            model=self.model,
-            tokenizer=self.tokenizer,
-            max_new_tokens=512,
-            do_sample=True,
-            temperature=0.1,
-            top_p=0.95,
-            device_map="auto"
-        )
-        print("✓ Model loaded successfully")
+            # Load HuggingFace token from config
+            try:
+                with open('config.yaml', 'r') as f:
+                    config = yaml.safe_load(f)
+                    token = config.get('HUGGING_FACE_HUB_TOKEN')
+                    if not token:
+                        raise ValueError("HUGGING_FACE_HUB_TOKEN not found in config.yaml")
+            except Exception as e:
+                raise Exception(f"Failed to load HuggingFace token from config.yaml: {str(e)}")
+
+            # Load model and tokenizer
+            print("\nLoading model and tokenizer...")
+            print(f"Model: {model_name}")
+            if quantization:
+                print(f"Quantization: {quantization}")
+            print("Note: Initial loading and inference can take 1-5 minutes depending on your hardware.")
+            print("Subsequent queries will be faster but may still take 30-60 seconds per response.")
+
+            # Check if CUDA is available and set appropriate dtype
+            if torch.cuda.is_available():
+                print("CUDA is available. Using GPU acceleration.")
+                dtype = torch.float16
+            else:
+                print("CUDA is not available. Using CPU only (this will be slow).")
+                dtype = torch.float32
+
+            # Set up model loading parameters
+            model_kwargs = {
+                "torch_dtype": dtype,
+                "device_map": "auto",
+                "token": token,
+                "low_cpu_mem_usage": True,
+                "offload_folder": "offload"
+            }
+
+            # Apply quantization if specified
+            if quantization == '4bit':
+                try:
+                    from transformers import BitsAndBytesConfig
+                    quantization_config = BitsAndBytesConfig(
+                        load_in_4bit=True,
+                        bnb_4bit_compute_dtype=torch.float16,
+                        bnb_4bit_use_double_quant=True,
+                        bnb_4bit_quant_type="nf4"
+                    )
+                    model_kwargs["quantization_config"] = quantization_config
+                    print("Using 4-bit quantization with bitsandbytes")
+                except ImportError:
+                    print("Warning: bitsandbytes not installed. Falling back to standard loading.")
+                    print("To use 4-bit quantization, install bitsandbytes: pip install bitsandbytes")
+            elif quantization == '8bit':
+                try:
+                    from transformers import BitsAndBytesConfig
+                    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+                    model_kwargs["quantization_config"] = quantization_config
+                    print("Using 8-bit quantization with bitsandbytes")
+                except ImportError:
+                    print("Warning: bitsandbytes not installed. Falling back to standard loading.")
+                    print("To use 8-bit quantization, install bitsandbytes: pip install bitsandbytes")
+
+            # Load model with appropriate settings
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                **model_kwargs
+            )
+            self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
+
+            # Create text generation pipeline with optimized settings
+            self.pipeline = pipeline(
+                "text-generation",
+                model=self.model,
+                tokenizer=self.tokenizer,
+                max_new_tokens=512,
+                do_sample=True,
+                temperature=0.1,
+                top_p=0.95,
+                device_map="auto"
+            )
+            print("✓ Model loaded successfully")

         # Create LLM wrapper
         self.llm = LocalLLM(self.pipeline)
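Usage sketch for the updated LocalRAGAgent constructor, covering the new quantization parameter and GGUF routing. The import paths and VectorStore construction below are assumptions about the surrounding repo, and the model ids are examples; the transformers path still expects HUGGING_FACE_HUB_TOKEN in config.yaml, as shown in the diff.

# Sketch only: import paths and VectorStore setup are assumed, not taken from this commit.
from vector_store import VectorStore        # hypothetical import path; adjust to the repo layout
from local_rag_agent import LocalRAGAgent   # agentic_rag/local_rag_agent.py

vector_store = VectorStore()  # assumed default construction; adjust to the repo's actual setup

# HuggingFace model loaded in 4-bit through the new bitsandbytes path
agent_4bit = LocalRAGAgent(vector_store,
                           model_name="mistralai/Mistral-7B-Instruct-v0.2",
                           quantization="4bit")

# A model name containing "GGUF" (or ending in .gguf) is routed through GGUFModelHandler
agent_gguf = LocalRAGAgent(vector_store, model_name="TheBloke/Mistral-7B-Instruct-v0.2-GGUF")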

0 commit comments
