8 | 8 | import os |
9 | 9 | import logging |
10 | 10 | import time |
| 11 | +import json |
| 12 | +from pathlib import Path |
11 | 13 |
12 | 14 | # Configure logging |
13 | 15 | logging.basicConfig( |
@@ -41,60 +43,246 @@ def __init__(self, content): |
41 | 43 |
42 | 44 | return Response(result.strip()) |
43 | 45 |
| 46 | +class GGUFModelHandler: |
| 47 | + """Handler for GGUF models using llama-cpp-python""" |
| 48 | + def __init__(self, model_path_or_repo_id: str): |
| 49 | + """Initialize GGUF model handler |
| 50 | + |
| 51 | + Args: |
| 52 | + model_path_or_repo_id: Local path to GGUF model or HuggingFace repo ID |
| 53 | + """ |
| 54 | + self.model_path_or_repo_id = model_path_or_repo_id |
| 55 | + self.model = None |
| 56 | + self._load_model() |
| 57 | + |
| 58 | + def _load_model(self): |
| 59 | + """Load GGUF model using llama-cpp-python""" |
| 60 | + try: |
| 61 | + from llama_cpp import Llama |
| 62 | + |
| 63 | + # Check if model_path is a local file or HuggingFace repo ID |
| 64 | + if os.path.exists(self.model_path_or_repo_id): |
| 65 | + model_path = self.model_path_or_repo_id |
| 66 | + else: |
| 67 | + # Download from HuggingFace |
| 68 | + from huggingface_hub import hf_hub_download |
| 69 | + |
| 70 | + # Try to load HuggingFace token from config |
| 71 | + try: |
| 72 | + with open('config.yaml', 'r') as f: |
| 73 | + config = yaml.safe_load(f) |
| 74 | + token = config.get('HUGGING_FACE_HUB_TOKEN') |
| 75 | + except Exception: |
| 76 | + token = None |
| 77 | + |
| 78 | + # Extract repo_id and filename |
| 79 | + parts = self.model_path_or_repo_id.split('/') |
| 80 | + if len(parts) < 2: |
| 81 | + raise ValueError(f"Invalid HuggingFace repo ID: {self.model_path_or_repo_id}") |
| 82 | + |
| 83 | + repo_id = '/'.join(parts[:2]) |
| 84 | + |
| 85 | + # Find the GGUF file in the repo |
| 86 | + from huggingface_hub import list_repo_files |
| 87 | + files = list_repo_files(repo_id, token=token) |
| 88 | + gguf_files = [f for f in files if f.endswith('.gguf')] |
| 89 | + |
| 90 | + if not gguf_files: |
| 91 | + raise ValueError(f"No GGUF files found in repo: {repo_id}") |
| 92 | + |
| 93 | + # Use the first GGUF file or try to find a specific one if specified |
| 94 | + if len(parts) > 2: |
| 95 | + # Try to find a specific file if specified in the path |
| 96 | + specified_file = '/'.join(parts[2:]) |
| 97 | + matching_files = [f for f in gguf_files if specified_file in f] |
| 98 | + if matching_files: |
| 99 | + filename = matching_files[0] |
| 100 | + else: |
| 101 | + filename = gguf_files[0] |
| 102 | + else: |
| 103 | + filename = gguf_files[0] |
| 104 | + |
| 105 | + print(f"Downloading GGUF model: {filename} from {repo_id}") |
| 106 | + model_path = hf_hub_download( |
| 107 | + repo_id=repo_id, |
| 108 | + filename=filename, |
| 109 | + token=token |
| 110 | + ) |
| 111 | + |
| 112 | + # Determine optimal n_gpu_layers based on available VRAM |
| 113 | + n_gpu_layers = 0 |
| 114 | + if torch.cuda.is_available(): |
| 115 | + # Get available VRAM in GB |
| 116 | + vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3) |
| 117 | + |
| 118 | + # Simple heuristic for n_gpu_layers based on VRAM |
| 119 | + if vram_gb > 24: |
| 120 | + n_gpu_layers = -1 # Use all layers |
| 121 | + elif vram_gb > 16: |
| 122 | + n_gpu_layers = 32 |
| 123 | + elif vram_gb > 8: |
| 124 | + n_gpu_layers = 24 |
| 125 | + elif vram_gb > 4: |
| 126 | + n_gpu_layers = 16 |
| 127 | + else: |
| 128 | + n_gpu_layers = 8 |
| 129 | + |
| 130 | + print(f"CUDA available with {vram_gb:.1f}GB VRAM. Using {n_gpu_layers} GPU layers.") |
| 131 | + else: |
| 132 | + print("CUDA not available. Using CPU only.") |
| 133 | + |
| 134 | + # Load the model |
| 135 | + self.model = Llama( |
| 136 | + model_path=model_path, |
| 137 | + n_ctx=4096, # Context window size |
| 138 | + n_gpu_layers=n_gpu_layers, |
| 139 | + verbose=False |
| 140 | + ) |
| 141 | + |
| 142 | + print(f"✓ GGUF model loaded successfully: {os.path.basename(model_path)}") |
| 143 | + |
| 144 | + except ImportError as e: |
| 145 | + raise ImportError(f"Failed to import llama_cpp. Please install with: pip install llama-cpp-python. Error: {str(e)}") |
| 146 | + except Exception as e: |
| 147 | + raise Exception(f"Failed to load GGUF model: {str(e)}") |
| 148 | + |
| 149 | + def __call__(self, prompt, max_new_tokens=512, temperature=0.1, top_p=0.95, **kwargs): |
| 150 | + """Generate text using the GGUF model""" |
| 151 | + if not self.model: |
| 152 | + raise ValueError("Model not loaded") |
| 153 | + |
| 154 | + # Generate text |
| 155 | + result = self.model( |
| 156 | + prompt, |
| 157 | + max_tokens=max_new_tokens, |
| 158 | + temperature=temperature, |
| 159 | + top_p=top_p, |
| 160 | + echo=False |
| 161 | + ) |
| 162 | + |
| 163 | + # Format result to match transformers pipeline output |
| 164 | + formatted_result = [{ |
| 165 | + "generated_text": result["choices"][0]["text"] |
| 166 | + }] |
| 167 | + |
| 168 | + return formatted_result |
| 169 | + |
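A minimal usage sketch of the new handler (the GGUF file path below is illustrative and not part of this change): because `__call__` returns a list with a single `generated_text` dict, callers can treat the handler like a `transformers` text-generation pipeline.

```python
# Illustrative only: the local GGUF path is a placeholder, not a file shipped with this repo.
handler = GGUFModelHandler("models/mistral-7b-instruct-v0.2.Q4_K_M.gguf")

# Output mirrors a transformers text-generation pipeline:
# a list containing one dict with a "generated_text" key.
result = handler("Explain retrieval-augmented generation in one sentence.",
                 max_new_tokens=128, temperature=0.1)
print(result[0]["generated_text"])
```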
44 | 170 | class LocalRAGAgent: |
45 | | - def __init__(self, vector_store: VectorStore, model_name: str = "mistralai/Mistral-7B-Instruct-v0.2", use_cot: bool = False, collection: str = None, skip_analysis: bool = False): |
46 | | - """Initialize local RAG agent with vector store and local LLM""" |
| 171 | + def __init__(self, vector_store: VectorStore, model_name: str = "mistralai/Mistral-7B-Instruct-v0.2", |
| 172 | + use_cot: bool = False, collection: str = None, skip_analysis: bool = False, |
| 173 | + quantization: str = None): |
| 174 | + """Initialize local RAG agent with vector store and local LLM |
| 175 | + |
| 176 | + Args: |
| 177 | + vector_store: Vector store for retrieving context |
| 178 | + model_name: HuggingFace model name/path or GGUF model path/repo |
| 179 | + use_cot: Whether to use Chain of Thought reasoning |
| 180 | + collection: Collection to search in (PDF, Repository, or General Knowledge) |
| 181 | + skip_analysis: Whether to skip query analysis (kept for backward compatibility) |
| 182 | + quantization: Quantization method to use (None, '4bit', '8bit') |
| 183 | + """ |
47 | 184 | self.vector_store = vector_store |
48 | 185 | self.use_cot = use_cot |
49 | 186 | self.collection = collection |
| 187 | + self.quantization = quantization |
| 188 | + self.model_name = model_name |
50 | 189 | # skip_analysis parameter kept for backward compatibility but no longer used |
51 | 190 |
52 | | - # Load HuggingFace token from config |
53 | | - try: |
54 | | - with open('config.yaml', 'r') as f: |
55 | | - config = yaml.safe_load(f) |
56 | | - token = config.get('HUGGING_FACE_HUB_TOKEN') |
57 | | - if not token: |
58 | | - raise ValueError("HUGGING_FACE_HUB_TOKEN not found in config.yaml") |
59 | | - except Exception as e: |
60 | | - raise Exception(f"Failed to load HuggingFace token from config.yaml: {str(e)}") |
61 | | - |
62 | | - # Load model and tokenizer |
63 | | - print("\nLoading model and tokenizer...") |
64 | | - print("Note: Initial loading and inference with Mistral-7B can take 1-5 minutes depending on your hardware.") |
65 | | - print("Subsequent queries will be faster but may still take 30-60 seconds per response.") |
| 191 | + # Check if this is a GGUF model |
| 192 | + self.is_gguf = model_name.endswith('.gguf') or 'GGUF' in model_name |
66 | 193 |
67 | | - # Check if CUDA is available and set appropriate dtype |
68 | | - if torch.cuda.is_available(): |
69 | | - print("CUDA is available. Using GPU acceleration.") |
70 | | - dtype = torch.float16 |
| 194 | + if self.is_gguf: |
| 195 | + # Load GGUF model |
| 196 | + print("\nLoading GGUF model...") |
| 197 | + print(f"Model: {model_name}") |
| 198 | + print("Note: Initial loading and inference can take 1-5 minutes depending on your hardware.") |
| 199 | + |
| 200 | + # Initialize GGUF model handler |
| 201 | + self.gguf_handler = GGUFModelHandler(model_name) |
| 202 | + |
| 203 | + # Create pipeline-like interface |
| 204 | + self.pipeline = self.gguf_handler |
| 205 | + |
71 | 206 | else: |
72 | | - print("CUDA is not available. Using CPU only (this will be slow).") |
73 | | - dtype = torch.float32 |
74 | | - |
75 | | - self.model = AutoModelForCausalLM.from_pretrained( |
76 | | - model_name, |
77 | | - torch_dtype=dtype, |
78 | | - device_map="auto", |
79 | | - token=token, |
80 | | - # Add optimization flags |
81 | | - low_cpu_mem_usage=True, |
82 | | - offload_folder="offload" |
83 | | - ) |
84 | | - self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=token) |
85 | | - |
86 | | - # Create text generation pipeline with optimized settings |
87 | | - self.pipeline = pipeline( |
88 | | - "text-generation", |
89 | | - model=self.model, |
90 | | - tokenizer=self.tokenizer, |
91 | | - max_new_tokens=512, |
92 | | - do_sample=True, |
93 | | - temperature=0.1, |
94 | | - top_p=0.95, |
95 | | - device_map="auto" |
96 | | - ) |
97 | | - print("✓ Model loaded successfully") |
| 207 | + # Load HuggingFace token from config |
| 208 | + try: |
| 209 | + with open('config.yaml', 'r') as f: |
| 210 | + config = yaml.safe_load(f) |
| 211 | + token = config.get('HUGGING_FACE_HUB_TOKEN') |
| 212 | + if not token: |
| 213 | + raise ValueError("HUGGING_FACE_HUB_TOKEN not found in config.yaml") |
| 214 | + except Exception as e: |
| 215 | + raise Exception(f"Failed to load HuggingFace token from config.yaml: {str(e)}") |
| 216 | + |
| 217 | + # Load model and tokenizer |
| 218 | + print("\nLoading model and tokenizer...") |
| 219 | + print(f"Model: {model_name}") |
| 220 | + if quantization: |
| 221 | + print(f"Quantization: {quantization}") |
| 222 | + print("Note: Initial loading and inference can take 1-5 minutes depending on your hardware.") |
| 223 | + print("Subsequent queries will be faster but may still take 30-60 seconds per response.") |
| 224 | + |
| 225 | + # Check if CUDA is available and set appropriate dtype |
| 226 | + if torch.cuda.is_available(): |
| 227 | + print("CUDA is available. Using GPU acceleration.") |
| 228 | + dtype = torch.float16 |
| 229 | + else: |
| 230 | + print("CUDA is not available. Using CPU only (this will be slow).") |
| 231 | + dtype = torch.float32 |
| 232 | + |
| 233 | + # Set up model loading parameters |
| 234 | + model_kwargs = { |
| 235 | + "torch_dtype": dtype, |
| 236 | + "device_map": "auto", |
| 237 | + "token": token, |
| 238 | + "low_cpu_mem_usage": True, |
| 239 | + "offload_folder": "offload" |
| 240 | + } |
| 241 | + |
| 242 | + # Apply quantization if specified |
| 243 | + if quantization == '4bit': |
| 244 | + try: |
| 245 | + from transformers import BitsAndBytesConfig |
| 246 | + quantization_config = BitsAndBytesConfig( |
| 247 | + load_in_4bit=True, |
| 248 | + bnb_4bit_compute_dtype=torch.float16, |
| 249 | + bnb_4bit_use_double_quant=True, |
| 250 | + bnb_4bit_quant_type="nf4" |
| 251 | + ) |
| 252 | + model_kwargs["quantization_config"] = quantization_config |
| 253 | + print("Using 4-bit quantization with bitsandbytes") |
| 254 | + except ImportError: |
| 255 | + print("Warning: bitsandbytes not installed. Falling back to standard loading.") |
| 256 | + print("To use 4-bit quantization, install bitsandbytes: pip install bitsandbytes") |
| 257 | + elif quantization == '8bit': |
| 258 | + try: |
| 259 | + from transformers import BitsAndBytesConfig |
| 260 | + quantization_config = BitsAndBytesConfig(load_in_8bit=True) |
| 261 | + model_kwargs["quantization_config"] = quantization_config |
| 262 | + print("Using 8-bit quantization with bitsandbytes") |
| 263 | + except ImportError: |
| 264 | + print("Warning: bitsandbytes not installed. Falling back to standard loading.") |
| 265 | + print("To use 8-bit quantization, install bitsandbytes: pip install bitsandbytes") |
| 266 | + |
| 267 | + # Load model with appropriate settings |
| 268 | + self.model = AutoModelForCausalLM.from_pretrained( |
| 269 | + model_name, |
| 270 | + **model_kwargs |
| 271 | + ) |
| 272 | + self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=token) |
| 273 | + |
| 274 | + # Create text generation pipeline with optimized settings |
| 275 | + self.pipeline = pipeline( |
| 276 | + "text-generation", |
| 277 | + model=self.model, |
| 278 | + tokenizer=self.tokenizer, |
| 279 | + max_new_tokens=512, |
| 280 | + do_sample=True, |
| 281 | + temperature=0.1, |
| 282 | + top_p=0.95, |
| 283 | + device_map="auto" |
| 284 | + ) |
| 285 | + print("✓ Model loaded successfully") |
98 | 286 |
99 | 287 | # Create LLM wrapper |
100 | 288 | self.llm = LocalLLM(self.pipeline) |
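For reference, a sketch of how the extended constructor could be invoked under this change (the repo IDs and the `VectorStore` setup are placeholders; their real arguments are not shown in this diff):

```python
# Placeholder setup: the actual VectorStore constructor arguments are outside this diff.
store = VectorStore()

# A ".gguf" suffix or "GGUF" in the model name routes loading through GGUFModelHandler.
gguf_agent = LocalRAGAgent(store, model_name="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
                           use_cot=True)

# A standard HuggingFace model loaded with 4-bit bitsandbytes quantization.
quantized_agent = LocalRAGAgent(store, model_name="mistralai/Mistral-7B-Instruct-v0.2",
                                quantization="4bit")
```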