diff --git a/WATSONX_README.md b/WATSONX_README.md new file mode 100644 index 00000000..a21bcbc6 --- /dev/null +++ b/WATSONX_README.md @@ -0,0 +1,225 @@ +# Watson X Integration with Granite Models + +This branch adds support for IBM Watson X AI with Granite models as an alternative to Ollama for running LocalGPT. + +## Overview + +LocalGPT now supports two LLM backends: +1. **Ollama** (default): Run models locally using Ollama +2. **Watson X**: Use IBM's Granite models hosted on Watson X AI + +## What Changed + +- Added `WatsonXClient` class in `rag_system/utils/watsonx_client.py` that provides an Ollama-compatible interface for Watson X +- Updated `factory.py` and `main.py` to support backend switching via environment variable +- Added `ibm-watsonx-ai` SDK dependency to `requirements.txt` +- Configuration now supports both backends through environment variables + +## Prerequisites + +To use Watson X with Granite models, you need: + +1. IBM Cloud account with Watson X access +2. Watson X API key +3. Watson X project ID + +### Getting Your Credentials + +1. Go to [IBM Cloud](https://cloud.ibm.com/) +2. Navigate to Watson X AI service +3. Create or select a project +4. Get your API key from IBM Cloud IAM +5. Copy your project ID from the Watson X project settings + +## Configuration + +### Environment Variables + +Create a `.env` file or set these environment variables: + +```bash +# Choose LLM backend (default: ollama) +LLM_BACKEND=watsonx + +# Watson X Configuration +WATSONX_API_KEY=your_api_key_here +WATSONX_PROJECT_ID=your_project_id_here +WATSONX_URL=https://us-south.ml.cloud.ibm.com + +# Model Configuration +WATSONX_GENERATION_MODEL=ibm/granite-13b-chat-v2 +WATSONX_ENRICHMENT_MODEL=ibm/granite-8b-japanese +``` + +### Available Granite Models + +Watson X offers several Granite models: +- `ibm/granite-13b-chat-v2` - General purpose chat model +- `ibm/granite-13b-instruct-v2` - Instruction-following model +- `ibm/granite-20b-multilingual` - Multilingual support +- `ibm/granite-8b-japanese` - Lightweight Japanese model +- `ibm/granite-3b-code-instruct` - Code generation model + +For a full list of available models, visit the [Watson X documentation](https://www.ibm.com/docs/en/watsonx/saas?topic=solutions-supported-foundation-models). + +## Installation + +1. 
Install the Watson X SDK: +```bash +pip install ibm-watsonx-ai>=1.3.39 +``` + +Or install all dependencies: +```bash +pip install -r rag_system/requirements.txt +``` + +## Usage + +### Running with Watson X + +Once configured, simply set the environment variable and run as normal: + +```bash +export LLM_BACKEND=watsonx +python -m rag_system.main api +``` + +Or in Python: + +```python +import os +os.environ['LLM_BACKEND'] = 'watsonx' + +from rag_system.factory import get_agent + +# Get agent with Watson X backend +agent = get_agent(mode="default") + +# Use as normal +result = agent.run("What is artificial intelligence?") +print(result) +``` + +### Switching Between Backends + +You can easily switch between Ollama and Watson X: + +```bash +# Use Ollama (local) +export LLM_BACKEND=ollama +python -m rag_system.main api + +# Use Watson X (cloud) +export LLM_BACKEND=watsonx +python -m rag_system.main api +``` + +## Features + +The Watson X client supports all the key features used by LocalGPT: + +- ✅ Text generation / completion +- ✅ Async generation +- ✅ Streaming responses +- ✅ Embeddings (if using Watson X embedding models) +- ✅ Custom generation parameters (temperature, max_tokens, top_p, top_k) +- ⚠️ Image/multimodal support (limited, depends on model availability) + +## API Compatibility + +The `WatsonXClient` provides the same interface as `OllamaClient`: + +```python +from rag_system.utils.watsonx_client import WatsonXClient + +client = WatsonXClient( + api_key="your_api_key", + project_id="your_project_id" +) + +# Generate completion +response = client.generate_completion( + model="ibm/granite-13b-chat-v2", + prompt="Explain quantum computing" +) + +print(response['response']) + +# Stream completion +for chunk in client.stream_completion( + model="ibm/granite-13b-chat-v2", + prompt="Write a story about AI" +): + print(chunk, end='', flush=True) +``` + +## Limitations + +1. **Embedding Models**: Watson X uses different embedding models than Ollama. Make sure to configure embedding models appropriately in `main.py` if needed. + +2. **Multimodal Support**: Image support varies by model availability in Watson X. Not all Granite models support multimodal inputs. + +3. **Streaming**: Streaming support depends on the Watson X SDK version and may fall back to returning the full response at once. + +4. **Rate Limits**: Watson X has API rate limits that may differ from local Ollama usage. Monitor your usage accordingly. 
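If you do run into rate limiting, a thin retry wrapper around the client is usually enough. The sketch below is illustrative rather than part of this integration: the retry count and backoff values are arbitrary, and it only relies on `WatsonXClient.generate_completion` returning a dict with an `error` key (instead of raising) when a call fails.

```python
import time

from rag_system.utils.watsonx_client import WatsonXClient


def generate_with_retry(client: WatsonXClient, model: str, prompt: str,
                        retries: int = 3, backoff_seconds: float = 2.0) -> dict:
    """Call generate_completion, retrying with exponential backoff on failure."""
    response = {"response": "", "error": "no attempts made"}
    for attempt in range(retries):
        response = client.generate_completion(model=model, prompt=prompt)
        # The client reports failures via an 'error' key rather than raising,
        # so treat that key as a retryable condition.
        if "error" not in response:
            break
        time.sleep(backoff_seconds * (2 ** attempt))
    return response


client = WatsonXClient(api_key="your_api_key", project_id="your_project_id")
result = generate_with_retry(client, "ibm/granite-13b-chat-v2", "Explain quantum computing")
print(result.get("response", ""))
```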
+ +## Troubleshooting + +### Authentication Errors + +If you see authentication errors: +- Verify your API key is correct +- Check that your project ID matches an existing Watson X project +- Ensure your IBM Cloud account has Watson X access + +### Model Not Found + +If you get model not found errors: +- Verify the model ID is correct (e.g., `ibm/granite-13b-chat-v2`) +- Check that the model is available in your Watson X instance +- Some models may require additional permissions + +### Connection Errors + +If you experience connection issues: +- Check your internet connection +- Verify the Watson X URL is correct for your region +- Check IBM Cloud status page for service outages + +## Cost Considerations + +Unlike local Ollama, Watson X is a cloud service with usage-based pricing: +- Token-based pricing for generation +- Consider your query volume +- Monitor usage through IBM Cloud dashboard + +## Reverting to Ollama + +To switch back to local Ollama: + +```bash +unset LLM_BACKEND # or set LLM_BACKEND=ollama +python -m rag_system.main api +``` + +## Support + +For Watson X specific issues: +- [IBM Watson X Documentation](https://www.ibm.com/docs/en/watsonx/saas) +- [Watson X Developer Hub](https://www.ibm.com/watsonx/developer/) +- [IBM Cloud Support](https://cloud.ibm.com/docs/get-support) + +For LocalGPT issues: +- [LocalGPT GitHub Issues](https://github.com/PromtEngineer/localGPT/issues) + +## Contributing + +If you find issues with the Watson X integration or want to add features: +1. Create an issue describing the problem/feature +2. Submit a pull request with your changes +3. Ensure all tests pass + +## License + +This integration follows the same license as LocalGPT (MIT License). diff --git a/env.example.watsonx b/env.example.watsonx new file mode 100644 index 00000000..5c3f3f86 --- /dev/null +++ b/env.example.watsonx @@ -0,0 +1,61 @@ +# ==================================================================== +# LocalGPT Watson X Configuration Example +# ==================================================================== +# This file shows how to configure LocalGPT to use IBM Watson X AI +# with Granite models instead of local Ollama. +# +# Copy this file to .env and fill in your credentials: +# cp .env.example.watsonx .env +# ==================================================================== + +# LLM Backend Selection +# Options: "ollama" (default) or "watsonx" +LLM_BACKEND=watsonx + +# ==================================================================== +# Watson X Credentials +# ==================================================================== +# Get these from your IBM Cloud Watson X project: +# 1. Go to https://cloud.ibm.com/ +# 2. Navigate to Watson X AI service +# 3. Create or select a project +# 4. Get API key from IBM Cloud IAM +# 5. 
Copy project ID from project settings + +# Your IBM Cloud API key +WATSONX_API_KEY=your_api_key_here + +# Your Watson X project ID +WATSONX_PROJECT_ID=your_project_id_here + +# Watson X service URL (default: us-south region) +# Options: +# - https://us-south.ml.cloud.ibm.com (US South) +# - https://eu-de.ml.cloud.ibm.com (Frankfurt) +# - https://eu-gb.ml.cloud.ibm.com (London) +# - https://jp-tok.ml.cloud.ibm.com (Tokyo) +WATSONX_URL=https://us-south.ml.cloud.ibm.com + +# ==================================================================== +# Model Configuration +# ==================================================================== +# Granite models available on Watson X + +# Main generation model for answering queries +# Options: +# - ibm/granite-13b-chat-v2 (recommended for chat) +# - ibm/granite-13b-instruct-v2 (for instructions) +# - ibm/granite-20b-multilingual (for multilingual) +# - ibm/granite-3b-code-instruct (for code) +WATSONX_GENERATION_MODEL=ibm/granite-13b-chat-v2 + +# Lightweight model for enrichment and routing +# Use a smaller model for better performance on simple tasks +WATSONX_ENRICHMENT_MODEL=ibm/granite-8b-japanese + +# ==================================================================== +# Optional: Ollama Configuration (fallback) +# ==================================================================== +# These settings are used if LLM_BACKEND=ollama + +OLLAMA_HOST=http://localhost:11434 diff --git a/rag_system/factory.py b/rag_system/factory.py index 7ed6b83e..77a79e89 100644 --- a/rag_system/factory.py +++ b/rag_system/factory.py @@ -7,11 +7,30 @@ def get_agent(mode: str = "default"): """ from rag_system.agent.loop import Agent from rag_system.utils.ollama_client import OllamaClient - from rag_system.main import PIPELINE_CONFIGS, OLLAMA_CONFIG + from rag_system.main import PIPELINE_CONFIGS, OLLAMA_CONFIG, LLM_BACKEND, WATSONX_CONFIG load_dotenv() - llm_client = OllamaClient(host=OLLAMA_CONFIG["host"]) + # Initialize the appropriate LLM client based on backend configuration + if LLM_BACKEND.lower() == "watsonx": + from rag_system.utils.watsonx_client import WatsonXClient + + if not WATSONX_CONFIG["api_key"] or not WATSONX_CONFIG["project_id"]: + raise ValueError( + "Watson X configuration incomplete. Please set WATSONX_API_KEY and WATSONX_PROJECT_ID " + "environment variables." + ) + + llm_client = WatsonXClient( + api_key=WATSONX_CONFIG["api_key"], + project_id=WATSONX_CONFIG["project_id"], + url=WATSONX_CONFIG["url"] + ) + llm_config = WATSONX_CONFIG + else: + llm_client = OllamaClient(host=OLLAMA_CONFIG["host"]) + llm_config = OLLAMA_CONFIG + config = PIPELINE_CONFIGS.get(mode, PIPELINE_CONFIGS['default']) if 'storage' not in config: @@ -24,7 +43,7 @@ def get_agent(mode: str = "default"): agent = Agent( pipeline_configs=config, llm_client=llm_client, - ollama_config=OLLAMA_CONFIG + ollama_config=llm_config ) return agent @@ -33,11 +52,31 @@ def get_indexing_pipeline(mode: str = "default"): Factory function to get an instance of the Indexing Pipeline. 
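    Backend selection mirrors get_agent(): when LLM_BACKEND is "watsonx" a
    WatsonXClient is built from WATSONX_CONFIG, otherwise the local
    OllamaClient is used, and the selected config dict is passed through to
    the IndexingPipeline.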
""" from rag_system.pipelines.indexing_pipeline import IndexingPipeline - from rag_system.main import PIPELINE_CONFIGS, OLLAMA_CONFIG + from rag_system.main import PIPELINE_CONFIGS, OLLAMA_CONFIG, LLM_BACKEND, WATSONX_CONFIG from rag_system.utils.ollama_client import OllamaClient load_dotenv() - llm_client = OllamaClient(host=OLLAMA_CONFIG["host"]) + + # Initialize the appropriate LLM client based on backend configuration + if LLM_BACKEND.lower() == "watsonx": + from rag_system.utils.watsonx_client import WatsonXClient + + if not WATSONX_CONFIG["api_key"] or not WATSONX_CONFIG["project_id"]: + raise ValueError( + "Watson X configuration incomplete. Please set WATSONX_API_KEY and WATSONX_PROJECT_ID " + "environment variables." + ) + + llm_client = WatsonXClient( + api_key=WATSONX_CONFIG["api_key"], + project_id=WATSONX_CONFIG["project_id"], + url=WATSONX_CONFIG["url"] + ) + llm_config = WATSONX_CONFIG + else: + llm_client = OllamaClient(host=OLLAMA_CONFIG["host"]) + llm_config = OLLAMA_CONFIG + config = PIPELINE_CONFIGS.get(mode, PIPELINE_CONFIGS['default']) - return IndexingPipeline(config, llm_client, OLLAMA_CONFIG) \ No newline at end of file + return IndexingPipeline(config, llm_client, llm_config) \ No newline at end of file diff --git a/rag_system/main.py b/rag_system/main.py index bfde91ca..a1f50794 100644 --- a/rag_system/main.py +++ b/rag_system/main.py @@ -25,6 +25,9 @@ # ============================================================================ # All model configurations are centralized here to prevent conflicts +# LLM Backend Configuration +LLM_BACKEND = os.getenv("LLM_BACKEND", "ollama") + # Ollama Models Configuration (for inference via Ollama) OLLAMA_CONFIG = { "host": os.getenv("OLLAMA_HOST", "http://localhost:11434"), @@ -32,6 +35,14 @@ "enrichment_model": "qwen3:0.6b", # Lightweight model for routing/enrichment } +WATSONX_CONFIG = { + "api_key": os.getenv("WATSONX_API_KEY", ""), + "project_id": os.getenv("WATSONX_PROJECT_ID", ""), + "url": os.getenv("WATSONX_URL", "https://us-south.ml.cloud.ibm.com"), + "generation_model": os.getenv("WATSONX_GENERATION_MODEL", "ibm/granite-13b-chat-v2"), + "enrichment_model": os.getenv("WATSONX_ENRICHMENT_MODEL", "ibm/granite-8b-japanese"), # Lightweight model +} + # External Model Configuration (HuggingFace models used directly) EXTERNAL_MODELS = { "embedding_model": "Qwen/Qwen3-Embedding-0.6B", # HuggingFace embedding model (1024 dims - fresh start) @@ -165,8 +176,27 @@ def get_agent(mode: str = "default") -> Agent: """ load_dotenv() - # Initialize the Ollama client with the host from config - llm_client = OllamaClient(host=OLLAMA_CONFIG["host"]) + # Initialize the appropriate LLM client based on backend configuration + if LLM_BACKEND.lower() == "watsonx": + from rag_system.utils.watsonx_client import WatsonXClient + + if not WATSONX_CONFIG["api_key"] or not WATSONX_CONFIG["project_id"]: + raise ValueError( + "Watson X configuration incomplete. Please set WATSONX_API_KEY and WATSONX_PROJECT_ID " + "environment variables." 
+ ) + + llm_client = WatsonXClient( + api_key=WATSONX_CONFIG["api_key"], + project_id=WATSONX_CONFIG["project_id"], + url=WATSONX_CONFIG["url"] + ) + llm_config = WATSONX_CONFIG + print(f"🔧 Using Watson X backend with granite models") + else: + llm_client = OllamaClient(host=OLLAMA_CONFIG["host"]) + llm_config = OLLAMA_CONFIG + print(f"🔧 Using Ollama backend") # Get the configuration for the specified mode config = PIPELINE_CONFIGS.get(mode, PIPELINE_CONFIGS['default']) @@ -174,7 +204,7 @@ def get_agent(mode: str = "default") -> Agent: agent = Agent( pipeline_configs=config, llm_client=llm_client, - ollama_config=OLLAMA_CONFIG + ollama_config=llm_config ) return agent diff --git a/rag_system/requirements.txt b/rag_system/requirements.txt index b6eaa3af..1387f755 100644 --- a/rag_system/requirements.txt +++ b/rag_system/requirements.txt @@ -13,4 +13,5 @@ transformers sentencepiece accelerate docling -ocrmac \ No newline at end of file +ocrmac +ibm-watsonx-ai>=1.3.39 diff --git a/rag_system/utils/watsonx_client.py b/rag_system/utils/watsonx_client.py new file mode 100644 index 00000000..1c926346 --- /dev/null +++ b/rag_system/utils/watsonx_client.py @@ -0,0 +1,246 @@ +import json +from typing import List, Dict, Any, Optional +import base64 +from io import BytesIO +from PIL import Image + + +class WatsonXClient: + """ + A client for IBM Watson X AI that provides similar interface to OllamaClient + for seamless integration with the RAG system. + """ + def __init__( + self, + api_key: str, + project_id: str, + url: str = "https://us-south.ml.cloud.ibm.com", + ): + """ + Initialize the Watson X client. + + Args: + api_key: IBM Cloud API key for authentication + project_id: Watson X project ID + url: Watson X service URL (default: us-south region) + """ + self.api_key = api_key + self.project_id = project_id + self.url = url + + try: + from ibm_watsonx_ai import APIClient + from ibm_watsonx_ai import Credentials + from ibm_watsonx_ai.foundation_models import ModelInference + from ibm_watsonx_ai.foundation_models.schema import TextGenParameters + except ImportError: + raise ImportError( + "ibm-watsonx-ai package is required. " + "Install it with: pip install ibm-watsonx-ai" + ) + + self._APIClient = APIClient + self._Credentials = Credentials + self._ModelInference = ModelInference + self._TextGenParameters = TextGenParameters + + self.credentials = self._Credentials( + api_key=self.api_key, + url=self.url + ) + + self.client = self._APIClient(self.credentials) + self.client.set.default_project(self.project_id) + + def _image_to_base64(self, image: Image.Image) -> str: + """Converts a Pillow Image to a base64 string.""" + buffered = BytesIO() + image.save(buffered, format="PNG") + return base64.b64encode(buffered.getvalue()).decode('utf-8') + + def generate_embedding(self, model: str, text: str) -> List[float]: + """ + Generate embeddings using Watson X embedding models. + Note: This requires using Watson X embedding models through the embeddings API. 
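        A minimal illustrative call (the embedding model ID below is an assumption;
        use whichever embedding model is enabled for your Watson X project):

            vector = client.generate_embedding("ibm/slate-125m-english-rtrvr", "hello world")

        Returns an empty list if the call fails.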
+ """ + try: + from ibm_watsonx_ai.foundation_models import Embeddings + + embedding_model = Embeddings( + model_id=model, + credentials=self.credentials, + project_id=self.project_id + ) + + result = embedding_model.embed_query(text) + return result if isinstance(result, list) else [] + + except Exception as e: + print(f"Error generating embedding: {e}") + return [] + + def generate_completion( + self, + model: str, + prompt: str, + *, + format: str = "", + images: Optional[List[Image.Image]] = None, + enable_thinking: Optional[bool] = None, + **kwargs + ) -> Dict[str, Any]: + """ + Generates a completion using Watson X foundation models. + + Args: + model: The name/ID of the Watson X model (e.g., 'ibm/granite-13b-chat-v2') + prompt: The text prompt for the model + format: The format for the response (e.g., "json") + images: List of Pillow Image objects (for multimodal models) + enable_thinking: Optional flag (not used in Watson X, kept for compatibility) + **kwargs: Additional parameters for text generation + + Returns: + Dictionary with response in Ollama-compatible format + """ + try: + gen_params = {} + + if kwargs.get('max_tokens'): + gen_params['max_new_tokens'] = kwargs['max_tokens'] + if kwargs.get('temperature'): + gen_params['temperature'] = kwargs['temperature'] + if kwargs.get('top_p'): + gen_params['top_p'] = kwargs['top_p'] + if kwargs.get('top_k'): + gen_params['top_k'] = kwargs['top_k'] + + parameters = self._TextGenParameters(**gen_params) if gen_params else None + + model_inference = self._ModelInference( + model_id=model, + credentials=self.credentials, + project_id=self.project_id, + params=parameters + ) + + if images: + print("Warning: Image support in Watson X may vary by model") + result = model_inference.generate(prompt=prompt) + else: + result = model_inference.generate(prompt=prompt) + + generated_text = "" + if isinstance(result, dict): + generated_text = result.get('results', [{}])[0].get('generated_text', '') + else: + generated_text = str(result) + + return { + 'response': generated_text, + 'model': model, + 'done': True + } + + except Exception as e: + print(f"Error generating completion: {e}") + return {'response': '', 'error': str(e)} + + async def generate_completion_async( + self, + model: str, + prompt: str, + *, + format: str = "", + images: Optional[List[Image.Image]] = None, + enable_thinking: Optional[bool] = None, + timeout: int = 60, + **kwargs + ) -> Dict[str, Any]: + """ + Asynchronous version of generate_completion. + + Note: IBM Watson X SDK may not have native async support, + so this is a wrapper around the sync version. + """ + import asyncio + + loop = asyncio.get_event_loop() + return await loop.run_in_executor( + None, + lambda: self.generate_completion( + model, prompt, format=format, images=images, + enable_thinking=enable_thinking, **kwargs + ) + ) + + def stream_completion( + self, + model: str, + prompt: str, + *, + images: Optional[List[Image.Image]] = None, + enable_thinking: Optional[bool] = None, + **kwargs + ): + """ + Generator that yields partial response strings as they arrive. + + Note: Watson X streaming support depends on the SDK version and model. 
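        Typical usage (yields incremental text chunks; if the installed SDK does
        not expose generate_text_stream, the full response is yielded as a
        single chunk):

            for chunk in client.stream_completion(
                model="ibm/granite-13b-chat-v2",
                prompt="Write a story about AI",
            ):
                print(chunk, end="", flush=True)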
+ """ + try: + gen_params = {} + if kwargs.get('max_tokens'): + gen_params['max_new_tokens'] = kwargs['max_tokens'] + if kwargs.get('temperature'): + gen_params['temperature'] = kwargs['temperature'] + + parameters = self._TextGenParameters(**gen_params) if gen_params else None + + model_inference = self._ModelInference( + model_id=model, + credentials=self.credentials, + project_id=self.project_id, + params=parameters + ) + + try: + for chunk in model_inference.generate_text_stream(prompt=prompt): + if chunk: + yield chunk + except AttributeError: + result = model_inference.generate(prompt=prompt) + generated_text = "" + if isinstance(result, dict): + generated_text = result.get('results', [{}])[0].get('generated_text', '') + else: + generated_text = str(result) + yield generated_text + + except Exception as e: + print(f"Error in stream_completion: {e}") + yield "" + + +if __name__ == '__main__': + print("Watson X Client for IBM watsonx.ai integration") + print("This client provides Ollama-compatible interface for Watson X granite models") + print("\nTo use this client, you need:") + print("1. IBM Cloud API key") + print("2. Watson X project ID") + print("3. ibm-watsonx-ai package installed") + print("\nExample usage:") + print(""" + from rag_system.utils.watsonx_client import WatsonXClient + + client = WatsonXClient( + api_key="your-api-key", + project_id="your-project-id" + ) + + response = client.generate_completion( + model="ibm/granite-13b-chat-v2", + prompt="What is AI?" + ) + print(response['response']) + """)