Commit a843f13

Add Multimodal RAG with Elasticsearch Gotham City tutorial
1 parent 7a8db64 commit a843f13

File tree: 13 files changed (+866 −0)
README.md (96 additions & 0 deletions)

# Building a Multimodal RAG Pipeline with Elasticsearch: The Story of Gotham City

This repository contains the code for implementing a Multimodal Retrieval-Augmented Generation (RAG) system using Elasticsearch. The system processes and analyzes different types of evidence (images, audio, text, and depth maps) to solve a crime in Gotham City.

## Overview

The pipeline demonstrates how to:
- Generate unified embeddings for multiple modalities using ImageBind
- Store and search vectors efficiently in Elasticsearch
- Analyze evidence using GPT-4 to generate forensic reports

## Prerequisites

- Python 3.10+
- Elasticsearch cluster (cloud or local)
- OpenAI API key
- 8GB+ RAM
- GPU (optional but recommended)

## Quick Start

1. **Set Up the Environment**
```bash
# Create and activate a virtual environment
python -m venv env_mmrag
source env_mmrag/bin/activate  # Unix/macOS
# or
.\env_mmrag\Scripts\activate  # Windows

# Install dependencies
pip install -r requirements.txt
```

2. **Configure Credentials**
Create a `.env` file:
```env
ELASTICSEARCH_ENDPOINT="your-elasticsearch-endpoint"
ELASTIC_API_KEY="your-elastic-api-key"
OPENAI_API_KEY="your-openai-api-key"
```

3. **Run the Demo**
```bash
# Verify file structure
python stages/01-stage/files_check.py

# Generate embeddings
python stages/02-stage/test_embedding_generation.py

# Index content
python stages/03-stage/index_all_modalities.py

# Search and analyze
python stages/04-stage/rag_crime_analyze.py
```

## Project Structure

```
├── README.md
├── requirements.txt
├── src/
│   ├── embedding_generator.py   # ImageBind wrapper
│   ├── elastic_manager.py       # Elasticsearch operations
│   └── llm_analyzer.py          # GPT-4 integration
├── stages/
│   ├── 01-stage/                # File organization
│   ├── 02-stage/                # Embedding generation
│   ├── 03-stage/                # Elasticsearch indexing/search
│   └── 04-stage/                # Evidence analysis
└── data/                        # Sample data
    ├── images/
    ├── audios/
    ├── texts/
    └── depths/
```

## Sample Data

The repository includes sample evidence files:
- Images: Crime scene photos and security camera footage
- Audio: Suspicious sound recordings
- Text: Mysterious notes and riddles
- Depth Maps: 3D scene captures

## How It Works

1. **Evidence Collection**: Files are organized by modality in the `data/` directory
2. **Embedding Generation**: ImageBind converts each piece of evidence into a 1024-dimensional vector
3. **Vector Storage**: Elasticsearch stores embeddings with metadata for efficient retrieval
4. **Similarity Search**: New evidence is compared against the database using k-NN search
5. **Analysis**: GPT-4 analyzes the connections between evidence to identify suspects

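A minimal end-to-end sketch of these steps (the evidence paths are hypothetical; the GPT-4 analysis step is handled by `src/llm_analyzer.py` and omitted here):

```python
from src.embedding_generator import EmbeddingGenerator
from src.elastic_manager import ElasticsearchManager

generator = EmbeddingGenerator()
es_manager = ElasticsearchManager()

# Steps 1-3: embed a piece of evidence and store it with its metadata
embedding = generator.generate_embedding(["data/images/crime_scene_01.jpg"], "vision")
es_manager.index_content(
    embedding=embedding,
    modality="vision",
    description="Crime scene photo",
    content_path="data/images/crime_scene_01.jpg",
)

# Step 4: compare new evidence against the database using k-NN search
query = generator.generate_embedding(["data/images/new_clue.jpg"], "vision")
for hit in es_manager.search_similar(query, modality="vision", k=5):
    print(hit["content_path"], hit["score"])
```
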
## License

This project is licensed under the Elastic License 2.0.

requirements.txt (12 additions & 0 deletions)

elasticsearch>=8.11.0
torch>=2.0.0
torchvision>=0.15.0
torchaudio>=2.0.0
imagebind @ git+https://github.com/facebookresearch/ImageBind.git
openai>=1.0.0
python-dotenv>=1.0.0
numpy>=1.24.0
pillow>=10.0.0
opencv-python>=4.8.0
librosa>=0.10.0
matplotlib>=3.7.0

src/elastic_manager.py (87 additions & 0 deletions)

1+
from elasticsearch import Elasticsearch, helpers
2+
import base64
3+
import os
4+
from dotenv import load_dotenv
5+
import numpy as np
6+
7+
class ElasticsearchManager:
8+
"""Manages multimodal operations in Elasticsearch"""
9+
10+
def __init__(self):
11+
load_dotenv() # Load variables from .env
12+
self.es = self._connect_elastic()
13+
self.index_name = "multimodal_content"
14+
self._setup_index()
15+
16+
def _connect_elastic(self):
17+
"""Connects to Elasticsearch"""
18+
return Elasticsearch(
19+
os.getenv("ELASTICSEARCH_ENDPOINT"), # Elasticsearch endpoint
20+
api_key=os.getenv("ELASTIC_API_KEY")
21+
)
22+
23+
def _setup_index(self):
24+
"""Sets up the index if it doesn't exist"""
25+
if not self.es.indices.exists(index=self.index_name):
26+
mapping = {
27+
"mappings": {
28+
"properties": {
29+
"embedding": {
30+
"type": "dense_vector",
31+
"dims": 1024,
32+
"index": True,
33+
"similarity": "cosine"
34+
},
35+
"modality": {"type": "keyword"},
36+
"content": {"type": "binary"},
37+
"description": {"type": "text"},
38+
"metadata": {"type": "object"},
39+
"content_path": {"type": "text"}
40+
}
41+
}
42+
}
43+
self.es.indices.create(index=self.index_name, body=mapping)
44+
45+
def index_content(self, embedding, modality, content=None, description="", metadata=None, content_path=None):
46+
"""Indexes multimodal content"""
47+
doc = {
48+
"embedding": embedding.tolist(),
49+
"modality": modality,
50+
"description": description,
51+
"metadata": metadata or {},
52+
"content_path": content_path
53+
}
54+
55+
if content:
56+
doc["content"] = base64.b64encode(content).decode() if isinstance(content, bytes) else content
57+
58+
return self.es.index(index=self.index_name, document=doc)
59+
60+
def search_similar(self, query_embedding, modality=None, k=5):
61+
"""Searches for similar contents"""
62+
query = {
63+
"knn": {
64+
"field": "embedding",
65+
"query_vector": query_embedding.tolist(),
66+
"k": k,
67+
"num_candidates": 100,
68+
"filter": [{"term": {"modality": modality}}] if modality else []
69+
}
70+
}
71+
72+
try:
73+
response = self.es.search(
74+
index=self.index_name,
75+
query=query,
76+
size=k
77+
)
78+
79+
# Return both source data and score for each hit
80+
return [{
81+
**hit["_source"],
82+
"score": hit["_score"]
83+
} for hit in response["hits"]["hits"]]
84+
85+
except Exception as e:
86+
print(f"Error: processing search_evidence: {str(e)}")
87+
return "Error generating search evidence"

src/embedding_generator.py (120 additions & 0 deletions)

1+
import os
2+
import cv2
3+
from io import BytesIO
4+
import logging
5+
from torch.hub import download_url_to_file
6+
7+
import torch
8+
import numpy as np
9+
from PIL import Image
10+
from imagebind import data
11+
from imagebind.models import imagebind_model
12+
13+
from torchvision import transforms
14+
15+
16+
logging.basicConfig(level=logging.INFO)
17+
logger = logging.getLogger(__name__)
18+
19+
class EmbeddingGenerator:
20+
"""Generates multimodal embeddings using ImageBind"""
21+
22+
def __init__(self, device="cpu"):
23+
self.device = device
24+
self.model = self._load_model()
25+
26+
def _load_model(self):
27+
"""Initialize and test the ImageBind model."""
28+
checkpoint_path = "~/.cache/torch/checkpoints/imagebind_huge.pth"
29+
os.makedirs(os.path.expanduser("~/.cache/torch/checkpoints"), exist_ok=True)
30+
31+
if not os.path.exists(os.path.expanduser(checkpoint_path)):
32+
print("Downloading ImageBind weights...")
33+
download_url_to_file(
34+
"https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth",
35+
os.path.expanduser(checkpoint_path)
36+
)
37+
38+
try:
39+
checkpoint_path = os.path.expanduser("~/.cache/torch/checkpoints/imagebind_huge.pth")
40+
41+
# Check if file exists
42+
if not os.path.exists(checkpoint_path):
43+
raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}")
44+
45+
model = imagebind_model.imagebind_huge(pretrained=False)
46+
model.load_state_dict(torch.load(checkpoint_path))
47+
model.eval().to(self.device)
48+
49+
# Quick test with empty text input
50+
logger.info("Testing model with sample input...")
51+
test_input = data.load_and_transform_text([""], self.device)
52+
with torch.no_grad():
53+
_ = model({"text": test_input})
54+
55+
logger.info("🤖 ImageBind model initialized successfully")
56+
return model
57+
except Exception as e:
58+
logger.error(f"🚨 Model initialization failed: {str(e)}")
59+
raise
60+
61+
def generate_embedding(self, input_data, modality):
62+
"""Generates embedding for different modalities"""
63+
processors = {
64+
"vision": lambda x: data.load_and_transform_vision_data(x, self.device),
65+
"audio": lambda x: data.load_and_transform_audio_data(x, self.device),
66+
"text": lambda x: data.load_and_transform_text(x, self.device),
67+
"depth": self.process_depth
68+
}
69+
70+
try:
71+
# Input type verification
72+
if not isinstance(input_data, list):
73+
raise ValueError(f"Input data must be a list. Received: {type(input_data)}")
74+
75+
# Convert input data to a tensor format that the model can process
76+
# For images: [batch_size, channels, height, width]
77+
# For audio: [batch_size, channels, time]
78+
# For text: [batch_size, sequence_length]
79+
inputs = {modality: processors[modality](input_data)}
80+
with torch.no_grad():
81+
embedding = self.model(inputs)[modality]
82+
return embedding.squeeze(0).cpu().numpy()
83+
except Exception as e:
84+
logger.error(f"Error generating {modality} embedding: {str(e)}", exc_info=True)
85+
raise
86+
87+
88+
def process_vision(self, image_path):
89+
"""Processes image"""
90+
return data.load_and_transform_vision_data([image_path], self.device)
91+
92+
def process_audio(self, audio_path):
93+
"""Processes audio"""
94+
return data.load_and_transform_audio_data([audio_path], self.device)
95+
96+
def process_text(self, text):
97+
"""Processes text"""
98+
return data.load_and_transform_text([text], self.device)
99+
100+
def process_depth(self, depth_paths, device="cpu"):
101+
"""Custom processing for depth maps"""
102+
try:
103+
# Check file existence
104+
for path in depth_paths:
105+
if not os.path.exists(path):
106+
raise FileNotFoundError(f"Depth map file not found: {path}")
107+
108+
# Load and transform
109+
depth_images = [Image.open(path).convert("L") for path in depth_paths]
110+
111+
transform = transforms.Compose([
112+
transforms.Resize((224, 224)),
113+
transforms.ToTensor(),
114+
])
115+
116+
return torch.stack([transform(img) for img in depth_images]).to(device)
117+
118+
except Exception as e:
119+
logger.error(f"🚨 - Error processing depth map: {str(e)}")
120+
raise
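
A short usage sketch (the image path and riddle text are hypothetical):

```python
from src.embedding_generator import EmbeddingGenerator

generator = EmbeddingGenerator(device="cpu")

# Each call takes a list of inputs and returns a 1024-dimensional NumPy vector
image_vec = generator.generate_embedding(["data/images/crime_scene_01.jpg"], "vision")
text_vec = generator.generate_embedding(["Riddle me this: what has keys but no locks?"], "text")

print(image_vec.shape)  # (1024,)
```

Because ImageBind projects every modality into the same embedding space, the image and text vectors can be compared directly with cosine similarity, which is what makes cross-modal evidence search possible.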
