feat: add inference scripts

Andrei997 · Andrei997 · commit 0dd0291bbb78 · 2025-07-23T15:58:06.000+08:00
diff --git a/jina_embeddings/README.md b/jina_embeddings/README.md
diff --git a/jina_embeddings/infer.py b/jina_embeddings/infer.py
@@ -0,0 +1,169 @@
+import json
+import os
+import signal
+import subprocess
+import time
+
+import click # type: ignore
+import requests # type: ignore
+from sklearn.metrics.pairwise import cosine_similarity # type: ignore
+
+from model import LlamaCppServerEmbeddingModel
+
+
+def _client_host(host):
+    """Return the address a local client should connect to for bind address ``host``.
+
+    Wildcard bind addresses ('0.0.0.0', '::') tell the server to listen on
+    every interface; they are not portable as a connect target, so they are
+    mapped to the matching loopback address. Any other value is returned
+    unchanged.
+    """
+    return {'0.0.0.0': '127.0.0.1', '::': '[::1]'}.get(host, host)
+
+
+def _wait_for_server(proc, base_url, max_wait_time=300, check_interval=2):
+    """Block until the embedding endpoint at ``base_url`` answers with HTTP 200.
+
+    Args:
+        proc: The ``subprocess.Popen`` handle of the llama-server process.
+        base_url: Server root URL, e.g. ``http://127.0.0.1:8080``.
+        max_wait_time: Give up after this many seconds.
+        check_interval: Seconds to sleep between probes.
+
+    Raises:
+        RuntimeError: The server process exited before becoming ready.
+        TimeoutError: The server was not ready within ``max_wait_time``.
+    """
+    start_time = time.time()
+    while True:
+        # A server that already died will never become ready; fail fast
+        # instead of polling a dead port until the timeout expires.
+        exit_code = proc.poll()
+        if exit_code is not None:
+            raise RuntimeError(
+                f"llama-server exited with code {exit_code} before becoming ready"
+            )
+
+        try:
+            # Probe the real embedding endpoint with a trivial request.
+            health_response = requests.post(
+                f"{base_url}/embedding", json={"content": "test"}, timeout=10
+            )
+        except requests.exceptions.RequestException:
+            elapsed = time.time() - start_time
+            print(f"⏳ Waiting for server to start... ({elapsed:.1f}s elapsed)")
+        else:
+            if health_response.status_code == 200:
+                print("✅ Server is ready!")
+                return
+            elapsed = time.time() - start_time
+            if health_response.status_code == 503:
+                print(f"⏳ Server still loading model... ({elapsed:.1f}s elapsed)")
+            else:
+                print(f"⚠️ Unexpected server response: {health_response.status_code} ({elapsed:.1f}s elapsed)")
+
+        if time.time() - start_time > max_wait_time:
+            raise TimeoutError(f"Server did not become ready within {max_wait_time} seconds")
+
+        time.sleep(check_interval)
+
+
+def _clip_text(text, max_len=10):
+    """Clip ``text`` to ``max_len`` characters, ending in '...' when shortened."""
+    if len(text) <= max_len:
+        return text
+    return text[:max_len - 3] + "..."
+
+
+def _display_name(line):
+    """Build the short table label for one raw input line.
+
+    Queries become ``Q:<text>``, documents ``D:<text>`` and images
+    ``I:<file name>``; lines without a known tag are just clipped. Pipe
+    characters are escaped so a label cannot break the markdown table.
+    """
+    if line.startswith('[QUERY] '):
+        name = f"Q:{_clip_text(line[len('[QUERY] '):])}"
+    elif line.startswith('[DOCUMENT] '):
+        name = f"D:{_clip_text(line[len('[DOCUMENT] '):])}"
+    elif line.startswith('[IMAGE] '):
+        filename = os.path.basename(line[len('[IMAGE] '):])
+        name = f"I:{_clip_text(filename)}"
+    else:
+        name = _clip_text(line)
+    return name.replace('|', '\\|')
+
+
+def _write_similarity_markdown(path, display_names, similarity_matrix):
+    """Write ``similarity_matrix`` to ``path`` as a markdown table.
+
+    ``display_names`` labels both the header columns and the rows; scores are
+    written with three decimals.
+    """
+    lines = ["# Cosine Similarity Matrix\n\n"]
+    lines.append("| Item |" + "".join(f" {name} |" for name in display_names) + "\n")
+    lines.append("|" + "---|" * (len(display_names) + 1) + "\n")
+    for row_name, row in zip(display_names, similarity_matrix):
+        cells = "".join(f" {score:.3f} |" for score in row)
+        lines.append(f"| {row_name} |{cells}\n")
+
+    with open(path, 'w', encoding='utf-8') as f:
+        f.writelines(lines)
+
+
+@click.command()
+@click.option('--llama-bin', default='./llama-server', help='Path to llama-server binary')
+@click.option('--model', required=True, help='Path to model .gguf file')
+@click.option('--mmproj', required=True, help='Path to mmproj .gguf file')
+@click.option('--port', default=8080, help='Port for llama-server')
+@click.option('--host', default='0.0.0.0', help='Host for llama-server')
+@click.option('--ngl', default=999, help='Number of GPU layers')
+@click.option('--gpus', default='0', help='CUDA_VISIBLE_DEVICES comma separated GPU ids (e.g. "0,1")')
+@click.option('--input', 'input_path', required=True, help='Path to input txt file. Format: "[TYPE] content" where TYPE is QUERY, DOCUMENT, or IMAGE. For IMAGE, content should be the file path.')
+@click.option('--output', 'output_path', required=True, help='Path to output JSON file for embeddings')
+@click.option('--normalize-after-pooling', is_flag=True, default=False, help='Apply L2 normalization after pooling')
+@click.option('--save-cosine-sim-path', help='Path to save cosine similarity matrix as markdown table')
+@click.option('--query-prefix', default='Query: ', help='Prefix for [QUERY] lines')
+@click.option('--document-prefix', default='Passage: ', help='Prefix for [DOCUMENT] lines')
+@click.option('--image-prefix', default='Describe the image.<__image__>', help='Prefix for [IMAGE] lines')
+def main(
+    llama_bin, model, mmproj, port, host, ngl, gpus,
+    input_path, output_path,
+    normalize_after_pooling,
+    save_cosine_sim_path, query_prefix, document_prefix, image_prefix
+):
+    """Embed the lines of an input file with a temporary llama-server.
+
+    Starts llama-server, waits until its embedding endpoint responds, writes
+    one embedding per input line to the output JSON file, optionally writes a
+    cosine similarity table, and always shuts the server down afterwards.
+    """
+    env = os.environ.copy()
+    env['CUDA_VISIBLE_DEVICES'] = gpus
+
+    cmd = [
+        llama_bin,
+        '-m', model,
+        '--mmproj', mmproj,
+        '--embedding',
+        '--port', str(port),
+        '-ngl', str(ngl),
+        '--host', host,
+        '--pooling', 'none'
+    ]
+    print(f"Starting llama-server with: {' '.join(cmd)}")
+    proc = subprocess.Popen(cmd, env=env)
+
+    # The server binds to `host`, but this process must connect to a concrete
+    # address, so a wildcard bind address is replaced by loopback.
+    server_url = f"http://{_client_host(host)}:{port}"
+
+    try:
+        print("Waiting for server to start...")
+        _wait_for_server(proc, server_url)
+
+        with open(input_path, 'r', encoding='utf-8') as f:
+            raw_lines = [line.strip() for line in f if line.strip()]
+
+        print(f"Loaded {len(raw_lines)} sentences from {input_path}")
+
+        # Named `embedder` so the `model` CLI argument (the .gguf path) is not shadowed.
+        embedder = LlamaCppServerEmbeddingModel(
+            server_url=server_url,
+            normalize_after_pooling=normalize_after_pooling,
+            query_prefix=query_prefix,
+            document_prefix=document_prefix,
+            image_prefix=image_prefix
+        )
+
+        original_texts, embeddings = embedder.encode_from_lines(raw_lines)
+
+        output_data = [
+            {"text": text, "embedding": embedding.tolist()}
+            for text, embedding in zip(original_texts, embeddings)
+        ]
+
+        with open(output_path, 'w', encoding='utf-8') as f_out:
+            json.dump(output_data, f_out, indent=2)
+
+        print(f"Saved embeddings to {output_path}")
+
+        # Save cosine similarity matrix if requested
+        if save_cosine_sim_path:
+            display_names = [_display_name(line) for line in raw_lines]
+            similarity_matrix = cosine_similarity(embeddings)
+            _write_similarity_markdown(save_cosine_sim_path, display_names, similarity_matrix)
+            print(f"Saved cosine similarity matrix to {save_cosine_sim_path}")
+
+    finally:
+        print("Shutting down server...")
+        proc.send_signal(signal.SIGINT)
+        try:
+            proc.wait(timeout=10)
+        except subprocess.TimeoutExpired:
+            print("Server did not shut down in time; killing process.")
+            proc.kill()
+            # Reap the killed child so it does not linger as a zombie.
+            proc.wait()
+
+
+# Run the click command only when this file is executed as a script.
+if __name__ == '__main__':
+    main() # type: ignore
diff --git a/jina_embeddings/model.py b/jina_embeddings/model.py
@@ -0,0 +1,154 @@
+import base64
+import os
+from typing import List, Optional, Tuple
+
+import numpy as np # type: ignore
+import requests # type: ignore
+from typing_extensions import TypedDict # type: ignore
+
+
+class EmbeddingRequestItem(TypedDict):
+    """One item to embed, as consumed by ``LlamaCppServerEmbeddingModel.encode``."""
+
+    # Text sent in the request payload's "content" field.
+    content: str
+    # Image as a data URL string, or None for a text-only item.
+    image: Optional[str]
+
+
+class LlamaCppServerEmbeddingModel:
+    """Client that embeds tagged text/image lines through a llama-server.
+
+    Each input line carries a ``[QUERY] ``, ``[DOCUMENT] `` or ``[IMAGE] ``
+    tag. The line is turned into a request for the server's ``/embedding``
+    endpoint, and the returned embeddings are mean-pooled into one vector
+    per line.
+    """
+
+    # File extension -> MIME type used in the image data URL; anything else
+    # falls back to JPEG.
+    _MIME_TYPES = {
+        '.jpg': 'image/jpeg',
+        '.jpeg': 'image/jpeg',
+        '.png': 'image/png',
+        '.webp': 'image/webp',
+    }
+
+    def __init__(
+        self,
+        server_url: str = "http://localhost:8080",
+        normalize_after_pooling: bool = False,
+        query_prefix: str = "Query: ",
+        document_prefix: str = "Passage: ",
+        image_prefix: str = "Describe the image.<__image__>",
+        request_timeout: Optional[float] = 300.0
+    ) -> None:
+        """Store the connection and prompt settings.
+
+        Args:
+            server_url: Root URL of the llama-server (no trailing slash).
+            normalize_after_pooling: L2-normalize each pooled vector.
+            query_prefix: Text prepended to ``[QUERY]`` content.
+            document_prefix: Text prepended to ``[DOCUMENT]`` content.
+            image_prefix: Prompt sent as the content of ``[IMAGE]`` requests.
+            request_timeout: Seconds to wait for each embedding request;
+                ``None`` waits indefinitely.
+        """
+        self.server_url = server_url
+        self.normalize_after_pooling = normalize_after_pooling
+        self.query_prefix = query_prefix
+        self.document_prefix = document_prefix
+        self.image_prefix = image_prefix
+        self.request_timeout = request_timeout
+
+    def _parse_line(self, line: str) -> Tuple[str, EmbeddingRequestItem]:
+        """Parse one tagged line into ``(original_content, request_item)``.
+
+        For ``[IMAGE]`` lines the original content is the image path.
+
+        Raises:
+            FileNotFoundError: The image file of an ``[IMAGE]`` line is missing.
+            ValueError: The line does not start with a known tag.
+        """
+        item: EmbeddingRequestItem
+        if line.startswith('[QUERY] '):
+            content = line[len('[QUERY] '):]
+            item = {"content": self.query_prefix + content, "image": None}
+            return content, item
+        if line.startswith('[DOCUMENT] '):
+            content = line[len('[DOCUMENT] '):]
+            item = {"content": self.document_prefix + content, "image": None}
+            return content, item
+        if line.startswith('[IMAGE] '):
+            image_path = line[len('[IMAGE] '):]
+            data_url, success = self._process_image(image_path)
+            # Raise explicitly: an assert would be stripped under ``python -O``
+            # and the image would silently be embedded as a text-only prompt.
+            if not success:
+                raise FileNotFoundError(f"Failed to process image: {image_path}")
+            item = {"content": self.image_prefix, "image": data_url}
+            return image_path, item
+        raise ValueError(f"Invalid line format: {line}. Expected '[QUERY] ', '[DOCUMENT] ', or '[IMAGE] ' prefix.")
+
+    def _process_image(self, image_path: str) -> Tuple[Optional[str], bool]:
+        """Read an image file and return ``(data_url, success)``.
+
+        The MIME type is chosen from the file extension (JPEG when unknown).
+        A missing file is reported as ``(None, False)``; other I/O errors
+        propagate to the caller.
+        """
+        try:
+            with open(image_path, 'rb') as img_file:
+                image_data = base64.b64encode(img_file.read()).decode('utf-8')
+        except FileNotFoundError:
+            print(f"❌ Image not found: {image_path}")
+            return None, False
+
+        ext = os.path.splitext(image_path)[1].lower()
+        mime_type = self._MIME_TYPES.get(ext, 'image/jpeg')
+        return f"data:{mime_type};base64,{image_data}", True
+
+    def encode(self, items: List[EmbeddingRequestItem]) -> np.ndarray:
+        """Embed each item and return the pooled vectors stacked in one array.
+
+        Every item is sent to ``{server_url}/embedding`` on its own. Text
+        items are mean-pooled over all returned rows; image items are
+        mean-pooled over the rows between the ``start_image_token_idx`` and
+        ``end_image_token_idx`` reported by the server (inclusive).
+
+        Raises:
+            RuntimeError: The server answered with a non-200 status.
+            ValueError: An image response selects no rows to pool.
+        """
+        embeddings = []
+
+        for i, item in enumerate(items):
+            payload = {"content": item["content"], "image": item["image"]}
+            is_image_request = item["image"] is not None
+            response = requests.post(
+                f"{self.server_url}/embedding",
+                json=payload,
+                timeout=self.request_timeout,
+            )
+            if response.status_code != 200:
+                raise RuntimeError(f"Server error: {response.text}")
+            embedding_data = response.json()
+
+            print("\n==========================")
+            print(f"🧠 Item {i + 1} embedding response")
+            print(f"📦 Type: {type(embedding_data).__name__}")
+            print(f"🔑 Keys: {list(embedding_data.keys())}")
+            print(f"🔎 Preview: {repr(embedding_data)[:500]}")
+            print("==========================")
+
+            raw_embedding = embedding_data["embedding"]
+            raw_array = np.array(raw_embedding)
+
+            print(f"🔍 Raw embedding type: {type(raw_embedding)}")
+            print(f"🔍 Raw embedding shape: {raw_array.shape}")
+
+            # Treat a single flat vector as a one-row matrix so the row-wise
+            # norm check and the pooling below work for both layouts.
+            hidden_states = np.atleast_2d(raw_array)
+
+            # Check if embeddings are already normalized
+            norms = np.linalg.norm(hidden_states, axis=1)
+            if np.allclose(norms, 1.0, atol=1e-6):
+                print("⚠️ WARNING: Raw embeddings appear to be already normalized!")
+
+            if is_image_request:
+                start_idx = embedding_data["start_image_token_idx"]
+                end_idx = embedding_data["end_image_token_idx"]
+
+                print(f"🖼️ Image token indices: start={start_idx}, end={end_idx}")
+
+                # Keep only the image rows; +1 because the end index is inclusive.
+                image_embeddings = hidden_states[start_idx:end_idx + 1]
+                if image_embeddings.shape[0] == 0:
+                    # Averaging zero rows would silently yield a NaN vector.
+                    raise ValueError(
+                        f"No image token embeddings for indices start={start_idx}, "
+                        f"end={end_idx} (embedding shape {hidden_states.shape})"
+                    )
+
+                print(f"🖼️ Extracted image embeddings shape: {image_embeddings.shape}")
+                print(f"🖼️ Original total embeddings: {len(raw_embedding)}")
+                print(f"🖼️ Image embeddings extracted: {len(image_embeddings)}")
+
+                pooled = image_embeddings.mean(axis=0)
+                print("🖼️ Using mean pooling of image tokens")
+            else:
+                # Regular text processing - always mean pool the rows
+                pooled = hidden_states.mean(axis=0)
+                print("📊 Applied mean pooling")
+
+            # Optional normalization; a zero vector is left as is.
+            if self.normalize_after_pooling:
+                norm = np.linalg.norm(pooled)
+                if norm > 0:
+                    pooled = pooled / norm
+                    print("🔄 Applied L2 normalization")
+
+            embeddings.append(pooled)
+
+        return np.array(embeddings)
+
+    def encode_from_lines(self, raw_lines: List[str]) -> Tuple[List[str], np.ndarray]:
+        """Parse tagged lines and embed them.
+
+        Returns:
+            ``(original_texts, embeddings)`` where ``original_texts`` holds
+            each line's content without its tag (the path for images), in
+            input order.
+        """
+        original_texts = []
+        items = []
+
+        for line in raw_lines:
+            original, item = self._parse_line(line.strip())
+            original_texts.append(original)
+            items.append(item)
+
+        embeddings = self.encode(items)
+        return original_texts, embeddings