chore: fix bugs, add readme example

Andrei997 · Andrei997 · commit 51f7d7f5fb94 · 2025-07-23T16:49:43.000+08:00
diff --git a/jina_embeddings/README.md b/jina_embeddings/README.md
@@ -0,0 +1,14 @@
+# Inference example
+```bash
+python infer.py   \
+    --llama-bin /home/andrei/workspace/llama.cpp/build/bin/llama-server   \
+    --model /home/andrei/workspace/gguf/jev4-bf16.gguf   \
+    --mmproj /home/andrei/workspace/gguf/mmproj-jev4-bf16.gguf   \
+    --gpus 7   \
+    --input /home/andrei/workspace/test_data.txt   \
+    --output /home/andrei/workspace/jev4_mmtd.json   \
+    --save-cosine-sim-path /home/andrei/workspace/jev4_mmtd.md   \
+    --query-prefix "Query: "   \
+    --document-prefix "Passage: "   \
+    --normalize-after-pooling
+```
diff --git a/jina_embeddings/infer.py b/jina_embeddings/infer.py
@@ -2,15 +2,66 @@
 import os
 import signal
 import subprocess
-import time
 
 import click # type: ignore
-import requests # type: ignore
+import numpy as np # type: ignore
 from sklearn.metrics.pairwise import cosine_similarity # type: ignore
 
 from model import LlamaCppServerEmbeddingModel
 
 
+def clip_text(text: str, max_len: int = 10) -> str:
+    """Clip text to at most max_len characters; a trailing '...' marks truncation when max_len >= 3."""
+    if len(text) <= max_len:
+        return text
+    return text[:max_len - 3] + "..." if max_len >= 3 else text[:max(max_len, 0)]  # no room for '...'
+
+
+def save_cosine_similarity_matrix(raw_lines: list[str], embeddings: np.ndarray, save_path: str) -> None:
+    """Write the pairwise cosine similarity of `embeddings` to `save_path` as a markdown table.
+
+    Row/column labels are derived from `raw_lines`: a `[QUERY] `, `[DOCUMENT] ` or
+    `[IMAGE] ` tag becomes a `Q:`, `D:` or `I:` prefix followed by the clipped
+    content (for images, the clipped file name); untagged lines are clipped as-is.
+    Literal `|` characters in labels are escaped so they cannot split a table cell.
+
+    Args:
+        raw_lines: Input lines; expected to hold one entry per row of `embeddings`.
+        embeddings: Embedding matrix handed to `cosine_similarity`.
+        save_path: Destination markdown file; opened in 'w' mode, so it is overwritten.
+    """
+    # Build one short label per input line.
+    display_names = []
+    for text in raw_lines:
+        if text.startswith('[QUERY] '):
+            label = f"Q:{clip_text(text[8:])}"
+        elif text.startswith('[DOCUMENT] '):
+            label = f"D:{clip_text(text[11:])}"
+        elif text.startswith('[IMAGE] '):
+            label = f"I:{clip_text(os.path.basename(text[8:]))}"
+        else:
+            label = clip_text(text)
+        # A raw '|' would be parsed as a column delimiter and misalign the table.
+        display_names.append(label.replace("|", "\\|"))
+    n_items = len(display_names)
+
+    # Pairwise similarity between every pair of embedding rows.
+    similarity_matrix = cosine_similarity(embeddings)
+
+    with open(save_path, 'w', encoding='utf-8') as f:
+        f.write("# Cosine Similarity Matrix\n\n")
+        # Header row, then the separator row markdown requires beneath it.
+        header_cells = "".join(f" {name} |" for name in display_names)
+        f.write(f"| Item |{header_cells}\n")
+        f.write("|" + "---|" * (n_items + 1) + "\n")
+        # One data row per item, scores formatted to three decimals.
+        for i, row_name in enumerate(display_names):
+            cells = "".join(f" {score:.3f} |" for score in similarity_matrix[i, :n_items])
+            f.write(f"| {row_name} |{cells}\n")
+
+    print(f"Saved cosine similarity matrix to {save_path}")
+
+
 @click.command()
 @click.option('--llama-bin', default='./llama-server', help='Path to llama-server binary')
 @click.option('--model', required=True, help='Path to model .gguf file')
@@ -49,37 +100,6 @@ def main(
     proc = subprocess.Popen(cmd, env=env)
 
     try:
-        print("Waiting for server to start...")
-        
-        # Health check - wait until server is ready
-        max_wait_time = 300  # 5 minutes
-        check_interval = 2   # 2 seconds
-        start_time = time.time()
-        
-        while True:
-            try:
-                # Test the actual embedding endpoint with a simple request
-                test_payload = {"content": "test"}
-                health_response = requests.post(f"http://{host}:{port}/embedding", json=test_payload, timeout=10)
-                if health_response.status_code == 200:
-                    print("✅ Server is ready!")
-                    break
-                elif health_response.status_code == 503:
-                    elapsed = time.time() - start_time
-                    print(f"⏳ Server still loading model... ({elapsed:.1f}s elapsed)")
-                else:
-                    elapsed = time.time() - start_time
-                    print(f"⚠️ Unexpected server response: {health_response.status_code} ({elapsed:.1f}s elapsed)")
-            except requests.exceptions.RequestException as e:
-                elapsed = time.time() - start_time
-                print(f"⏳ Waiting for server to start... ({elapsed:.1f}s elapsed)")
-            
-            # Check if we've exceeded max wait time
-            if time.time() - start_time > max_wait_time:
-                raise TimeoutError(f"Server did not become ready within {max_wait_time} seconds")
-            
-            time.sleep(check_interval)
-
         with open(input_path, 'r', encoding='utf-8') as f:
             raw_lines = [line.strip() for line in f if line.strip()]
         
@@ -93,6 +113,7 @@ def main(
             image_prefix=image_prefix
         )
 
+        model.wait_for_server()
         original_texts, embeddings = model.encode_from_lines(raw_lines)
 
         output_data = [
@@ -107,53 +128,7 @@ def main(
 
         # Save cosine similarity matrix if requested
         if save_cosine_sim_path:
-            def clip_text(text, max_len=10):
-                """Clip text to max_len characters, showing first part + '...' if needed"""
-                if len(text) <= max_len:
-                    return text
-                return text[:max_len-3] + "..."
-            
-            # Extract display names from original texts  
-            display_names = []
-            for i, text in enumerate(raw_lines):
-                if text.startswith('[QUERY] '):
-                    content = text[8:]
-                    display_names.append(f"Q:{clip_text(content)}")
-                elif text.startswith('[DOCUMENT] '):
-                    content = text[11:]
-                    display_names.append(f"D:{clip_text(content)}")
-                elif text.startswith('[IMAGE] '):
-                    image_path = text[8:]
-                    filename = os.path.basename(image_path)
-                    display_names.append(f"I:{clip_text(filename)}")
-                else:
-                    display_names.append(clip_text(text))
-            
-            # Compute cosine similarity matrix
-            similarity_matrix = cosine_similarity(embeddings)
-            
-            # Create markdown table
-            with open(save_cosine_sim_path, 'w', encoding='utf-8') as f:
-                f.write("# Cosine Similarity Matrix\n\n")
-                
-                # Write header row
-                f.write("| Item |")
-                for name in display_names:
-                    f.write(f" {name} |")
-                f.write("\n")
-                
-                # Write separator row
-                f.write("|" + "---|" * (len(display_names) + 1) + "\n")
-                
-                # Write data rows
-                for i, row_name in enumerate(display_names):
-                    f.write(f"| {row_name} |")
-                    for j in range(len(display_names)):
-                        sim_score = similarity_matrix[i, j]
-                        f.write(f" {sim_score:.3f} |")
-                    f.write("\n")
-            
-            print(f"Saved cosine similarity matrix to {save_cosine_sim_path}")
+            save_cosine_similarity_matrix(raw_lines, embeddings, save_cosine_sim_path)
 
     finally:
         print("Shutting down server...")
diff --git a/jina_embeddings/model.py b/jina_embeddings/model.py
@@ -1,5 +1,6 @@
 import base64
 import os
+import time
 from typing import List, Optional, Tuple
 
 import numpy as np # type: ignore
@@ -27,6 +28,25 @@ def __init__(
         self.document_prefix = document_prefix
         self.image_prefix = image_prefix
 
+    def wait_for_server(self, max_wait_time: int = 300, check_interval: int = 2) -> None:
+        """Poll POST {server_url}/embedding until it answers 200; raise TimeoutError past max_wait_time seconds."""
+        print("Waiting for server to start...")
+        start_time = time.monotonic()  # monotonic: unaffected by wall-clock adjustments
+        while True:
+            elapsed = time.monotonic() - start_time
+            if elapsed > max_wait_time:
+                raise TimeoutError(f"Server did not become ready within {max_wait_time} seconds")
+            try:
+                # Explicit check instead of `assert`, which `python -O` strips (any status would count as ready).
+                ready = requests.post(f"{self.server_url}/embedding", json={"content": "test"}, timeout=10).status_code == 200
+            except requests.exceptions.RequestException:
+                ready = False  # connection refused or timed out: server is not up yet
+            if ready:
+                print("✅ Server is ready!")
+                return
+            print(f"⏳ Waiting for server to start... ({elapsed:.1f}s elapsed)")
+            time.sleep(check_interval)
+
     def _parse_line(self, line: str) -> Tuple[str, EmbeddingRequestItem]:
         """Parse input line and return (original_content, EmbeddingRequestItem)"""
         if line.startswith('[QUERY] '):
@@ -77,23 +97,25 @@ def encode(self, items: List[EmbeddingRequestItem]) -> np.ndarray:
         embeddings = []
 
         for i, item in enumerate(items):
-            payload = {"content": item["content"], "image": item["image"]}
+            payload = {"content": item["content"]}
+            if item["image"]:
+                payload["image"] = item["image"]
+                
             is_image_request = item["image"] is not None
             response = requests.post(f"{self.server_url}/embedding", json=payload)
             assert response.status_code == 200, f"Server error: {response.text}"
             embedding_data = response.json()
+            raw_embedding = embedding_data["embedding"]
 
+            # TODO: make this debug logging optional via an argument
             print(f"\n==========================")
             print(f"🧠 Item {i + 1} embedding response")
             print(f"📦 Type: {type(embedding_data).__name__}")
             print(f"🔑 Keys: {list(embedding_data.keys())}")
             print(f"🔎 Preview: {repr(embedding_data)[:500]}")
-            print(f"==========================")
-
-            raw_embedding = embedding_data["embedding"]
-            
             print(f"🔍 Raw embedding type: {type(raw_embedding)}")
             print(f"🔍 Raw embedding shape: {np.array(raw_embedding).shape}")
+            print(f"==========================")
             
             # Check if embeddings are already normalized
             embedding_array = np.array(raw_embedding)
@@ -104,27 +126,18 @@ def encode(self, items: List[EmbeddingRequestItem]) -> np.ndarray:
             # Handle image token extraction
             if is_image_request:
                 start_idx = embedding_data["start_image_token_idx"]
-                end_idx = embedding_data["end_image_token_idx"]
-                
-                print(f"🖼️ Image token indices: start={start_idx}, end={end_idx}")
-                
-                # Token-level embeddings - extract only image tokens
+                end_idx = embedding_data["end_image_token_idx"]    
                 hidden_states = np.array(raw_embedding)
                 image_embeddings = hidden_states[start_idx:end_idx+1]  # +1 for inclusive end
-                
+                pooled = image_embeddings.mean(axis=0)
+                print(f"🖼️ Image token indices: start={start_idx}, end={end_idx}")
                 print(f"🖼️ Extracted image embeddings shape: {image_embeddings.shape}")
                 print(f"🖼️ Original total embeddings: {len(raw_embedding)}")
                 print(f"🖼️ Image embeddings extracted: {len(image_embeddings)}")
-                
-                # Pool only the image embeddings (always mean pool)
-                pooled = image_embeddings.mean(axis=0)
-                print(f"🖼️ Using mean pooling of image tokens")
-                    
             else:
                 # Regular text processing - always mean pool the tokens
                 hidden_states = np.array(raw_embedding)
                 pooled = hidden_states.mean(axis=0)
-                print(f"📊 Applied mean pooling")
 
             # Optional normalization
             if self.normalize_after_pooling: