Skip to content

Commit e6bc76b

Browse files
committed
Moving to supporting blog content / formatting
1 parent 65ece4e commit e6bc76b

File tree

4 files changed

+27
-35
lines changed

4 files changed

+27
-35
lines changed

notebooks/README.md

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,3 @@ Notebooks are organized into the following folders:
2323
- [Alibaba Cloud](./integrations/alibabacloud-ai-search)
2424

2525
- [`enterprise-search`](./enterprise-search/): Notebooks that demonstrate use cases for working with and exporting from Elastic Enterprise Search, App Search, or Workplace Search.
26-
27-
- [`colpali`](./colpali/): Notebooks that showcase the usage of ColPali.
28-

notebooks/colpali/01_colpali.ipynb renamed to supporting-blog-content/colpali/01_colpali.ipynb

Lines changed: 27 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@
2525
"source": [
2626
"!pip install -r requirements.txt\n",
2727
"from IPython.display import clear_output\n",
28-
"clear_output() # for less space usage. "
28+
"\n",
29+
"clear_output() # for less space usage."
2930
]
3031
},
3132
{
@@ -64,15 +65,15 @@
6465
],
6566
"source": [
6667
"from datasets import load_dataset\n",
67-
"from tqdm.notebook import tqdm \n",
68+
"from tqdm.notebook import tqdm\n",
6869
"import os\n",
6970
"\n",
7071
"DATASET_NAME = \"vidore/infovqa_test_subsampled\"\n",
7172
"DOCUMENT_DIR = \"searchlabs-colpali\"\n",
7273
"\n",
7374
"os.makedirs(DOCUMENT_DIR, exist_ok=True)\n",
7475
"dataset = load_dataset(DATASET_NAME, split=\"test\")\n",
75-
" \n",
76+
"\n",
7677
"for i, row in enumerate(tqdm(dataset, desc=\"Saving images to disk\")):\n",
7778
" image = row.get(\"image\")\n",
7879
" image_name = f\"image_{i}.jpg\"\n",
@@ -123,17 +124,21 @@
123124
"model = ColPali.from_pretrained(\n",
124125
" \"vidore/colpali-v1.3\",\n",
125126
" torch_dtype=torch.float32,\n",
126-
" device_map=\"mps\", # \"mps\" for Apple Silicon, \"cuda\" if available, \"cpu\" otherwise\n",
127+
" device_map=\"mps\", # \"mps\" for Apple Silicon, \"cuda\" if available, \"cpu\" otherwise\n",
127128
").eval()\n",
128129
"\n",
129130
"col_pali_processor = ColPaliProcessor.from_pretrained(model_name)\n",
130131
"\n",
132+
"\n",
131133
"def create_col_pali_image_vectors(image_path: str) -> list:\n",
132-
" batch_images = col_pali_processor.process_images([Image.open(image_path)]).to(model.device)\n",
133-
" \n",
134+
" batch_images = col_pali_processor.process_images([Image.open(image_path)]).to(\n",
135+
" model.device\n",
136+
" )\n",
137+
"\n",
134138
" with torch.no_grad():\n",
135139
" return model(**batch_images).tolist()[0]\n",
136140
"\n",
141+
"\n",
137142
"def create_col_pali_query_vectors(query: str) -> list:\n",
138143
" queries = col_pali_processor.process_queries([query]).to(model.device)\n",
139144
" with torch.no_grad():\n",
@@ -194,9 +199,9 @@
194199
" vectors_f32 = create_col_pali_image_vectors(image_path)\n",
195200
" file_to_multi_vectors[file_name] = vectors_f32\n",
196201
"\n",
197-
"with open('col_pali_vectors.pkl', 'wb') as f:\n",
202+
"with open(\"col_pali_vectors.pkl\", \"wb\") as f:\n",
198203
" pickle.dump(file_to_multi_vectors, f)\n",
199-
" \n",
204+
"\n",
200205
"print(f\"Saved {len(file_to_multi_vectors)} vector entries to disk\")"
201206
]
202207
},
@@ -239,22 +244,15 @@
239244
"\n",
240245
"es = Elasticsearch(ELASTIC_HOST, api_key=ELASTIC_API_KEY)\n",
241246
"\n",
242-
"mappings = {\n",
243-
" \"mappings\": {\n",
244-
" \"properties\": {\n",
245-
" \"col_pali_vectors\": {\n",
246-
" \"type\": \"rank_vectors\"\n",
247-
" }\n",
248-
" }\n",
249-
" }\n",
250-
"}\n",
247+
"mappings = {\"mappings\": {\"properties\": {\"col_pali_vectors\": {\"type\": \"rank_vectors\"}}}}\n",
251248
"\n",
252249
"if not es.indices.exists(index=INDEX_NAME):\n",
253250
" print(f\"[INFO] Creating index: {INDEX_NAME}\")\n",
254251
" es.indices.create(index=INDEX_NAME, body=mappings)\n",
255252
"else:\n",
256253
" print(f\"[INFO] Index '{INDEX_NAME}' already exists.\")\n",
257254
"\n",
255+
"\n",
258256
"def index_document(es_client, index, doc_id, document, retries=10, initial_backoff=1):\n",
259257
" for attempt in range(1, retries + 1):\n",
260258
" try:\n",
@@ -304,18 +302,18 @@
304302
}
305303
],
306304
"source": [
307-
"with open('col_pali_vectors.pkl', 'rb') as f:\n",
305+
"with open(\"col_pali_vectors.pkl\", \"rb\") as f:\n",
308306
" file_to_multi_vectors = pickle.load(f)\n",
309307
"\n",
310308
"for file_name, vectors in tqdm(file_to_multi_vectors.items(), desc=\"Index documents\"):\n",
311309
" if es.exists(index=INDEX_NAME, id=file_name):\n",
312310
" continue\n",
313-
" \n",
311+
"\n",
314312
" index_document(\n",
315-
" es_client=es, \n",
316-
" index=INDEX_NAME, \n",
317-
" doc_id=file_name, \n",
318-
" document={\"col_pali_vectors\": vectors}\n",
313+
" es_client=es,\n",
314+
" index=INDEX_NAME,\n",
315+
" doc_id=file_name,\n",
316+
" document={\"col_pali_vectors\": vectors},\n",
319317
" )"
320318
]
321319
},
@@ -360,18 +358,14 @@
360358
" \"_source\": False,\n",
361359
" \"query\": {\n",
362360
" \"script_score\": {\n",
363-
" \"query\": {\n",
364-
" \"match_all\": {}\n",
365-
" },\n",
361+
" \"query\": {\"match_all\": {}},\n",
366362
" \"script\": {\n",
367363
" \"source\": \"maxSimDotProduct(params.query_vector, 'col_pali_vectors')\",\n",
368-
" \"params\": {\n",
369-
" \"query_vector\": create_col_pali_query_vectors(query)\n",
370-
" }\n",
371-
" }\n",
364+
" \"params\": {\"query_vector\": create_col_pali_query_vectors(query)},\n",
365+
" },\n",
372366
" }\n",
373367
" },\n",
374-
" \"size\": 5\n",
368+
" \"size\": 5,\n",
375369
"}\n",
376370
"\n",
377371
"results = es.search(index=INDEX_NAME, body=es_query)\n",
@@ -393,9 +387,10 @@
393387
"metadata": {},
394388
"outputs": [],
395389
"source": [
396-
"# We kill the kernel forcefully to free up the memory from the ColPali model. \n",
390+
"# We kill the kernel forcefully to free up the memory from the ColPali model.\n",
397391
"print(\"Shutting down the kernel to free memory...\")\n",
398392
"import os\n",
393+
"\n",
399394
"os._exit(0)"
400395
]
401396
}

0 commit comments

Comments
 (0)