Skip to content

Commit 00edadd

Browse files
authored
Revert timeout workarounds from #725, keep chunk length increases (#732)
Reverts the processing-logic changes from commit 894c424, which removed the @wait_for_task_completion decorator from add_embeddings_to_db and changed processing to handle one library at a time with 1-hour delays between libraries. Keeps the chunk-length increases (2000 -> 4000 and 1000 -> 2000), which help reduce the total number of embeddings.
1 parent 11f9be6 commit 00edadd

File tree

2 files changed

+30
-47
lines changed

2 files changed

+30
-47
lines changed

src/doc_builder/commands/embeddings.py

Lines changed: 28 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,7 @@ def process_hf_docs_command(args):
3535
"""
3636
Process documentation from HF doc-build dataset.
3737
Downloads pre-built docs and generates embeddings.
38-
Processes one library per hour to avoid overloading the database.
3938
"""
40-
from time import sleep
41-
4239
import meilisearch
4340
from tqdm import tqdm
4441

@@ -66,53 +63,39 @@ def process_hf_docs_command(args):
6663
if not meilisearch_key:
6764
raise ValueError("MEILISEARCH_KEY is required. Set via --meilisearch_key or MEILISEARCH_KEY env var.")
6865

66+
print("\n" + "=" * 80)
67+
print("GENERATING EMBEDDINGS")
68+
print("=" * 80)
69+
70+
# Collect all chunks
71+
all_chunks = []
72+
for _library_name, chunks in results.items():
73+
all_chunks.extend(chunks)
74+
75+
print(f"\nTotal chunks to embed: {len(all_chunks)}")
76+
77+
# Generate embeddings
6978
from doc_builder.build_embeddings import MEILI_INDEX_TEMP
7079

80+
embeddings = call_embedding_inference(
81+
all_chunks,
82+
hf_ie_url,
83+
hf_ie_token,
84+
is_python_module=False, # Pre-built docs are not Python modules
85+
)
86+
87+
# Push to Meilisearch
88+
print("\n" + "=" * 80)
89+
print("UPLOADING TO MEILISEARCH")
90+
print("=" * 80)
91+
7192
client = meilisearch.Client("https://edge.meilisearch.com", meilisearch_key)
7293
ITEMS_PER_CHUNK = 5000
7394

74-
# Process one library at a time, waiting 1 hour between each
75-
library_names = list(results.keys())
76-
total_libraries = len(library_names)
77-
total_embeddings = 0
78-
79-
for idx, library_name in enumerate(library_names):
80-
chunks = results[library_name]
81-
if not chunks:
82-
print(f"\n⏭️ Skipping {library_name} (no chunks)")
83-
continue
84-
85-
print("\n" + "=" * 80)
86-
print(f"📚 PROCESSING LIBRARY {idx + 1}/{total_libraries}: {library_name}")
87-
print(f" Chunks to process: {len(chunks)}")
88-
print("=" * 80)
89-
90-
# Generate embeddings for this library
91-
print(f"🔢 Generating embeddings for {library_name}...")
92-
embeddings = call_embedding_inference(
93-
chunks,
94-
hf_ie_url,
95-
hf_ie_token,
96-
is_python_module=False,
97-
)
98-
99-
# Push to Meilisearch
100-
print(f"📤 Uploading {len(embeddings)} embeddings for {library_name} to Meilisearch...")
101-
for chunk_embeddings in tqdm(chunk_list(embeddings, ITEMS_PER_CHUNK), desc=f"Uploading {library_name}"):
102-
add_embeddings_to_db(client, MEILI_INDEX_TEMP, chunk_embeddings)
103-
104-
total_embeddings += len(embeddings)
105-
print(f"✅ Finished uploading {library_name} ({len(embeddings)} embeddings)")
106-
107-
# Wait 1 hour before processing next library (except for the last one)
108-
if idx < total_libraries - 1:
109-
wait_hours = 1
110-
wait_seconds = wait_hours * 60 * 60
111-
print(f"\n⏳ Waiting {wait_hours} hour before processing next library...")
112-
print(f" Next library: {library_names[idx + 1]}")
113-
sleep(wait_seconds)
114-
115-
print(f"\n✅ Successfully uploaded {total_embeddings} total embeddings to Meilisearch")
95+
for chunk_embeddings in tqdm(chunk_list(embeddings, ITEMS_PER_CHUNK), desc="Uploading to meilisearch"):
96+
add_embeddings_to_db(client, MEILI_INDEX_TEMP, chunk_embeddings)
97+
98+
print(f"\nSuccessfully uploaded {len(embeddings)} embeddings to Meilisearch")
11699

117100
print("\n" + "=" * 80)
118101
print("✅ PROCESSING COMPLETE")

src/doc_builder/meilisearch_helper.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,7 @@ def generate_doc_id(library: str, page: str, text: str) -> str:
206206
return f"{sanitized_library}-{sanitized_page}-{content_hash}"
207207

208208

209+
@wait_for_task_completion
209210
def add_embeddings_to_db(client: Client, index_name: str, embeddings):
210211
index = client.index(index_name)
211212
payload_data = [
@@ -225,8 +226,7 @@ def add_embeddings_to_db(client: Client, index_name: str, embeddings):
225226
for e in embeddings
226227
]
227228
task_info = index.add_documents(payload_data)
228-
print(f" Submitted indexing task {task_info.task_uid} for {len(embeddings)} embeddings")
229-
return task_info
229+
return client, task_info
230230

231231

232232
def swap_indexes(

0 commit comments

Comments (0)