@@ -35,10 +35,7 @@ def process_hf_docs_command(args):
3535 """
3636 Process documentation from HF doc-build dataset.
3737 Downloads pre-built docs and generates embeddings.
38- Processes one library per hour to avoid overloading the database.
3938 """
40- from time import sleep
41-
4239 import meilisearch
4340 from tqdm import tqdm
4441
@@ -66,53 +63,39 @@ def process_hf_docs_command(args):
6663 if not meilisearch_key:
6764 raise ValueError("MEILISEARCH_KEY is required. Set via --meilisearch_key or MEILISEARCH_KEY env var.")
6865
66+ print("\n" + "=" * 80)
67+ print("GENERATING EMBEDDINGS")
68+ print("=" * 80)
69+
70+ # Collect all chunks
71+ all_chunks = []
72+ for _library_name, chunks in results.items():
73+ all_chunks.extend(chunks)
74+
75+ print(f"\nTotal chunks to embed: {len(all_chunks)}")
76+
77+ # Generate embeddings
6978 from doc_builder.build_embeddings import MEILI_INDEX_TEMP
7079
80+ embeddings = call_embedding_inference(
81+ all_chunks,
82+ hf_ie_url,
83+ hf_ie_token,
84+ is_python_module=False,  # Pre-built docs are not Python modules
85+ )
86+
87+ # Push to Meilisearch
88+ print("\n" + "=" * 80)
89+ print("UPLOADING TO MEILISEARCH")
90+ print("=" * 80)
91+
7192 client = meilisearch.Client("https://edge.meilisearch.com", meilisearch_key)
7293 ITEMS_PER_CHUNK = 5000
7394
74- # Process one library at a time, waiting 1 hour between each
75- library_names = list(results.keys())
76- total_libraries = len(library_names)
77- total_embeddings = 0
78-
79- for idx, library_name in enumerate(library_names):
80- chunks = results[library_name]
81- if not chunks:
82- print(f"\n⏭️ Skipping {library_name} (no chunks)")
83- continue
84-
85- print("\n" + "=" * 80)
86- print(f"📚 PROCESSING LIBRARY {idx + 1}/{total_libraries}: {library_name}")
87- print(f" Chunks to process: {len(chunks)}")
88- print("=" * 80)
89-
90- # Generate embeddings for this library
91- print(f"🔢 Generating embeddings for {library_name}...")
92- embeddings = call_embedding_inference(
93- chunks,
94- hf_ie_url,
95- hf_ie_token,
96- is_python_module=False,
97- )
98-
99- # Push to Meilisearch
100- print(f"📤 Uploading {len(embeddings)} embeddings for {library_name} to Meilisearch...")
101- for chunk_embeddings in tqdm(chunk_list(embeddings, ITEMS_PER_CHUNK), desc=f"Uploading {library_name}"):
102- add_embeddings_to_db(client, MEILI_INDEX_TEMP, chunk_embeddings)
103-
104- total_embeddings += len(embeddings)
105- print(f"✅ Finished uploading {library_name} ({len(embeddings)} embeddings)")
106-
107- # Wait 1 hour before processing next library (except for the last one)
108- if idx < total_libraries - 1:
109- wait_hours = 1
110- wait_seconds = wait_hours * 60 * 60
111- print(f"\n⏳ Waiting {wait_hours} hour before processing next library...")
112- print(f" Next library: {library_names[idx + 1]}")
113- sleep(wait_seconds)
114-
115- print(f"\n✅ Successfully uploaded {total_embeddings} total embeddings to Meilisearch")
95+ for chunk_embeddings in tqdm(chunk_list(embeddings, ITEMS_PER_CHUNK), desc="Uploading to meilisearch"):
96+ add_embeddings_to_db(client, MEILI_INDEX_TEMP, chunk_embeddings)
97+
98+ print(f"\nSuccessfully uploaded {len(embeddings)} embeddings to Meilisearch")
11699
117100 print("\n" + "=" * 80)
118101 print("✅ PROCESSING COMPLETE")