11
11
DOCS_SITE = 'https://clickhouse.com/docs'
12
12
HEADER_PATTERN = re .compile (r"^(.*?)(?:\s*\{#(.*?)\})$" )
13
13
object_ids = set ()
14
-
14
+ files_processed = set ()
15
15
link_data = []
16
16
17
17
@@ -135,12 +135,14 @@ def inject_snippets(directory, content):
135
135
136
136
for snippet_name , snippet_full_path , _ in matches :
137
137
full_path = os .path .join (directory , snippet_full_path )
138
- if os .path .exists (full_path ):
139
- with open (full_path , 'r' , encoding = 'utf-8' ) as snippet_file :
140
- snippet_map [snippet_name ] = remove_code_blocks (snippet_file .read ())
141
- else :
142
- print (f"FATAL: Unable to handle snippet: { full_path } " )
143
- sys .exit (1 )
138
+ if full_path not in files_processed : # we dont index snippets more than once
139
+ if os .path .exists (full_path ):
140
+ with open (full_path , 'r' , encoding = 'utf-8' ) as snippet_file :
141
+ snippet_map [snippet_name ] = remove_code_blocks (snippet_file .read ())
142
+ files_processed .add (full_path )
143
+ else :
144
+ print (f"FATAL: Unable to handle snippet: { full_path } " )
145
+ sys .exit (1 )
144
146
content = snippet_pattern .sub ("" , content )
145
147
for snippet_name , snippet_content in snippet_map .items ():
146
148
tag_pattern = re .compile (fr"<{ snippet_name } \s*/>" )
@@ -204,7 +206,6 @@ def parse_markdown_content(metadata, content):
204
206
heading_slug = slug
205
207
lines = content .splitlines ()
206
208
current_h1 = metadata .get ('title' , '' )
207
-
208
209
current_subdoc = {
209
210
'file_path' : metadata .get ('file_path' , '' ),
210
211
'slug' : heading_slug ,
@@ -300,11 +301,13 @@ def process_markdown_directory(directory, base_directory):
300
301
for file in files :
301
302
if file .endswith ('.md' ) or file .endswith ('.mdx' ):
302
303
md_file_path = os .path .join (root , file )
303
- metadata , content = parse_metadata_and_content (directory , base_directory , md_file_path )
304
- for sub_doc in parse_markdown_content (metadata , content ):
305
- update_page_links (directory , base_directory , metadata .get ('file_path' , '' ), sub_doc ['url' ],
306
- sub_doc ['content' ])
307
- yield sub_doc
304
+ if md_file_path not in files_processed :
305
+ files_processed .add (md_file_path )
306
+ metadata , content = parse_metadata_and_content (directory , base_directory , md_file_path )
307
+ for sub_doc in parse_markdown_content (metadata , content ):
308
+ update_page_links (directory , base_directory , metadata .get ('file_path' , '' ), sub_doc ['url' ],
309
+ sub_doc ['content' ])
310
+ yield sub_doc
308
311
309
312
310
313
def send_to_algolia (client , index_name , records ):
@@ -337,16 +340,17 @@ def compute_page_rank(link_data, damping_factor=0.85, max_iter=100, tol=1e-6):
337
340
return page_rank
338
341
339
342
340
- def main (base_directory , sub_directory , algolia_app_id , algolia_api_key , algolia_index_name ,
343
+ def main (base_directory , sub_directories , algolia_app_id , algolia_api_key , algolia_index_name ,
341
344
batch_size = 1000 , dry_run = False ):
342
345
client = SearchClientSync (algolia_app_id , algolia_api_key )
343
- directory = os .path .join (base_directory , sub_directory )
344
- t = 0
345
346
docs = []
346
- for doc in process_markdown_directory (directory , base_directory ):
347
- docs .append (doc )
347
+ for sub_directory in sub_directories :
348
+ directory = os .path .join (base_directory , sub_directory )
349
+ for doc in process_markdown_directory (directory , base_directory ):
350
+ docs .append (doc )
348
351
page_rank_scores = compute_page_rank (link_data )
349
352
# Add PageRank scores to the documents
353
+ t = 0
350
354
for doc in docs :
351
355
rank = page_rank_scores .get (doc .get ('url' , '' ), 0 )
352
356
doc ['page_rank' ] = int (rank * 10000000 )
@@ -371,9 +375,9 @@ def main(base_directory, sub_directory, algolia_app_id, algolia_api_key, algolia
371
375
)
372
376
parser .add_argument (
373
377
'-s' ,
374
- '--sub_directory ' ,
378
+ '--sub_directories ' ,
375
379
help = 'Sub directory to process' ,
376
- default = 'docs/en'
380
+ default = 'docs/en,knowledgebase '
377
381
)
378
382
parser .add_argument (
379
383
'-x' ,
@@ -387,5 +391,6 @@ def main(base_directory, sub_directory, algolia_app_id, algolia_api_key, algolia
387
391
args = parser .parse_args ()
388
392
if args .dry_run :
389
393
print ('Dry running, not sending results to Algolia.' )
390
- main (args .base_directory , args .sub_directory , args .algolia_app_id , args .algolia_api_key , args .algolia_index_name ,
394
+ sub_directories = [s .strip () for s in args .sub_directories .split (',' )]
395
+ main (args .base_directory , sub_directories , args .algolia_app_id , args .algolia_api_key , args .algolia_index_name ,
391
396
dry_run = args .dry_run )
0 commit comments