@@ -26,13 +26,14 @@ def read_metadata(text):
26
26
return metadata
27
27
28
28
29
- def parse_metadata_and_content (directory , base_directory , md_file_path ,):
29
+ def parse_metadata_and_content (directory , base_directory , md_file_path , log_snippet_failure = True ):
30
30
"""Parse multiple metadata blocks and content from a Markdown file."""
31
31
try :
32
32
with open (md_file_path , 'r' , encoding = 'utf-8' ) as file :
33
33
content = file .read ()
34
- except Exception :
35
- print (f"Warning: couldn't read metadata from { md_file_path } " )
34
+ except :
35
+ if log_snippet_failure :
36
+ print (f"Warning: couldn't read metadata from { md_file_path } " )
36
37
return {}, ''
37
38
content = remove_code_blocks (content )
38
39
# Inject any snippets
@@ -54,9 +55,10 @@ def parse_metadata_and_content(directory, base_directory, md_file_path,):
54
55
if metadata ['file_path' ] == '/opt/clickhouse-docs/docs/en/guides/best-practices/sparse-primary-indexes.md' :
55
56
pass
56
57
slug = metadata .get ('slug' , '/' + os .path .split (directory )[- 1 ] + metadata ['file_path' ].replace (directory , '' ))
57
- for p in ['.md' , '.mdx' ,'"' ,"'" ]:
58
- slug = slug .removesuffix (p ).removesuffix (p )
58
+ for p in ['.md' , '.mdx' , '"' , "'" ]:
59
+ slug = slug .removeprefix (p ).removesuffix (p )
59
60
slug = slug .removesuffix ('/' )
61
+
60
62
metadata ['slug' ] = slug
61
63
return metadata , content
62
64
@@ -174,23 +176,26 @@ def extract_links_from_content(content):
174
176
175
177
176
178
# best effort at creating links between docs - handling both md and urls. Challenge here some files import others
177
- # and we don't recursivelt resolve
179
+ # e.g. /opt/clickhouse-docs/docs/en/sql-reference/formats.mdx - we don't recursively resolve here
178
180
def update_page_links(directory, base_directory, page_path, url, content):
    """Record outgoing doc-site links discovered in *content*.

    Every resolvable link is appended to the module-level ``link_data`` list as
    a ``(url, absolute_doc_url)`` pair.  Local ``.md`` targets are resolved to
    their page slug by parsing the target file (quietly — snippet-read failures
    are not logged here); ``/docs/`` targets are rewritten directly.  If any
    markdown target cannot be resolved, a single warning naming *page_path* is
    printed after the loop.
    """
    unresolved = False
    for link_target in extract_links_from_content(content):
        if link_target.endswith('.md') and not link_target.startswith('https'):
            # Map the markdown reference onto an absolute filesystem path:
            # absolute targets are rooted at base_directory, relative ones at
            # the linking page's own directory.
            if os.path.isabs(link_target):
                resolved_path = os.path.abspath(base_directory + '/' + link_target)
            else:
                resolved_path = os.path.abspath(
                    os.path.join(os.path.dirname(page_path), './' + link_target))
            # Parse the target page only to discover its slug; suppress the
            # per-file read warning since a miss is reported collectively below.
            target_meta, _ = parse_metadata_and_content(
                directory, base_directory, resolved_path, log_snippet_failure=False)
            if 'slug' in target_meta:
                link_data.append((url, f"{DOCS_SITE}{target_meta['slug']}"))
            else:
                unresolved = True
        elif link_target.startswith('/docs/'):  # ignore external links
            trimmed = link_target.removesuffix('/')
            link_data.append((url, f'{DOCS_SITE}{trimmed.replace("/docs", "")}'))
    if unresolved:
        print(f"Warning: couldn't resolve link for {page_path}")
194
199
195
200
196
201
def parse_markdown_content (metadata , content ):
@@ -297,7 +302,8 @@ def process_markdown_directory(directory, base_directory):
297
302
md_file_path = os .path .join (root , file )
298
303
metadata , content = parse_metadata_and_content (directory , base_directory , md_file_path )
299
304
for sub_doc in parse_markdown_content (metadata , content ):
300
- update_page_links (directory , base_directory , metadata .get ('file_path' , '' ), sub_doc ['url' ], sub_doc ['content' ])
305
+ update_page_links (directory , base_directory , metadata .get ('file_path' , '' ), sub_doc ['url' ],
306
+ sub_doc ['content' ])
301
307
yield sub_doc
302
308
303
309
@@ -343,15 +349,14 @@ def main(base_directory, sub_directory, algolia_app_id, algolia_api_key, algolia
343
349
# Add PageRank scores to the documents
344
350
for doc in docs :
345
351
rank = page_rank_scores .get (doc .get ('url' , '' ), 0 )
346
- print (doc ['url' ])
347
352
doc ['page_rank' ] = int (rank * 10000000 )
348
353
for i in range (0 , len (docs ), batch_size ):
349
354
batch = docs [i :i + batch_size ] # Get the current batch
350
355
if not dry_run :
351
356
send_to_algolia (client , algolia_index_name , batch )
352
357
else :
353
358
for d in batch :
354
- print (d ['url' ] + '-' + d ['page_rank' ])
359
+ print (f" { d ['url' ]} - { d ['page_rank' ]} " )
355
360
print (f'{ 'processed' if dry_run else 'indexed' } { len (batch )} records' )
356
361
t += len (batch )
357
362
print (f'total for { directory } : { 'processed' if dry_run else 'indexed' } { t } records' )
@@ -382,4 +387,5 @@ def main(base_directory, sub_directory, algolia_app_id, algolia_api_key, algolia
382
387
args = parser .parse_args ()
383
388
if args .dry_run :
384
389
print ('Dry running, not sending results to Algolia.' )
385
- main (args .base_directory , args .sub_directory , args .algolia_app_id , args .algolia_api_key , args .algolia_index_name , dry_run = args .dry_run )
390
+ main (args .base_directory , args .sub_directory , args .algolia_app_id , args .algolia_api_key , args .algolia_index_name ,
391
+ dry_run = args .dry_run )
0 commit comments