@@ -63,15 +63,14 @@ def parse_metadata_and_content(directory, base_directory, md_file_path, log_snip
63
63
# Add file path to metadata
64
64
metadata ['file_path' ] = md_file_path
65
65
# Note: we assume the last subfolder of `directory` is part of the URL, so it is prepended to the default slug
66
- if metadata ['file_path' ] == '/opt/clickhouse-docs/docs/en/guides/best-practices/sparse-primary-indexes.md' :
67
- pass
68
66
slug = metadata .get ('slug' , '/' + os .path .split (directory )[- 1 ] + metadata ['file_path' ].replace (directory , '' ))
69
67
for p in ['.md' , '.mdx' , '"' , "'" ]:
70
68
slug = slug .removeprefix (p ).removesuffix (p )
71
69
slug = slug .removesuffix ('/' )
72
70
content = re .sub (r'^import .+?from .+?$' , '' , content , flags = re .MULTILINE ) # remove import
73
71
content = re .sub (r'<[A-Za-z0-9_-]+\s*[^>]*\/>' , '' , content ) # report components
74
72
metadata ['slug' ] = slug
73
+ metadata ['title' ] = metadata .get ('title' , '' ).strip ()
75
74
return metadata , content
76
75
77
76
@@ -250,6 +249,7 @@ def parse_markdown_content(metadata, content):
250
249
current_subdoc ['type' ] = 'lvl1'
251
250
current_subdoc ['object_id' ] = custom_slugify (heading_slug )
252
251
current_subdoc ['hierarchy' ]['lvl1' ] = current_h1
252
+ current_subdoc ['hierarchy' ]['lvl0' ] = current_h1 if metadata .get ('title' , '' ) == '' else metadata .get ('title' , '' )
253
253
elif line .startswith ('## ' ):
254
254
if current_subdoc :
255
255
yield from split_large_document (current_subdoc )
@@ -272,7 +272,7 @@ def parse_markdown_content(metadata, content):
272
272
'objectID' : get_object_id (f'{ heading_slug } -{ current_h2 } ' ),
273
273
'type' : 'lvl2' ,
274
274
'hierarchy' : {
275
- 'lvl0' : metadata .get ('title' , '' ),
275
+ 'lvl0' : current_h1 if metadata . get ( 'title' , '' ) == '' else metadata .get ('title' , '' ),
276
276
'lvl1' : current_h1 ,
277
277
'lvl2' : current_h2 ,
278
278
}
@@ -300,7 +300,7 @@ def parse_markdown_content(metadata, content):
300
300
'objectID' : get_object_id (f'{ heading_slug } -{ current_h3 } ' ),
301
301
'type' : 'lvl3' ,
302
302
'hierarchy' : {
303
- 'lvl0' : metadata .get ('title' , '' ),
303
+ 'lvl0' : current_h1 if metadata . get ( 'title' , '' ) == '' else metadata .get ('title' , '' ),
304
304
'lvl1' : current_h1 ,
305
305
'lvl2' : current_h2 ,
306
306
'lvl3' : current_h3 ,
@@ -325,7 +325,7 @@ def parse_markdown_content(metadata, content):
325
325
'objectID' : get_object_id (f'{ heading_slug } -{ current_h4 } ' ),
326
326
'type' : 'lvl4' ,
327
327
'hierarchy' : {
328
- 'lvl0' : metadata .get ('title' , '' ),
328
+ 'lvl0' : current_h1 if metadata . get ( 'title' , '' ) == '' else metadata .get ('title' , '' ),
329
329
'lvl1' : current_h1 ,
330
330
'lvl2' : current_h2 ,
331
331
'lvl3' : current_h3 ,
@@ -404,7 +404,8 @@ def main(base_directory, sub_directories, algolia_app_id, algolia_api_key, algol
404
404
batch_size = 1000 , dry_run = False ):
405
405
temp_index_name = f"{ algolia_index_name } _temp"
406
406
client = SearchClientSync (algolia_app_id , algolia_api_key )
407
- create_new_index (client , temp_index_name )
407
+ if not dry_run :
408
+ create_new_index (client , temp_index_name )
408
409
docs = []
409
410
for sub_directory in sub_directories :
410
411
directory = os .path .join (base_directory , sub_directory )
@@ -426,14 +427,15 @@ def main(base_directory, sub_directories, algolia_app_id, algolia_api_key, algol
426
427
print (f'{ 'processed' if dry_run else 'indexed' } { len (batch )} records' )
427
428
t += len (batch )
428
429
print (f'total { 'processed' if dry_run else 'indexed' } { t } records' )
429
- print ('switching temporary index...' , end = '' )
430
- client .operation_index (
431
- index_name = temp_index_name ,
432
- operation_index_params = {
433
- "operation" : "move" ,
434
- "destination" : algolia_index_name
435
- },
436
- )
430
+ if not dry_run :
431
+ print ('switching temporary index...' , end = '' )
432
+ client .operation_index (
433
+ index_name = temp_index_name ,
434
+ operation_index_params = {
435
+ "operation" : "move" ,
436
+ "destination" : algolia_index_name
437
+ },
438
+ )
437
439
print ('done' )
438
440
439
441
0 commit comments