8
8
from algoliasearch .search .client import SearchClientSync
9
9
import networkx as nx
10
10
11
- DOCS_PREFIX = 'https://clickhouse.com/docs'
11
+ DOCS_SITE = 'https://clickhouse.com/docs'
12
12
HEADER_PATTERN = re .compile (r"^(.*?)(?:\s*\{#(.*?)\})$" )
13
13
object_ids = set ()
14
14
@@ -26,13 +26,17 @@ def read_metadata(text):
26
26
return metadata
27
27
28
28
29
- def parse_metadata_and_content (root_directory , md_file_path ):
29
+ def parse_metadata_and_content (directory , base_directory , md_file_path , ):
30
30
"""Parse multiple metadata blocks and content from a Markdown file."""
31
- with open (md_file_path , 'r' , encoding = 'utf-8' ) as file :
32
- content = file .read ()
31
+ try :
32
+ with open (md_file_path , 'r' , encoding = 'utf-8' ) as file :
33
+ content = file .read ()
34
+ except Exception :
35
+ print (f"Warning: couldn't read metadata from { md_file_path } " )
36
+ return {}, ''
33
37
content = remove_code_blocks (content )
34
38
# Inject any snippets
35
- content = inject_snippets (root_directory , content )
39
+ content = inject_snippets (base_directory , content )
36
40
# Pattern to capture multiple metadata blocks
37
41
metadata_pattern = r'---\n(.*?)\n---\n'
38
42
metadata_blocks = re .findall (metadata_pattern , content , re .DOTALL )
@@ -46,6 +50,14 @@ def parse_metadata_and_content(root_directory, md_file_path):
46
50
content = re .sub (metadata_pattern , '' , content , flags = re .DOTALL )
47
51
# Add file path to metadata
48
52
metadata ['file_path' ] = md_file_path
53
+ # Note: we assume last sub folder in directory is in url
54
+ if metadata ['file_path' ] == '/opt/clickhouse-docs/docs/en/guides/best-practices/sparse-primary-indexes.md' :
55
+ pass
56
+ slug = metadata .get ('slug' , '/' + os .path .split (directory )[- 1 ] + metadata ['file_path' ].replace (directory , '' ))
57
+ for p in ['.md' , '.mdx' ,'"' ,"'" ]:
58
+ slug = slug .removesuffix (p ).removesuffix (p )
59
+ slug = slug .removesuffix ('/' )
60
+ metadata ['slug' ] = slug
49
61
return metadata , content
50
62
51
63
@@ -161,34 +173,43 @@ def extract_links_from_content(content):
161
173
return re .findall (link_pattern , content )
162
174
163
175
164
# Best effort at creating links between docs, handling both Markdown file
# links and site URLs. Caveat: some files import others, and those imports
# are not resolved recursively.
def update_page_links(directory, base_directory, page_path, url, content):
    """Record the outgoing documentation links of one sub-document.

    Every link found in ``content`` that points at another docs page is
    appended to the module-level ``link_data`` list as a
    ``(source_url, target_url)`` tuple. All other links are ignored.

    Args:
        directory: Docs directory being indexed; forwarded to
            ``parse_metadata_and_content`` to resolve the linked page's slug.
        base_directory: Root of the docs repo. Link targets starting with
            ``/`` are resolved against it.
        page_path: Path of the Markdown file ``content`` came from. Relative
            link targets are resolved against its folder.
        url: URL of the sub-document containing the links (the edge source).
        content: Text to scan for links.
    """
    for target in extract_links_from_content(content):
        # Match real URL schemes only: a bare startswith('https') would also
        # skip relative files such as 'https-setup.md', and would let
        # 'http://...md' through as if it were a file path.
        is_remote = target.startswith(('http://', 'https://'))
        if target.endswith('.md') and not is_remote:
            if os.path.isabs(target):
                # Repo-rooted link: strip the leading '/' so join() keeps
                # base_directory instead of discarding it.
                linked_page = os.path.abspath(
                    os.path.join(base_directory, target.lstrip('/')))
            else:
                linked_page = os.path.abspath(
                    os.path.join(os.path.dirname(page_path), target))
            metadata, _ = parse_metadata_and_content(directory, base_directory, linked_page)
            if 'slug' in metadata:
                link_data.append((url, f"{DOCS_SITE}{metadata['slug']}"))
            else:
                # Name the offending link so the warning is actionable.
                print(f"Warning: couldn't resolve link {target} for {page_path}")
        elif target.startswith('/docs/'):  # anything else is external and ignored
            # DOCS_SITE already ends in '/docs', so drop only the leading
            # prefix; str.replace would also strip '/docs' deeper in the path.
            path = target.removesuffix('/').removeprefix('/docs')
            link_data.append((url, f'{DOCS_SITE}{path}'))
169
194
170
195
171
- def parse_markdown_content (directory , metadata , content ):
196
+ def parse_markdown_content (metadata , content ):
172
197
"""Parse the Markdown content and generate sub-documents for each ##, ###, and #### heading."""
173
- slug = metadata .get ('slug' ,
174
- '/' + os .path .split (os .path .split (metadata ['file_path' ])[0 ])[1 ] + metadata ['file_path' ].replace (
175
- directory , '' ).removesuffix ('.md' ).removesuffix ('.mdx' ))
176
- slug = slug .removesuffix ('/' )
198
+ slug = metadata ['slug' ]
177
199
heading_slug = slug
178
200
lines = content .splitlines ()
179
201
current_h1 = metadata .get ('title' , '' )
180
202
181
203
current_subdoc = {
182
204
'file_path' : metadata .get ('file_path' , '' ),
183
205
'slug' : heading_slug ,
184
- 'url' : f'{ DOCS_PREFIX } { heading_slug } ' ,
206
+ 'url' : f'{ DOCS_SITE } { heading_slug } ' ,
185
207
'h1' : current_h1 ,
186
208
'content' : metadata .get ('description' , '' ),
187
209
'title' : metadata .get ('title' , '' ),
188
210
'keywords' : metadata .get ('keywords' , '' ),
189
211
'objectID' : get_object_id (heading_slug ),
190
212
}
191
-
192
213
for line in lines :
193
214
if line .startswith ('# ' ):
194
215
if line [2 :].strip ():
@@ -198,12 +219,11 @@ def parse_markdown_content(directory, metadata, content):
198
219
current_h1 = slug_match .group (2 )
199
220
heading_slug = slug_match .group (2 )
200
221
current_subdoc ['slug' ] = heading_slug
201
- current_subdoc ['url' ] = f'{ DOCS_PREFIX } { heading_slug } '
222
+ current_subdoc ['url' ] = f'{ DOCS_SITE } { heading_slug } '
202
223
current_subdoc ['h1' ] = current_h1
203
224
current_subdoc ['object_id' ] = custom_slugify (heading_slug )
204
225
elif line .startswith ('## ' ):
205
226
if current_subdoc :
206
- update_page_rank (current_subdoc ['url' ], current_subdoc ['content' ])
207
227
yield from split_large_document (current_subdoc )
208
228
current_h2 = line [3 :].strip ()
209
229
slug_match = re .match (HEADER_PATTERN , current_h2 )
@@ -215,7 +235,7 @@ def parse_markdown_content(directory, metadata, content):
215
235
current_subdoc = {
216
236
'file_path' : metadata .get ('file_path' , '' ),
217
237
'slug' : f'{ heading_slug } ' ,
218
- 'url' : f'{ DOCS_PREFIX } { heading_slug } ' ,
238
+ 'url' : f'{ DOCS_SITE } { heading_slug } ' ,
219
239
'title' : current_h2 ,
220
240
'h2' : current_h2 ,
221
241
'content' : '' ,
@@ -225,7 +245,6 @@ def parse_markdown_content(directory, metadata, content):
225
245
elif line .startswith ('### ' ):
226
246
# note we send users to the h2 or h1 even on ###
227
247
if current_subdoc :
228
- update_page_rank (current_subdoc ['url' ], current_subdoc ['content' ])
229
248
yield from split_large_document (current_subdoc )
230
249
current_h3 = line [4 :].strip ()
231
250
slug_match = re .match (HEADER_PATTERN , current_h3 )
@@ -237,7 +256,7 @@ def parse_markdown_content(directory, metadata, content):
237
256
current_subdoc = {
238
257
'file_path' : metadata .get ('file_path' , '' ),
239
258
'slug' : f'{ heading_slug } ' ,
240
- 'url' : f'{ DOCS_PREFIX } { heading_slug } ' ,
259
+ 'url' : f'{ DOCS_SITE } { heading_slug } ' ,
241
260
'title' : current_h3 ,
242
261
'h3' : current_h3 ,
243
262
'content' : '' ,
@@ -246,7 +265,6 @@ def parse_markdown_content(directory, metadata, content):
246
265
}
247
266
elif line .startswith ('#### ' ):
248
267
if current_subdoc :
249
- update_page_rank (current_subdoc ['url' ], current_subdoc ['content' ])
250
268
yield from split_large_document (current_subdoc )
251
269
current_h4 = line [5 :].strip ()
252
270
slug_match = re .match (HEADER_PATTERN , current_h4 )
@@ -255,7 +273,7 @@ def parse_markdown_content(directory, metadata, content):
255
273
current_subdoc = {
256
274
'file_path' : metadata .get ('file_path' , '' ),
257
275
'slug' : f'{ heading_slug } ' ,
258
- 'url' : f'{ DOCS_PREFIX } { heading_slug } #' ,
276
+ 'url' : f'{ DOCS_SITE } { heading_slug } #' ,
259
277
'title' : current_h4 ,
260
278
'h4' : current_h4 ,
261
279
'content' : '' ,
@@ -266,23 +284,21 @@ def parse_markdown_content(directory, metadata, content):
266
284
current_subdoc ['content' ] += line + '\n '
267
285
268
286
if current_subdoc :
269
- update_page_rank (current_subdoc ['url' ], current_subdoc ['content' ])
270
287
yield from split_large_document (current_subdoc )
271
288
272
289
273
def process_markdown_directory(directory, base_directory):
    """Recursively process Markdown files in a directory.

    Walks ``directory``, parses every ``.md`` / ``.mdx`` file and yields the
    sub-documents produced by ``parse_markdown_content``. For each yielded
    sub-document its outgoing links are recorded via ``update_page_links``.

    Args:
        directory: Directory to index.
        base_directory: Root of the docs repo, forwarded to the parsing and
            link-resolution helpers.

    Yields:
        One sub-document dict per parsed section.
    """
    # update_page_links resolves linked pages to absolute paths, and slugs are
    # derived by stripping ``directory`` from a file path. Both roots must be
    # absolute (and free of trailing slashes) for that prefix strip to match.
    directory = os.path.abspath(directory)
    base_directory = os.path.abspath(base_directory)
    for root, dirs, files in os.walk(directory):
        # Prune in place so os.walk never descends into these folders.
        dirs[:] = [d for d in dirs if d not in ('_snippets', '_placeholders')]
        for file in files:
            if not file.endswith(('.md', '.mdx')):
                continue
            md_file_path = os.path.join(root, file)
            metadata, content = parse_metadata_and_content(directory, base_directory, md_file_path)
            if 'slug' not in metadata:
                # An unreadable file comes back as ({}, '') after a warning has
                # been printed; skip it rather than fail on the missing slug.
                continue
            for sub_doc in parse_markdown_content(metadata, content):
                update_page_links(directory, base_directory, metadata.get('file_path', ''),
                                  sub_doc['url'], sub_doc['content'])
                yield sub_doc
286
302
287
303
288
304
def send_to_algolia (client , index_name , records ):
@@ -315,45 +331,44 @@ def compute_page_rank(link_data, damping_factor=0.85, max_iter=100, tol=1e-6):
315
331
return page_rank
316
332
317
333
318
def main(base_directory, sub_directory, algolia_app_id, algolia_api_key, algolia_index_name,
         batch_size=1000, dry_run=False):
    """Index one docs sub-directory into Algolia, or print it on a dry run.

    Collects every sub-document under ``base_directory/sub_directory``,
    attaches a ``page_rank`` score computed from the collected link graph,
    then sends the documents in batches.

    Args:
        base_directory: Root directory of the docs repo.
        sub_directory: Sub-directory (relative to ``base_directory``) to index.
        algolia_app_id: Algolia application id.
        algolia_api_key: Algolia API key.
        algolia_index_name: Name of the index to write to.
        batch_size: Number of records sent per batch.
        dry_run: When true, print ``url-page_rank`` for each record instead
            of sending anything to Algolia.
    """
    # A dry run never sends anything, so it does not need a client.
    client = None if dry_run else SearchClientSync(algolia_app_id, algolia_api_key)
    directory = os.path.join(base_directory, sub_directory)

    # Materialise all documents first: link_data is only complete once every
    # page has been walked, and the ranks are computed from it.
    docs = list(process_markdown_directory(directory, base_directory))

    page_rank_scores = compute_page_rank(link_data)
    # Add PageRank scores to the documents, stored as a scaled integer.
    for doc in docs:
        rank = page_rank_scores.get(doc.get('url', ''), 0)
        doc['page_rank'] = int(rank * 10000000)

    verb = 'processed' if dry_run else 'indexed'
    total = 0
    for start in range(0, len(docs), batch_size):
        batch = docs[start:start + batch_size]
        if dry_run:
            for d in batch:
                # page_rank is an int, so format it instead of using '+'
                # (str + int raises TypeError).
                print(f"{d['url']}-{d['page_rank']}")
        else:
            send_to_algolia(client, algolia_index_name, batch)
        print(f'{verb} {len(batch)} records')
        total += len(batch)
    print(f'total for {directory}: {verb} {total} records')
343
358
344
359
345
360
if __name__ == '__main__' :
346
361
parser = argparse .ArgumentParser (description = 'Index search pages.' )
347
362
parser .add_argument (
348
363
'-d' ,
349
- '--root_directory ' ,
364
+ '--base_directory ' ,
350
365
help = 'Path to root directory of docs repo'
351
366
)
352
367
parser .add_argument (
353
- '-p ' ,
354
- '--doc_paths ' ,
355
- default = "docs/en,knowledgebase" ,
356
- help = 'Sub path directories to index '
368
+ '-s ' ,
369
+ '--sub_directory ' ,
370
+ help = 'Sub directory to process' ,
371
+ default = 'docs/en '
357
372
)
358
373
parser .add_argument (
359
374
'-x' ,
@@ -367,6 +382,4 @@ def main(root_directory, sub_directories, algolia_app_id, algolia_api_key, algol
367
382
args = parser .parse_args ()
368
383
if args .dry_run :
369
384
print ('Dry running, not sending results to Algolia.' )
370
- sub_directories = [p .strip () for p in args .doc_paths .split (',' )]
371
- main (args .root_directory , sub_directories , args .algolia_app_id , args .algolia_api_key , args .algolia_index_name ,
372
- dry_run = args .dry_run )
385
+ main (args .base_directory , args .sub_directory , args .algolia_app_id , args .algolia_api_key , args .algolia_index_name , dry_run = args .dry_run )
0 commit comments