Skip to content

Commit f56e887

Browse files
committed
fix links in script
1 parent 5d8763a commit f56e887

File tree

2 files changed

+61
-48
lines changed

2 files changed

+61
-48
lines changed

docs/en/chdb/getting-started.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,7 @@ pip install pandas pyarrow
4949
## Querying a JSON file in S3
5050

5151
Let's now have a look at how to query a JSON file that's stored in an S3 bucket.
52-
The [YouTube dislikes dataset](https://clickhouse.com/docs/en/getting-started/example-datasets/youtube-dislikes) contains more than 4 billion rows of dislikes on YouTube videos up to 2021.
52+
The [YouTube dislikes dataset](/docs/en/getting-started/example-datasets/youtube-dislikes) contains more than 4 billion rows of dislikes on YouTube videos up to 2021.
5353
We're going to work with one of the JSON files from that dataset.
5454

5555
Import chdb:

scripts/search/index_pages.py

Lines changed: 60 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
from algoliasearch.search.client import SearchClientSync
99
import networkx as nx
1010

11-
DOCS_PREFIX = 'https://clickhouse.com/docs'
11+
DOCS_SITE = 'https://clickhouse.com/docs'
1212
HEADER_PATTERN = re.compile(r"^(.*?)(?:\s*\{#(.*?)\})$")
1313
object_ids = set()
1414

@@ -26,13 +26,17 @@ def read_metadata(text):
2626
return metadata
2727

2828

29-
def parse_metadata_and_content(root_directory, md_file_path):
29+
def parse_metadata_and_content(directory, base_directory, md_file_path,):
3030
"""Parse multiple metadata blocks and content from a Markdown file."""
31-
with open(md_file_path, 'r', encoding='utf-8') as file:
32-
content = file.read()
31+
try:
32+
with open(md_file_path, 'r', encoding='utf-8') as file:
33+
content = file.read()
34+
except Exception:
35+
print(f"Warning: couldn't read metadata from {md_file_path}")
36+
return {}, ''
3337
content = remove_code_blocks(content)
3438
# Inject any snippets
35-
content = inject_snippets(root_directory, content)
39+
content = inject_snippets(base_directory, content)
3640
# Pattern to capture multiple metadata blocks
3741
metadata_pattern = r'---\n(.*?)\n---\n'
3842
metadata_blocks = re.findall(metadata_pattern, content, re.DOTALL)
@@ -46,6 +50,14 @@ def parse_metadata_and_content(root_directory, md_file_path):
4650
content = re.sub(metadata_pattern, '', content, flags=re.DOTALL)
4751
# Add file path to metadata
4852
metadata['file_path'] = md_file_path
53+
# Note: we assume last sub folder in directory is in url
54+
if metadata['file_path'] == '/opt/clickhouse-docs/docs/en/guides/best-practices/sparse-primary-indexes.md':
55+
pass
56+
slug = metadata.get('slug', '/' + os.path.split(directory)[-1] + metadata['file_path'].replace(directory, ''))
57+
for p in ['.md', '.mdx','"',"'"]:
58+
slug = slug.removesuffix(p).removesuffix(p)
59+
slug = slug.removesuffix('/')
60+
metadata['slug'] = slug
4961
return metadata, content
5062

5163

@@ -161,34 +173,43 @@ def extract_links_from_content(content):
161173
return re.findall(link_pattern, content)
162174

163175

164-
def update_page_rank(url, content):
176+
# best effort at creating links between docs - handling both md and urls. The challenge here is that some files import others
177+
# and we don't recursively resolve
178+
def update_page_links(directory, base_directory, page_path, url, content):
165179
links = extract_links_from_content(content)
166180
for target in links:
167-
if target.startswith('/docs/') and not target.endswith('.md'):
168-
link_data.append((url, f'{DOCS_PREFIX}{target.replace("/docs", "")}'))
181+
if target.endswith('.md') and not target.startswith('https'):
182+
if os.path.isabs(target):
183+
c_page = os.path.abspath(base_directory + '/' + target)
184+
else:
185+
c_page = os.path.abspath(os.path.join(os.path.dirname(page_path), './'+target))
186+
metadata, _ = parse_metadata_and_content(directory, base_directory, c_page)
187+
if 'slug' in metadata:
188+
link_data.append((url, f'{DOCS_SITE}{metadata.get('slug')}'))
189+
else:
190+
print(f"Warning: couldn't resolve link for {page_path}")
191+
elif target.startswith('/docs/'): # ignore external links
192+
target = target.removesuffix('/')
193+
link_data.append((url, f'{DOCS_SITE}{target.replace("/docs", "")}'))
169194

170195

171-
def parse_markdown_content(directory, metadata, content):
196+
def parse_markdown_content(metadata, content):
172197
"""Parse the Markdown content and generate sub-documents for each ##, ###, and #### heading."""
173-
slug = metadata.get('slug',
174-
'/' + os.path.split(os.path.split(metadata['file_path'])[0])[1] + metadata['file_path'].replace(
175-
directory, '').removesuffix('.md').removesuffix('.mdx'))
176-
slug = slug.removesuffix('/')
198+
slug = metadata['slug']
177199
heading_slug = slug
178200
lines = content.splitlines()
179201
current_h1 = metadata.get('title', '')
180202

181203
current_subdoc = {
182204
'file_path': metadata.get('file_path', ''),
183205
'slug': heading_slug,
184-
'url': f'{DOCS_PREFIX}{heading_slug}',
206+
'url': f'{DOCS_SITE}{heading_slug}',
185207
'h1': current_h1,
186208
'content': metadata.get('description', ''),
187209
'title': metadata.get('title', ''),
188210
'keywords': metadata.get('keywords', ''),
189211
'objectID': get_object_id(heading_slug),
190212
}
191-
192213
for line in lines:
193214
if line.startswith('# '):
194215
if line[2:].strip():
@@ -198,12 +219,11 @@ def parse_markdown_content(directory, metadata, content):
198219
current_h1 = slug_match.group(2)
199220
heading_slug = slug_match.group(2)
200221
current_subdoc['slug'] = heading_slug
201-
current_subdoc['url'] = f'{DOCS_PREFIX}{heading_slug}'
222+
current_subdoc['url'] = f'{DOCS_SITE}{heading_slug}'
202223
current_subdoc['h1'] = current_h1
203224
current_subdoc['object_id'] = custom_slugify(heading_slug)
204225
elif line.startswith('## '):
205226
if current_subdoc:
206-
update_page_rank(current_subdoc['url'], current_subdoc['content'])
207227
yield from split_large_document(current_subdoc)
208228
current_h2 = line[3:].strip()
209229
slug_match = re.match(HEADER_PATTERN, current_h2)
@@ -215,7 +235,7 @@ def parse_markdown_content(directory, metadata, content):
215235
current_subdoc = {
216236
'file_path': metadata.get('file_path', ''),
217237
'slug': f'{heading_slug}',
218-
'url': f'{DOCS_PREFIX}{heading_slug}',
238+
'url': f'{DOCS_SITE}{heading_slug}',
219239
'title': current_h2,
220240
'h2': current_h2,
221241
'content': '',
@@ -225,7 +245,6 @@ def parse_markdown_content(directory, metadata, content):
225245
elif line.startswith('### '):
226246
# note we send users to the h2 or h1 even on ###
227247
if current_subdoc:
228-
update_page_rank(current_subdoc['url'], current_subdoc['content'])
229248
yield from split_large_document(current_subdoc)
230249
current_h3 = line[4:].strip()
231250
slug_match = re.match(HEADER_PATTERN, current_h3)
@@ -237,7 +256,7 @@ def parse_markdown_content(directory, metadata, content):
237256
current_subdoc = {
238257
'file_path': metadata.get('file_path', ''),
239258
'slug': f'{heading_slug}',
240-
'url': f'{DOCS_PREFIX}{heading_slug}',
259+
'url': f'{DOCS_SITE}{heading_slug}',
241260
'title': current_h3,
242261
'h3': current_h3,
243262
'content': '',
@@ -246,7 +265,6 @@ def parse_markdown_content(directory, metadata, content):
246265
}
247266
elif line.startswith('#### '):
248267
if current_subdoc:
249-
update_page_rank(current_subdoc['url'], current_subdoc['content'])
250268
yield from split_large_document(current_subdoc)
251269
current_h4 = line[5:].strip()
252270
slug_match = re.match(HEADER_PATTERN, current_h4)
@@ -255,7 +273,7 @@ def parse_markdown_content(directory, metadata, content):
255273
current_subdoc = {
256274
'file_path': metadata.get('file_path', ''),
257275
'slug': f'{heading_slug}',
258-
'url': f'{DOCS_PREFIX}{heading_slug}#',
276+
'url': f'{DOCS_SITE}{heading_slug}#',
259277
'title': current_h4,
260278
'h4': current_h4,
261279
'content': '',
@@ -266,23 +284,21 @@ def parse_markdown_content(directory, metadata, content):
266284
current_subdoc['content'] += line + '\n'
267285

268286
if current_subdoc:
269-
update_page_rank(current_subdoc['url'], current_subdoc['content'])
270287
yield from split_large_document(current_subdoc)
271288

272289

273-
def process_markdown_directory(root_directory, directory):
290+
def process_markdown_directory(directory, base_directory):
274291
"""Recursively process Markdown files in a directory."""
275-
directory = os.path.abspath(directory)
276-
i = 0
277292
for root, dirs, files in os.walk(directory):
278293
# Skip `_snippets` and _placeholders subfolders
279294
dirs[:] = [d for d in dirs if d != '_snippets' and d != '_placeholders']
280295
for file in files:
281296
if file.endswith('.md') or file.endswith('.mdx'):
282297
md_file_path = os.path.join(root, file)
283-
metadata, content = parse_metadata_and_content(root_directory, md_file_path)
284-
for subdoc in parse_markdown_content(directory, metadata, content):
285-
yield subdoc
298+
metadata, content = parse_metadata_and_content(directory, base_directory, md_file_path)
299+
for sub_doc in parse_markdown_content(metadata, content):
300+
update_page_links(directory, base_directory, metadata.get('file_path', ''), sub_doc['url'], sub_doc['content'])
301+
yield sub_doc
286302

287303

288304
def send_to_algolia(client, index_name, records):
@@ -315,45 +331,44 @@ def compute_page_rank(link_data, damping_factor=0.85, max_iter=100, tol=1e-6):
315331
return page_rank
316332

317333

318-
def main(root_directory, sub_directories, algolia_app_id, algolia_api_key, algolia_index_name, batch_size=1000,
319-
dry_run=False):
334+
def main(base_directory, sub_directory, algolia_app_id, algolia_api_key, algolia_index_name,
335+
batch_size=1000, dry_run=False):
320336
client = SearchClientSync(algolia_app_id, algolia_api_key)
321-
batch = []
337+
directory = os.path.join(base_directory, sub_directory)
322338
t = 0
323339
docs = []
324-
for sub_directory in sub_directories:
325-
input_directory = os.path.join(root_directory, sub_directory)
326-
for doc in process_markdown_directory(root_directory, input_directory):
327-
docs.append(doc)
340+
for doc in process_markdown_directory(directory, base_directory):
341+
docs.append(doc)
328342
page_rank_scores = compute_page_rank(link_data)
329343
# Add PageRank scores to the documents
330344
for doc in docs:
331345
rank = page_rank_scores.get(doc.get('url', ''), 0)
346+
print(doc['url'])
332347
doc['page_rank'] = int(rank * 10000000)
333348
for i in range(0, len(docs), batch_size):
334349
batch = docs[i:i + batch_size] # Get the current batch
335350
if not dry_run:
336351
send_to_algolia(client, algolia_index_name, batch)
337352
else:
338-
for b in batch:
339-
print(json.dumps(b))
353+
for d in batch:
354+
print(d['url'] + '-' + d['page_rank'])
340355
print(f'{'processed' if dry_run else 'indexed'} {len(batch)} records')
341356
t += len(batch)
342-
print(f'total for {sub_directory}: {'processed' if dry_run else 'indexed'} {t} records')
357+
print(f'total for {directory}: {'processed' if dry_run else 'indexed'} {t} records')
343358

344359

345360
if __name__ == '__main__':
346361
parser = argparse.ArgumentParser(description='Index search pages.')
347362
parser.add_argument(
348363
'-d',
349-
'--root_directory',
364+
'--base_directory',
350365
help='Path to root directory of docs repo'
351366
)
352367
parser.add_argument(
353-
'-p',
354-
'--doc_paths',
355-
default="docs/en,knowledgebase",
356-
help='Sub path directories to index'
368+
'-s',
369+
'--sub_directory',
370+
help='Sub directory to process',
371+
default='docs/en'
357372
)
358373
parser.add_argument(
359374
'-x',
@@ -367,6 +382,4 @@ def main(root_directory, sub_directories, algolia_app_id, algolia_api_key, algol
367382
args = parser.parse_args()
368383
if args.dry_run:
369384
print('Dry running, not sending results to Algolia.')
370-
sub_directories = [p.strip() for p in args.doc_paths.split(',')]
371-
main(args.root_directory, sub_directories, args.algolia_app_id, args.algolia_api_key, args.algolia_index_name,
372-
dry_run=args.dry_run)
385+
main(args.base_directory, args.sub_directory, args.algolia_app_id, args.algolia_api_key, args.algolia_index_name, dry_run=args.dry_run)

0 commit comments

Comments
 (0)