Skip to content

Commit 11435ab

Browse files
committed
clean up script
1 parent f56e887 commit 11435ab

File tree

3 files changed

+24
-18
lines changed

3 files changed

+24
-18
lines changed

scripts/search/compute_ndcg.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@
77

88

99
# Initialize Algolia client
10-
# ALGOLIA_APP_ID = "5H9UG7CX5W"
11-
# ALGOLIA_API_KEY = "4a7bf25cf3edbef29d78d5e1eecfdca5"
10+
ALGOLIA_APP_ID = "5H9UG7CX5W"
11+
ALGOLIA_API_KEY = "4a7bf25cf3edbef29d78d5e1eecfdca5"
1212

1313
# old search engine using crawler
14-
ALGOLIA_APP_ID = "62VCH2MD74"
15-
ALGOLIA_API_KEY = "b78244d947484fe3ece7bc5472e9f2af"
14+
# ALGOLIA_APP_ID = "62VCH2MD74"
15+
# ALGOLIA_API_KEY = "b78244d947484fe3ece7bc5472e9f2af"
1616

1717

1818
client = SearchClientSync(ALGOLIA_APP_ID, ALGOLIA_API_KEY)

scripts/search/index_pages.py

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,14 @@ def read_metadata(text):
2626
return metadata
2727

2828

29-
def parse_metadata_and_content(directory, base_directory, md_file_path,):
29+
def parse_metadata_and_content(directory, base_directory, md_file_path, log_snippet_failure=True):
3030
"""Parse multiple metadata blocks and content from a Markdown file."""
3131
try:
3232
with open(md_file_path, 'r', encoding='utf-8') as file:
3333
content = file.read()
34-
except Exception:
35-
print(f"Warning: couldn't read metadata from {md_file_path}")
34+
except:
35+
if log_snippet_failure:
36+
print(f"Warning: couldn't read metadata from {md_file_path}")
3637
return {}, ''
3738
content = remove_code_blocks(content)
3839
# Inject any snippets
@@ -54,9 +55,10 @@ def parse_metadata_and_content(directory, base_directory, md_file_path,):
5455
if metadata['file_path'] == '/opt/clickhouse-docs/docs/en/guides/best-practices/sparse-primary-indexes.md':
5556
pass
5657
slug = metadata.get('slug', '/' + os.path.split(directory)[-1] + metadata['file_path'].replace(directory, ''))
57-
for p in ['.md', '.mdx','"',"'"]:
58-
slug = slug.removesuffix(p).removesuffix(p)
58+
for p in ['.md', '.mdx', '"', "'"]:
59+
slug = slug.removeprefix(p).removesuffix(p)
5960
slug = slug.removesuffix('/')
61+
6062
metadata['slug'] = slug
6163
return metadata, content
6264

@@ -174,23 +176,26 @@ def extract_links_from_content(content):
174176

175177

176178
# best effort at creating links between docs - handling both md and urls. Challenge here some files import others
177-
# and we don't recursivelt resolve
179+
# e.g. /opt/clickhouse-docs/docs/en/sql-reference/formats.mdx - we don't recursively resolve here
178180
def update_page_links(directory, base_directory, page_path, url, content):
179181
links = extract_links_from_content(content)
182+
fail = False
180183
for target in links:
181184
if target.endswith('.md') and not target.startswith('https'):
182185
if os.path.isabs(target):
183186
c_page = os.path.abspath(base_directory + '/' + target)
184187
else:
185-
c_page = os.path.abspath(os.path.join(os.path.dirname(page_path), './'+target))
186-
metadata, _ = parse_metadata_and_content(directory, base_directory, c_page)
188+
c_page = os.path.abspath(os.path.join(os.path.dirname(page_path), './' + target))
189+
metadata, _ = parse_metadata_and_content(directory, base_directory, c_page, log_snippet_failure=False)
187190
if 'slug' in metadata:
188191
link_data.append((url, f'{DOCS_SITE}{metadata.get('slug')}'))
189192
else:
190-
print(f"Warning: couldn't resolve link for {page_path}")
193+
fail = True
191194
elif target.startswith('/docs/'): # ignore external links
192195
target = target.removesuffix('/')
193196
link_data.append((url, f'{DOCS_SITE}{target.replace("/docs", "")}'))
197+
if fail:
198+
print(f"Warning: couldn't resolve link for {page_path}")
194199

195200

196201
def parse_markdown_content(metadata, content):
@@ -297,7 +302,8 @@ def process_markdown_directory(directory, base_directory):
297302
md_file_path = os.path.join(root, file)
298303
metadata, content = parse_metadata_and_content(directory, base_directory, md_file_path)
299304
for sub_doc in parse_markdown_content(metadata, content):
300-
update_page_links(directory, base_directory, metadata.get('file_path', ''), sub_doc['url'], sub_doc['content'])
305+
update_page_links(directory, base_directory, metadata.get('file_path', ''), sub_doc['url'],
306+
sub_doc['content'])
301307
yield sub_doc
302308

303309

@@ -343,15 +349,14 @@ def main(base_directory, sub_directory, algolia_app_id, algolia_api_key, algolia
343349
# Add PageRank scores to the documents
344350
for doc in docs:
345351
rank = page_rank_scores.get(doc.get('url', ''), 0)
346-
print(doc['url'])
347352
doc['page_rank'] = int(rank * 10000000)
348353
for i in range(0, len(docs), batch_size):
349354
batch = docs[i:i + batch_size] # Get the current batch
350355
if not dry_run:
351356
send_to_algolia(client, algolia_index_name, batch)
352357
else:
353358
for d in batch:
354-
print(d['url'] + '-' + d['page_rank'])
359+
print(f"{d['url']} - {d['page_rank']}")
355360
print(f'{'processed' if dry_run else 'indexed'} {len(batch)} records')
356361
t += len(batch)
357362
print(f'total for {directory}: {'processed' if dry_run else 'indexed'} {t} records')
@@ -382,4 +387,5 @@ def main(base_directory, sub_directory, algolia_app_id, algolia_api_key, algolia
382387
args = parser.parse_args()
383388
if args.dry_run:
384389
print('Dry running, not sending results to Algolia.')
385-
main(args.base_directory, args.sub_directory, args.algolia_app_id, args.algolia_api_key, args.algolia_index_name, dry_run=args.dry_run)
390+
main(args.base_directory, args.sub_directory, args.algolia_app_id, args.algolia_api_key, args.algolia_index_name,
391+
dry_run=args.dry_run)

scripts/search/results.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ todate,https://clickhouse.com/docs/en/sql-reference/functions/type-conversion-fu
1717
cast,https://clickhouse.com/docs/en/sql-reference/functions/type-conversion-functions#cast,,
1818
arraymap,https://clickhouse.com/docs/en/sql-reference/functions/array-functions#arraymapfunc-arr1-,,
1919
insert,https://clickhouse.com/docs/en/guides/inserting-data,https://clickhouse.com/docs/en/sql-reference/statements/insert-into,
20-
partition,https://clickhouse.com/docs/en/partitions,https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key,,
20+
partition,https://clickhouse.com/docs/en/partitions,https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key,https://clickhouse.com/docs/en/sql-reference/statements/alter/partition,
2121
kafka,https://clickhouse.com/docs/en/engines/table-engines/integrations/kafka,https://clickhouse.com/docs/en/integrations/kafka/kafka-table-engine,https://clickhouse.com/docs/knowledgebase/kafka-to-clickhouse-setup
2222
delete,https://clickhouse.com/docs/en/deletes/overview,,
2323
final,https://clickhouse.com/docs/en/sql-reference/statements/select/from#final-modifier,https://clickhouse.com/docs/en/cloud/bestpractices/avoid-optimize-final,https://clickhouse.com/docs/en/guides/replacing-merge-tree#final-performance

0 commit comments

Comments (0)