Skip to content

Commit 817bf01

Browse files
committed
don't index more than once
1 parent 11435ab commit 817bf01

File tree

1 file changed

+26
-21
lines changed

1 file changed

+26
-21
lines changed

scripts/search/index_pages.py

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
DOCS_SITE = 'https://clickhouse.com/docs'
1212
HEADER_PATTERN = re.compile(r"^(.*?)(?:\s*\{#(.*?)\})$")
1313
object_ids = set()
14-
14+
files_processed = set()
1515
link_data = []
1616

1717

@@ -135,12 +135,14 @@ def inject_snippets(directory, content):
135135

136136
for snippet_name, snippet_full_path, _ in matches:
137137
full_path = os.path.join(directory, snippet_full_path)
138-
if os.path.exists(full_path):
139-
with open(full_path, 'r', encoding='utf-8') as snippet_file:
140-
snippet_map[snippet_name] = remove_code_blocks(snippet_file.read())
141-
else:
142-
print(f"FATAL: Unable to handle snippet: {full_path}")
143-
sys.exit(1)
138+
if full_path not in files_processed: # we dont index snippets more than once
139+
if os.path.exists(full_path):
140+
with open(full_path, 'r', encoding='utf-8') as snippet_file:
141+
snippet_map[snippet_name] = remove_code_blocks(snippet_file.read())
142+
files_processed.add(full_path)
143+
else:
144+
print(f"FATAL: Unable to handle snippet: {full_path}")
145+
sys.exit(1)
144146
content = snippet_pattern.sub("", content)
145147
for snippet_name, snippet_content in snippet_map.items():
146148
tag_pattern = re.compile(fr"<{snippet_name}\s*/>")
@@ -204,7 +206,6 @@ def parse_markdown_content(metadata, content):
204206
heading_slug = slug
205207
lines = content.splitlines()
206208
current_h1 = metadata.get('title', '')
207-
208209
current_subdoc = {
209210
'file_path': metadata.get('file_path', ''),
210211
'slug': heading_slug,
@@ -300,11 +301,13 @@ def process_markdown_directory(directory, base_directory):
300301
for file in files:
301302
if file.endswith('.md') or file.endswith('.mdx'):
302303
md_file_path = os.path.join(root, file)
303-
metadata, content = parse_metadata_and_content(directory, base_directory, md_file_path)
304-
for sub_doc in parse_markdown_content(metadata, content):
305-
update_page_links(directory, base_directory, metadata.get('file_path', ''), sub_doc['url'],
306-
sub_doc['content'])
307-
yield sub_doc
304+
if md_file_path not in files_processed:
305+
files_processed.add(md_file_path)
306+
metadata, content = parse_metadata_and_content(directory, base_directory, md_file_path)
307+
for sub_doc in parse_markdown_content(metadata, content):
308+
update_page_links(directory, base_directory, metadata.get('file_path', ''), sub_doc['url'],
309+
sub_doc['content'])
310+
yield sub_doc
308311

309312

310313
def send_to_algolia(client, index_name, records):
@@ -337,16 +340,17 @@ def compute_page_rank(link_data, damping_factor=0.85, max_iter=100, tol=1e-6):
337340
return page_rank
338341

339342

340-
def main(base_directory, sub_directory, algolia_app_id, algolia_api_key, algolia_index_name,
343+
def main(base_directory, sub_directories, algolia_app_id, algolia_api_key, algolia_index_name,
341344
batch_size=1000, dry_run=False):
342345
client = SearchClientSync(algolia_app_id, algolia_api_key)
343-
directory = os.path.join(base_directory, sub_directory)
344-
t = 0
345346
docs = []
346-
for doc in process_markdown_directory(directory, base_directory):
347-
docs.append(doc)
347+
for sub_directory in sub_directories:
348+
directory = os.path.join(base_directory, sub_directory)
349+
for doc in process_markdown_directory(directory, base_directory):
350+
docs.append(doc)
348351
page_rank_scores = compute_page_rank(link_data)
349352
# Add PageRank scores to the documents
353+
t = 0
350354
for doc in docs:
351355
rank = page_rank_scores.get(doc.get('url', ''), 0)
352356
doc['page_rank'] = int(rank * 10000000)
@@ -371,9 +375,9 @@ def main(base_directory, sub_directory, algolia_app_id, algolia_api_key, algolia
371375
)
372376
parser.add_argument(
373377
'-s',
374-
'--sub_directory',
378+
'--sub_directories',
375379
help='Sub directory to process',
376-
default='docs/en'
380+
default='docs/en,knowledgebase'
377381
)
378382
parser.add_argument(
379383
'-x',
@@ -387,5 +391,6 @@ def main(base_directory, sub_directory, algolia_app_id, algolia_api_key, algolia
387391
args = parser.parse_args()
388392
if args.dry_run:
389393
print('Dry running, not sending results to Algolia.')
390-
main(args.base_directory, args.sub_directory, args.algolia_app_id, args.algolia_api_key, args.algolia_index_name,
394+
sub_directories = [s.strip() for s in args.sub_directories.split(',')]
395+
main(args.base_directory, sub_directories, args.algolia_app_id, args.algolia_api_key, args.algolia_index_name,
391396
dry_run=args.dry_run)

0 commit comments

Comments (0)