Skip to content

Commit 2ce7d80

Browse files
committed
clean content more
1 parent 7e70818 commit 2ce7d80

File tree

2 files changed

+45
-14
lines changed

2 files changed

+45
-14
lines changed

scripts/search/index_pages.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@ def parse_metadata_and_content(directory, base_directory, md_file_path, log_snip
3535
if log_snippet_failure:
3636
print(f"Warning: couldn't read metadata from {md_file_path}")
3737
return {}, ''
38-
content = remove_code_blocks(content)
38+
39+
content = clean_content(content)
3940
# Inject any snippets
4041
content = inject_snippets(base_directory, content)
4142
# Pattern to capture multiple metadata blocks
@@ -58,19 +59,12 @@ def parse_metadata_and_content(directory, base_directory, md_file_path, log_snip
5859
for p in ['.md', '.mdx', '"', "'"]:
5960
slug = slug.removeprefix(p).removesuffix(p)
6061
slug = slug.removesuffix('/')
61-
62+
content = re.sub(r'^import .+?from .+?$', '', content, flags=re.MULTILINE) # remove import
63+
content = re.sub(r'<[A-Za-z0-9_-]+\s*[^>]*\/>', '', content) # report components
6264
metadata['slug'] = slug
6365
return metadata, content
6466

6567

66-
def remove_code_blocks(content):
67-
# Remove code blocks
68-
content = re.sub(r'```.*?```', '', content, flags=re.DOTALL)
69-
# Replace `\` followed by a non-whitespace character
70-
content = re.sub(r'\\(\S)', r'\\\\\1', content)
71-
return content
72-
73-
7468
def get_object_id(id):
7569
slug_id = custom_slugify(id)
7670
if not slug_id in object_ids:
@@ -125,6 +119,11 @@ def split_large_document(doc, max_size=10000):
125119
yield chunked_doc
126120

127121

122+
def clean_content(content):
    """Normalise raw markdown text before it is indexed.

    First doubles every backslash that immediately precedes a
    non-whitespace character (so the text survives later escaping),
    then strips fenced ``` code blocks entirely.

    :param content: raw markdown text
    :return: cleaned text with code fences removed
    """
    # Escape step must run before fence removal to match existing behavior.
    escaped = re.sub(r'\\(\S)', r'\\\\\1', content)
    without_fences = re.sub(r'```.*?```', '', escaped, flags=re.DOTALL)
    return without_fences
126+
128127
def inject_snippets(directory, content):
129128
snippet_pattern = re.compile(
130129
r"import\s+(\w+)\s+from\s+['\"]@site/((.*?))['\"];",
@@ -135,10 +134,10 @@ def inject_snippets(directory, content):
135134

136135
for snippet_name, snippet_full_path, _ in matches:
137136
full_path = os.path.join(directory, snippet_full_path)
138-
if full_path not in files_processed: # we dont index snippets more than once
137+
if full_path not in files_processed: # we dont index snippets more than once
139138
if os.path.exists(full_path):
140139
with open(full_path, 'r', encoding='utf-8') as snippet_file:
141-
snippet_map[snippet_name] = remove_code_blocks(snippet_file.read())
140+
snippet_map[snippet_name] = clean_content(snippet_file.read())
142141
files_processed.add(full_path)
143142
else:
144143
print(f"FATAL: Unable to handle snippet: {full_path}")
@@ -218,9 +217,10 @@ def parse_markdown_content(metadata, content):
218217
'content': metadata.get('description', ''),
219218
'keywords': metadata.get('keywords', ''),
220219
'objectID': get_object_id(heading_slug),
221-
'type': 'lvl0',
220+
'type': 'lvl1',
222221
'hierarchy': {
223-
'lvl0': metadata.get('title', '')
222+
'lvl0': metadata.get('title', ''),
223+
'lvl1': metadata.get('title', '')
224224
}
225225
}
226226
for line in lines:

scripts/search/test.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import re
2+
3+
content = """
4+
---
5+
slug: /en/architecture/horizontal-scaling
6+
sidebar_label: Scaling out
7+
sidebar_position: 10
8+
title: Scaling out
9+
---
10+
import ReplicationShardingTerminology from '@site/docs/en/_snippets/_replication-sharding-terminology.md';
11+
import ConfigFileNote from '@site/docs/en/_snippets/_config-files.md';
12+
13+
14+
## Description
15+
This example architecture is designed to provide scalability. It includes three nodes: two combined ClickHouse plus coordination (ClickHouse Keeper) servers, and a third server with only ClickHouse Keeper to finish the quorum of three. With this example, we'll create a database, table, and a distributed table that will be able to query the data on both of the nodes.
16+
17+
18+
<ScalePlanFeatureBadge feature="The fast release channel"/>
19+
20+
<ScalePlanFeatureBadge/>
21+
"""
22+
23+
24+
def clean_content(content):
    """Strip markdown noise before printing: fenced code blocks,
    MDX-style import lines, and self-closing JSX-style components.

    :param content: raw markdown/MDX text
    :return: text with the three kinds of noise removed
    """
    # (pattern, flags) pairs applied in order; order matches the
    # original implementation so behavior is unchanged.
    substitutions = (
        (r'```.*?```', re.DOTALL),                # fenced code blocks
        (r'^import .+?from .+?$', re.MULTILINE),  # MDX import lines
        (r'<[A-Za-z0-9_-]+\s*[^>]*\/>', 0),       # self-closing components
    )
    for pattern, flags in substitutions:
        content = re.sub(pattern, '', content, flags=flags)
    return content
30+
31+
print(clean_content(content))

0 commit comments

Comments
 (0)