Skip to content

Commit 2ce7d80

Browse files
committed
clean content more
1 parent 7e70818 commit 2ce7d80

File tree

2 files changed

+45
-14
lines changed

2 files changed

+45
-14
lines changed

scripts/search/index_pages.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@ def parse_metadata_and_content(directory, base_directory, md_file_path, log_snip
3535
if log_snippet_failure:
3636
print(f"Warning: couldn't read metadata from {md_file_path}")
3737
return {}, ''
38-
content = remove_code_blocks(content)
38+
39+
content = clean_content(content)
3940
# Inject any snippets
4041
content = inject_snippets(base_directory, content)
4142
# Pattern to capture multiple metadata blocks
@@ -58,19 +59,12 @@ def parse_metadata_and_content(directory, base_directory, md_file_path, log_snip
5859
for p in ['.md', '.mdx', '"', "'"]:
5960
slug = slug.removeprefix(p).removesuffix(p)
6061
slug = slug.removesuffix('/')
61-
62+
content = re.sub(r'^import .+?from .+?$', '', content, flags=re.MULTILINE) # remove import
63+
content = re.sub(r'<[A-Za-z0-9_-]+\s*[^>]*\/>', '', content) # report components
6264
metadata['slug'] = slug
6365
return metadata, content
6466

6567

66-
def remove_code_blocks(content):
67-
# Remove code blocks
68-
content = re.sub(r'```.*?```', '', content, flags=re.DOTALL)
69-
# Replace `\` followed by a non-whitespace character
70-
content = re.sub(r'\\(\S)', r'\\\\\1', content)
71-
return content
72-
73-
7468
def get_object_id(id):
7569
slug_id = custom_slugify(id)
7670
if not slug_id in object_ids:
@@ -125,6 +119,11 @@ def split_large_document(doc, max_size=10000):
125119
yield chunked_doc
126120

127121

122+
def clean_content(content):
    """Normalise raw markdown text before it is indexed.

    First doubles every backslash that immediately precedes a
    non-whitespace character (so the text survives later escaping),
    then strips fenced ``` code blocks entirely.

    :param content: raw markdown text
    :return: cleaned text with code fences removed
    """
    # Escape step must run before fence removal to match existing behavior.
    escaped = re.sub(r'\\(\S)', r'\\\\\1', content)
    without_fences = re.sub(r'```.*?```', '', escaped, flags=re.DOTALL)
    return without_fences
126+
128127
def inject_snippets(directory, content):
129128
snippet_pattern = re.compile(
130129
r"import\s+(\w+)\s+from\s+['\"]@site/((.*?))['\"];",
@@ -135,10 +134,10 @@ def inject_snippets(directory, content):
135134

136135
for snippet_name, snippet_full_path, _ in matches:
137136
full_path = os.path.join(directory, snippet_full_path)
138-
if full_path not in files_processed: # we dont index snippets more than once
137+
if full_path not in files_processed: # we dont index snippets more than once
139138
if os.path.exists(full_path):
140139
with open(full_path, 'r', encoding='utf-8') as snippet_file:
141-
snippet_map[snippet_name] = remove_code_blocks(snippet_file.read())
140+
snippet_map[snippet_name] = clean_content(snippet_file.read())
142141
files_processed.add(full_path)
143142
else:
144143
print(f"FATAL: Unable to handle snippet: {full_path}")
@@ -218,9 +217,10 @@ def parse_markdown_content(metadata, content):
218217
'content': metadata.get('description', ''),
219218
'keywords': metadata.get('keywords', ''),
220219
'objectID': get_object_id(heading_slug),
221-
'type': 'lvl0',
220+
'type': 'lvl1',
222221
'hierarchy': {
223-
'lvl0': metadata.get('title', '')
222+
'lvl0': metadata.get('title', ''),
223+
'lvl1': metadata.get('title', '')
224224
}
225225
}
226226
for line in lines:

scripts/search/test.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import re
2+
3+
content = """
4+
---
5+
slug: /en/architecture/horizontal-scaling
6+
sidebar_label: Scaling out
7+
sidebar_position: 10
8+
title: Scaling out
9+
---
10+
import ReplicationShardingTerminology from '@site/docs/en/_snippets/_replication-sharding-terminology.md';
11+
import ConfigFileNote from '@site/docs/en/_snippets/_config-files.md';
12+
13+
14+
## Description
15+
This example architecture is designed to provide scalability. It includes three nodes: two combined ClickHouse plus coordination (ClickHouse Keeper) servers, and a third server with only ClickHouse Keeper to finish the quorum of three. With this example, we'll create a database, table, and a distributed table that will be able to query the data on both of the nodes.
16+
17+
18+
<ScalePlanFeatureBadge feature="The fast release channel"/>
19+
20+
<ScalePlanFeatureBadge/>
21+
"""
22+
23+
24+
def clean_content(content):
    """Strip markdown noise before printing: fenced code blocks,
    MDX-style import lines, and self-closing JSX-style components.

    :param content: raw markdown/MDX text
    :return: text with the three kinds of noise removed
    """
    # (pattern, flags) pairs applied in order; order matches the
    # original implementation so behavior is unchanged.
    substitutions = (
        (r'```.*?```', re.DOTALL),                # fenced code blocks
        (r'^import .+?from .+?$', re.MULTILINE),  # MDX import lines
        (r'<[A-Za-z0-9_-]+\s*[^>]*\/>', 0),       # self-closing components
    )
    for pattern, flags in substitutions:
        content = re.sub(pattern, '', content, flags=flags)
    return content
30+
31+
print(clean_content(content))

0 commit comments

Comments
 (0)