Skip to content

Commit 5d8763a

Browse files
committed
improve search scripts
1 parent 5063265 commit 5d8763a

File tree

5 files changed

+144
-66
lines changed

5 files changed

+144
-66
lines changed

docs/en/managing-data/core-concepts/partitions.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
slug: /en/partitions
33
title: Table partitions
44
description: What are table partitions in ClickHouse
5-
keywords: [partitions]
5+
keywords: [partitions, partition by]
66
---
77

88
## What are table partitions in ClickHouse?
@@ -12,6 +12,7 @@ keywords: [partitions]
1212

1313
Partitions group the [data parts](/docs/en/parts) of a table in the [MergeTree engine family](/docs/en/engines/table-engines/mergetree-family) into organized, logical units, which is a way of organizing data that is conceptually meaningful and aligned with specific criteria, such as time ranges, categories, or other key attributes. These logical units make data easier to manage, query, and optimize.
1414

15+
### Partition By
1516

1617
Partitioning can be enabled when a table is initially defined via the [PARTITION BY clause](/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key). This clause can contain a SQL expression on any columns, the results of which will define which partition a row belongs to.
1718

@@ -33,6 +34,8 @@ PARTITION BY toStartOfMonth(date);
3334

3435
You can [query this table](https://sql.clickhouse.com/?query=U0VMRUNUICogRlJPTSB1ay51a19wcmljZV9wYWlkX3NpbXBsZV9wYXJ0aXRpb25lZA&run_query=true&tab=results) in our ClickHouse SQL Playground.
3536

37+
### Structure on disk
38+
3639
Whenever a set of rows is inserted into the table, instead of creating (at [least](/docs/en/operations/settings/settings#max_insert_block_size)) one single data part containing all the inserted rows (as described [here](/docs/en/parts)), ClickHouse creates one new data part for each unique partition key value among the inserted rows:
3740

3841
<img src={require('./images/partitions.png').default} alt='INSERT PROCESSING' class='image' style={{width: '100%'}} />

scripts/search/README.md

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,5 +40,18 @@ options:
4040
## Issues
4141

4242
1. Some pages are not optimized for retrieval e.g.
43-
a. https://clickhouse.com/docs/en/sql-reference/aggregate-functions/combinators#-if will never return for `countIf`.
44-
2.
43+
a. https://clickhouse.com/docs/en/sql-reference/aggregate-functions/combinators#-if will never return for `countIf`, `sumIf`, `multiIf`
44+
2. Some pages are hidden e.g. https://clickhouse.com/docs/en/install#from-docker-image - this needs to be a separate page.
45+
3. Some pages e.g. https://clickhouse.com/docs/en/sql-reference/statements/alter need headings e.g. `Alter table`
46+
4. https://clickhouse.com/docs/en/optimize/sparse-primary-indexes needs to be optimized for primary key
47+
5. `between` - we likely need to promote this manually.
48+
6. `case when` - https://clickhouse.com/docs/en/sql-reference/functions/conditional-functions needs to be improved. Maybe keywords or a header
49+
7. `has` - https://clickhouse.com/docs/en/sql-reference/functions/array-functions#hasarr-elem tricky
50+
8. `clickhouse` - manual promotion
51+
9. `codec` - we need better content
52+
10. `shard` - need a better page
53+
11. `populate` - we need to have a subheading on the mv page
54+
12. `contains` - https://clickhouse.com/docs/en/sql-reference/functions/string-search-functions needs more descriptive terms
55+
13. `client` - maybe promote manually
56+
14. `config.xml` - manually promote
57+
15. `replica` - need more terms on https://clickhouse.com/docs/en/architecture/horizontal-scaling but we need a better page

scripts/search/compute_ndcg.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@
77

88

99
# Initialize Algolia client
10-
ALGOLIA_APP_ID = "5H9UG7CX5W"
11-
ALGOLIA_API_KEY = "4a7bf25cf3edbef29d78d5e1eecfdca5"
10+
# ALGOLIA_APP_ID = "5H9UG7CX5W"
11+
# ALGOLIA_API_KEY = "4a7bf25cf3edbef29d78d5e1eecfdca5"
1212

1313
# old search engine using crawler
14-
# ALGOLIA_APP_ID = "62VCH2MD74"
15-
# ALGOLIA_API_KEY = "b78244d947484fe3ece7bc5472e9f2af"
14+
ALGOLIA_APP_ID = "62VCH2MD74"
15+
ALGOLIA_API_KEY = "b78244d947484fe3ece7bc5472e9f2af"
1616

1717

1818
client = SearchClientSync(ALGOLIA_APP_ID, ALGOLIA_API_KEY)
@@ -40,9 +40,11 @@ def main(input_csv, detailed, k=3):
4040
rows = list(reader)
4141
results = []
4242
total_ndcg = 0
43+
4344
for row in rows:
4445
term = row[0]
45-
expected_links = [link for link in row[1:4] if link] # Skip empty cells
46+
# Remove duplicates in expected links - can happen as some docs return same url
47+
expected_links = list(dict.fromkeys([link for link in row[1:4] if link])) # Ensure uniqueness
4648

4749
# Query Algolia
4850
response = client.search(
@@ -63,17 +65,20 @@ def main(input_csv, detailed, k=3):
6365
total_ndcg += ndcg
6466
results.append({"term": term, "nDCG": ndcg})
6567

66-
# Calculate Mean nDCG
67-
mean_ndcg = total_ndcg / len(rows) if rows else 0
68+
# Sort results by descending nDCG
69+
results.sort(key=lambda x: x['nDCG'], reverse=True)
6870

6971
# Display results
70-
print(f"Mean nDCG: {mean_ndcg:.4f}")
7172
if detailed:
7273
print("\nSearch Term\t\tnDCG")
7374
print("=" * 30)
7475
for result in results:
7576
print(f"{result['term']}\t\t{result['nDCG']:.4f}")
7677

78+
# Calculate Mean nDCG
79+
mean_ndcg = total_ndcg / len(rows) if rows else 0
80+
print(f"Mean nDCG: {mean_ndcg:.4f}")
81+
7782

7883
if __name__ == "__main__":
7984
parser = argparse.ArgumentParser(description="Compute nDCG for Algolia search results.")

scripts/search/index_pages.py

Lines changed: 74 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,14 @@
66
from ruamel.yaml import YAML
77
from slugify import slugify
88
from algoliasearch.search.client import SearchClientSync
9+
import networkx as nx
910

1011
DOCS_PREFIX = 'https://clickhouse.com/docs'
1112
HEADER_PATTERN = re.compile(r"^(.*?)(?:\s*\{#(.*?)\})$")
1213
object_ids = set()
1314

15+
link_data = []
16+
1417

1518
def read_metadata(text):
1619
parts = text.split("\n")
@@ -135,13 +138,36 @@ def custom_slugify(text):
135138
# Preprocess the text to remove specific characters
136139
text = text.replace("(", "").replace(")", "") # Remove parentheses
137140
text = text.replace(",", "") # Remove commas
141+
text = text.replace("[", "").replace("]", "") # Remove [ and ]
142+
text = text.replace("\\", "") # Remove \
138143
text = text.replace("...", "-")
144+
text = text.replace(" ", '-') # Replace any whitespace character with a dash.
145+
text = re.sub(r'--{2,}', '--', text) # more than 2 -- are replaced with a --
146+
text = re.sub(r'--$', '-', text)
147+
text = text.replace('--', 'TEMPDOUBLEHYPHEN')
139148
slug = slugify(text, lowercase=True, separator='-', regex_pattern=r'[^a-zA-Z0-9_]+')
149+
slug = slug.replace("tempdoublehyphen", "--")
140150
if text.endswith("-"):
141151
slug += '-'
142152
return slug
143153

144154

155+
def extract_links_from_content(content):
156+
"""
157+
Extract all Markdown links from the content.
158+
"""
159+
# Markdown link pattern: [text](link)
160+
link_pattern = r'\[.*?\]\((.*?)\)'
161+
return re.findall(link_pattern, content)
162+
163+
164+
def update_page_rank(url, content):
165+
links = extract_links_from_content(content)
166+
for target in links:
167+
if target.startswith('/docs/') and not target.endswith('.md'):
168+
link_data.append((url, f'{DOCS_PREFIX}{target.replace("/docs", "")}'))
169+
170+
145171
def parse_markdown_content(directory, metadata, content):
146172
"""Parse the Markdown content and generate sub-documents for each ##, ###, and #### heading."""
147173
slug = metadata.get('slug',
@@ -162,6 +188,7 @@ def parse_markdown_content(directory, metadata, content):
162188
'keywords': metadata.get('keywords', ''),
163189
'objectID': get_object_id(heading_slug),
164190
}
191+
165192
for line in lines:
166193
if line.startswith('# '):
167194
if line[2:].strip():
@@ -176,6 +203,7 @@ def parse_markdown_content(directory, metadata, content):
176203
current_subdoc['object_id'] = custom_slugify(heading_slug)
177204
elif line.startswith('## '):
178205
if current_subdoc:
206+
update_page_rank(current_subdoc['url'], current_subdoc['content'])
179207
yield from split_large_document(current_subdoc)
180208
current_h2 = line[3:].strip()
181209
slug_match = re.match(HEADER_PATTERN, current_h2)
@@ -188,6 +216,7 @@ def parse_markdown_content(directory, metadata, content):
188216
'file_path': metadata.get('file_path', ''),
189217
'slug': f'{heading_slug}',
190218
'url': f'{DOCS_PREFIX}{heading_slug}',
219+
'title': current_h2,
191220
'h2': current_h2,
192221
'content': '',
193222
'keywords': metadata.get('keywords', ''),
@@ -196,22 +225,28 @@ def parse_markdown_content(directory, metadata, content):
196225
elif line.startswith('### '):
197226
# note we send users to the h2 or h1 even on ###
198227
if current_subdoc:
228+
update_page_rank(current_subdoc['url'], current_subdoc['content'])
199229
yield from split_large_document(current_subdoc)
200230
current_h3 = line[4:].strip()
201231
slug_match = re.match(HEADER_PATTERN, current_h3)
202232
if slug_match:
203233
current_h3 = slug_match.group(2)
234+
heading_slug = f"{slug}#{current_h3}"
235+
else:
236+
heading_slug = f"{slug}#{custom_slugify(current_h3)}"
204237
current_subdoc = {
205238
'file_path': metadata.get('file_path', ''),
206239
'slug': f'{heading_slug}',
207240
'url': f'{DOCS_PREFIX}{heading_slug}',
241+
'title': current_h3,
208242
'h3': current_h3,
209243
'content': '',
210244
'keywords': metadata.get('keywords', ''),
211245
'objectID': get_object_id(f'{heading_slug}-{current_h3}')
212246
}
213247
elif line.startswith('#### '):
214248
if current_subdoc:
249+
update_page_rank(current_subdoc['url'], current_subdoc['content'])
215250
yield from split_large_document(current_subdoc)
216251
current_h4 = line[5:].strip()
217252
slug_match = re.match(HEADER_PATTERN, current_h4)
@@ -221,6 +256,7 @@ def parse_markdown_content(directory, metadata, content):
221256
'file_path': metadata.get('file_path', ''),
222257
'slug': f'{heading_slug}',
223258
'url': f'{DOCS_PREFIX}{heading_slug}#',
259+
'title': current_h4,
224260
'h4': current_h4,
225261
'content': '',
226262
'keywords': metadata.get('keywords', ''),
@@ -230,6 +266,7 @@ def parse_markdown_content(directory, metadata, content):
230266
current_subdoc['content'] += line + '\n'
231267

232268
if current_subdoc:
269+
update_page_rank(current_subdoc['url'], current_subdoc['content'])
233270
yield from split_large_document(current_subdoc)
234271

235272

@@ -259,30 +296,50 @@ def send_to_algolia(client, index_name, records):
259296
print("No records to send to Algolia.")
260297

261298

262-
def main(root_directory, sub_directories, algolia_app_id, algolia_api_key, algolia_index_name, batch_size=1000, dry_run=False):
299+
def compute_page_rank(link_data, damping_factor=0.85, max_iter=100, tol=1e-6):
300+
"""
301+
Compute PageRank for a set of pages.
302+
303+
:param link_data: List of tuples (source, target) representing links.
304+
:param damping_factor: Damping factor for PageRank.
305+
:param max_iter: Maximum number of iterations.
306+
:param tol: Convergence tolerance.
307+
:return: Dictionary of pages and their PageRank scores.
308+
"""
309+
# Create a directed graph
310+
graph = nx.DiGraph()
311+
graph.add_edges_from(link_data)
312+
313+
# Compute PageRank
314+
page_rank = nx.pagerank(graph, alpha=damping_factor, max_iter=max_iter, tol=tol)
315+
return page_rank
316+
317+
318+
def main(root_directory, sub_directories, algolia_app_id, algolia_api_key, algolia_index_name, batch_size=1000,
319+
dry_run=False):
263320
client = SearchClientSync(algolia_app_id, algolia_api_key)
264321
batch = []
265322
t = 0
323+
docs = []
266324
for sub_directory in sub_directories:
267325
input_directory = os.path.join(root_directory, sub_directory)
268326
for doc in process_markdown_directory(root_directory, input_directory):
269-
batch.append(doc)
270-
# Send batch to Algolia when it reaches the batch size
271-
if len(batch) >= batch_size:
272-
if not dry_run:
273-
send_to_algolia(client, algolia_index_name, batch)
274-
else:
275-
for b in batch:
276-
print(json.dumps(b))
277-
print(f'{'processed' if dry_run else 'indexed'} {len(batch)} records')
278-
t += len(batch)
279-
batch = []
280-
# Send any remaining records
281-
if batch:
327+
docs.append(doc)
328+
page_rank_scores = compute_page_rank(link_data)
329+
# Add PageRank scores to the documents
330+
for doc in docs:
331+
rank = page_rank_scores.get(doc.get('url', ''), 0)
332+
doc['page_rank'] = int(rank * 10000000)
333+
for i in range(0, len(docs), batch_size):
334+
batch = docs[i:i + batch_size] # Get the current batch
335+
if not dry_run:
282336
send_to_algolia(client, algolia_index_name, batch)
283-
t += len(batch)
284-
print(f'{'processed' if dry_run else 'indexed'} {len(batch)} records')
285-
print(f'total for {sub_directory}: {'processed' if dry_run else 'indexed'} {t} records')
337+
else:
338+
for b in batch:
339+
print(json.dumps(b))
340+
print(f'{'processed' if dry_run else 'indexed'} {len(batch)} records')
341+
t += len(batch)
342+
print(f'total for {sub_directory}: {'processed' if dry_run else 'indexed'} {t} records')
286343

287344

288345
if __name__ == '__main__':
@@ -313,4 +370,3 @@ def main(root_directory, sub_directories, algolia_app_id, algolia_api_key, algol
313370
sub_directories = [p.strip() for p in args.doc_paths.split(',')]
314371
main(args.root_directory, sub_directories, args.algolia_app_id, args.algolia_api_key, args.algolia_index_name,
315372
dry_run=args.dry_run)
316-

0 commit comments

Comments
 (0)