Skip to content

Commit 5d8763a

Browse files
committed
improve search scripts
1 parent 5063265 commit 5d8763a

File tree

5 files changed

+144
-66
lines changed

5 files changed

+144
-66
lines changed

docs/en/managing-data/core-concepts/partitions.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
slug: /en/partitions
33
title: Table partitions
44
description: What are table partitions in ClickHouse
5-
keywords: [partitions]
5+
keywords: [partitions, partition by]
66
---
77

88
## What are table partitions in ClickHouse?
@@ -12,6 +12,7 @@ keywords: [partitions]
1212

1313
Partitions group the [data parts](/docs/en/parts) of a table in the [MergeTree engine family](/docs/en/engines/table-engines/mergetree-family) into organized, logical units, which is a way of organizing data that is conceptually meaningful and aligned with specific criteria, such as time ranges, categories, or other key attributes. These logical units make data easier to manage, query, and optimize.
1414

15+
### Partition By
1516

1617
Partitioning can be enabled when a table is initially defined via the [PARTITION BY clause](/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key). This clause can contain a SQL expression on any columns, the results of which will define which partition a row belongs to.
1718

@@ -33,6 +34,8 @@ PARTITION BY toStartOfMonth(date);
3334

3435
You can [query this table](https://sql.clickhouse.com/?query=U0VMRUNUICogRlJPTSB1ay51a19wcmljZV9wYWlkX3NpbXBsZV9wYXJ0aXRpb25lZA&run_query=true&tab=results) in our ClickHouse SQL Playground.
3536

37+
### Structure on disk
38+
3639
Whenever a set of rows is inserted into the table, instead of creating (at [least](/docs/en/operations/settings/settings#max_insert_block_size)) one single data part containing all the inserted rows (as described [here](/docs/en/parts)), ClickHouse creates one new data part for each unique partition key value among the inserted rows:
3740

3841
<img src={require('./images/partitions.png').default} alt='INSERT PROCESSING' class='image' style={{width: '100%'}} />

scripts/search/README.md

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,5 +40,18 @@ options:
4040
## Issues
4141

4242
1. Some pages are not optimized for retrieval e.g.
43-
a. https://clickhouse.com/docs/en/sql-reference/aggregate-functions/combinators#-if will never return for `countIf`.
44-
2.
43+
a. https://clickhouse.com/docs/en/sql-reference/aggregate-functions/combinators#-if will never return for `countIf`, `sumIf`, `multiIf`
44+
2. Some pages are hidden e.g. https://clickhouse.com/docs/en/install#from-docker-image - this needs to be a separate page.
45+
3. Some pages e.g. https://clickhouse.com/docs/en/sql-reference/statements/alter need headings e.g. `Alter table`
46+
4. https://clickhouse.com/docs/en/optimize/sparse-primary-indexes needs to be optimized for primary key
47+
5. `between` - we likely need to promote this manually.
48+
6. `case when` - https://clickhouse.com/docs/en/sql-reference/functions/conditional-functions needs to be improved. Maybe keywords or a header
49+
7. `has` - https://clickhouse.com/docs/en/sql-reference/functions/array-functions#hasarr-elem tricky
50+
8. `clickhouse` - manual promotion
51+
9. `codec` - we need better content
52+
10. `shard` - need a better page
53+
11. `populate` - we need to have a subheading on the mv page
54+
12. `contains` - https://clickhouse.com/docs/en/sql-reference/functions/string-search-functions needs more descriptive terms
55+
13. `client` - maybe promote manually
56+
14. `config.xml` - manually promote
57+
15. `replica` - need more terms on https://clickhouse.com/docs/en/architecture/horizontal-scaling but we need a better page

scripts/search/compute_ndcg.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@
77

88

99
# Initialize Algolia client
10-
ALGOLIA_APP_ID = "5H9UG7CX5W"
11-
ALGOLIA_API_KEY = "4a7bf25cf3edbef29d78d5e1eecfdca5"
10+
# ALGOLIA_APP_ID = "5H9UG7CX5W"
11+
# ALGOLIA_API_KEY = "4a7bf25cf3edbef29d78d5e1eecfdca5"
1212

1313
# old search engine using crawler
14-
# ALGOLIA_APP_ID = "62VCH2MD74"
15-
# ALGOLIA_API_KEY = "b78244d947484fe3ece7bc5472e9f2af"
14+
ALGOLIA_APP_ID = "62VCH2MD74"
15+
ALGOLIA_API_KEY = "b78244d947484fe3ece7bc5472e9f2af"
1616

1717

1818
client = SearchClientSync(ALGOLIA_APP_ID, ALGOLIA_API_KEY)
@@ -40,9 +40,11 @@ def main(input_csv, detailed, k=3):
4040
rows = list(reader)
4141
results = []
4242
total_ndcg = 0
43+
4344
for row in rows:
4445
term = row[0]
45-
expected_links = [link for link in row[1:4] if link] # Skip empty cells
46+
# Remove duplicates in expected links - can happen as some docs return same url
47+
expected_links = list(dict.fromkeys([link for link in row[1:4] if link])) # Ensure uniqueness
4648

4749
# Query Algolia
4850
response = client.search(
@@ -63,17 +65,20 @@ def main(input_csv, detailed, k=3):
6365
total_ndcg += ndcg
6466
results.append({"term": term, "nDCG": ndcg})
6567

66-
# Calculate Mean nDCG
67-
mean_ndcg = total_ndcg / len(rows) if rows else 0
68+
# Sort results by descending nDCG
69+
results.sort(key=lambda x: x['nDCG'], reverse=True)
6870

6971
# Display results
70-
print(f"Mean nDCG: {mean_ndcg:.4f}")
7172
if detailed:
7273
print("\nSearch Term\t\tnDCG")
7374
print("=" * 30)
7475
for result in results:
7576
print(f"{result['term']}\t\t{result['nDCG']:.4f}")
7677

78+
# Calculate Mean nDCG
79+
mean_ndcg = total_ndcg / len(rows) if rows else 0
80+
print(f"Mean nDCG: {mean_ndcg:.4f}")
81+
7782

7883
if __name__ == "__main__":
7984
parser = argparse.ArgumentParser(description="Compute nDCG for Algolia search results.")

scripts/search/index_pages.py

Lines changed: 74 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,14 @@
66
from ruamel.yaml import YAML
77
from slugify import slugify
88
from algoliasearch.search.client import SearchClientSync
9+
import networkx as nx
910

1011
DOCS_PREFIX = 'https://clickhouse.com/docs'
1112
HEADER_PATTERN = re.compile(r"^(.*?)(?:\s*\{#(.*?)\})$")
1213
object_ids = set()
1314

15+
link_data = []
16+
1417

1518
def read_metadata(text):
1619
parts = text.split("\n")
@@ -135,13 +138,36 @@ def custom_slugify(text):
135138
# Preprocess the text to remove specific characters
136139
text = text.replace("(", "").replace(")", "") # Remove parentheses
137140
text = text.replace(",", "") # Remove commas
141+
text = text.replace("[", "").replace("]", "") # Remove [ and ]
142+
text = text.replace("\\", "") # Remove \
138143
text = text.replace("...", "-")
144+
text = text.replace(" ", '-') # Replace any whitespace character with a dash.
145+
text = re.sub(r'--{2,}', '--', text) # more than 2 -- are replaced with a --
146+
text = re.sub(r'--$', '-', text)
147+
text = text.replace('--', 'TEMPDOUBLEHYPHEN')
139148
slug = slugify(text, lowercase=True, separator='-', regex_pattern=r'[^a-zA-Z0-9_]+')
149+
slug = slug.replace("tempdoublehyphen", "--")
140150
if text.endswith("-"):
141151
slug += '-'
142152
return slug
143153

144154

155+
def extract_links_from_content(content):
156+
"""
157+
Extract all Markdown links from the content.
158+
"""
159+
# Markdown link pattern: [text](link)
160+
link_pattern = r'\[.*?\]\((.*?)\)'
161+
return re.findall(link_pattern, content)
162+
163+
164+
def update_page_rank(url, content):
165+
links = extract_links_from_content(content)
166+
for target in links:
167+
if target.startswith('/docs/') and not target.endswith('.md'):
168+
link_data.append((url, f'{DOCS_PREFIX}{target.replace("/docs", "")}'))
169+
170+
145171
def parse_markdown_content(directory, metadata, content):
146172
"""Parse the Markdown content and generate sub-documents for each ##, ###, and #### heading."""
147173
slug = metadata.get('slug',
@@ -162,6 +188,7 @@ def parse_markdown_content(directory, metadata, content):
162188
'keywords': metadata.get('keywords', ''),
163189
'objectID': get_object_id(heading_slug),
164190
}
191+
165192
for line in lines:
166193
if line.startswith('# '):
167194
if line[2:].strip():
@@ -176,6 +203,7 @@ def parse_markdown_content(directory, metadata, content):
176203
current_subdoc['object_id'] = custom_slugify(heading_slug)
177204
elif line.startswith('## '):
178205
if current_subdoc:
206+
update_page_rank(current_subdoc['url'], current_subdoc['content'])
179207
yield from split_large_document(current_subdoc)
180208
current_h2 = line[3:].strip()
181209
slug_match = re.match(HEADER_PATTERN, current_h2)
@@ -188,6 +216,7 @@ def parse_markdown_content(directory, metadata, content):
188216
'file_path': metadata.get('file_path', ''),
189217
'slug': f'{heading_slug}',
190218
'url': f'{DOCS_PREFIX}{heading_slug}',
219+
'title': current_h2,
191220
'h2': current_h2,
192221
'content': '',
193222
'keywords': metadata.get('keywords', ''),
@@ -196,22 +225,28 @@ def parse_markdown_content(directory, metadata, content):
196225
elif line.startswith('### '):
197226
# note we send users to the h2 or h1 even on ###
198227
if current_subdoc:
228+
update_page_rank(current_subdoc['url'], current_subdoc['content'])
199229
yield from split_large_document(current_subdoc)
200230
current_h3 = line[4:].strip()
201231
slug_match = re.match(HEADER_PATTERN, current_h3)
202232
if slug_match:
203233
current_h3 = slug_match.group(2)
234+
heading_slug = f"{slug}#{current_h3}"
235+
else:
236+
heading_slug = f"{slug}#{custom_slugify(current_h3)}"
204237
current_subdoc = {
205238
'file_path': metadata.get('file_path', ''),
206239
'slug': f'{heading_slug}',
207240
'url': f'{DOCS_PREFIX}{heading_slug}',
241+
'title': current_h3,
208242
'h3': current_h3,
209243
'content': '',
210244
'keywords': metadata.get('keywords', ''),
211245
'objectID': get_object_id(f'{heading_slug}-{current_h3}')
212246
}
213247
elif line.startswith('#### '):
214248
if current_subdoc:
249+
update_page_rank(current_subdoc['url'], current_subdoc['content'])
215250
yield from split_large_document(current_subdoc)
216251
current_h4 = line[5:].strip()
217252
slug_match = re.match(HEADER_PATTERN, current_h4)
@@ -221,6 +256,7 @@ def parse_markdown_content(directory, metadata, content):
221256
'file_path': metadata.get('file_path', ''),
222257
'slug': f'{heading_slug}',
223258
'url': f'{DOCS_PREFIX}{heading_slug}#',
259+
'title': current_h4,
224260
'h4': current_h4,
225261
'content': '',
226262
'keywords': metadata.get('keywords', ''),
@@ -230,6 +266,7 @@ def parse_markdown_content(directory, metadata, content):
230266
current_subdoc['content'] += line + '\n'
231267

232268
if current_subdoc:
269+
update_page_rank(current_subdoc['url'], current_subdoc['content'])
233270
yield from split_large_document(current_subdoc)
234271

235272

@@ -259,30 +296,50 @@ def send_to_algolia(client, index_name, records):
259296
print("No records to send to Algolia.")
260297

261298

262-
def main(root_directory, sub_directories, algolia_app_id, algolia_api_key, algolia_index_name, batch_size=1000, dry_run=False):
299+
def compute_page_rank(link_data, damping_factor=0.85, max_iter=100, tol=1e-6):
300+
"""
301+
Compute PageRank for a set of pages.
302+
303+
:param link_data: List of tuples (source, target) representing links.
304+
:param damping_factor: Damping factor for PageRank.
305+
:param max_iter: Maximum number of iterations.
306+
:param tol: Convergence tolerance.
307+
:return: Dictionary of pages and their PageRank scores.
308+
"""
309+
# Create a directed graph
310+
graph = nx.DiGraph()
311+
graph.add_edges_from(link_data)
312+
313+
# Compute PageRank
314+
page_rank = nx.pagerank(graph, alpha=damping_factor, max_iter=max_iter, tol=tol)
315+
return page_rank
316+
317+
318+
def main(root_directory, sub_directories, algolia_app_id, algolia_api_key, algolia_index_name, batch_size=1000,
319+
dry_run=False):
263320
client = SearchClientSync(algolia_app_id, algolia_api_key)
264321
batch = []
265322
t = 0
323+
docs = []
266324
for sub_directory in sub_directories:
267325
input_directory = os.path.join(root_directory, sub_directory)
268326
for doc in process_markdown_directory(root_directory, input_directory):
269-
batch.append(doc)
270-
# Send batch to Algolia when it reaches the batch size
271-
if len(batch) >= batch_size:
272-
if not dry_run:
273-
send_to_algolia(client, algolia_index_name, batch)
274-
else:
275-
for b in batch:
276-
print(json.dumps(b))
277-
print(f'{'processed' if dry_run else 'indexed'} {len(batch)} records')
278-
t += len(batch)
279-
batch = []
280-
# Send any remaining records
281-
if batch:
327+
docs.append(doc)
328+
page_rank_scores = compute_page_rank(link_data)
329+
# Add PageRank scores to the documents
330+
for doc in docs:
331+
rank = page_rank_scores.get(doc.get('url', ''), 0)
332+
doc['page_rank'] = int(rank * 10000000)
333+
for i in range(0, len(docs), batch_size):
334+
batch = docs[i:i + batch_size] # Get the current batch
335+
if not dry_run:
282336
send_to_algolia(client, algolia_index_name, batch)
283-
t += len(batch)
284-
print(f'{'processed' if dry_run else 'indexed'} {len(batch)} records')
285-
print(f'total for {sub_directory}: {'processed' if dry_run else 'indexed'} {t} records')
337+
else:
338+
for b in batch:
339+
print(json.dumps(b))
340+
print(f'{'processed' if dry_run else 'indexed'} {len(batch)} records')
341+
t += len(batch)
342+
print(f'total for {sub_directory}: {'processed' if dry_run else 'indexed'} {t} records')
286343

287344

288345
if __name__ == '__main__':
@@ -313,4 +370,3 @@ def main(root_directory, sub_directories, algolia_app_id, algolia_api_key, algol
313370
sub_directories = [p.strip() for p in args.doc_paths.split(',')]
314371
main(args.root_directory, sub_directories, args.algolia_app_id, args.algolia_api_key, args.algolia_index_name,
315372
dry_run=args.dry_run)
316-

0 commit comments

Comments
 (0)