
Commit 2abdda4

update search scripts
1 parent 16a38ca commit 2abdda4

File tree (7 files changed: +270, -11 lines):

- docs/en/integrations/data-ingestion/kafka/kafka-clickhouse-connect-sink.md
- docs/en/integrations/data-visualization/mitzu-and-clickhouse.md
- docs/en/integrations/data-visualization/omni-and-clickhouse.md
- scripts/search/README.md
- scripts/search/compute_ndcg.py
- scripts/search/index_pages.py
- scripts/search/results.csv

docs/en/integrations/data-ingestion/kafka/kafka-clickhouse-connect-sink.md

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@ slug: /en/integrations/kafka/clickhouse-kafka-connect-sink
 description: The official Kafka connector from ClickHouse.
 ---
 
-import ConnectionDetails from '@site/docs/en/\_snippets/\_gather_your_details_http.mdx';
+import ConnectionDetails from '@site/docs/en/_snippets/_gather_your_details_http.mdx';
 
 # ClickHouse Kafka Connect Sink

docs/en/integrations/data-visualization/mitzu-and-clickhouse.md

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@ keywords: [clickhouse, Mitzu, connect, integrate, ui]
 description: Mitzu is a no-code warehouse-native product analytics application.
 ---
 
-import ConnectionDetails from '@site/docs/en/\_snippets/\_gather_your_details_http.mdx';
+import ConnectionDetails from '@site/docs/en/_snippets/_gather_your_details_http.mdx';
 
 # Connecting Mitzu to ClickHouse

docs/en/integrations/data-visualization/omni-and-clickhouse.md

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@ keywords: [clickhouse, Omni, connect, integrate, ui]
 description: Omni is an enterprise platform for BI, data applications, and embedded analytics that helps you explore and share insights in real time.
 ---
 
-import ConnectionDetails from '@site/docs/en/\_snippets/\_gather_your_details_http.mdx';
+import ConnectionDetails from '@site/docs/en/_snippets/_gather_your_details_http.mdx';
 
 # Omni
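These three import-path fixes pair with the snippet handling added in scripts/search/index_pages.py below: the `inject_snippets` regex captures the path after `@site/docs/en/` verbatim, so an escaped `\_` would be looked up on disk with a literal backslash and the indexer would abort with its FATAL error, which is presumably why the escaped underscores were removed. A minimal sketch of that behaviour, reusing the regex from the new script (the `docs/en` base path is illustrative):

```python
import os
import re

# Regex copied from inject_snippets() in scripts/search/index_pages.py below.
snippet_pattern = re.compile(
    r"import\s+(\w+)\s+from\s+['\"]@site/docs/en/((.*?))['\"];",
    re.DOTALL,
)

escaped = "import ConnectionDetails from '@site/docs/en/\\_snippets/\\_gather_your_details_http.mdx';"
fixed = "import ConnectionDetails from '@site/docs/en/_snippets/_gather_your_details_http.mdx';"

for line in (escaped, fixed):
    name, path, _ = snippet_pattern.findall(line)[0]
    # The escaped variant keeps literal backslashes in the captured path, so the
    # os.path.join() lookup would point at a file that does not exist on disk.
    print(name, os.path.join("docs/en", path))
```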

scripts/search/README.md

Lines changed: 4 additions & 6 deletions

@@ -31,9 +31,7 @@ options:
 
 ## Results
 
-
-| Date | Average nDCG | Results |
-|------------|--------------|------------------------------------------------------------------------------------------------|
-| 20/01/2024 | 0.5010 | [here](https://pastila.nl/?008231f5/bc107912f8a5074d70201e27b1a66c6c#cB/yJOsZPOWi9h8xAkuTUQ==) |
-| | | |
-
+| **Date** | **Average nDCG** | **Results** | **Changes** |
+|-------------|------------------|----------------------------------------------------------------------------------------------|----------------------------------------------|
+| 20/01/2024 | 0.4700 | [View Results](https://pastila.nl/?008231f5/bc107912f8a5074d70201e27b1a66c6c#cB/yJOsZPOWi9h8xAkuTUQ==) | Baseline |
+| 21/01/2024 | 0.4783 | [View Results](https://pastila.nl/?00bb2c2f/936a9a3af62a9bdda186af5f37f55782#m7Hg0i9F1YCesMW6ot25yA==) | Index `_` character and move language to English |
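For reference, the **Average nDCG** column is the mean nDCG over the query/expected-URL pairs in scripts/search/results.csv. A minimal sketch of per-query nDCG under a binary-relevance assumption (one relevant URL per query; compute_ndcg.py may grade results differently):

```python
import math

def ndcg(ranked_urls, expected_url, k=10):
    """nDCG@k with binary relevance: a hit scores 1, everything else 0."""
    dcg = sum(
        1.0 / math.log2(rank + 1)
        for rank, url in enumerate(ranked_urls[:k], start=1)
        if url == expected_url
    )
    idcg = 1.0 / math.log2(2)  # ideal ranking puts the expected URL at rank 1
    return dcg / idcg

# Expected page returned at rank 3 -> nDCG = 1 / log2(4) = 0.5
print(ndcg(["a", "b", "expected-url"], "expected-url"))
```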

scripts/search/compute_ndcg.py

Lines changed: 2 additions & 2 deletions

@@ -4,8 +4,8 @@
 from algoliasearch.search.client import SearchClientSync
 
 # Initialize Algolia client
-ALGOLIA_APP_ID = "62VCH2MD74"
-ALGOLIA_API_KEY = "b78244d947484fe3ece7bc5472e9f2af"
+ALGOLIA_APP_ID = "5H9UG7CX5W"
+ALGOLIA_API_KEY = "4a7bf25cf3edbef29d78d5e1eecfdca5"
 ALGOLIA_INDEX_NAME = "clickhouse"
 
 client = SearchClientSync(ALGOLIA_APP_ID, ALGOLIA_API_KEY)
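Only the credentials change in this diff; the client they configure can be queried directly to inspect a ranking. A rough sketch, assuming the v4 Python client exposes `search_single_index` (the query term is illustrative):

```python
from algoliasearch.search.client import SearchClientSync

# Credentials and index name as configured above in compute_ndcg.py.
client = SearchClientSync("5H9UG7CX5W", "4a7bf25cf3edbef29d78d5e1eecfdca5")

# Assumption: v4 sync client method for querying a single index.
response = client.search_single_index(
    index_name="clickhouse",
    search_params={"query": "max_threads", "hitsPerPage": 10},
)

# Hits are model objects; to_dict() exposes the indexed record fields such as `url`.
for hit in response.hits:
    print(hit.to_dict().get("url"))
```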

scripts/search/index_pages.py

Lines changed: 250 additions & 0 deletions

@@ -0,0 +1,250 @@
import argparse
import json
import os
import re
import sys

import yaml
from slugify import slugify
from algoliasearch.search.client import SearchClient, SearchClientSync

DOCS_PREFIX = 'http://clickhouse.com/docs'
CODE_PATTERN = re.compile(r"```.*?```", re.DOTALL)


def parse_metadata_and_content(md_file_path):
    """Parse metadata and content from a Markdown file."""
    with open(md_file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Extract metadata block
    metadata_match = re.match(r'^---\n(.*?)\n---\n', content, re.DOTALL)
    metadata = {}
    if metadata_match:
        metadata = yaml.safe_load(metadata_match.group(1))
        content = content[metadata_match.end():]  # Remove metadata from content
    metadata['file_path'] = md_file_path
    return metadata, content

def remove_code_blocks(content):
    """
    Remove all code blocks (``` ... ```) from the Markdown content.
    """
    return CODE_PATTERN.sub('', content)


def split_large_document(doc, max_size=10000):
    max_size = max_size * 0.9  # buffer
    """
    Splits a document into smaller chunks if its content exceeds max_size bytes - 10000 is the max size for algolia.
    Appends a number to the objectID for each chunk if splitting is necessary.
    """
    content = doc['content']
    size = len(json.dumps(doc).encode('utf-8'))
    if size <= max_size:
        doc['objectID'] = slugify(doc['slug'], lowercase=True, separator='-')
        yield doc
    else:
        # Split content into smaller chunks
        parts = []
        current_chunk = []
        # get current size without content
        del doc['content']
        initial_size = len(json.dumps(doc).encode('utf-8'))
        current_size = initial_size

        for line in content.splitlines(keepends=True):
            line_size = len(line.encode('utf-8'))
            if current_size + line_size > max_size:
                parts.append(''.join(current_chunk))
                current_chunk = []
                current_size = initial_size
            current_chunk.append(line)
            current_size += line_size

        if current_chunk:
            parts.append(''.join(current_chunk))

        # Yield each part as a separate document
        for i, part in enumerate(parts, start=1):
            chunked_doc = doc.copy()
            chunked_doc['content'] = part
            chunked_doc['objectID'] = f"{slugify(doc['url'], lowercase=True, separator='-')}-{i}"
            yield chunked_doc


# TODO: this is currently language specific
def inject_snippets(directory, content):
    snippet_pattern = re.compile(
        r"import\s+(\w+)\s+from\s+['\"]@site/docs/en/((.*?))['\"];",
        re.DOTALL
    )
    matches = snippet_pattern.findall(content)
    snippet_map = {}

    for snippet_name, snippet_full_path, _ in matches:
        full_path = os.path.join(directory, snippet_full_path)
        if os.path.exists(full_path):
            with open(full_path, 'r', encoding='utf-8') as snippet_file:
                snippet_map[snippet_name] = snippet_file.read()
        else:
            print(f"FATAL: Unable to handle snippet: {full_path}")
            sys.exit(1)
    content = snippet_pattern.sub("", content)
    for snippet_name, snippet_content in snippet_map.items():
        tag_pattern = re.compile(fr"<{snippet_name}\s*/>")
        try:
            content = tag_pattern.sub(re.escape(snippet_content), content)
        except Exception as e:
            print(e)
    return content


def parse_markdown_content(directory, metadata, content):
    """Parse the Markdown content and generate sub-documents for each ##, ###, and #### heading."""
    current_h1 = None
    current_h2 = None
    current_h3 = None
    slug = metadata.get(
        'slug',
        '/' + os.path.split(os.path.split(metadata['file_path'])[0])[1] + metadata['file_path'].replace(directory,
                                                                                                         '').removesuffix(
            '.md').removesuffix('.mdx')
    )

    # Inject any snippets
    content = inject_snippets(directory, content)
    # Remove any code blocks - we don't wanna index
    content = re.sub(r'```.*?```', '', content, flags=re.DOTALL)
    lines = content.splitlines()
    current_subdoc = None
    for line in lines:
        if line.startswith('# '):
            current_h1 = line[2:].strip() if line[2:].strip() else metadata.get('title', '')
            doc = {
                'file_path': metadata.get('file_path', ''),
                'slug': slug,
                'url': f'{DOCS_PREFIX}{slug}',
                'h1': current_h1,
                'content': metadata.get('description', ''),
                'title': metadata.get('title', ''),
                'keywords': metadata.get('keywords', '')
            }
            yield from split_large_document(doc)
        elif line.startswith('## '):
            # TODO: capture case with no h1
            if current_subdoc:
                yield from split_large_document(current_subdoc)
            current_h2 = line[3:].strip()
            current_h3 = None  # Reset h3 when a new h2 is found
            heading_slug = slugify(current_h2, lowercase=True, separator='-')
            current_subdoc = {
                'file_path': metadata.get('file_path', ''),
                'slug': f'{slug}#{heading_slug}',
                'url': f'{DOCS_PREFIX}{slug}#{heading_slug}',
                'h1': current_h1,
                'h2': current_h2,
                'content': '',
                'keywords': metadata.get('keywords', ''),
            }
        elif line.startswith('### '):
            if current_subdoc:
                yield from split_large_document(current_subdoc)
            current_h3 = line[4:].strip()
            heading_slug = slugify(f'{current_h2} {current_h3}', lowercase=True, separator='-')
            current_subdoc = {
                'file_path': metadata.get('file_path', ''),
                'slug': f'{slug}#{heading_slug}',
                'url': f'{DOCS_PREFIX}{slug}#{heading_slug}',
                'h1': current_h1,
                'h2': current_h2,
                'h3': current_h3,
                'content': '',
                'keywords': metadata.get('keywords', ''),
            }
        elif line.startswith('#### '):
            if current_subdoc:
                yield from split_large_document(current_subdoc)
            current_h4 = line[5:].strip()
            heading_slug = slugify(f'{current_h2} {current_h3} {current_h4}', lowercase=True, separator='-')
            current_subdoc = {
                'file_path': metadata.get('file_path', ''),
                'slug': f'{slug}#{heading_slug}',
                'url': f'{DOCS_PREFIX}{slug}#{heading_slug}',
                'h1': current_h1,
                'h2': current_h2,
                'h3': current_h3,
                'h4': current_h4,
                'content': '',
                'keywords': metadata.get('keywords', ''),
            }
        elif current_subdoc:
            current_subdoc['content'] += line + '\n'

    if current_subdoc:
        yield from split_large_document(current_subdoc)


def process_markdown_directory(directory):
    """Recursively process Markdown files in a directory."""
    directory = os.path.abspath(directory)
    i = 0
    for root, dirs, files in os.walk(directory):
        # Skip `_snippets` and _placeholders subfolders
        dirs[:] = [d for d in dirs if d != '_snippets' and d != '_placeholders']
        for file in files:
            if file.endswith('.md') or file.endswith('.mdx'):
                md_file_path = os.path.join(root, file)
                metadata, content = parse_metadata_and_content(md_file_path)
                for subdoc in parse_markdown_content(directory, metadata, content):
                    yield subdoc


def send_to_algolia(client, index_name, records):
    """Send records to Algolia."""
    if records:
        client.batch(index_name=index_name, batch_write_params={
            "requests": [{"action": "addObject", "body": record} for record in records],
        })
        print(f"Successfully sent {len(records)} records to Algolia.")
    else:
        print("No records to send to Algolia.")


# TODO: handle snippets - handle the markdown with mdx
def main(input_directory, algolia_app_id, algolia_api_key, algolia_index_name, batch_size=1000):
    client = SearchClientSync(algolia_app_id, algolia_api_key)

    batch = []
    t = 0
    for doc in process_markdown_directory(input_directory):
        # Ensure each record has a unique objectID
        doc['objectID'] = slugify(doc['url'], lowercase=True, separator='-')
        batch.append(doc)

        # Send batch to Algolia when it reaches the batch size
        if len(batch) >= batch_size:
            send_to_algolia(client, algolia_index_name, batch)
            print(f'indexed {len(batch)} records')
            t += len(batch)
            batch = []
    # Send any remaining records
    if batch:
        send_to_algolia(client, algolia_index_name, batch)
        t += len(batch)
        print(f'indexed {len(batch)} records')
    print(f'total: indexed {t} records')

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Index search pages.')
    parser.add_argument(
        '-d',
        '--input_directory',
        help='Path to root directory of docs'
    )
    parser.add_argument('--algolia_app_id', required=True, help='Algolia Application ID')
    parser.add_argument('--algolia_api_key', required=True, help='Algolia Admin API Key')
    parser.add_argument('--algolia_index_name', required=True, help='Algolia Index Name')
    args = parser.parse_args()
    main(args.input_directory, args.algolia_app_id, args.algolia_api_key, args.algolia_index_name)
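The new indexer is driven entirely by the argparse options above, e.g. `python index_pages.py -d <docs dir> --algolia_app_id <id> --algolia_api_key <key> --algolia_index_name clickhouse`. A programmatic sketch of the same call (the docs path and credentials are placeholders, and importing the script as a module is assumed):

```python
# Hypothetical driver for the indexer above; values are placeholders, not from the commit.
from index_pages import main  # assumes scripts/search is on sys.path

main(
    input_directory="docs",            # root of the Markdown docs tree
    algolia_app_id="YOUR_APP_ID",      # Algolia Application ID
    algolia_api_key="YOUR_ADMIN_KEY",  # Algolia Admin API Key (write access)
    algolia_index_name="clickhouse",   # target index, matching compute_ndcg.py
)
```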

scripts/search/results.csv

Lines changed: 11 additions & 0 deletions

@@ -198,3 +198,14 @@ alias,https://clickhouse.com/docs/en/sql-reference/syntax#expression-aliases,,
 first,https://clickhouse.com/docs/en/sql-reference/aggregate-functions/reference/first_value,,
 over,https://clickhouse.com/docs/en/sql-reference/window-functions,,
 lead,https://clickhouse.com/docs/en/sql-reference/window-functions,,
+max_threads,https://clickhouse.com/docs/en/operations/settings/settings#max_threads,,
+max_insert_threads,https://clickhouse.com/docs/en/operations/settings/settings#max_insert_threads,,
+min_insert_block_size_bytes,https://clickhouse.com/docs/en/operations/settings/settings#min_insert_block_size_bytes,,
+min_insert_block_size_rows,https://clickhouse.com/docs/en/operations/settings/settings#min_insert_block_size_rows,,
+allow_experimental_parallel_reading_from_replicas,https://clickhouse.com/docs/en/operations/settings/settings#allow_experimental_parallel_reading_from_replicas,,
+join_algorithm,https://clickhouse.com/docs/en/operations/settings/settings#join_algorithm,,
+max_memory_usage,https://clickhouse.com/docs/en/operations/settings/settings#max_memory_usage,,
+max_bytes_before_external_group_by,https://clickhouse.com/docs/en/operations/settings/settings#max_bytes_before_external_group_by,,
+max_bytes_before_external_sort,https://clickhouse.com/docs/en/operations/settings/settings#max_bytes_before_external_sort,,
+result_overflow_mode,https://clickhouse.com/docs/en/operations/settings/settings#result_overflow_mode,,
+use_query_cache,https://clickhouse.com/docs/en/operations/settings/settings#use_query_cache,,
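Each added row pairs a query term with the URL expected at the top of the ranking; the two trailing columns are left empty. A small sketch of reading these pairs for evaluation (treating the extra columns as ignorable is an assumption):

```python
import csv

# Yield (query, expected_url) pairs from results.csv, skipping blank rows.
with open("results.csv", newline="", encoding="utf-8") as f:
    for row in csv.reader(f):
        if len(row) < 2 or not row[0]:
            continue
        query, expected_url = row[0], row[1]
        print(f"{query} -> {expected_url}")
```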
