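"""
Index the ClickHouse docs for Algolia search.

Walks a directory of Markdown/MDX files, splits each page into one record per
heading (h1-h4), strips fenced code blocks, inlines snippet imports, chunks
oversized records, and pushes the results to an Algolia index in batches.
"""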
import argparse
import json
import os
import re
import sys

import yaml
from slugify import slugify
from algoliasearch.search.client import SearchClientSync

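# DOCS_PREFIX is prepended to each page slug to build record URLs; CODE_PATTERN
# matches fenced ``` blocks (across newlines via DOTALL) so code can be stripped
# before indexing.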
DOCS_PREFIX = 'https://clickhouse.com/docs'
CODE_PATTERN = re.compile(r"```.*?```", re.DOTALL)


def parse_metadata_and_content(md_file_path):
    """Parse the YAML front matter and Markdown content from a file."""
    with open(md_file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Extract the front matter block, if present
    metadata_match = re.match(r'^---\n(.*?)\n---\n', content, re.DOTALL)
    metadata = {}
    if metadata_match:
        # safe_load returns None for an empty block, so fall back to {}
        metadata = yaml.safe_load(metadata_match.group(1)) or {}
        content = content[metadata_match.end():]  # Remove metadata from content
    metadata['file_path'] = md_file_path
    return metadata, content

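# The front matter keys read later (slug, title, description, keywords) typically
# look like the following hypothetical example:
#
#   ---
#   slug: /operations/settings
#   title: 'Settings'
#   description: 'Overview of server settings.'
#   keywords: ['settings', 'configuration']
#   ---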
def remove_code_blocks(content):
    """
    Remove all code blocks (``` ... ```) from the Markdown content.
    """
    return CODE_PATTERN.sub('', content)


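# Algolia rejects records above a per-plan size limit; split_large_document below
# assumes a 10 kB limit (as noted in its docstring) and keeps a 10% buffer to allow
# for the objectID and JSON escaping added after the size is measured.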
def split_large_document(doc, max_size=10000):
    """
    Split a document into smaller chunks if its serialized size exceeds
    max_size bytes - 10000 is the max record size assumed for Algolia.
    Appends a chunk number to the objectID when splitting is necessary.
    """
    max_size = max_size * 0.9  # keep a buffer below the hard limit
    content = doc['content']
    size = len(json.dumps(doc).encode('utf-8'))
    if size <= max_size:
        doc['objectID'] = slugify(doc['url'], lowercase=True, separator='-')
        yield doc
    else:
        # Split content into smaller chunks
        parts = []
        current_chunk = []
        # Measure the size of the document without its content
        del doc['content']
        initial_size = len(json.dumps(doc).encode('utf-8'))
        current_size = initial_size

        for line in content.splitlines(keepends=True):
            line_size = len(line.encode('utf-8'))
            if current_size + line_size > max_size:
                parts.append(''.join(current_chunk))
                current_chunk = []
                current_size = initial_size
            current_chunk.append(line)
            current_size += line_size

        if current_chunk:
            parts.append(''.join(current_chunk))

        # Yield each part as a separate document with a numbered objectID
        for i, part in enumerate(parts, start=1):
            chunked_doc = doc.copy()
            chunked_doc['content'] = part
            chunked_doc['objectID'] = f"{slugify(doc['url'], lowercase=True, separator='-')}-{i}"
            yield chunked_doc


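# inject_snippets resolves Docusaurus-style MDX snippet imports so the indexed
# text matches what readers see. A hypothetical example of the pattern it handles:
#
#   import SelfManaged from '@site/docs/en/_snippets/_self_managed_only.md';
#   ...
#   <SelfManaged />
#
# The import statement is removed and each <Name /> tag is replaced with the
# contents of the referenced snippet file.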
# TODO: this is currently language specific
def inject_snippets(directory, content):
    snippet_pattern = re.compile(
        r"import\s+(\w+)\s+from\s+['\"]@site/docs/en/(.*?)['\"];",
        re.DOTALL
    )
    matches = snippet_pattern.findall(content)
    snippet_map = {}

    for snippet_name, snippet_rel_path in matches:
        full_path = os.path.join(directory, snippet_rel_path)
        if os.path.exists(full_path):
            with open(full_path, 'r', encoding='utf-8') as snippet_file:
                snippet_map[snippet_name] = snippet_file.read()
        else:
            print(f"FATAL: Unable to handle snippet: {full_path}")
            sys.exit(1)
    content = snippet_pattern.sub("", content)
    for snippet_name, snippet_content in snippet_map.items():
        tag_pattern = re.compile(fr"<{snippet_name}\s*/>")
        # Use a callable replacement so the snippet text is inserted literally and
        # backslashes in it are not interpreted as regex escape sequences.
        content = tag_pattern.sub(lambda _match: snippet_content, content)
    return content


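# Each h1/h2/h3/h4 heading below becomes its own search record with an anchor
# URL, in the spirit of Algolia DocSearch's one-record-per-section model. For a
# hypothetical '## Settings' heading on a page with slug '/operations/configuration',
# the record would carry slug '/operations/configuration#settings', the h1/h2
# hierarchy, and the prose under that heading as its 'content'.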
def parse_markdown_content(directory, metadata, content):
    """Parse the Markdown content and generate sub-documents for each ##, ###, and #### heading."""
    current_h1 = None
    current_h2 = None
    current_h3 = None
    # Prefer the slug from the front matter; otherwise derive one from the file path
    slug = metadata.get(
        'slug',
        '/' + os.path.split(os.path.split(metadata['file_path'])[0])[1]
        + metadata['file_path'].replace(directory, '').removesuffix('.md').removesuffix('.mdx')
    )

    # Inject any snippets
    content = inject_snippets(directory, content)
    # Remove any code blocks - we don't want to index them
    content = remove_code_blocks(content)
    lines = content.splitlines()
    current_subdoc = None
    for line in lines:
        if line.startswith('# '):
            current_h1 = line[2:].strip() if line[2:].strip() else metadata.get('title', '')
            doc = {
                'file_path': metadata.get('file_path', ''),
                'slug': slug,
                'url': f'{DOCS_PREFIX}{slug}',
                'h1': current_h1,
                'content': metadata.get('description', ''),
                'title': metadata.get('title', ''),
                'keywords': metadata.get('keywords', '')
            }
            yield from split_large_document(doc)
        elif line.startswith('## '):
            # TODO: capture case with no h1
            if current_subdoc:
                yield from split_large_document(current_subdoc)
            current_h2 = line[3:].strip()
            current_h3 = None  # Reset h3 when a new h2 is found
            heading_slug = slugify(current_h2, lowercase=True, separator='-')
            current_subdoc = {
                'file_path': metadata.get('file_path', ''),
                'slug': f'{slug}#{heading_slug}',
                'url': f'{DOCS_PREFIX}{slug}#{heading_slug}',
                'h1': current_h1,
                'h2': current_h2,
                'content': '',
                'keywords': metadata.get('keywords', ''),
            }
        elif line.startswith('### '):
            if current_subdoc:
                yield from split_large_document(current_subdoc)
            current_h3 = line[4:].strip()
            # Skip missing parent headings so 'None' never ends up in the anchor
            heading_slug = slugify(' '.join(h for h in (current_h2, current_h3) if h), lowercase=True, separator='-')
            current_subdoc = {
                'file_path': metadata.get('file_path', ''),
                'slug': f'{slug}#{heading_slug}',
                'url': f'{DOCS_PREFIX}{slug}#{heading_slug}',
                'h1': current_h1,
                'h2': current_h2,
                'h3': current_h3,
                'content': '',
                'keywords': metadata.get('keywords', ''),
            }
        elif line.startswith('#### '):
            if current_subdoc:
                yield from split_large_document(current_subdoc)
            current_h4 = line[5:].strip()
            heading_slug = slugify(' '.join(h for h in (current_h2, current_h3, current_h4) if h), lowercase=True, separator='-')
            current_subdoc = {
                'file_path': metadata.get('file_path', ''),
                'slug': f'{slug}#{heading_slug}',
                'url': f'{DOCS_PREFIX}{slug}#{heading_slug}',
                'h1': current_h1,
                'h2': current_h2,
                'h3': current_h3,
                'h4': current_h4,
                'content': '',
                'keywords': metadata.get('keywords', ''),
            }
        elif current_subdoc:
            current_subdoc['content'] += line + '\n'

    if current_subdoc:
        yield from split_large_document(current_subdoc)


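# Note: the _snippets and _placeholders folders are pruned from the walk below,
# presumably because their files are partials (inlined via inject_snippets)
# rather than standalone documentation pages.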
def process_markdown_directory(directory):
    """Recursively process Markdown files in a directory."""
    directory = os.path.abspath(directory)
    for root, dirs, files in os.walk(directory):
        # Prune `_snippets` and `_placeholders` in place so os.walk doesn't descend into them
        dirs[:] = [d for d in dirs if d not in ('_snippets', '_placeholders')]
        for file in files:
            if file.endswith('.md') or file.endswith('.mdx'):
                md_file_path = os.path.join(root, file)
                metadata, content = parse_metadata_and_content(md_file_path)
                yield from parse_markdown_content(directory, metadata, content)


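# Records are pushed with Algolia's batch endpoint: each request in the batch
# carries the action 'addObject' plus the record body (whose objectID was set
# upstream), so one API call writes the whole batch.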
def send_to_algolia(client, index_name, records):
    """Send records to Algolia."""
    if records:
        client.batch(index_name=index_name, batch_write_params={
            "requests": [{"action": "addObject", "body": record} for record in records],
        })
        print(f"Successfully sent {len(records)} records to Algolia.")
    else:
        print("No records to send to Algolia.")


# TODO: handle snippets - handle the markdown with mdx
def main(input_directory, algolia_app_id, algolia_api_key, algolia_index_name, batch_size=1000):
    client = SearchClientSync(algolia_app_id, algolia_api_key)

    batch = []
    total = 0
    for doc in process_markdown_directory(input_directory):
        # split_large_document already assigns a unique objectID (with a chunk
        # suffix where needed), so only fill one in if it is missing
        doc.setdefault('objectID', slugify(doc['url'], lowercase=True, separator='-'))
        batch.append(doc)

        # Send the batch to Algolia when it reaches the batch size
        if len(batch) >= batch_size:
            send_to_algolia(client, algolia_index_name, batch)
            print(f'indexed {len(batch)} records')
            total += len(batch)
            batch = []
    # Send any remaining records
    if batch:
        send_to_algolia(client, algolia_index_name, batch)
        total += len(batch)
        print(f'indexed {len(batch)} records')
    print(f'total: indexed {total} records')

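# Example invocation (script name, credentials, and index name are placeholders):
#
#   python index_docs.py -d ./docs \
#       --algolia_app_id YOUR_APP_ID \
#       --algolia_api_key YOUR_ADMIN_API_KEY \
#       --algolia_index_name docs_search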
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Index search pages.')
    parser.add_argument(
        '-d',
        '--input_directory',
        required=True,
        help='Path to root directory of docs'
    )
    parser.add_argument('--algolia_app_id', required=True, help='Algolia Application ID')
    parser.add_argument('--algolia_api_key', required=True, help='Algolia Admin API Key')
    parser.add_argument('--algolia_index_name', required=True, help='Algolia Index Name')
    args = parser.parse_args()
    main(args.input_directory, args.algolia_app_id, args.algolia_api_key, args.algolia_index_name)