-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsplit_dump.py
More file actions
62 lines (42 loc) · 1.49 KB
/
split_dump.py
File metadata and controls
62 lines (42 loc) · 1.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env python3
import os
import xml.etree.ElementTree as ET
from sys import argv, stderr
from collections.abc import Generator
from typing import TypeVar
# Gemini wants 20 MB at most; 1 MB is subtracted, presumably as headroom.
# NOTE(review): nothing in the visible code reads these two constants --
# confirm whether the size limit is meant to be enforced somewhere.
MAX_FILE_SIZE_MB = 20 - 1
# The same limit expressed in bytes (1 MB counted as 1024 * 1024 bytes).
MAX_FILE_SIZE = MAX_FILE_SIZE_MB * 1024 * 1024
# Element type of the list handled by chunks().
T = TypeVar("T")


def chunks(lst: list[T], n: int) -> Generator[list[T], None, None]:
    """Yield consecutive slices of *lst*, each holding at most *n* items.

    Slices are produced in list order, starting at offsets 0, n, 2n, ...
    For a positive *n* they cover the whole list and only the final slice
    can be shorter than *n*. An empty list yields nothing.
    """
    yield from (lst[start : start + n] for start in range(0, len(lst), n))
def split_dump(
    input_file: str, output_dir: str, *, pages_per_file: int = 1000
) -> None:
    """Split an XML dump into several smaller XML files.

    The input is parsed in full, then the root's ``<page>`` children are
    distributed, in their original order, over files named
    ``split_000.xml``, ``split_001.xml``, ... inside *output_dir*. Every
    output file gets a root element with the same tag and attributes as the
    input root, the input's ``<siteinfo>`` element, and up to
    *pages_per_file* pages.

    Args:
        input_file: Path of the XML dump to read.
        output_dir: Directory to write into; created if it does not exist.
        pages_per_file: Maximum number of ``<page>`` elements per output file.

    Raises:
        ValueError: If *pages_per_file* is less than 1, or if the root has
            no ``<siteinfo>`` child or no ``<page>`` children.

    NOTE(review): splitting is by page count only; the size of the written
    files is never compared against MAX_FILE_SIZE.
    """
    # A non-positive size would either fail obscurely inside range() or
    # silently produce no output files, so reject it up front.
    if pages_per_file < 1:
        raise ValueError("pages_per_file must be at least 1")

    root = ET.parse(input_file).getroot()

    # Namespaced tags look like "{uri}name". Registering that URI as the
    # default namespace lets the writer emit plain tag names rather than
    # generated "ns0:" prefixes. A root without a namespace has no URI to
    # register, so skip it instead of registering the tag name itself.
    if root.tag.startswith("{"):
        ET.register_namespace("", root.tag[1:].partition("}")[0])

    # "{*}" matches the child regardless of which namespace it is in.
    siteinfo = root.find("{*}siteinfo")
    if siteinfo is None:
        raise ValueError("No <siteinfo> element found")
    pages = root.findall("{*}page")
    if not pages:
        raise ValueError("No <page> elements found")

    os.makedirs(output_dir, exist_ok=True)

    for index, chunk in enumerate(chunks(pages, pages_per_file)):
        out_root = ET.Element(root.tag, attrib=root.attrib)
        # The same <siteinfo> element is shared by every output tree.
        out_root.append(siteinfo)
        out_root.extend(chunk)
        path = os.path.join(output_dir, f"split_{index:03}.xml")
        ET.ElementTree(out_root).write(path, encoding="utf-8")
def main():
    """Run the splitter from the command line and return an exit status.

    Expects one or two command-line arguments: the dump to split and an
    optional output directory, which defaults to "split". With any other
    argument count a usage line is printed to stderr and 1 is returned;
    otherwise the dump is split and 0 is returned.
    """
    arg_count = len(argv)
    if arg_count != 2 and arg_count != 3:
        print("Usage", argv[0], "<dump>", "[output_dir]", file=stderr)
        return 1

    output_dir = "split"
    if arg_count >= 3:
        output_dir = argv[2]

    split_dump(argv[1], output_dir)
    return 0
if __name__ == "__main__":
    # Raise SystemExit directly instead of calling the bare exit() helper:
    # that helper is installed by the site module for interactive use and
    # is not available when the interpreter runs without site (python -S).
    raise SystemExit(main())