 import json
 import os
-import os.path
+from multiprocessing import Pool
+from pathlib import Path
+from signal import SIG_IGN, SIGINT, signal
+from typing import Any, Callable, List, Tuple
 
 import requests
 from tqdm import tqdm
+from yarl import URL
 
 
-def do_clone(src_url: str, dest_dir: str):
-    if not src_url.endswith("/"):
-        src_url += "/"
+def _ignore_sigint() -> None:
+    # Workers ignore SIGINT so Ctrl+C is handled once, by the parent process.
+    signal(SIGINT, SIG_IGN)
+
+
+def _do_in_parallel(worker: Callable, data: List, desc: str) -> None:
+    cpus = os.cpu_count()
+    # A module-level initializer (not a lambda) stays picklable under the
+    # "spawn" start method used on macOS and Windows.
+    pool = Pool(cpus, initializer=_ignore_sigint)
+    try:
+        for _ in tqdm(pool.imap_unordered(worker, data), total=len(data), desc=f"{desc} ({cpus}x)"):
+            pass
+        pool.close()
+    except KeyboardInterrupt:
+        pool.terminate()
+        raise
+    finally:
+        pool.join()
 
-    if not dest_dir.endswith("/"):
-        dest_dir += "/"
 
-    def safe_open_w(file_name):
-        os.makedirs(os.path.dirname(file_name), exist_ok=True)
-        return open(file_name, "w")
+class Cloner:
 
-    def print_json(data, file_name):
-        transformed_data = json.dumps(data, indent=4, sort_keys=True)
-        transformed_data = transformed_data.replace(src_url, "/")
-        print(transformed_data, file=safe_open_w(file_name))
+    _src_url: URL
+    _dest_dir: Path
 
-    # Root
+    def __init__(self, src_url: str, dest_dir: str):
+        # Normalize: source URL without a trailing slash, dest dir with one.
+        if src_url.endswith("/"):
+            src_url = src_url[:-1]
+        if not dest_dir.endswith("/"):
+            dest_dir += "/"
 
-    url = src_url + "api/v2/"
-    endpoints = requests.get(url)
+        self._src_url = URL(src_url)
+        self._dest_dir = Path(dest_dir)
 
-    path = dest_dir + url.replace(src_url, "") + "index.json"
-    print_json(endpoints.json(), path)
+    def _crawl(self, url: URL, save: bool = True) -> Any:
+        # Fetch one URL, return its decoded JSON, and optionally mirror it.
+        try:
+            data = requests.get(url).json()
+        except json.JSONDecodeError:
+            tqdm.write(f"JSON decode failure: {url}")
+            return None
 
-    # Endpoints
+        if save:
+            # Strip the source host so the saved files hold relative URLs.
+            out_data = json.dumps(data, indent=4, sort_keys=True)
+            out_data = out_data.replace(str(self._src_url), "")
+            file = self._dest_dir.joinpath((url / "index.json").path[1:])
+            file.parent.mkdir(parents=True, exist_ok=True)
+            file.write_text(out_data)
 
-    for endpoint in tqdm(endpoints.json().values()):
-        # Zero index
-        url = endpoint + "?limit=0"
-        resource_list = requests.get(url)
-        count = str(resource_list.json()["count"])
+        return data
 
-        # Full index
-        url = endpoint + "?limit=" + count
-        resource_list = requests.get(url)
-        endpoint_path = endpoint.replace(src_url, "")
-        path = dest_dir + endpoint_path + "index.json"
-        print_json(resource_list.json(), path)
+    def _crawl_index(self) -> List[URL]:
+        # The API root lists every endpoint; return their URLs.
+        index = self._crawl(self._src_url / "api/v2")
+        return [URL(url_str) for url_str in index.values()]
 
-        # All resources
-        desc = list(filter(None, endpoint_path.split("/")))[-1]
-        for resourceSummary in tqdm(resource_list.json()["results"], desc=desc):
-            resource_url = resourceSummary["url"]
-            path = dest_dir + resource_url.replace(src_url, "") + "index.json"
+    def _crawl_resource_list(self, url: URL) -> List[URL]:
+        # Request zero results first to learn the total count, then fetch
+        # the whole unpaginated index in a single request.
+        zero_url = url.with_query({"limit": 0, "offset": 0})
+        count = self._crawl(zero_url, save=False)["count"]
+        full_url = url.with_query({"limit": count, "offset": 0})
+        resource_list = self._crawl(full_url)
+        return [URL(resource_ref["url"]) for resource_ref in resource_list["results"]]
 
-            resource = requests.get(resource_url)
-            print_json(resource.json(), path)
+    def clone_single(self, endpoint_and_id: Tuple[str, str]) -> None:
+        endpoint, res_id = endpoint_and_id  # "res_id" avoids shadowing id()
+        res_url = self._src_url / "api/v2" / endpoint / res_id
+        self._crawl(res_url)
+        # Pokemon have an extra sub-resource that the index does not list.
+        if endpoint == "pokemon":
+            self._crawl(res_url / "encounters")
 
-            if endpoint.endswith("/pokemon/"):
-                resource_url += "encounters/"
-                path = dest_dir + resource_url.replace(src_url, "") + "index.json"
-                if not os.path.isfile(path):
-                    resource = requests.get(resource_url)
-                    print_json(resource.json(), path)
+    def clone_endpoint(self, endpoint: str) -> None:
+        res_list_url = self._src_url / "api/v2" / endpoint
+        res_urls = self._crawl_resource_list(res_list_url)
+        singles = [(endpoint, url.parent.name) for url in res_urls]
+        _do_in_parallel(
+            worker=self.clone_single,
+            data=singles,
+            desc=res_list_url.name,
+        )
+
+    def clone_all(self) -> None:
+        resource_lists = self._crawl_index()
+        for res_list_url in tqdm(resource_lists, desc="clone"):
+            endpoint = res_list_url.parent.name
+            self.clone_endpoint(endpoint)
+
+
+def do_clone(src_url: str, dest_dir: str, select: List[str]) -> None:
+    cloner = Cloner(src_url, dest_dir)
+
+    if not select:
+        cloner.clone_all()
+
+    for sel in select:
+        if "/" in sel:
+            # "endpoint/id" selects a single resource.
+            cloner.clone_single(tuple(filter(None, sel.split("/")))[0:2])
+        else:
+            cloner.clone_endpoint(sel)
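For context, a minimal usage sketch of the new `do_clone` entry point (assuming the file lands as an importable module named `clone`; the module name and the source URL are illustrative, not part of this change):

```python
# Hypothetical driver for the do_clone() entry point added above.
from clone import do_clone

# Empty selection: mirror every /api/v2/ endpoint into ./data/.
do_clone("https://pokeapi.co/", "data/", select=[])

# Named selections: one whole endpoint, plus one single resource.
do_clone("https://pokeapi.co/", "data/", select=["berry", "pokemon/25"])
```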