Skip to content

Commit 2adb913

Browse files
committed
Make ditto clone parallel
1 parent a73d4df commit 2adb913

File tree

6 files changed

+110
-47
lines changed

6 files changed

+110
-47
lines changed

pokeapi_ditto/commands/analyze.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import os
44
import re
55
from pathlib import Path
6-
from typing import List, Dict, TypeVar
6+
from typing import Dict, List, TypeVar
77

88
from genson import SchemaBuilder
99
from tqdm import tqdm

pokeapi_ditto/commands/clone.py

Lines changed: 81 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,62 +1,100 @@
11
import json
22
import os
3-
import os.path
3+
from multiprocessing import Pool
4+
from pathlib import Path
5+
from signal import SIG_IGN, SIGINT, signal
6+
from typing import Any, Callable, List, Tuple
47

58
import requests
69
from tqdm import tqdm
10+
from yarl import URL
711

812

9-
def do_clone(src_url: str, dest_dir: str):
10-
if not src_url.endswith("/"):
11-
src_url += "/"
13+
def _ignore_sigint() -> None:
    """Pool initializer: make a worker process ignore Ctrl-C (SIGINT)."""
    signal(SIGINT, SIG_IGN)


def _do_in_parallel(worker: Callable, data: List, desc: str) -> None:
    """Apply ``worker`` to every item of ``data`` in a process pool.

    Results are discarded; the call is made for the worker's side effects.
    A tqdm progress bar labelled ``desc`` (plus the process count) tracks
    completion.

    Args:
        worker: Callable taking a single item of ``data``. It is sent to the
            worker processes, so it must be picklable.
        data: Items to process; order of execution is not guaranteed.
        desc: Label shown on the progress bar.

    Raises:
        KeyboardInterrupt: Re-raised after the pool has been torn down.
        Exception: Anything raised by ``worker`` is re-raised in the parent
            after the pool has been torn down.
    """
    # os.cpu_count() may return None when the count cannot be determined.
    cpus = os.cpu_count() or 1
    # A named module-level initializer (rather than a lambda) stays picklable
    # under the "spawn" start method. Workers ignore SIGINT so that only the
    # parent process reacts to Ctrl-C.
    pool = Pool(cpus, initializer=_ignore_sigint)
    try:
        for _ in tqdm(pool.imap_unordered(worker, data), total=len(data), desc=f"{desc} ({cpus}x)"):
            pass
    except BaseException:
        # Interrupt or worker failure: stop outstanding work immediately.
        pool.terminate()
        raise
    else:
        # Normal completion: no more tasks, let workers exit cleanly.
        pool.close()
    finally:
        # Always reap the worker processes so repeated calls do not leak them.
        pool.join()
1223

13-
if not dest_dir.endswith("/"):
14-
dest_dir += "/"
1524

16-
def safe_open_w(file_name):
17-
os.makedirs(os.path.dirname(file_name), exist_ok=True)
18-
return open(file_name, "w")
25+
class Cloner:
    """Mirror a PokeAPI JSON API to a directory tree of ``index.json`` files.

    Every crawled URL is written to ``<dest_dir>/<url path>/index.json`` with
    all occurrences of the source base URL stripped from the payload, so the
    saved documents reference each other by path only.
    """

    _src_url: URL
    _dest_dir: Path

    def __init__(self, src_url: str, dest_dir: str):
        """Store the source base URL and the destination directory.

        Args:
            src_url: Base URL of the API server; trailing slashes are ignored.
            dest_dir: Directory under which the mirrored files are written.
        """
        # The base URL is kept without a trailing slash because its string
        # form is later removed from payloads as a prefix, leaving "/api/...".
        self._src_url = URL(src_url.rstrip("/"))
        # Path() normalises trailing slashes itself; appending one by hand
        # would turn an empty dest_dir into the filesystem root.
        self._dest_dir = Path(dest_dir)

    def _crawl(self, url: URL, save: bool = True) -> Any:
        """Fetch ``url`` as JSON and optionally save it under the destination.

        Args:
            url: Absolute URL to fetch.
            save: When true, write the (rewritten) document to disk.

        Returns:
            The decoded JSON document, or ``None`` if the response body could
            not be decoded (a message is printed in that case).
        """
        try:
            data = requests.get(str(url)).json()
        except ValueError:
            # Every JSON decode error requests can raise (stdlib json,
            # simplejson, or its own JSONDecodeError) subclasses ValueError.
            tqdm.write(f"JSON decode failure: {url}")
            return None

        if save:
            out_data = json.dumps(data, indent=4, sort_keys=True)
            out_data = out_data.replace(str(self._src_url), "")
            # [1:] drops the leading "/" so the URL path is joined relative to
            # the destination directory instead of replacing it.
            file = self._dest_dir.joinpath((url / "index.json").path[1:])
            file.parent.mkdir(parents=True, exist_ok=True)
            file.write_text(out_data)

        return data

    def _crawl_index(self) -> List[URL]:
        """Crawl the API root and return the resource-list URLs it names.

        Returns an empty list when the root document could not be decoded.
        """
        index = self._crawl(self._src_url / "api/v2")
        if index is None:
            return []
        return [URL(url_str) for url_str in index.values()]

    def _crawl_resource_list(self, url: URL) -> List[URL]:
        """Crawl one resource list in full and return its resource URLs.

        A first request with ``limit=0`` (not saved) reads the total count;
        a second request with ``limit=<count>`` fetches and saves the whole
        list. Returns an empty list when either response cannot be decoded.
        """
        zero_url = url.with_query({"limit": 0, "offset": 0})
        summary = self._crawl(zero_url, save=False)
        if summary is None:
            return []

        full_url = url.with_query({"limit": summary["count"], "offset": 0})
        resource_list = self._crawl(full_url)
        if resource_list is None:
            return []

        return [URL(resource_ref["url"]) for resource_ref in resource_list["results"]]

    def clone_single(self, endpoint_and_id: Tuple[str, str]) -> None:
        """Crawl and save one resource, given as an ``(endpoint, id)`` pair.

        The pair arrives as a single argument because this method is used as
        a pool worker, which receives exactly one item per call. Resources of
        the ``pokemon`` endpoint also get their ``encounters`` sub-resource
        crawled.
        """
        endpoint, res_id = endpoint_and_id
        res_url = self._src_url / "api/v2" / endpoint / res_id
        self._crawl(res_url)
        if endpoint == "pokemon":
            self._crawl(res_url / "encounters")

    def clone_endpoint(self, endpoint: str) -> None:
        """Crawl the resource list of ``endpoint`` and all of its resources.

        The individual resources are fetched in parallel.
        """
        res_list_url = self._src_url / "api/v2" / endpoint
        res_urls = self._crawl_resource_list(res_list_url)
        # NOTE(review): `.parent.name` yields the id only if resource URLs end
        # with a trailing slash (so `.name` itself is empty) — TODO confirm.
        singles = [(endpoint, url.parent.name) for url in res_urls]
        _do_in_parallel(
            worker=self.clone_single,
            data=singles,
            desc=res_list_url.name,
        )

    def clone_all(self) -> None:
        """Crawl the API root, then clone every endpoint it lists in turn."""
        resource_lists = self._crawl_index()
        for res_list_url in tqdm(resource_lists, desc="clone"):
            # NOTE(review): same trailing-slash assumption as in
            # clone_endpoint — TODO confirm.
            endpoint = res_list_url.parent.name
            self.clone_endpoint(endpoint)
88+
89+
90+
def do_clone(src_url: str, dest_dir: str, select: List[str]) -> None:
    """Clone all of the API, or only the selected parts, into ``dest_dir``.

    Args:
        src_url: Base URL of the API server to copy from.
        dest_dir: Directory the mirrored files are written to.
        select: Selections to clone. An entry with at least two non-empty
            ``/``-separated parts (``"endpoint/id"``) clones that single
            resource; an entry with one part clones the whole endpoint.
            When empty, everything is cloned.

    Raises:
        ValueError: If a selection contains no non-empty part (e.g. ``"/"``).
    """
    cloner = Cloner(src_url, dest_dir)

    if not select:
        cloner.clone_all()
        return

    for sel in select:
        # Ignore empty segments so "pokemon/", "/pokemon" and "pokemon/1/"
        # are handled like "pokemon" and "pokemon/1" respectively.
        parts = [part for part in sel.split("/") if part]
        if not parts:
            raise ValueError(f"invalid selection: {sel!r}")
        if len(parts) >= 2:
            cloner.clone_single((parts[0], parts[1]))
        else:
            cloner.clone_endpoint(parts[0])

pokeapi_ditto/commands/models.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
from odictliteral import odict
44

5-
65
COMMON_MODELS: OrderedDict = odict[
76
f"/schema/v2/api_resource.json":{
87
"properties": {"url": {"type": "string"}},

pokeapi_ditto/main.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ def __init__(self):
1818
clone_args = subparsers.add_parser("clone")
1919
clone_args.add_argument("--src-url", type=str, default="http://localhost/")
2020
clone_args.add_argument("--dest-dir", type=str, default="./data")
21+
clone_args.add_argument("--select", nargs='+', default=[])
2122

2223
transform_args = subparsers.add_parser("transform")
2324
transform_args.add_argument("--src-dir", type=str, default="./data")

pyproject.lock

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,15 @@ version = "4.3.0"
141141
[package.dependencies]
142142
six = ">=1.0.0,<2.0.0"
143143

144+
[[package]]
145+
category = "main"
146+
description = "multidict implementation"
147+
name = "multidict"
148+
optional = false
149+
platform = "*"
150+
python-versions = ">=3.4.1"
151+
version = "4.4.1"
152+
144153
[[package]]
145154
category = "main"
146155
description = "A tidier way of coding literal OrderedDicts"
@@ -259,8 +268,21 @@ platform = "*"
259268
python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4"
260269
version = "1.23"
261270

271+
[[package]]
272+
category = "main"
273+
description = "Yet another URL library"
274+
name = "yarl"
275+
optional = false
276+
platform = "*"
277+
python-versions = ">=3.5.3"
278+
version = "1.2.6"
279+
280+
[package.dependencies]
281+
idna = ">=2.0"
282+
multidict = ">=4.0"
283+
262284
[metadata]
263-
content-hash = "ff522e2cdda2435753106f9e21b08854c6846d5a6a221f0f0a1c3647e4818f4c"
285+
content-hash = "c12dfe2c2c968ef9d9e8bd8de2297cd274b51957fd77992d54088db9a29093a2"
264286
platform = "*"
265287
python-versions = "^3.6"
266288

@@ -279,6 +301,7 @@ idna = ["156a6814fb5ac1fc6850fb002e0852d56c0c8d2531923a51032d1b70760e186e", "684
279301
isort = ["1153601da39a25b14ddc54955dbbacbb6b2d19135386699e2ad58517953b34af", "b9c40e9750f3d77e6e4d441d8b0266cf555e7cdabdcff33c4fd06366ca761ef8", "ec9ef8f4a9bc6f71eec99e1806bfa2de401650d996c59330782b89a5555c1497"]
280302
mccabe = ["ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42", "dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"]
281303
more-itertools = ["c187a73da93e7a8acc0001572aebc7e3c69daf7bf6881a2cea10650bd4420092", "c476b5d3a34e12d40130bc2f935028b5f636df8f372dc2c1c01dc19681b2039e", "fcbfeaea0be121980e15bc97b3817b5202ca73d0eae185b4550cbfce2a3ebb3d"]
304+
multidict = ["3be539df400562f0e6c2089dd0b512150abd41189723405097ef1d11fd165658", "4ec2ba081c710e8ddf12adf9867ea8969e12aa8f0d0cd8270c706538a622aecb", "5d93048a352bf1318615a9d483a17c87f2f0f8ca2be0157ff353753de73e2636", "728222ed41e57ef01b41282df7b15b80c3da7fb0e664c8898d2efb4d969ee7e8", "7bbef78bf0b9b038f97e172a1a861e6c1a109be69592ed4ac0f2e9a23ec1fbd2", "80fd036a56bcb9dd9ba63a55b884b67fa18b46b02400528cb4bd3e2cc98791fe", "833fb16b7da8437ae067675e6e0a22cf0d5935d578fadd2a8474be702ade2cae", "9cee5290ec55d17917fcb71bab853cc376c3e30d45a21d96dca106f72a46b357", "b75218d23692e4e8ade6b7a1dd2de21ef0342e83057487b69e32732e93d5f769", "c0be167d860667125d397152192c4bb084e91a07130a90aef7f927b23f73120f", "c17d317dab455354ed6f95c42df84cc34b4bd9cace121fd4ee88307a16cc2482", "d5432a55ded1d1df572960f0a4d87fe771d4238729392b8ca995e0e50021ae8f", "d75a539aae854e19c79d39aa88d235c3eeead2cc37cdcb6c3fb193e4b0ba0b78", "d8fd869eec069dd45831486cc074ad9f82051f317371f9e1e11ef83668e23bde", "dc043e177e4bbc2539203af61e9b31c5f5de468df661ed52d9fa13bc868a1ce7", "e1ab80f1aa32f97c1cbbfb47725292617a443538103a0c5d5a419eb9629c7415", "e2e1e4b34ecf7f752594836ca338457945c2c1d4f928dd10e7c2319669af6c4e", "e9bceec13f5ea83fc434daa703c22f7263a1a7d3d9d4d53330b103cc3cfa875d", "f2d6d48932154807f6dddd093f6de0ef75356d330dda9df68c5106ccea8dda48", "fe79338660331d3a4a6f5326300d0c0e0b5f47edfe32e1a0626aa7b2e2bf08c4", "febbbf93912fdbe9455ac1673284df2f4561c5f075ef72aff90b445788feda96"]
282305
odictliteral = ["88405c7fab7ff7a54c7b9fac9fd69264e526b0024b8265bc042ba3a797f0c161"]
283306
pluggy = ["6e3836e39f4d36ae72840833db137f7b7d35105079aee6ec4a62d9f80d594dd1", "95eb8364a4708392bae89035f45341871286a333f749c3141c20573d2b3876e1"]
284307
py = ["06a30435d058473046be836d3fc4f27167fd84c45b99704f2fb5509ef61f9af1", "50402e9d1c9005d759426988a492e0edaadb7f4e68bcddfea586bc7432d009c6"]
@@ -290,3 +313,4 @@ six = ["70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9", "832d
290313
toml = ["380178cde50a6a79f9d2cf6f42a62a5174febe5eea4126fe4038785f1d888d42", "a7901919d3e4f92ffba7ff40a9d697e35bbbc8a8049fe8da742f34c83606d957"]
291314
tqdm = ["18f1818ce951aeb9ea162ae1098b43f583f7d057b34d706f66939353d1208889", "df02c0650160986bac0218bb07952245fc6960d23654648b5d5526ad5a4128c9"]
292315
urllib3 = ["a68ac5e15e76e7e5dd2b8f94007233e01effe3e50e8daddf69acfd81cb686baf", "b5725a0bd4ba422ab0e66e89e030c806576753ea3ee08554382c14e685d117b5"]
316+
yarl = ["2556b779125621b311844a072e0ed367e8409a18fa12cbd68eb1258d187820f9", "4aec0769f1799a9d4496827292c02a7b1f75c0bab56ab2b60dd94ebb57cbd5ee", "55369d95afaacf2fa6b49c84d18b51f1704a6560c432a0f9a1aeb23f7b971308", "6c098b85442c8fe3303e708bbb775afd0f6b29f77612e8892627bcab4b939357", "9182cd6f93412d32e009020a44d6d170d2093646464a88aeec2aef50592f8c78", "c8cbc21bbfa1dd7d5386d48cc814fe3d35b80f60299cdde9279046f399c3b0d8", "db6f70a4b09cde813a4807843abaaa60f3b15fb4a2a06f9ae9c311472662daa1", "f17495e6fe3d377e3faac68121caef6f974fcb9e046bc075bcff40d8e5cc69a4", "f85900b9cca0c67767bb61b2b9bd53208aaa7373dae633dbe25d179b4bf38aa7"]

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "pokeapi-ditto"
3-
version = "0.4.0"
3+
version = "0.5.0"
44
description = "Ditto is a server that serves a static copy of PokeAPI's data."
55
license = "Apache-2.0"
66
authors = ["Sargun Vohra <[email protected]>"]
@@ -27,6 +27,7 @@ requests = "^2.19"
2727
genson = "^1.0"
2828
tqdm = "^4.26"
2929
odictliteral = "^1.0"
30+
yarl = "^1.2"
3031

3132
[tool.poetry.dev-dependencies]
3233
pytest = "^3.0"

0 commit comments

Comments
 (0)