|
1 | 1 | """Formatters for assets retrieved from Kili API.""" |
2 | 2 |
|
| 3 | +import asyncio |
3 | 4 | import json |
4 | | -from concurrent.futures import ThreadPoolExecutor, as_completed |
5 | 5 |
|
6 | | -import requests |
| 6 | +import httpx |
7 | 7 |
|
8 | 8 | from kili.adapters.http_client import HttpClient |
9 | | -from kili.core.helpers import get_response_json, is_url, log_raise_for_status |
| 9 | +from kili.core.helpers import is_url |
10 | 10 | from kili.domain.types import ListOrTuple |
11 | 11 |
|
12 | 12 | # Batch size for parallel JSON response downloads (same as export service) |
13 | 13 | JSON_RESPONSE_BATCH_SIZE = 10 |
14 | 14 |
|
15 | 15 |
|
16 | 16 | def load_json_from_link(link: str, http_client: HttpClient) -> dict: |
17 | | - """Load json from link.""" |
| 17 | + """Load json from link (synchronous fallback for non-batch operations).""" |
18 | 18 | if link == "" or not is_url(link): |
19 | 19 | return {} |
20 | 20 |
|
21 | 21 | response = http_client.get(link, timeout=30) |
22 | | - log_raise_for_status(response) |
23 | | - return get_response_json(response) |
| 22 | + response.raise_for_status() |
| 23 | + return response.json() |
24 | 24 |
|
25 | 25 |
|
26 | | -def download_json_responses_parallel( |
27 | | - url_to_label_mapping: list[tuple[str, dict]], http_client: HttpClient |
28 | | -) -> None: |
29 | | - """Download JSON responses in parallel and assign to labels. |
| 26 | +async def _download_json_response(url: str) -> dict: |
| 27 | + """Download and parse JSON response from a URL using asyncio. |
30 | 28 |
|
31 | 29 | Args: |
32 | | - url_to_label_mapping: List of tuples (url, label_dict) to download |
33 | | - http_client: HTTP client to use for downloads |
| 30 | + url: URL to download the JSON response from |
| 31 | +
|
| 32 | + Returns: |
| 33 | + Parsed JSON response as a dictionary |
34 | 34 | """ |
35 | | - if not url_to_label_mapping: |
36 | | - return |
| 35 | + try: |
| 36 | + async with httpx.AsyncClient(verify=os.getenv("KILI__VERIFY_SSL") != "False") as client: |
| 37 | + response = await client.get(url, timeout=30.0) |
| 38 | + response.raise_for_status() |
| 39 | + return response.json() |
| 40 | + except (httpx.HTTPError, json.JSONDecodeError): |
| 41 | + # Return empty dict on error to ensure consistent response format |
| 42 | + return {} |
| 43 | + |
| 44 | + |
| 45 | +async def _download_json_responses_async(url_to_label_mapping: list[tuple[str, dict]]) -> None: |
| 46 | + """Download JSON responses in parallel using asyncio. |
37 | 47 |
|
| 48 | + Args: |
| 49 | + url_to_label_mapping: List of tuples (url, label_dict) to download |
| 50 | + """ |
38 | 51 | # Process in batches to limit concurrent connections |
39 | 52 | for i in range(0, len(url_to_label_mapping), JSON_RESPONSE_BATCH_SIZE): |
40 | 53 | batch = url_to_label_mapping[i : i + JSON_RESPONSE_BATCH_SIZE] |
41 | 54 |
|
42 | | - # Download all URLs in the batch in parallel |
43 | | - with ThreadPoolExecutor(max_workers=JSON_RESPONSE_BATCH_SIZE) as executor: |
44 | | - # Submit all download tasks |
45 | | - future_to_label = { |
46 | | - executor.submit(load_json_from_link, url, http_client): label |
47 | | - for url, label in batch |
48 | | - } |
49 | | - |
50 | | - # Collect results as they complete |
51 | | - for future in as_completed(future_to_label): |
52 | | - label = future_to_label[future] |
53 | | - try: |
54 | | - json_response = future.result() |
55 | | - label["jsonResponse"] = json_response |
56 | | - if "jsonResponseUrl" in label: |
57 | | - del label["jsonResponseUrl"] |
58 | | - except (requests.RequestException, json.JSONDecodeError, TimeoutError): |
59 | | - # Set empty dict to ensure consistent response format |
60 | | - label["jsonResponse"] = {} |
61 | | - if "jsonResponseUrl" in label: |
62 | | - del label["jsonResponseUrl"] |
| 55 | + # Download all URLs in the batch in parallel using asyncio.gather |
| 56 | + download_tasks = [_download_json_response(url) for url, _ in batch] |
| 57 | + json_responses = await asyncio.gather(*download_tasks) |
| 58 | + |
| 59 | + # Assign the downloaded responses back to their labels and remove the URL |
| 60 | + for (_, label), json_response in zip(batch, json_responses, strict=False): |
| 61 | + label["jsonResponse"] = json_response |
| 62 | + if "jsonResponseUrl" in label: |
| 63 | + del label["jsonResponseUrl"] |
| 64 | + |
| 65 | + |
| 66 | +def download_json_responses_parallel(url_to_label_mapping: list[tuple[str, dict]]) -> None: |
| 67 | + """Download JSON responses in parallel and assign to labels. |
| 68 | +
|
| 69 | + Args: |
| 70 | + url_to_label_mapping: List of tuples (url, label_dict) to download |
| 71 | + """ |
| 72 | + if not url_to_label_mapping: |
| 73 | + return |
| 74 | + |
| 75 | + # Run async downloads in a synchronous context |
| 76 | + asyncio.run(_download_json_responses_async(url_to_label_mapping)) |
63 | 77 |
|
64 | 78 |
|
65 | 79 | def _parse_label_json_response(label: dict) -> None: |
@@ -111,6 +125,6 @@ def load_asset_json_fields(asset: dict, fields: ListOrTuple[str], http_client: H |
111 | 125 | _process_label_json_response(asset["latestLabel"], url_to_label_mapping) |
112 | 126 |
|
113 | 127 | if url_to_label_mapping: |
114 | | - download_json_responses_parallel(url_to_label_mapping, http_client) |
| 128 | + download_json_responses_parallel(url_to_label_mapping) |
115 | 129 |
|
116 | 130 | return asset |
0 commit comments