Skip to content

Commit ca8ddd0

Browse files
committed
Added back logic to split some requests into multiple sequential requests, but do not parallelize them.
1 parent 460196f commit ca8ddd0

File tree

1 file changed

+143
-14
lines changed

1 file changed

+143
-14
lines changed

mp_api/client/core/client.py

Lines changed: 143 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -577,6 +577,9 @@ def _submit_requests( # noqa
577577
) -> dict:
578578
"""Handle submitting requests sequentially with pagination.
579579
580+
If criteria contains comma-separated parameters (except those that are naturally comma-separated),
581+
split them into multiple sequential requests and combine results.
582+
580583
Arguments:
581584
criteria: dictionary of criteria to filter down
582585
url: url used to make request
@@ -588,22 +591,148 @@ def _submit_requests( # noqa
588591
Returns:
589592
Dictionary containing data and metadata
590593
"""
591-
total_data = {"data": []} # type: dict
592-
593-
# Get first page to determine total number of documents
594-
initial_criteria = copy(criteria)
595-
data, total_num_docs = self._submit_request_and_process(
596-
url=url,
597-
verify=True,
598-
params=initial_criteria,
599-
use_document_model=use_document_model,
600-
timeout=timeout,
601-
)
594+
# Parameters that naturally support comma-separated values and should NOT be split
595+
no_split_params = {
596+
"elements",
597+
"exclude_elements",
598+
"possible_species",
599+
"coordination_envs",
600+
"coordination_envs_anonymous",
601+
"has_props",
602+
"gb_plane",
603+
"rotation_axis",
604+
"keywords",
605+
"substrate_orientation",
606+
"film_orientation",
607+
"synthesis_type",
608+
"operations",
609+
"condition_mixing_device",
610+
"condition_mixing_media",
611+
"condition_heating_atmosphere",
612+
"_fields",
613+
"formula",
614+
"chemsys",
615+
}
616+
617+
# Check if we need to split any comma-separated parameters
618+
split_param = None
619+
split_values = None
620+
total_num_docs = 0 # Initialize before try/else blocks
621+
622+
for key, value in criteria.items():
623+
if (
624+
isinstance(value, str)
625+
and "," in value
626+
and key not in no_split_params
627+
and not key.startswith("_")
628+
):
629+
split_param = key
630+
split_values = value.split(",")
631+
break
632+
633+
# If we found a parameter to split, try the request first and only split on error
634+
if split_param and split_values and len(split_values) > 1:
635+
try:
636+
# First, try the request with all values as-is
637+
initial_criteria = copy(criteria)
638+
data, total_num_docs = self._submit_request_and_process(
639+
url=url,
640+
verify=True,
641+
params=initial_criteria,
642+
use_document_model=use_document_model,
643+
timeout=timeout,
644+
)
602645

603-
total_data["data"].extend(data["data"])
646+
# Check if we got 0 results - some parameters are silently ignored by the API
647+
# when passed as comma-separated values, so we need to split them anyway
648+
if total_num_docs == 0 and len(split_values) > 1:
649+
# Treat this the same as a 422 error - split into batches
650+
raise MPRestError(
651+
"Got 0 results for comma-separated parameter, will try splitting"
652+
)
653+
654+
# If successful, continue with normal pagination
655+
total_data = {"data": []} # type: dict
656+
total_data["data"].extend(data["data"])
657+
658+
if "meta" in data:
659+
total_data["meta"] = data["meta"]
660+
661+
# Continue with pagination if needed (handled below)
662+
663+
except MPRestError as e:
664+
# If we get 422 or 414 error, or 0 results for comma-separated params, split into batches
665+
if "422" in str(e) or "414" in str(e) or "Got 0 results" in str(e):
666+
total_data = {"data": []} # type: dict
667+
total_num_docs = 0
668+
669+
# Batch the split values to reduce number of requests
670+
# Use batches of up to 100 values to balance URL length and request count
671+
batch_size = min(100, max(1, len(split_values) // 10))
672+
673+
# Setup progress bar for split parameter requests
674+
num_batches = ceil(len(split_values) / batch_size)
675+
pbar_message = f"Retrieving {len(split_values)} {split_param} values in {num_batches} batches"
676+
pbar = (
677+
tqdm(
678+
desc=pbar_message,
679+
total=num_batches,
680+
)
681+
if not self.mute_progress_bars
682+
else None
683+
)
684+
685+
for i in range(0, len(split_values), batch_size):
686+
batch = split_values[i : i + batch_size]
687+
split_criteria = copy(criteria)
688+
split_criteria[split_param] = ",".join(batch)
689+
690+
# Recursively call _submit_requests with the batch
691+
# This will trigger another split if the batch is still too large
692+
result = self._submit_requests(
693+
url=url,
694+
criteria=split_criteria,
695+
use_document_model=use_document_model,
696+
chunk_size=chunk_size,
697+
num_chunks=num_chunks,
698+
timeout=timeout,
699+
)
700+
701+
total_data["data"].extend(result["data"])
702+
if "meta" in result:
703+
total_data["meta"] = result["meta"]
704+
total_num_docs += result["meta"].get("total_doc", 0)
705+
706+
if pbar is not None:
707+
pbar.update(1)
708+
709+
if pbar is not None:
710+
pbar.close()
711+
712+
# Update total_doc if we have meta
713+
if "meta" in total_data:
714+
total_data["meta"]["total_doc"] = total_num_docs
715+
716+
return total_data
717+
else:
718+
# Re-raise other errors
719+
raise
720+
else:
721+
# No splitting needed - get first page
722+
total_data = {"data": []} # type: dict
723+
initial_criteria = copy(criteria)
724+
data, total_num_docs = self._submit_request_and_process(
725+
url=url,
726+
verify=True,
727+
params=initial_criteria,
728+
use_document_model=use_document_model,
729+
timeout=timeout,
730+
)
731+
732+
total_data["data"].extend(data["data"])
604733

605-
if "meta" in data:
606-
total_data["meta"] = data["meta"]
734+
if "meta" in data:
735+
total_data["meta"] = data["meta"]
607736

608737
# Get max number of response pages
609738
max_pages = (

0 commit comments

Comments
 (0)