Skip to content

Commit 5b1738b

Browse files
committed
ENH: add get_instance_file_URL() function
Re #161 this will return AWS/GCP URLs for a file given SOPInstanceUID, only for microscopy instances
1 parent f964e96 commit 5b1738b

File tree

2 files changed

+50
-0
lines changed

2 files changed

+50
-0
lines changed

idc_index/index.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -676,6 +676,48 @@ def get_series_file_URLs(self, seriesInstanceUID, source_bucket_location="aws"):
676676

677677
return file_names
678678

679+
def get_instance_file_URL(self, sopInstanceUID, source_bucket_location="aws"):
    """
    Get the bucket URL of the file corresponding to a given SOPInstanceUID.

    This function will only return the URL for the Slide Microscopy (SM) instances,
    which are maintained in the `sm_instance_index` table.

    Args:
        sopInstanceUID: string containing the value of DICOM SOPInstanceUID
        source_bucket_location: string containing the source bucket location, either "aws" or "gcp";
            the bucket replacement is applied only for "gcp", any other value
            leaves the AWS URL untouched

    Returns:
        string containing the bucket URL of the file corresponding to the SOPInstanceUID

    Raises:
        ValueError: if the SOPInstanceUID is not present in `sm_instance_index`,
            or if the main index has no `series_aws_url` for the series the
            instance belongs to
    """

    # sm_instance_index is required to complete this operation - install it!
    self.fetch_index("sm_instance_index")

    # build the row mask once and reuse it for both the existence check and the selection
    sm_index = self.sm_instance_index
    is_requested = sm_index["SOPInstanceUID"] == sopInstanceUID
    if not is_requested.any():
        raise ValueError("SOPInstanceUID not found in IDC sm_instance_index.")

    # merge with the main index to get series_aws_url
    selected_instance_df = pd.merge(
        sm_index.loc[
            is_requested,
            ["SeriesInstanceUID", "SOPInstanceUID", "crdc_instance_uuid"],
        ],
        self.index,
        on="SeriesInstanceUID",
        how="left",
    )

    if source_bucket_location == "gcp":
        # replace AWS with the GCP bucket
        self._replace_aws_with_gcp_buckets(selected_instance_df, "series_aws_url")

    first_match = selected_instance_df.iloc[0]
    series_url = first_match["series_aws_url"]

    # the left merge leaves a missing value (not a string) when the series is
    # absent from the main index; fail with a clear message instead of a TypeError
    if not isinstance(series_url, str):
        raise ValueError(
            "series_aws_url is not available in the IDC index for the series "
            "containing the requested SOPInstanceUID."
        )

    # instance files are named using crdc_instance_uuid; the last character of
    # series_aws_url is dropped before appending the file name
    # NOTE(review): presumably that character is a trailing "*" wildcard - confirm
    return series_url[:-1] + first_match["crdc_instance_uuid"] + ".dcm"
720+
679721
def get_viewer_URL(
680722
self, seriesInstanceUID=None, studyInstanceUID=None, viewer_selector=None
681723
):

tests/idcindex.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -597,6 +597,14 @@ def test_series_files_URLs(self):
597597
assert len(files_aws) > 0
598598
assert len(files_gcp) == len(files_aws)
599599

600+
def test_instance_file_URLs(self):
    """Check that one SM instance resolves to the expected file URL for both "aws" and "gcp"."""
    client = IDCClient()
    sop_instance_uid = "1.3.6.1.4.1.5962.99.1.1900325859.924065538.1719887277027.10.0"
    expected_url = "s3://idc-open-data/763fe058-7d25-4ba7-9b29-fd3d6c41dc4b/210f0529-c767-4795-9acf-bad2f4877427.dcm"

    # query the locations in the same order as before: AWS first, then GCP
    url_by_location = {
        location: client.get_instance_file_URL(sop_instance_uid, location)
        for location in ("aws", "gcp")
    }

    # both locations are expected to yield the very same URL
    assert url_by_location["aws"] == expected_url
    assert url_by_location["gcp"] == expected_url
607+
600608

601609
if __name__ == "__main__":
602610
unittest.main()

0 commit comments

Comments
 (0)