@@ -133,6 +133,11 @@ def __init__(self):
133133 },
134134 }
135135
136+ # these will point to the dataframes containing the respective indices, once installed
137+ self .sm_index = None
138+ self .sm_instance_index = None
139+ self .clinical_index = None
140+
136141 # Lookup s5cmd
137142 self .s5cmdPath = shutil .which ("s5cmd" )
138143 if self .s5cmdPath is None :
@@ -355,7 +360,9 @@ def fetch_index(self, index_name) -> None:
355360 # self.index[["series_aws_url", "SeriesInstanceUID"]],
356361 # on="SeriesInstanceUID", how="left"
357362 # )
358- setattr (self .__class__ , index_name , index_table )
363+ # TODO: consider switching to class variable!
364+ # setattr(self.__class__, index_name, index_table)
365+ setattr (self , index_name , index_table )
359366 self .indices_overview [index_name ]["installed" ] = True
360367 self .indices_overview [index_name ]["file_path" ] = filepath
361368
@@ -676,6 +683,54 @@ def get_series_file_URLs(self, seriesInstanceUID, source_bucket_location="aws"):
676683
677684 return file_names
678685
def get_instance_file_URL(self, sopInstanceUID, source_bucket_location="aws"):
    """
    Get the bucket URL of the file corresponding to a given SOPInstanceUID.

    This function will only return the URL for the Slide Microscopy (SM) instances,
    which are maintained in the `sm_instance_index` table.

    Args:
        sopInstanceUID: string containing the value of DICOM SOPInstanceUID
        source_bucket_location: string containing the source bucket location, either "aws" or "gcp"

    Returns:
        string containing the bucket URL of the file corresponding to the SOPInstanceUID,
        or None if `sm_instance_index` could not be installed

    Raises:
        ValueError: if the SOPInstanceUID is not present in `sm_instance_index`, or if
            the series it belongs to has no `series_aws_url` in the main index
    """

    # sm_instance_index is required to complete this operation - install it!
    self.fetch_index("sm_instance_index")

    instance_index = self.sm_instance_index
    if instance_index is None:
        logger.error(
            "sm_instance_index could not be installed. Please install it first using fetch_index."
        )
        return None

    # A single filter both answers "is this UID known?" and selects the row,
    # so the SOPInstanceUID column is scanned only once.
    matching_rows = instance_index.loc[
        instance_index["SOPInstanceUID"] == sopInstanceUID,
        ["SeriesInstanceUID", "SOPInstanceUID", "crdc_instance_uuid"],
    ]
    if matching_rows.empty:
        raise ValueError("SOPInstanceUID not found in IDC sm_instance_index.")

    # merge with the main index to get series_aws_url
    merged = pd.merge(
        matching_rows,
        self.index,
        on="SeriesInstanceUID",
        how="left",
    )

    if source_bucket_location == "gcp":
        # replace AWS with the GCP bucket (the helper rewrites the column in place)
        self._replace_aws_with_gcp_buckets(merged, "series_aws_url")

    first_match = merged.iloc[0]
    series_url = first_match["series_aws_url"]
    # The merge is a left join: a series missing from the main index yields a
    # null URL, which would otherwise fail below with an opaque TypeError.
    if pd.isna(series_url):
        raise ValueError(
            "series_aws_url not found in the main index for the series containing this SOPInstanceUID."
        )

    # instance files are named using crdc_instance_uuid; the last character of
    # the series URL is dropped first (presumably a trailing wildcard - TODO confirm)
    return series_url[:-1] + first_match["crdc_instance_uuid"] + ".dcm"
733+
679734 def get_viewer_URL (
680735 self , seriesInstanceUID = None , studyInstanceUID = None , viewer_selector = None
681736 ):
@@ -1721,8 +1776,8 @@ def download_from_selection(
17211776 # If SOPInstanceUID(s) are given, we need to join the main index with the instance-level index
17221777 sm_instance_index = None
17231778 if sopInstanceUID :
1724- if hasattr (
1725- self , " sm_instance_index"
1779+ if (
1780+ self . sm_instance_index is not None
17261781 ): # check if instance-level index is installed
17271782 download_df = self .sm_instance_index
17281783 sm_instance_index = self .sm_instance_index
@@ -2138,12 +2193,12 @@ def sql_query(self, sql_query):
21382193 logger .debug ("Executing SQL query: " + sql_query )
21392194 # TODO: find a more elegant way to automate the following: https://www.perplexity.ai/search/write-python-code-that-iterate-XY9ppywbQFSRnOpgbwx_uQ
21402195 index = self .index
2141- if hasattr ( self , " sm_index" ) :
2196+ if self . sm_index is not None :
21422197 sm_index = self .sm_index
2143- if hasattr ( self , " sm_instance_index" ) :
2198+ if self . sm_instance_index is not None :
21442199 sm_instance_index = self .sm_instance_index
2145- if hasattr ( self , " clinical_index" ) :
2200+ if self . clinical_index is not None :
21462201 clinical_index = self .clinical_index
2147- if hasattr ( self , " prior_versions_index" ) :
2202+ if self . prior_versions_index is not None :
21482203 prior_versions_index = self .prior_versions_index
21492204 return duckdb .query (sql_query ).to_df ()
0 commit comments