@@ -630,27 +630,37 @@ def get_series_file_URLs(self, seriesInstanceUID, source_bucket_location="aws"):
630630 Returns:
631631 list of strings containing the AWS S3 URLs of the files corresponding to the SeriesInstanceUID
632632 """
633- # Query to get the S3 URL
634- s3url_query = f"""
635- SELECT
636- CONCAT('s3://',aws_bucket,'/',crdc_series_uuid,'/*') as series_aws_url
637- FROM
638- index
639- WHERE
640- SeriesInstanceUID='{ seriesInstanceUID } '
641- """
642- s3url_query_df = self .sql_query (s3url_query )
633+ if seriesInstanceUID not in self .index ["SeriesInstanceUID" ].values :
634+ raise ValueError ("SeriesInstanceUID not found in IDC index." )
635+
636+ selected_series_df = self .index [
637+ self .index ["SeriesInstanceUID" ] == seriesInstanceUID
638+ ].copy ()
639+ selected_series_df ["series_aws_url" ] = (
640+ "s3://"
641+ + selected_series_df ["aws_bucket" ]
642+ + "/"
643+ + selected_series_df ["crdc_series_uuid" ]
644+ + "/"
645+ )
643646
647+ endpoint = aws_endpoint_url
644648 if source_bucket_location == "gcp" :
645- self ._replace_aws_with_gcp_buckets (s3url_query_df , "series_aws_url" )
646- s3_url = s3url_query_df . series_aws_url [ 0 ]
649+ self ._replace_aws_with_gcp_buckets (selected_series_df , "series_aws_url" )
650+ endpoint = gcp_endpoint_url
647651
648- # Remove the last character from the S3 URL
649- s3_url = s3_url [:- 1 ]
652+ s3_url = selected_series_df ["series_aws_url" ].values [0 ]
650653
651654 # Run the s5cmd ls command and capture its output
652655 result = subprocess .run (
653- [self .s5cmdPath , "--no-sign-request" , "ls" , s3_url ],
656+ [
657+ self .s5cmdPath ,
658+ "--endpoint-url" ,
659+ endpoint ,
660+ "--no-sign-request" ,
661+ "ls" ,
662+ s3_url ,
663+ ],
654664 stdout = subprocess .PIPE ,
655665 check = False ,
656666 )
@@ -974,7 +984,6 @@ def _validate_update_manifest_and_get_download_size(
974984 index_temp.index_crdc_series_uuid = manifest_temp.manifest_crdc_series_uuid
975985 """
976986 merged_df = duckdb .sql (missing_series_sql ).df ()
977- print (merged_df )
978987 if not all (merged_df ["crdc_series_uuid_match" ]):
979988 missing_manifest_cp_cmds = merged_df .loc [
980989 ~ merged_df ["crdc_series_uuid_match" ], "manifest_cp_cmd"
@@ -1592,18 +1601,8 @@ def citations_from_manifest(
15921601 index_copy ["crdc_series_uuid" ] = index_copy ["series_aws_url" ].str .extract (
15931602 uuid_pattern , expand = False
15941603 )
1595- query = """
1596- SELECT
1597- SeriesInstanceUID
1598- FROM
1599- index_copy
1600- JOIN
1601- manifest_df
1602- ON
1603- index_copy.crdc_series_uuid = manifest_df.crdc_series_uuid
1604- """
16051604
1606- result_df = self . sql_query ( query )
1605+ result_df = pd . merge ( manifest_df , index_copy , on = "crdc_series_uuid" , how = "left" )
16071606
16081607 return self .citations_from_selection (
16091608 seriesInstanceUID = result_df ["SeriesInstanceUID" ].tolist (),
@@ -1722,11 +1721,13 @@ def download_from_selection(
17221721 downloadDir = self ._check_create_directory (downloadDir )
17231722
17241723 # If SOPInstanceUID(s) are given, we need to join the main index with the instance-level index
1724+ sm_instance_index = None
17251725 if sopInstanceUID :
17261726 if hasattr (
17271727 self , "sm_instance_index"
17281728 ): # check if instance-level index is installed
17291729 download_df = self .sm_instance_index
1730+ sm_instance_index = self .sm_instance_index
17301731 else :
17311732 logger .error (
17321733 "Instance-level access not possible because instance-level index not installed."
@@ -1844,7 +1845,8 @@ def download_from_selection(
18441845 JOIN
18451846 index using (seriesInstanceUID)
18461847 """
1847- result_df = self .sql_query (sql )
1848+ index = self .index
1849+ result_df = duckdb .query (sql ).df ()
18481850 # Download the files and make temporary file to store the list of files to download
18491851
18501852 with tempfile .NamedTemporaryFile (mode = "w" , delete = False ) as manifest_file :
@@ -2135,14 +2137,15 @@ def sql_query(self, sql_query):
21352137 duckdb.Error: any exception that duckdb.query() raises
21362138 """
21372139
2138- index = self .index
2139-
21402140 logger .debug ("Executing SQL query: " + sql_query )
21412141 # TODO: find a more elegant way to automate the following: https://www.perplexity.ai/search/write-python-code-that-iterate-XY9ppywbQFSRnOpgbwx_uQ
2142+ index = self .index
21422143 if hasattr (self , "sm_index" ):
21432144 sm_index = self .sm_index
21442145 if hasattr (self , "sm_instance_index" ):
21452146 sm_instance_index = self .sm_instance_index
21462147 if hasattr (self , "clinical_index" ):
21472148 clinical_index = self .clinical_index
2149+ if hasattr (self , "prior_versions_index" ):
2150+ prior_versions_index = self .prior_versions_index
21482151 return duckdb .query (sql_query ).to_df ()
0 commit comments