Update to use newer duckdb and update numpy (#158)

fedorov · web-flow · commit deb12398516b · 2025-03-27T16:09:19.000-04:00
* ENH: remove unnecessary sql_query calls and fix the file list function

  The file list function was missing the part that handles the AWS/GCS endpoint. Also added a test for get_series_file_URLs().

* ENH: revisit queries to make sure all referenced DFs are in scope

  Also update the duckdb version requirement to allow versions up to the latest release. Re #125

* ENH: upgrade numpy to up to 2.2.4

  Re #157
diff --git a/idc_index/index.py b/idc_index/index.py
@@ -630,27 +630,37 @@ def get_series_file_URLs(self, seriesInstanceUID, source_bucket_location="aws"):
         Returns:
             list of strings containing the AWS S3 URLs of the files corresponding to the SeriesInstanceUID
         """
-        # Query to get the S3 URL
-        s3url_query = f"""
-        SELECT
-        CONCAT('s3://',aws_bucket,'/',crdc_series_uuid,'/*') as series_aws_url
-        FROM
-        index
-        WHERE
-        SeriesInstanceUID='{seriesInstanceUID}'
-        """
-        s3url_query_df = self.sql_query(s3url_query)
+        if seriesInstanceUID not in self.index["SeriesInstanceUID"].values:
+            raise ValueError("SeriesInstanceUID not found in IDC index.")
+
+        selected_series_df = self.index[
+            self.index["SeriesInstanceUID"] == seriesInstanceUID
+        ].copy()
+        selected_series_df["series_aws_url"] = (
+            "s3://"
+            + selected_series_df["aws_bucket"]
+            + "/"
+            + selected_series_df["crdc_series_uuid"]
+            + "/"
+        )
 
+        endpoint = aws_endpoint_url
         if source_bucket_location == "gcp":
-            self._replace_aws_with_gcp_buckets(s3url_query_df, "series_aws_url")
-        s3_url = s3url_query_df.series_aws_url[0]
+            self._replace_aws_with_gcp_buckets(selected_series_df, "series_aws_url")
+            endpoint = gcp_endpoint_url
 
-        # Remove the last character from the S3 URL
-        s3_url = s3_url[:-1]
+        s3_url = selected_series_df["series_aws_url"].values[0]
 
         # Run the s5cmd ls command and capture its output
         result = subprocess.run(
-            [self.s5cmdPath, "--no-sign-request", "ls", s3_url],
+            [
+                self.s5cmdPath,
+                "--endpoint-url",
+                endpoint,
+                "--no-sign-request",
+                "ls",
+                s3_url,
+            ],
             stdout=subprocess.PIPE,
             check=False,
         )
@@ -974,7 +984,6 @@ def _validate_update_manifest_and_get_download_size(
                 index_temp.index_crdc_series_uuid = manifest_temp.manifest_crdc_series_uuid
             """
             merged_df = duckdb.sql(missing_series_sql).df()
-            print(merged_df)
             if not all(merged_df["crdc_series_uuid_match"]):
                 missing_manifest_cp_cmds = merged_df.loc[
                     ~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd"
@@ -1592,18 +1601,8 @@ def citations_from_manifest(
         index_copy["crdc_series_uuid"] = index_copy["series_aws_url"].str.extract(
             uuid_pattern, expand=False
         )
-        query = """
-        SELECT
-          SeriesInstanceUID
-        FROM
-          index_copy
-        JOIN
-          manifest_df
-        ON
-          index_copy.crdc_series_uuid = manifest_df.crdc_series_uuid
-        """
 
-        result_df = self.sql_query(query)
+        result_df = pd.merge(manifest_df, index_copy, on="crdc_series_uuid", how="left")
 
         return self.citations_from_selection(
             seriesInstanceUID=result_df["SeriesInstanceUID"].tolist(),
@@ -1722,11 +1721,13 @@ def download_from_selection(
         downloadDir = self._check_create_directory(downloadDir)
 
         # If SOPInstanceUID(s) are given, we need to join the main index with the instance-level index
+        sm_instance_index = None
         if sopInstanceUID:
             if hasattr(
                 self, "sm_instance_index"
             ):  # check if instance-level index is installed
                 download_df = self.sm_instance_index
+                sm_instance_index = self.sm_instance_index
             else:
                 logger.error(
                     "Instance-level access not possible because instance-level index not installed."
@@ -1844,7 +1845,8 @@ def download_from_selection(
                 JOIN
                     index using (seriesInstanceUID)
                 """
-        result_df = self.sql_query(sql)
+        index = self.index
+        result_df = duckdb.query(sql).df()
         # Download the files and make temporary file to store the list of files to download
 
         with tempfile.NamedTemporaryFile(mode="w", delete=False) as manifest_file:
@@ -2135,14 +2137,15 @@ def sql_query(self, sql_query):
             duckdb.Error: any exception that duckdb.query() raises
         """
 
-        index = self.index
-
         logger.debug("Executing SQL query: " + sql_query)
         # TODO: find a more elegant way to automate the following:  https://www.perplexity.ai/search/write-python-code-that-iterate-XY9ppywbQFSRnOpgbwx_uQ
+        index = self.index
         if hasattr(self, "sm_index"):
             sm_index = self.sm_index
         if hasattr(self, "sm_instance_index"):
             sm_instance_index = self.sm_instance_index
         if hasattr(self, "clinical_index"):
             clinical_index = self.clinical_index
+        if hasattr(self, "prior_versions_index"):
+            prior_versions_index = self.prior_versions_index
         return duckdb.query(sql_query).to_df()
diff --git a/pyproject.toml b/pyproject.toml
@@ -33,10 +33,10 @@ classifiers = [
 dynamic = ["version"]
 dependencies = [
   "click",
-  'duckdb>=0.10.0,<1.1.0',
+  'duckdb>=0.10.0,<=1.2.1',
   "idc-index-data==20.0.3",
   "packaging",
-  "pandas<2.2",
+  "pandas<=2.2.4",
   "platformdirs",
   "psutil",
   "pyarrow",
diff --git a/tests/idcindex.py b/tests/idcindex.py
@@ -587,6 +587,16 @@ def test_clinical_index_install(self):
         nlst_clinical = i.get_clinical_table("nlst_clinical")
         assert nlst_clinical is not None
 
+    def test_series_files_URLs(self):
+        c = IDCClient()
+        seriesInstanceUID = (
+            "1.3.6.1.4.1.14519.5.2.1.3671.4754.228015946741563785297552112143"
+        )
+        files_aws = c.get_series_file_URLs(seriesInstanceUID, "aws")
+        files_gcp = c.get_series_file_URLs(seriesInstanceUID, "gcp")
+        assert len(files_aws) > 0
+        assert len(files_gcp) == len(files_aws)
+
 
 if __name__ == "__main__":
     unittest.main()