Skip to content

Commit deb1239

Browse files
authored
Update to use newer duckdb and update numpy (#158)
* ENH: remove unnecessary sql_query and fix the file list function. The latter was missing the part that handles the aws/gcs endpoint. Also added a test for get_series_file_URLs().
* ENH: revisit queries to make sure all referenced DFs are in scope; also update the duckdb requirement to allow versions up to the latest. Re #125
* ENH: upgrade numpy to allow versions up to 2.2.4. Re #157
1 parent 326bcc7 commit deb1239

File tree

3 files changed

+45
-32
lines changed

3 files changed

+45
-32
lines changed

idc_index/index.py

Lines changed: 33 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -630,27 +630,37 @@ def get_series_file_URLs(self, seriesInstanceUID, source_bucket_location="aws"):
630630
Returns:
631631
list of strings containing the AWS S3 URLs of the files corresponding to the SeriesInstanceUID
632632
"""
633-
# Query to get the S3 URL
634-
s3url_query = f"""
635-
SELECT
636-
CONCAT('s3://',aws_bucket,'/',crdc_series_uuid,'/*') as series_aws_url
637-
FROM
638-
index
639-
WHERE
640-
SeriesInstanceUID='{seriesInstanceUID}'
641-
"""
642-
s3url_query_df = self.sql_query(s3url_query)
633+
if seriesInstanceUID not in self.index["SeriesInstanceUID"].values:
634+
raise ValueError("SeriesInstanceUID not found in IDC index.")
635+
636+
selected_series_df = self.index[
637+
self.index["SeriesInstanceUID"] == seriesInstanceUID
638+
].copy()
639+
selected_series_df["series_aws_url"] = (
640+
"s3://"
641+
+ selected_series_df["aws_bucket"]
642+
+ "/"
643+
+ selected_series_df["crdc_series_uuid"]
644+
+ "/"
645+
)
643646

647+
endpoint = aws_endpoint_url
644648
if source_bucket_location == "gcp":
645-
self._replace_aws_with_gcp_buckets(s3url_query_df, "series_aws_url")
646-
s3_url = s3url_query_df.series_aws_url[0]
649+
self._replace_aws_with_gcp_buckets(selected_series_df, "series_aws_url")
650+
endpoint = gcp_endpoint_url
647651

648-
# Remove the last character from the S3 URL
649-
s3_url = s3_url[:-1]
652+
s3_url = selected_series_df["series_aws_url"].values[0]
650653

651654
# Run the s5cmd ls command and capture its output
652655
result = subprocess.run(
653-
[self.s5cmdPath, "--no-sign-request", "ls", s3_url],
656+
[
657+
self.s5cmdPath,
658+
"--endpoint-url",
659+
endpoint,
660+
"--no-sign-request",
661+
"ls",
662+
s3_url,
663+
],
654664
stdout=subprocess.PIPE,
655665
check=False,
656666
)
@@ -974,7 +984,6 @@ def _validate_update_manifest_and_get_download_size(
974984
index_temp.index_crdc_series_uuid = manifest_temp.manifest_crdc_series_uuid
975985
"""
976986
merged_df = duckdb.sql(missing_series_sql).df()
977-
print(merged_df)
978987
if not all(merged_df["crdc_series_uuid_match"]):
979988
missing_manifest_cp_cmds = merged_df.loc[
980989
~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd"
@@ -1592,18 +1601,8 @@ def citations_from_manifest(
15921601
index_copy["crdc_series_uuid"] = index_copy["series_aws_url"].str.extract(
15931602
uuid_pattern, expand=False
15941603
)
1595-
query = """
1596-
SELECT
1597-
SeriesInstanceUID
1598-
FROM
1599-
index_copy
1600-
JOIN
1601-
manifest_df
1602-
ON
1603-
index_copy.crdc_series_uuid = manifest_df.crdc_series_uuid
1604-
"""
16051604

1606-
result_df = self.sql_query(query)
1605+
result_df = pd.merge(manifest_df, index_copy, on="crdc_series_uuid", how="left")
16071606

16081607
return self.citations_from_selection(
16091608
seriesInstanceUID=result_df["SeriesInstanceUID"].tolist(),
@@ -1722,11 +1721,13 @@ def download_from_selection(
17221721
downloadDir = self._check_create_directory(downloadDir)
17231722

17241723
# If SOPInstanceUID(s) are given, we need to join the main index with the instance-level index
1724+
sm_instance_index = None
17251725
if sopInstanceUID:
17261726
if hasattr(
17271727
self, "sm_instance_index"
17281728
): # check if instance-level index is installed
17291729
download_df = self.sm_instance_index
1730+
sm_instance_index = self.sm_instance_index
17301731
else:
17311732
logger.error(
17321733
"Instance-level access not possible because instance-level index not installed."
@@ -1844,7 +1845,8 @@ def download_from_selection(
18441845
JOIN
18451846
index using (seriesInstanceUID)
18461847
"""
1847-
result_df = self.sql_query(sql)
1848+
index = self.index
1849+
result_df = duckdb.query(sql).df()
18481850
# Download the files and make temporary file to store the list of files to download
18491851

18501852
with tempfile.NamedTemporaryFile(mode="w", delete=False) as manifest_file:
@@ -2135,14 +2137,15 @@ def sql_query(self, sql_query):
21352137
duckdb.Error: any exception that duckdb.query() raises
21362138
"""
21372139

2138-
index = self.index
2139-
21402140
logger.debug("Executing SQL query: " + sql_query)
21412141
# TODO: find a more elegant way to automate the following: https://www.perplexity.ai/search/write-python-code-that-iterate-XY9ppywbQFSRnOpgbwx_uQ
2142+
index = self.index
21422143
if hasattr(self, "sm_index"):
21432144
sm_index = self.sm_index
21442145
if hasattr(self, "sm_instance_index"):
21452146
sm_instance_index = self.sm_instance_index
21462147
if hasattr(self, "clinical_index"):
21472148
clinical_index = self.clinical_index
2149+
if hasattr(self, "prior_versions_index"):
2150+
prior_versions_index = self.prior_versions_index
21482151
return duckdb.query(sql_query).to_df()

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,10 @@ classifiers = [
3333
dynamic = ["version"]
3434
dependencies = [
3535
"click",
36-
'duckdb>=0.10.0,<1.1.0',
36+
'duckdb>=0.10.0,<=1.2.1',
3737
"idc-index-data==20.0.3",
3838
"packaging",
39-
"pandas<2.2",
39+
"pandas<=2.2.4",
4040
"platformdirs",
4141
"psutil",
4242
"pyarrow",

tests/idcindex.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -587,6 +587,16 @@ def test_clinical_index_install(self):
587587
nlst_clinical = i.get_clinical_table("nlst_clinical")
588588
assert nlst_clinical is not None
589589

590+
def test_series_files_URLs(self):
591+
c = IDCClient()
592+
seriesInstanceUID = (
593+
"1.3.6.1.4.1.14519.5.2.1.3671.4754.228015946741563785297552112143"
594+
)
595+
files_aws = c.get_series_file_URLs(seriesInstanceUID, "aws")
596+
files_gcp = c.get_series_file_URLs(seriesInstanceUID, "gcp")
597+
assert len(files_aws) > 0
598+
assert len(files_gcp) == len(files_aws)
599+
590600

591601
if __name__ == "__main__":
592602
unittest.main()

0 commit comments

Comments
 (0)