Skip to content

Commit 5ffbf5e

Browse files
Merge pull request #3 from histolab/integrate-TCIA-endpoints
Integrate tcia endpoints
2 parents 68752ce + 47f5b57 commit 5ffbf5e

File tree

10 files changed

+335
-42
lines changed

10 files changed

+335
-42
lines changed

README.md

Lines changed: 61 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,75 @@ The GDC API drives the GDC Data and Submission Portals and provides programmatic
1616
### Installation
1717
`pip install gdc-api-wrapper`
1818

19+
## TCGA API Reference
20+
1921
### Download single file
2022
```python
21-
from gdcapiwrapper.data import Data
23+
from gdcapiwrapper.tcga import Data
2224
Data.download(uuid="uuid-file-you-wanna-download", path="/local/path", name="filename")
2325
```
2426
NOTE: `path` and `name` are optional, by default path is your current directory and if name is
2527
not provided it will be saved with the UUID as filename.
2628

2729
### Download multiple files
2830
```python
29-
from gdcapiwrapper.data import Data
30-
Data.download_multiple(uuid_list=["UUID1", "UUID2", "UUID3"], path="/local/path")
31+
from gdcapiwrapper.tcga import Data
32+
response, filename = Data.download_multiple(uuid_list=["UUID1", "UUID2", "UUID3"], path="/local/path")
3133
```
3234
NOTE: `path` is optional, by default path is your current directory.
35+
36+
37+
## TCIA API Reference
38+
39+
### Get a list of SOPInstanceUID for a given series
40+
```python
41+
from gdcapiwrapper.tcia import Data
42+
# Example for CSV, HTML, XML
43+
response, filename = Data.sop_instance_uids(
44+
series_instance_uid="uid.series.instance",
45+
format_="CSV",
46+
path="/local/path",
47+
name="filename"
48+
)
49+
# Example for JSON
50+
response, json = Data.sop_instance_uids(series_instance_uid="uid.series.instance")
51+
```
52+
Formats allowed: `["CSV", "HTML", "JSON", "XML"]`, default: `JSON`. When `JSON` is requested the API will not save any
53+
JSON file on disk; it returns an in-memory JSON object instead.
54+
55+
NOTE: `path` and `name` are optional, by default path is your current directory and if name is
56+
not provided it will be saved with the SeriesInstanceUID as filename.
57+
58+
### Download Single DICOM image
59+
```python
60+
from gdcapiwrapper.tcia import Data
61+
response, filename = Data.download_single_image(
62+
series_instance_uid="uid.series.instance",
63+
sop_instance_uid="uid.sop.instance",
64+
path="/local/path",
65+
name="filename.dcm",
66+
)
67+
```
68+
NOTE: `path` and `name` are optional, by default path is your current directory and if name is
69+
not provided it will be saved with the SOPInstanceUID as filename.
70+
71+
### Download set of images in a zip file
72+
```python
73+
from gdcapiwrapper.tcia import Data
74+
response, filename = Data.download_series_instance_images(
75+
series_instance_uid="uid.series.instance",
76+
path="/local/path",
77+
name="filename.zip")
78+
```
79+
NOTE: `path` and `name` are optional, by default path is your current directory and if name is
80+
not provided it will be saved with the SeriesInstanceUID as filename.
81+
82+
## Changelog
83+
84+
### 0.1
85+
- TCGA Api endpoints
86+
87+
### 0.2
88+
- Bug Fix on TCGA Apis
89+
- Public interface refactoring [breaking change]
90+
- TCIA Api endpoints

src/gdcapiwrapper/__init__.py

Lines changed: 2 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,5 @@
11
# encoding: utf-8
22

3-
import os
4-
import requests
3+
"""Initialization module for gdcapiwrapper package."""
54

6-
__version__ = "0.1"
7-
GDC_API_TOKEN = os.environ.get("GCC_API_TOKEN", None)
8-
GDC_API_BASE_URL = os.environ.get("GDC_API_BASE_URL", "https://api.gdc.cancer.gov/")
9-
10-
11-
class APIBaseURLStatusError(Exception):
12-
pass
13-
14-
15-
class APITokenMissingError(Exception):
16-
pass
17-
18-
19-
request = requests.get(f"{GDC_API_BASE_URL}/status")
20-
21-
22-
if request.status_code != 200:
23-
raise APIBaseURLStatusError(
24-
f"{GDC_API_BASE_URL} status: {request.status_code}."
25-
"The resource seems to be unavailable"
26-
)
27-
28-
session = requests.Session()
29-
session.params = {"api_token": GDC_API_TOKEN, "api_base_url": GDC_API_BASE_URL}
30-
31-
from .data import Data # isort:skip # noqa
5+
__version__ = "0.2b"

src/gdcapiwrapper/enums.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# encoding: utf-8
2+
3+
from enum import Enum
4+
5+
6+
class FORMAT_TYPE(Enum):
7+
"""Enumerated values representing the various types of file format."""
8+
9+
# ---member definitions---
10+
CSV = "CSV"
11+
HTML = "HTML"
12+
JSON = "JSON"
13+
XML = "XML"
14+
15+
# ---allowed formats for TCIA apis---
16+
TCIA_ALLOWED_FORMATS = frozenset((CSV, HTML, JSON, XML))

src/gdcapiwrapper/exceptions.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# encoding: utf-8
2+
3+
4+
class APIBaseURLStatusError(Exception):
5+
pass
6+
7+
8+
class APITokenMissingError(Exception):
9+
pass

src/gdcapiwrapper/tcga/__init__.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# encoding: utf-8
2+
3+
import os
4+
import requests
5+
6+
from ..exceptions import APIBaseURLStatusError
7+
8+
9+
TCGA_API_TOKEN = os.environ.get("TCGA_API_TOKEN", None)
10+
TCGA_API_BASE_URL = os.environ.get("TCGA_API_BASE_URL", "https://api.gdc.cancer.gov/")
11+
12+
13+
request = requests.get(f"{TCGA_API_BASE_URL}/status")
14+
15+
16+
if request.status_code != 200:
17+
raise APIBaseURLStatusError(
18+
f"{TCGA_API_BASE_URL} status: {request.status_code}."
19+
"The resource seems to be unavailable"
20+
)
21+
22+
session = requests.Session()
23+
session.params = {"api_token": TCGA_API_TOKEN, "api_base_url": TCGA_API_BASE_URL}
24+
25+
from .tcga import Data # isort:skip # noqa

src/gdcapiwrapper/data.py renamed to src/gdcapiwrapper/tcga/tcga.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
# encoding: utf-8
22

3-
43
import os
54
import re
65
from datetime import datetime
@@ -11,15 +10,15 @@
1110
from tqdm import tqdm
1211

1312
from . import session
14-
from .util import copyfileobj
13+
from ..util import copyfileobj
1514

1615
__data_endpoint__ = "data"
1716

1817
base_url = f"{session.params.get('api_base_url')}/{__data_endpoint__}"
1918

2019

2120
class Data(object):
22-
""" Provides Data objects for https://api.gdc.cancer.gov/data/ `Data Endpoints`
21+
"""Provides Data objects for https://api.gdc.cancer.gov/data/ `Data Endpoints`
2322
2423
Includes endpoints for file(s) download
2524
"""
@@ -46,7 +45,7 @@ def download(
4645
"""
4746
url = f"{base_url}/{uuid}"
4847

49-
local_filename = uuid if not name else name
48+
local_filename = name if name else uuid
5049
with requests.get(url, stream=True) as r:
5150
total_size = int(r.headers.get("content-length", 0))
5251
bar = tqdm(total=total_size, unit="iB", unit_scale=True)

src/gdcapiwrapper/tcia/__init__.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# encoding: utf-8
2+
3+
import os
4+
import requests
5+
6+
7+
TCIA_API_TOKEN = os.environ.get("TCIA_API_TOKEN", None)
8+
TCIA_API_BASE_URL = os.environ.get(
9+
"TCIA_API_BASE_URL", "https://services.cancerimagingarchive.net/services/v4/TCIA"
10+
)
11+
12+
13+
session = requests.Session()
14+
session.params = {"api_token": TCIA_API_TOKEN, "api_base_url": TCIA_API_BASE_URL}
15+
16+
from .tcia import Data # isort:skip # noqa

src/gdcapiwrapper/tcia/tcia.py

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
# encoding: utf-8
2+
3+
import os
4+
from typing import Tuple
5+
6+
import requests
7+
from responses import Response
8+
from tqdm import tqdm
9+
10+
from ..enums import FORMAT_TYPE as FT
11+
from . import session
12+
from ..util import copyfileobj
13+
14+
__data_endpoint__ = "query"
15+
16+
base_url = f"{session.params.get('api_base_url')}/{__data_endpoint__}"
17+
18+
19+
class Data(object):
20+
"""Provides Data objects for
21+
https://services.cancerimagingarchive.net/services/v4/TCIA/ `Data Endpoints`
22+
"""
23+
24+
@classmethod
25+
def download_single_image(
26+
cls,
27+
series_instance_uid: str,
28+
sop_instance_uid: str,
29+
path: str = ".",
30+
name: str = None,
31+
) -> Tuple[Response, str]:
32+
"""Returns a SINGLE DICOM Object.
33+
34+
A single image is identified by its SeriesInstanceUID and SOPInstanceUID.
35+
This endpoint is meant to be called after `sop_instance_uids`.
36+
37+
Parameters
38+
---------
39+
series_instance_uid : str
40+
SeriesInstance UID
41+
sop_instance_uid: str
42+
SOPInstance UID
43+
path: str
44+
Local path where the file is saved (default: current path)
45+
name: str
46+
Filename. If not provided it will be saved with SOPInstance UID as name
47+
48+
Returns
49+
-------
50+
tuple
51+
response, name of the saved file (relative to `path`)
52+
"""
53+
url = (
54+
f"{base_url}/getSingleImage?SeriesInstanceUID={series_instance_uid}&"
55+
f"SOPInstanceUID={sop_instance_uid}"
56+
)
57+
local_filename = name if name else f"{sop_instance_uid}.dcm"
58+
with requests.get(url, stream=True) as r:
59+
total_size = int(r.headers.get("content-length", 0))
60+
bar = tqdm(total=total_size, unit="iB", unit_scale=True)
61+
with open(os.path.join(path, local_filename), "wb") as f:
62+
copyfileobj(r.raw, f, bar)
63+
return r, local_filename
64+
65+
@classmethod
66+
def download_series_instance_images(
67+
cls, series_instance_uid: str, path: str = ".", name: str = None
68+
) -> Tuple[Response, str]:
69+
"""Returns a single Zip file with set of images for the given SeriesInstance.
70+
71+
Parameters
72+
---------
73+
series_instance_uid : str
74+
SeriesInstance UID
75+
path: str
76+
Local path where the file is saved (default: current path)
77+
name: str
78+
Filename. If not provided it will be saved with SeriesInstance UID as name
79+
80+
Returns
81+
-------
82+
tuple
83+
response, name of the saved file (relative to `path`)
84+
"""
85+
url = f"{base_url}/getImage?SeriesInstanceUID={series_instance_uid}"
86+
local_filename = name if name else f"{series_instance_uid}.zip"
87+
with requests.get(url, stream=True) as r:
88+
total_size = int(r.headers.get("content-length", 0))
89+
bar = tqdm(total=total_size, unit="iB", unit_scale=True)
90+
with open(os.path.join(path, local_filename), "wb") as f:
91+
copyfileobj(r.raw, f, bar)
92+
return r, local_filename
93+
94+
@classmethod
95+
def sop_instance_uids(
96+
cls,
97+
series_instance_uid: str,
98+
format_: str = "JSON",
99+
path: str = ".",
100+
name: str = None,
101+
) -> Tuple[Response, str]:
102+
"""Return a list of SOPInstanceUID for a given SeriesInstanceUID
103+
104+
Parameters
105+
---------
106+
series_instance_uid : str
107+
SeriesInstance UID
108+
format_ : str
109+
Output format. This endpoint supports CSV/HTML/XML/JSON
110+
path: str
111+
Local path where the file is saved (default: current path)
112+
name: str
113+
Filename. If not provided it will be saved with SeriesInstance UID as name
114+
115+
Returns
116+
-------
117+
tuple
118+
response, name of the saved file (relative to `path`) or the parsed JSON
119+
"""
120+
if format_.upper() not in FT.TCIA_ALLOWED_FORMATS.value:
121+
raise ValueError(
122+
f"Format not allowed. Allowed formats:"
123+
f"{list(FT.TCIA_ALLOWED_FORMATS.value)}, got {format_}."
124+
)
125+
url = (
126+
f"{base_url}/getSOPInstanceUIDs?SeriesInstanceUID={series_instance_uid}&"
127+
f"format={format_}"
128+
)
129+
r = requests.get(url)
130+
if format_.upper() == "JSON":
131+
return r, r.json()
132+
133+
local_filename = name if name else f"{series_instance_uid}.{format_.lower()}"
134+
with open(os.path.join(path, local_filename), "wb") as f:
135+
f.write(r.content)
136+
return r, local_filename

tests/mockserver.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212

1313
class MockServerRequestHandler(BaseHTTPRequestHandler):
14-
API_PATTERN = re.compile(r"/data|/")
14+
API_PATTERN = re.compile(r"/data|query|/")
1515

1616
def do_GET(self):
1717
if re.search(self.API_PATTERN, self.path):

0 commit comments

Comments
 (0)