Merge pull request #3 from histolab/integrate-TCIA-endpoints

ernestoarbitrio · web-flow · commit 5ffbf5eaaee1 · 2020-11-10T17:43:26.000+01:00
Integrate tcia endpoints
diff --git a/README.md b/README.md
@@ -16,17 +16,75 @@ The GDC API drives the GDC Data and Submission Portals and provides programmatic
 ### Installation
 `pip install gdc-api-wrapper`
 
+## TCGA API Reference
+
 ### Download single file
 ```python
-from gdcapiwrapper.data import Data
+from gdcapiwrapper.tcga import Data
 Data.download(uuid="uuid-file-you-wanna-download", path="/local/path", name="filename")
 ```
 NOTE: `path` and `name` are optional, by default path is your current directory and if name is 
 not provided it will be saved with the UUID as filename.
 
 ### Download multiple files
 ```python
-from gdcapiwrapper.data import Data
-Data.download_multiple(uuid_list=["UUID1", "UUID2", "UUID3"], path="/local/path")
+from gdcapiwrapper.tcga import Data
+response, filename = Data.download_multiple(uuid_list=["UUID1", "UUID2", "UUID3"], path="/local/path")
 ```
 NOTE: `path` is optional, by default path is your current directory.
+
+
+## TCIA API Reference
+
+### Get a list of SOPInstanceUID for a given series
+```python
+from gdcapiwrapper.tcia import Data
+# Example for CSV, HTML, XML
+response, filename = Data.sop_instance_uids(
+                        series_instance_uid="uid.series.instance",
+                        format_="CSV",
+                        path="/local/path", 
+                        name="filename"
+                    )
+# Example for JSON
+response, json = Data.sop_instance_uids(series_instance_uid="uid.series.instance")
+```
+Formats allowed: `["CSV", "HTML", "JSON", "XML"]`, default: `JSON`. When `JSON` is requested, the API does not save any
+JSON file to disk; it returns an in-memory JSON object instead.
+ 
+NOTE: `path` and `name` are optional, by default path is your current directory and if name is 
+not provided it will be saved as `<SeriesInstanceUID>.<format>` (format in lowercase).
+
+### Download Single DICOM image
+```python
+from gdcapiwrapper.tcia import Data
+response, filename = Data.download_single_image(
+                        series_instance_uid="uid.series.instance",
+                        sop_instance_uid="uid.sop.instance",
+                        path="/local/path",
+                        name="filename.dcm",
+                    )
+```
+NOTE: `path` and `name` are optional, by default path is your current directory and if name is 
+not provided it will be saved as `<SOPInstanceUID>.dcm`.
+
+### Download set of images in a zip file 
+```python
+from gdcapiwrapper.tcia import Data
+response, filename = Data.download_series_instance_images(
+                        series_instance_uid="uid.series.instance",
+                        path="/local/path",
+                        name="filename.zip")
+```
+NOTE: `path` and `name` are optional, by default path is your current directory and if name is 
+not provided it will be saved as `<SeriesInstanceUID>.zip`.
+
+## Changelog
+
+### 0.1
+- TCGA Api endpoints
+
+### 0.2
+- Bug Fix on TCGA Apis
+- Public interface refactoring [breaking change]
+- TCIA Api endpoints
diff --git a/src/gdcapiwrapper/__init__.py b/src/gdcapiwrapper/__init__.py
@@ -1,31 +1,5 @@
 # encoding: utf-8
 
-import os
-import requests
+"""Initialization module for gdcapiwrapper package."""
 
-__version__ = "0.1"
-GDC_API_TOKEN = os.environ.get("GCC_API_TOKEN", None)
-GDC_API_BASE_URL = os.environ.get("GDC_API_BASE_URL", "https://api.gdc.cancer.gov/")
-
-
-class APIBaseURLStatusError(Exception):
-    pass
-
-
-class APITokenMissingError(Exception):
-    pass
-
-
-request = requests.get(f"{GDC_API_BASE_URL}/status")
-
-
-if request.status_code != 200:
-    raise APIBaseURLStatusError(
-        f"{GDC_API_BASE_URL} status: {request.status_code}."
-        "The resource seems to be unavailable"
-    )
-
-session = requests.Session()
-session.params = {"api_token": GDC_API_TOKEN, "api_base_url": GDC_API_BASE_URL}
-
-from .data import Data  # isort:skip # noqa
+__version__ = "0.2b"
diff --git a/src/gdcapiwrapper/enums.py b/src/gdcapiwrapper/enums.py
@@ -0,0 +1,16 @@
+# encoding: utf-8
+
+from enum import Enum
+
+
+class FORMAT_TYPE(Enum):
+    """Enumerated values representing the various types of file format."""
+
+    # ---member definitions---
+    CSV = "CSV"
+    HTML = "HTML"
+    JSON = "JSON"
+    XML = "XML"
+
+    # ---allowed formats for TCIA apis---
+    TCIA_ALLOWED_FORMATS = frozenset((CSV, HTML, JSON, XML))
diff --git a/src/gdcapiwrapper/exceptions.py b/src/gdcapiwrapper/exceptions.py
@@ -0,0 +1,9 @@
+# encoding: utf-8
+
+
+class APIBaseURLStatusError(Exception):
+    pass
+
+
+class APITokenMissingError(Exception):
+    pass
diff --git a/src/gdcapiwrapper/tcga/__init__.py b/src/gdcapiwrapper/tcga/__init__.py
@@ -0,0 +1,25 @@
+# encoding: utf-8
+
+import os
+import requests
+
+from ..exceptions import APIBaseURLStatusError
+
+
+TCGA_API_TOKEN = os.environ.get("TCGA_API_TOKEN", None)
+TCGA_API_BASE_URL = os.environ.get("TCGA_API_BASE_URL", "https://api.gdc.cancer.gov/")
+
+
+request = requests.get(f"{TCGA_API_BASE_URL}/status")
+
+
+if request.status_code != 200:
+    raise APIBaseURLStatusError(
+        f"{TCGA_API_BASE_URL} status: {request.status_code}."
+        "The resource seems to be unavailable"
+    )
+
+session = requests.Session()
+session.params = {"api_token": TCGA_API_TOKEN, "api_base_url": TCGA_API_BASE_URL}
+
+from .tcga import Data  # isort:skip # noqa
diff --git a/src/gdcapiwrapper/tcga/tcga.py b/src/gdcapiwrapper/tcga/tcga.py
@@ -1,6 +1,5 @@
 # encoding: utf-8
 
-
 import os
 import re
 from datetime import datetime
@@ -11,15 +10,15 @@
 from tqdm import tqdm
 
 from . import session
-from .util import copyfileobj
+from ..util import copyfileobj
 
 __data_endpoint__ = "data"
 
 base_url = f"{session.params.get('api_base_url')}/{__data_endpoint__}"
 
 
 class Data(object):
-    """ Provides Data objects for https://api.gdc.cancer.gov/data/ `Data Endpoints`
+    """Provides Data objects for https://api.gdc.cancer.gov/data/ `Data Endpoints`
 
     Includes endpoints for file(s) download
     """
@@ -46,7 +45,7 @@ def download(
         """
         url = f"{base_url}/{uuid}"
 
-        local_filename = uuid if not name else name
+        local_filename = name if name else uuid
         with requests.get(url, stream=True) as r:
             total_size = int(r.headers.get("content-length", 0))
             bar = tqdm(total=total_size, unit="iB", unit_scale=True)
diff --git a/src/gdcapiwrapper/tcia/__init__.py b/src/gdcapiwrapper/tcia/__init__.py
@@ -0,0 +1,16 @@
+# encoding: utf-8
+
+import os
+import requests
+
+
+TCIA_API_TOKEN = os.environ.get("TCIA_API_TOKEN", None)
+TCIA_API_BASE_URL = os.environ.get(
+    "TCIA_API_BASE_URL", "https://services.cancerimagingarchive.net/services/v4/TCIA"
+)
+
+
+session = requests.Session()
+session.params = {"api_token": TCIA_API_TOKEN, "api_base_url": TCIA_API_BASE_URL}
+
+from .tcia import Data  # isort:skip # noqa
diff --git a/src/gdcapiwrapper/tcia/tcia.py b/src/gdcapiwrapper/tcia/tcia.py
@@ -0,0 +1,136 @@
+# encoding: utf-8
+
+import os
+from typing import Tuple
+
+import requests
+from responses import Response
+from tqdm import tqdm
+
+from ..enums import FORMAT_TYPE as FT
+from . import session
+from ..util import copyfileobj
+
+__data_endpoint__ = "query"
+
+base_url = f"{session.params.get('api_base_url')}/{__data_endpoint__}"
+
+
+class Data(object):
+    """Provides Data objects for
+    https://services.cancerimagingarchive.net/services/v4/TCIA/ `Data Endpoints`
+    """
+
+    @classmethod
+    def download_single_image(
+        cls,
+        series_instance_uid: str,
+        sop_instance_uid: str,
+        path: str = ".",
+        name: str = None,
+    ) -> Tuple[Response, str]:
+        """Returns a SINGLE DICOM Object.
+
+        A single image is identified by its SeriesInstanceUID and SOPInstanceUID.
+        This API will always be used following the `sop_instance_uids`
+
+        Parameters
+        ---------
+        series_instance_uid : str
+            SeriesInstance UID
+        sop_instance_uid: str
+            SOPInstanceUID UID
+        path: str
+            Local path where save file (default: current path)
+        name: str
+            Filename. If not provided it will be saved with SOPInstance UID as name
+
+        Returns
+        -------
+        tuple
+            response, filename absolute path
+        """
+        url = (
+            f"{base_url}/getSingleImage?SeriesInstanceUID={series_instance_uid}&"
+            f"SOPInstanceUID={sop_instance_uid}"
+        )
+        local_filename = name if name else f"{sop_instance_uid}.dcm"
+        with requests.get(url, stream=True) as r:
+            total_size = int(r.headers.get("content-length", 0))
+            bar = tqdm(total=total_size, unit="iB", unit_scale=True)
+            with open(os.path.join(path, local_filename), "wb") as f:
+                copyfileobj(r.raw, f, bar)
+        return r, local_filename
+
+    @classmethod
+    def download_series_instance_images(
+        cls, series_instance_uid: str, path: str = ".", name: str = None
+    ) -> Tuple[Response, str]:
+        """Returns a single Zip file with set of images for the given SeriesInstance.
+
+        Parameters
+        ---------
+        series_instance_uid : str
+            SeriesInstance UID
+        path: str
+            Local path where save file (default: current path)
+        name: str
+            Filename. If not provided it will be saved with SOPInstance UID as name
+
+        Returns
+        -------
+        tuple
+            response, filename absolute path
+        """
+        url = f"{base_url}/getImage?SeriesInstanceUID={series_instance_uid}"
+        local_filename = name if name else f"{series_instance_uid}.zip"
+        with requests.get(url, stream=True) as r:
+            total_size = int(r.headers.get("content-length", 0))
+            bar = tqdm(total=total_size, unit="iB", unit_scale=True)
+            with open(os.path.join(path, local_filename), "wb") as f:
+                copyfileobj(r.raw, f, bar)
+        return r, local_filename
+
+    @classmethod
+    def sop_instance_uids(
+        cls,
+        series_instance_uid: str,
+        format_: str = "JSON",
+        path: str = ".",
+        name: str = None,
+    ) -> Tuple[Response, str]:
+        """Return a list of SOPInstanceUID for a given SeriesInstanceUID
+
+        Parameters
+        ---------
+        series_instance_uid : str
+            SeriesInstance UID
+        format_ : str
+            Output format. This endpoint supports CSV/HTML/XML/JSON
+        path: str
+            Local path where save file (default: current path)
+        name: str
+            Filename. If not provided it will be saved with SeriesInstance UID as name
+
+        Returns
+        -------
+        tuple
+            response, filename absolute path or json
+        """
+        if format_.upper() not in FT.TCIA_ALLOWED_FORMATS.value:
+            raise ValueError(
+                f"Format not allowed. Allowed formats:"
+                f"{list(FT.TCIA_ALLOWED_FORMATS.value)}, got {format_}."
+            )
+        url = (
+            f"{base_url}/getSOPInstanceUIDs?SeriesInstanceUID={series_instance_uid}&"
+            f"format={format_}"
+        )
+        r = requests.get(url)
+        if format_.upper() == "JSON":
+            return r, r.json()
+
+        local_filename = name if name else f"{series_instance_uid}.{format_.lower()}"
+        with open(os.path.join(path, local_filename), "wb") as f:
+            f.write(r.content)
+        return r, local_filename
diff --git a/tests/mockserver.py b/tests/mockserver.py
@@ -11,7 +11,7 @@
 
 
 class MockServerRequestHandler(BaseHTTPRequestHandler):
-    API_PATTERN = re.compile(r"/data|/")
+    API_PATTERN = re.compile(r"/data|query|/")
 
     def do_GET(self):
         if re.search(self.API_PATTERN, self.path):
diff --git a/tests/unit/test_data.py b/tests/unit/test_data.py