Merge pull request #2 from Clarifai/DEVX-371-export_update

sanjaychelliah · web-flow · commit 89fc2f20749b · 2024-03-19T02:23:41.000+05:30
[DEVX-371]: Export from Clarifai Platform to other formats
diff --git a/README.md b/README.md
@@ -67,9 +67,11 @@ from clarifai_datautils import ImageAnnotations
 #import from folder
 coco_dataset = ImageAnnotations.import_from(path='folder_path',format= 'coco_detection')
 
-#clarifai dataset loader object
-coco_dataset.dataloader
-
+#Using clarifai SDK to upload to Clarifai Platform
+#export CLARIFAI_PAT={your personal access token}  # set PAT as env variable
+from clarifai.client.dataset import Dataset
+dataset = Dataset(user_id="user_id", app_id="app_id", dataset_id="dataset_id")
+dataset.upload_dataset(dataloader=coco_dataset.dataloader)
 
 #info about loaded dataset
 coco_dataset.get_info()
diff --git a/clarifai_datautils/constants/annotations.py b/clarifai_datautils/constants/annotations.py
@@ -1,6 +1,7 @@
 IMAGE_ANNOTATION_FORMATS = [
     'coco_segmentation', 'voc_detection', 'yolo', 'cifar', 'coco_detection', 'cvat', 'imagenet',
-    'kitti', 'label_me', 'mnist', 'open_images', 'vgg_face2', 'lfw', 'cityscapes', 'ade20k2017'
+    'kitti', 'label_me', 'mnist', 'open_images', 'vgg_face2', 'lfw', 'cityscapes', 'ade20k2017',
+    'clarifai'
 ]
 
 IMAGE_ANNOTATION_TASKS = ['visual_classification', 'visual_detection', 'visual_segmentation']
@@ -11,6 +12,7 @@
     'mnist': 'visual_classification',
     'vgg_face2': 'visual_classification',
     'lfw': 'visual_classification',
+    'clarifai': 'visual_detection',
     'voc_detection': 'visual_detection',
     'yolo': 'visual_detection',
     'coco_detection': 'visual_detection',
diff --git a/clarifai_datautils/image/annotation_conversion/README.md b/clarifai_datautils/image/annotation_conversion/README.md
@@ -11,9 +11,6 @@ from clarifai_datautils import ImageAnnotations
 #import from folder
 coco_dataset = ImageAnnotations.import_from(path='folder_path',format= 'coco_detection')
 
-#clarifai dataset loader object
-coco_dataset.dataloader
-
 
 #info about loaded dataset
 coco_dataset.get_info()
@@ -24,7 +21,7 @@ coco_dataset.export_to('voc_detection')
 ```
 
 
-### With Clarifai Python SDK
+### Upload using Clarifai Python SDK
 ```python
 from clarifai_datautils import ImageAnnotations
 coco_dataset = ImageAnnotations.import_from(path='folder_path',format= 'coco_detection')
@@ -38,6 +35,24 @@ dataset.upload_dataset(dataloader=coco_dataset.dataloader)
 ```
 
 
+### Export to other formats from Clarifai Platform
+```python
+
+#clarifai SDK
+#export CLARIFAI_PAT={your personal access token}  # set PAT as env variable
+from clarifai.client.dataset import Dataset
+dataset = Dataset(user_id="user_id", app_id="app_id", dataset_id="dataset_id")
+dataset.export(save_path='output.zip',split='train')
+
+#Extract the zip file and pass the folder to ImageAnnotations
+from clarifai_datautils import ImageAnnotations
+clarifai_dataset = ImageAnnotations.import_from(path='folder_path',format= 'clarifai')
+
+#export to other formats
+clarifai_dataset.export_to(path='output_path',format='coco_detection',save_images=True)
+
+```
+
 ## Supported Formats
 
 | Annotation format                                                                                | Format       |      TASK       |
@@ -54,6 +69,7 @@ dataset.upload_dataset(dataloader=coco_dataset.dataloader)
 | [Kitti](http://www.cvlibs.net/datasets/kitti/index.php)                                          | kitti     | detection  |
 | [LabelMe](http://labelme.csail.mit.edu/Release3.0)                                               | label_me     | detection  |
 | [Open Images](https://storage.googleapis.com/openimages/web/download.html)                       | open_images     | detection  |
+| [Clarifai](https://github.com/Clarifai/examples/tree/main/Data_Utils)                       | clarifai     | detection  |
 | [COCO(segmentation)](http://cocodataset.org/#format-data)                                     | coco_segmentation     | segmentation  |
 | [Cityscapes](https://www.cityscapes-dataset.com/)                                                | cityscapes     | segmentation  |
 | [ADE](https://www.cityscapes-dataset.com/)                                                       | ade20k2017     | segmentation  |
diff --git a/clarifai_datautils/image/annotation_conversion/annotations.py b/clarifai_datautils/image/annotation_conversion/annotations.py
@@ -12,6 +12,7 @@
 from clarifai_datautils.image.annotation_conversion.loaders import (ClassificationDataLoader,
                                                                     DetectionDataLoader,
                                                                     SegmentationDataLoader)
+from clarifai_datautils.image.annotation_conversion.utils import Clarifai_to_Datumaro
 
 
 class ImageAnnotations():
@@ -56,11 +57,15 @@ def import_from(cls, path: str, format: str) -> Dataset:
     #task of the dataset
     task = IMAGE_ANNOTATION_FORMATS_TO_TASKS[format]
 
-    try:
-      format_name = IMAGE_FORMAT_MAP[format]
-      dataset = Dataset.import_from(path, format_name)
-    except (DatasetError, DatasetImportError, DatasetNotFoundError) as ex:
-      raise AnnotationsDatasetError(ex)
+    #import dataset
+    if format == 'clarifai':
+      dataset = Clarifai_to_Datumaro(path).convert()
+    else:
+      try:
+        format_name = IMAGE_FORMAT_MAP[format]
+        dataset = Dataset.import_from(path, format_name)
+      except (DatasetError, DatasetImportError, DatasetNotFoundError) as ex:
+        raise AnnotationsDatasetError(ex)
 
     return ImageAnnotations(dataset, format, task)
 
@@ -84,12 +89,13 @@ def get_info(self,) -> Dict[str, Any]:
         'categories': list(self._dataset.get_categories_info())
     }
 
-  def export_to(self, path: str, format: str) -> None:
+  def export_to(self, path: str, format: str, save_images: bool = False) -> None:
     """Exports a dataset to a given path and format.
 
     Args:
         path (str): The path to the dataset.
         format (str): The format of the dataset.
+        save_images (bool): Whether to also export the image files with the annotations. Defaults to False.
 
     Example:
         >>> from clarifai_datautils import ImageAnnotations
@@ -99,9 +105,13 @@ def export_to(self, path: str, format: str) -> None:
     if format not in IMAGE_ANNOTATION_FORMATS:
       raise AnnotationsFormatError('Invalid format')
 
+    if format == 'clarifai':
+      raise AnnotationsFormatError(
+          'Cannot export to clarifai format. Use clarifai SDK to upload the dataset.')
+
     try:
       format_name = IMAGE_FORMAT_MAP[format]
-      self._dataset.export(path, format_name)
+      self._dataset.export(path, format_name, save_media=save_images)
     except Exception as ex:
       raise AnnotationsDatasetError(ex)
 
@@ -130,6 +140,19 @@ def detect_format(path: str) -> str:
       raise AnnotationsFormatError('Given folder does not contain a supported dataset format')
     return dataset_format
 
+  @staticmethod
+  def list_formats() -> list:
+    """Lists the supported formats.
+
+    Returns:
+        A list of supported formats.
+
+    Example:
+        >>> from clarifai_datautils import ImageAnnotations
+        >>> ImageAnnotations.list_formats()
+    """
+    return IMAGE_ANNOTATION_FORMATS
+
   @property
   def dataloader(self) -> ClarifaiDataLoader:
     """Returns a Clarifai Dataloader Object to pass to SDK Dataset Upload Functionality.
diff --git a/clarifai_datautils/image/annotation_conversion/utils.py b/clarifai_datautils/image/annotation_conversion/utils.py
@@ -0,0 +1,98 @@
+import json
+import os
+from typing import Tuple
+
+import PIL
+from datumaro.components.annotation import Bbox
+from datumaro.components.dataset import Dataset
+from datumaro.components.dataset_base import DatasetItem
+from datumaro.components.media import Image
+
+from clarifai_datautils.errors import AnnotationsDatasetError, AnnotationsFormatError
+
+
+class Clarifai_to_Datumaro():
+
+  def __init__(
+      self,
+      main_path: str,
+  ):
+    """Converts a clarifai dataset to a Datumaro dataset.
+
+    Args:
+        main_path (str): Path to the clarifai dataset folder holding the 'inputs' and 'annotations' sub-folders.
+
+    """
+    self.main_path = main_path
+    self.image_list = os.listdir(os.path.join(self.main_path, 'inputs'))
+    self.annotations_list = os.listdir(os.path.join(self.main_path, 'annotations'))
+    self.label_map = {}
+
+  def convert(self) -> Dataset:
+    """Checks the folder format and creates a Datumaro Dataset.
+
+    Returns:
+        A Datumaro dataset object.
+    """
+    self.check_folder()
+    # create a dataset
+    dataset = Dataset.from_iterable(
+        iterable=[self.create_item(path) for path in self.image_list],
+        media_type=Image,
+        categories=list(self.label_map.keys()))
+
+    return dataset
+
+  def create_item(self, image_path: str) -> DatasetItem:
+    """Creates a Datumaro item from an image file name located in the 'inputs' folder."""
+    image_full_path = os.path.join(self.main_path, 'inputs', image_path)
+    image_data = Image.from_file(image_full_path)
+    width, height = PIL.Image.open(image_full_path).size
+    try:
+      with open(
+          os.path.join(self.main_path, 'annotations', image_path.split('.png')[0] + '.json'),
+          'r') as file:
+        item_data = json.load(file)
+      # create annotations
+      annotations = []
+      for annot in item_data:
+        #check if the annotation has a bounding box
+        if 'regionInfo' in annot.keys() and 'boundingBox' in annot['regionInfo'].keys():
+          x, y, w, h = self.clarifai_bbox_to_datumaro_bbox(annot['regionInfo']['boundingBox'],
+                                                           width, height)
+          label = annot['data']['concepts'][0]['name']
+          value = self.label_map.get(label, len(self.label_map))
+          self.label_map[label] = value
+          annotations.append(Bbox(x=x, y=y, w=w, h=h, label=value))
+
+    except FileNotFoundError:
+      annotations = []
+
+    return DatasetItem(id=image_path.split('.png')[0], media=image_data, annotations=annotations)
+
+  def clarifai_bbox_to_datumaro_bbox(self, clarifai_bbox, width, height) -> Tuple[int]:
+    left_col = clarifai_bbox['leftCol'] * width
+    top_row = clarifai_bbox['topRow'] * height
+    right_col = clarifai_bbox['rightCol'] * width
+    bottom_row = clarifai_bbox['bottomRow'] * height
+
+    obj_box = (left_col, top_row, right_col - left_col, bottom_row - top_row)
+    return obj_box
+
+  def check_folder(self):
+    """Checks the clarifai folder format."""
+    if not os.path.exists(self.main_path):
+      raise AnnotationsDatasetError(f'Folder not found at {self.main_path}')
+
+    if not os.path.exists(os.path.join(self.main_path, 'inputs')):
+      raise AnnotationsFormatError(
+          f'Folder does not contain an "inputs" folder at {self.main_path}')
+    if not os.path.exists(os.path.join(self.main_path, 'annotations')):
+      raise AnnotationsFormatError(
+          f'Folder does not contain an "annotations" folder at {self.main_path}')
+
+    if not all(img.endswith('.png') for img in self.image_list):
+      raise AnnotationsFormatError(f'Folder should only contain images at {self.main_path}/inputs')
+    if not all(img.endswith('.json') for img in self.annotations_list):
+      raise AnnotationsFormatError(
+          f'Folder should only contain annotations at {self.main_path}/annotations')
diff --git a/tests/annotations/test_clarifai_loader.py b/tests/annotations/test_clarifai_loader.py
@@ -14,6 +14,7 @@
 KITTI_PATH = get_asset_path('kitti_detection')
 LABEL_ME_PATH = get_asset_path('labelme_dataset')
 OPEN_IMAGES_PATH = get_asset_path('openimages_dataset')
+CLARIFAI_PATH = get_asset_path('clarifai_dataset')
 
 COCO_SEGMENTATION_PATH = get_asset_path('coco_segmentation')
 CITYSCAPES_PATH = get_asset_path('cityscapes_dataset')
@@ -83,8 +84,6 @@ def test_coco_detection_loader(self,):
     dataloader = annotation_object.dataloader
     assert dataloader.task == 'visual_detection'
     assert len(dataloader) == 2
-    assert dataloader[0].labels == ['b']
-    assert dataloader[0].id == 'a'
     assert isinstance(dataloader[0].image_bytes, bytes)
 
   def test_cvat_loader(self,):
@@ -121,6 +120,14 @@ def test_open_images_loader(self,):
     assert dataloader[1].id == 'aa'
     assert isinstance(dataloader[0].image_bytes, bytes)
 
+  def test_clarifai_loader(self,):
+    annotation_object = ImageAnnotations.import_from(path=CLARIFAI_PATH, format='clarifai')
+    dataloader = annotation_object.dataloader
+    assert dataloader.task == 'visual_detection'
+    assert len(dataloader) == 1
+    assert dataloader[0].id == '000464'
+    assert isinstance(dataloader[0].image_bytes, bytes)
+
   def test_coco_segmentation_loader(self,):
     annotation_object = ImageAnnotations.import_from(
         path=COCO_SEGMENTATION_PATH, format='coco_segmentation')
diff --git a/tests/annotations/test_import_formats.py b/tests/annotations/test_import_formats.py
@@ -16,6 +16,7 @@
 KITTI_PATH = get_asset_path('kitti_detection')
 LABEL_ME_PATH = get_asset_path('labelme_dataset')
 OPEN_IMAGES_PATH = get_asset_path('openimages_dataset')
+CLARIFAI_PATH = get_asset_path('clarifai_dataset')
 
 COCO_SEGMENTATION_PATH = get_asset_path('coco_segmentation')
 CITYSCAPES_PATH = get_asset_path('cityscapes_dataset')
@@ -118,6 +119,13 @@ def test_open_images_import(self,):
     assert annotation_object.task == 'visual_detection'
     assert len(annotation_object._dataset._data) == 2  # 2 images
 
+  def test_clarifai_import(self,):
+    annotation_object = ImageAnnotations.import_from(path=CLARIFAI_PATH, format='clarifai')
+    assert annotation_object.annotation_format == 'clarifai'
+    assert annotation_object.task == 'visual_detection'
+    assert len(annotation_object._dataset._data) == 1  # 1 image
+    assert annotation_object._dataset.get_annotations() == 2  # 2 annotations
+
   def test_coco_segmentation_import(self,):
     annotation_object = ImageAnnotations.import_from(
         path=COCO_SEGMENTATION_PATH, format='coco_segmentation')
diff --git a/tests/assets/clarifai_dataset/annotations/train_dataset-2007_000464.json b/tests/assets/clarifai_dataset/annotations/train_dataset-2007_000464.json
@@ -0,0 +1 @@
+[{"id": "c72ffee676ad6ae88acfc72791bdae14", "regionInfo": {"boundingBox": {"topRow": 0.502, "leftCol": 0.18666667, "bottomRow": 0.626, "rightCol": 0.5733333}}, "data": {"concepts": [{"id": "id-cow", "name": "cow", "value": 1.0, "appId": "demo_train_1402"}]}}, {"id": "b75d9daca8d6b1f07d2e4a39f36bb4c7", "regionInfo": {"boundingBox": {"topRow": 0.402, "leftCol": 0.152, "bottomRow": 0.588, "rightCol": 0.64}}, "data": {"concepts": [{"id": "id-cow", "name": "cow", "value": 1.0, "appId": "demo_train_1402"}]}}]
diff --git a/tests/assets/clarifai_dataset/inputs/train_dataset-2007_000464.png b/tests/assets/clarifai_dataset/inputs/train_dataset-2007_000464.png

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+[{"id": "c72ffee676ad6ae88acfc72791bdae14", "regionInfo": {"boundingBox": {"topRow": 0.502, "leftCol": 0.18666667, "bottomRow": 0.626, "rightCol": 0.5733333}}, "data": {"concepts": [{"id": "id-cow", "name": "cow", "value": 1.0, "appId": "demo_train_1402"}]}}, {"id": "b75d9daca8d6b1f07d2e4a39f36bb4c7", "regionInfo": {"boundingBox": {"topRow": 0.402, "leftCol": 0.152, "bottomRow": 0.588, "rightCol": 0.64}}, "data": {"concepts": [{"id": "id-cow", "name": "cow", "value": 1.0, "appId": "demo_train_1402"}]}}]