Skip to content

Commit adc9e24

Browse files
Merge pull request #5 from Clarifai/DEVX-375-Data_Ingestion_Pipeline
[DEVX-375]: Data Ingestion Pipeline
2 parents 7b79489 + cbeadd6 commit adc9e24

25 files changed

+1295
-50
lines changed

README.md

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ This is a collection of utilities for handling various types of multimedia data.
1717
* **[Getting Started](#getting-started)**
1818
* **[Features](#features)**
1919
* [Image Utils](#image-utils)
20+
* [Data Ingestion Pipeline](#ingestion-pipeline)
2021
* **[Usage](#usage)**
2122
* **[Examples](#more-examples)**
2223

@@ -58,7 +59,9 @@ annotated_dataset = ImageAnnotations.import_from(path= 'folder_path', format= 'a
5859
- Load various annotated image datasets and export to clarifai Platform
5960
- Convert from one annotation format to other supported annotation formats
6061

61-
62+
### Data Ingestion Pipeline
63+
- Easy-to-use pipelines to load data from files and ingest it into the Clarifai platform.
64+
- Load text files (PDF, DOC, etc.), then transform, chunk, and upload them to the Clarifai Platform
6265

6366
## Usage
6467
### Image Annotation Loader
@@ -81,6 +84,30 @@ coco_dataset.get_info()
8184
coco_dataset.export_to('voc_detection')
8285
```
8386

87+
88+
### Data Ingestion Pipelines
89+
```python
90+
from clarifai_datautils.text import Pipeline, PDFPartition
91+
from clarifai_datautils.text.pipeline.cleaners import Clean_extra_whitespace
92+
93+
# Define the pipeline
94+
pipeline = Pipeline(
95+
name='pipeline-1',
96+
transformations=[
97+
PDFPartition(chunking_strategy = "by_title",max_characters = 1024),
98+
Clean_extra_whitespace()
99+
]
100+
)
101+
102+
103+
# Using SDK to upload
104+
from clarifai.client import Dataset
105+
dataset = Dataset(dataset_url)
106+
dataset.upload_dataset(pipeline.run(files = file_path, loader = True))
107+
108+
```
109+
110+
84111
## More Examples
85112

86113
See many more code examples in this [repo](https://github.com/Clarifai/examples).

clarifai_datautils/image/annotation_conversion/base.py renamed to clarifai_datautils/base/__init__.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
from typing import TypeVar, Union
22

3-
from .features import (VisualClassificationFeatures, VisualDetectionFeatures,
4-
VisualSegmentationFeatures)
3+
from clarifai_datautils.constants.base import DATASET_UPLOAD_TASKS
54

6-
DATASET_UPLOAD_TASKS = ["visual_classification", "visual_detection", "visual_segmentation"]
5+
from .features import (TextFeatures, VisualClassificationFeatures, VisualDetectionFeatures,
6+
VisualSegmentationFeatures)
77

88
OutputFeaturesType = TypeVar(
99
'OutputFeaturesType',
10-
bound=Union[VisualClassificationFeatures, VisualDetectionFeatures, VisualSegmentationFeatures])
10+
bound=Union[VisualClassificationFeatures, VisualDetectionFeatures, VisualSegmentationFeatures,
11+
TextFeatures])
1112

1213

1314
class ClarifaiDataLoader:

clarifai_datautils/image/annotation_conversion/features.py renamed to clarifai_datautils/base/features.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ class VisualClassificationFeatures:
1212
id: Optional[int] = None # image_id
1313
metadata: Optional[dict] = None
1414
image_bytes: Optional[bytes] = None
15+
label_ids: Optional[List[str]] = None
1516

1617

1718
@dataclass
@@ -24,6 +25,7 @@ class VisualDetectionFeatures:
2425
id: Optional[int] = None # image_id
2526
metadata: Optional[dict] = None
2627
image_bytes: Optional[bytes] = None
28+
label_ids: Optional[List[str]] = None
2729

2830

2931
@dataclass
@@ -36,3 +38,14 @@ class VisualSegmentationFeatures:
3638
id: Optional[int] = None # image_id
3739
metadata: Optional[dict] = None
3840
image_bytes: Optional[bytes] = None
41+
label_ids: Optional[List[str]] = None
42+
43+
44+
@dataclass
class TextFeatures:
    """Text classification datasets preprocessing output features."""
    text: str  # raw text content of the sample
    labels: List[Union[str, int]]  # str or int entries, to cater for multi-class tasks
    id: Optional[int] = None  # text_id
    metadata: Optional[dict] = None
    label_ids: Optional[List[str]] = None
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
from dataclasses import dataclass
2+
3+
4+
@dataclass
class DATASET_UPLOAD_TASKS:
    """Canonical task-name constants for dataset uploads.

    Used as class-level attributes (e.g. ``DATASET_UPLOAD_TASKS.VISUAL_CLASSIFICATION``);
    no instance is required.
    """
    VISUAL_CLASSIFICATION: str = "visual_classification"
    VISUAL_DETECTION: str = "visual_detection"
    VISUAL_SEGMENTATION: str = "visual_segmentation"
    TEXT_CLASSIFICATION: str = "text_classification"
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Default maximum chunk size (in characters) used by text partitioners.
MAX_CHARACTERS = 500

# NOTE(review): usage of these two is not visible here — presumably limits for
# traversing/skipping document nodes during ingestion; confirm against callers.
MAX_NODES = 10
SKIP_NODES = 1

clarifai_datautils/image/annotation_conversion/annotations.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,12 @@
44
from datumaro.components.errors import (DatasetError, DatasetImportError, DatasetNotFoundError,
55
MultipleFormatsMatchError)
66

7+
from clarifai_datautils.base import ClarifaiDataLoader
78
from clarifai_datautils.constants.annotations import (IMAGE_ANNOTATION_FORMATS,
89
IMAGE_ANNOTATION_FORMATS_TO_TASKS,
910
IMAGE_FORMAT_MAP)
11+
from clarifai_datautils.constants.base import DATASET_UPLOAD_TASKS
1012
from clarifai_datautils.errors import AnnotationsDatasetError, AnnotationsFormatError
11-
from clarifai_datautils.image.annotation_conversion.base import ClarifaiDataLoader
1213
from clarifai_datautils.image.annotation_conversion.loaders import (ClassificationDataLoader,
1314
DetectionDataLoader,
1415
SegmentationDataLoader)
@@ -165,11 +166,11 @@ def dataloader(self) -> ClarifaiDataLoader:
165166
>>> format = ImageAnnotations.import_from(path=folder_path, format = 'coco_detection')
166167
>>> clarifai_dataset_loader = format.dataloader
167168
"""
168-
if self.task == 'visual_classification':
169+
if self.task == DATASET_UPLOAD_TASKS.VISUAL_CLASSIFICATION:
169170
return ClassificationDataLoader(self._dataset)
170-
elif self.task == 'visual_detection':
171+
elif self.task == DATASET_UPLOAD_TASKS.VISUAL_DETECTION:
171172
return DetectionDataLoader(self._dataset)
172-
elif self.task == 'visual_segmentation':
173+
elif self.task == DATASET_UPLOAD_TASKS.VISUAL_SEGMENTATION:
173174
return SegmentationDataLoader(self._dataset)
174175

175176
def __str__(self) -> str:

clarifai_datautils/image/annotation_conversion/loaders.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,11 @@
55
import numpy as np
66
from datumaro.components.annotation import AnnotationType
77
from datumaro.components.media import ImageFromNumpy
8+
from clarifai_datautils.constants.base import DATASET_UPLOAD_TASKS
89

9-
from .base import ClarifaiDataLoader
10-
from .features import (VisualClassificationFeatures, VisualDetectionFeatures,
11-
VisualSegmentationFeatures)
10+
from ...base import ClarifaiDataLoader
11+
from ...base.features import (VisualClassificationFeatures, VisualDetectionFeatures,
12+
VisualSegmentationFeatures)
1213

1314
delimiters = [",", "|", ";", "/", "\\", ":", " "]
1415

@@ -35,7 +36,7 @@ def __init__(self, annotation_object):
3536

3637
@property
3738
def task(self):
38-
return "visual_classification"
39+
return DATASET_UPLOAD_TASKS.VISUAL_CLASSIFICATION
3940

4041
def __getitem__(self, index: int):
4142
dataset_item = self.annotation_object.get(
@@ -90,7 +91,7 @@ def __init__(self, annotation_object):
9091

9192
@property
9293
def task(self):
93-
return "visual_detection"
94+
return DATASET_UPLOAD_TASKS.VISUAL_DETECTION
9495

9596
def __getitem__(self, index: int):
9697
dataset_item = self.annotation_object.get(
@@ -170,7 +171,7 @@ def __init__(self, annotation_object):
170171

171172
@property
172173
def task(self):
173-
return "visual_segmentation"
174+
return DATASET_UPLOAD_TASKS.VISUAL_SEGMENTATION
174175

175176
def __getitem__(self, index: int):
176177
dataset_item = self.annotation_object.get(
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from clarifai_datautils.text.pipeline.base import Pipeline
2+
from clarifai_datautils.text.pipeline.PDF import PDFPartition
3+
from clarifai_datautils.text.pipeline.Text import TextPartition
4+
5+
__all__ = ['Pipeline', 'PDFPartition', 'TextPartition']
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
from typing import List
2+
3+
from unstructured.partition.pdf import partition_pdf
4+
5+
from clarifai_datautils.constants.pipeline import MAX_CHARACTERS
6+
7+
from .base import BaseTransform
8+
9+
10+
class PDFPartition(BaseTransform):
    """Partitions PDF files into chunked text elements using unstructured's ``partition_pdf``."""

    def __init__(self,
                 ocr: bool = False,
                 chunking_strategy: str = "basic",
                 max_characters: int = MAX_CHARACTERS,
                 overlap=None,
                 overlap_all: bool = True,
                 **kwargs):
        """Initializes a PDFPartition object.

        Args:
            ocr (bool): Whether to use OCR. Currently ignored — the partition
                strategy is always "fast" (see TODO below).
            chunking_strategy (str): Chunking strategy; either "basic" or "by_title".
            max_characters (int): Maximum number of characters in a chunk.
            overlap (int): Number of characters to overlap between chunks.
            overlap_all (bool): Whether to overlap all chunks.
            kwargs: Additional keyword arguments forwarded to ``partition_pdf``.

        Raises:
            ValueError: If ``chunking_strategy`` is neither "basic" nor "by_title".
        """
        if chunking_strategy not in ("basic", "by_title"):
            raise ValueError("chunking_strategy should be either 'basic' or 'by_title'.")
        self.chunking_strategy = chunking_strategy
        # TODO: honor `ocr` ("ocr" strategy) and add a "hi_res" strategy;
        # until then every file is partitioned with the "fast" strategy.
        self.strategy = "fast"
        self.max_characters = max_characters
        self.overlap = overlap
        self.overlap_all = overlap_all
        self.kwargs = kwargs

    def __call__(self, elements: List[str]) -> List[str]:
        """Applies the transformation: partitions each PDF path into text elements.

        Args:
            elements (List[str]): List of PDF file paths.

        Returns:
            List of partitioned text elements aggregated across all files.
        """
        file_elements = []
        for filename in elements:
            file_element = partition_pdf(
                filename=filename,
                strategy=self.strategy,
                chunking_strategy=self.chunking_strategy,
                max_characters=self.max_characters,
                overlap=self.overlap,
                overlap_all=self.overlap_all,
                **self.kwargs)
            file_elements.extend(file_element)
            # Release per-file chunk list promptly; helps peak memory on large batches.
            del file_element

        return file_elements
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Data Ingestion Pipeline
2+
Load text files (PDF, DOC, etc.), then transform, chunk, and upload them to the Clarifai Platform
3+
4+
## Features
5+
6+
- File Partitioning
7+
- Cleaning Chunks
8+
- Metadata Extraction
9+
10+
11+
## Usage
12+
13+
```python
14+
from clarifai_datautils.text import Pipeline, PDFPartition
15+
from clarifai_datautils.text.pipeline.cleaners import Clean_extra_whitespace
16+
17+
# Define the pipeline
18+
pipeline = Pipeline(
19+
name='pipeline-1',
20+
transformations=[
21+
PDFPartition(chunking_strategy = "by_title",max_characters = 1024),
22+
Clean_extra_whitespace()
23+
]
24+
)
25+
26+
27+
# Using SDK to upload
28+
from clarifai.client import Dataset
29+
dataset = Dataset(dataset_url)
30+
dataset.upload_dataset(pipeline.run(files = file_path, loader = True))
31+
32+
```
33+
34+
## Supported File Formats
35+
- PDF
36+
- Text(.txt)
37+
38+
39+
## Resources
40+
This functionality makes use of the [Unstructured Framework](https://github.com/Unstructured-IO/unstructured)

0 commit comments

Comments
 (0)