Skip to content

Commit 970a67f

Browse files
review changes
1 parent 2803a8f commit 970a67f

File tree

19 files changed

+289
-221
lines changed

19 files changed

+289
-221
lines changed

README.md

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ This is a collection of utilities for handling various types of multimedia data.
1717
* **[Getting Started](#getting-started)**
1818
* **[Features](#features)**
1919
* [Image Utils](#image-utils)
20+
* [Data Ingestion Pipeline](#ingestion-pipeline)
2021
* **[Usage](#usage)**
2122
* **[Examples](#more-examples)**
2223

@@ -58,7 +59,9 @@ annotated_dataset = ImageAnnotations.import_from(path= 'folder_path', format= 'a
5859
- Load various annotated image datasets and export to clarifai Platform
5960
- Convert from one annotation format to other supported annotation formats
6061

61-
62+
### Data Ingestion Pipeline
63+
- Easy-to-use pipelines to load data from files and ingest into the Clarifai platform.
64+
- Load text files (pdf, doc, etc.), transform, chunk and upload to the Clarifai Platform
6265

6366
## Usage
6467
### Image Annotation Loader
@@ -81,6 +84,30 @@ coco_dataset.get_info()
8184
coco_dataset.export_to('voc_detection')
8285
```
8386

87+
88+
### Data Ingestion Pipelines
89+
```python
90+
from clarifai_datautils.text import Pipeline, PDFPartition
91+
from clarifai_datautils.text.pipeline.cleaners import Clean_extra_whitespace
92+
93+
# Define the pipeline
94+
pipeline = Pipeline(
95+
name='pipeline-1',
96+
transformations=[
97+
PDFPartition(chunking_strategy = "by_title",max_characters = 1024),
98+
Clean_extra_whitespace()
99+
]
100+
)
101+
102+
103+
# Using SDK to upload
104+
from clarifai.client import Dataset
105+
dataset = Dataset(dataset_url)
106+
dataset.upload_dataset(pipeline.run(files = file_path, loader = True))
107+
108+
```
109+
110+
84111
## More Examples
85112

86113
See many more code examples in this [repo](https://github.com/Clarifai/examples).

clarifai_datautils/base/__init__.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
11
from typing import TypeVar, Union
22

3+
from clarifai_datautils.constants.base import DATASET_UPLOAD_TASKS
4+
35
from .features import (TextFeatures, VisualClassificationFeatures, VisualDetectionFeatures,
46
VisualSegmentationFeatures)
57

6-
DATASET_UPLOAD_TASKS = [
7-
"visual_classification", "visual_detection", "visual_segmentation", "text_classification"
8-
]
9-
108
OutputFeaturesType = TypeVar(
119
'OutputFeaturesType',
1210
bound=Union[VisualClassificationFeatures, VisualDetectionFeatures, VisualSegmentationFeatures,
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
from dataclasses import dataclass
2+
3+
4+
@dataclass
5+
class DATASET_UPLOAD_TASKS:
6+
VISUAL_CLASSIFICATION: str = "visual_classification"
7+
VISUAL_DETECTION: str = "visual_detection"
8+
VISUAL_SEGMENTATION: str = "visual_segmentation"
9+
TEXT_CLASSIFICATION: str = "text_classification"

clarifai_datautils/image/annotation_conversion/annotations.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from clarifai_datautils.constants.annotations import (IMAGE_ANNOTATION_FORMATS,
99
IMAGE_ANNOTATION_FORMATS_TO_TASKS,
1010
IMAGE_FORMAT_MAP)
11+
from clarifai_datautils.constants.base import DATASET_UPLOAD_TASKS
1112
from clarifai_datautils.errors import AnnotationsDatasetError, AnnotationsFormatError
1213
from clarifai_datautils.image.annotation_conversion.loaders import (ClassificationDataLoader,
1314
DetectionDataLoader,
@@ -165,11 +166,11 @@ def dataloader(self) -> ClarifaiDataLoader:
165166
>>> format = ImageAnnotations.import_from(path=folder_path, format = 'coco_detection')
166167
>>> clarifai_dataset_loader = format.dataloader
167168
"""
168-
if self.task == 'visual_classification':
169+
if self.task == DATASET_UPLOAD_TASKS.VISUAL_CLASSIFICATION:
169170
return ClassificationDataLoader(self._dataset)
170-
elif self.task == 'visual_detection':
171+
elif self.task == DATASET_UPLOAD_TASKS.VISUAL_DETECTION:
171172
return DetectionDataLoader(self._dataset)
172-
elif self.task == 'visual_segmentation':
173+
elif self.task == DATASET_UPLOAD_TASKS.VISUAL_SEGMENTATION:
173174
return SegmentationDataLoader(self._dataset)
174175

175176
def __str__(self) -> str:

clarifai_datautils/image/annotation_conversion/loaders.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import numpy as np
66
from datumaro.components.annotation import AnnotationType
77
from datumaro.components.media import ImageFromNumpy
8+
from clarifai_datautils.constants.base import DATASET_UPLOAD_TASKS
89

910
from ...base import ClarifaiDataLoader
1011
from ...base.features import (VisualClassificationFeatures, VisualDetectionFeatures,
@@ -35,7 +36,7 @@ def __init__(self, annotation_object):
3536

3637
@property
3738
def task(self):
38-
return "visual_classification"
39+
return DATASET_UPLOAD_TASKS.VISUAL_CLASSIFICATION
3940

4041
def __getitem__(self, index: int):
4142
dataset_item = self.annotation_object.get(
@@ -90,7 +91,7 @@ def __init__(self, annotation_object):
9091

9192
@property
9293
def task(self):
93-
return "visual_detection"
94+
return DATASET_UPLOAD_TASKS.VISUAL_DETECTION
9495

9596
def __getitem__(self, index: int):
9697
dataset_item = self.annotation_object.get(
@@ -170,7 +171,7 @@ def __init__(self, annotation_object):
170171

171172
@property
172173
def task(self):
173-
return "visual_segmentation"
174+
return DATASET_UPLOAD_TASKS.VISUAL_SEGMENTATION
174175

175176
def __getitem__(self, index: int):
176177
dataset_item = self.annotation_object.get(
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from clarifai_datautils.text.pipeline.base import Pipeline
22
from clarifai_datautils.text.pipeline.PDF import PDFPartition
3-
from clarifai_datautils.text.pipeline.Text import Text_Partition
3+
from clarifai_datautils.text.pipeline.Text import TextPartition
44

5-
__all__ = ['Pipeline', 'PDFPartition', 'Text_Partition']
5+
__all__ = ['Pipeline', 'PDFPartition', 'TextPartition']

clarifai_datautils/text/pipeline/PDF.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,15 @@ def __init__(self,
1919
**kwargs):
2020
"""Initializes an PDFPartition object.
2121
22-
Args:
23-
ocr (bool): Whether to use OCR.
24-
chunking_strategy (str): Chunking strategy to use.
25-
max_characters (int): Maximum number of characters in a chunk.
26-
overlap (int): Number of characters to overlap between chunks.
27-
overlap_all (bool): Whether to overlap all chunks.
28-
kwargs: Additional keyword arguments.
29-
"""
22+
Args:
23+
ocr (bool): Whether to use OCR.
24+
chunking_strategy (str): Chunking strategy to use.
25+
max_characters (int): Maximum number of characters in a chunk.
26+
overlap (int): Number of characters to overlap between chunks.
27+
overlap_all (bool): Whether to overlap all chunks.
28+
kwargs: Additional keyword arguments.
29+
30+
"""
3031
if chunking_strategy not in ["basic", "by_title"]:
3132
raise ValueError("chunking_strategy should be either 'basic' or 'by_title'.")
3233
self.chunking_strategy = chunking_strategy
@@ -39,13 +40,13 @@ def __init__(self,
3940
def __call__(self, elements: List[str]) -> List[str]:
4041
"""Applies the transformation.
4142
42-
Args:
43-
elements (List[str]): List of text elements.
43+
Args:
44+
elements (List[str]): List of text elements.
4445
45-
Returns:
46-
List of transformed text elements.
46+
Returns:
47+
List of transformed text elements.
4748
48-
"""
49+
"""
4950
file_elements = []
5051
for filename in elements:
5152
file_element = partition_pdf(
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Data Ingestion Pipeline
2+
Load text files (pdf, doc, etc.), transform, chunk and upload to the Clarifai Platform
3+
4+
## Features
5+
6+
- File Partitioning
7+
- Cleaning Chunks
8+
- Metadata Extraction
9+
10+
11+
## Usage
12+
13+
```python
14+
from clarifai_datautils.text import Pipeline, PDFPartition
15+
from clarifai_datautils.text.pipeline.cleaners import Clean_extra_whitespace
16+
17+
# Define the pipeline
18+
pipeline = Pipeline(
19+
name='pipeline-1',
20+
transformations=[
21+
PDFPartition(chunking_strategy = "by_title",max_characters = 1024),
22+
Clean_extra_whitespace()
23+
]
24+
)
25+
26+
27+
# Using SDK to upload
28+
from clarifai.client import Dataset
29+
dataset = Dataset(dataset_url)
30+
dataset.upload_dataset(pipeline.run(files = file_path, loader = True))
31+
32+
```
33+
34+
## Supported File Formats
35+
- PDF
36+
- Text(.txt)
37+
38+
39+
## Resources
40+
This functionality makes use of the [Unstructured Framework](https://github.com/Unstructured-IO/unstructured)

clarifai_datautils/text/pipeline/Text.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from .base import BaseTransform
88

99

10-
class Text_Partition(BaseTransform):
10+
class TextPartition(BaseTransform):
1111
"""Partitions PDF file into text elements."""
1212

1313
def __init__(self,
@@ -18,13 +18,14 @@ def __init__(self,
1818
**kwargs):
1919
"""Initializes an PDFPartition object.
2020
21-
Args:
22-
chunking_strategy (str): Chunking strategy to use.
23-
max_characters (int): Maximum number of characters in a chunk.
24-
overlap (int): Number of characters to overlap between chunks.
25-
overlap_all (bool): Whether to overlap all chunks.
26-
kwargs: Additional keyword arguments.
27-
"""
21+
Args:
22+
chunking_strategy (str): Chunking strategy to use.
23+
max_characters (int): Maximum number of characters in a chunk.
24+
overlap (int): Number of characters to overlap between chunks.
25+
overlap_all (bool): Whether to overlap all chunks.
26+
kwargs: Additional keyword arguments.
27+
28+
"""
2829
if chunking_strategy not in ["basic", "by_title"]:
2930
raise ValueError("chunking_strategy should be either 'basic' or 'by_title'.")
3031
self.chunking_strategy = chunking_strategy
@@ -36,13 +37,13 @@ def __init__(self,
3637
def __call__(self, elements: List[str]) -> List[str]:
3738
"""Applies the transformation.
3839
39-
Args:
40-
elements (List[str]): List of text elements.
40+
Args:
41+
elements (List[str]): List of text elements.
4142
42-
Returns:
43-
List of transformed text elements.
43+
Returns:
44+
List of transformed text elements.
4445
45-
"""
46+
"""
4647
file_elements = []
4748
for filename in elements:
4849
file_element = partition_text(

clarifai_datautils/text/pipeline/base.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import os
2-
from typing import List
2+
from typing import List, Type
33

44
from tqdm import tqdm
55

@@ -23,7 +23,7 @@ class Pipeline:
2323
def __init__(
2424
self,
2525
name: str,
26-
transformations: List,
26+
transformations: List[Type[BaseTransform]],
2727
):
2828
"""Initializes an Pipeline object.
2929
@@ -57,7 +57,7 @@ def run(self,
5757
List of transformed elements or ClarifaiDataLoader object.
5858
5959
Example:
60-
>>> from clarifai-datautils.text import Pipeline
60+
>>> from clarifai_datautils.text import Pipeline
6161
>>> dataloader = Pipeline().run(files = 'xx.pdf', loader = True))
6262
"""
6363
if files is None and folder is None:
@@ -99,8 +99,7 @@ def load() -> 'Pipeline':
9999
pass
100100

101101
def save(self,) -> None:
102-
"""Saves the pipeline to a yaml file.
103-
"""
102+
"""Saves the pipeline to a yaml file."""
104103
#TODO: Implement this
105104
pass
106105

0 commit comments

Comments
 (0)