Skip to content

Commit 970a67f

Browse files
review changes
1 parent 2803a8f commit 970a67f

File tree

19 files changed

+289
-221
lines changed

19 files changed

+289
-221
lines changed

README.md

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ This is a collection of utilities for handling various types of multimedia data.
1717
* **[Getting Started](#getting-started)**
1818
* **[Features](#features)**
1919
* [Image Utils](#image-utils)
20+
* [Data Ingestion Pipeline](#ingestion-pipeline)
2021
* **[Usage](#usage)**
2122
* **[Examples](#more-examples)**
2223

@@ -58,7 +59,9 @@ annotated_dataset = ImageAnnotations.import_from(path= 'folder_path', format= 'a
5859
- Load various annotated image datasets and export to clarifai Platform
5960
- Convert from one annotation format to other supported annotation formats
6061

61-
62+
### Data Ingestion Pipeline
63+
- Easy-to-use pipelines to load data from files and ingest into the Clarifai platform.
64+
- Load text files (pdf, doc, etc.), transform, chunk and upload to the Clarifai Platform
6265

6366
## Usage
6467
### Image Annotation Loader
@@ -81,6 +84,30 @@ coco_dataset.get_info()
8184
coco_dataset.export_to('voc_detection')
8285
```
8386

87+
88+
### Data Ingestion Pipelines
89+
```python
90+
from clarifai_datautils.text import Pipeline, PDFPartition
91+
from clarifai_datautils.text.pipeline.cleaners import Clean_extra_whitespace
92+
93+
# Define the pipeline
94+
pipeline = Pipeline(
95+
name='pipeline-1',
96+
transformations=[
97+
PDFPartition(chunking_strategy = "by_title",max_characters = 1024),
98+
Clean_extra_whitespace()
99+
]
100+
)
101+
102+
103+
# Using SDK to upload
104+
from clarifai.client import Dataset
105+
dataset = Dataset(dataset_url)
106+
dataset.upload_dataset(pipeline.run(files = file_path, loader = True))
107+
108+
```
109+
110+
84111
## More Examples
85112

86113
See many more code examples in this [repo](https://github.com/Clarifai/examples).

clarifai_datautils/base/__init__.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
11
from typing import TypeVar, Union
22

3+
from clarifai_datautils.constants.base import DATASET_UPLOAD_TASKS
4+
35
from .features import (TextFeatures, VisualClassificationFeatures, VisualDetectionFeatures,
46
VisualSegmentationFeatures)
57

6-
DATASET_UPLOAD_TASKS = [
7-
"visual_classification", "visual_detection", "visual_segmentation", "text_classification"
8-
]
9-
108
OutputFeaturesType = TypeVar(
119
'OutputFeaturesType',
1210
bound=Union[VisualClassificationFeatures, VisualDetectionFeatures, VisualSegmentationFeatures,
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
from dataclasses import dataclass
2+
3+
4+
@dataclass
5+
class DATASET_UPLOAD_TASKS:
6+
VISUAL_CLASSIFICATION: str = "visual_classification"
7+
VISUAL_DETECTION: str = "visual_detection"
8+
VISUAL_SEGMENTATION: str = "visual_segmentation"
9+
TEXT_CLASSIFICATION: str = "text_classification"

clarifai_datautils/image/annotation_conversion/annotations.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from clarifai_datautils.constants.annotations import (IMAGE_ANNOTATION_FORMATS,
99
IMAGE_ANNOTATION_FORMATS_TO_TASKS,
1010
IMAGE_FORMAT_MAP)
11+
from clarifai_datautils.constants.base import DATASET_UPLOAD_TASKS
1112
from clarifai_datautils.errors import AnnotationsDatasetError, AnnotationsFormatError
1213
from clarifai_datautils.image.annotation_conversion.loaders import (ClassificationDataLoader,
1314
DetectionDataLoader,
@@ -165,11 +166,11 @@ def dataloader(self) -> ClarifaiDataLoader:
165166
>>> format = ImageAnnotations.import_from(path=folder_path, format = 'coco_detection')
166167
>>> clarifai_dataset_loader = format.dataloader
167168
"""
168-
if self.task == 'visual_classification':
169+
if self.task == DATASET_UPLOAD_TASKS.VISUAL_CLASSIFICATION:
169170
return ClassificationDataLoader(self._dataset)
170-
elif self.task == 'visual_detection':
171+
elif self.task == DATASET_UPLOAD_TASKS.VISUAL_DETECTION:
171172
return DetectionDataLoader(self._dataset)
172-
elif self.task == 'visual_segmentation':
173+
elif self.task == DATASET_UPLOAD_TASKS.VISUAL_SEGMENTATION:
173174
return SegmentationDataLoader(self._dataset)
174175

175176
def __str__(self) -> str:

clarifai_datautils/image/annotation_conversion/loaders.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import numpy as np
66
from datumaro.components.annotation import AnnotationType
77
from datumaro.components.media import ImageFromNumpy
8+
from clarifai_datautils.constants.base import DATASET_UPLOAD_TASKS
89

910
from ...base import ClarifaiDataLoader
1011
from ...base.features import (VisualClassificationFeatures, VisualDetectionFeatures,
@@ -35,7 +36,7 @@ def __init__(self, annotation_object):
3536

3637
@property
3738
def task(self):
38-
return "visual_classification"
39+
return DATASET_UPLOAD_TASKS.VISUAL_CLASSIFICATION
3940

4041
def __getitem__(self, index: int):
4142
dataset_item = self.annotation_object.get(
@@ -90,7 +91,7 @@ def __init__(self, annotation_object):
9091

9192
@property
9293
def task(self):
93-
return "visual_detection"
94+
return DATASET_UPLOAD_TASKS.VISUAL_DETECTION
9495

9596
def __getitem__(self, index: int):
9697
dataset_item = self.annotation_object.get(
@@ -170,7 +171,7 @@ def __init__(self, annotation_object):
170171

171172
@property
172173
def task(self):
173-
return "visual_segmentation"
174+
return DATASET_UPLOAD_TASKS.VISUAL_SEGMENTATION
174175

175176
def __getitem__(self, index: int):
176177
dataset_item = self.annotation_object.get(
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from clarifai_datautils.text.pipeline.base import Pipeline
22
from clarifai_datautils.text.pipeline.PDF import PDFPartition
3-
from clarifai_datautils.text.pipeline.Text import Text_Partition
3+
from clarifai_datautils.text.pipeline.Text import TextPartition
44

5-
__all__ = ['Pipeline', 'PDFPartition', 'Text_Partition']
5+
__all__ = ['Pipeline', 'PDFPartition', 'TextPartition']

clarifai_datautils/text/pipeline/PDF.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,15 @@ def __init__(self,
1919
**kwargs):
2020
"""Initializes an PDFPartition object.
2121
22-
Args:
23-
ocr (bool): Whether to use OCR.
24-
chunking_strategy (str): Chunking strategy to use.
25-
max_characters (int): Maximum number of characters in a chunk.
26-
overlap (int): Number of characters to overlap between chunks.
27-
overlap_all (bool): Whether to overlap all chunks.
28-
kwargs: Additional keyword arguments.
29-
"""
22+
Args:
23+
ocr (bool): Whether to use OCR.
24+
chunking_strategy (str): Chunking strategy to use.
25+
max_characters (int): Maximum number of characters in a chunk.
26+
overlap (int): Number of characters to overlap between chunks.
27+
overlap_all (bool): Whether to overlap all chunks.
28+
kwargs: Additional keyword arguments.
29+
30+
"""
3031
if chunking_strategy not in ["basic", "by_title"]:
3132
raise ValueError("chunking_strategy should be either 'basic' or 'by_title'.")
3233
self.chunking_strategy = chunking_strategy
@@ -39,13 +40,13 @@ def __init__(self,
3940
def __call__(self, elements: List[str]) -> List[str]:
4041
"""Applies the transformation.
4142
42-
Args:
43-
elements (List[str]): List of text elements.
43+
Args:
44+
elements (List[str]): List of text elements.
4445
45-
Returns:
46-
List of transformed text elements.
46+
Returns:
47+
List of transformed text elements.
4748
48-
"""
49+
"""
4950
file_elements = []
5051
for filename in elements:
5152
file_element = partition_pdf(
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Data Ingestion Pipeline
2+
Load text files (pdf, doc, etc.), transform, chunk and upload to the Clarifai Platform
3+
4+
## Features
5+
6+
- File Partitioning
7+
- Cleaning Chunks
8+
- Metadata Extraction
9+
10+
11+
## Usage
12+
13+
```python
14+
from clarifai_datautils.text import Pipeline, PDFPartition
15+
from clarifai_datautils.text.pipeline.cleaners import Clean_extra_whitespace
16+
17+
# Define the pipeline
18+
pipeline = Pipeline(
19+
name='pipeline-1',
20+
transformations=[
21+
PDFPartition(chunking_strategy = "by_title",max_characters = 1024),
22+
Clean_extra_whitespace()
23+
]
24+
)
25+
26+
27+
# Using SDK to upload
28+
from clarifai.client import Dataset
29+
dataset = Dataset(dataset_url)
30+
dataset.upload_dataset(pipeline.run(files = file_path, loader = True))
31+
32+
```
33+
34+
## Supported File Formats
35+
- PDF
36+
- Text(.txt)
37+
38+
39+
## Resources
40+
This functionality makes use of the [Unstructured Framework](https://github.com/Unstructured-IO/unstructured)

clarifai_datautils/text/pipeline/Text.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from .base import BaseTransform
88

99

10-
class Text_Partition(BaseTransform):
10+
class TextPartition(BaseTransform):
1111
"""Partitions PDF file into text elements."""
1212

1313
def __init__(self,
@@ -18,13 +18,14 @@ def __init__(self,
1818
**kwargs):
1919
"""Initializes an PDFPartition object.
2020
21-
Args:
22-
chunking_strategy (str): Chunking strategy to use.
23-
max_characters (int): Maximum number of characters in a chunk.
24-
overlap (int): Number of characters to overlap between chunks.
25-
overlap_all (bool): Whether to overlap all chunks.
26-
kwargs: Additional keyword arguments.
27-
"""
21+
Args:
22+
chunking_strategy (str): Chunking strategy to use.
23+
max_characters (int): Maximum number of characters in a chunk.
24+
overlap (int): Number of characters to overlap between chunks.
25+
overlap_all (bool): Whether to overlap all chunks.
26+
kwargs: Additional keyword arguments.
27+
28+
"""
2829
if chunking_strategy not in ["basic", "by_title"]:
2930
raise ValueError("chunking_strategy should be either 'basic' or 'by_title'.")
3031
self.chunking_strategy = chunking_strategy
@@ -36,13 +37,13 @@ def __init__(self,
3637
def __call__(self, elements: List[str]) -> List[str]:
3738
"""Applies the transformation.
3839
39-
Args:
40-
elements (List[str]): List of text elements.
40+
Args:
41+
elements (List[str]): List of text elements.
4142
42-
Returns:
43-
List of transformed text elements.
43+
Returns:
44+
List of transformed text elements.
4445
45-
"""
46+
"""
4647
file_elements = []
4748
for filename in elements:
4849
file_element = partition_text(

clarifai_datautils/text/pipeline/base.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import os
2-
from typing import List
2+
from typing import List, Type
33

44
from tqdm import tqdm
55

@@ -23,7 +23,7 @@ class Pipeline:
2323
def __init__(
2424
self,
2525
name: str,
26-
transformations: List,
26+
transformations: List[Type[BaseTransform]],
2727
):
2828
"""Initializes an Pipeline object.
2929
@@ -57,7 +57,7 @@ def run(self,
5757
List of transformed elements or ClarifaiDataLoader object.
5858
5959
Example:
60-
>>> from clarifai-datautils.text import Pipeline
60+
>>> from clarifai_datautils.text import Pipeline
6161
>>> dataloader = Pipeline().run(files = 'xx.pdf', loader = True))
6262
"""
6363
if files is None and folder is None:
@@ -99,8 +99,7 @@ def load() -> 'Pipeline':
9999
pass
100100

101101
def save(self,) -> None:
102-
"""Saves the pipeline to a yaml file.
103-
"""
102+
"""Saves the pipeline to a yaml file."""
104103
#TODO: Implement this
105104
pass
106105

0 commit comments

Comments
 (0)