Skip to content

Commit 13cf2e7

Browse files
Merge pull request #32 from Clarifai/DEVX-454-Data-Ingestion-Pipeline-for-Other-Formats
[DEVX-454]: Added Support for Docx & Markdown in Data Ingestion Pipeline
2 parents 765e959 + 6cc75c4 commit 13cf2e7

File tree

17 files changed

+442
-10
lines changed

17 files changed

+442
-10
lines changed

clarifai_datautils/base/features.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ class VisualSegmentationFeatures:
4545
class TextFeatures:
4646
"""Text classification datasets preprocessing output features."""
4747
text: str
48-
labels: List[Union[str, int]] # List[str or int] to cater for multi-class tasks
48+
labels: List[Union[str, int]] = None # List[str or int] to cater for multi-class tasks
4949
id: Optional[int] = None # text_id
5050
metadata: Optional[dict] = None
5151
label_ids: Optional[List[str]] = None
Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
from clarifai_datautils.multimodal.pipeline.base import Pipeline
2+
from clarifai_datautils.multimodal.pipeline.Docx import DocxPartition
3+
from clarifai_datautils.multimodal.pipeline.Markdown import MarkdownPartition
24
from clarifai_datautils.multimodal.pipeline.PDF import PDFPartition
35
from clarifai_datautils.multimodal.pipeline.Text import TextPartition
46

5-
__all__ = ['Pipeline', 'PDFPartition', 'TextPartition', 'PDFPartitionMultimodal']
7+
__all__ = [
8+
'Pipeline', 'PDFPartition', 'TextPartition', 'PDFPartitionMultimodal', 'DocxPartition',
9+
'MarkdownPartition'
10+
]
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
from typing import List
2+
try:
3+
from unstructured.partition.docx import partition_docx
4+
except ImportError:
5+
raise ImportError(
6+
"Could not import unstructured package. "
7+
"Please install it with `pip install 'unstructured[pdf] @ git+https://github.com/clarifai/unstructured.git@support_clarifai_model'`."
8+
)
9+
10+
from clarifai_datautils.constants.pipeline import MAX_CHARACTERS
11+
12+
from .basetransform import BaseTransform
13+
14+
15+
class DocxPartition(BaseTransform):
    """Transform that partitions Docx files into chunked elements.

    Each call hands every input path to ``partition_docx`` together with the
    chunking options captured at construction time.
    """

    def __init__(self,
                 chunking_strategy: str = "basic",
                 max_characters=MAX_CHARACTERS,
                 overlap=None,
                 overlap_all=True,
                 **kwargs):
        """Capture the partitioning options used by later calls.

        Args:
            chunking_strategy (str): Chunking strategy, "basic" or "by_title".
            max_characters (int): Maximum number of characters in a chunk.
            overlap (int): Number of characters to overlap between chunks.
            overlap_all (bool): Whether to overlap all chunks.
            kwargs: Extra keyword arguments forwarded unchanged to
                ``partition_docx`` on every call.

        Raises:
            ValueError: If ``chunking_strategy`` is neither "basic" nor "by_title".
        """
        supported_strategies = ("basic", "by_title")
        if chunking_strategy not in supported_strategies:
            raise ValueError("chunking_strategy should be either 'basic' or 'by_title'.")

        self.kwargs = kwargs
        self.overlap_all = overlap_all
        self.overlap = overlap
        self.max_characters = max_characters
        self.chunking_strategy = chunking_strategy

    def __call__(self, elements: List[str]) -> List[str]:
        """Partition every given Docx file and concatenate the results.

        Args:
            elements (List[str]): Paths of the Docx files to partition; each
                entry is passed to ``partition_docx`` as ``filename``.

        Returns:
            A single flat list holding the output of ``partition_docx`` for
            each file, in the order the files were given.
        """
        partitioned = []
        for docx_path in elements:
            # Results of each file are appended in place, preserving input order.
            partitioned += partition_docx(
                filename=docx_path,
                chunking_strategy=self.chunking_strategy,
                max_characters=self.max_characters,
                overlap=self.overlap,
                overlap_all=self.overlap_all,
                **self.kwargs)
        return partitioned
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
from typing import List
2+
try:
3+
from unstructured.partition.md import partition_md
4+
except ImportError:
5+
raise ImportError(
6+
"Could not import unstructured package. "
7+
"Please install it with `pip install 'unstructured[pdf] @ git+https://github.com/clarifai/unstructured.git@support_clarifai_model'`."
8+
)
9+
10+
from clarifai_datautils.constants.pipeline import MAX_CHARACTERS
11+
12+
from .basetransform import BaseTransform
13+
14+
15+
class MarkdownPartition(BaseTransform):
    """Transform that partitions Markdown files into chunked elements.

    Each call hands every input path to ``partition_md`` together with the
    chunking options captured at construction time.
    """

    def __init__(self,
                 chunking_strategy: str = "basic",
                 max_characters=MAX_CHARACTERS,
                 overlap=None,
                 overlap_all=True,
                 **kwargs):
        """Capture the partitioning options used by later calls.

        Args:
            chunking_strategy (str): Chunking strategy, "basic" or "by_title".
            max_characters (int): Maximum number of characters in a chunk.
            overlap (int): Number of characters to overlap between chunks.
            overlap_all (bool): Whether to overlap all chunks.
            kwargs: Extra keyword arguments forwarded unchanged to
                ``partition_md`` on every call.

        Raises:
            ValueError: If ``chunking_strategy`` is neither "basic" nor "by_title".
        """
        supported_strategies = ("basic", "by_title")
        if chunking_strategy not in supported_strategies:
            raise ValueError("chunking_strategy should be either 'basic' or 'by_title'.")

        self.kwargs = kwargs
        self.overlap_all = overlap_all
        self.overlap = overlap
        self.max_characters = max_characters
        self.chunking_strategy = chunking_strategy

    def __call__(self, elements: List[str]) -> List[str]:
        """Partition every given Markdown file and concatenate the results.

        Args:
            elements (List[str]): Paths of the Markdown files to partition;
                each entry is passed to ``partition_md`` as ``filename``.

        Returns:
            A single flat list holding the output of ``partition_md`` for
            each file, in the order the files were given.
        """
        partitioned = []
        for md_path in elements:
            # Results of each file are appended in place, preserving input order.
            partitioned += partition_md(
                filename=md_path,
                chunking_strategy=self.chunking_strategy,
                max_characters=self.max_characters,
                overlap=self.overlap,
                overlap_all=self.overlap_all,
                **self.kwargs)
        return partitioned

clarifai_datautils/multimodal/pipeline/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ dataset.upload_dataset(pipeline.run(files = file_path, loader = True))
4141
## Supported File Formats
4242
- PDF
4343
- Text(.txt)
44+
- Docx(.docx)
45+
- Markdown(.md)
4446

4547

4648
## Resources

clarifai_datautils/multimodal/pipeline/Text.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414

1515
class TextPartition(BaseTransform):
16-
"""Partitions PDF file into text elements."""
16+
"""Partitions Text file into text elements."""
1717

1818
def __init__(self,
1919
chunking_strategy: str = "basic",

clarifai_datautils/multimodal/pipeline/base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,8 @@ def load(self, name) -> 'Pipeline':
102102
'ocr_pdf': Custom_Pipelines.ocr_pdf_pipeline(),
103103
'structured_pdf': Custom_Pipelines.structured_pdf_pipeline(),
104104
'standard_text': Custom_Pipelines.standard_text_pipeline(),
105+
'standard_docx': Custom_Pipelines.standard_docx_pipeline(),
106+
'standard_markdown': Custom_Pipelines.standard_markdown_pipeline(),
105107
}
106108
try:
107109
if self.name in self.custom_pipelines_map:

clarifai_datautils/multimodal/pipeline/custom_pipeline.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
from clarifai_datautils.constants.pipeline import * # noqa: F403
22
from clarifai_datautils.multimodal.pipeline.cleaners import (Clean_extra_whitespace,
33
Group_broken_paragraphs)
4+
from clarifai_datautils.multimodal.pipeline.Docx import DocxPartition
45
from clarifai_datautils.multimodal.pipeline.extractors import (ExtractDateTimeTz,
56
ExtractEmailAddress)
7+
from clarifai_datautils.multimodal.pipeline.Markdown import MarkdownPartition
68
from clarifai_datautils.multimodal.pipeline.PDF import PDFPartition
79
from clarifai_datautils.multimodal.pipeline.Text import TextPartition
810

@@ -47,3 +49,17 @@ def standard_text_pipeline():
4749
Clean_extra_whitespace(),
4850
Group_broken_paragraphs(),
4951
]
52+
53+
def standard_docx_pipeline():
    """Build the standard list of transforms for Docx files.

    Returns:
        A list with a ``DocxPartition`` step (1024-character chunks, no
        overlap) followed by the whitespace and broken-paragraph cleaners.
    """
    partition_step = DocxPartition(max_characters=1024, overlap=None)
    whitespace_step = Clean_extra_whitespace()
    paragraph_step = Group_broken_paragraphs()
    return [partition_step, whitespace_step, paragraph_step]
59+
60+
def standard_markdown_pipeline():
    """Build the standard list of transforms for Markdown files.

    Returns:
        A list with a ``MarkdownPartition`` step (1024-character chunks, no
        overlap) followed by the whitespace and broken-paragraph cleaners.
    """
    partition_step = MarkdownPartition(max_characters=1024, overlap=None)
    whitespace_step = Clean_extra_whitespace()
    paragraph_step = Group_broken_paragraphs()
    return [partition_step, whitespace_step, paragraph_step]

clarifai_datautils/multimodal/pipeline/loaders.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,7 @@ def __getitem__(self, index: int):
3939
if self.elements[index].to_dict()['type'] == 'Table':
4040
meta['type'] = 'table'
4141

42-
return MultiModalFeatures(
43-
text=text, image_bytes=image_data, labels=[self.pipeline_name], metadata=meta, id=id)
42+
return MultiModalFeatures(text=text, image_bytes=image_data, metadata=meta, id=id)
4443

4544
def __len__(self):
4645
return len(self.elements)
@@ -65,10 +64,7 @@ def __getitem__(self, index: int):
6564
id = self.elements[index].to_dict().get('element_id', None)
6665
id = id[:48] if id is not None else None
6766
return TextFeatures(
68-
text=self.elements[index].text,
69-
labels=self.pipeline_name,
70-
metadata=self.elements[index].metadata.to_dict(),
71-
id=id)
67+
text=self.elements[index].text, metadata=self.elements[index].metadata.to_dict(), id=id)
7268

7369
def __len__(self):
7470
return len(self.elements)

requirements-dev.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,5 @@ unstructured[pdf] @ git+https://github.com/clarifai/unstructured.git@support_cla
33
llama-index-core==0.10.33
44
llama-index-llms-clarifai==0.1.2
55
pi_heif==0.18.0
6+
markdown==3.7
7+
python-docx==1.1.2

0 commit comments

Comments
 (0)