Merge pull request #32 from Clarifai/DEVX-454-Data-Ingestion-Pipeline-for-Other-Formats

srikanthbachala20 · web-flow · commit 13cf2e76ba02 · 2025-01-02T19:34:14.000+05:30
[DEVX-454]: Added Support for Docx & Markdown in Data Ingestion Pipeline
diff --git a/clarifai_datautils/base/features.py b/clarifai_datautils/base/features.py
@@ -45,7 +45,7 @@ class VisualSegmentationFeatures:
 class TextFeatures:
   """Text classification datasets preprocessing output features."""
   text: str
-  labels: List[Union[str, int]]  # List[str or int] to cater for multi-class tasks
+  labels: List[Union[str, int]] = None  # List[str or int] to cater for multi-class tasks
   id: Optional[int] = None  # text_id
   metadata: Optional[dict] = None
   label_ids: Optional[List[str]] = None
diff --git a/clarifai_datautils/multimodal/__init__.py b/clarifai_datautils/multimodal/__init__.py
@@ -1,5 +1,10 @@
 from clarifai_datautils.multimodal.pipeline.base import Pipeline
+from clarifai_datautils.multimodal.pipeline.Docx import DocxPartition
+from clarifai_datautils.multimodal.pipeline.Markdown import MarkdownPartition
 from clarifai_datautils.multimodal.pipeline.PDF import PDFPartition
 from clarifai_datautils.multimodal.pipeline.Text import TextPartition
 
-__all__ = ['Pipeline', 'PDFPartition', 'TextPartition', 'PDFPartitionMultimodal']
+__all__ = [
+    'Pipeline', 'PDFPartition', 'TextPartition', 'PDFPartitionMultimodal', 'DocxPartition',
+    'MarkdownPartition'
+]
diff --git a/clarifai_datautils/multimodal/pipeline/Docx.py b/clarifai_datautils/multimodal/pipeline/Docx.py
@@ -0,0 +1,64 @@
+from typing import List
+try:
+  from unstructured.partition.docx import partition_docx
+except ImportError:
+  raise ImportError(
+      "Could not import unstructured package. "
+      "Please install it with `pip install 'unstructured[pdf] @ git+https://github.com/clarifai/unstructured.git@support_clarifai_model'`."
+  )
+
+from clarifai_datautils.constants.pipeline import MAX_CHARACTERS
+
+from .basetransform import BaseTransform
+
+
+class DocxPartition(BaseTransform):
+  """Partitions Docx files into chunked elements using `partition_docx`."""
+
+  def __init__(self,
+               chunking_strategy: str = "basic",
+               max_characters=MAX_CHARACTERS,
+               overlap=None,
+               overlap_all=True,
+               **kwargs):
+    """Initializes a DocxPartition object.
+
+    Args:
+        chunking_strategy (str): "basic" or "by_title"; any other value raises ValueError.
+        max_characters (int): Maximum number of characters in a chunk.
+        overlap (int): Number of characters to overlap between chunks.
+        overlap_all (bool): Whether to overlap all chunks.
+        kwargs: Additional keyword arguments forwarded unchanged to `partition_docx`.
+
+    """
+    if chunking_strategy not in ["basic", "by_title"]:
+      raise ValueError("chunking_strategy should be either 'basic' or 'by_title'.")
+    self.chunking_strategy = chunking_strategy
+    self.max_characters = max_characters
+    self.overlap = overlap
+    self.overlap_all = overlap_all
+    self.kwargs = kwargs
+
+  def __call__(self, elements: List[str]) -> List[str]:
+    """Partitions every given Docx file and concatenates the results.
+
+    Args:
+        elements (List[str]): Docx file names; each is passed as `filename` to `partition_docx`.
+
+    Returns:
+        Flat list of the elements returned by `partition_docx`, in input-file order.
+
+    """
+    file_elements = []
+    for filename in elements:
+      file_element = partition_docx(
+          filename=filename,
+          chunking_strategy=self.chunking_strategy,
+          max_characters=self.max_characters,
+          overlap=self.overlap,
+          overlap_all=self.overlap_all,
+          **self.kwargs)
+      file_elements.extend(file_element)
+      del file_element
+
+    return file_elements
diff --git a/clarifai_datautils/multimodal/pipeline/Markdown.py b/clarifai_datautils/multimodal/pipeline/Markdown.py
@@ -0,0 +1,64 @@
+from typing import List
+try:
+  from unstructured.partition.md import partition_md
+except ImportError:
+  raise ImportError(
+      "Could not import unstructured package. "
+      "Please install it with `pip install 'unstructured[pdf] @ git+https://github.com/clarifai/unstructured.git@support_clarifai_model'`."
+  )
+
+from clarifai_datautils.constants.pipeline import MAX_CHARACTERS
+
+from .basetransform import BaseTransform
+
+
+class MarkdownPartition(BaseTransform):
+  """Partitions Markdown files into chunked elements using `partition_md`."""
+
+  def __init__(self,
+               chunking_strategy: str = "basic",
+               max_characters=MAX_CHARACTERS,
+               overlap=None,
+               overlap_all=True,
+               **kwargs):
+    """Initializes a MarkdownPartition object.
+
+    Args:
+        chunking_strategy (str): "basic" or "by_title"; any other value raises ValueError.
+        max_characters (int): Maximum number of characters in a chunk.
+        overlap (int): Number of characters to overlap between chunks.
+        overlap_all (bool): Whether to overlap all chunks.
+        kwargs: Additional keyword arguments forwarded unchanged to `partition_md`.
+
+    """
+    if chunking_strategy not in ["basic", "by_title"]:
+      raise ValueError("chunking_strategy should be either 'basic' or 'by_title'.")
+    self.chunking_strategy = chunking_strategy
+    self.max_characters = max_characters
+    self.overlap = overlap
+    self.overlap_all = overlap_all
+    self.kwargs = kwargs
+
+  def __call__(self, elements: List[str]) -> List[str]:
+    """Partitions every given Markdown file and concatenates the results.
+
+    Args:
+        elements (List[str]): Markdown file names; each is passed as `filename` to `partition_md`.
+
+    Returns:
+        Flat list of the elements returned by `partition_md`, in input-file order.
+
+    """
+    file_elements = []
+    for filename in elements:
+      file_element = partition_md(
+          filename=filename,
+          chunking_strategy=self.chunking_strategy,
+          max_characters=self.max_characters,
+          overlap=self.overlap,
+          overlap_all=self.overlap_all,
+          **self.kwargs)
+      file_elements.extend(file_element)
+      del file_element
+
+    return file_elements
diff --git a/clarifai_datautils/multimodal/pipeline/README.md b/clarifai_datautils/multimodal/pipeline/README.md
@@ -41,6 +41,8 @@ dataset.upload_dataset(pipeline.run(files = file_path, loader = True))
 ## Supported File Formats
 - PDF
 - Text(.txt)
+- DOCX(.docx)
+- Markdown(.md)
 
 
 ## Resources
diff --git a/clarifai_datautils/multimodal/pipeline/Text.py b/clarifai_datautils/multimodal/pipeline/Text.py
@@ -13,7 +13,7 @@
 
 
 class TextPartition(BaseTransform):
-  """Partitions PDF file into text elements."""
+  """Partitions Text file into text elements."""
 
   def __init__(self,
                chunking_strategy: str = "basic",
diff --git a/clarifai_datautils/multimodal/pipeline/base.py b/clarifai_datautils/multimodal/pipeline/base.py
@@ -102,6 +102,8 @@ def load(self, name) -> 'Pipeline':
         'ocr_pdf': Custom_Pipelines.ocr_pdf_pipeline(),
         'structured_pdf': Custom_Pipelines.structured_pdf_pipeline(),
         'standard_text': Custom_Pipelines.standard_text_pipeline(),
+        'standard_docx': Custom_Pipelines.standard_docx_pipeline(),
+        'standard_markdown': Custom_Pipelines.standard_markdown_pipeline(),
     }
     try:
       if self.name in self.custom_pipelines_map:
diff --git a/clarifai_datautils/multimodal/pipeline/custom_pipeline.py b/clarifai_datautils/multimodal/pipeline/custom_pipeline.py
@@ -1,8 +1,10 @@
 from clarifai_datautils.constants.pipeline import *  # noqa: F403
 from clarifai_datautils.multimodal.pipeline.cleaners import (Clean_extra_whitespace,
                                                              Group_broken_paragraphs)
+from clarifai_datautils.multimodal.pipeline.Docx import DocxPartition
 from clarifai_datautils.multimodal.pipeline.extractors import (ExtractDateTimeTz,
                                                                ExtractEmailAddress)
+from clarifai_datautils.multimodal.pipeline.Markdown import MarkdownPartition
 from clarifai_datautils.multimodal.pipeline.PDF import PDFPartition
 from clarifai_datautils.multimodal.pipeline.Text import TextPartition
 
@@ -47,3 +49,17 @@ def standard_text_pipeline():
         Clean_extra_whitespace(),
         Group_broken_paragraphs(),
     ]
+
+  def standard_docx_pipeline():  # Docx preset: DocxPartition (max 1024 chars per chunk), then the two cleaners.
+    return [
+        DocxPartition(max_characters=1024, overlap=None),
+        Clean_extra_whitespace(),
+        Group_broken_paragraphs(),
+    ]
+
+  def standard_markdown_pipeline():  # Markdown preset: MarkdownPartition (max 1024 chars per chunk), then the two cleaners.
+    return [
+        MarkdownPartition(max_characters=1024, overlap=None),
+        Clean_extra_whitespace(),
+        Group_broken_paragraphs(),
+    ]
diff --git a/clarifai_datautils/multimodal/pipeline/loaders.py b/clarifai_datautils/multimodal/pipeline/loaders.py
@@ -39,8 +39,7 @@ def __getitem__(self, index: int):
     if self.elements[index].to_dict()['type'] == 'Table':
       meta['type'] = 'table'
 
-    return MultiModalFeatures(
-        text=text, image_bytes=image_data, labels=[self.pipeline_name], metadata=meta, id=id)
+    return MultiModalFeatures(text=text, image_bytes=image_data, metadata=meta, id=id)
 
   def __len__(self):
     return len(self.elements)
@@ -65,10 +64,7 @@ def __getitem__(self, index: int):
     id = self.elements[index].to_dict().get('element_id', None)
     id = id[:48] if id is not None else None
     return TextFeatures(
-        text=self.elements[index].text,
-        labels=self.pipeline_name,
-        metadata=self.elements[index].metadata.to_dict(),
-        id=id)
+        text=self.elements[index].text, metadata=self.elements[index].metadata.to_dict(), id=id)
 
   def __len__(self):
     return len(self.elements)
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -3,3 +3,5 @@ unstructured[pdf] @ git+https://github.com/clarifai/unstructured.git@support_cla
 llama-index-core==0.10.33
 llama-index-llms-clarifai==0.1.2
 pi_heif==0.18.0
+markdown==3.7
+python-docx==1.1.2
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,5 @@
 llama-index-core==0.10.33
 llama-index-llms-clarifai==0.1.2
 pi_heif==0.18.0
+markdown==3.7
+python-docx==1.1.2
diff --git a/tests/pipelines/assets/DOCX_TestPage.docx b/tests/pipelines/assets/DOCX_TestPage.docx
diff --git a/tests/pipelines/assets/markdown-sample.md b/tests/pipelines/assets/markdown-sample.md
@@ -0,0 +1,157 @@
+An h1 header
+============
+
+Paragraphs are separated by a blank line.
+
+2nd paragraph. *Italic*, **bold**, and `monospace`. Itemized lists
+look like:
+
+  * this one
+  * that one
+  * the other one
+
+Note that --- not considering the asterisk --- the actual text
+content starts at 4-columns in.
+
+> Block quotes are
+> written like so.
+>
+> They can span multiple paragraphs,
+> if you like.
+
+Use 3 dashes for an em-dash. Use 2 dashes for ranges (ex., "it's all
+in chapters 12--14"). Three dots ... will be converted to an ellipsis.
+Unicode is supported. ☺
+
+
+
+An h2 header
+------------
+
+Here's a numbered list:
+
+ 1. first item
+ 2. second item
+ 3. third item
+
+Note again how the actual text starts at 4 columns in (4 characters
+from the left side). Here's a code sample:
+
+    # Let me re-iterate ...
+    for i in 1 .. 10 { do-something(i) }
+
+As you probably guessed, indented 4 spaces. By the way, instead of
+indenting the block, you can use delimited blocks, if you like:
+
+~~~
+define foobar() {
+    print "Welcome to flavor country!";
+}
+~~~
+
+(which makes copying & pasting easier). You can optionally mark the
+delimited block for Pandoc to syntax highlight it:
+
+~~~python
+import time
+# Quick, count to ten!
+for i in range(10):
+    # (but not *too* quick)
+    time.sleep(0.5)
+    print i
+~~~
+
+
+
+### An h3 header ###
+
+Now a nested list:
+
+ 1. First, get these ingredients:
+
+      * carrots
+      * celery
+      * lentils
+
+ 2. Boil some water.
+
+ 3. Dump everything in the pot and follow
+    this algorithm:
+
+        find wooden spoon
+        uncover pot
+        stir
+        cover pot
+        balance wooden spoon precariously on pot handle
+        wait 10 minutes
+        goto first step (or shut off burner when done)
+
+    Do not bump wooden spoon or it will fall.
+
+Notice again how text always lines up on 4-space indents (including
+that last line which continues item 3 above).
+
+Here's a link to [a website](http://foo.bar), to a [local
+doc](local-doc.html), and to a [section heading in the current
+doc](#an-h2-header). Here's a footnote [^1].
+
+[^1]: Footnote text goes here.
+
+Tables can look like this:
+
+size  material      color
+----  ------------  ------------
+9     leather       brown
+10    hemp canvas   natural
+11    glass         transparent
+
+Table: Shoes, their sizes, and what they're made of
+
+(The above is the caption for the table.) Pandoc also supports
+multi-line tables:
+
+--------  -----------------------
+keyword   text
+--------  -----------------------
+red       Sunsets, apples, and
+          other red or reddish
+          things.
+
+green     Leaves, grass, frogs
+          and other things it's
+          not easy being.
+--------  -----------------------
+
+A horizontal rule follows.
+
+***
+
+Here's a definition list:
+
+apples
+  : Good for making applesauce.
+oranges
+  : Citrus!
+tomatoes
+  : There's no "e" in tomatoe.
+
+Again, text is indented 4 spaces. (Put a blank line between each
+term/definition pair to spread things out more.)
+
+Here's a "line block":
+
+| Line one
+|   Line too
+| Line tree
+
+and images can be specified like so:
+
+![example image](example-image.jpg "An exemplary image")
+
+Inline math equations go in like so: $\omega = d\phi / dt$. Display
+math should get its own line and be put in in double-dollarsigns:
+
+$$I = \int \rho R^{2} dV$$
+
+And note that you can backslash-escape any punctuation characters
+which you wish to be displayed literally, ex.: \`foo\`, \*bar\*, etc.
diff --git a/tests/pipelines/test_docx_pipelines.py b/tests/pipelines/test_docx_pipelines.py
diff --git a/tests/pipelines/test_markdown_pipelines.py b/tests/pipelines/test_markdown_pipelines.py
diff --git a/tests/pipelines/test_ready_to_use_pipelines.py b/tests/pipelines/test_ready_to_use_pipelines.py
diff --git a/tests/pipelines/test_text_pipelines.py b/tests/pipelines/test_text_pipelines.py

Original file line number	Diff line number	Diff line change
`@@ -102,6 +102,8 @@ def load(self, name) -> 'Pipeline':`
`102`	`102`	`'ocr_pdf': Custom_Pipelines.ocr_pdf_pipeline(),`
`103`	`103`	`'structured_pdf': Custom_Pipelines.structured_pdf_pipeline(),`
`104`	`104`	`'standard_text': Custom_Pipelines.standard_text_pipeline(),`
	`105`	`+ 'standard_docx': Custom_Pipelines.standard_docx_pipeline(),`
	`106`	`+ 'standard_markdown': Custom_Pipelines.standard_markdown_pipeline(),`
`105`	`107`	`}`
`106`	`108`	`try:`
`107`	`109`	`if self.name in self.custom_pipelines_map:`