[DEVX-454]: Added tests for Docx & Markdown Pipelines

srikanthbachala20 · srikanthbachala20 · commit 6cc75c4e7be8 · 2025-01-02T18:38:06.000+05:30
diff --git a/tests/pipelines/assets/DOCX_TestPage.docx b/tests/pipelines/assets/DOCX_TestPage.docx
diff --git a/tests/pipelines/assets/markdown-sample.md b/tests/pipelines/assets/markdown-sample.md
@@ -0,0 +1,157 @@
+An h1 header
+============
+
+Paragraphs are separated by a blank line.
+
+2nd paragraph. *Italic*, **bold**, and `monospace`. Itemized lists
+look like:
+
+  * this one
+  * that one
+  * the other one
+
+Note that --- not considering the asterisk --- the actual text
+content starts at 4-columns in.
+
+> Block quotes are
+> written like so.
+>
+> They can span multiple paragraphs,
+> if you like.
+
+Use 3 dashes for an em-dash. Use 2 dashes for ranges (ex., "it's all
+in chapters 12--14"). Three dots ... will be converted to an ellipsis.
+Unicode is supported. ☺
+
+
+
+An h2 header
+------------
+
+Here's a numbered list:
+
+ 1. first item
+ 2. second item
+ 3. third item
+
+Note again how the actual text starts at 4 columns in (4 characters
+from the left side). Here's a code sample:
+
+    # Let me re-iterate ...
+    for i in 1 .. 10 { do-something(i) }
+
+As you probably guessed, indented 4 spaces. By the way, instead of
+indenting the block, you can use delimited blocks, if you like:
+
+~~~
+define foobar() {
+    print "Welcome to flavor country!";
+}
+~~~
+
+(which makes copying & pasting easier). You can optionally mark the
+delimited block for Pandoc to syntax highlight it:
+
+~~~python
+import time
+# Quick, count to ten!
+for i in range(10):
+    # (but not *too* quick)
+    time.sleep(0.5)
+    print i
+~~~
+
+
+
+### An h3 header ###
+
+Now a nested list:
+
+ 1. First, get these ingredients:
+
+      * carrots
+      * celery
+      * lentils
+
+ 2. Boil some water.
+
+ 3. Dump everything in the pot and follow
+    this algorithm:
+
+        find wooden spoon
+        uncover pot
+        stir
+        cover pot
+        balance wooden spoon precariously on pot handle
+        wait 10 minutes
+        goto first step (or shut off burner when done)
+
+    Do not bump wooden spoon or it will fall.
+
+Notice again how text always lines up on 4-space indents (including
+that last line which continues item 3 above).
+
+Here's a link to [a website](http://foo.bar), to a [local
+doc](local-doc.html), and to a [section heading in the current
+doc](#an-h2-header). Here's a footnote [^1].
+
+[^1]: Footnote text goes here.
+
+Tables can look like this:
+
+size  material      color
+----  ------------  ------------
+9     leather       brown
+10    hemp canvas   natural
+11    glass         transparent
+
+Table: Shoes, their sizes, and what they're made of
+
+(The above is the caption for the table.) Pandoc also supports
+multi-line tables:
+
+--------  -----------------------
+keyword   text
+--------  -----------------------
+red       Sunsets, apples, and
+          other red or reddish
+          things.
+
+green     Leaves, grass, frogs
+          and other things it's
+          not easy being.
+--------  -----------------------
+
+A horizontal rule follows.
+
+***
+
+Here's a definition list:
+
+apples
+  : Good for making applesauce.
+oranges
+  : Citrus!
+tomatoes
+  : There's no "e" in tomatoe.
+
+Again, text is indented 4 spaces. (Put a blank line between each
+term/definition pair to spread things out more.)
+
+Here's a "line block":
+
+| Line one
+|   Line too
+| Line tree
+
+and images can be specified like so:
+
+![example image](example-image.jpg "An exemplary image")
+
+Inline math equations go in like so: $\omega = d\phi / dt$. Display
+math should get its own line and be put in in double-dollarsigns:
+
+$$I = \int \rho R^{2} dV$$
+
+And note that you can backslash-escape any punctuation characters
+which you wish to be displayed literally, ex.: \`foo\`, \*bar\*, etc.
diff --git a/tests/pipelines/test_docx_pipelines.py b/tests/pipelines/test_docx_pipelines.py
@@ -0,0 +1,52 @@
+import os.path as osp
+
+from clarifai_datautils.multimodal import DocxPartition, Pipeline
+from clarifai_datautils.multimodal.pipeline.cleaners import Clean_extra_whitespace
+from clarifai_datautils.multimodal.pipeline.extractors import ExtractTextAfter
+
+DOCX_FILE_PATH = osp.abspath(osp.join(osp.dirname(__file__), "assets", "DOCX_TestPage.docx"))
+
+
+class TestDocxPipelines:
+  """Tests for building and running pipelines that start with DocxPartition."""
+
+  def test_pipeline(self,):
+    """Checks that a Pipeline keeps the name and the two transformations it is given.
+    """
+
+    pipeline = Pipeline(
+        name='pipeline-1',
+        transformations=[
+            DocxPartition(chunking_strategy="by_title", max_characters=1024),
+            Clean_extra_whitespace(),
+        ])
+    assert pipeline.name == 'pipeline-1'
+    assert len(pipeline.transformations) == 2
+
+  def test_pipeline_run(self,):
+    """Runs the pipeline on the DOCX asset; checks chunk count, text and metadata."""
+    pipeline = Pipeline(
+        name='pipeline-1',
+        transformations=[
+            DocxPartition(chunking_strategy="by_title", max_characters=1024),
+            Clean_extra_whitespace(),
+            ExtractTextAfter(key='text_after', string='Test Complete,')
+        ])
+    elements = pipeline.run(files=DOCX_FILE_PATH)
+    assert len(elements) == 1  # a single chunk is expected at max_characters=1024
+    assert elements[0].text[:9] == 'Test Page'
+    assert elements[0].metadata['filename'] == 'DOCX_TestPage.docx'
+    assert elements[0].metadata['text_after'] == 'you may close this File.'
+
+  def test_pipeline_run_chunker(self,):
+    """Runs with max_characters=100 and checks the resulting chunk count and metadata."""
+    pipeline = Pipeline(
+        name='pipeline-1',
+        transformations=[
+            DocxPartition(chunking_strategy="by_title", max_characters=100),
+            Clean_extra_whitespace(),
+        ])
+    elements = pipeline.run(files=DOCX_FILE_PATH)
+    assert len(elements) == 6  # chunks expected at max_characters=100
+    assert elements[0].metadata['filename'] == 'DOCX_TestPage.docx'
+    assert elements[0].metadata['languages'] == ['eng']
diff --git a/tests/pipelines/test_markdown_pipelines.py b/tests/pipelines/test_markdown_pipelines.py
@@ -0,0 +1,52 @@
+import os.path as osp
+
+from clarifai_datautils.multimodal import MarkdownPartition, Pipeline
+from clarifai_datautils.multimodal.pipeline.cleaners import Clean_extra_whitespace
+from clarifai_datautils.multimodal.pipeline.extractors import ExtractTextAfter
+
+MARKDOWN_FILE_PATH = osp.abspath(osp.join(osp.dirname(__file__), "assets", "markdown-sample.md"))
+
+
+class TestMarkdownPipelines:
+  """Tests for building and running pipelines that start with MarkdownPartition."""
+
+  def test_pipeline(self,):
+    """Checks that a Pipeline keeps the name and the two transformations it is given.
+    """
+
+    pipeline = Pipeline(
+        name='pipeline-1',
+        transformations=[
+            MarkdownPartition(chunking_strategy="by_title", max_characters=1024),
+            Clean_extra_whitespace(),
+        ])
+    assert pipeline.name == 'pipeline-1'
+    assert len(pipeline.transformations) == 2
+
+  def test_pipeline_run(self,):
+    """Runs the pipeline on the markdown asset; checks chunk count, text and metadata."""
+    pipeline = Pipeline(
+        name='pipeline-1',
+        transformations=[
+            MarkdownPartition(chunking_strategy="by_title", max_characters=1024),
+            Clean_extra_whitespace(),
+            ExtractTextAfter(key='text_after', string='will be converted to an ellipsis. ')
+        ])
+    elements = pipeline.run(files=MARKDOWN_FILE_PATH)
+    assert len(elements) == 4  # chunks expected at max_characters=1024
+    assert elements[0].text[:9] == 'An h1 hea'
+    assert elements[0].metadata['filename'] == 'markdown-sample.md'
+    assert elements[0].metadata['text_after'] == 'Unicode is supported. ☺'
+
+  def test_pipeline_run_chunker(self,):
+    """Runs with max_characters=100 and checks the resulting chunk count and metadata."""
+    pipeline = Pipeline(
+        name='pipeline-1',
+        transformations=[
+            MarkdownPartition(chunking_strategy="by_title", max_characters=100),
+            Clean_extra_whitespace(),
+        ])
+    elements = pipeline.run(files=MARKDOWN_FILE_PATH)
+    assert len(elements) == 43  # chunks expected at max_characters=100
+    assert elements[0].metadata['filename'] == 'markdown-sample.md'
+    assert elements[0].metadata['languages'] == ['eng']
diff --git a/tests/pipelines/test_ready_to_use_pipelines.py b/tests/pipelines/test_ready_to_use_pipelines.py
@@ -59,3 +59,21 @@ def test_pipeline_standard_text(self,):
     assert pipeline.transformations[0].__class__.__name__ == 'TextPartition'
     assert pipeline.transformations[1].__class__.__name__ == 'Clean_extra_whitespace'
     assert pipeline.transformations[2].__class__.__name__ == 'Group_broken_paragraphs'
+
+  def test_pipeline_standard_docx(self,):
+    """Loads 'standard_docx' and checks its name and its three transformations, in order."""
+    pipeline = Pipeline.load(name='standard_docx')
+    assert pipeline.name == 'standard_docx'
+    assert len(pipeline.transformations) == 3
+    assert pipeline.transformations[0].__class__.__name__ == 'DocxPartition'
+    assert pipeline.transformations[1].__class__.__name__ == 'Clean_extra_whitespace'
+    assert pipeline.transformations[2].__class__.__name__ == 'Group_broken_paragraphs'
+
+  def test_pipeline_standard_markdown(self,):
+    """Loads 'standard_markdown' and checks its name and its three transformations, in order."""
+    pipeline = Pipeline.load(name='standard_markdown')
+    assert pipeline.name == 'standard_markdown'
+    assert len(pipeline.transformations) == 3
+    assert pipeline.transformations[0].__class__.__name__ == 'MarkdownPartition'
+    assert pipeline.transformations[1].__class__.__name__ == 'Clean_extra_whitespace'
+    assert pipeline.transformations[2].__class__.__name__ == 'Group_broken_paragraphs'
diff --git a/tests/pipelines/test_text_pipelines.py b/tests/pipelines/test_text_pipelines.py
@@ -8,7 +8,7 @@
     osp.join(osp.dirname(__file__), "assets", "book-war-and-peace-1p.txt"))
 
 
-class TestPDFPipelines:
+class TestTextPipelines:
   """Tests for pipeline transformations."""
 
   def test_pipeline(self,):