Skip to content

Commit 6cc75c4

Browse files
[DEVX-454]: Added tests for Docx & Markdown Pipelines
1 parent 755bbd4 commit 6cc75c4

File tree

6 files changed

+280
-1
lines changed

6 files changed

+280
-1
lines changed
50.8 KB
Binary file not shown.
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
An h1 header
2+
============
3+
4+
Paragraphs are separated by a blank line.
5+
6+
2nd paragraph. *Italic*, **bold**, and `monospace`. Itemized lists
7+
look like:
8+
9+
* this one
10+
* that one
11+
* the other one
12+
13+
Note that --- not considering the asterisk --- the actual text
14+
content starts at 4-columns in.
15+
16+
> Block quotes are
17+
> written like so.
18+
>
19+
> They can span multiple paragraphs,
20+
> if you like.
21+
22+
Use 3 dashes for an em-dash. Use 2 dashes for ranges (ex., "it's all
23+
in chapters 12--14"). Three dots ... will be converted to an ellipsis.
24+
Unicode is supported. ☺
25+
26+
27+
28+
An h2 header
29+
------------
30+
31+
Here's a numbered list:
32+
33+
1. first item
34+
2. second item
35+
3. third item
36+
37+
Note again how the actual text starts at 4 columns in (4 characters
38+
from the left side). Here's a code sample:
39+
40+
# Let me re-iterate ...
41+
for i in 1 .. 10 { do-something(i) }
42+
43+
As you probably guessed, indented 4 spaces. By the way, instead of
44+
indenting the block, you can use delimited blocks, if you like:
45+
46+
~~~
47+
define foobar() {
48+
print "Welcome to flavor country!";
49+
}
50+
~~~
51+
52+
(which makes copying & pasting easier). You can optionally mark the
53+
delimited block for Pandoc to syntax highlight it:
54+
55+
~~~python
56+
import time
57+
# Quick, count to ten!
58+
for i in range(10):
59+
# (but not *too* quick)
60+
time.sleep(0.5)
61+
print i
62+
~~~
63+
64+
65+
66+
### An h3 header ###
67+
68+
Now a nested list:
69+
70+
1. First, get these ingredients:
71+
72+
* carrots
73+
* celery
74+
* lentils
75+
76+
2. Boil some water.
77+
78+
3. Dump everything in the pot and follow
79+
this algorithm:
80+
81+
find wooden spoon
82+
uncover pot
83+
stir
84+
cover pot
85+
balance wooden spoon precariously on pot handle
86+
wait 10 minutes
87+
goto first step (or shut off burner when done)
88+
89+
Do not bump wooden spoon or it will fall.
90+
91+
Notice again how text always lines up on 4-space indents (including
92+
that last line which continues item 3 above).
93+
94+
Here's a link to [a website](http://foo.bar), to a [local
95+
doc](local-doc.html), and to a [section heading in the current
96+
doc](#an-h2-header). Here's a footnote [^1].
97+
98+
[^1]: Footnote text goes here.
99+
100+
Tables can look like this:
101+
102+
size material color
103+
---- ------------ ------------
104+
9 leather brown
105+
10 hemp canvas natural
106+
11 glass transparent
107+
108+
Table: Shoes, their sizes, and what they're made of
109+
110+
(The above is the caption for the table.) Pandoc also supports
111+
multi-line tables:
112+
113+
-------- -----------------------
114+
keyword text
115+
-------- -----------------------
116+
red Sunsets, apples, and
117+
other red or reddish
118+
things.
119+
120+
green Leaves, grass, frogs
121+
and other things it's
122+
not easy being.
123+
-------- -----------------------
124+
125+
A horizontal rule follows.
126+
127+
***
128+
129+
Here's a definition list:
130+
131+
apples
132+
: Good for making applesauce.
133+
oranges
134+
: Citrus!
135+
tomatoes
136+
: There's no "e" in tomatoe.
137+
138+
Again, text is indented 4 spaces. (Put a blank line between each
139+
term/definition pair to spread things out more.)
140+
141+
Here's a "line block":
142+
143+
| Line one
144+
| Line too
145+
| Line tree
146+
147+
and images can be specified like so:
148+
149+
![example image](example-image.jpg "An exemplary image")
150+
151+
Inline math equations go in like so: $\omega = d\phi / dt$. Display
152+
math should get its own line and be put in in double-dollarsigns:
153+
154+
$$I = \int \rho R^{2} dV$$
155+
156+
And note that you can backslash-escape any punctuation characters
157+
which you wish to be displayed literally, ex.: \`foo\`, \*bar\*, etc.
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import os.path as osp
2+
3+
from clarifai_datautils.multimodal import DocxPartition, Pipeline
4+
from clarifai_datautils.multimodal.pipeline.cleaners import Clean_extra_whitespace
5+
from clarifai_datautils.multimodal.pipeline.extractors import ExtractTextAfter
6+
7+
# Absolute path to the DOCX fixture stored in the "assets" directory that sits
# next to this test module.
DOCX_FILE_PATH = osp.abspath(
    osp.join(osp.dirname(__file__), "assets", "DOCX_TestPage.docx"))
8+
9+
10+
class TestDocxPipelines:
    """Pipeline construction and execution checks against the DOCX fixture."""

    def test_pipeline(self):
        """A freshly built pipeline exposes its name and both transformations."""
        steps = [
            DocxPartition(chunking_strategy="by_title", max_characters=1024),
            Clean_extra_whitespace(),
        ]
        docx_pipeline = Pipeline(name='pipeline-1', transformations=steps)

        assert docx_pipeline.name == 'pipeline-1'
        assert len(docx_pipeline.transformations) == 2

    def test_pipeline_run(self):
        """Run partition, whitespace cleanup and text-after extraction on the fixture.

        With a 1024-character limit the run is expected to yield one element
        whose metadata carries the text found after 'Test Complete,'.
        """
        docx_pipeline = Pipeline(
            name='pipeline-1',
            transformations=[
                DocxPartition(chunking_strategy="by_title", max_characters=1024),
                Clean_extra_whitespace(),
                ExtractTextAfter(key='text_after', string='Test Complete,'),
            ],
        )
        chunks = docx_pipeline.run(files=DOCX_FILE_PATH)

        assert len(chunks) == 1
        first_chunk = chunks[0]
        assert first_chunk.text[:9] == 'Test Page'
        assert first_chunk.metadata['filename'] == 'DOCX_TestPage.docx'
        assert first_chunk.metadata['text_after'] == 'you may close this File.'

    def test_pipeline_run_chunker(self):
        """Run with a 100-character limit and check the resulting chunk count.

        Also verifies the filename and language metadata of the first element.
        """
        docx_pipeline = Pipeline(
            name='pipeline-1',
            transformations=[
                DocxPartition(chunking_strategy="by_title", max_characters=100),
                Clean_extra_whitespace(),
            ],
        )
        chunks = docx_pipeline.run(files=DOCX_FILE_PATH)

        assert len(chunks) == 6
        first_chunk = chunks[0]
        assert first_chunk.metadata['filename'] == 'DOCX_TestPage.docx'
        assert first_chunk.metadata['languages'] == ['eng']
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import os.path as osp
2+
3+
from clarifai_datautils.multimodal import MarkdownPartition, Pipeline
4+
from clarifai_datautils.multimodal.pipeline.cleaners import Clean_extra_whitespace
5+
from clarifai_datautils.multimodal.pipeline.extractors import ExtractTextAfter
6+
7+
# Absolute path to the Markdown fixture stored in the "assets" directory that
# sits next to this test module.
MARKDOWN_FILE_PATH = osp.abspath(
    osp.join(osp.dirname(__file__), "assets", "markdown-sample.md"))
8+
9+
10+
class TestMarkdownPipelines:
    """Pipeline construction and execution checks against the Markdown fixture."""

    def test_pipeline(self):
        """A freshly built pipeline exposes its name and both transformations."""
        steps = [
            MarkdownPartition(chunking_strategy="by_title", max_characters=1024),
            Clean_extra_whitespace(),
        ]
        md_pipeline = Pipeline(name='pipeline-1', transformations=steps)

        assert md_pipeline.name == 'pipeline-1'
        assert len(md_pipeline.transformations) == 2

    def test_pipeline_run(self):
        """Run partition, whitespace cleanup and text-after extraction on the fixture.

        With a 1024-character limit the run is expected to yield four elements;
        the first one's metadata carries the text that follows the ellipsis
        sentence of the sample document.
        """
        md_pipeline = Pipeline(
            name='pipeline-1',
            transformations=[
                MarkdownPartition(chunking_strategy="by_title", max_characters=1024),
                Clean_extra_whitespace(),
                # The trailing space in the marker string is intentional.
                ExtractTextAfter(key='text_after', string='will be converted to an ellipsis. '),
            ],
        )
        chunks = md_pipeline.run(files=MARKDOWN_FILE_PATH)

        assert len(chunks) == 4
        first_chunk = chunks[0]
        assert first_chunk.text[:9] == 'An h1 hea'
        assert first_chunk.metadata['filename'] == 'markdown-sample.md'
        assert first_chunk.metadata['text_after'] == 'Unicode is supported. ☺'

    def test_pipeline_run_chunker(self):
        """Run with a 100-character limit and check the resulting chunk count.

        Also verifies the filename and language metadata of the first element.
        """
        md_pipeline = Pipeline(
            name='pipeline-1',
            transformations=[
                MarkdownPartition(chunking_strategy="by_title", max_characters=100),
                Clean_extra_whitespace(),
            ],
        )
        chunks = md_pipeline.run(files=MARKDOWN_FILE_PATH)

        assert len(chunks) == 43
        first_chunk = chunks[0]
        assert first_chunk.metadata['filename'] == 'markdown-sample.md'
        assert first_chunk.metadata['languages'] == ['eng']

tests/pipelines/test_ready_to_use_pipelines.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,21 @@ def test_pipeline_standard_text(self,):
5959
assert pipeline.transformations[0].__class__.__name__ == 'TextPartition'
6060
assert pipeline.transformations[1].__class__.__name__ == 'Clean_extra_whitespace'
6161
assert pipeline.transformations[2].__class__.__name__ == 'Group_broken_paragraphs'
62+
63+
def test_pipeline_standard_docx(self):
    """Load the 'standard_docx' pipeline and check its name and ordered steps."""
    loaded = Pipeline.load(name='standard_docx')
    assert loaded.name == 'standard_docx'
    assert len(loaded.transformations) == 3

    # Class names of the transformations, in the order they must appear.
    expected_steps = ('DocxPartition', 'Clean_extra_whitespace', 'Group_broken_paragraphs')
    for position, expected_name in enumerate(expected_steps):
        assert loaded.transformations[position].__class__.__name__ == expected_name
71+
72+
def test_pipeline_standard_markdown(self):
    """Load the 'standard_markdown' pipeline and check its name and ordered steps."""
    loaded = Pipeline.load(name='standard_markdown')
    assert loaded.name == 'standard_markdown'
    assert len(loaded.transformations) == 3

    # Class names of the transformations, in the order they must appear.
    expected_steps = ('MarkdownPartition', 'Clean_extra_whitespace', 'Group_broken_paragraphs')
    for position, expected_name in enumerate(expected_steps):
        assert loaded.transformations[position].__class__.__name__ == expected_name

tests/pipelines/test_text_pipelines.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
osp.join(osp.dirname(__file__), "assets", "book-war-and-peace-1p.txt"))
99

1010

11-
class TestPDFPipelines:
11+
class TestTextPipelines:
1212
"""Tests for pipeline transformations."""
1313

1414
def test_pipeline(self,):

0 commit comments

Comments
 (0)