Skip to content

Commit 09c977c

Browse files
committed
test case
1 parent f7dd88a commit 09c977c

File tree

3 files changed

+58
-20
lines changed

3 files changed

+58
-20
lines changed

clarifai_datautils/multimodal/pipeline/summarizer.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,13 +47,16 @@ def __call__(self, elements: List) -> List:
4747
for _, element in enumerate(elements):
4848
element.metadata.update(
4949
ElementMetadata.from_dict({
50-
'is_original': True,
51-
'input_id': f'{random.randint(1000000, 99999999)}'
50+
'is_original': True
5251
}))
5352
if isinstance(element, Image):
53+
element.metadata.update(
54+
ElementMetadata.from_dict({
55+
'input_id': f'{random.randint(1000000, 99999999)}'
56+
}))
5457
img_elements.append(element)
5558
# new_elements = Parallel(n_jobs=len(elements))(delayed(self._summarize_image)(element) for element in img_elements)
56-
new_elements = self._summarize_image(elements)
59+
new_elements = self._summarize_image(img_elements)
5760
elements.extend(new_elements)
5861
return elements
5962

@@ -67,7 +70,7 @@ def _summarize_image(self, image_elements: List[Image]) -> List[CompositeElement
6770
Summarized image element.
6871
6972
"""
70-
img_inputs = []
73+
img_inputs = []
7174
for element in image_elements:
7275
if not isinstance(element, Image):
7376
continue
@@ -80,11 +83,11 @@ def _summarize_image(self, image_elements: List[Image]) -> List[CompositeElement
8083
resp = self.model.predict(img_inputs)
8184

8285
new_elements = []
83-
for i, element in enumerate(resp.outputs):
86+
for i, output in enumerate(resp.outputs):
8487
summary = ""
8588
if image_elements[i].text:
8689
summary = image_elements[i].text
87-
summary = summary + " \n " + element.data.text.raw
90+
summary = summary + " \n " + output.data.text.raw
8891
eid = image_elements[i].metadata.input_id
8992
meta_dict = {'source_input_id': eid, 'is_original': False}
9093
comp_element = CompositeElement(

testing/test.ipynb

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -188,16 +188,16 @@
188188
},
189189
{
190190
"cell_type": "code",
191-
"execution_count": 3,
191+
"execution_count": 2,
192192
"metadata": {},
193193
"outputs": [
194194
{
195195
"data": {
196196
"text/plain": [
197-
"<clarifai_datautils.multimodal.pipeline.base.Pipeline at 0x2e35a5810>"
197+
"<clarifai_datautils.multimodal.pipeline.base.Pipeline at 0x2deee6b60>"
198198
]
199199
},
200-
"execution_count": 3,
200+
"execution_count": 2,
201201
"metadata": {},
202202
"output_type": "execute_result"
203203
}
@@ -347,23 +347,57 @@
347347
},
348348
{
349349
"cell_type": "code",
350-
"execution_count": 4,
350+
"execution_count": 3,
351351
"metadata": {},
352352
"outputs": [
353353
{
354354
"name": "stderr",
355355
"output_type": "stream",
356356
"text": [
357-
"Applying Transformations: 100%|██████████| 3/3 [01:46<00:00, 35.57s/it]\n",
358-
"Uploading Dataset: 100%|██████████| 2/2 [00:29<00:00, 14.69s/it]\n"
357+
"Applying Transformations: 33%|███▎ | 1/3 [00:08<00:16, 8.04s/it]"
358+
]
359+
},
360+
{
361+
"name": "stdout",
362+
"output_type": "stream",
363+
"text": [
364+
"2\n",
365+
"dict_keys(['detection_class_prob', 'coordinates', 'last_modified', 'filetype', 'languages', 'page_number', 'image_base64', 'image_mime_type', 'file_directory', 'filename', 'is_original', 'input_id'])\n",
366+
"dict_keys(['detection_class_prob', 'coordinates', 'last_modified', 'filetype', 'languages', 'page_number', 'image_base64', 'image_mime_type', 'file_directory', 'filename', 'is_original', 'input_id'])\n"
367+
]
368+
},
369+
{
370+
"name": "stderr",
371+
"output_type": "stream",
372+
"text": [
373+
"Applying Transformations: 100%|██████████| 3/3 [00:44<00:00, 14.69s/it]\n"
374+
]
375+
},
376+
{
377+
"name": "stdout",
378+
"output_type": "stream",
379+
"text": [
380+
"########\n",
381+
"2\n",
382+
"<class 'unstructured.documents.elements.Image'>\n",
383+
"dict_keys(['detection_class_prob', 'coordinates', 'last_modified', 'filetype', 'languages', 'page_number', 'image_base64', 'image_mime_type', 'file_directory', 'filename', 'is_original', 'input_id'])\n",
384+
"<class 'unstructured.documents.elements.Image'>\n",
385+
"dict_keys(['detection_class_prob', 'coordinates', 'last_modified', 'filetype', 'languages', 'page_number', 'image_base64', 'image_mime_type', 'file_directory', 'filename', 'is_original', 'input_id'])\n"
386+
]
387+
},
388+
{
389+
"name": "stderr",
390+
"output_type": "stream",
391+
"text": [
392+
"Uploading Dataset: 100%|██████████| 1/1 [00:29<00:00, 29.36s/it]\n"
359393
]
360394
}
361395
],
362396
"source": [
363397
"# Using SDK to upload\n",
364398
"from clarifai.client import Dataset\n",
365399
"dataset = Dataset(url='https://clarifai.com/mansi_k/datautils_testapp/datasets/d1', pat=os.environ['CLARIFAI_PAT'])\n",
366-
"dataset.upload_dataset(new_pipeline.run(files=\"./200945-1.p65.pdf\", loader=True))"
400+
"dataset.upload_dataset(new_pipeline.run(files=\"/Users/mansikhamkar/work/clarifai/clarifai-python-datautils/tests/pipelines/assets/Multimodal_sample_file.pdf\", loader=True))"
367401
]
368402
}
369403
],

tests/pipelines/test_multimodal_pipelines.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ def test_pipeline_summarize(self,):
7373
from clarifai_datautils.multimodal.pipeline.cleaners import Clean_extra_whitespace
7474
from clarifai_datautils.multimodal.pipeline.PDF import PDFPartitionMultimodal
7575
from clarifai_datautils.multimodal.pipeline.summarizer import ImageSummarizer
76+
import os
7677

7778
pipeline = Pipeline(
7879
name='pipeline-1',
@@ -82,16 +83,16 @@ def test_pipeline_summarize(self,):
8283
ImageSummarizer()
8384
])
8485
elements = pipeline.run(files=PDF_FILE_PATH, loader=False)
85-
assert len(elements) == 15
86+
87+
assert len(elements) == 17
8688
assert isinstance(elements, list)
8789
assert elements[0].metadata.to_dict()['filename'] == 'Multimodal_sample_file.pdf'
8890
assert elements[0].metadata.to_dict()['page_number'] == 1
89-
assert elements[0].metadata.to_dict()['email_address'] == ['[email protected]']
9091
assert elements[6].__class__.__name__ == 'Table'
91-
assert elements[-2].__class__.__name__ == 'Image'
92-
assert elements[-2].metadata.is_original is True
93-
assert elements[-2].metadata.input_id is not None
94-
id = elements[-2].metadata.input_id
92+
assert elements[-3].__class__.__name__ == 'Image'
93+
assert elements[-3].metadata.is_original is True
94+
assert elements[-3].metadata.input_id is not None
95+
id = elements[-3].metadata.input_id
9596
assert elements[-1].__class__.__name__ == 'CompositeElement'
9697
assert elements[-1].metadata.is_original is False
97-
assert elements[-1].metadata.source_input_id == 'summarized_' + id
98+
assert elements[-1].metadata.source_input_id == id

0 commit comments

Comments
 (0)