Skip to content

Commit e785a0f

Browse files
feat: Thumbnail and PDF for new column type (#3193)
* draft: adding pdf cell type * draft: adding pdf functions for rows processing * draft: detect modalities * Rollback * Fix dependencies * Adding width and height * Adding test in assets * Adding pdf size in bytes * Add test for get_cell_value_value * Add more tests * Fix test parquet and info * Fix typo * Update datasets library and add test for modalities * Add e2e * Add missing test for first-rows and filter * Debug e2e * Fix copy/paste bug * Fix documents name * Fix typo * Address code review suggestions * Add Pdf spec in OpenApi
1 parent 2d2ed2c commit e785a0f

File tree

42 files changed

+1760
-411
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+1760
-411
lines changed

docs/source/openapi.json

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,9 @@
392392
},
393393
{
394394
"$ref": "#/components/schemas/VideoFeature"
395+
},
396+
{
397+
"$ref": "#/components/schemas/PdfFeature"
395398
}
396399
]
397400
},
@@ -577,6 +580,19 @@
577580
}
578581
}
579582
},
583+
"PdfFeature": {
584+
"type": "object",
585+
"required": ["_type"],
586+
"properties": {
587+
"_type": {
588+
"type": "string",
589+
"enum": ["Pdf"]
590+
},
591+
"decode": {
592+
"type": "boolean"
593+
}
594+
}
595+
},
580596
"VideoFeature": {
581597
"type": "object",
582598
"required": ["_type"],
@@ -657,6 +673,9 @@
657673
},
658674
{
659675
"$ref": "#/components/schemas/VideoCell"
676+
},
677+
{
678+
"$ref": "#/components/schemas/PdfCell"
660679
}
661680
]
662681
},
@@ -804,6 +823,22 @@
804823
]
805824
}
806825
},
826+
"PdfCell": {
827+
"type": "object",
828+
"properties": {
829+
"src": {
830+
"type": "string",
831+
"format": "uri"
832+
},
833+
"thumbnail": {
834+
"$ref": "#/components/schemas/ImageCell"
835+
},
836+
"size_bytes": {
837+
"type": "integer"
838+
}
839+
},
840+
"required": ["src", "thumbnail", "size_bytes"]
841+
},
807842
"VideoCell": {
808843
"type": "object",
809844
"properties": {

e2e/tests/conftest.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,3 +94,18 @@ def normal_user_audios_public_dataset() -> Iterator[str]:
9494
},
9595
) as dataset:
9696
yield dataset
97+
98+
99+
@pytest.fixture(scope="session")
def normal_user_pdfs_public_dataset() -> Iterator[str]:
    """Yield the name of a temporary dataset holding two PDFs and their metadata.

    The dataset is created through ``tmp_dataset`` in the ``NORMAL_USER``
    namespace from the files stored in ``data/pdfs`` next to this module
    (``1.pdf``, ``2.pdf`` and ``metadata.csv``). Being session-scoped, the
    fixture is set up once and shared by every test that requests it.

    Yields:
        The dataset identifier produced by ``tmp_dataset``.
    """
    # Resolve the data directory once instead of rebuilding the path per file.
    pdfs_dir = Path(__file__).resolve().parent / "data" / "pdfs"
    with tmp_dataset(
        namespace=NORMAL_USER,
        token=NORMAL_USER_TOKEN,
        files={name: str(pdfs_dir / name) for name in ("1.pdf", "2.pdf", "metadata.csv")},
    ) as dataset:
        # No debug print here: matches the sibling audios fixture, which only yields.
        yield dataset

e2e/tests/data/pdfs/1.pdf

8.6 KB
Binary file not shown.

e2e/tests/data/pdfs/2.pdf

1.29 KB
Binary file not shown.

e2e/tests/data/pdfs/metadata.csv

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
file_name,text,has_images
2+
1.pdf,This is a PDF with an embedded image below,1
3+
2.pdf,This is a dummy PDF in US Letter size,0

e2e/tests/test_52_search.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,3 +102,34 @@ def test_search_audios_endpoint(normal_user_audios_public_dataset: str) -> None:
102102
# ensure the URL is valid
103103
response = poll(url, url="")
104104
assert response.status_code == 200, response
105+
106+
107+
def test_search_pdfs_endpoint(normal_user_pdfs_public_dataset: str) -> None:
    """Check that /search returns signed, reachable URLs for a PDF cell and its thumbnail."""
    config, split = get_default_config_split()
    search_term = "embedded"
    search_response = poll_until_ready_and_assert(
        relative_url=(
            f"/search?dataset={normal_user_pdfs_public_dataset}"
            f"&config={config}&split={split}&query={search_term}"
        ),
        dataset=normal_user_pdfs_public_dataset,
        # 404s (ResponseNotFound) were observed without this retry; ideally the
        # API would answer with something else at that point.
        should_retry_x_error_codes=["ResponseNotFound"],
    )
    pdf_cell = search_response.json()["rows"][0]["row"]["pdf"]

    # The PDF must be exposed through a signed URL...
    pdf_url = pdf_cell["src"]
    for marker in ("document.pdf?Expires=", "&Signature=", "&Key-Pair-Id="):
        assert marker in pdf_url, pdf_url
    # ...and that URL must actually resolve.
    pdf_response = poll(pdf_url, url="")
    assert pdf_response.status_code == 200, pdf_response

    # Same two guarantees for the PDF's PNG thumbnail.
    thumbnail_url = pdf_cell["thumbnail"]["src"]
    for marker in ("document.pdf.png?Expires=", "&Signature=", "&Key-Pair-Id="):
        assert marker in thumbnail_url, thumbnail_url
    thumbnail_response = poll(thumbnail_url, url="")
    assert thumbnail_response.status_code == 200, thumbnail_response

e2e/tests/test_53_filter.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,3 +162,34 @@ def test_filter_audios_endpoint(normal_user_audios_public_dataset: str) -> None:
162162
# ensure the URL is valid
163163
response = poll(url, url="")
164164
assert response.status_code == 200, response
165+
166+
167+
def test_filter_pdfs_endpoint(normal_user_pdfs_public_dataset: str) -> None:
    """Check that /filter returns signed, reachable URLs for a PDF cell and its thumbnail."""
    config, split = get_default_config_split()
    condition = "has_images=1"
    filter_response = poll_until_ready_and_assert(
        relative_url=(
            f"/filter?dataset={normal_user_pdfs_public_dataset}"
            f"&config={config}&split={split}&where={condition}"
        ),
        dataset=normal_user_pdfs_public_dataset,
        # 404s (ResponseNotFound) were observed without this retry; ideally the
        # API would answer with something else at that point.
        should_retry_x_error_codes=["ResponseNotFound"],
    )
    pdf_cell = filter_response.json()["rows"][0]["row"]["pdf"]

    # The PDF must be exposed through a signed URL...
    pdf_url = pdf_cell["src"]
    for marker in ("document.pdf?Expires=", "&Signature=", "&Key-Pair-Id="):
        assert marker in pdf_url, pdf_url
    # ...and that URL must actually resolve.
    pdf_response = poll(pdf_url, url="")
    assert pdf_response.status_code == 200, pdf_response

    # Same two guarantees for the PDF's PNG thumbnail.
    thumbnail_url = pdf_cell["thumbnail"]["src"]
    for marker in ("document.pdf.png?Expires=", "&Signature=", "&Key-Pair-Id="):
        assert marker in thumbnail_url, thumbnail_url
    thumbnail_response = poll(thumbnail_url, url="")
    assert thumbnail_response.status_code == 200, thumbnail_response

e2e/tests/test_54_rows.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,33 @@ def test_rows_audios_endpoint(normal_user_audios_public_dataset: str) -> None:
8181
# ensure the URL is valid
8282
response = poll(url, url="")
8383
assert response.status_code == 200, response
84+
85+
86+
def test_rows_pdfs_endpoint(normal_user_pdfs_public_dataset: str) -> None:
    """Check that /rows returns signed, reachable URLs for a PDF cell and its thumbnail."""
    config, split = get_default_config_split()
    rows_response = poll_until_ready_and_assert(
        relative_url=f"/rows?dataset={normal_user_pdfs_public_dataset}&config={config}&split={split}",
        dataset=normal_user_pdfs_public_dataset,
        # 404s (ResponseNotFound) were observed without this retry; ideally the
        # API would answer with something else at that point.
        should_retry_x_error_codes=["ResponseNotFound"],
    )
    pdf_cell = rows_response.json()["rows"][0]["row"]["pdf"]

    # The PDF must be exposed through a signed URL...
    pdf_url = pdf_cell["src"]
    for marker in ("document.pdf?Expires=", "&Signature=", "&Key-Pair-Id="):
        assert marker in pdf_url, pdf_url
    # ...and that URL must actually resolve.
    pdf_response = poll(pdf_url, url="")
    assert pdf_response.status_code == 200, pdf_response

    # Same two guarantees for the PDF's PNG thumbnail.
    thumbnail_url = pdf_cell["thumbnail"]["src"]
    for marker in ("document.pdf.png?Expires=", "&Signature=", "&Key-Pair-Id="):
        assert marker in thumbnail_url, thumbnail_url
    thumbnail_response = poll(thumbnail_url, url="")
    assert thumbnail_response.status_code == 200, thumbnail_response

e2e/tests/test_55_first_rows.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,36 @@ def test_first_rows_audios_endpoint(normal_user_audios_public_dataset: str) -> N
4848
# ensure the URL is valid
4949
response = poll(url, url="")
5050
assert response.status_code == 200, response
51+
52+
53+
def test_first_rows_pdfs_endpoint(normal_user_pdfs_public_dataset: str) -> None:
    """Check that /first-rows returns signed, reachable URLs for a PDF cell and its thumbnail.

    In addition to the signature markers, the PDF URL must be a string and
    must carry a single ``Expires`` parameter, i.e. be signed only once.
    """
    config, split = get_default_config_split()
    first_rows_response = poll_until_ready_and_assert(
        relative_url=f"/first-rows?dataset={normal_user_pdfs_public_dataset}&config={config}&split={split}",
        dataset=normal_user_pdfs_public_dataset,
        # 404s (ResponseNotFound) were observed without this retry; ideally the
        # API would answer with something else at that point.
        should_retry_x_error_codes=["ResponseNotFound"],
    )
    pdf_cell = first_rows_response.json()["rows"][0]["row"]["pdf"]

    # The PDF must be exposed through a signed URL...
    pdf_url = pdf_cell["src"]
    assert isinstance(pdf_url, str), pdf_url
    for marker in ("document.pdf?Expires=", "&Signature=", "&Key-Pair-Id="):
        assert marker in pdf_url, pdf_url
    # ...signed exactly once...
    assert pdf_url.count("Expires") == 1, pdf_url
    # ...and that URL must actually resolve.
    pdf_response = poll(pdf_url, url="")
    assert pdf_response.status_code == 200, pdf_response

    # The PDF's PNG thumbnail must also be signed and reachable.
    thumbnail_url = pdf_cell["thumbnail"]["src"]
    for marker in ("document.pdf.png?Expires=", "&Signature=", "&Key-Pair-Id="):
        assert marker in thumbnail_url, thumbnail_url
    thumbnail_response = poll(thumbnail_url, url="")
    assert thumbnail_response.status_code == 200, thumbnail_response

0 commit comments

Comments
 (0)