|
4 | 4 | import json |
5 | 5 | import io |
6 | 6 | import pytest |
| 7 | +import re |
7 | 8 | import requests |
8 | 9 | import ast |
9 | 10 | import pandas as pd |
@@ -236,6 +237,53 @@ def test_api_with_different_encodings(): |
236 | 237 | assert "invalid start byte" in str(excinfo.value) |
237 | 238 |
|
238 | 239 |
|
def test_xml_keep_tags_param():
    """
    Verify that responses do not include xml tags unless requested

    The same XML sample is posted twice: once with default parameters and once
    with ``xml_keep_tags`` set to ``"true"``. The tagged response is then walked
    in parallel with the untagged one: tag-only elements are skipped, and every
    element that carries inner content must match the next untagged element.
    """
    client = TestClient(app)
    test_file = Path("sample-docs") / "fake-xml.xml"

    # Open the upload inside a context manager so the handle is always closed,
    # even when the request or an assertion fails.
    with open(test_file, "rb") as xml_file:
        response = client.post(
            MAIN_API_ROUTE,
            files=[("files", (str(test_file), xml_file))],
            data={"strategy": "hi_res"},
        )
    assert response.status_code == 200
    response_without_xml_tags = response.json()

    with open(test_file, "rb") as xml_file:
        response = client.post(
            MAIN_API_ROUTE,
            files=[("files", (str(test_file), xml_file))],
            data={"xml_keep_tags": "true", "strategy": "hi_res"},
        )
    assert response.status_code == 200
    response_with_xml_tags = response.json()[3:]  # skip the initial encoding tag(s)

    # The responses should have the same content except for the xml tags
    response_with_xml_tags_index, response_without_xml_tags_index = 0, 0
    while response_without_xml_tags_index < len(response_without_xml_tags):
        # Fail with a readable message instead of an IndexError if the tagged
        # response is exhausted before every untagged element was matched.
        assert response_with_xml_tags_index < len(response_with_xml_tags), (
            "tagged response ran out of elements before untagged element "
            f"{response_without_xml_tags_index} was matched"
        )

        xml_tagged_line = response_with_xml_tags[response_with_xml_tags_index]["text"]
        assert xml_tagged_line.startswith("<")
        assert xml_tagged_line.endswith(">")

        # if there is content on this line, ensure it matches the content on the non tagged line
        xml_tagged_line_content = xml_tagged_line.split(">", 1)[1]  # remove opening tag
        if not xml_tagged_line_content:
            # Tag-only element: nothing to compare, advance only the tagged side.
            response_with_xml_tags_index += 1
            continue

        xml_tagged_line_content = xml_tagged_line_content.split("<", 1)[0]  # remove closing tag

        xml_untagged_line = response_without_xml_tags[response_without_xml_tags_index]["text"]
        # xml_keep_tags does not currently parse the inner content, so the
        # escaped ampersand entity has to be decoded here before comparing.
        xml_tagged_line_content_parsed = re.sub("&amp;", "&", xml_tagged_line_content)
        assert xml_tagged_line_content_parsed == xml_untagged_line

        response_with_xml_tags_index += 1
        response_without_xml_tags_index += 1
| 285 | + |
| 286 | + |
239 | 287 | @pytest.mark.parametrize( |
240 | 288 | "example_filename", |
241 | 289 | [ |
|
0 commit comments