Skip to content

Commit b3ae207

Browse files
authored
Add timeout and strip image html tags (#2373)
1 parent 04dcfdc commit b3ae207

File tree

5 files changed

+39
-2
lines changed

5 files changed

+39
-2
lines changed

tools/paddleocr/manifest.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
version: 0.1.2
1+
version: 0.1.3
22
type: plugin
33
author: langgenius
44
name: paddleocr

tools/paddleocr/tools/document_parsing.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
from dify_plugin import Tool
66
from dify_plugin.entities.tool import ToolInvokeMessage
77

8+
from tools.utils import remove_img_from_markdown
9+
10+
# Timeout handed to requests.post() for the PaddleOCR API call; requests
# interprets a 2-tuple as (connect timeout, read timeout) in seconds.
REQUEST_TIMEOUT = (10, 600)
11+
812

913
class DocumentParsingTool(Tool):
1014
def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
@@ -68,20 +72,24 @@ def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessag
6872
api_url,
6973
headers={"Authorization": f"token {access_token}"},
7074
json=params,
75+
timeout=REQUEST_TIMEOUT,
7176
)
7277
resp.raise_for_status()
7378
result = resp.json()
7479
except requests.exceptions.JSONDecodeError as e:
7580
raise RuntimeError(
7681
f"Failed to decode JSON response from PaddleOCR API: {resp.text}"
7782
) from e
83+
except requests.exceptions.Timeout as e:
84+
raise RuntimeError("PaddleOCR API request timed out") from e
7885
except requests.exceptions.RequestException as e:
7986
raise RuntimeError(f"PaddleOCR API request failed: {e}") from e
8087

8188
markdown_text_list = []
8289
for item in result.get("result", {}).get("layoutParsingResults", []):
8390
markdown_text = item.get("markdown", {}).get("text")
8491
if markdown_text is not None:
92+
markdown_text = remove_img_from_markdown(markdown_text)
8593
markdown_text_list.append(markdown_text)
8694
yield self.create_text_message("\n\n".join(markdown_text_list))
8795
yield self.create_json_message(result)

tools/paddleocr/tools/document_parsing_vl.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
from dify_plugin import Tool
66
from dify_plugin.entities.tool import ToolInvokeMessage
77

8+
from tools.utils import remove_img_from_markdown
9+
10+
# Timeout handed to requests.post() for the PaddleOCR API call; requests
# interprets a 2-tuple as (connect timeout, read timeout) in seconds.
REQUEST_TIMEOUT = (10, 600)
11+
812

913
class DocumentParsingVlTool(Tool):
1014
def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
@@ -54,22 +58,26 @@ def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessag
5458
try:
5559
resp = requests.post(
5660
api_url,
57-
headers={"Authorization": f"Bearer {access_token}"},
61+
headers={"Authorization": f"token {access_token}"},
5862
json=params,
63+
timeout=REQUEST_TIMEOUT,
5964
)
6065
resp.raise_for_status()
6166
result = resp.json()
6267
except requests.exceptions.JSONDecodeError as e:
6368
raise RuntimeError(
6469
f"Failed to decode JSON response from PaddleOCR API: {resp.text}"
6570
) from e
71+
except requests.exceptions.Timeout as e:
72+
raise RuntimeError("PaddleOCR API request timed out") from e
6673
except requests.exceptions.RequestException as e:
6774
raise RuntimeError(f"PaddleOCR API request failed: {e}") from e
6875

6976
markdown_text_list = []
7077
for item in result.get("result", {}).get("layoutParsingResults", []):
7178
markdown_text = item.get("markdown", {}).get("text")
7279
if markdown_text is not None:
80+
markdown_text = remove_img_from_markdown(markdown_text)
7381
markdown_text_list.append(markdown_text)
7482
yield self.create_text_message("\n\n".join(markdown_text_list))
7583
yield self.create_json_message(result)

tools/paddleocr/tools/text_recognition.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
from dify_plugin import Tool
66
from dify_plugin.entities.tool import ToolInvokeMessage
77

8+
# Timeout handed to requests.post() for the PaddleOCR API call; requests
# interprets a 2-tuple as (connect timeout, read timeout) in seconds.
REQUEST_TIMEOUT = (10, 600)
9+
810

911
class TextRecognitionTool(Tool):
1012
def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessage]:
@@ -48,13 +50,16 @@ def _invoke(self, tool_parameters: dict[str, Any]) -> Generator[ToolInvokeMessag
4850
api_url,
4951
headers={"Authorization": f"token {access_token}"},
5052
json=params,
53+
timeout=REQUEST_TIMEOUT,
5154
)
5255
resp.raise_for_status()
5356
result = resp.json()
5457
except requests.exceptions.JSONDecodeError as e:
5558
raise RuntimeError(
5659
f"Failed to decode JSON response from PaddleOCR API: {resp.text}"
5760
) from e
61+
except requests.exceptions.Timeout as e:
62+
raise RuntimeError("PaddleOCR API request timed out") from e
5863
except requests.exceptions.RequestException as e:
5964
raise RuntimeError(f"PaddleOCR API request failed: {e}") from e
6065

tools/paddleocr/tools/utils.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import re
2+
3+
# Matches HTML image markup embedded in Markdown text: either an <img> tag
# wrapped in a <div> that contains nothing else but whitespace, or a bare
# <img> tag. Both the self-closing form (<img ... />) and the plain void form
# (<img ...>) are accepted, because "[^>]*" also consumes an optional "/".
# "\b" keeps the match from firing on longer tag names that merely start with
# "img" or "div".
MARKDOWN_IMAGE_PATTERN = re.compile(
    r"""
    <div\b[^>]*>\s*     # opening wrapper tag with any attributes
    <img\b[^>]*>\s*     # the image tag, self-closing or not
    </div>              # closing wrapper tag
    |
    <img\b[^>]*>        # bare image tag with no wrapper
    """,
    re.IGNORECASE | re.VERBOSE | re.DOTALL,
)


def remove_img_from_markdown(markdown: str) -> str:
    """Return *markdown* with HTML image tags removed.

    Every ``<img>`` tag is deleted. When a ``<div>`` contains only a single
    ``<img>`` (plus optional whitespace), the whole ``<div>`` is deleted so no
    empty wrapper is left behind. Matching is case-insensitive. Markdown-style
    images (``![alt](url)``) are not touched.

    Args:
        markdown: Markdown text that may contain inline HTML image tags.

    Returns:
        The text with all matched image markup replaced by the empty string.
    """
    return MARKDOWN_IMAGE_PATTERN.sub("", markdown)

0 commit comments

Comments
 (0)