|
6 | 6 | import time |
7 | 7 | import json |
8 | 8 | from typing import Dict, List, Optional, Union |
| 9 | +from io import BytesIO |
| 10 | +from fastapi import UploadFile |
9 | 11 |
|
10 | 12 | from .base_client import BaseClient, AsyncBaseClient |
11 | 13 | from . import ParseResponse, EquationProcessingResponse |
@@ -177,21 +179,41 @@ class DocumentHelper: |
177 | 179 | def __init__(self, ax_client: Axiomatic): |
178 | 180 | self._ax_client = ax_client |
179 | 181 |
|
180 | | - def pdf_from_url(self, url: str) -> ParseResponse: |
| 182 | + def pdf_from_file(self, path: str): |
| 183 | + """Open a PDF document from a file path and parse it into a Markdown response.""" |
| 184 | + with open(path, "rb") as f: |
| 185 | + file_bytes = f.read() |
| 186 | + |
| 187 | + # Create a (filename, content, content-type) tuple. |
| 188 | + # We do this because .parse expects a FastAPI UploadFile. |
| 189 | + file_name = path.split("/")[-1] |
| 190 | + file_tuple = (file_name, file_bytes, "application/pdf") |
| 191 | + |
| 192 | + response = self._ax_client.document.parse(file=file_tuple) |
| 193 | + return response |
| 194 | + |
| 195 | + def pdf_from_url(self, url: str): |
181 | 196 | """Download a PDF document from a URL and parse it into a Markdown response.""" |
182 | | - if "arxiv" in url and "abs" in url: |
| 197 | + if "arxiv.org" in url and "abs" in url: |
183 | 198 | url = url.replace("abs", "pdf") |
184 | 199 | print("The URL is an arXiv abstract page. Replacing 'abs' with 'pdf' to download the PDF.") |
185 | | - file = requests.get(url) |
186 | | - response = self._ax_client.document.parse(file=file.content) |
187 | | - return response |
| 200 | + response = requests.get(url) |
| 201 | + |
| 202 | + if response.status_code != 200: |
| 203 | + raise Exception(f"Failed to download PDF. Status code: {response.status_code}") |
| 204 | + |
| 205 | + # Extract filename from URL or use a default |
| 206 | + file_name = url.split("/")[-1] |
| 207 | + if not file_name.endswith(".pdf"): |
| 208 | + file_name = "document.pdf" |
| 209 | + |
| 210 | + # Create a (filename, content, content-type) tuple. |
| 211 | + # We do this because .parse expects a FastAPI UploadFile. |
| 212 | + file_tuple = (file_name, response.content, "application/pdf") |
| 213 | + |
| 214 | + parse_response = self._ax_client.document.parse(file=file_tuple) |
| 215 | + return parse_response |
188 | 216 |
|
189 | | - def pdf_from_file(self, path: str) -> ParseResponse: |
190 | | - """Open a PDF document from a file path and parse it into a Markdown response.""" |
191 | | - with open(path, "rb") as f: |
192 | | - file = f.read() |
193 | | - response = self._ax_client.document.parse(file=file) |
194 | | - return response |
195 | 217 |
|
196 | 218 | def plot_b64_images(self, images: Dict[str, str]): |
197 | 219 | """Plot a dictionary of base64 images.""" |
|
0 commit comments