|
7 | 7 | import time |
8 | 8 | import urllib.error |
9 | 9 | import urllib.request |
10 | | -import uuid |
11 | 10 | from pathlib import Path |
12 | 11 | from typing import Any |
13 | 12 |
|
14 | 13 |
|
15 | 14 | class AstaPaperFinder: |
16 | | - """Client for Asta Paper Finder API""" |
| 15 | + """Client for Asta Paper Finder API using headless endpoint""" |
17 | 16 |
|
def __init__(self, base_url: str = "REDACTED_MABOOL_WORKERS_URL"):
    """Configure the client for the headless paper-finder service.

    Args:
        base_url: Root URL of the service. Trailing slashes are removed
            so that endpoint paths appended as ``f"{base_url}/api/..."``
            never contain a doubled ``//``.
    """
    self.base_url = base_url.rstrip("/")
    # Headers attached to requests built by this client; request bodies
    # are JSON-encoded.
    self.headers = {
        "Content-Type": "application/json",
    }
26 | 22 |
|
27 | 23 | def _request( |
28 | 24 | self, url: str, method: str = "GET", data: dict | None = None |
29 | | - ) -> dict[str, Any] | list: |
| 25 | + ) -> dict[str, Any]: |
30 | 26 | """Make an HTTP request and return JSON response""" |
31 | 27 | body = json.dumps(data).encode() if data else None |
32 | 28 | req = urllib.request.Request( |
33 | 29 | url, data=body, headers=self.headers, method=method |
34 | 30 | ) |
35 | | - response = urllib.request.urlopen(req) |
36 | | - return json.loads(response.read()) |
37 | | - |
38 | | - def create_thread(self) -> str: |
39 | | - """Create a new thread""" |
40 | | - result = self._request(f"{self.base_url}/api/chat/thread", method="PUT") |
41 | | - return result["thread"]["key"] |
42 | | - |
43 | | - def send_message( |
44 | | - self, text: str, thread_id: str, profile: str = "paper-finder-only" |
45 | | - ) -> dict[str, Any]: |
46 | | - """Send a message to the thread""" |
47 | | - return self._request( |
48 | | - f"{self.base_url}/api/chat/message", |
49 | | - method="POST", |
50 | | - data={"text": text, "thread_id": thread_id, "profile": profile}, |
51 | | - ) |
52 | | - |
53 | | - def get_widget_id(self, thread_id: str, max_retries: int = 20) -> str | None: |
54 | | - """Get the widget ID from thread events""" |
55 | | - url = f"{self.base_url}/api/rest/thread/{thread_id}/event/widget_paper_finder" |
56 | | - for _ in range(max_retries): |
57 | | - try: |
58 | | - req = urllib.request.Request(url, headers=self.headers) |
59 | | - response = urllib.request.urlopen(req) |
60 | | - data = json.loads(response.read()) |
61 | | - last_event = data.get("last_event") |
62 | | - if last_event and isinstance(last_event, dict): |
63 | | - event_data = last_event.get("data") |
64 | | - if event_data and isinstance(event_data, dict): |
65 | | - widget_id = event_data.get("id") |
66 | | - if widget_id: |
67 | | - return widget_id |
68 | | - except urllib.error.HTTPError: |
69 | | - pass |
70 | | - time.sleep(2) |
71 | | - return None |
72 | | - |
73 | | - def get_widget_results(self, widget_id: str) -> dict[str, Any] | list: |
74 | | - """Get widget results from mabool service""" |
75 | | - url = f"{self.mabool_url}/api/2/rounds/{widget_id}/result/widget" |
76 | | - req = urllib.request.Request(url, headers=self.headers) |
77 | | - response = urllib.request.urlopen(req) |
78 | | - return json.loads(response.read()) |
79 | | - |
80 | | - def poll_for_results(self, widget_id: str, timeout: int = 300): |
81 | | - """Poll for results until completion or timeout""" |
82 | | - start_time = time.time() |
83 | | - poll_interval = 2 |
84 | | - |
85 | | - while time.time() - start_time < timeout: |
| 31 | + try: |
| 32 | + response = urllib.request.urlopen(req) |
| 33 | + return json.loads(response.read()) |
| 34 | + except urllib.error.HTTPError as e: |
| 35 | + error_body = e.read().decode("utf-8") |
86 | 36 | try: |
87 | | - result = self.get_widget_results(widget_id) |
88 | | - |
89 | | - # Handle if result is a list - got the papers directly |
90 | | - if isinstance(result, list): |
91 | | - return { |
92 | | - "roundStatus": {"kind": "completed"}, |
93 | | - "results": result, |
94 | | - "thread_id": None, |
95 | | - "widget_id": widget_id, |
96 | | - } |
97 | | - |
98 | | - # Handle dict response with roundStatus |
99 | | - status = result.get("roundStatus", {}).get("kind", "unknown") |
100 | | - |
101 | | - if status == "completed": |
102 | | - return result |
103 | | - elif status == "failed": |
104 | | - error = result.get("roundStatus", {}).get("error", "Unknown error") |
105 | | - raise Exception(f"Paper finder failed: {error}") |
106 | | - |
107 | | - except urllib.error.HTTPError as e: |
108 | | - if e.code != 404: |
109 | | - raise |
110 | | - |
111 | | - time.sleep(poll_interval) |
112 | | - |
113 | | - raise TimeoutError(f"Timeout after {timeout} seconds") |
114 | | - |
115 | | - def start_search(self, query: str) -> str: |
116 | | - """Start a paper search and return thread_id immediately (non-blocking)""" |
117 | | - thread_id = self.create_thread() |
118 | | - self.send_message(query, thread_id) |
119 | | - return thread_id |
| 37 | + error_data = json.loads(error_body) |
| 38 | + error_msg = error_data.get("detail", str(e)) |
| 39 | + except json.JSONDecodeError: |
| 40 | + error_msg = error_body or str(e) |
| 41 | + raise Exception(f"API request failed: {error_msg}") from e |
120 | 42 |
|
121 | 43 | def find_papers( |
122 | | - self, query: str, timeout: int = 300, save_to_file: Path | None = None |
| 44 | + self, |
| 45 | + query: str, |
| 46 | + timeout: int = 300, |
| 47 | + save_to_file: Path | None = None, |
| 48 | + operation_mode: str = "infer", |
| 49 | + include_full_metadata: bool = True, |
123 | 50 | ) -> dict[str, Any]: |
124 | | - """Complete workflow to find papers (blocking). |
| 51 | + """Execute a one-shot paper search using the headless endpoint. |
125 | 52 |
|
126 | 53 | Args: |
127 | 54 | query: Search query |
128 | | - timeout: Maximum time to wait for results |
| 55 | + timeout: Maximum time to wait for results (seconds) |
129 | 56 | save_to_file: Optional path to save results. If None, no file is saved. |
| 57 | + operation_mode: Search strategy - 'infer', 'fast', or 'diligent' (default: 'infer') |
| 58 | + include_full_metadata: Whether to return full paper details (default: True) |
130 | 59 |
|
131 | 60 | Returns: |
132 | | - Complete search results including widget data |
| 61 | + Complete search results with papers |
133 | 62 | """ |
134 | | - thread_id = self.start_search(query) |
| 63 | + url = f"{self.base_url}/api/3/headless/paper-search" |
| 64 | + |
| 65 | + request_body = { |
| 66 | + "query": query, |
| 67 | + "operation_mode": operation_mode, |
| 68 | + "include_full_metadata": include_full_metadata, |
| 69 | + "timeout_seconds": timeout, |
| 70 | + } |
135 | 71 |
|
136 | | - # Get widget ID |
137 | | - widget_id = self.get_widget_id(thread_id) |
138 | | - if not widget_id: |
139 | | - raise Exception("Failed to get widget ID after retries") |
| 72 | + # Make the synchronous request |
| 73 | + result = self._request(url, method="POST", data=request_body) |
140 | 74 |
|
141 | | - # Poll for results |
142 | | - widget_result = self.poll_for_results(widget_id, timeout) |
| 75 | + # Check for errors |
| 76 | + if "error" in result and result["error"]: |
| 77 | + error = result["error"] |
| 78 | + raise Exception(f"Paper search failed: {error}") |
143 | 79 |
|
144 | | - papers = widget_result.get("results", []) |
| 80 | + papers = result.get("papers", []) |
145 | 81 |
|
146 | | - # Build complete search data |
| 82 | + # Build search data in format compatible with existing models |
147 | 83 | search_data = { |
148 | 84 | "query": query, |
149 | | - "thread_id": thread_id, |
150 | | - "widget_id": widget_id, |
| 85 | + "widget": { |
| 86 | + "results": papers, |
| 87 | + "response_text": result.get("response_text", ""), |
| 88 | + }, |
151 | 89 | "status": "completed", |
152 | | - "widget": widget_result, |
153 | 90 | "timestamp": time.time(), |
154 | 91 | "paper_count": len(papers), |
155 | 92 | } |
|
0 commit comments