Skip to content

Commit 988bb98

Browse files
authored
Papers search agent (#184)
* added download papers agent * removed tool, connected create_dataset and search_papers agents * remove paperscraper * added downloaded papers to frontend * added authors/journals/institutions search * fix in conf, fix in frontend, removed prints * fix papers download * minor fixes * minor fix
1 parent 6da594e commit 988bb98

File tree

12 files changed

+1361
-42
lines changed

12 files changed

+1361
-42
lines changed

ChemCoScientist/agents/agents.py

Lines changed: 67 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import os
33
import time
44
import json
5+
import logging
56
from typing import Annotated
67
import operator
78
import streamlit as st
@@ -21,10 +22,13 @@
2122
)
2223
from ChemCoScientist.tools import chem_tools, nanoparticle_tools, paper_analysis_tools, data_tools, chem_ocr_tools
2324
from ChemCoScientist.tools.ml_tools import agents_tools as automl_tools
25+
from ChemCoScientist.download_papers.functions import download_papers
2426

2527
from ChemCoScientist.agents.agents_prompts import paper_agent_prompt, coder_prompt
2628
from definitions import ROOT_DIR
2729

30+
logger = logging.getLogger(__name__)
31+
2832

2933
def get_all_files(directory: str):
3034
"""
@@ -344,11 +348,11 @@ def paper_analysis_agent(state: dict, config: dict) -> Command:
344348
Command: An object containing the next step in the process ('replan' or `END`) and
345349
updates to the state, including recorded steps, responses, and extracted metadata.
346350
"""
347-
print("--------------------------------")
348-
print("Paper agent called")
349-
print(f"Current task: {state['task']}")
350-
print(f"Current input: {state['input']}")
351-
print("--------------------------------")
351+
logger.info("--------------------------------")
352+
logger.info("Paper agent called")
353+
logger.info(f"Current task: {state['task']}")
354+
logger.info(f"Current input: {state['input']}")
355+
logger.info("--------------------------------")
352356

353357
llm: BaseChatModel = config["configurable"]["llm"]
354358

@@ -385,7 +389,7 @@ def paper_analysis_agent(state: dict, config: dict) -> Command:
385389
"metadata": Annotated[dict, operator.or_](updated_metadata),
386390
})
387391
except Exception as e:
388-
print(f"Paper analysis agent error: {str(e)}. Retrying ({attempt + 1}/3)")
392+
logger.error(f"Paper analysis agent error: {str(e)}. Retrying ({attempt + 1}/3)")
389393
time.sleep(1.2 ** attempt)
390394

391395
return Command(goto=END, update={
@@ -459,3 +463,60 @@ def chem_ocr_agent(state: dict, config: dict) -> Command:
459463
"response": "I cannot extract molecules or reactions right now."
460464
"Can I help with something else?"
461465
})
466+
467+
468+
def papers_search_agent(state: dict, config: dict) -> Command:
    """
    Searches for entity IDs or scientific papers based on user query and downloads papers' PDFs.

    This agent utilizes the OpenAlex API to find and download
    PDFs of scientific papers relevant to the user's specified topic or query.

    Args:
        state (dict): The current state of the interaction, including the user's task.
        config (dict): Configuration settings, including the language model to use.

    Returns:
        Command: An object containing the next step in the process ('replan' or `END`) and
        updates to the state, including recorded steps, responses, and extracted metadata.
    """
    logger.info("--------------------------------")
    logger.info("Papers search and download agent called")
    logger.info("Current task: %s", state["task"])
    logger.info("Current input: %s", state["input"])
    logger.info("--------------------------------")

    task = state["task"]

    for attempt in range(3):
        try:
            result = download_papers(task)

            # Deterministic serialization so identical answers dedupe inside sets.
            answer_serialized = json.dumps(result["answer"], sort_keys=True)

            updated_metadata = state.get("metadata", {}).copy()
            downloaded = result.get("metadata")
            if downloaded:
                if "downloaded_papers" in updated_metadata:
                    # Merge into a NEW dict: the shallow .copy() above still shares
                    # nested dicts with the original state, so updating in place
                    # would mutate state["metadata"] behind the graph's back.
                    updated_metadata["downloaded_papers"] = {
                        **updated_metadata["downloaded_papers"],
                        **downloaded,
                    }
                else:
                    updated_metadata["downloaded_papers"] = downloaded

            return Command(update={
                "past_steps": Annotated[set, operator.or_](set([
                    (task, answer_serialized)
                ])),
                "nodes_calls": Annotated[set, operator.or_](set([
                    ("papers_search_agent", (("text", answer_serialized),))
                ])),
                "metadata": Annotated[dict, operator.or_](updated_metadata),
            })
        except Exception as e:
            logger.error("Papers search agent error: %s. Retrying (%d/3)", e, attempt + 1)
            time.sleep(1.2 ** attempt)

    # Fixed: the two implicitly-concatenated literals previously produced
    # "...right now.Can I help..." with no separating space.
    return Command(goto=END, update={
        "response": "I cannot download papers right now. "
                    "Can I help with something else?"
    })
522+

ChemCoScientist/agents/agents_prompts.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,4 +114,8 @@
114114
You must detect and output every plausible chemical structure present in the image, even if the image is low-quality,
115115
sketchy, partial, or ambiguous. When uncertain, infer the most likely structure based on visible atoms, bonds, and geometry.
116116
Never return ‘no molecules detected’—instead describe all candidate structures with confidence scores.
117+
"""
118+
119+
papers_search_prompt = """
120+
You are a helpful assistant. You search for papers in OpenAlex based on a user query and download papers' PDFs.
117121
"""

ChemCoScientist/conf/create_conf.py

Lines changed: 85 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
nanoparticle_node,
1616
paper_analysis_agent,
1717
coder_agent,
18-
chem_ocr_agent
18+
chem_ocr_agent,
19+
papers_search_agent
1920
)
2021
#from CoScientist.scientific_agents.agents import coder_agent
2122
from ChemCoScientist.tools import chem_tools_rendered, nano_tools_rendered, tools_rendered, data_tools_rendered, \
@@ -64,6 +65,12 @@
6465
6566
Failure handling:
6667
If no relevant papers are found, state "no match in database" and still run "web_search".
68+
69+
Special behavior for dataset creation:
70+
- If 'create_dataset_from_papers' is requested but no papers are uploaded:
71+
1) Augment the user query to search for relevant papers.
72+
2) Automatically invoke 'papers_search_agent' to find relevant papers with augmented query.
73+
3) After successful download, retry dataset creation with downloaded papers.
6774
"""
6875

6976
web_search_description = """
@@ -131,13 +138,66 @@
131138
"""
132139

133140

141+
papers_search_agent_description = """
142+
Agent name: papers_search_agent
143+
144+
Purpose:
145+
Search OpenAlex for relevant scientific papers, download their PDF files, and return
146+
download metadata for downstream processing.
147+
148+
When to activate:
149+
- User requests finding or downloading papers for a given research topic, author,
150+
journal, or institution.
151+
152+
Procedure (implementation details):
153+
1) Use an LLM (via the configured `VISION_LLM_URL`) to generate the appropriate
154+
OpenAlex API request URL for the user's query.
155+
2) Call OpenAlex (with retry logic) and inspect the returned `results`.
156+
3) For each result containing a `content_urls.pdf`, download the PDF and save it to
157+
the configured `DOWNLOADED_PAPERS_PATH` using a sanitized filename.
158+
4) Return a structured response containing a human-readable `answer` and `metadata`.
159+
When PDFs were downloaded, `metadata.papers` contains the list of saved file paths.
160+
For queries that resolve to an entity (author/source/institution), the agent may
161+
return an `id` in `metadata` instead of (or in addition to) downloaded files.
162+
163+
Two-step / entity-ID flow:
164+
- The agent can be used in a two-step pattern for author/journal/institution queries:
165+
1) First call the agent to resolve the target entity to an OpenAlex ID (the agent
166+
will return `metadata.id` when it detects an entity-resolution response).
167+
2) Then call the agent again (or include the resolved ID in the original query)
168+
to search for and download papers associated with that entity. This two-step
169+
approach is supported by the implementation and recommended for precise author or
170+
source-based searches.
171+
172+
Notes and constraints:
173+
- The agent builds the OpenAlex request via an LLM and then performs the HTTP calls
174+
directly; network retry/backoff logic is applied for robustness.
175+
- The agent downloads PDFs listed in `content_urls.pdf` from OpenAlex results and
176+
saves them locally; it does not attempt to bypass paywalls beyond what OpenAlex
177+
exposes in `content_urls`.
178+
179+
Inputs:
180+
- user_query: str
181+
182+
Outputs:
183+
- A dict with an `answer` string and optional `metadata` dict. When downloads occur,
184+
`metadata.papers` is a list of downloaded file paths; when an entity ID is resolved,
185+
`metadata.id` is provided.
186+
187+
Failure handling:
188+
- If no papers are found or downloads fail, the agent returns an explanatory `answer`
189+
and an empty or absent `metadata.papers`.
190+
"""
191+
192+
134193
additional_agents_description = (
135194
automl_agent_description
136195
+ dataset_builder_agent_description
137196
+ coder_agent_description
138197
+ paper_analysis_agent_description
139198
+ web_search_description
140199
+ chem_ocr_agent_description
200+
+ papers_search_agent_description
141201
)
142202

143203
conf = {
@@ -162,7 +222,8 @@
162222
"coder_agent",
163223
"paper_analysis_agent",
164224
"web_search",
165-
"chem_ocr_agent"
225+
"chem_ocr_agent",
226+
"papers_search_agent"
166227
],
167228
# nodes for scenario agents
168229
"scenario_agent_funcs": {
@@ -173,7 +234,8 @@
173234
"coder_agent": coder_agent,
174235
"paper_analysis_agent": paper_analysis_agent,
175236
"web_search": web_search_node,
176-
"chem_ocr_agent": chem_ocr_agent
237+
"chem_ocr_agent": chem_ocr_agent,
238+
"papers_search_agent": papers_search_agent
177239
},
178240
# description for agents tools - if using langchain @tool
179241
# or description of agent capabilities in free format
@@ -239,6 +301,9 @@
239301
7. You must include all information you see in user prompt to your plan
240302
8. If you get a general question about chemistry first call paper_analysis_agent. Use web search
241303
only if paper_analysis_agent has no answer.
304+
9. If you get a query to find or download papers, use papers_search_agent:
305+
- For topic-based searches (e.g., "Download papers about CRISPR CAS"), directly search for papers using that topic.
306+
- For author, journal, or institution searches, create two sequential subtasks: first resolve the entity's OpenAlex ID, then search for papers using that ID.
242307
""",
243308
"desc_restrictions": """
244309
- You cant name agents
@@ -275,6 +340,23 @@
275340
["Generate 5 molecules related to MEK1", "Generate 3 molecules using the GSK model"]
276341
]
277342
}
343+
344+
Example 4 (author search):
345+
Request: "Find papers by author 'Jane Q. Researcher' about quantum dots"
346+
Response: {
347+
"steps": [
348+
["Search OpenAlex for author ID for 'Jane Q. Researcher'"],
349+
["Search OpenAlex for papers by the found author ID about quantum dots"]
350+
]
351+
}
352+
353+
Example 5 (topic search):
354+
Request: "Download papers about CRISPR CAS"
355+
Response: {
356+
"steps": [
357+
["Search and download papers about CRISPR CAS"]
358+
]
359+
}
278360
""",
279361
"additional_hints": """
280362
- If multiple molecules, files, or entities are processed in the same way, group those actions together as parallel subtasks.
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
2+
import requests
3+
import re
4+
import os
5+
import time
6+
import base64
7+
import logging
8+
from typing import Dict, List, Any
9+
10+
from protollm.connectors import create_llm_connector, get_allowed_providers
11+
from langchain_core.messages import SystemMessage, HumanMessage
12+
from dotenv import load_dotenv
13+
from definitions import CONFIG_PATH
14+
15+
from ChemCoScientist.download_papers.prompt import OPENALEX_QUERY_PROMPT
16+
17+
# NOTE(review): calling basicConfig at import time configures the root logger
# for the whole application; consider moving this to the program entry point.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load environment configuration from the project config file.
# Any of these may be None when unset — presumably deployment guarantees
# VISION_LLM_URL and DOWNLOADED_PAPERS_PATH are defined; TODO confirm.
load_dotenv(CONFIG_PATH)
VISION_LLM_URL = os.environ.get("VISION_LLM_URL")
DOWNLOADED_PAPERS_PATH = os.environ.get("DOWNLOADED_PAPERS_PATH")
OPENALEX_API_KEY = os.environ.get("OPENALEX_API_KEY")
24+
25+
26+
def sanitize_filename(name: str) -> str:
    """Strip characters that are not allowed in filenames on common platforms."""
    forbidden = '\\/*?:"<>|'
    return "".join(ch for ch in name if ch not in forbidden)
29+
30+
31+
def request_with_retry(
    url: str,
    max_retries: int = 3,
    timeout: int = 30
) -> requests.Response:
    """Make an HTTP GET request with automatic retry logic for rate limits and server errors.

    Args:
        url: Target URL.
        max_retries: Maximum number of attempts before giving up.
        timeout: Per-request timeout in seconds.

    Returns:
        The successful (HTTP 200) response.

    Raises:
        requests.HTTPError: On a non-retryable client error status.
        requests.exceptions.Timeout: If the final attempt times out.
        Exception: When all retries are exhausted without a 200 response.
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                return response
            elif response.status_code in (403, 429):
                # Rate limited (429 is the standard status; OpenAlex has also
                # been seen returning 403). Exponential backoff: 1s, 2s, 4s.
                time.sleep(2 ** attempt)
            elif response.status_code >= 500:
                # Transient server error — same exponential backoff.
                time.sleep(2 ** attempt)
            else:
                # Other client errors are not retryable.
                response.raise_for_status()
        except requests.exceptions.Timeout:
            if attempt < max_retries - 1:
                logger.info("Retrying... Attempt %d", attempt + 2)
                time.sleep(2 ** attempt)
            else:
                raise
    # Include the URL so exhausted-retry failures are diagnosable from logs.
    raise Exception(f"Failed to fetch {url} after {max_retries} retries")
60+
61+
62+
def download_from_openalex(pdf_url: str, paper_title: str) -> str:
    """Download a PDF from a given URL and save it with a sanitized paper title.

    Args:
        pdf_url: Direct URL of the PDF to fetch.
        paper_title: Paper title used (after sanitization) as the filename.

    Returns:
        The path of the saved PDF file.
    """
    response = request_with_retry(pdf_url)
    # Ensure the target directory exists; a fresh deployment may not have it yet.
    os.makedirs(DOWNLOADED_PAPERS_PATH, exist_ok=True)
    filepath = os.path.join(DOWNLOADED_PAPERS_PATH, f"{sanitize_filename(paper_title)}.pdf")
    with open(filepath, "wb") as f:
        f.write(response.content)
    logger.info("Downloaded: %s", filepath)
    return filepath
70+
71+
72+
def generate_openalex_url(query: str) -> str:
    """Use an LLM to generate the appropriate API request URL for OpenAlex.

    Args:
        query: The user's natural-language search query.

    Returns:
        The URL string produced by the model (raw message content). The previous
        annotation claimed ``Dict[str, Any]``, but ``res.content`` is a string
        and callers treat it as one.
    """
    llm = create_llm_connector(VISION_LLM_URL, extra_body={"provider": {"only": get_allowed_providers()}})

    content = [{"type": "text", "text": f"USER QUESTION: {query}"}]

    messages = [
        SystemMessage(content=OPENALEX_QUERY_PROMPT),
        HumanMessage(content=content)
    ]

    res = llm.invoke(messages)
    return res.content
85+
86+
87+
def download_papers(task: str) -> Dict[str, Any]:
    """Search for papers matching a task query and download their PDFs using OpenAlex.

    Args:
        task: Natural-language query describing the papers or entity to find.

    Returns:
        A dict with a human-readable 'answer' and, when applicable, 'metadata'
        holding either 'papers' (list of downloaded file paths) or 'id'
        (a resolved OpenAlex entity ID). Never returns None.
    """
    # Keep the generated search URL in its own variable — the download loop
    # below builds per-paper PDF URLs and must not clobber it.
    search_url = generate_openalex_url(task)
    logger.info("Generated OpenAlex API request URL: %s", search_url)
    response = request_with_retry(search_url)
    results = response.json().get("results", [])
    if not results:
        return {'answer': 'No papers found for the given query.'}

    if "works" in search_url:
        logger.info("Downloading PDFs...")
        downloaded_paths = []
        titles = []
        for work in results:
            # Only results that actually expose a direct PDF link are downloadable.
            pdf_url = (work.get("content_urls") or {}).get("pdf")
            if not pdf_url:
                continue
            title = work["title"]
            titles.append(title)
            downloaded_paths.append(
                download_from_openalex(f"{pdf_url}?api_key={OPENALEX_API_KEY}", title)
            )
        if downloaded_paths:
            return {'answer': f'Papers were successfully downloaded: {", ".join(titles)}.',
                    'metadata': {'papers': downloaded_paths}}
        return {'answer': 'Papers were found, but none of them provide a downloadable PDF.'}

    if "authors" in search_url or "sources" in search_url or "institutions" in search_url:
        entity_id = results[0]["id"]
        # Expose the resolved ID in metadata, as documented for the two-step
        # entity-resolution flow used by the planner.
        return {'answer': f'Entity ID: {entity_id}', 'metadata': {'id': entity_id}}

    # Previously this fell through and implicitly returned None, which made the
    # calling agent crash on result["answer"] and burn its retries.
    return {'answer': 'The generated OpenAlex request was not recognized; no papers were downloaded.'}
111+
112+
# Ad-hoc manual smoke test; runs only when the module is executed directly,
# never on import. Requires network access and valid environment config.
if __name__ == "__main__":
    result = download_papers("find papers by Yann LeCun")
    print(result)

0 commit comments

Comments
 (0)