Skip to content

Commit 988bb98

Browse files
authored
Papers search agent (#184)
* added download papers agent * removed tool, connected create_dataset and search_papers agents * remove paperscraper * added downloaded papers to frontend * added authors/journals/institutions search * fix in conf, fix in frontend, removed prints * fix papers download * minor fixes * minor fix
1 parent 6da594e commit 988bb98

File tree

12 files changed

+1361
-42
lines changed

12 files changed

+1361
-42
lines changed

ChemCoScientist/agents/agents.py

Lines changed: 67 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import os
33
import time
44
import json
5+
import logging
56
from typing import Annotated
67
import operator
78
import streamlit as st
@@ -21,10 +22,13 @@
2122
)
2223
from ChemCoScientist.tools import chem_tools, nanoparticle_tools, paper_analysis_tools, data_tools, chem_ocr_tools
2324
from ChemCoScientist.tools.ml_tools import agents_tools as automl_tools
25+
from ChemCoScientist.download_papers.functions import download_papers
2426

2527
from ChemCoScientist.agents.agents_prompts import paper_agent_prompt, coder_prompt
2628
from definitions import ROOT_DIR
2729

30+
logger = logging.getLogger(__name__)
31+
2832

2933
def get_all_files(directory: str):
3034
"""
@@ -344,11 +348,11 @@ def paper_analysis_agent(state: dict, config: dict) -> Command:
344348
Command: An object containing the next step in the process ('replan' or `END`) and
345349
updates to the state, including recorded steps, responses, and extracted metadata.
346350
"""
347-
print("--------------------------------")
348-
print("Paper agent called")
349-
print(f"Current task: {state['task']}")
350-
print(f"Current input: {state['input']}")
351-
print("--------------------------------")
351+
logger.info("--------------------------------")
352+
logger.info("Paper agent called")
353+
logger.info(f"Current task: {state['task']}")
354+
logger.info(f"Current input: {state['input']}")
355+
logger.info("--------------------------------")
352356

353357
llm: BaseChatModel = config["configurable"]["llm"]
354358

@@ -385,7 +389,7 @@ def paper_analysis_agent(state: dict, config: dict) -> Command:
385389
"metadata": Annotated[dict, operator.or_](updated_metadata),
386390
})
387391
except Exception as e:
388-
print(f"Paper analysis agent error: {str(e)}. Retrying ({attempt + 1}/3)")
392+
logger.error(f"Paper analysis agent error: {str(e)}. Retrying ({attempt + 1}/3)")
389393
time.sleep(1.2 ** attempt)
390394

391395
return Command(goto=END, update={
@@ -459,3 +463,60 @@ def chem_ocr_agent(state: dict, config: dict) -> Command:
459463
"response": "I cannot extract molecules or reactions right now."
460464
"Can I help with something else?"
461465
})
466+
467+
468+
def papers_search_agent(state: dict, config: dict) -> Command:
    """
    Searches for entity IDs or scientific papers based on user query and downloads papers' PDFs.

    This agent utilizes the OpenAlex API to find and download
    PDFs of scientific papers relevant to the user's specified topic or query.

    Args:
        state (dict): The current state of the interaction, including the user's task.
        config (dict): Configuration settings, including the language model to use.

    Returns:
        Command: An object containing the next step in the process ('replan' or `END`) and
        updates to the state, including recorded steps, responses, and extracted metadata.
    """
    logger.info("--------------------------------")
    logger.info("Papers search and download agent called")
    logger.info("Current task: %s", state["task"])
    logger.info("Current input: %s", state["input"])
    logger.info("--------------------------------")

    task = state["task"]

    for attempt in range(3):
        try:
            result = download_papers(task)

            # Deterministic serialization so identical answers dedupe inside sets.
            answer_serialized = json.dumps(result["answer"], sort_keys=True)

            updated_metadata = state.get("metadata", {}).copy()
            downloaded = result.get("metadata")
            if downloaded:
                if "downloaded_papers" in updated_metadata:
                    # Merge into a NEW dict: the shallow .copy() above still shares
                    # nested dicts with the original state, so updating in place
                    # would mutate state["metadata"] behind the graph's back.
                    updated_metadata["downloaded_papers"] = {
                        **updated_metadata["downloaded_papers"],
                        **downloaded,
                    }
                else:
                    updated_metadata["downloaded_papers"] = downloaded

            return Command(update={
                "past_steps": Annotated[set, operator.or_](set([
                    (task, answer_serialized)
                ])),
                "nodes_calls": Annotated[set, operator.or_](set([
                    ("papers_search_agent", (("text", answer_serialized),))
                ])),
                "metadata": Annotated[dict, operator.or_](updated_metadata),
            })
        except Exception as e:
            logger.error("Papers search agent error: %s. Retrying (%d/3)", e, attempt + 1)
            time.sleep(1.2 ** attempt)

    # Fixed: the two implicitly-concatenated literals previously produced
    # "...right now.Can I help..." with no separating space.
    return Command(goto=END, update={
        "response": "I cannot download papers right now. "
                    "Can I help with something else?"
    })
522+

ChemCoScientist/agents/agents_prompts.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,4 +114,8 @@
114114
You must detect and output every plausible chemical structure present in the image, even if the image is low-quality,
115115
sketchy, partial, or ambiguous. When uncertain, infer the most likely structure based on visible atoms, bonds, and geometry.
116116
Never return ‘no molecules detected’—instead describe all candidate structures with confidence scores.
117+
"""
118+
119+
papers_search_prompt = """
120+
You are a helpful assistant. You search for papers in OpenAlex based on a user query and download papers' PDFs.
117121
"""

ChemCoScientist/conf/create_conf.py

Lines changed: 85 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
nanoparticle_node,
1616
paper_analysis_agent,
1717
coder_agent,
18-
chem_ocr_agent
18+
chem_ocr_agent,
19+
papers_search_agent
1920
)
2021
#from CoScientist.scientific_agents.agents import coder_agent
2122
from ChemCoScientist.tools import chem_tools_rendered, nano_tools_rendered, tools_rendered, data_tools_rendered, \
@@ -64,6 +65,12 @@
6465
6566
Failure handling:
6667
If no relevant papers are found, state "no match in database" and still run "web_search".
68+
69+
Special behavior for dataset creation:
70+
- If 'create_dataset_from_papers' is requested but no papers are uploaded:
71+
1) Augment the user query to search for relevant papers.
72+
2) Automatically invoke 'papers_search_agent' to find relevant papers with augmented query.
73+
3) After successful download, retry dataset creation with downloaded papers.
6774
"""
6875

6976
web_search_description = """
@@ -131,13 +138,66 @@
131138
"""
132139

133140

141+
papers_search_agent_description = """
142+
Agent name: papers_search_agent
143+
144+
Purpose:
145+
Search OpenAlex for relevant scientific papers, download their PDF files, and return
146+
download metadata for downstream processing.
147+
148+
When to activate:
149+
- User requests finding or downloading papers for a given research topic, author,
150+
journal, or institution.
151+
152+
Procedure (implementation details):
153+
1) Use an LLM (via the configured `VISION_LLM_URL`) to generate the appropriate
154+
OpenAlex API request URL for the user's query.
155+
2) Call OpenAlex (with retry logic) and inspect the returned `results`.
156+
3) For each result containing a `content_urls.pdf`, download the PDF and save it to
157+
the configured `DOWNLOADED_PAPERS_PATH` using a sanitized filename.
158+
4) Return a structured response containing a human-readable `answer` and `metadata`.
159+
When PDFs were downloaded, `metadata.papers` contains the list of saved file paths.
160+
For queries that resolve to an entity (author/source/institution), the agent may
161+
return an `id` in `metadata` instead of (or in addition to) downloaded files.
162+
163+
Two-step / entity-ID flow:
164+
- The agent can be used in a two-step pattern for author/journal/institution queries:
165+
1) First call the agent to resolve the target entity to an OpenAlex ID (the agent
166+
will return `metadata.id` when it detects an entity-resolution response).
167+
2) Then call the agent again (or include the resolved ID in the original query)
168+
to search for and download papers associated with that entity. This two-step
169+
approach is supported by the implementation and recommended for precise author or
170+
source-based searches.
171+
172+
Notes and constraints:
173+
- The agent builds the OpenAlex request via an LLM and then performs the HTTP calls
174+
directly; network retry/backoff logic is applied for robustness.
175+
- The agent downloads PDFs listed in `content_urls.pdf` from OpenAlex results and
176+
saves them locally; it does not attempt to bypass paywalls beyond what OpenAlex
177+
exposes in `content_urls`.
178+
179+
Inputs:
180+
- user_query: str
181+
182+
Outputs:
183+
- A dict with an `answer` string and optional `metadata` dict. When downloads occur,
184+
`metadata.papers` is a list of downloaded file paths; when an entity ID is resolved,
185+
`metadata.id` is provided.
186+
187+
Failure handling:
188+
- If no papers are found or downloads fail, the agent returns an explanatory `answer`
189+
and an empty or absent `metadata.papers`.
190+
"""
191+
192+
134193
additional_agents_description = (
135194
automl_agent_description
136195
+ dataset_builder_agent_description
137196
+ coder_agent_description
138197
+ paper_analysis_agent_description
139198
+ web_search_description
140199
+ chem_ocr_agent_description
200+
+ papers_search_agent_description
141201
)
142202

143203
conf = {
@@ -162,7 +222,8 @@
162222
"coder_agent",
163223
"paper_analysis_agent",
164224
"web_search",
165-
"chem_ocr_agent"
225+
"chem_ocr_agent",
226+
"papers_search_agent"
166227
],
167228
# nodes for scenario agents
168229
"scenario_agent_funcs": {
@@ -173,7 +234,8 @@
173234
"coder_agent": coder_agent,
174235
"paper_analysis_agent": paper_analysis_agent,
175236
"web_search": web_search_node,
176-
"chem_ocr_agent": chem_ocr_agent
237+
"chem_ocr_agent": chem_ocr_agent,
238+
"papers_search_agent": papers_search_agent
177239
},
178240
# description for agents tools - if using langchain @tool
179241
# or description of agent capabilities in free format
@@ -239,6 +301,9 @@
239301
7. You must include all information you see in user prompt to your plan
240302
8. If you get a general question about chemistry first call paper_analysis_agent. Use web search
241303
only if paper_analysis_agent has no answer.
304+
9. If you get a query to find or download papers, use papers_search_agent:
305+
- For topic-based searches (e.g., "Download papers about CRISPR CAS"), directly search for papers using that topic.
306+
- For author, journal, or institution searches, create two sequential subtasks: first resolve the entity's OpenAlex ID, then search for papers using that ID.
242307
""",
243308
"desc_restrictions": """
244309
- You cant name agents
@@ -275,6 +340,23 @@
275340
["Generate 5 molecules related to MEK1", "Generate 3 molecules using the GSK model"]
276341
]
277342
}
343+
344+
Example 4 (author search):
345+
Request: "Find papers by author 'Jane Q. Researcher' about quantum dots"
346+
Response: {
347+
"steps": [
348+
["Search OpenAlex for author ID for 'Jane Q. Researcher'"],
349+
["Search OpenAlex for papers by the found author ID about quantum dots"]
350+
]
351+
}
352+
353+
Example 5 (topic search):
354+
Request: "Download papers about CRISPR CAS"
355+
Response: {
356+
"steps": [
357+
["Search and download papers about CRISPR CAS"]
358+
]
359+
}
278360
""",
279361
"additional_hints": """
280362
- If multiple molecules, files, or entities are processed in the same way, group those actions together as parallel subtasks.
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
2+
import requests
3+
import re
4+
import os
5+
import time
6+
import base64
7+
import logging
8+
from typing import Dict, List, Any
9+
10+
from protollm.connectors import create_llm_connector, get_allowed_providers
11+
from langchain_core.messages import SystemMessage, HumanMessage
12+
from dotenv import load_dotenv
13+
from definitions import CONFIG_PATH
14+
15+
from ChemCoScientist.download_papers.prompt import OPENALEX_QUERY_PROMPT
16+
17+
# NOTE(review): calling basicConfig at import time configures the root logger
# for the whole application; consider moving this to the program entry point.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load environment configuration from the project config file.
# Any of these may be None when unset — presumably deployment guarantees
# VISION_LLM_URL and DOWNLOADED_PAPERS_PATH are defined; TODO confirm.
load_dotenv(CONFIG_PATH)
VISION_LLM_URL = os.environ.get("VISION_LLM_URL")
DOWNLOADED_PAPERS_PATH = os.environ.get("DOWNLOADED_PAPERS_PATH")
OPENALEX_API_KEY = os.environ.get("OPENALEX_API_KEY")
24+
25+
26+
def sanitize_filename(name: str) -> str:
    """Strip characters that are not allowed in filenames on common platforms."""
    forbidden = '\\/*?:"<>|'
    return "".join(ch for ch in name if ch not in forbidden)
29+
30+
31+
def request_with_retry(
    url: str,
    max_retries: int = 3,
    timeout: int = 30
) -> requests.Response:
    """Make an HTTP GET request with automatic retry logic for rate limits and server errors.

    Args:
        url: Target URL.
        max_retries: Maximum number of attempts before giving up.
        timeout: Per-request timeout in seconds.

    Returns:
        The successful (HTTP 200) response.

    Raises:
        requests.HTTPError: On a non-retryable client error status.
        requests.exceptions.Timeout: If the final attempt times out.
        Exception: When all retries are exhausted without a 200 response.
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                return response
            elif response.status_code in (403, 429):
                # Rate limited (429 is the standard status; OpenAlex has also
                # been seen returning 403). Exponential backoff: 1s, 2s, 4s.
                time.sleep(2 ** attempt)
            elif response.status_code >= 500:
                # Transient server error — same exponential backoff.
                time.sleep(2 ** attempt)
            else:
                # Other client errors are not retryable.
                response.raise_for_status()
        except requests.exceptions.Timeout:
            if attempt < max_retries - 1:
                logger.info("Retrying... Attempt %d", attempt + 2)
                time.sleep(2 ** attempt)
            else:
                raise
    # Include the URL so exhausted-retry failures are diagnosable from logs.
    raise Exception(f"Failed to fetch {url} after {max_retries} retries")
60+
61+
62+
def download_from_openalex(pdf_url: str, paper_title: str) -> str:
    """Download a PDF from a given URL and save it with a sanitized paper title.

    Args:
        pdf_url: Direct URL of the PDF to fetch.
        paper_title: Paper title used (after sanitization) as the filename.

    Returns:
        The path of the saved PDF file.
    """
    response = request_with_retry(pdf_url)
    # Ensure the target directory exists; a fresh deployment may not have it yet.
    os.makedirs(DOWNLOADED_PAPERS_PATH, exist_ok=True)
    filepath = os.path.join(DOWNLOADED_PAPERS_PATH, f"{sanitize_filename(paper_title)}.pdf")
    with open(filepath, "wb") as f:
        f.write(response.content)
    logger.info("Downloaded: %s", filepath)
    return filepath
70+
71+
72+
def generate_openalex_url(query: str) -> str:
    """Use an LLM to generate the appropriate API request URL for OpenAlex.

    Args:
        query: The user's natural-language search query.

    Returns:
        The URL string produced by the model (raw message content). The previous
        annotation claimed ``Dict[str, Any]``, but ``res.content`` is a string
        and callers treat it as one.
    """
    llm = create_llm_connector(VISION_LLM_URL, extra_body={"provider": {"only": get_allowed_providers()}})

    content = [{"type": "text", "text": f"USER QUESTION: {query}"}]

    messages = [
        SystemMessage(content=OPENALEX_QUERY_PROMPT),
        HumanMessage(content=content)
    ]

    res = llm.invoke(messages)
    return res.content
85+
86+
87+
def download_papers(task: str) -> Dict[str, Any]:
    """Search for papers matching a task query and download their PDFs using OpenAlex.

    Args:
        task: Natural-language query describing the papers or entity to find.

    Returns:
        A dict with a human-readable 'answer' and, when applicable, 'metadata'
        holding either 'papers' (list of downloaded file paths) or 'id'
        (a resolved OpenAlex entity ID). Never returns None.
    """
    # Keep the generated search URL in its own variable — the download loop
    # below builds per-paper PDF URLs and must not clobber it.
    search_url = generate_openalex_url(task)
    logger.info("Generated OpenAlex API request URL: %s", search_url)
    response = request_with_retry(search_url)
    results = response.json().get("results", [])
    if not results:
        return {'answer': 'No papers found for the given query.'}

    if "works" in search_url:
        logger.info("Downloading PDFs...")
        downloaded_paths = []
        titles = []
        for work in results:
            # Only results that actually expose a direct PDF link are downloadable.
            pdf_url = (work.get("content_urls") or {}).get("pdf")
            if not pdf_url:
                continue
            title = work["title"]
            titles.append(title)
            downloaded_paths.append(
                download_from_openalex(f"{pdf_url}?api_key={OPENALEX_API_KEY}", title)
            )
        if downloaded_paths:
            return {'answer': f'Papers were successfully downloaded: {", ".join(titles)}.',
                    'metadata': {'papers': downloaded_paths}}
        return {'answer': 'Papers were found, but none of them provide a downloadable PDF.'}

    if "authors" in search_url or "sources" in search_url or "institutions" in search_url:
        entity_id = results[0]["id"]
        # Expose the resolved ID in metadata, as documented for the two-step
        # entity-resolution flow used by the planner.
        return {'answer': f'Entity ID: {entity_id}', 'metadata': {'id': entity_id}}

    # Previously this fell through and implicitly returned None, which made the
    # calling agent crash on result["answer"] and burn its retries.
    return {'answer': 'The generated OpenAlex request was not recognized; no papers were downloaded.'}
111+
112+
# Ad-hoc manual smoke test; runs only when the module is executed directly,
# never on import. Requires network access and valid environment config.
if __name__ == "__main__":
    result = download_papers("find papers by Yann LeCun")
    print(result)

0 commit comments

Comments
 (0)