Skip to content
This repository was archived by the owner on Mar 10, 2026. It is now read-only.

Commit 858f131

Browse files
authored
Merge pull request #65 from MDverse/improve-dataset-search-in-figshare
Improve dataset search in figshare
2 parents 87e985c + 0f4b96d commit 858f131

File tree

4 files changed

+68
-8
lines changed

4 files changed

+68
-8
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,4 @@ repos:
3333
rev: '1.9.2'
3434
hooks:
3535
- id: bandit
36+
args: ["--exclude", "tests"]

docs/figshare.md

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@ So we don't expect files to have an individual size above 20 GB.
1919

2020
Figshare requires a token to access its API: [How to get a personal token](https://info.figshare.com/user-guide/how-to-get-a-personal-token/)
2121

22-
### URL
22+
### Base URL
2323

24-
https://api.figshare.com/v2/
24+
<https://api.figshare.com/v2/>
2525

2626
### Query
2727

@@ -31,19 +31,21 @@ https://api.figshare.com/v2/
3131

3232
> We do not have automatic rate limiting in place for API requests. However, we do carry out monitoring to detect and mitigate abuse and prevent the platform's resources from being overused. We recommend that clients use the API responsibly and do not make more than one request per second. We reserve the right to throttle or block requests if we detect abuse.
3333
34-
Source: https://docs.figshare.com/#figshare_documentation_api_description_rate_limiting
34+
Source: <https://docs.figshare.com/#figshare_documentation_api_description_rate_limiting>
3535

3636
## Datasets
3737

3838
### Search for MD-related datasets
3939

4040
- Endpoint: `/articles/search`
41+
- HTTP method: POST
4142
- Documentation: <https://docs.figshare.com/#articles_search>
43+
- [Documentation](https://docs.figshare.com/#search_search_operators) for search operators and searchable attributes
4244

43-
We seach MD-related datasets by searching for file types and keywords if necessary. Keywords are searched into `:title:`, `:description:` and `:keywords:` text fields. Example queries:
45+
We search MD-related datasets by searching for file types and keywords if necessary. Keywords are searched into `:title:`, `:description:` and `:keywords:` text fields. Search query examples:
4446

4547
```none
46-
resource_type.type:"dataset" AND filetype:"tpr"
48+
:extension: tpr
4749
```
4850

4951
or
@@ -53,6 +55,21 @@ or
5355
:extension: mdp AND (:title: 'gromacs' OR :description: 'gromacs' OR :keyword: 'gromacs')
5456
```
5557

58+
The POST parameters look like this:
59+
60+
```json
61+
{
62+
"order": "published_date",
63+
"search_for": ":extension: xtc",
64+
"page": 1,
65+
"page_size": 10,
66+
"order_direction": "desc",
67+
"item_type": 3
68+
}
69+
```
70+
71+
We search only for datasets by providing the parameter `"item_type": 3`.
72+
5673
Example datasets:
5774

5875
- [Molecular dynamics of DSB in nucleosome](https://figshare.com/articles/dataset/M1_gro/5840706)
@@ -66,6 +83,7 @@ We search for all file types and keywords. Results are paginated by batch of 100
6683
### Get metadata for a given dataset
6784

6885
- Endpoint: `/articles/{dataset_id}`
86+
- HTTP method: GET
6987
- Documentation: <https://docs.figshare.com/#public_article>
7088

7189
Example dataset "[Molecular dynamics of DSB in nucleosome](https://figshare.com/articles/dataset/M1_gro/5840706)":

src/mdverse_scrapers/core/network.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import loguru
1111
import pycurl
1212
from selenium import webdriver
13-
from selenium.common.exceptions import WebDriverException
13+
from selenium.common.exceptions import TimeoutException, WebDriverException
1414
from selenium.webdriver.chrome.options import Options
1515
from selenium.webdriver.common.by import By
1616
from selenium.webdriver.support import expected_conditions as ec
@@ -293,15 +293,18 @@ def get_html_page_with_selenium(
293293
driver = webdriver.Chrome(options=options)
294294
driver.get(url)
295295
page_content = (
296-
WebDriverWait(driver, 10)
296+
WebDriverWait(driver, 5)
297297
.until(ec.visibility_of_element_located((By.CSS_SELECTOR, tag)))
298298
.text
299299
)
300300
driver.quit()
301+
except TimeoutException:
302+
logger.error("Timeout while retrieving page:")
303+
logger.error(url)
301304
except WebDriverException as e:
302305
logger.error("Cannot retrieve page:")
303306
logger.error(url)
304-
logger.error(f"Selenium error: {e}")
307+
logger.debug(f"Selenium error: {e}")
305308
return None
306309
if not page_content:
307310
logger.error("Retrieved page content is empty.")

tests/core/test_network.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
"""Tests for the network module."""
22

3+
import json
4+
5+
import mdverse_scrapers.core.network as network
36
import mdverse_scrapers.core.toolbox as toolbox
7+
from mdverse_scrapers.core.logger import create_logger
48

59

610
def test_make_http_get_request_with_retries_200():
@@ -38,3 +42,37 @@ def test_make_http_get_request_with_retries_404():
3842
max_attempts=1,
3943
)
4044
assert response is None
45+
46+
47+
def test_get_html_page_with_selenium_good_url():
48+
"""Test the get_html_page_with_selenium function with a bad URL."""
49+
url = "https://figshare.com/ndownloader/files/21988230/preview/21988230/structure.json"
50+
expected_json = {
51+
"files": [],
52+
"path": "ROOT",
53+
"dirs": [
54+
{
55+
"files": [
56+
{"path": "NIPAM-FF1.3x/NIPAM-64-wat-ch-1.3.top"},
57+
{"path": "NIPAM-FF1.3x/NIPAM-64-wat.gro"},
58+
{"path": "NIPAM-FF1.3x/md.mdp"},
59+
{"path": "NIPAM-FF1.3x/NIPAM-ch-1.3.itp"},
60+
],
61+
"path": "NIPAM-FF1.3x",
62+
"dirs": [],
63+
}
64+
],
65+
}
66+
content = network.get_html_page_with_selenium(url=url, tag="pre")
67+
assert json.loads(content) == expected_json
68+
69+
70+
def test_get_html_page_with_selenium_bad_url(capsys) -> None:
71+
"""Test the get_html_page_with_selenium function with a bad URL."""
72+
url = "https://figshare.com/ndownloader/files/28089615/preview/28089615/structure.json"
73+
content = network.get_html_page_with_selenium(
74+
url=url, tag="pre", logger=create_logger(level="DEBUG")
75+
)
76+
assert content is None
77+
captured = capsys.readouterr()
78+
assert "Timeout while retrieving page" in captured.out

0 commit comments

Comments
 (0)