Skip to content
This repository was archived by the owner on Mar 10, 2026. It is now read-only.

Commit 858f131

Browse files
authored
Merge pull request #65 from MDverse/improve-dataset-search-in-figshare
Improve dataset search in figshare
2 parents 87e985c + 0f4b96d commit 858f131

File tree

4 files changed

+68
-8
lines changed

4 files changed

+68
-8
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,4 @@ repos:
3333
rev: '1.9.2'
3434
hooks:
3535
- id: bandit
36+
args: ["--exclude", "tests"]

docs/figshare.md

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@ So we don't expect files to have an individual size above 20 GB.
1919

2020
Figshare requires a token to access its API: [How to get a personal token](https://info.figshare.com/user-guide/how-to-get-a-personal-token/)
2121

22-
### URL
22+
### Base URL
2323

24-
https://api.figshare.com/v2/
24+
<https://api.figshare.com/v2/>
2525

2626
### Query
2727

@@ -31,19 +31,21 @@ https://api.figshare.com/v2/
3131

3232
> We do not have automatic rate limiting in place for API requests. However, we do carry out monitoring to detect and mitigate abuse and prevent the platform's resources from being overused. We recommend that clients use the API responsibly and do not make more than one request per second. We reserve the right to throttle or block requests if we detect abuse.
3333
34-
Source: https://docs.figshare.com/#figshare_documentation_api_description_rate_limiting
34+
Source: <https://docs.figshare.com/#figshare_documentation_api_description_rate_limiting>
3535

3636
## Datasets
3737

3838
### Search for MD-related datasets
3939

4040
- Endpoint: `/articles/search`
41+
- HTTP method: POST
4142
- Documentation: <https://docs.figshare.com/#articles_search>
43+
- [Documentation](https://docs.figshare.com/#search_search_operators) for search operators and searchable attributes
4244

43-
We seach MD-related datasets by searching for file types and keywords if necessary. Keywords are searched into `:title:`, `:description:` and `:keywords:` text fields. Example queries:
45+
We search MD-related datasets by searching for file types and keywords if necessary. Keywords are searched into `:title:`, `:description:` and `:keywords:` text fields. Search query examples:
4446

4547
```none
46-
resource_type.type:"dataset" AND filetype:"tpr"
48+
:extension: tpr
4749
```
4850

4951
or
@@ -53,6 +55,21 @@ or
5355
:extension: mdp AND (:title: 'gromacs' OR :description: 'gromacs' OR :keyword: 'gromacs')
5456
```
5557

58+
The POST parameters look like this:
59+
60+
```json
61+
{
62+
"order": "published_date",
63+
"search_for": ":extension: xtc",
64+
"page": 1,
65+
"page_size": 10,
66+
"order_direction": "desc",
67+
"item_type": 3
68+
}
69+
```
70+
71+
We search only for datasets by providing the parameter `"item_type": 3`.
72+
5673
Example datasets:
5774

5875
- [Molecular dynamics of DSB in nucleosome](https://figshare.com/articles/dataset/M1_gro/5840706)
@@ -66,6 +83,7 @@ We search for all file types and keywords. Results are paginated by batch of 100
6683
### Get metadata for a given dataset
6784

6885
- Endpoint: `/articles/{dataset_id}`
86+
- HTTP method: GET
6987
- Documentation: <https://docs.figshare.com/#public_article>
7088

7189
Example dataset "[Molecular dynamics of DSB in nucleosome](https://figshare.com/articles/dataset/M1_gro/5840706)":

src/mdverse_scrapers/core/network.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import loguru
1111
import pycurl
1212
from selenium import webdriver
13-
from selenium.common.exceptions import WebDriverException
13+
from selenium.common.exceptions import TimeoutException, WebDriverException
1414
from selenium.webdriver.chrome.options import Options
1515
from selenium.webdriver.common.by import By
1616
from selenium.webdriver.support import expected_conditions as ec
@@ -293,15 +293,18 @@ def get_html_page_with_selenium(
293293
driver = webdriver.Chrome(options=options)
294294
driver.get(url)
295295
page_content = (
296-
WebDriverWait(driver, 10)
296+
WebDriverWait(driver, 5)
297297
.until(ec.visibility_of_element_located((By.CSS_SELECTOR, tag)))
298298
.text
299299
)
300300
driver.quit()
301+
except TimeoutException:
302+
logger.error("Timeout while retrieving page:")
303+
logger.error(url)
301304
except WebDriverException as e:
302305
logger.error("Cannot retrieve page:")
303306
logger.error(url)
304-
logger.error(f"Selenium error: {e}")
307+
logger.debug(f"Selenium error: {e}")
305308
return None
306309
if not page_content:
307310
logger.error("Retrieved page content is empty.")

tests/core/test_network.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
"""Tests for the network module."""
22

3+
import json
4+
5+
import mdverse_scrapers.core.network as network
36
import mdverse_scrapers.core.toolbox as toolbox
7+
from mdverse_scrapers.core.logger import create_logger
48

59

610
def test_make_http_get_request_with_retries_200():
@@ -38,3 +42,37 @@ def test_make_http_get_request_with_retries_404():
3842
max_attempts=1,
3943
)
4044
assert response is None
45+
46+
47+
def test_get_html_page_with_selenium_good_url():
48+
"""Test the get_html_page_with_selenium function with a bad URL."""
49+
url = "https://figshare.com/ndownloader/files/21988230/preview/21988230/structure.json"
50+
expected_json = {
51+
"files": [],
52+
"path": "ROOT",
53+
"dirs": [
54+
{
55+
"files": [
56+
{"path": "NIPAM-FF1.3x/NIPAM-64-wat-ch-1.3.top"},
57+
{"path": "NIPAM-FF1.3x/NIPAM-64-wat.gro"},
58+
{"path": "NIPAM-FF1.3x/md.mdp"},
59+
{"path": "NIPAM-FF1.3x/NIPAM-ch-1.3.itp"},
60+
],
61+
"path": "NIPAM-FF1.3x",
62+
"dirs": [],
63+
}
64+
],
65+
}
66+
content = network.get_html_page_with_selenium(url=url, tag="pre")
67+
assert json.loads(content) == expected_json
68+
69+
70+
def test_get_html_page_with_selenium_bad_url(capsys) -> None:
71+
"""Test the get_html_page_with_selenium function with a bad URL."""
72+
url = "https://figshare.com/ndownloader/files/28089615/preview/28089615/structure.json"
73+
content = network.get_html_page_with_selenium(
74+
url=url, tag="pre", logger=create_logger(level="DEBUG")
75+
)
76+
assert content is None
77+
captured = capsys.readouterr()
78+
assert "Timeout while retrieving page" in captured.out

0 commit comments

Comments
 (0)