From 85904f85136be4e3a01135a6852370a52d221603 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Tue, 20 Jan 2026 18:21:23 +0100 Subject: [PATCH 1/6] fix: Bypass zip file content browsing with Selenium --- pyproject.toml | 1 + src/mdverse_scrapers/core/network.py | 153 ++++++++++++++++++++ src/mdverse_scrapers/scrapers/figshare.py | 27 ++-- uv.lock | 162 ++++++++++++++++++++++ 4 files changed, 327 insertions(+), 16 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 69c1405..5620d55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ dependencies = [ "pyyaml>=6.0.2", "requests>=2.32.3", "scipy>=1.15.2", + "selenium>=4.40.0", ] [dependency-groups] diff --git a/src/mdverse_scrapers/core/network.py b/src/mdverse_scrapers/core/network.py index 21f1ce3..fa7aea1 100644 --- a/src/mdverse_scrapers/core/network.py +++ b/src/mdverse_scrapers/core/network.py @@ -1,10 +1,20 @@ """Common functions and network utilities.""" +import json import time from enum import StrEnum +from io import BytesIO import httpx import loguru +import certifi +import pycurl +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.common.exceptions import WebDriverException +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.common.by import By +from selenium.webdriver.support import expected_conditions as EC class HttpMethod(StrEnum): @@ -148,3 +158,146 @@ def make_http_request_with_retries( else: logger.info("Retrying...") return None + + +def parse_response_headers(headers_bytes: bytes) -> dict[str, str]: + """Parse HTTP response header from bytes to a dictionary. + + Returns + ------- + dict + A dictionary of HTTP response headers. 
+ """ + headers = {} + headers_text = headers_bytes.decode("utf-8") + for line in headers_text.split("\r\n"): + if ": " in line: + key, value = line.split(": ", maxsplit=1) + headers[key] = value + return headers + + +def send_http_request_with_retries_pycurl( + url: str, + data: dict | None = None, + delay_before_request: float = 1.0, + logger: "loguru.Logger" = loguru.logger, + ) -> dict: + """Query the Figshare API and return the JSON response. + + Parameters + ---------- + url : str + URL to send the request to. + data : dict, optional + Data to send in the request body (for POST requests). + delay_before_request : float, optional + Time to wait before sending the request, in seconds. + + Returns + ------- + dict + A dictionary with the following keys: + - status_code: HTTP status code of the response. + - elapsed_time: Time taken to perform the request. + - headers: Dictionary of response headers. + - response: JSON response from the API. + """ + # First, we wait. + # https://docs.figshare.com/#figshare_documentation_api_description_rate_limiting + # "We recommend that clients use the API responsibly + # and do not make more than one request per second." + headers = { + "User-Agent": ( + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36" + ), + "Content-Type": "application/json", + } + time.sleep(delay_before_request) + results = {} + # Initialize a Curl object. + curl = pycurl.Curl() + # Set the URL to send the request to. + curl.setopt(curl.URL, url) + # Add headers as a list of strings. + headers_lst = [f"{key}: {value}" for key, value in headers.items()] + curl.setopt(curl.HTTPHEADER, headers_lst) + # Handle SSL certificates. + curl.setopt(curl.CAINFO, certifi.where()) + # Follow redirect. + curl.setopt(curl.FOLLOWLOCATION, True) + # If data is provided, set the request to POST and add the data. 
+ if data is not None: + curl.setopt(curl.POST, True) + data_json = json.dumps(data) + curl.setopt(curl.POSTFIELDS, data_json) + # Capture the response body in a buffer. + body_buffer = BytesIO() + curl.setopt(curl.WRITEFUNCTION, body_buffer.write) + # Capture the response headers in a buffer. + header_buffer = BytesIO() + curl.setopt(curl.HEADERFUNCTION, header_buffer.write) + # Perform the request. + curl.perform() + # Get the HTTP status code. + status_code = curl.getinfo(curl.RESPONSE_CODE) + results["status_code"] = status_code + # Get elapsed time. + elapsed_time = curl.getinfo(curl.TOTAL_TIME) + results["elapsed_time"] = elapsed_time + # Close the Curl object. + curl.close() + # Get the response headers from the buffer. + response_headers = parse_response_headers(header_buffer.getvalue()) + results["headers"] = response_headers + # Get the response body from the buffer. + response = body_buffer.getvalue() + # Convert the response body from bytes to a string. + response = response.decode("utf-8") + # Convert the response string to a JSON object. + try: + response = json.loads(response) + except json.JSONDecodeError: + logger.error("Error decoding JSON response:") + logger.error(response[:100]) + response = None + results["response"] = response + return results + + +def get_html_page_with_selenium(url: str, tag: str = "body", logger: "loguru.Logger" = loguru.logger) -> str | None: + """Get HTML page content using Selenium. + + Parameters + ---------- + url : str + URL of the web page to retrieve. + tag : str, optional + HTML tag to wait for before retrieving the page content (default is "body"). + + Returns + ------- + str | None + HTML content of the page, or None if an error occurs. 
+ """ + options = Options() + options.add_argument("--headless") + options.add_argument("--enable-javascript") + page_content = "" + logger.info("Retrieving page with Selenium:") + logger.info(url) + try: + driver = webdriver.Chrome(options=options) + driver.get(url) + page_content = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, tag))).text + driver.quit() + except WebDriverException as e: + logger.error("Cannot retrieve page:") + logger.error(url) + logger.error(f"Selenium error: {e}") + return None + if not page_content: + logger.error("Retrieved page content is empty.") + return None + return page_content diff --git a/src/mdverse_scrapers/scrapers/figshare.py b/src/mdverse_scrapers/scrapers/figshare.py index bbb13f7..197d4c9 100644 --- a/src/mdverse_scrapers/scrapers/figshare.py +++ b/src/mdverse_scrapers/scrapers/figshare.py @@ -1,4 +1,5 @@ """Scrape molecular dynamics datasets and files from Figshare.""" +from arrow import get import json import os @@ -14,6 +15,7 @@ from ..core.figshare_api import FigshareAPI from ..core.logger import create_logger +from ..core.network import get_html_page_with_selenium from ..core.toolbox import ( ContextManager, DataType, @@ -61,12 +63,13 @@ def extract_files_from_json_response( def extract_files_from_zip_file( - file_id: str, logger: "loguru.Logger" = loguru.logger, max_attempts: int = 3 -) -> list[str]: + file_id: str, logger: "loguru.Logger" = loguru.logger) -> list[str]: """Extract files from a zip file content. No endpoint is available in the Figshare API. We perform a direct HTTP GET request to the zip file content url. + We need to use the Selenium library to emulate a browser request + as direct requests fail with a 202 status code. Known issue with: https://figshare.com/ndownloader/files/31660220/preview/31660220/structure.json @@ -75,10 +78,8 @@ def extract_files_from_zip_file( ---------- file_id : str ID of the zip file to get content from. 
- logger : loguru.Logger + logger : "loguru.Logger" Logger object. - max_attempts : int - Maximum number of attempts to fetch the zip file content. Returns ------- @@ -90,23 +91,17 @@ def extract_files_from_zip_file( f"https://figshare.com/ndownloader/files/{file_id}" f"/preview/{file_id}/structure.json" ) - response = make_http_get_request_with_retries( - url=url, - logger=logger, - max_attempts=max_attempts, - timeout=30, - delay_before_request=2, - ) + response = get_html_page_with_selenium(url, tag="pre", logger=logger) if response is None: logger.warning("Cannot get zip file content.") return file_names # Extract file names from JSON response. try: - file_names = extract_files_from_json_response(response.json()) + file_names = extract_files_from_json_response(json.loads(response)) except (json.decoder.JSONDecodeError, ValueError) as exc: - logger.warning(f"Cannot extract files from JSON response: {exc}") - logger.debug(f"Status code: {response.status_code}") - logger.debug(response.text) + logger.warning(f"Cannot extract files from HTML response: {exc}") + logger.debug("Response content:") + logger.debug(response) logger.success(f"Found {len(file_names)} files.") return file_names diff --git a/uv.lock b/uv.lock index bdc8374..0c81436 100644 --- a/uv.lock +++ b/uv.lock @@ -102,6 +102,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d2/39/e7eaf1799466a4aef85b6a4fe7bd175ad2b1c6345066aa33f1f58d4b18d0/asttokens-3.0.1-py3-none-any.whl", hash = "sha256:15a3ebc0f43c2d0a50eeafea25e19046c68398e487b9f1f5b517f7c0f40f976a", size = 27047, upload-time = "2025-11-15T16:43:16.109Z" }, ] +[[package]] +name = "async-generator" +version = "1.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ce/b6/6fa6b3b598a03cba5e80f829e0dadbb49d7645f523d209b2fb7ea0bbb02a/async_generator-1.10.tar.gz", hash = "sha256:6ebb3d106c12920aaae42ccb6f787ef5eefdcdd166ea3d628fa8476abe712144", size = 29870, upload-time = 
"2018-08-01T03:36:21.69Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/71/52/39d20e03abd0ac9159c162ec24b93fbcaa111e8400308f2465432495ca2b/async_generator-1.10-py3-none-any.whl", hash = "sha256:01c7bf666359b4967d2cda0000cc2e4af16a0ae098cbffcb8472fb9e8ad6585b", size = 18857, upload-time = "2018-08-01T03:36:20.029Z" }, +] + [[package]] name = "async-lru" version = "2.0.5" @@ -1455,6 +1464,7 @@ dependencies = [ { name = "pyyaml" }, { name = "requests" }, { name = "scipy" }, + { name = "selenium" }, ] [package.dev-dependencies] @@ -1493,6 +1503,7 @@ requires-dist = [ { name = "pyyaml", specifier = ">=6.0.2" }, { name = "requests", specifier = ">=2.32.3" }, { name = "scipy", specifier = ">=1.15.2" }, + { name = "selenium", specifier = ">=4.40.0" }, ] [package.metadata.requires-dev] @@ -1586,6 +1597,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/f2/08ace4142eb281c12701fc3b93a10795e4d4dc7f753911d836675050f886/msgpack-1.1.2-cp314-cp314t-win_arm64.whl", hash = "sha256:d99ef64f349d5ec3293688e91486c5fdb925ed03807f64d98d205d2713c60b46", size = 70868, upload-time = "2025-10-08T09:15:44.959Z" }, ] +[[package]] +name = "mypy-extensions" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/6e/371856a3fb9d31ca8dac321cda606860fa4548858c0cc45d9d1d4ca2628b/mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558", size = 6343, upload-time = "2025-04-22T14:54:24.164Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963, upload-time = "2025-04-22T14:54:22.983Z" }, +] + [[package]] name = "narwhals" version = "2.15.0" @@ -1732,6 +1752,18 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/ad/0d/eca3d962f9eef265f01a8e0d20085c6dd1f443cbffc11b6dede81fd82356/numpy-2.4.1-cp314-cp314t-win_arm64.whl", hash = "sha256:6436cffb4f2bf26c974344439439c95e152c9a527013f26b3577be6c2ca64295", size = 10667121, upload-time = "2026-01-10T06:44:41.644Z" }, ] +[[package]] +name = "outcome" +version = "1.3.0.post0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/98/df/77698abfac98571e65ffeb0c1fba8ffd692ab8458d617a0eed7d9a8d38f2/outcome-1.3.0.post0.tar.gz", hash = "sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8", size = 21060, upload-time = "2023-10-26T04:26:04.361Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/55/8b/5ab7257531a5d830fc8000c476e63c935488d74609b50f9384a643ec0a62/outcome-1.3.0.post0-py2.py3-none-any.whl", hash = "sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b", size = 10692, upload-time = "2023-10-26T04:26:02.532Z" }, +] + [[package]] name = "packaging" version = "25.0" @@ -2244,6 +2276,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c4/db/6c3a0e150a38ea90ac215cc8cbf0fa1c57a7975c9f2d06eb7cb12a86c997/pysankeybeta-1.4.2-py3-none-any.whl", hash = "sha256:80720949ce05689101c2a247977e05a46b3eb91913cf73bd7ced54d5200f3dc0", size = 20792, upload-time = "2024-06-14T14:27:02.866Z" }, ] +[[package]] +name = "pysocks" +version = "1.7.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bd/11/293dd436aea955d45fc4e8a35b6ae7270f5b8e00b53cf6c024c83b657a11/PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0", size = 284429, upload-time = "2019-09-20T02:07:35.714Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8d/59/b4572118e098ac8e46e399a1dd0f2d85403ce8bbaad9ec79373ed6badaf9/PySocks-1.7.1-py3-none-any.whl", hash = 
"sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5", size = 16725, upload-time = "2019-09-20T02:06:22.938Z" }, +] + [[package]] name = "pytest" version = "9.0.2" @@ -2670,6 +2711,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/83/11/00d3c3dfc25ad54e731d91449895a79e4bf2384dc3ac01809010ba88f6d5/seaborn-0.13.2-py3-none-any.whl", hash = "sha256:636f8336facf092165e27924f223d3c62ca560b1f2bb5dff7ab7fad265361987", size = 294914, upload-time = "2024-01-25T13:21:49.598Z" }, ] +[[package]] +name = "selenium" +version = "4.40.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "trio" }, + { name = "trio-typing" }, + { name = "trio-websocket" }, + { name = "types-certifi" }, + { name = "types-urllib3" }, + { name = "typing-extensions" }, + { name = "urllib3", extra = ["socks"] }, + { name = "websocket-client" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/ef/a5727fa7b33d20d296322adf851b76072d8d3513e1b151969d3228437faf/selenium-4.40.0.tar.gz", hash = "sha256:a88f5905d88ad0b84991c2386ea39e2bbde6d6c334be38df5842318ba98eaa8c", size = 930444, upload-time = "2026-01-18T23:12:31.565Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9d/74/eb9d6540aca1911106fa0877b8e9ef24171bc18857937a6b0ffe0586c623/selenium-4.40.0-py3-none-any.whl", hash = "sha256:c8823fc02e2c771d9ad9a0cf899cee7de1a57a6697e3d0b91f67566129f2b729", size = 9608184, upload-time = "2026-01-18T23:12:29.435Z" }, +] + [[package]] name = "send2trash" version = "2.1.0" @@ -2697,6 +2758,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, ] +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { 
url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, +] + +[[package]] +name = "sortedcontainers" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e8/c4/ba2f8066cceb6f23394729afe52f3bf7adec04bf9ed2c820b39e19299111/sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", size = 30594, upload-time = "2021-05-16T22:03:42.897Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575, upload-time = "2021-05-16T22:03:41.177Z" }, +] + [[package]] name = "soupsieve" version = "2.8.1" @@ -2840,6 +2919,72 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" }, ] +[[package]] +name = "trio" +version = "0.32.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "cffi", marker = "implementation_name != 'pypy' and os_name == 'nt'" }, + { name = "idna" }, + { name = "outcome" }, + { name = "sniffio" }, + { name = 
"sortedcontainers" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d8/ce/0041ddd9160aac0031bcf5ab786c7640d795c797e67c438e15cfedf815c8/trio-0.32.0.tar.gz", hash = "sha256:150f29ec923bcd51231e1d4c71c7006e65247d68759dd1c19af4ea815a25806b", size = 605323, upload-time = "2025-10-31T07:18:17.466Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/41/bf/945d527ff706233636c73880b22c7c953f3faeb9d6c7e2e85bfbfd0134a0/trio-0.32.0-py3-none-any.whl", hash = "sha256:4ab65984ef8370b79a76659ec87aa3a30c5c7c83ff250b4de88c29a8ab6123c5", size = 512030, upload-time = "2025-10-31T07:18:15.885Z" }, +] + +[[package]] +name = "trio-typing" +version = "0.10.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "async-generator" }, + { name = "importlib-metadata" }, + { name = "mypy-extensions" }, + { name = "packaging" }, + { name = "trio" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b5/74/a87aafa40ec3a37089148b859892cbe2eef08d132c816d58a60459be5337/trio-typing-0.10.0.tar.gz", hash = "sha256:065ee684296d52a8ab0e2374666301aec36ee5747ac0e7a61f230250f8907ac3", size = 38747, upload-time = "2023-12-01T02:54:55.508Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/89/ff/9bd795273eb14fac7f6a59d16cc8c4d0948a619a1193d375437c7f50f3eb/trio_typing-0.10.0-py3-none-any.whl", hash = "sha256:6d0e7ec9d837a2fe03591031a172533fbf4a1a95baf369edebfc51d5a49f0264", size = 42224, upload-time = "2023-12-01T02:54:54.1Z" }, +] + +[[package]] +name = "trio-websocket" +version = "0.12.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "outcome" }, + { name = "trio" }, + { name = "wsproto" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/3c/8b4358e81f2f2cfe71b66a267f023a91db20a817b9425dd964873796980a/trio_websocket-0.12.2.tar.gz", hash = "sha256:22c72c436f3d1e264d0910a3951934798dcc5b00ae56fc4ee079d46c7cf20fae", size = 33549, upload-time = 
"2025-02-25T05:16:58.947Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/19/eb640a397bba49ba49ef9dbe2e7e5c04202ba045b6ce2ec36e9cadc51e04/trio_websocket-0.12.2-py3-none-any.whl", hash = "sha256:df605665f1db533f4a386c94525870851096a223adcb97f72a07e8b4beba45b6", size = 21221, upload-time = "2025-02-25T05:16:57.545Z" }, +] + +[[package]] +name = "types-certifi" +version = "2021.10.8.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/52/68/943c3aeaf14624712a0357c4a67814dba5cea36d194f5c764dad7959a00c/types-certifi-2021.10.8.3.tar.gz", hash = "sha256:72cf7798d165bc0b76e1c10dd1ea3097c7063c42c21d664523b928e88b554a4f", size = 2095, upload-time = "2022-06-09T15:19:05.244Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b5/63/2463d89481e811f007b0e1cd0a91e52e141b47f9de724d20db7b861dcfec/types_certifi-2021.10.8.3-py3-none-any.whl", hash = "sha256:b2d1e325e69f71f7c78e5943d410e650b4707bb0ef32e4ddf3da37f54176e88a", size = 2136, upload-time = "2022-06-09T15:19:03.127Z" }, +] + +[[package]] +name = "types-urllib3" +version = "1.26.25.14" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/73/de/b9d7a68ad39092368fb21dd6194b362b98a1daeea5dcfef5e1adb5031c7e/types-urllib3-1.26.25.14.tar.gz", hash = "sha256:229b7f577c951b8c1b92c1bc2b2fdb0b49847bd2af6d1cc2a2e3dd340f3bda8f", size = 11239, upload-time = "2023-07-20T15:19:31.307Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/11/7b/3fc711b2efea5e85a7a0bbfe269ea944aa767bbba5ec52f9ee45d362ccf3/types_urllib3-1.26.25.14-py3-none-any.whl", hash = "sha256:9683bbb7fb72e32bfe9d2be6e04875fbe1b3eeec3cbb4ea231435aa7fd6b4f0e", size = 15377, upload-time = "2023-07-20T15:19:30.379Z" }, +] + [[package]] name = "typing-extensions" version = "4.15.0" @@ -2898,6 +3043,11 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" }, ] +[package.optional-dependencies] +socks = [ + { name = "pysocks" }, +] + [[package]] name = "watermark" version = "2.6.0" @@ -2957,6 +3107,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083, upload-time = "2024-12-07T15:28:26.465Z" }, ] +[[package]] +name = "wsproto" +version = "1.3.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c7/79/12135bdf8b9c9367b8701c2c19a14c913c120b882d50b014ca0d38083c2c/wsproto-1.3.2.tar.gz", hash = "sha256:b86885dcf294e15204919950f666e06ffc6c7c114ca900b060d6e16293528294", size = 50116, upload-time = "2025-11-20T18:18:01.871Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a4/f5/10b68b7b1544245097b2a1b8238f66f2fc6dcaeb24ba5d917f52bd2eed4f/wsproto-1.3.2-py3-none-any.whl", hash = "sha256:61eea322cdf56e8cc904bd3ad7573359a242ba65688716b0710a5eb12beab584", size = 24405, upload-time = "2025-11-20T18:18:00.454Z" }, +] + [[package]] name = "zipp" version = "3.23.0" From d2bf7b0d9e79691635f04dbed8a5bdbc0165e138 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Tue, 20 Jan 2026 18:36:40 +0100 Subject: [PATCH 2/6] style: Reorganize function arguments --- .pre-commit-config.yaml | 5 ++--- src/mdverse_scrapers/core/network.py | 6 +++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 03f472c..167ce67 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ 
-24,12 +24,11 @@ repos: hooks: # Run the linter. - id: ruff-check - types_or: [ python, pyi ] + types: [python] args: [ --fix ] # Run the formatter. - id: ruff-format - types_or: [ python, pyi ] - + types: [python] - repo: https://github.com/PyCQA/bandit rev: '1.9.2' hooks: diff --git a/src/mdverse_scrapers/core/network.py b/src/mdverse_scrapers/core/network.py index fa7aea1..d1b2a1d 100644 --- a/src/mdverse_scrapers/core/network.py +++ b/src/mdverse_scrapers/core/network.py @@ -266,7 +266,11 @@ def send_http_request_with_retries_pycurl( return results -def get_html_page_with_selenium(url: str, tag: str = "body", logger: "loguru.Logger" = loguru.logger) -> str | None: +def get_html_page_with_selenium( + url: str, + tag: str = "body", + logger: "loguru.Logger" = loguru.logger + ) -> str | None: """Get HTML page content using Selenium. Parameters From da80b1df154dd2bd869dd3d02a92195fc6ae824e Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Tue, 20 Jan 2026 18:42:33 +0100 Subject: [PATCH 3/6] style: fix styling --- .pre-commit-config.yaml | 3 +-- src/mdverse_scrapers/core/network.py | 26 ++++++++++++++------------ 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 167ce67..d5172be 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ # Install pre-commit hooks with: # prek install -exclude: "scripts/*|tmp/*|.*.mdp|" +exclude: "scripts/*|tmp/*" repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 @@ -33,4 +33,3 @@ repos: rev: '1.9.2' hooks: - id: bandit - diff --git a/src/mdverse_scrapers/core/network.py b/src/mdverse_scrapers/core/network.py index d1b2a1d..6017ae4 100644 --- a/src/mdverse_scrapers/core/network.py +++ b/src/mdverse_scrapers/core/network.py @@ -5,16 +5,16 @@ from enum import StrEnum from io import BytesIO +import certifi import httpx import loguru -import certifi import pycurl from selenium import webdriver -from 
selenium.webdriver.chrome.options import Options from selenium.common.exceptions import WebDriverException -from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support import expected_conditions as ec +from selenium.webdriver.support.ui import WebDriverWait class HttpMethod(StrEnum): @@ -182,7 +182,7 @@ def send_http_request_with_retries_pycurl( data: dict | None = None, delay_before_request: float = 1.0, logger: "loguru.Logger" = loguru.logger, - ) -> dict: +) -> dict: """Query the Figshare API and return the JSON response. Parameters @@ -226,10 +226,10 @@ def send_http_request_with_retries_pycurl( # Handle SSL certificates. curl.setopt(curl.CAINFO, certifi.where()) # Follow redirect. - curl.setopt(curl.FOLLOWLOCATION, True) + curl.setopt(curl.FOLLOWLOCATION, True) # noqa: FBT003 # If data is provided, set the request to POST and add the data. if data is not None: - curl.setopt(curl.POST, True) + curl.setopt(curl.POST, True) # noqa: FBT003 data_json = json.dumps(data) curl.setopt(curl.POSTFIELDS, data_json) # Capture the response body in a buffer. @@ -267,10 +267,8 @@ def send_http_request_with_retries_pycurl( def get_html_page_with_selenium( - url: str, - tag: str = "body", - logger: "loguru.Logger" = loguru.logger - ) -> str | None: + url: str, tag: str = "body", logger: "loguru.Logger" = loguru.logger +) -> str | None: """Get HTML page content using Selenium. 
Parameters @@ -294,7 +292,11 @@ def get_html_page_with_selenium( try: driver = webdriver.Chrome(options=options) driver.get(url) - page_content = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, tag))).text + page_content = ( + WebDriverWait(driver, 10) + .until(ec.visibility_of_element_located((By.CSS_SELECTOR, tag))) + .text + ) driver.quit() except WebDriverException as e: logger.error("Cannot retrieve page:") From 4b84e8f55a669e88569d638e9c9060ab0f486686 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Tue, 20 Jan 2026 18:55:48 +0100 Subject: [PATCH 4/6] docs: Update figshare docs --- docs/figshare.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/figshare.md b/docs/figshare.md index d250dc8..4b5395a 100644 --- a/docs/figshare.md +++ b/docs/figshare.md @@ -52,10 +52,12 @@ Datasets: - [a-Synuclein short MD simulations:homo-A53T](https://figshare.com/articles/dataset/a-Synuclein_short_MD_simulations_homo-A53T/7007552) - [Molecular Dynamics Protocol with Gromacs 4.0.7](https://figshare.com/articles/dataset/Molecular_Dynamics_Protocol_with_Gromacs_4_0_7/104603) -### zip files +### Zip files -Zip files content is available, like for Zenodo, but individual file sizes are not available. +Zip files content is available with a preview (similar to Zenodo). The only metadata available is the file name (no file size, no md5sum). -Example: -- For this dataset: [Molecular Dynamics Simulations](https://figshare.com/articles/dataset/Molecular_Dynamics_Simulations/30307108?file=58572346) -- Content of the file: [Molecular Dynamics Simulations.zip](https://figshare.com/ndownloader/files/58572346/preview/58572346/structure.json) +Example. 
For the dataset "[Molecular Dynamics Simulations](https://figshare.com/articles/dataset/Molecular_Dynamics_Simulations/30307108?file=58572346)" : + +- The content of the file "Molecular Dynamics Simulations.zip" is available at + +We need to emulate a web browser to get access to the URLs describing the content of zip files. From 78d25a9ff5a0541bf8f3e1850cecffebc5ba7a11 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Wed, 21 Jan 2026 13:51:03 +0100 Subject: [PATCH 5/6] docs: Update Figshare API documentatation --- docs/figshare.md | 83 +++++++++++++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 30 deletions(-) diff --git a/docs/figshare.md b/docs/figshare.md index 4b5395a..3f31b17 100644 --- a/docs/figshare.md +++ b/docs/figshare.md @@ -1,63 +1,86 @@ -# FigShare documentation +# Figshare documentation ## File size -According to FigShare [FAQ](https://help.figshare.com/): +According to Figshare [documentation](https://info.figshare.com/user-guide/file-size-limits-and-storage/): -> Freely-available Figshare.com accounts have the following limits for sharing scholarly content: -storage quota: 20GB -max individual file size: 20GB -max no of collections: 100 -max no of projects: 100 -max no of items: 500 -max no of files per item: 500 -max no of collaborators on project: 100 -max no of authors per item, collection: 100 -max no of item version: 50 -If you have more than 500 files that you need to include in an item, please create an archive (or archives) for the files (e.g. zip file). -If an individual would like to publish outputs larger than 20GB (up to many TBs), please consider Figshare+, our Figshare repository for FAIR-ly sharing big datasets that allows for more storage, larger files, additional metadata and license options, and expert support. There is a one-time cost associated with Figshare+ to cover the cost of storing the data persistently ad infinitum. 
Find out more about Figshare+ or get in touch at review@figshare.com with the storage amount needed and we will find the best way to support your data sharing. +> All figshare.com accounts are provided with 20GB of private storage and are able to upload individual files up to 20GB. -> For those using an institutional version of Figshare, the number of collaboration spaces will be determined by your institution. Please contact your administrator. - -So we don't expect much files to have an individual size above 20 GB. +So we don't expect files to have an individual size above 20 GB. ## API -- [How to get a personnal token](https://info.figshare.com/user-guide/how-to-get-a-personal-token/) -- [REST API](https://docs.figshare.com/) +### Documentation + +- [How to use the Figshare API](https://info.figshare.com/user-guide/how-to-use-the-figshare-api/) +- [API documentation](https://docs.figshare.com/) + +### Token + +Figshare requires a token to access its API: [How to get a personal token](https://info.figshare.com/user-guide/how-to-get-a-personal-token/) + +### URL -## Query +https://api.figshare.com/v2/ -[Search guide](https://help.figshare.com/article/how-to-use-advanced-search-in-figshare) +### Query -## Rate limiting +[Search guide](https://docs.figshare.com/#search) -https://docs.figshare.com/#figshare_documentation_api_description_rate_limiting +### Rate limiting > We do not have automatic rate limiting in place for API requests. However, we do carry out monitoring to detect and mitigate abuse and prevent the platform's resources from being overused. We recommend that clients use the API responsibly and do not make more than one request per second. We reserve the right to throttle or block requests if we detect abuse. 
-## Dataset examples +Source: https://docs.figshare.com/#figshare_documentation_api_description_rate_limiting -### MD-related file types +## Datasets -Query: +### Search for MD-related datasets + +- Endpoint: `/articles/search` +- Documentation: + +We search MD-related datasets by searching for file types and keywords if necessary. Keywords are searche into `:title:`, `:description:` and `:keywords:` text fields. Example queries: ```none resource_type.type:"dataset" AND filetype:"tpr" ``` -Datasets: +or + +```none +:extension: mdp AND (:title: 'md simulation' OR :description: 'md simulation' OR :keyword: 'md simulation') +:extension: mdp AND (:title: 'gromacs' OR :description: 'gromacs' OR :keyword: 'gromacs') +``` + +Example datasets: - [Molecular dynamics of DSB in nucleosome](https://figshare.com/articles/dataset/M1_gro/5840706) - [a-Synuclein short MD simulations:homo-A53T](https://figshare.com/articles/dataset/a-Synuclein_short_MD_simulations_homo-A53T/7007552) - [Molecular Dynamics Protocol with Gromacs 4.0.7](https://figshare.com/articles/dataset/Molecular_Dynamics_Protocol_with_Gromacs_4_0_7/104603) +### Search strategy + +We search for all file types and keywords. Results are paginated by batch of 100 datasets. + +### Get metadata for a given dataset + +- Endpoint: `/articles/{dataset_id}` +- Documentation: + +Example dataset "[Molecular dynamics of DSB in nucleosome](https://figshare.com/articles/dataset/M1_gro/5840706)": + +- web view: +- API view: + +All metadata related to a given dataset is provided, as well as all files metadata. + ### Zip files -Zip files content is available with a preview (similar to Zenodo). The only metadata available is the file name (no file size, no md5sum). +Zip files content is available with a preview (similar to Zenodo). The only metadata available within this preview is the file name (no file size, no md5sum). -Example. 
For the dataset "[Molecular Dynamics Simulations](https://figshare.com/articles/dataset/Molecular_Dynamics_Simulations/30307108?file=58572346)" : +Example dataset "[Molecular Dynamics Simulations](https://figshare.com/articles/dataset/Molecular_Dynamics_Simulations/30307108?file=58572346)": - The content of the file "Molecular Dynamics Simulations.zip" is available at -We need to emulate a web browser to get access to the URLs describing the content of zip files. +We need to emulate a web browser to access the URLs linking to the contents of zip files. Otherwise, we get a 202 code. From 12328f36b13882a9760b241bd6c5291263c5b0e6 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Wed, 21 Jan 2026 13:53:32 +0100 Subject: [PATCH 6/6] docs: Fix typo --- docs/figshare.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/figshare.md b/docs/figshare.md index 3f31b17..3af3da9 100644 --- a/docs/figshare.md +++ b/docs/figshare.md @@ -40,7 +40,7 @@ Source: https://docs.figshare.com/#figshare_documentation_api_description_rate_l - Endpoint: `/articles/search` - Documentation: -We search MD-related datasets by searching for file types and keywords if necessary. Keywords are searche into `:title:`, `:description:` and `:keywords:` text fields. Example queries: +We search MD-related datasets by searching for file types and keywords if necessary. Keywords are searched into `:title:`, `:description:` and `:keywords:` text fields. Example queries: ```none resource_type.type:"dataset" AND filetype:"tpr"