Skip to content
This repository was archived by the owner on Mar 10, 2026. It is now read-only.

Commit 5f0cb47

Browse files
authored
Merge pull request #49 from MDverse/update-figshare-cli
feat: use Click for CLI
2 parents 93bd906 + f55ff73 commit 5f0cb47

File tree

3 files changed

+32
-16
lines changed

3 files changed

+32
-16
lines changed

src/mdverse_scrapers/scrapers/figshare.py

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@
22

33
import json
44
import os
5-
import pathlib
65
import sys
76
import time
87
from datetime import datetime, timedelta
8+
from pathlib import Path
99

10+
import click
1011
import loguru
1112
import pandas as pd
1213
from dotenv import load_dotenv
@@ -21,7 +22,6 @@
2122
extract_date,
2223
extract_file_extension,
2324
find_remove_false_positive_datasets,
24-
get_scraper_cli_arguments,
2525
make_http_get_request_with_retries,
2626
read_query_file,
2727
remove_excluded_files,
@@ -435,21 +435,37 @@ def get_metadata_for_datasets(
435435
return datasets_df, files_df
436436

437437

438-
def main() -> None:
438+
@click.command(
439+
help="Command line interface for MDverse scrapers",
440+
epilog="Happy scraping!",
441+
)
442+
@click.option(
443+
"--output-dir",
444+
"output_dir_path",
445+
type=click.Path(exists=False, file_okay=False, dir_okay=True, path_type=Path),
446+
required=True,
447+
help="Output directory path to save results.",
448+
)
449+
@click.option(
450+
"--query-file",
451+
"query_file_path",
452+
type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path),
453+
required=True,
454+
help="Query parameters file (YAML format).",
455+
)
456+
def main(output_dir_path: Path, query_file_path: Path) -> None:
439457
"""Scrape Figshare datasets and files."""
440458
# Define data repository name.
441459
repository_name = "figshare"
442460
# Keep track of script duration.
443461
start_time = time.perf_counter()
444-
# Parse input CLI arguments.
445-
args = get_scraper_cli_arguments()
446462
# Create context manager.
447-
output_path = pathlib.Path(args.output) / repository_name
463+
output_path = output_dir_path / repository_name
448464
output_path.mkdir(parents=True, exist_ok=True)
449465
context = ContextManager(
450466
logger=create_logger(logpath=f"{output_path}/{repository_name}_scraping.log"),
451467
output_path=output_path,
452-
query_file_name=pathlib.Path(args.query),
468+
query_file_name=query_file_path,
453469
)
454470
# Log script name and docstring.
455471
context.logger.info(__file__)
@@ -483,7 +499,9 @@ def main() -> None:
483499

484500
# Remove unwanted files based on exclusion lists.
485501
context.logger.info("Removing unwanted files...")
486-
_, _, exclude_files, exclude_paths = read_query_file(args.query, context.logger)
502+
_, _, exclude_files, exclude_paths = read_query_file(
503+
query_file_path, context.logger
504+
)
487505
files_df = remove_excluded_files(files_df, exclude_files, exclude_paths)
488506
context.logger.info("-" * 30)
489507

src/mdverse_scrapers/scrapers/nomad.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
This script scrapes molecular dynamics datasets from the NOMAD repository
44
https://nomad-lab.eu/prod/v1/gui/search/entries
55
"""
6+
67
import json
78
import sys
89
import time
@@ -504,8 +505,7 @@ def main(output_dir_path: Path) -> None:
504505
)
505506
# Parse and validate NOMAD dataset metadata with a pydantic model (DatasetMetadata)
506507
datasets_normalized_metadata = normalize_datasets_metadata(
507-
datasets_selected_metadata,
508-
logger=logger
508+
datasets_selected_metadata, logger=logger
509509
)
510510
# Save datasets metadata to parquet file.
511511
export_list_of_models_to_parquet(

src/mdverse_scrapers/scrapers/zenodo.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22

33
import json
44
import os
5-
from pathlib import Path
65
import sys
76
import time
87
from datetime import datetime, timedelta
8+
from pathlib import Path
99

1010
import click
1111
import loguru
@@ -379,9 +379,7 @@ def extract_records(
379379
"title": clean_text(hit["metadata"]["title"]),
380380
"author": clean_text(hit["metadata"]["creators"][0]["name"]),
381381
"keywords": "none",
382-
"description": clean_text(
383-
hit["metadata"].get("description", "")
384-
),
382+
"description": clean_text(hit["metadata"].get("description", "")),
385383
}
386384
if "keywords" in hit["metadata"]:
387385
dataset_dict["keywords"] = ";".join(
@@ -624,12 +622,12 @@ def main(output_dir_path: Path, query_file_path: Path):
624622
# Keep track of script duration.
625623
start_time = time.perf_counter()
626624
# Create context manager.
627-
output_path = Path(output_dir_path) / repository_name
625+
output_path = output_dir_path / repository_name
628626
output_path.mkdir(parents=True, exist_ok=True)
629627
context = ContextManager(
630628
logger=create_logger(logpath=f"{output_path}/{repository_name}_scraping.log"),
631629
output_path=output_path,
632-
query_file_name=Path(query_file_path),
630+
query_file_name=query_file_path,
633631
)
634632
# Log script name and docstring.
635633
context.logger.info(__file__)

0 commit comments

Comments
 (0)