|
2 | 2 |
|
3 | 3 | import json |
4 | 4 | import os |
5 | | -import pathlib |
6 | 5 | import sys |
7 | 6 | import time |
8 | 7 | from datetime import datetime, timedelta |
| 8 | +from pathlib import Path |
9 | 9 |
|
| 10 | +import click |
10 | 11 | import loguru |
11 | 12 | import pandas as pd |
12 | 13 | from dotenv import load_dotenv |
|
21 | 22 | extract_date, |
22 | 23 | extract_file_extension, |
23 | 24 | find_remove_false_positive_datasets, |
24 | | - get_scraper_cli_arguments, |
25 | 25 | make_http_get_request_with_retries, |
26 | 26 | read_query_file, |
27 | 27 | remove_excluded_files, |
@@ -435,21 +435,37 @@ def get_metadata_for_datasets( |
435 | 435 | return datasets_df, files_df |
436 | 436 |
|
437 | 437 |
|
438 | | -def main() -> None: |
| 438 | +@click.command( |
| 439 | + help="Command line interface for MDverse scrapers", |
| 440 | + epilog="Happy scraping!", |
| 441 | +) |
| 442 | +@click.option( |
| 443 | + "--output-dir", |
| 444 | + "output_dir_path", |
| 445 | + type=click.Path(exists=False, file_okay=False, dir_okay=True, path_type=Path), |
| 446 | + required=True, |
| 447 | + help="Output directory path to save results.", |
| 448 | +) |
| 449 | +@click.option( |
| 450 | + "--query-file", |
| 451 | + "query_file_path", |
| 452 | + type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path), |
| 453 | + required=True, |
| 454 | + help="Query parameters file (YAML format).", |
| 455 | +) |
| 456 | +def main(output_dir_path: Path, query_file_path: Path) -> None: |
439 | 457 | """Scrape Figshare datasets and files.""" |
440 | 458 | # Define data repository name. |
441 | 459 | repository_name = "figshare" |
442 | 460 | # Keep track of script duration. |
443 | 461 | start_time = time.perf_counter() |
444 | | - # Parse input CLI arguments. |
445 | | - args = get_scraper_cli_arguments() |
446 | 462 | # Create context manager. |
447 | | - output_path = pathlib.Path(args.output) / repository_name |
| 463 | + output_path = output_dir_path / repository_name |
448 | 464 | output_path.mkdir(parents=True, exist_ok=True) |
449 | 465 | context = ContextManager( |
450 | 466 | logger=create_logger(logpath=f"{output_path}/{repository_name}_scraping.log"), |
451 | 467 | output_path=output_path, |
452 | | - query_file_name=pathlib.Path(args.query), |
| 468 | + query_file_name=query_file_path, |
453 | 469 | ) |
454 | 470 | # Log script name and docstring. |
455 | 471 | context.logger.info(__file__) |
@@ -483,7 +499,9 @@ def main() -> None: |
483 | 499 |
|
484 | 500 | # Remove unwanted files based on exclusion lists. |
485 | 501 | context.logger.info("Removing unwanted files...") |
486 | | - _, _, exclude_files, exclude_paths = read_query_file(args.query, context.logger) |
| 502 | + _, _, exclude_files, exclude_paths = read_query_file( |
| 503 | + query_file_path, context.logger |
| 504 | + ) |
487 | 505 | files_df = remove_excluded_files(files_df, exclude_files, exclude_paths) |
488 | 506 | context.logger.info("-" * 30) |
489 | 507 |
|
|
0 commit comments