Skip to content
This repository was archived by the owner on Mar 10, 2026. It is now read-only.

Commit 42ee2e3

Browse files
committed
feat: Add number of datasets and files
1 parent 19c914b commit 42ee2e3

File tree

1 file changed

+22 -14 lines changed

1 file changed

+22 -14 lines changed

src/mdverse_scrapers/models/scraper.py

Lines changed: 22 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
from typing import Self
66

77
import loguru
8-
from pydantic import BaseModel, Field, computed_field, model_validator
8+
from pydantic import BaseModel, DirectoryPath, Field, FilePath, model_validator
99

1010
from .enums import DatasetSourceName, DataType
1111

@@ -17,26 +17,36 @@ class ScraperContext(BaseModel):
1717
...,
1818
description="Data repository to be scraped.",
1919
)
20-
output_dir_path: str | Path = Field(
20+
output_dir_path: DirectoryPath = Field(
2121
...,
2222
description="Output directory path for the scraper results.",
2323
)
24-
query_file_path: str | Path | None = Field(
24+
query_file_path: FilePath | None = Field(
2525
None,
2626
description="Path to the query file for the scraper.",
2727
)
28-
log_file_path: str | Path | None = Field(
28+
log_file_path: Path | None = Field(
2929
None,
3030
description="Path to the log file for the scraper.",
3131
)
32-
datasets_parquet_file_path: str | Path | None = Field(
32+
datasets_parquet_file_path: Path | None = Field(
3333
None,
3434
description="Path to the output parquet file for datasets metadata.",
3535
)
36-
files_parquet_file_path: str | Path | None = Field(
36+
number_of_datasets_scraped: int = Field(
37+
0,
38+
ge=0,
39+
description="Number of datasets scraped.",
40+
)
41+
files_parquet_file_path: Path | None = Field(
3742
None,
3843
description="Path to the output parquet file for files metadata.",
3944
)
45+
number_of_files_scraped: int = Field(
46+
0,
47+
ge=0,
48+
description="Number of files scraped.",
49+
)
4050
token: str | None = Field(
4151
None,
4252
description="Access token or API key.",
@@ -45,12 +55,10 @@ class ScraperContext(BaseModel):
4555
loguru.logger,
4656
description="Logger instance for logging scraper activities.",
4757
)
48-
49-
@computed_field
50-
@property
51-
def start_time(self) -> datetime:
52-
"""Datetime when the scraper context was created."""
53-
return datetime.now()
58+
start_time: datetime = Field(
59+
default_factory=lambda: datetime.now(),
60+
description="Datetime when the scraper started.",
61+
)
5462

5563
@model_validator(mode="after")
5664
def create_output_dir_path(self) -> Self:
@@ -66,14 +74,14 @@ def create_output_dir_path(self) -> Self:
6674
self.output_dir_path = (
6775
Path(self.output_dir_path)
6876
/ self.data_source_name.value
69-
/ datetime.now().strftime("%Y%m%d")
77+
/ self.start_time.strftime("%Y%m%d")
7078
)
7179
self.output_dir_path.mkdir(parents=True, exist_ok=True)
7280
# Define log file path.
7381
self.log_file_path = (
7482
self.output_dir_path / f"{self.data_source_name.value}_scraper.log"
7583
)
76-
# Define output parquet file path.
84+
# Define output parquet file path for datasets and files metadata.
7785
self.datasets_parquet_file_path = (
7886
self.output_dir_path
7987
/ f"{self.data_source_name.value}_{DataType.DATASETS.value}.parquet"

0 commit comments

Comments (0)