55from typing import Self
66
77import loguru
8- from pydantic import BaseModel , Field , computed_field , model_validator
8+ from pydantic import BaseModel , DirectoryPath , Field , FilePath , model_validator
99
1010from .enums import DatasetSourceName , DataType
1111
@@ -17,26 +17,36 @@ class ScraperContext(BaseModel):
1717 ...,
1818 description = "Data repository to be scraped." ,
1919 )
20- output_dir_path : str | Path = Field (
20+ output_dir_path : DirectoryPath = Field (
2121 ...,
2222 description = "Output directory path for the scraper results." ,
2323 )
24- query_file_path : str | Path | None = Field (
24+ query_file_path : FilePath | None = Field (
2525 None ,
2626 description = "Path to the query file for the scraper." ,
2727 )
28- log_file_path : str | Path | None = Field (
28+ log_file_path : Path | None = Field (
2929 None ,
3030 description = "Path to the log file for the scraper." ,
3131 )
32- datasets_parquet_file_path : str | Path | None = Field (
32+ datasets_parquet_file_path : Path | None = Field (
3333 None ,
3434 description = "Path to the output parquet file for datasets metadata." ,
3535 )
36- files_parquet_file_path : str | Path | None = Field (
36+ number_of_datasets_scraped : int = Field (
37+ 0 ,
38+ ge = 0 ,
39+ description = "Number of datasets scraped." ,
40+ )
41+ files_parquet_file_path : Path | None = Field (
3742 None ,
3843 description = "Path to the output parquet file for files metadata." ,
3944 )
45+ number_of_files_scraped : int = Field (
46+ 0 ,
47+ ge = 0 ,
48+ description = "Number of files scraped." ,
49+ )
4050 token : str | None = Field (
4151 None ,
4252 description = "Access token or API key." ,
@@ -45,12 +55,10 @@ class ScraperContext(BaseModel):
4555 loguru .logger ,
4656 description = "Logger instance for logging scraper activities." ,
4757 )
48-
49- @computed_field
50- @property
51- def start_time (self ) -> datetime :
52- """Datetime when the scraper context was created."""
53- return datetime .now ()
58+ start_time : datetime = Field (
59+ default_factory = lambda : datetime .now (),
60+ description = "Datetime when the scraper started." ,
61+ )
5462
5563 @model_validator (mode = "after" )
5664 def create_output_dir_path (self ) -> Self :
@@ -66,14 +74,14 @@ def create_output_dir_path(self) -> Self:
6674 self .output_dir_path = (
6775 Path (self .output_dir_path )
6876 / self .data_source_name .value
69- / datetime . now () .strftime ("%Y%m%d" )
77+ / self . start_time .strftime ("%Y%m%d" )
7078 )
7179 self .output_dir_path .mkdir (parents = True , exist_ok = True )
7280 # Define log file path.
7381 self .log_file_path = (
7482 self .output_dir_path / f"{ self .data_source_name .value } _scraper.log"
7583 )
76- # Define output parquet file path.
84+ # Define output parquet file path for datasets and files metadata .
7785 self .datasets_parquet_file_path = (
7886 self .output_dir_path
7987 / f"{ self .data_source_name .value } _{ DataType .DATASETS .value } .parquet"
0 commit comments