Skip to content

Commit 40af1bc

Browse files
committed
prints stats for loaded data sizes
1 parent 5dc209d commit 40af1bc

File tree

1 file changed

+9
-4
lines changed
  • chebai/preprocessing/datasets

1 file changed

+9
-4
lines changed

chebai/preprocessing/datasets/base.py

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -472,14 +472,17 @@ def _set_processed_data_props(self):
472472
- self._num_of_labels: Number of target labels in the dataset.
473473
- self._feature_vector_size: Maximum feature vector length across all data points.
474474
"""
475-
data_pt = torch.load(
476-
os.path.join(self.processed_dir, self.processed_file_names_dict["data"]),
477-
weights_only=False,
475+
pt_file_path = os.path.join(
476+
self.processed_dir, self.processed_file_names_dict["data"]
478477
)
478+
data_pt = torch.load(pt_file_path, weights_only=False)
479479

480480
self._num_of_labels = len(data_pt[0]["labels"])
481481
self._feature_vector_size = max(len(d["features"]) for d in data_pt)
482482

483+
print(
484+
f"Number of samples in encoded data ({pt_file_path}): {len(data_pt)} samples"
485+
)
483486
print(f"Number of labels for loaded data: {self._num_of_labels}")
484487
print(f"Feature vector size: {self._feature_vector_size}")
485488

@@ -934,7 +937,9 @@ def _get_data_size(input_file_path: str) -> int:
934937
int: The size of the data.
935938
"""
936939
with open(input_file_path, "rb") as f:
937-
return len(pd.read_pickle(f))
940+
df = pd.read_pickle(f)
941+
print(f"Processed data size ({input_file_path}): {len(df)} rows")
942+
return len(df)
938943

939944
@abstractmethod
940945
def _load_dict(self, input_file_path: str) -> Generator[Dict[str, Any], None, None]:

0 commit comments

Comments (0)