Skip to content

Commit 8ee5c4b

Browse files
committed
streamline batch size in PubchemBatched
1 parent 1e68032 commit 8ee5c4b

File tree

1 file changed: +3 additions, −3 deletions

chebai/preprocessing/datasets/pubchem.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ class PubChemBatched(PubChem):
217217

218218
READER: Type[dr.ChemDataReader] = dr.ChemDataReader
219219

220-
def __init__(self, train_batch_size=10_000_000, *args, **kwargs):
220+
def __init__(self, train_batch_size=1_000_000, *args, **kwargs):
221221
super(PubChemBatched, self).__init__(*args, **kwargs)
222222
self.curr_epoch = 0
223223
self.train_batch_size = train_batch_size
@@ -275,8 +275,8 @@ def _tokenize_batched(self, data):
275275
for i, d in enumerate(tqdm.tqdm(data, total=len(data))):
276276
if d["features"] is not None:
277277
batch.append(self.reader.to_data(d))
278-
if i % 1_000_000 == 0 and i > 0:
279-
print(f"Saving batch {i // 1_000_000}")
278+
if i % self.train_batch_size == 0 and i > 0:
279+
print(f"Saving batch {i // self.train_batch_size}")
280280
batch = [b for b in batch if b["features"] is not None]
281281
if self.n_token_limit is not None:
282282
batch = [

0 commit comments

Comments (0)