Skip to content

Commit 16aaa37

Browse files
authored
dc: try to fix dataset_stats for DataChain.from_storage() generated dataset (#151)
1 parent 5312913 commit 16aaa37

File tree

2 files changed: 13 additions and 1 deletion

2 files changed: 13 additions and 1 deletion

src/datachain/data_storage/warehouse.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -390,7 +390,9 @@ def dataset_stats(
390390
expressions: tuple[_ColumnsClauseArgument[Any], ...] = (
391391
sa.func.count(table.c.sys__id),
392392
)
393-
if "size" in table.columns:
393+
if "file__size" in table.columns:
394+
expressions = (*expressions, sa.func.sum(table.c.file__size))
395+
elif "size" in table.columns:
394396
expressions = (*expressions, sa.func.sum(table.c.size))
395397
query = select(*expressions)
396398
((nrows, *rest),) = self.db.execute(query)

tests/func/test_datachain.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
import pandas as pd
55
import pytest
66

7+
from datachain.dataset import DatasetStats
78
from datachain.lib.dc import DataChain
89
from datachain.lib.file import File
910

@@ -205,3 +206,12 @@ def test_show_no_truncate(capsys, catalog):
205206
for i in range(3):
206207
assert client[i] in normalized_output
207208
assert details[i] in normalized_output
209+
210+
211+
def test_from_storage_dataset_stats(tmp_dir, catalog):
212+
for i in range(4):
213+
(tmp_dir / f"file{i}.txt").write_text(f"file{i}")
214+
215+
dc = DataChain.from_storage(tmp_dir.as_uri(), catalog=catalog).save("test-data")
216+
stats = catalog.dataset_stats(dc.name, dc.version)
217+
assert stats == DatasetStats(num_objects=4, size=20)

0 commit comments

Comments (0)