Skip to content

Commit 2a4693e

Browse files
authored
drop dataset stats from catalog and cli (#878)
1 parent 10c2702 commit 2a4693e

File tree

10 files changed

+30
-102
lines changed

10 files changed

+30
-102
lines changed

src/datachain/catalog/catalog.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@
3838
DatasetDependency,
3939
DatasetListRecord,
4040
DatasetRecord,
41-
DatasetStats,
4241
DatasetStatus,
4342
StorageURI,
4443
create_dataset_uri,
@@ -1235,17 +1234,6 @@ def dataset_table_export_file_names(self, name: str, version: int) -> list[str]:
12351234
dataset = self.get_dataset(name)
12361235
return self.warehouse.dataset_table_export_file_names(dataset, version)
12371236

1238-
def dataset_stats(self, name: str, version: Optional[int]) -> DatasetStats:
1239-
"""
1240-
Returns tuple with dataset stats: total number of rows and total dataset size.
1241-
"""
1242-
dataset = self.get_dataset(name)
1243-
dataset_version = dataset.get_version(version or dataset.latest_version)
1244-
return DatasetStats(
1245-
num_objects=dataset_version.num_objects,
1246-
size=dataset_version.size,
1247-
)
1248-
12491237
def remove_dataset(
12501238
self,
12511239
name: str,

src/datachain/cli/__init__.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from .commands import (
1212
clear_cache,
1313
completion,
14-
dataset_stats,
1514
du,
1615
edit_dataset,
1716
garbage_collect,
@@ -182,13 +181,6 @@ def handle_dataset_command(args, catalog):
182181
all=args.all,
183182
team=args.team,
184183
),
185-
"stats": lambda: dataset_stats(
186-
catalog,
187-
args.name,
188-
args.version,
189-
show_bytes=args.bytes,
190-
si=args.si,
191-
),
192184
}
193185

194186
handler = dataset_commands.get(args.datasets_cmd)

src/datachain/cli/commands/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
from .datasets import (
2-
dataset_stats,
32
edit_dataset,
43
list_datasets,
54
list_datasets_local,
@@ -15,7 +14,6 @@
1514
__all__ = [
1615
"clear_cache",
1716
"completion",
18-
"dataset_stats",
1917
"du",
2018
"edit_dataset",
2119
"garbage_collect",

src/datachain/cli/commands/datasets.py

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33

44
from tabulate import tabulate
55

6-
from datachain import utils
7-
86
if TYPE_CHECKING:
97
from datachain.catalog import Catalog
108

@@ -109,20 +107,3 @@ def edit_dataset(
109107

110108
if (all or studio) and token:
111109
edit_studio_dataset(team, name, new_name, description, labels)
112-
113-
114-
def dataset_stats(
115-
catalog: "Catalog",
116-
name: str,
117-
version: int,
118-
show_bytes=False,
119-
si=False,
120-
):
121-
stats = catalog.dataset_stats(name, version)
122-
123-
if stats:
124-
print(f"Number of objects: {stats.num_objects}")
125-
if show_bytes:
126-
print(f"Total objects size: {stats.size}")
127-
else:
128-
print(f"Total objects size: {utils.sizeof_fmt(stats.size, si=si): >7}")

src/datachain/cli/parser/__init__.py

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -307,31 +307,6 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
307307
help="The team to delete a dataset. By default, it will use team from config",
308308
)
309309

310-
dataset_stats_parser = datasets_subparser.add_parser(
311-
"stats", parents=[parent_parser], description="Show basic dataset statistics."
312-
)
313-
dataset_stats_parser.add_argument("name", type=str, help="Dataset name")
314-
dataset_stats_parser.add_argument(
315-
"--version",
316-
action="store",
317-
default=None,
318-
type=int,
319-
help="Dataset version",
320-
)
321-
dataset_stats_parser.add_argument(
322-
"-b",
323-
"--bytes",
324-
default=False,
325-
action="store_true",
326-
help="Display size in bytes instead of human-readable size",
327-
)
328-
dataset_stats_parser.add_argument(
329-
"--si",
330-
default=False,
331-
action="store_true",
332-
help="Display size using powers of 1000 not 1024",
333-
)
334-
335310
parse_ls = subp.add_parser(
336311
"ls", parents=[parent_parser], description="List storage contents."
337312
)

src/datachain/dataset.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -150,12 +150,6 @@ def __hash__(self):
150150
return hash(f"{self.type}_{self.name}_{self.version}")
151151

152152

153-
@dataclass
154-
class DatasetStats:
155-
num_objects: Optional[int] # None if table is missing
156-
size: Optional[int] # in bytes None if table is missing or empty
157-
158-
159153
class DatasetStatus:
160154
CREATED = 1
161155
PENDING = 2

src/datachain/remote/studio.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,12 @@
1616
import websockets
1717

1818
from datachain.config import Config
19-
from datachain.dataset import DatasetStats
2019
from datachain.error import DataChainError
2120
from datachain.utils import STUDIO_URL, retry_with_backoff
2221

2322
T = TypeVar("T")
2423
LsData = Optional[list[dict[str, Any]]]
2524
DatasetInfoData = Optional[dict[str, Any]]
26-
DatasetStatsData = Optional[DatasetStats]
2725
DatasetRowsData = Optional[Iterable[dict[str, Any]]]
2826
DatasetJobVersionsData = Optional[dict[str, Any]]
2927
DatasetExportStatus = Optional[dict[str, Any]]

tests/func/test_catalog.py

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717
def listing_stats(uri, catalog):
1818
list_dataset_name, _, _ = parse_listing_uri(uri, catalog.client_config)
1919
dataset = catalog.get_dataset(list_dataset_name)
20-
return catalog.dataset_stats(dataset.name, dataset.latest_version)
20+
dataset_version = dataset.get_version(dataset.latest_version)
21+
return dataset_version.num_objects, dataset_version.size
2122

2223

2324
@pytest.fixture
@@ -582,23 +583,23 @@ def test_listing_stats(cloud_test_catalog):
582583
listing_stats(src_uri, catalog)
583584

584585
catalog.enlist_source(src_uri)
585-
stats = listing_stats(src_uri, catalog)
586-
assert stats.num_objects == 7
587-
assert stats.size == 36
586+
num_objects, size = listing_stats(src_uri, catalog)
587+
assert num_objects == 7
588+
assert size == 36
588589

589590
catalog.enlist_source(f"{src_uri}/dogs/", update=True)
590-
stats = listing_stats(src_uri, catalog)
591-
assert stats.num_objects == 7
592-
assert stats.size == 36
591+
num_objects, size = listing_stats(src_uri, catalog)
592+
assert num_objects == 7
593+
assert size == 36
593594

594-
stats = listing_stats(f"{src_uri}/dogs/", catalog)
595-
assert stats.num_objects == 4
596-
assert stats.size == 15
595+
num_objects, size = listing_stats(f"{src_uri}/dogs/", catalog)
596+
assert num_objects == 4
597+
assert size == 15
597598

598599
catalog.enlist_source(f"{src_uri}/dogs/")
599-
stats = listing_stats(src_uri, catalog)
600-
assert stats.num_objects == 7
601-
assert stats.size == 36
600+
num_objects, size = listing_stats(src_uri, catalog)
601+
assert num_objects == 7
602+
assert size == 36
602603

603604

604605
@pytest.mark.parametrize("cloud_type", ["s3", "azure", "gs"], indirect=True)
@@ -608,15 +609,15 @@ def test_enlist_source_handles_slash(cloud_test_catalog):
608609
src_path = f"{src_uri}/dogs"
609610

610611
catalog.enlist_source(src_path)
611-
stats = listing_stats(src_path, catalog)
612-
assert stats.num_objects == len(DEFAULT_TREE["dogs"])
613-
assert stats.size == 15
612+
num_objects, size = listing_stats(src_path, catalog)
613+
assert num_objects == len(DEFAULT_TREE["dogs"])
614+
assert size == 15
614615

615616
src_path = f"{src_uri}/dogs"
616617
catalog.enlist_source(src_path, update=True)
617-
stats = listing_stats(src_path, catalog)
618-
assert stats.num_objects == len(DEFAULT_TREE["dogs"])
619-
assert stats.size == 15
618+
num_objects, size = listing_stats(src_path, catalog)
619+
assert num_objects == len(DEFAULT_TREE["dogs"])
620+
assert size == 15
620621

621622

622623
@pytest.mark.parametrize("cloud_type", ["s3", "azure", "gs"], indirect=True)
@@ -626,10 +627,10 @@ def test_enlist_source_handles_glob(cloud_test_catalog):
626627
src_path = f"{src_uri}/dogs/*.jpg"
627628

628629
catalog.enlist_source(src_path)
629-
stats = listing_stats(src_path, catalog)
630+
num_objects, size = listing_stats(src_path, catalog)
630631

631-
assert stats.num_objects == len(DEFAULT_TREE["dogs"])
632-
assert stats.size == 15
632+
assert num_objects == len(DEFAULT_TREE["dogs"])
633+
assert size == 15
633634

634635

635636
@pytest.mark.parametrize("cloud_type", ["s3", "azure", "gs"], indirect=True)

tests/func/test_datachain.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from datachain import DataModel, func
2121
from datachain.catalog.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE
2222
from datachain.data_storage.sqlite import SQLiteWarehouse
23-
from datachain.dataset import DatasetDependencyType, DatasetStats
23+
from datachain.dataset import DatasetDependencyType
2424
from datachain.func import path as pathfunc
2525
from datachain.lib.dc import C, DataChain
2626
from datachain.lib.file import File, ImageFile
@@ -515,8 +515,9 @@ def test_from_storage_dataset_stats(tmp_dir, test_session):
515515
dc = DataChain.from_storage(tmp_dir.as_uri(), session=test_session).save(
516516
"test-data"
517517
)
518-
stats = test_session.catalog.dataset_stats(dc.name, dc.version)
519-
assert stats == DatasetStats(num_objects=4, size=20)
518+
version = test_session.catalog.get_dataset(dc.name).get_version(dc.version)
519+
assert version.num_objects == 4
520+
assert version.size == 20
520521

521522

522523
def test_from_storage_check_rows(tmp_dir, test_session):

tests/func/test_datasets.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -845,9 +845,9 @@ def test_row_random(cloud_test_catalog):
845845

846846
def test_dataset_stats_registered_ds(cloud_test_catalog, dogs_dataset):
847847
catalog = cloud_test_catalog.catalog
848-
stats = catalog.dataset_stats(dogs_dataset.name, 1)
849-
assert stats.num_objects == 4
850-
assert stats.size == 15
848+
dataset = catalog.get_dataset(dogs_dataset.name).get_version(1)
849+
assert dataset.num_objects == 4
850+
assert dataset.size == 15
851851
rows_count = catalog.warehouse.dataset_rows_count(dogs_dataset, 1)
852852
assert rows_count == 4
853853

0 commit comments

Comments (0)