Skip to content

Commit 73866f2

Browse files
feat(dataset): filter ls-files by tag (#2950)
1 parent 6995098 commit 73866f2

File tree

11 files changed

+247
-103
lines changed

11 files changed

+247
-103
lines changed

renku/command/format/dataset_files.py

Lines changed: 35 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -52,42 +52,36 @@ def tabular(records, *, columns=None):
5252

5353

5454
@inject.autoparams()
55-
def get_lfs_tracking(records, client_dispatcher: IClientDispatcher):
56-
"""Check if files are tracked in git lfs.
57-
58-
Args:
59-
records: File records to check.
60-
client_dispatcher(IClientDispatcher): Injected client dispatcher.
61-
"""
62-
client = client_dispatcher.current_client
63-
64-
paths = (r.path for r in records)
65-
attrs = client.repository.get_attributes(*paths)
66-
67-
for record in records:
68-
if attrs.get(str(record.path), {}).get("filter") == "lfs":
69-
record.is_lfs = True
70-
else:
71-
record.is_lfs = False
72-
73-
74-
@inject.autoparams()
75-
def get_lfs_file_sizes(records, client_dispatcher: IClientDispatcher):
76-
"""Try to get file size from Git LFS.
55+
def get_lfs_tracking_and_file_sizes(records, has_tag: bool, client_dispatcher: IClientDispatcher):
56+
"""Try to get file size from Git LFS and check if files are tracked in git lfs.
7757
7858
Args:
7959
records: File records to get size for.
60+
has_tag(bool): Whether sizes are retrieved for a given tag instead of HEAD commit
8061
client_dispatcher(IClientDispatcher): Injected client dispatcher.
8162
"""
8263
from humanize import naturalsize # Slow import
8364

8465
client = client_dispatcher.current_client
8566

67+
def get_lfs_tracking():
68+
paths = (r.path for r in records)
69+
attrs = client.repository.get_attributes(*paths)
70+
71+
for record in records:
72+
if attrs.get(str(record.path), {}).get("filter") == "lfs":
73+
record.is_lfs = True
74+
else:
75+
record.is_lfs = False
76+
8677
lfs_files_sizes = {}
8778

8879
try:
8980
lfs_run = run(
90-
("git", "lfs", "ls-files", "--name-only", "--size"), stdout=PIPE, cwd=client.path, universal_newlines=True
81+
("git", "lfs", "ls-files", "--name-only", "--size", "--deleted"),
82+
stdout=PIPE,
83+
cwd=client.path,
84+
universal_newlines=True,
9185
)
9286
except SubprocessError:
9387
pass
@@ -106,15 +100,29 @@ def get_lfs_file_sizes(records, client_dispatcher: IClientDispatcher):
106100
size = size.replace(" B", " B")
107101
lfs_files_sizes[path] = size
108102

109-
non_lfs_files_sizes = {
110-
o.path: o.size for o in client.repository.head.commit.traverse() if o.path not in lfs_files_sizes
111-
}
112-
non_lfs_files_sizes = {k: naturalsize(v).upper().replace("BYTES", " B") for k, v in non_lfs_files_sizes.items()}
103+
if has_tag:
104+
checksums = [r.entity.checksum for r in records]
105+
sizes = client.repository.get_sizes(*checksums)
106+
non_lfs_files_sizes = {
107+
r.entity.path: naturalsize(s).upper().replace("BYTES", " B") for r, s in zip(records, sizes)
108+
}
109+
else:
110+
non_lfs_files_sizes = {
111+
o.path: o.size for o in client.repository.head.commit.traverse() if o.path not in lfs_files_sizes
112+
}
113+
non_lfs_files_sizes = {k: naturalsize(v).upper().replace("BYTES", " B") for k, v in non_lfs_files_sizes.items()}
114+
115+
# NOTE: Check .gitattributes file to see if a file is in LFS
116+
get_lfs_tracking()
113117

114118
for record in records:
115119
size = lfs_files_sizes.get(record.path) or non_lfs_files_sizes.get(record.path)
116120
record.size = size
117121

122+
# NOTE: When listing a tag we assume that the file is in LFS if it was in LFS at some point in time
123+
if has_tag:
124+
record.is_lfs = lfs_files_sizes.get(record.path) is not None
125+
118126

119127
def jsonld(records, **kwargs):
120128
"""Format dataset files as JSON-LD.

renku/core/dataset/dataset.py

Lines changed: 74 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
from renku.core.dataset.providers import ProviderFactory
3737
from renku.core.dataset.providers.models import ProviderDataset, ProviderDatasetFile
3838
from renku.core.dataset.request_model import ImageRequestModel
39-
from renku.core.dataset.tag import add_dataset_tag, prompt_access_token, prompt_tag_selection
39+
from renku.core.dataset.tag import add_dataset_tag, get_dataset_by_tag, prompt_access_token, prompt_tag_selection
4040
from renku.core.interface.client_dispatcher import IClientDispatcher
4141
from renku.core.interface.database_dispatcher import IDatabaseDispatcher
4242
from renku.core.interface.dataset_gateway import IDatasetGateway
@@ -249,10 +249,11 @@ def edit_dataset(
249249
return updated
250250

251251

252-
@inject.autoparams()
252+
@inject.autoparams("client_dispatcher")
253253
def list_dataset_files(
254254
client_dispatcher: IClientDispatcher,
255-
datasets=None,
255+
datasets: List[str] = None,
256+
tag: Optional[str] = None,
256257
creators=None,
257258
include=None,
258259
exclude=None,
@@ -261,19 +262,22 @@ def list_dataset_files(
261262
262263
Args:
263264
client_dispatcher(IClientDispatcher): Injected client dispatcher.
264-
datasets: Datasets to list files for (Default value = None).
265+
datasets(List[str]): Datasets to list files for (Default value = None).
266+
tag(str): Tag to filter by (Default value = None).
265267
creators: Creators to filter by (Default value = None).
266268
include: Include filters for file paths (Default value = None).
267269
exclude: Exclude filters for file paths (Default value = None).
268270
269271
Returns:
270272
List[DynamicProxy]: Filtered dataset files.
271273
"""
272-
from renku.command.format.dataset_files import get_lfs_file_sizes, get_lfs_tracking
274+
from renku.command.format.dataset_files import get_lfs_tracking_and_file_sizes
273275

274276
client = client_dispatcher.current_client
275277

276-
records = filter_dataset_files(names=datasets, creators=creators, include=include, exclude=exclude, immutable=True)
278+
records = filter_dataset_files(
279+
names=datasets, tag=tag, creators=creators, include=include, exclude=exclude, immutable=True
280+
)
277281
for record in records:
278282
record.title = record.dataset.title
279283
record.dataset_name = record.dataset.name
@@ -285,8 +289,7 @@ def list_dataset_files(
285289
record.name = Path(record.entity.path).name
286290
record.added = record.date_added
287291

288-
get_lfs_file_sizes(records)
289-
get_lfs_tracking(records)
292+
get_lfs_tracking_and_file_sizes(records, has_tag=bool(tag))
290293

291294
return records
292295

@@ -1145,90 +1148,93 @@ def update_external_files(client: "LocalClient", records: List[DynamicProxy], dr
11451148
return updated_files
11461149

11471150

1148-
@inject.autoparams()
1151+
@inject.autoparams("client_dispatcher", "dataset_gateway")
11491152
def filter_dataset_files(
11501153
client_dispatcher: IClientDispatcher,
11511154
dataset_gateway: IDatasetGateway,
1152-
names=None,
1153-
creators=None,
1154-
include=None,
1155-
exclude=None,
1156-
ignore=None,
1157-
immutable=False,
1155+
names: Optional[List[str]] = None,
1156+
tag: Optional[str] = None,
1157+
creators: Optional[Union[str, List[str], Tuple[str]]] = None,
1158+
include: Optional[List[str]] = None,
1159+
exclude: Optional[List[str]] = None,
1160+
ignore: Optional[List[str]] = None,
1161+
immutable: bool = False,
11581162
) -> List[DynamicProxy]:
11591163
"""Filter dataset files by specified filters.
11601164
11611165
Args:
11621166
client_dispatcher(IClientDispatcher): Injected client dispatcher.
11631167
dataset_gateway(IDatasetGateway): Injected dataset gateway.
1164-
names: Filter by specified dataset names. (Default value = None).
1165-
creators: Filter by creators. (Default value = None).
1166-
include: Include files matching file pattern. (Default value = None).
1167-
exclude: Exclude files matching file pattern. (Default value = None).
1168-
ignore: Ignored datasets. (Default value = None).
1169-
immutable: Return immutable copies of dataset objects. (Default value = False).
1168+
names(Optional[List[str]]): Filter by specified dataset names (Default value = None).
1169+
tag(Optional[str]): Filter by specified tag (Default value = None).
1170+
creators(Optional[Union[str, List[str], Tuple[str]]]): Filter by creators (Default value = None).
1171+
include(Optional[List[str]]): Patterns of file paths to include in the result (Default value = None).
1172+
exclude(Optional[List[str]]): Patterns of file paths to exclude from the result (Default value = None).
1173+
ignore(Optional[List[str]]): Ignored datasets (Default value = None).
1174+
immutable(bool): Return immutable copies of dataset objects (Default value = False).
11701175
11711176
Returns:
11721177
List[DynamicProxy]: List of filtered files sorted by date added.
11731178
"""
1179+
1180+
def should_include(filepath: Path) -> bool:
1181+
"""Check if file matches one of include filters and not in exclude filter."""
1182+
if exclude:
1183+
for pattern in exclude:
1184+
if filepath.match(pattern):
1185+
return False
1186+
1187+
if include:
1188+
for pattern in include:
1189+
if filepath.match(pattern):
1190+
return True
1191+
return False
1192+
1193+
return True
1194+
11741195
client = client_dispatcher.current_client
11751196

11761197
if isinstance(creators, str):
1177-
creators = set(creators.split(","))
1178-
1179-
if isinstance(creators, list) or isinstance(creators, tuple):
1180-
creators = set(creators)
1198+
creators_set = set(creators.split(","))
1199+
elif isinstance(creators, list) or isinstance(creators, tuple):
1200+
creators_set = set(creators)
1201+
else:
1202+
creators_set = set()
11811203

11821204
records = []
1183-
unused_names = set(names)
1205+
unused_names = set(names) if names is not None else set()
1206+
11841207
for dataset in dataset_gateway.get_all_active_datasets():
1208+
if (names and dataset.name not in names) or (ignore and dataset.name in ignore):
1209+
continue
1210+
1211+
if tag:
1212+
dataset = get_dataset_by_tag(dataset=dataset, tag=tag) # type: ignore
1213+
if not dataset:
1214+
continue
1215+
11851216
if not immutable:
11861217
dataset = dataset.copy()
1187-
if (not names or dataset.name in names) and (not ignore or dataset.name not in ignore):
1188-
if unused_names:
1189-
unused_names.remove(dataset.name)
1190-
for file in dataset.files:
1191-
record = DynamicProxy(file)
1192-
record.dataset = dataset
1193-
record.client = client
1194-
path = Path(record.entity.path)
1195-
match = _include_exclude(path, include, exclude)
1196-
1197-
if creators:
1198-
c: Person
1199-
dataset_creators = {c.name for c in dataset.creators}
1200-
match = match and creators.issubset(dataset_creators)
1201-
1202-
if match:
1203-
records.append(record)
12041218

1205-
if unused_names:
1206-
unused_names_str = ", ".join(unused_names)
1207-
raise errors.ParameterError(f"Dataset does not exist: {unused_names_str}")
1219+
if unused_names:
1220+
unused_names.remove(dataset.name)
12081221

1209-
return sorted(records, key=lambda r: r.date_added)
1222+
if creators_set:
1223+
dataset_creators = {creator.name for creator in dataset.creators}
1224+
if not creators_set.issubset(dataset_creators):
1225+
continue
12101226

1227+
for file in dataset.files:
1228+
if not should_include(Path(file.entity.path)):
1229+
continue
12111230

1212-
def _include_exclude(file_path, include=None, exclude=None):
1213-
"""Check if file matches one of include filters and not in exclude filter.
1231+
record = DynamicProxy(file)
1232+
record.dataset = dataset
1233+
record.client = client
1234+
records.append(record)
12141235

1215-
Args:
1216-
file_path: Path to the file.
1217-
include: Tuple containing patterns to which include from result (Default value = None).
1218-
exclude: Tuple containing patterns to which exclude from result (Default value = None).
1236+
if unused_names:
1237+
unused_names_str = ", ".join(unused_names)
1238+
raise errors.ParameterError(f"These datasets don't exist: {unused_names_str}")
12191239

1220-
Returns:
1221-
bool: True if a file should be included, False otherwise.
1222-
"""
1223-
if exclude is not None and exclude:
1224-
for pattern in exclude:
1225-
if file_path.match(pattern):
1226-
return False
1227-
1228-
if include is not None and include:
1229-
for pattern in include:
1230-
if file_path.match(pattern):
1231-
return True
1232-
return False
1233-
1234-
return True
1240+
return sorted(records, key=lambda r: r.date_added)

renku/core/dataset/tag.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@
2424
from renku.core import errors
2525
from renku.core.dataset.datasets_provenance import DatasetsProvenance
2626
from renku.core.util import communication
27-
from renku.domain_model.dataset import DatasetTag, Url
27+
from renku.domain_model.dataset import Dataset, DatasetTag, Url
28+
from renku.infrastructure.gateway.dataset_gateway import DatasetGateway
2829
from renku.infrastructure.immutable import DynamicProxy
2930

3031

@@ -94,6 +95,25 @@ def remove_dataset_tags(dataset_name: str, tags: List[str]):
9495
datasets_provenance.remove_tag(dataset, tag)
9596

9697

98+
def get_dataset_by_tag(dataset: Dataset, tag: str) -> Optional[Dataset]:
99+
"""Return a version of dataset that has a specific tag.
100+
101+
Args:
102+
dataset(Dataset): A dataset to return its tagged version.
103+
tag(str): Tag name to search for.
104+
105+
Returns:
106+
Optional[Dataset]: The dataset pointed to by the tag or None if nothing found.
107+
"""
108+
dataset_gateway = DatasetGateway()
109+
110+
tags = dataset_gateway.get_all_tags(dataset)
111+
selected_tag = next((t for t in tags if t.name == tag), None)
112+
if selected_tag is None:
113+
return None
114+
return dataset_gateway.get_by_id(selected_tag.dataset_id.value)
115+
116+
97117
def prompt_access_token(exporter):
98118
"""Prompt user for an access token for a provider.
99119

renku/core/util/metadata.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,9 @@
3434
from renku.domain_model.provenance.agent import Person
3535

3636

37-
def construct_creators(creators: Optional[List[Union[dict, str]]], ignore_email=False):
37+
def construct_creators(
38+
creators: List[Union[dict, str]], ignore_email=False
39+
) -> Tuple[List["Person"], List[Union[dict, str]]]:
3840
"""Parse input and return a list of Person."""
3941
creators = creators or []
4042

@@ -46,7 +48,8 @@ def construct_creators(creators: Optional[List[Union[dict, str]]], ignore_email=
4648
for creator in creators:
4749
person, no_email_warning = construct_creator(creator, ignore_email=ignore_email)
4850

49-
people.append(person)
51+
if person:
52+
people.append(person)
5053

5154
if no_email_warning:
5255
no_email_warnings.append(no_email_warning)

0 commit comments

Comments
 (0)