Skip to content

Commit e1124bd

Browse files
authored
Drop controller from primary Dataset SQL table (#2835)
PBENCH-445 I've been talking about this for a long time; after pushing PR #2826, which will require rebuilding the DB, I figured I might as well try to take this additional step at the same time. There are a lot of changed files; most of the changes (including the legacy test gold files) are small and straightforward.
1 parent fbcd7f6 commit e1124bd

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

68 files changed

+202
-390
lines changed

lib/pbench/server/api/resources/datasets_list.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ def __init__(self, config: PbenchServerConfig, logger: logging.Logger):
3131
Parameter("name", ParamType.STRING),
3232
Parameter("owner", ParamType.USER),
3333
Parameter("access", ParamType.ACCESS),
34-
Parameter("controller", ParamType.STRING),
3534
Parameter("start", ParamType.DATE),
3635
Parameter("end", ParamType.DATE),
3736
Parameter(
@@ -82,9 +81,6 @@ def _get(self, json_data: JSON, request: Request) -> Response:
8281
if "name" in json:
8382
self.logger.info("Adding name query")
8483
query = query.filter(Dataset.name.contains(json["name"]))
85-
if "controller" in json:
86-
self.logger.info("Adding controller query")
87-
query = query.filter(Dataset.controller == json["controller"])
8884
query = self._build_sql_query(json, query)
8985

9086
# Useful for debugging, but verbose: this displays the fully expanded
@@ -105,7 +101,6 @@ def _get(self, json_data: JSON, request: Request) -> Response:
105101
for dataset in results:
106102
d = {
107103
"name": dataset.name,
108-
"controller": dataset.controller,
109104
"run_id": dataset.md5,
110105
}
111106
try:

lib/pbench/server/api/resources/upload_api.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
Metadata,
1717
States,
1818
)
19-
from pbench.server.filetree import DatasetNotFound, FileTree, Tarball
19+
from pbench.server.filetree import DatasetNotFound, FileTree
2020
from pbench.server.utils import UtcTimeHelper, filesize_bytes
2121

2222

@@ -149,26 +149,23 @@ def put(self, filename: str):
149149
try:
150150
dataset = Dataset(
151151
owner=username,
152-
controller=controller,
153-
path=tar_full_path,
152+
name=Dataset.stem(tar_full_path),
154153
md5=md5sum,
155154
)
156155
dataset.add()
157156
except DatasetDuplicate:
158-
dataset_name = Tarball.stem(tar_full_path)
157+
dataset_name = Dataset.stem(tar_full_path)
159158
self.logger.info(
160-
"Dataset already exists, user = {}, ctrl = {!a}, file = {!a}",
159+
"Dataset already exists, user = {}, file = {!a}",
161160
username,
162-
controller,
163161
dataset_name,
164162
)
165163
try:
166164
duplicate = Dataset.query(name=dataset_name)
167165
except DatasetNotFound:
168166
self.logger.error(
169-
"Duplicate dataset {}:{}:{} not found",
167+
"Duplicate dataset {}:{} not found",
170168
username,
171-
controller,
172169
dataset_name,
173170
)
174171
raise CleanupTime(

lib/pbench/server/database/models/datasets.py

Lines changed: 58 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
import copy
22
import datetime
33
import enum
4-
import os
54
from pathlib import Path
65
import re
7-
from typing import Any, List, Optional, Tuple, Union
6+
from typing import Any, List, Optional, Union
87

98
from sqlalchemy import Column, DateTime, Enum, event, ForeignKey, Integer, JSON, String
109
from sqlalchemy.exc import IntegrityError, SQLAlchemyError
@@ -24,13 +23,25 @@ class DatasetError(Exception):
2423
pass
2524

2625

26+
class DatasetBadName(DatasetError):
27+
"""
28+
Specified filename does not follow Pbench tarball naming rules.
29+
"""
30+
31+
def __init__(self, name: Path):
32+
self.name: str = str(name)
33+
34+
def __str__(self) -> str:
35+
return f"File name {self.name!r} does not end in {Dataset.TARBALL_SUFFIX!r}"
36+
37+
2738
class DatasetSqlError(DatasetError):
2839
"""
2940
SQLAlchemy errors reported through Dataset operations.
3041
31-
The exception will identify the controller and name of the target dataset,
32-
along with the operation being attempted; the __cause__ will specify the
33-
original SQLAlchemy exception.
42+
The exception will identify the name of the target dataset, along with the
43+
operation being attempted; the __cause__ will specify the original
44+
SQLAlchemy exception.
3445
"""
3546

3647
def __init__(self, operation: str, **kwargs):
@@ -46,12 +57,11 @@ class DatasetDuplicate(DatasetError):
4657
Attempt to create a Dataset that already exists.
4758
"""
4859

49-
def __init__(self, controller: str, name: str):
50-
self.controller = controller
60+
def __init__(self, name: str):
5161
self.name = name
5262

5363
def __str__(self):
54-
return f"Duplicate dataset {self.controller}|{self.name}"
64+
return f"Duplicate dataset {self.name!r}"
5565

5666

5767
class DatasetNotFound(DatasetError):
@@ -98,8 +108,8 @@ class DatasetTerminalStateViolation(DatasetTransitionError):
98108
An attempt was made to change the state of a dataset currently in a
99109
terminal state.
100110
101-
The error text will identify the dataset by controller and name, and both
102-
the current and requested new states.
111+
The error text will identify the dataset by name, and both the current and
112+
requested new states.
103113
"""
104114

105115
def __init__(self, dataset: "Dataset", requested_state: "States"):
@@ -115,8 +125,8 @@ class DatasetBadStateTransition(DatasetTransitionError):
115125
An attempt was made to advance a dataset to a new state that's not
116126
reachable from the current state.
117127
118-
The error text will identify the dataset by controller and name, and both
119-
the current and requested new states.
128+
The error text will identify the dataset by name, and both the current and
129+
requested new states.
120130
"""
121131

122132
def __init__(self, dataset: "Dataset", requested_state: "States"):
@@ -364,7 +374,6 @@ class Dataset(Database.Base):
364374
id Generated unique ID of table row
365375
owner Owning username of the dataset
366376
access Dataset is "private" to owner, or "public"
367-
controller Name of controller node
368377
name Base name of dataset (tarball)
369378
md5 The dataset MD5 hash (Elasticsearch ID)
370379
created Tarball metadata timestamp (set during PUT)
@@ -415,9 +424,6 @@ class Dataset(Database.Base):
415424
# Access policy for Dataset (public or private)
416425
access = Column(String(255), unique=False, nullable=False, default="private")
417426

418-
# Host name of the system that collected the data
419-
controller = Column(String(255), unique=False, nullable=False)
420-
421427
# FIXME:
422428
# Ideally, `md5` would not be `nullable`, but allowing it means that
423429
# pbench-server-prep-shim-002 utility can construct a Dataset object
@@ -467,6 +473,31 @@ def is_tarball(path: Union[Path, str]) -> bool:
467473
"""
468474
return str(path).endswith(Dataset.TARBALL_SUFFIX)
469475

476+
@staticmethod
477+
def stem(path: Union[str, Path]) -> str:
478+
"""
479+
The Path.stem() removes a single suffix, so our standard "a.tar.xz"
480+
returns "a.tar" instead of "a". We could double-stem, but instead
481+
this just checks for the expected 7 character suffix and strips it.
482+
483+
If the path does not end in ".tar.xz" then the full path.name is
484+
returned.
485+
486+
Args:
487+
path: A file path that might be a Pbench tarball
488+
489+
Raises:
490+
DatasetBadName: the path name does not end in TARBALL_SUFFIX
491+
492+
Returns:
493+
The stripped "stem" of the dataset
494+
"""
495+
p = Path(path)
496+
if __class__.is_tarball(p):
497+
return p.name[: -len(Dataset.TARBALL_SUFFIX)]
498+
else:
499+
raise DatasetBadName(p)
500+
470501
@validates("state")
471502
def validate_state(self, key: str, value: Any) -> States:
472503
"""
@@ -532,55 +563,6 @@ def validate_access(self, key: str, value: str) -> str:
532563
return access
533564
raise DatasetBadParameterType(value, "access keyword")
534565

535-
@staticmethod
536-
def _render_path(patharg=None, controllerarg=None, namearg=None) -> Tuple[str, str]:
537-
"""
538-
Process a `path` string and convert it into `controller` and/or `name`
539-
strings.
540-
541-
This pre-processes the controller and name before they are presented to
542-
a query or constructor. If the calling context has only the full file
543-
path of a dataset, this can extract both "controller" and "name" from
544-
the path. It can also be used to construct either "controller" or
545-
"name", as it will fill in either or both if not already present in
546-
the dict.
547-
548-
If the path is a symlink (e.g., a "TO-BACKUP" or other Pbench command
549-
link, most likely when a dataset is quarantined due to some consistency
550-
check failure), this code will follow the link in order to construct a
551-
controller name from the original path without needing to make any
552-
possibly fragile assumptions regarding the structure of the symlink
553-
name.
554-
555-
Args:
556-
patharg: A tarball file path from which the controller (host)
557-
name, the tarball dataset name (basename minus extension),
558-
or both will be derived.
559-
controllerarg: The controller name (hostname) of the dataset;
560-
this is retained if specified, or will be constructed
561-
from "path" if not present.
562-
namearg: The dataset name (file path stem);
563-
this is retained if specified, or will be constructed from
564-
"path" if not present
565-
566-
Returns:
567-
A tuple of (controller, name) based on the three arguments
568-
"""
569-
controller_result = controllerarg
570-
name_result = namearg
571-
572-
if patharg:
573-
path = Path(patharg)
574-
if path.is_symlink():
575-
path = Path(os.path.realpath(path))
576-
if not name_result:
577-
name_result = path.name
578-
if Dataset.is_tarball(name_result):
579-
name_result = name_result[: -len(Dataset.TARBALL_SUFFIX)]
580-
if not controller_result:
581-
controller_result = path.parent.name
582-
return controller_result, name_result
583-
584566
@staticmethod
585567
def create(**kwargs) -> "Dataset":
586568
"""
@@ -590,15 +572,7 @@ def create(**kwargs) -> "Dataset":
590572
Args:
591573
kwargs (dict):
592574
"owner": The owner of the dataset; defaults to None.
593-
"path": A tarball file path from which the controller (host)
594-
name, the tarball dataset name (basename minus extension),
595-
or both will be derived.
596-
"controller": The controller name (hostname) of the dataset;
597-
this is retained if specified, or will be constructed
598-
from "path" if not present.
599-
"name": The dataset name (file path stem);
600-
this is retained if specified, or will be constructed from
601-
"path" if not present.
575+
"name": The dataset name (file path stem).
602576
"state": The initial state of the new dataset.
603577
604578
Returns:
@@ -608,18 +582,14 @@ def create(**kwargs) -> "Dataset":
608582
dataset = Dataset(**kwargs)
609583
dataset.add()
610584
except Exception:
611-
Dataset.logger.exception(
612-
"Failed create: {}|{}", kwargs.get("controller"), kwargs.get("name")
613-
)
585+
Dataset.logger.exception("Failed create: {}", kwargs.get("name"))
614586
raise
615587
return dataset
616588

617589
@staticmethod
618-
def attach(path=None, name=None, state=None) -> "Dataset":
590+
def attach(name=None, state=None) -> "Dataset":
619591
"""
620-
Attempt to find dataset for the specified dataset name, or using a
621-
specified file path (see _render_path and the path_init event listener
622-
for details).
592+
Attempt to find dataset for the specified dataset name.
623593
624594
If state is specified, attach will attempt to advance the dataset to
625595
that state.
@@ -628,12 +598,7 @@ def attach(path=None, name=None, state=None) -> "Dataset":
628598
using a full path, use Dataset.query instead.
629599
630600
Args:
631-
"path": A tarball file path from which the controller (host)
632-
name, the tarball dataset name (basename minus extension),
633-
or both will be derived.
634-
"name": The dataset name (file path stem);
635-
this is retained if specified, or will be constructed from
636-
"path" if not present.
601+
"name": The dataset name (file path stem).
637602
"state": The desired state to advance the dataset.
638603
639604
Raises:
@@ -649,12 +614,11 @@ def attach(path=None, name=None, state=None) -> "Dataset":
649614
Dataset: a dataset object in the desired state (if specified)
650615
"""
651616
# Make sure we have a name if only path was specified
652-
_, name = Dataset._render_path(patharg=path, namearg=name)
653617
dataset = Dataset.query(name=name)
654618

655619
if dataset is None:
656620
Dataset.logger.warning("Dataset {} not found", name)
657-
raise DatasetNotFound(path=path, name=name)
621+
raise DatasetNotFound(name=name)
658622
elif state:
659623
dataset.advance(state)
660624
return dataset
@@ -682,7 +646,7 @@ def __str__(self) -> str:
682646
Returns:
683647
string: Representation of the dataset
684648
"""
685-
return f"{self.owner.username}({self.owner_id})|{self.controller}|{self.name}"
649+
return f"{self.owner.username}({self.owner_id})|{self.name}"
686650

687651
def advance(self, new_state: States):
688652
"""
@@ -726,15 +690,13 @@ def add(self):
726690
Database.db_session.add(self)
727691
Database.db_session.commit()
728692
except IntegrityError as e:
729-
Dataset.logger.warning(
730-
"Duplicate dataset {}|{}: {}", self.controller, self.name, e
731-
)
693+
Dataset.logger.warning("Duplicate dataset {}: {}", self.name, e)
732694
Database.db_session.rollback()
733-
raise DatasetDuplicate(self.controller, self.name) from None
695+
raise DatasetDuplicate(self.name) from None
734696
except Exception:
735697
self.logger.exception("Can't add {} to DB", str(self))
736698
Database.db_session.rollback()
737-
raise DatasetSqlError("adding", controller=self.controller, name=self.name)
699+
raise DatasetSqlError("adding", name=self.name)
738700

739701
def update(self):
740702
"""
@@ -746,9 +708,7 @@ def update(self):
746708
except Exception:
747709
self.logger.error("Can't update {} in DB", str(self))
748710
Database.db_session.rollback()
749-
raise DatasetSqlError(
750-
"updating", controller=self.controller, name=self.name
751-
)
711+
raise DatasetSqlError("updating", name=self.name)
752712

753713
def delete(self):
754714
"""
@@ -762,33 +722,6 @@ def delete(self):
762722
raise
763723

764724

765-
@event.listens_for(Dataset, "init")
766-
def path_init(target, args, kwargs):
767-
"""
768-
Listen for an init event on a Dataset to process a path before the
769-
SQLAlchemy constructor sees it.
770-
771-
We want the constructor to see both "controller" and "name" parameters in
772-
the kwargs representing the initial SQL column values. This listener allows
773-
us to provide those values by specifying a file path, which is processed
774-
into controller and name using the internal _render_path() helper.
775-
776-
We will remove "path" from kwargs so that SQLAlchemy doesn't see it. (This
777-
is an explicitly allowed side effect of the listener architecture.)
778-
"""
779-
if "path" in kwargs:
780-
controller, name = Dataset._render_path(
781-
patharg=kwargs.get("path"),
782-
controllerarg=kwargs.get("controller"),
783-
namearg=kwargs.get("name"),
784-
)
785-
if "controller" not in kwargs:
786-
kwargs["controller"] = controller
787-
if "name" not in kwargs:
788-
kwargs["name"] = name
789-
del kwargs["path"]
790-
791-
792725
class Metadata(Database.Base):
793726
"""
794727
Retain secondary information about datasets

0 commit comments

Comments
 (0)