Skip to content

Commit c0946eb

Browse files
authored
Make contents API scale (distributed-system-analysis#3609)
* Make `contents` API scale PBENCH-1321 The `/datasets/{id}/contents` API involves several unexpectedly expensive steps: 1. Finding the tarball (by MD5 value) within the `ARCHIVE` tree using a `glob` 2. Fully discovering all tarballs within the controller directory 3. Unpacking the tarball into a cache directory using `tar` 4. Building a "map" of the contents of the unpacked tarball subtree This PR includes mitigations for all but the `tar` unpack step: 1. Use the `server.tarball-path` metadata instead of searching the disk 2. Only discover the target tarball rather than the entire controller 3. Skip the "map" and evaluate the actual target path within the cache Finding a tarball within our 30Tb `ARCHIVE` tree can take many minutes, while identifying the controller directory from the tarball path takes a fraction of a second. Depending on the number of tarballs within a controller (some have many), full controller discovery has been observed to take half a minute; while populating only the target tarball takes a fraction of a second. Building the map for a large tarball tree can take minutes, whereas discovery of the actual relative file path within the cache runs at native (Python) file system speeds.
1 parent d6b8f26 commit c0946eb

File tree

7 files changed

+558
-340
lines changed

7 files changed

+558
-340
lines changed

lib/pbench/cli/server/report.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -486,12 +486,12 @@ def report(
486486

487487
try:
488488
config = config_setup(context)
489-
logger = get_pbench_logger("report-generator", config)
489+
logger = get_pbench_logger("pbench-report-generator", config)
490490
if any((all, archive, backup, cache)):
491491
cache_m = CacheManager(config, logger)
492492
verifier.status("starting discovery")
493493
watcher.update("discovering cache")
494-
cache_m.full_discovery()
494+
cache_m.full_discovery(search=False)
495495
watcher.update("processing reports")
496496
verifier.status("finished discovery")
497497
if all or archive:

lib/pbench/cli/server/tree_manage.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def tree_manage(
8080
logger = None
8181
try:
8282
config = config_setup(context)
83-
logger = get_pbench_logger("cachemanager", config)
83+
logger = get_pbench_logger("pbench-tree-manager", config)
8484
cache_m = CacheManager(config, logger)
8585
cache_m.full_discovery()
8686
if display:
Lines changed: 9 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
from http import HTTPStatus
2-
from pathlib import Path
32

43
from flask import current_app, jsonify
54
from flask.wrappers import Request, Response
@@ -22,8 +21,6 @@
2221
BadDirpath,
2322
CacheExtractBadPath,
2423
CacheManager,
25-
CacheObject,
26-
CacheType,
2724
TarballNotFound,
2825
)
2926
from pbench.server.database.models.datasets import Dataset
@@ -65,100 +62,22 @@ def _get(self, params: ApiParams, req: Request, context: ApiContext) -> Response
6562

6663
dataset: Dataset = params.uri["dataset"]
6764
target = params.uri.get("target")
68-
path = Path("." if target in ("/", None) else target)
65+
path = "." if target in ("/", None) else target
66+
67+
prefix = current_app.server_config.rest_uri
68+
origin = (
69+
f"{self._get_uri_base(req).host}{prefix}/datasets/{dataset.resource_id}"
70+
)
6971

7072
cache_m = CacheManager(self.config, current_app.logger)
7173
try:
72-
info = cache_m.find_entry(dataset.resource_id, path)
74+
info = cache_m.get_contents(dataset.resource_id, path, origin)
7375
except (BadDirpath, CacheExtractBadPath, TarballNotFound) as e:
7476
raise APIAbort(HTTPStatus.NOT_FOUND, str(e))
7577
except Exception as e:
7678
raise APIInternalError(f"Cache find error: {str(e)!r}")
7779

78-
prefix = current_app.server_config.rest_uri
79-
origin = (
80-
f"{self._get_uri_base(req).host}{prefix}/datasets/{dataset.resource_id}"
81-
)
82-
83-
details: CacheObject = info["details"]
84-
if details.type is CacheType.DIRECTORY:
85-
children = info["children"] if "children" in info else {}
86-
dir_list = []
87-
file_list = []
88-
89-
for c, value in children.items():
90-
d: CacheObject = value["details"]
91-
if d.type is CacheType.DIRECTORY:
92-
dir_list.append(
93-
{
94-
"name": c,
95-
"type": d.type.name,
96-
"uri": f"{origin}/contents/{d.location}",
97-
}
98-
)
99-
elif d.type is CacheType.SYMLINK:
100-
if d.resolve_type is CacheType.DIRECTORY:
101-
uri = f"{origin}/contents/{d.resolve_path}"
102-
elif d.resolve_type is CacheType.FILE:
103-
uri = f"{origin}/inventory/{d.resolve_path}"
104-
else:
105-
uri = f"{origin}/inventory/{d.location}"
106-
file_list.append(
107-
{
108-
"name": c,
109-
"type": d.type.name,
110-
"link": str(d.resolve_path),
111-
"link_type": d.resolve_type.name,
112-
"uri": uri,
113-
}
114-
)
115-
else:
116-
r = {
117-
"name": c,
118-
"type": d.type.name,
119-
"uri": f"{origin}/inventory/{d.location}",
120-
}
121-
if d.type is CacheType.FILE:
122-
r["size"] = d.size
123-
file_list.append(r)
124-
125-
dir_list.sort(key=lambda d: d["name"])
126-
file_list.sort(key=lambda d: d["name"])
127-
128-
# Normalize because we want the "root" directory to be reported as
129-
# "" rather than as Path's favored "."
130-
loc = str(details.location)
131-
name = details.name
132-
if loc == ".":
133-
loc = ""
134-
name = ""
135-
val = {
136-
"name": name,
137-
"type": details.type.name,
138-
"directories": dir_list,
139-
"files": file_list,
140-
"uri": f"{origin}/contents/{loc}",
141-
}
142-
else:
143-
access = "inventory"
144-
link = str(details.location)
145-
if details.type is CacheType.SYMLINK:
146-
if details.resolve_type is CacheType.DIRECTORY:
147-
access = "contents"
148-
if details.resolve_type in (CacheType.FILE, CacheType.DIRECTORY):
149-
link = str(details.resolve_path)
150-
val = {
151-
"name": details.name,
152-
"type": details.type.name,
153-
"uri": f"{origin}/{access}/{link}",
154-
}
155-
if details.type is CacheType.SYMLINK:
156-
val["link"] = link
157-
val["link_type"] = details.resolve_type.name
158-
elif details.type is CacheType.FILE:
159-
val["size"] = details.size
160-
16180
try:
162-
return jsonify(val)
81+
return jsonify(info)
16382
except Exception as e:
164-
raise APIInternalError(f"JSONIFY {val}: {str(e)!r}")
83+
raise APIInternalError(f"JSONIFY {info}: {str(e)!r}")

0 commit comments

Comments
 (0)