Skip to content

Commit 702379d

Browse files
authored
Improve the efficiency of working with the DSA. (#1691)
* Improve the efficiency of working with the DSA.

  Before, this often queried all of the items, files, or annotations in the system. Now it only queries what is required to perform the desired task.

  I've disabled a status check (by having it always return 0 values). If this is actually needed, we should add an endpoint to the DSA to do this efficiently. The listing endpoints do return counts in their headers. The slow query is getting the count of images with annotations.

  Signed-off-by: David Manthey <[email protected]>

* Find items regardless of folder; be faster for known folders

---------

Signed-off-by: David Manthey <[email protected]>
1 parent 8318962 commit 702379d

File tree

1 file changed

+31
-12
lines changed

1 file changed

+31
-12
lines changed

monailabel/datastore/dsa.py

Lines changed: 31 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def get_label_by_image_id(self, image_id: str, tag: str) -> str:
8484
def get_annotations_by_image_id(self, image_id: str) -> Dict[str, Dict[str, List]]:
8585
image_id, name = self._name_to_id(image_id)
8686

87-
data = self.gc.get("annotation", parameters={"limit": 0})
87+
data = self.gc.get(f"annotation/item/{image_id}", parameters={"limit": 0})
8888
result: Dict[str, Dict[str, List]] = {}
8989

9090
# TODO(avirodov): probably can request only annotation for a given image_id, need to check how.
@@ -136,26 +136,40 @@ def get_image(self, image_id: str, params=None) -> Any:
136136
def _name_to_id(self, name):
137137
folders = self.folders if self.folders else self._get_all_folders()
138138
for folder in folders:
139+
# First check if the name is directly present
140+
data = self.gc.get("item", parameters={"folderId": folder, "name": name, "limit": 0})
141+
for d in data:
142+
if d.get("largeImage"):
143+
return d["_id"], d["name"]
144+
# next check if the name is present in a stem form
139145
data = self.gc.get("item", parameters={"folderId": folder, "limit": 0})
140146
for d in data:
141147
if d.get("largeImage") and d["name"] == name or Path(d["name"]).stem == name:
142148
return d["_id"], d["name"]
143-
return name
149+
# Next check if the name is anywhere in the system
150+
data = self.gc.get("item", parameters={"text": f'"{name}"' if '"' not in name else name, "limit": 0})
151+
for d in data:
152+
if d.get("largeImage") and d["name"] == name or Path(d["name"]).stem == name:
153+
return d["_id"], d["name"]
154+
# If we fail to find the item, the best we can do is return the name
155+
return name, name
144156

145157
def get_image_uri(self, image_id: str) -> str:
146158
try:
147-
name = self.get_image_info(image_id)["name"]
159+
info = self.get_image_info(image_id)
160+
name = info["name"]
161+
file_id = info.get("largeImage", {}).get("fileId")
148162
except girder_client.HttpError:
149163
image_id, name = self._name_to_id(image_id)
164+
file_id = None
150165

151166
if self.asset_store_path:
152-
data = self.gc.get(f"item/{image_id}/files", parameters={"limit": 0})
153-
assets = [d["assetstoreId"] for d in data]
154-
for asset in assets:
155-
files = self.gc.get(f"assetstore/{asset}/files", parameters={"limit": 0})
156-
for f in files:
157-
if f["itemId"] == image_id:
158-
return str(os.path.join(self.asset_store_path, f["path"]))
167+
if file_id is None:
168+
data = self.gc.get(f"item/{image_id}/files", parameters={"limit": 0})
169+
file_id = data[0]["_id"]
170+
f = self.gc.get(f"resource/{file_id}?type=file")
171+
if "path" in f and os.path.exists(os.path.join(self.asset_store_path, f["path"])):
172+
return str(os.path.join(self.asset_store_path, f["path"]))
159173
else:
160174
cached = os.path.join(self.cache_path, name)
161175
if os.path.exists(cached):
@@ -243,9 +257,14 @@ def get_dataset_archive(self, limit_cases: Optional[int]) -> str:
243257
raise NotImplementedError
244258

245259
def status(self) -> Dict[str, Any]:
260+
# This is a very costly query, disable it for now
261+
# return {
262+
# "total": len(self.list_images()),
263+
# "completed": len(self.get_labeled_images()),
264+
# }
246265
return {
247-
"total": len(self.list_images()),
248-
"completed": len(self.get_labeled_images()),
266+
"total": 0,
267+
"completed": 0,
249268
}
250269

251270
def json(self):

0 commit comments

Comments
 (0)