Skip to content

Commit 1ef85d8

Browse files
authored
feat: removes temp workaround for co assets (#114)
1 parent c7a9e11 commit 1ef85d8

File tree

3 files changed

+140
-82
lines changed

3 files changed

+140
-82
lines changed

src/aind_data_asset_indexer/codeocean_bucket_indexer.py

Lines changed: 40 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,12 @@
1212

1313
import boto3
1414
import dask.bag as dask_bag
15-
import requests
1615
from aind_data_schema.core.metadata import ExternalPlatforms
1716
from codeocean import CodeOcean
17+
from codeocean.data_asset import DataAssetSearchOrigin, DataAssetSearchParams
1818
from mypy_boto3_s3 import S3Client
1919
from pymongo import MongoClient
2020
from pymongo.operations import UpdateOne
21-
from requests.exceptions import ReadTimeout
2221
from urllib3.util import Retry
2322

2423
from aind_data_asset_indexer.models import CodeOceanIndexBucketJobSettings
@@ -53,30 +52,51 @@ def __init__(self, job_settings: CodeOceanIndexBucketJobSettings):
5352
"""Class constructor."""
5453
self.job_settings = job_settings
5554

56-
def _get_external_data_asset_records(self) -> Optional[List[dict]]:
55+
@staticmethod
56+
def _get_external_data_asset_records(
57+
co_client: CodeOcean,
58+
) -> Optional[List[dict]]:
5759
"""
5860
Retrieves list of code ocean ids and locations for external data
5961
assets. Records are fetched with the Code Ocean client's search iterator.
62+
63+
Parameters
64+
----------
65+
co_client : CodeOcean
66+
6067
Returns
6168
-------
6269
List[dict] | None
6370
List items have shape {"id": str, "location": str}. If error occurs,
6471
return None.
72+
6573
"""
6674
try:
67-
response = requests.get(
68-
self.job_settings.temp_codeocean_endpoint,
69-
timeout=600,
75+
search_params = DataAssetSearchParams(
76+
archived=False,
77+
origin=DataAssetSearchOrigin.External,
78+
limit=1000,
7079
)
71-
if response.status_code == 200:
72-
return response.json()
73-
else:
74-
return None
75-
except ReadTimeout:
76-
logging.error(
77-
f"Read timed out at "
78-
f"{self.job_settings.temp_codeocean_endpoint}"
80+
data_assets = co_client.data_assets.search_data_assets_iterator(
81+
search_params=search_params
7982
)
83+
external_records = []
84+
for data_asset in data_assets:
85+
data_asset_source = data_asset.source_bucket
86+
if (
87+
data_asset_source is not None
88+
and data_asset_source.bucket is not None
89+
and data_asset_source.prefix is not None
90+
):
91+
bucket = data_asset_source.bucket
92+
prefix = data_asset_source.prefix
93+
location = f"s3://{bucket}/{prefix}"
94+
external_records.append(
95+
{"id": data_asset.id, "location": location}
96+
)
97+
return external_records
98+
except Exception as e:
99+
logging.exception(e)
80100
return None
81101

82102
@staticmethod
@@ -98,7 +118,7 @@ def _map_external_list_to_dict(external_recs: List[dict]) -> dict:
98118
"""
99119
new_records = dict()
100120
for r in external_recs:
101-
location = r.get("source")
121+
location = r.get("location")
102122
rec_id = r["id"]
103123
if location is not None and new_records.get(location) is not None:
104124
old_id_set = new_records.get(location)
@@ -141,7 +161,7 @@ def _get_co_links_from_record(
141161
return external_links
142162

143163
def _update_external_links_in_docdb(
144-
self, docdb_client: MongoClient
164+
self, docdb_client: MongoClient, co_client: CodeOcean
145165
) -> None:
146166
"""
147167
This method will:
@@ -160,7 +180,9 @@ def _update_external_links_in_docdb(
160180
161181
"""
162182
# Should return a list like [{"id": co_id, "location": "s3://..."},]
163-
list_of_co_ids_and_locations = self._get_external_data_asset_records()
183+
list_of_co_ids_and_locations = self._get_external_data_asset_records(
184+
co_client=co_client
185+
)
164186
db = docdb_client[self.job_settings.doc_db_db_name]
165187
collection = db[self.job_settings.doc_db_collection_name]
166188
if list_of_co_ids_and_locations is not None:
@@ -424,7 +446,7 @@ def run_job(self):
424446
# Use existing client to add external links to fields
425447
logging.info("Adding links to records.")
426448
self._update_external_links_in_docdb(
427-
docdb_client=iterator_docdb_client
449+
docdb_client=iterator_docdb_client, co_client=co_client
428450
)
429451
logging.info("Finished adding links to records")
430452
all_docdb_records = dict()

src/aind_data_asset_indexer/models.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -124,11 +124,12 @@ class CodeOceanIndexBucketJobSettings(IndexJobSettings):
124124
doc_db_collection_name: str
125125
codeocean_domain: str
126126
codeocean_token: SecretStr
127-
temp_codeocean_endpoint: str = Field(
127+
temp_codeocean_endpoint: Optional[str] = Field(
128+
default=None,
128129
description=(
129-
"Temp proxy to access code ocean information from their analytics "
130-
"databases."
131-
)
130+
"(deprecated) Temp proxy to access code ocean information from "
131+
"their analytics databases. Will be removed in a future release."
132+
),
132133
)
133134

134135
@classmethod

tests/test_codeocean_bucket_indexer.py

Lines changed: 95 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,20 @@
11
"""Tests methods in codeocean_bucket_indexer module"""
22

3-
import json
43
import os
54
import unittest
65
from datetime import datetime, timezone
76
from pathlib import Path
87
from unittest.mock import MagicMock, call, patch
98

9+
from codeocean import CodeOcean
10+
from codeocean.data_asset import (
11+
DataAsset,
12+
DataAssetOrigin,
13+
DataAssetState,
14+
DataAssetType,
15+
SourceBucket,
16+
)
1017
from pymongo.operations import UpdateOne
11-
from requests import Response
12-
from requests.exceptions import ReadTimeout
1318

1419
from aind_data_asset_indexer.codeocean_bucket_indexer import (
1520
CodeOceanIndexBucketJob,
@@ -102,54 +107,85 @@ def setUpClass(cls) -> None:
102107
},
103108
]
104109

105-
cls.example_temp_endpoint_response = [
106-
{"id": "abc-123", "source": "s3://bucket/prefix1"},
107-
{"id": "def-456", "source": "s3://bucket/prefix1"},
108-
{"id": "ghi-789", "source": "s3://bucket/prefix2"},
110+
cls.example_search_iterator_response = [
111+
DataAsset(
112+
id="abc-123",
113+
created=0,
114+
name="prefix1",
115+
mount="prefix1",
116+
state=DataAssetState.Ready,
117+
type=DataAssetType.Dataset,
118+
last_used=0,
119+
source_bucket=SourceBucket(
120+
bucket="bucket",
121+
prefix="prefix1",
122+
origin=DataAssetOrigin.AWS,
123+
),
124+
),
125+
DataAsset(
126+
id="def-456",
127+
created=0,
128+
name="prefix1",
129+
mount="prefix1",
130+
state=DataAssetState.Ready,
131+
type=DataAssetType.Dataset,
132+
last_used=0,
133+
source_bucket=SourceBucket(
134+
bucket="bucket",
135+
prefix="prefix1",
136+
origin=DataAssetOrigin.AWS,
137+
),
138+
),
139+
DataAsset(
140+
id="ghi-789",
141+
created=0,
142+
name="prefix2",
143+
mount="prefix2",
144+
state=DataAssetState.Ready,
145+
type=DataAssetType.Dataset,
146+
last_used=0,
147+
source_bucket=SourceBucket(
148+
bucket="bucket",
149+
prefix="prefix2",
150+
origin=DataAssetOrigin.AWS,
151+
),
152+
),
109153
]
110154

111-
@patch("requests.get")
112-
def test_get_external_data_asset_records(self, mock_get: MagicMock):
155+
@patch("codeocean.data_asset.DataAssets.search_data_assets_iterator")
156+
def test_get_external_data_asset_records(self, mock_search: MagicMock):
113157
"""Tests the _get_external_data_asset_records method"""
114-
example_response = self.example_temp_endpoint_response
115-
mock_get_response = Response()
116-
mock_get_response.status_code = 200
117-
mock_get_response._content = json.dumps(example_response).encode(
118-
"utf-8"
158+
mock_search.return_value = self.example_search_iterator_response
159+
response = self.basic_job._get_external_data_asset_records(
160+
co_client=CodeOcean(domain="www.example.com", token="")
119161
)
120-
mock_get.return_value = mock_get_response
121-
response = self.basic_job._get_external_data_asset_records()
122-
self.assertEqual(example_response, response)
162+
expected_response = [
163+
{"id": "abc-123", "location": "s3://bucket/prefix1"},
164+
{"id": "def-456", "location": "s3://bucket/prefix1"},
165+
{"id": "ghi-789", "location": "s3://bucket/prefix2"},
166+
]
167+
self.assertEqual(expected_response, response)
123168

124-
@patch("requests.get")
125-
def test_get_external_data_asset_records_error(self, mock_get: MagicMock):
169+
@patch("codeocean.data_asset.DataAssets.search_data_assets_iterator")
170+
def test_get_external_data_asset_records_err(self, mock_search: MagicMock):
126171
"""Tests the _get_external_data_asset_records method when an error
127172
is raised by the Code Ocean search"""
128-
mock_get_response = Response()
129-
mock_get_response.status_code = 500
130-
mock_get.return_value = mock_get_response
131-
response = self.basic_job._get_external_data_asset_records()
132-
self.assertIsNone(response)
133-
134-
@patch("requests.get")
135-
def test_get_external_data_asset_records_read_timeout(
136-
self, mock_get: MagicMock
137-
):
138-
"""Tests the _get_external_data_asset_records method when the read
139-
times out."""
140-
mock_get.side_effect = ReadTimeout()
173+
mock_search.side_effect = Exception("Something went wrong!")
141174
with self.assertLogs(level="DEBUG") as captured:
142-
response = self.basic_job._get_external_data_asset_records()
143-
expected_log_messages = [
144-
"ERROR:root:Read timed out at http://some_url:8080/created_after/0"
145-
]
146-
self.assertEqual(expected_log_messages, captured.output)
175+
response = self.basic_job._get_external_data_asset_records(
176+
co_client=CodeOcean(domain="www.example.com", token="")
177+
)
147178
self.assertIsNone(response)
179+
self.assertIsNotNone(captured.output)
148180

149181
def test_map_external_list_to_dict(self):
150182
"""Tests _map_external_list_to_dict method"""
151183
mapped_response = self.basic_job._map_external_list_to_dict(
152-
self.example_temp_endpoint_response
184+
[
185+
{"id": "abc-123", "location": "s3://bucket/prefix1"},
186+
{"id": "def-456", "location": "s3://bucket/prefix1"},
187+
{"id": "ghi-789", "location": "s3://bucket/prefix2"},
188+
]
153189
)
154190
expected_response = {
155191
"s3://bucket/prefix1": {"abc-123", "def-456"},
@@ -185,27 +221,21 @@ def test_get_co_links_from_record_legacy(self):
185221
self.assertEqual(["abc-123", "def-456"], output)
186222

187223
@patch("aind_data_asset_indexer.codeocean_bucket_indexer.MongoClient")
188-
@patch("requests.get")
224+
@patch("codeocean.data_asset.DataAssets.search_data_assets_iterator")
189225
@patch("aind_data_asset_indexer.codeocean_bucket_indexer.paginate_docdb")
190226
@patch("aind_data_asset_indexer.codeocean_bucket_indexer.datetime")
191227
def test_update_external_links_in_docdb(
192228
self,
193229
mock_datetime: MagicMock,
194230
mock_paginate: MagicMock,
195-
mock_get: MagicMock,
231+
mock_search: MagicMock,
196232
mock_docdb_client: MagicMock,
197233
):
198234
"""Tests _update_external_links_in_docdb method."""
199235
mock_datetime.utcnow.return_value = datetime(2024, 9, 5)
200236

201-
# Mock requests get response
202-
example_response = self.example_temp_endpoint_response
203-
mock_get_response = Response()
204-
mock_get_response.status_code = 200
205-
mock_get_response._content = json.dumps(example_response).encode(
206-
"utf-8"
207-
)
208-
mock_get.return_value = mock_get_response
237+
# Mock code ocean search response
238+
mock_search.return_value = self.example_search_iterator_response
209239

210240
# Mock bulk_write
211241
mock_db = MagicMock()
@@ -237,7 +267,8 @@ def test_update_external_links_in_docdb(
237267

238268
with self.assertLogs(level="DEBUG") as captured:
239269
self.basic_job._update_external_links_in_docdb(
240-
docdb_client=mock_docdb_client
270+
docdb_client=mock_docdb_client,
271+
co_client=CodeOcean(domain="www.example.com", token=""),
241272
)
242273
expected_log_messages = [
243274
"INFO:root:No code ocean data asset ids found for "
@@ -284,31 +315,31 @@ def test_update_external_links_in_docdb(
284315
mock_collection.bulk_write.assert_has_calls(expected_bulk_write_calls)
285316

286317
@patch("aind_data_asset_indexer.codeocean_bucket_indexer.MongoClient")
287-
@patch("requests.get")
318+
@patch("codeocean.data_asset.DataAssets.search_data_assets_iterator")
288319
@patch("aind_data_asset_indexer.codeocean_bucket_indexer.paginate_docdb")
289320
def test_update_external_links_in_docdb_error(
290321
self,
291322
mock_paginate: MagicMock,
292-
mock_get: MagicMock,
323+
mock_search: MagicMock,
293324
mock_docdb_client: MagicMock,
294325
):
295326
"""Tests _update_external_links_in_docdb method when there is an
296327
error retrieving info from the Code Ocean search."""
297-
# Mock requests get response
298-
mock_get_response = Response()
299-
mock_get_response.status_code = 500
300-
mock_get.return_value = mock_get_response
328+
# Mock search response
329+
mock_search.side_effect = Exception("Something went wrong!")
301330

302331
mock_db = MagicMock()
303332
mock_docdb_client.__getitem__.return_value = mock_db
304333
with self.assertLogs(level="DEBUG") as captured:
305334
self.basic_job._update_external_links_in_docdb(
306-
docdb_client=mock_docdb_client
335+
docdb_client=mock_docdb_client,
336+
co_client=CodeOcean(domain="www.example.com", token=""),
307337
)
308-
expected_log_messages = [
338+
expected_log_message = (
309339
"ERROR:root:There was an error retrieving external links!"
310-
]
311-
self.assertEqual(expected_log_messages, captured.output)
340+
)
341+
self.assertEqual(2, len(captured.output))
342+
self.assertEqual(expected_log_message, captured.output[1])
312343
mock_paginate.assert_not_called()
313344

314345
@patch("aind_data_asset_indexer.codeocean_bucket_indexer.MongoClient")
@@ -568,8 +599,10 @@ def test_delete_records_from_docdb(
568599
"aind_data_asset_indexer.codeocean_bucket_indexer."
569600
"get_all_processed_codeocean_asset_records"
570601
)
602+
@patch("aind_data_asset_indexer.codeocean_bucket_indexer.CodeOcean")
571603
def test_run_job(
572604
self,
605+
mock_codeocean_client: MagicMock,
573606
mock_get_all_co_records: MagicMock,
574607
mock_docdb_client: MagicMock,
575608
mock_paginate_docdb: MagicMock,
@@ -581,6 +614,8 @@ def test_run_job(
581614
one record, add one record, and delete one record."""
582615
mock_mongo_client = MagicMock()
583616
mock_docdb_client.return_value = mock_mongo_client
617+
mock_co_client = MagicMock()
618+
mock_codeocean_client.return_value = mock_co_client
584619
mock_get_all_co_records.return_value = dict(
585620
[(r["location"], r) for r in self.example_codeocean_records]
586621
)
@@ -602,7 +637,7 @@ def test_run_job(
602637
self.assertEqual(expected_log_messages, captured.output)
603638

604639
mock_update_external_links_in_docdb.assert_called_once_with(
605-
docdb_client=mock_mongo_client
640+
docdb_client=mock_mongo_client, co_client=mock_co_client
606641
)
607642
mock_process_codeocean_records.assert_called_once_with(
608643
records=[self.example_codeocean_records[0]]

0 commit comments

Comments
 (0)