Skip to content

Commit 1a1f9f7

Browse files
authored
Make add and remove endpoints accept workflow ids in addition to crawl ids (#3228)
Fixes #3225 Modifies the existing collection `/add` and `/remove` endpoints to accept a list of workflow ids in addition to crawl ids, and gracefully handles any overlap between the two. This will allow us to more easily add all of a workflow's crawls to a collection in the frontend.
1 parent 108d9ae commit 1a1f9f7

File tree

4 files changed

+87
-7
lines changed

4 files changed

+87
-7
lines changed

backend/btrixcloud/colls.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
UpdateColl,
2828
DedupeIndexStats,
2929
DedupeIndexFile,
30-
AddRemoveCrawlList,
30+
CollectionAddRemove,
3131
BaseCrawl,
3232
CrawlFileOut,
3333
Organization,
@@ -1452,13 +1452,17 @@ async def update_collection(
14521452
response_model=CollOut,
14531453
)
14541454
async def add_crawl_to_collection(
    add_remove: CollectionAddRemove,
    coll_id: UUID,
    request: Request,
    org: Organization = Depends(org_crawl_dep),
) -> CollOut:
    """Add crawls to a collection by crawl id and/or workflow (crawlconfig) id.

    Workflow ids are resolved to their crawl ids server-side; any overlap
    between the explicit crawl ids and the resolved ones is deduplicated.
    """
    # Resolve workflow ids to crawl ids, then union with the explicit list;
    # the set union gracefully handles overlap between the two inputs.
    config_crawl_ids = await colls.crawl_ops.get_config_crawl_ids(
        add_remove.crawlconfigIds
    )
    merged_ids = set(add_remove.crawlIds) | set(config_crawl_ids)
    return await colls.add_crawls_to_collection(
        coll_id, list(merged_ids), org, headers=dict(request.headers)
    )
14631467

14641468
@app.post(
@@ -1467,13 +1471,17 @@ async def add_crawl_to_collection(
14671471
response_model=CollOut,
14681472
)
14691473
async def remove_crawl_from_collection(
    add_remove: CollectionAddRemove,
    coll_id: UUID,
    request: Request,
    org: Organization = Depends(org_crawl_dep),
) -> CollOut:
    """Remove crawls from a collection by crawl id and/or workflow (crawlconfig) id.

    Workflow ids are resolved to their crawl ids server-side; any overlap
    between the explicit crawl ids and the resolved ones is deduplicated.
    """
    # Resolve workflow ids to crawl ids, then union with the explicit list;
    # the set union gracefully handles overlap between the two inputs.
    config_crawl_ids = await colls.crawl_ops.get_config_crawl_ids(
        add_remove.crawlconfigIds
    )
    merged_ids = set(add_remove.crawlIds) | set(config_crawl_ids)
    return await colls.remove_crawls_from_collection(
        coll_id, list(merged_ids), org, headers=dict(request.headers)
    )
14781486

14791487
@app.delete(

backend/btrixcloud/crawls.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -386,6 +386,12 @@ async def list_crawls(
386386

387387
return crawls, total
388388

389+
async def get_config_crawl_ids(self, cids: list[UUID]) -> list[str]:
    """Get list of crawl ids belonging to the given crawlconfigs (workflows).

    :param cids: crawlconfig (workflow) ids to resolve
    :return: ids of all crawls whose ``cid`` matches one of ``cids``
    """
    # Project only _id since that is all we return.
    cursor = self.crawls.find({"cid": {"$in": cids}}, {"_id": 1})
    docs = await cursor.to_list()
    # Use a distinct loop variable — the original shadowed the cursor
    # result (`res`) with the comprehension variable of the same name.
    return [doc["_id"] for doc in docs]
394+
389395
async def get_active_crawls(self, oid: UUID, limit: int) -> list[str]:
390396
"""get list of waiting crawls, sorted from earliest to latest"""
391397
res = (

backend/btrixcloud/models.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1859,10 +1859,11 @@ class UpdateCollHomeUrl(BaseModel):
18591859

18601860

18611861
# ============================================================================
1862-
class AddRemoveCrawlList(BaseModel):
1863-
"""Collections to add or remove from collection"""
1862+
class CollectionAddRemove(BaseModel):
    """Items to add or remove from collection"""

    # Explicit crawl ids to add/remove directly
    crawlIds: List[str] = []
    # Workflow (crawlconfig) ids; their crawls are resolved server-side and
    # merged with crawlIds, deduplicating any overlap
    crawlconfigIds: List[UUID] = []
18661867

18671868

18681869
# ============================================================================

backend/test/test_collections.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,71 @@ def test_add_remove_crawl_from_collection(
354354
)
355355
assert _coll_id not in r.json()["collectionIds"]
356356

357+
358+
def test_add_remove_config_crawls_from_collection(
    crawler_auth_headers,
    default_org_id,
    crawler_crawl_id,
    crawler_config_id,
    admin_crawl_id,
    admin_config_id,
):
    """Exercise the collection /add and /remove endpoints with a mix of
    crawl ids and workflow (crawlconfig) ids, including overlapping input.

    Relies on module state set up by earlier tests: _coll_id (the test
    collection) and modified (its last-modified timestamp).
    """
    # Add crawls by config and crawl id
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/add",
        json={"crawlIds": [admin_crawl_id], "crawlconfigIds": [crawler_config_id]},
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"] == _coll_id
    # One crawl added directly plus one resolved from the workflow
    assert data["crawlCount"] == 2
    assert data["pageCount"] > 0
    assert data["uniquePageCount"] > 0
    assert data["totalSize"] > 0
    # Adding crawls bumps the collection's modified timestamp
    assert data["modified"] >= modified
    assert data["tags"] == ["wr-test-2", "wr-test-1"]
    assert data["dateEarliest"]
    assert data["dateLatest"]
    assert data["topPageHosts"]

    # Remove crawls by crawl and config id, and test that specifying a
    # config and also a crawl in that config separately is handled
    # gracefully
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/remove",
        json={
            "crawlIds": [crawler_crawl_id],
            "crawlconfigIds": [admin_config_id, crawler_config_id],
        },
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"] == _coll_id
    # Collection should now be empty, with all derived stats zeroed/cleared
    assert data["crawlCount"] == 0
    assert data["pageCount"] == 0
    assert data["uniquePageCount"] == 0
    assert data["totalSize"] == 0
    assert data["modified"] >= modified
    assert data.get("tags", []) == []
    assert data.get("dateEarliest") is None
    assert data.get("dateLatest") is None
    assert data["topPageHosts"] == []

    # Verify crawls were removed
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
        headers=crawler_auth_headers,
    )
    assert _coll_id not in r.json()["collectionIds"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
        headers=crawler_auth_headers,
    )
    assert _coll_id not in r.json()["collectionIds"]
421+
357422
# Add crawls back for further tests
358423
r = requests.post(
359424
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/add",

0 commit comments

Comments
 (0)