Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 19 additions & 5 deletions backend/btrixcloud/colls.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
UpdateColl,
DedupeIndexStats,
DedupeIndexFile,
AddRemoveCrawlList,
CollectionAddRemove,
BaseCrawl,
CrawlFileOut,
Organization,
Expand Down Expand Up @@ -1452,13 +1452,20 @@ async def update_collection(
response_model=CollOut,
)
async def add_crawl_to_collection(
    add_remove: CollectionAddRemove,
    coll_id: UUID,
    request: Request,
    org: Organization = Depends(org_crawl_dep),
) -> CollOut:
    """Add crawls to a collection.

    Crawls may be given directly via ``add_remove.crawlIds`` and/or
    indirectly via ``add_remove.crawlconfigIds``, which are expanded into
    all crawl ids belonging to those crawlconfigs. Using a set dedupes a
    crawl that is listed both directly and via its config.

    :param add_remove: crawl ids and/or crawlconfig ids to add
    :param coll_id: collection to add the crawls to
    :param request: incoming request (headers are forwarded)
    :param org: org resolved by the crawl-access dependency
    :returns: the updated collection
    """
    crawl_ids = set(add_remove.crawlIds)
    # Single bulk update on the set, per review suggestion, instead of a
    # per-id loop over the expanded config crawl ids.
    crawl_ids.update(
        await colls.crawl_ops.get_config_crawl_ids(add_remove.crawlconfigIds)
    )

    return await colls.add_crawls_to_collection(
        coll_id, list(crawl_ids), org, headers=dict(request.headers)
    )

@app.post(
Expand All @@ -1467,13 +1474,20 @@ async def add_crawl_to_collection(
response_model=CollOut,
)
async def remove_crawl_from_collection(
    add_remove: CollectionAddRemove,
    coll_id: UUID,
    request: Request,
    org: Organization = Depends(org_crawl_dep),
) -> CollOut:
    """Remove crawls from a collection.

    Crawls may be given directly via ``add_remove.crawlIds`` and/or
    indirectly via ``add_remove.crawlconfigIds``, which are expanded into
    all crawl ids belonging to those crawlconfigs. Using a set gracefully
    handles a crawl listed both directly and via its config.

    :param add_remove: crawl ids and/or crawlconfig ids to remove
    :param coll_id: collection to remove the crawls from
    :param request: incoming request (headers are forwarded)
    :param org: org resolved by the crawl-access dependency
    :returns: the updated collection
    """
    crawl_ids = set(add_remove.crawlIds)
    # Single bulk update on the set, mirroring add_crawl_to_collection,
    # instead of a per-id loop over the expanded config crawl ids.
    crawl_ids.update(
        await colls.crawl_ops.get_config_crawl_ids(add_remove.crawlconfigIds)
    )

    return await colls.remove_crawls_from_collection(
        coll_id, list(crawl_ids), org, headers=dict(request.headers)
    )

@app.delete(
Expand Down
6 changes: 6 additions & 0 deletions backend/btrixcloud/crawls.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,12 @@ async def list_crawls(

return crawls, total

async def get_config_crawl_ids(self, cids: list[UUID]) -> list[str]:
    """Return ids of all crawls belonging to the given crawlconfigs.

    :param cids: crawlconfig ids to look up
    :returns: list of matching crawl ids (empty if none match)
    """
    # Project only _id so full crawl documents aren't fetched.
    cursor = self.crawls.find({"cid": {"$in": cids}}, {"_id": 1})
    # Distinct names for cursor, result list, and loop variable — the
    # original reused `res` for all three, shadowing the cursor.
    docs = await cursor.to_list()
    return [doc["_id"] for doc in docs]

async def get_active_crawls(self, oid: UUID, limit: int) -> list[str]:
"""get list of waiting crawls, sorted from earliest to latest"""
res = (
Expand Down
5 changes: 3 additions & 2 deletions backend/btrixcloud/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1859,10 +1859,11 @@ class UpdateCollHomeUrl(BaseModel):


# ============================================================================
class AddRemoveCrawlList(BaseModel):
"""Collections to add or remove from collection"""
class CollectionAddRemove(BaseModel):
    """Items to add to or remove from a collection.

    Crawls may be specified directly by id via ``crawlIds``, and/or
    indirectly via ``crawlconfigIds`` — the collection add/remove
    endpoints expand each crawlconfig into all of its crawl ids.
    """

    # crawl ids to add/remove directly
    crawlIds: List[str] = []
    # crawlconfig ids whose crawls are expanded and added/removed
    # (mutable [] defaults are safe here: pydantic copies them per instance)
    crawlconfigIds: List[UUID] = []


# ============================================================================
Expand Down
65 changes: 65 additions & 0 deletions backend/test/test_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,71 @@ def test_add_remove_crawl_from_collection(
)
assert _coll_id not in r.json()["collectionIds"]


def test_add_remove_config_crawls_from_collection(
crawler_auth_headers,
default_org_id,
crawler_crawl_id,
crawler_config_id,
admin_crawl_id,
admin_config_id,
):
# Add crawls by config and crawl id
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/add",
json={"crawlIds": [admin_crawl_id], "crawlconfigIds": [crawler_config_id]},
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _coll_id
assert data["crawlCount"] == 2
assert data["pageCount"] > 0
assert data["uniquePageCount"] > 0
assert data["totalSize"] > 0
assert data["modified"] >= modified
assert data["tags"] == ["wr-test-2", "wr-test-1"]
assert data["dateEarliest"]
assert data["dateLatest"]
assert data["topPageHosts"]

# Remove crawls by crawl and config id, and test that specifying a
# config and also a crawl in that config separately is handled
# gracefully
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/remove",
json={
"crawlIds": [crawler_crawl_id],
"crawlconfigIds": [admin_config_id, crawler_config_id],
},
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"] == _coll_id
assert data["crawlCount"] == 0
assert data["pageCount"] == 0
assert data["uniquePageCount"] == 0
assert data["totalSize"] == 0
assert data["modified"] >= modified
assert data.get("tags", []) == []
assert data.get("dateEarliest") is None
assert data.get("dateLatest") is None
assert data["topPageHosts"] == []

# Verify crawls were removed
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id not in r.json()["collectionIds"]

r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id not in r.json()["collectionIds"]

# Add crawls back for further tests
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/add",
Expand Down
Loading