diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py
index 4a368ef2f0..d94717786d 100644
--- a/backend/btrixcloud/basecrawls.py
+++ b/backend/btrixcloud/basecrawls.py
@@ -25,7 +25,7 @@ from .models import (
     SUCCESSFUL_STATES,
-    CrawlConfigTags,
+    TagsResponse,
     CrawlFile,
     CrawlFileOut,
     BaseCrawl,
@@ -984,13 +984,22 @@ async def get_org_last_crawl_finished(self, oid: UUID) -> Optional[datetime]:
 
         return last_crawl_finished
 
-    async def get_all_crawls_tag_counts(self, org: Organization):
-        """get distinct tags from all archived items for this org"""
+    async def get_all_crawls_tag_counts(
+        self,
+        org: Organization,
+        only_successful: bool = True,
+        type_: Optional[str] = None,
+    ):
+        """get distinct tags from archived items for this org"""
+        match_query: Dict[str, Any] = {"oid": org.id}
+        if only_successful:
+            match_query["state"] = {"$in": SUCCESSFUL_STATES}
+        if type_ in ("crawl", "upload"):
+            match_query["type"] = type_
+
         tags = await self.crawls.aggregate(
             [
-                # Match only against the states of archived items that might be
-                # displayed in the frontend
-                {"$match": {"oid": org.id, "state": {"$in": SUCCESSFUL_STATES}}},
+                {"$match": match_query},
                 {"$unwind": "$tags"},
                 {"$group": {"_id": "$tags", "count": {"$sum": 1}}},
                 {"$project": {"tag": "$_id", "count": "$count", "_id": 0}},
@@ -1094,10 +1103,20 @@ async def get_all_crawls_search_values(
     @app.get(
         "/orgs/{oid}/all-crawls/tagCounts",
         tags=["all-crawls"],
-        response_model=CrawlConfigTags,
+        response_model=TagsResponse,
     )
-    async def get_all_crawls_tag_counts(org: Organization = Depends(org_viewer_dep)):
-        return {"tags": await ops.get_all_crawls_tag_counts(org)}
+    async def get_all_crawls_tag_counts(
+        org: Organization = Depends(org_viewer_dep),
+        onlySuccessful: bool = True,
+        crawlType: Optional[str] = None,
+    ):
+        if crawlType and crawlType not in ("crawl", "upload"):
+            raise HTTPException(status_code=400, detail="invalid_crawl_type")
+
+        tags = await ops.get_all_crawls_tag_counts(
+            org, only_successful=onlySuccessful, type_=crawlType
+        )
+        return {"tags": tags}
 
     @app.get(
         "/orgs/{oid}/all-crawls/{crawl_id}",
diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index e1c9d60b0f..72dc8f6e0d 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -26,7 +26,7 @@
     ConfigRevision,
     CrawlConfig,
     CrawlConfigOut,
-    CrawlConfigTags,
+    TagsResponse,
     CrawlOut,
     CrawlOutWithResources,
     UpdateCrawlConfig,
@@ -1622,7 +1622,7 @@ async def get_crawl_config_tags(org: Organization = Depends(org_viewer_dep)):
         """
         return await ops.get_crawl_config_tags(org)
 
-    @router.get("/tagCounts", response_model=CrawlConfigTags)
+    @router.get("/tagCounts", response_model=TagsResponse)
     async def get_crawl_config_tag_counts(org: Organization = Depends(org_viewer_dep)):
         return {"tags": await ops.get_crawl_config_tag_counts(org)}
 
diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index c6c8780f06..a55597799d 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -78,6 +78,7 @@
     CrawlQueueResponse,
     MatchCrawlQueueResponse,
     CrawlLogLine,
+    TagsResponse,
 )
 
 
@@ -1355,6 +1356,20 @@ async def delete_crawls(
             deleted=count, storageQuotaReached=quota_reached
         )
 
+    @app.get(
+        "/orgs/{oid}/crawls/tagCounts",
+        tags=["crawls"],
+        response_model=TagsResponse,
+    )
+    async def get_crawls_tag_counts(
+        org: Organization = Depends(org_viewer_dep),
+        onlySuccessful: bool = True,
+    ):
+        tags = await ops.get_all_crawls_tag_counts(
+            org, only_successful=onlySuccessful, type_="crawl"
+        )
+        return {"tags": tags}
+
     @app.get("/orgs/all/crawls/stats", tags=["crawls"], response_model=bytes)
     async def get_all_orgs_crawl_stats(
         user: User = Depends(user_dep),
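The route changes above give each item type its own tag-count listing: /all-crawls/tagCounts gains onlySuccessful and crawlType query parameters, and /crawls/tagCounts is a new crawl-only variant. A minimal sketch of how a client might call the reworked endpoint, in the same requests style as the test suite (the org_id and viewer_auth_headers names here are illustrative, not from the patch):

    import requests

    # Count tags across archived items, including failed/canceled ones,
    # limited to crawls (uploads would use crawlType=upload instead)
    r = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/all-crawls/tagCounts",
        params={"onlySuccessful": "false", "crawlType": "crawl"},
        headers=viewer_auth_headers,
    )
    assert r.status_code == 200
    for entry in r.json()["tags"]:
        print(entry["tag"], entry["count"])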
diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index 7652197a72..5f2b4f9da3 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -603,18 +603,18 @@ class CrawlConfigAddedResponse(BaseModel):
 
 
 # ============================================================================
-class CrawlConfigTagCount(BaseModel):
-    """Response model for crawlconfig tag count"""
+class TagCount(BaseModel):
+    """Response model for crawlconfig/crawl tag count"""
 
     tag: str
     count: int
 
 
 # ============================================================================
-class CrawlConfigTags(BaseModel):
-    """Response model for crawlconfig tags"""
+class TagsResponse(BaseModel):
+    """Response model for crawlconfig/crawl tags"""
 
-    tags: List[CrawlConfigTagCount]
+    tags: List[TagCount]
 
 
 # ============================================================================
diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py
index 23c0f1257b..d874bf603a 100644
--- a/backend/btrixcloud/uploads.py
+++ b/backend/btrixcloud/uploads.py
@@ -28,6 +28,7 @@
     AddedResponseIdQuota,
     FilePreparer,
     MIN_UPLOAD_PART_SIZE,
+    TagsResponse,
 )
 from .pagination import paginated_format, DEFAULT_PAGE_SIZE
 from .utils import dt_now
@@ -362,6 +363,19 @@ async def list_uploads(
         )
         return paginated_format(uploads, total, page, pageSize)
 
+    @app.get(
+        "/orgs/{oid}/uploads/tagCounts",
+        tags=["uploads"],
+        response_model=TagsResponse,
+    )
+    async def get_uploads_tag_counts(
+        org: Organization = Depends(org_viewer_dep),
+    ):
+        tags = await ops.get_all_crawls_tag_counts(
+            org, only_successful=False, type_="upload"
+        )
+        return {"tags": tags}
+
     @app.get(
         "/orgs/{oid}/uploads/{crawlid}",
         tags=["uploads"],
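With the rename, TagCount and TagsResponse are now shared by the workflow, all-crawls, crawls, and uploads tagCounts endpoints, so all four serialize to the same shape. A quick illustration of that shape (assuming Pydantic v2; under v1, .dict() would replace .model_dump()):

    from btrixcloud.models import TagsResponse

    # Pydantic coerces the plain dicts into TagCount instances
    resp = TagsResponse(tags=[{"tag": "qa", "count": 1}, {"tag": "wr-test-1", "count": 3}])
    assert resp.model_dump() == {
        "tags": [{"tag": "qa", "count": 1}, {"tag": "wr-test-1", "count": 3}]
    }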
["all-crawls", "wr-test-2"], "config": { "seeds": [{"url": "https://old.webrecorder.net/"}], "exclude": "community", @@ -458,6 +464,7 @@ def all_crawls_delete_crawl_ids(admin_auth_headers, default_org_id): "runNow": True, "name": "All Crawls Delete Test Workflow", "description": "Lorem ipsum", + "tags": ["wr-test-1", "to-delete"], "config": { "seeds": [{"url": "https://old.webrecorder.net/"}], "exclude": "community", @@ -520,6 +527,7 @@ def custom_behaviors_crawl_id(admin_auth_headers, default_org_id): crawl_data = { "runNow": True, "name": "Custom Behavior Logs", + "tags": ["behaviors", "wr-test-1"], "config": { "seeds": [{"url": "https://specs.webrecorder.net/"}], "customBehaviors": [ @@ -551,6 +559,59 @@ def custom_behaviors_crawl_id(admin_auth_headers, default_org_id): return crawl_id +@pytest.fixture(scope="session") +def canceled_crawl_id(admin_auth_headers, default_org_id): + crawl_data = { + "runNow": True, + "name": "Canceled crawl", + "tags": ["canceled"], + "config": { + "seeds": [{"url": "https://old.webrecorder.net/"}], + "limit": 5, + }, + "browserWindows": 1, + } + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/", + headers=admin_auth_headers, + json=crawl_data, + ) + data = r.json() + + crawl_id = data["run_now_job"] + + # Cancel crawl after it's started + while True: + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json", + headers=admin_auth_headers, + ) + data = r.json() + if data["state"] in RUNNING_STATES: + break + time.sleep(5) + + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/cancel", + headers=admin_auth_headers, + ) + data = r.json() + assert data["success"] == True + + # Wait until crawl finishes + while True: + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json", + headers=admin_auth_headers, + ) + data = r.json() + if data["state"] in FINISHED_STATES: + break + time.sleep(5) + + return crawl_id + + @pytest.fixture(scope="session") def url_list_config_id(crawler_auth_headers, default_org_id): # Start crawl. 
diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py
index c388d75860..cc23dbb11c 100644
--- a/backend/test/test_uploads.py
+++ b/backend/test/test_uploads.py
@@ -1065,6 +1065,104 @@ def test_clear_all_presigned_urls(
     assert r.json()["success"]
 
 
+def test_all_crawls_tag_counts(crawler_auth_headers, default_org_id):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/tagCounts",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json() == {
+        "tags": [
+            {"tag": "wr-test-1", "count": 3},
+            {"tag": "wr-test-2", "count": 2},
+            {"tag": "all-crawls", "count": 1},
+            {"tag": "behaviors", "count": 1},
+            {"tag": "four", "count": 1},
+            {"tag": "qa", "count": 1},
+            {"tag": "three", "count": 1},
+            {"tag": "wr-test-1-updated-again", "count": 1},
+            {"tag": "wr-test-2-updated-again", "count": 1},
+        ]
+    }
+
+
+def test_all_crawls_tag_counts_including_failed(
+    crawler_auth_headers, default_org_id, canceled_crawl_id
+):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/tagCounts?onlySuccessful=false",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json() == {
+        "tags": [
+            {"tag": "wr-test-1", "count": 3},
+            {"tag": "wr-test-2", "count": 2},
+            {"tag": "all-crawls", "count": 1},
+            {"tag": "behaviors", "count": 1},
+            {"tag": "canceled", "count": 1},
+            {"tag": "four", "count": 1},
+            {"tag": "qa", "count": 1},
+            {"tag": "three", "count": 1},
+            {"tag": "wr-test-1-updated-again", "count": 1},
+            {"tag": "wr-test-2-updated-again", "count": 1},
+        ]
+    }
+
+
+def test_crawls_tag_counts(crawler_auth_headers, default_org_id):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/tagCounts",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json() == {
+        "tags": [
+            {"tag": "wr-test-1", "count": 3},
+            {"tag": "wr-test-2", "count": 2},
+            {"tag": "all-crawls", "count": 1},
+            {"tag": "behaviors", "count": 1},
+            {"tag": "qa", "count": 1},
+        ]
+    }
+
+
+def test_crawls_tag_counts_including_failed(
+    crawler_auth_headers, default_org_id, canceled_crawl_id
+):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/tagCounts?onlySuccessful=false",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json() == {
+        "tags": [
+            {"tag": "wr-test-1", "count": 3},
+            {"tag": "wr-test-2", "count": 2},
+            {"tag": "all-crawls", "count": 1},
+            {"tag": "behaviors", "count": 1},
+            {"tag": "canceled", "count": 1},
+            {"tag": "qa", "count": 1},
+        ]
+    }
+
+
+def test_uploads_tag_counts(crawler_auth_headers, default_org_id):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/uploads/tagCounts",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json() == {
+        "tags": [
+            {"tag": "four", "count": 1},
+            {"tag": "three", "count": 1},
+            {"tag": "wr-test-1-updated-again", "count": 1},
+            {"tag": "wr-test-2-updated-again", "count": 1},
+        ]
+    }
+
+
 def test_delete_form_upload_and_crawls_from_all_crawls(
     admin_auth_headers,
     crawler_auth_headers,
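One branch the tests above leave unexercised is the crawlType validation added in basecrawls.py, which rejects anything other than "crawl" or "upload" with a 400. A candidate test in the same style (the endpoint and the invalid_crawl_type detail string are from the patch; this assumes FastAPI's default error envelope, where the HTTPException detail comes back under "detail"):

    def test_all_crawls_tag_counts_invalid_type(crawler_auth_headers, default_org_id):
        # Anything other than crawlType=crawl or crawlType=upload should 400
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/tagCounts?crawlType=invalid",
            headers=crawler_auth_headers,
        )
        assert r.status_code == 400
        assert r.json()["detail"] == "invalid_crawl_type"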