37 changes: 28 additions & 9 deletions backend/btrixcloud/basecrawls.py
@@ -25,7 +25,7 @@

from .models import (
SUCCESSFUL_STATES,
CrawlConfigTags,
TagsResponse,
CrawlFile,
CrawlFileOut,
BaseCrawl,
@@ -984,13 +984,22 @@ async def get_org_last_crawl_finished(self, oid: UUID) -> Optional[datetime]:

return last_crawl_finished

async def get_all_crawls_tag_counts(self, org: Organization):
"""get distinct tags from all archived items for this org"""
async def get_all_crawls_tag_counts(
self,
org: Organization,
only_successful: bool = True,
type_: Optional[str] = None,
):
"""get distinct tags from archived items for this org"""
match_query: Dict[str, Any] = {"oid": org.id}
if only_successful:
match_query["state"] = {"$in": SUCCESSFUL_STATES}
if type_ in ("crawl", "upload"):
match_query["type"] = type_

tags = await self.crawls.aggregate(
[
# Match only against the states of archived items that might be
# displayed in the frontend
{"$match": {"oid": org.id, "state": {"$in": SUCCESSFUL_STATES}}},
{"$match": match_query},
{"$unwind": "$tags"},
{"$group": {"_id": "$tags", "count": {"$sum": 1}}},
{"$project": {"tag": "$_id", "count": "$count", "_id": 0}},
@@ -1094,10 +1103,20 @@ async def get_all_crawls_search_values(
@app.get(
"/orgs/{oid}/all-crawls/tagCounts",
tags=["all-crawls"],
response_model=CrawlConfigTags,
response_model=TagsResponse,
)
async def get_all_crawls_tag_counts(org: Organization = Depends(org_viewer_dep)):
return {"tags": await ops.get_all_crawls_tag_counts(org)}
async def get_all_crawls_tag_counts(
org: Organization = Depends(org_viewer_dep),
onlySuccessful: bool = True,
crawlType: Optional[str] = None,
):
if crawlType and crawlType not in ("crawl", "upload"):
raise HTTPException(status_code=400, detail="invalid_crawl_type")

tags = await ops.get_all_crawls_tag_counts(
org, only_successful=onlySuccessful, type_=crawlType
)
return {"tags": tags}

@app.get(
"/orgs/{oid}/all-crawls/{crawl_id}",
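For reference, the extended all-crawls endpoint can be exercised in the same style as the tests added further down. A minimal sketch, assuming the API_PREFIX, default_org_id, and crawler_auth_headers names from the existing test fixtures:

# Illustrative usage of the new query parameters (a sketch, not part of the diff):
# onlySuccessful=false also counts tags on failed/canceled items, while
# crawlType limits the counts to "crawl" or "upload" items.
import requests

r = requests.get(
    f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/tagCounts",
    params={"onlySuccessful": "false", "crawlType": "upload"},
    headers=crawler_auth_headers,
)
assert r.status_code == 200
for item in r.json()["tags"]:
    print(item["tag"], item["count"])
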
4 changes: 2 additions & 2 deletions backend/btrixcloud/crawlconfigs.py
@@ -26,7 +26,7 @@
ConfigRevision,
CrawlConfig,
CrawlConfigOut,
CrawlConfigTags,
TagsResponse,
CrawlOut,
CrawlOutWithResources,
UpdateCrawlConfig,
@@ -1622,7 +1622,7 @@ async def get_crawl_config_tags(org: Organization = Depends(org_viewer_dep)):
"""
return await ops.get_crawl_config_tags(org)

@router.get("/tagCounts", response_model=CrawlConfigTags)
@router.get("/tagCounts", response_model=TagsResponse)
async def get_crawl_config_tag_counts(org: Organization = Depends(org_viewer_dep)):
return {"tags": await ops.get_crawl_config_tag_counts(org)}

15 changes: 15 additions & 0 deletions backend/btrixcloud/crawls.py
@@ -78,6 +78,7 @@
CrawlQueueResponse,
MatchCrawlQueueResponse,
CrawlLogLine,
TagsResponse,
)


@@ -1355,6 +1356,20 @@ async def delete_crawls(
deleted=count, storageQuotaReached=quota_reached
)

@app.get(
"/orgs/{oid}/crawls/tagCounts",
tags=["crawls"],
response_model=TagsResponse,
)
async def get_crawls_tag_counts(
org: Organization = Depends(org_viewer_dep),
onlySuccessful: bool = True,
):
tags = await ops.get_all_crawls_tag_counts(
org, only_successful=onlySuccessful, type_="crawl"
)
return {"tags": tags}

@app.get("/orgs/all/crawls/stats", tags=["crawls"], response_model=bytes)
async def get_all_orgs_crawl_stats(
user: User = Depends(user_dep),
10 changes: 5 additions & 5 deletions backend/btrixcloud/models.py
@@ -603,18 +603,18 @@ class CrawlConfigAddedResponse(BaseModel):


# ============================================================================
class CrawlConfigTagCount(BaseModel):
"""Response model for crawlconfig tag count"""
class TagCount(BaseModel):
"""Response model for crawlconfig/crawl tag count"""

tag: str
count: int


# ============================================================================
class CrawlConfigTags(BaseModel):
"""Response model for crawlconfig tags"""
class TagsResponse(BaseModel):
"""Response model for crawlconfig/crawl tags"""

tags: List[CrawlConfigTagCount]
tags: List[TagCount]


# ============================================================================
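The renamed models keep the same shape as before; a minimal sketch of constructing the response directly (import path assumed to mirror the rest of the backend package):

# Sketch only: the renamed response models used by the tagCounts endpoints.
from btrixcloud.models import TagCount, TagsResponse

resp = TagsResponse(tags=[TagCount(tag="wr-test-1", count=3)])
assert resp.tags[0].tag == "wr-test-1"
assert resp.tags[0].count == 3
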
14 changes: 14 additions & 0 deletions backend/btrixcloud/uploads.py
@@ -28,6 +28,7 @@
AddedResponseIdQuota,
FilePreparer,
MIN_UPLOAD_PART_SIZE,
TagsResponse,
)
from .pagination import paginated_format, DEFAULT_PAGE_SIZE
from .utils import dt_now
@@ -362,6 +363,19 @@ async def list_uploads(
)
return paginated_format(uploads, total, page, pageSize)

@app.get(
"/orgs/{oid}/uploads/tagCounts",
tags=["uploads"],
response_model=TagsResponse,
)
async def get_uploads_tag_counts(
org: Organization = Depends(org_viewer_dep),
):
tags = await ops.get_all_crawls_tag_counts(
org, only_successful=False, type_="upload"
)
return {"tags": tags}

@app.get(
"/orgs/{oid}/uploads/{crawlid}",
tags=["uploads"],
62 changes: 62 additions & 0 deletions backend/test/conftest.py
@@ -31,6 +31,8 @@
NON_DEFAULT_ORG_NAME = "Non-default org"
NON_DEFAULT_ORG_SLUG = "non-default-org"

RUNNING_STATES = ["running", "pending-wait", "generate-wacz", "uploading-wacz"]

FAILED_STATES = ["canceled", "failed", "skipped_quota_reached"]

SUCCESSFUL_STATES = ["complete", "stopped_by_user", "stopped_quota_reached"]
@@ -266,6 +268,7 @@ def qa_crawl_id(crawler_auth_headers, default_org_id):
"runNow": True,
"name": "Crawler User Crawl for Testing QA",
"description": "crawler test crawl for qa",
"tags": ["qa", "wr-test-1"],
"config": {"seeds": [{"url": "https://old.webrecorder.net/"}], "limit": 1},
"crawlerChannel": "test",
}
@@ -295,6 +298,7 @@ def wr_specs_crawl_id(crawler_auth_headers, default_org_id):
crawl_data = {
"runNow": True,
"name": "Webrecorder Specs sample crawl",
"tags": ["wr-test-1"],
"config": {"seeds": [{"url": "https://specs.webrecorder.net/"}], "limit": 1},
}
r = requests.post(
@@ -358,6 +362,7 @@ def auto_add_crawl_id(crawler_auth_headers, default_org_id, auto_add_collection_
"runNow": True,
"name": "Auto Add",
"description": "For testing auto-adding new workflow crawls to collections",
"tags": ["wr-test-1"],
"autoAddCollections": [auto_add_collection_id],
"config": {
"seeds": [{"url": "https://old.webrecorder.net/"}],
@@ -399,6 +404,7 @@ def all_crawls_crawl_id(crawler_auth_headers, default_org_id):
"runNow": True,
"name": "All Crawls Test Crawl",
"description": "Lorem ipsum",
"tags": ["all-crawls", "wr-test-2"],
"config": {
"seeds": [{"url": "https://old.webrecorder.net/"}],
"exclude": "community",
@@ -458,6 +464,7 @@ def all_crawls_delete_crawl_ids(admin_auth_headers, default_org_id):
"runNow": True,
"name": "All Crawls Delete Test Workflow",
"description": "Lorem ipsum",
"tags": ["wr-test-1", "to-delete"],
"config": {
"seeds": [{"url": "https://old.webrecorder.net/"}],
"exclude": "community",
@@ -520,6 +527,7 @@ def custom_behaviors_crawl_id(admin_auth_headers, default_org_id):
crawl_data = {
"runNow": True,
"name": "Custom Behavior Logs",
"tags": ["behaviors", "wr-test-1"],
"config": {
"seeds": [{"url": "https://specs.webrecorder.net/"}],
"customBehaviors": [
@@ -551,13 +559,67 @@ def custom_behaviors_crawl_id(admin_auth_headers, default_org_id):
return crawl_id


@pytest.fixture(scope="session")
def canceled_crawl_id(admin_auth_headers, default_org_id):
crawl_data = {
"runNow": True,
"name": "Canceled crawl",
"tags": ["canceled"],
"config": {
"seeds": [{"url": "https://old.webrecorder.net/"}],
"limit": 5,
},
"browserWindows": 1,
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=admin_auth_headers,
json=crawl_data,
)
data = r.json()

crawl_id = data["run_now_job"]

# Cancel crawl after it's started
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
headers=admin_auth_headers,
)
data = r.json()
if data["state"] in RUNNING_STATES:
break
time.sleep(5)

r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/cancel",
headers=admin_auth_headers,
)
data = r.json()
assert data["success"] == True

# Wait until crawl finishes
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
headers=admin_auth_headers,
)
data = r.json()
if data["state"] in FINISHED_STATES:
break
time.sleep(5)

return crawl_id


@pytest.fixture(scope="session")
def url_list_config_id(crawler_auth_headers, default_org_id):
# Start crawl.
crawl_data = {
"runNow": False,
"name": "URL List config",
"description": "Contains 3 seeds",
"tags": ["wr-test-1", "seed-list"],
"config": {
"seeds": [
{"url": "https://old.webrecorder.net"},
98 changes: 98 additions & 0 deletions backend/test/test_uploads.py
@@ -1065,6 +1065,104 @@ def test_clear_all_presigned_urls(
assert r.json()["success"]


def test_all_crawls_tag_counts(crawler_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/tagCounts",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json() == {
"tags": [
{"tag": "wr-test-1", "count": 3},
{"tag": "wr-test-2", "count": 2},
{"tag": "all-crawls", "count": 1},
{"tag": "behaviors", "count": 1},
{"tag": "four", "count": 1},
{"tag": "qa", "count": 1},
{"tag": "three", "count": 1},
{"tag": "wr-test-1-updated-again", "count": 1},
{"tag": "wr-test-2-updated-again", "count": 1},
]
}


def test_all_crawls_tag_counts_including_failed(
crawler_auth_headers, default_org_id, canceled_crawl_id
):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/tagCounts?onlySuccessful=false",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json() == {
"tags": [
{"tag": "wr-test-1", "count": 3},
{"tag": "wr-test-2", "count": 2},
{"tag": "all-crawls", "count": 1},
{"tag": "behaviors", "count": 1},
{"tag": "canceled", "count": 1},
{"tag": "four", "count": 1},
{"tag": "qa", "count": 1},
{"tag": "three", "count": 1},
{"tag": "wr-test-1-updated-again", "count": 1},
{"tag": "wr-test-2-updated-again", "count": 1},
]
}


def test_crawls_tag_counts(crawler_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/tagCounts",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json() == {
"tags": [
{"tag": "wr-test-1", "count": 3},
{"tag": "wr-test-2", "count": 2},
{"tag": "all-crawls", "count": 1},
{"tag": "behaviors", "count": 1},
{"tag": "qa", "count": 1},
]
}


def test_crawls_tag_counts_including_failed(
crawler_auth_headers, default_org_id, canceled_crawl_id
):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/tagCounts?onlySuccessful=false",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json() == {
"tags": [
{"tag": "wr-test-1", "count": 3},
{"tag": "wr-test-2", "count": 2},
{"tag": "all-crawls", "count": 1},
{"tag": "behaviors", "count": 1},
{"tag": "canceled", "count": 1},
{"tag": "qa", "count": 1},
]
}


def test_uploads_tag_counts(crawler_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/tagCounts",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json() == {
"tags": [
{"tag": "four", "count": 1},
{"tag": "three", "count": 1},
{"tag": "wr-test-1-updated-again", "count": 1},
{"tag": "wr-test-2-updated-again", "count": 1},
]
}


def test_delete_form_upload_and_crawls_from_all_crawls(
admin_auth_headers,
crawler_auth_headers,