37 changes: 28 additions & 9 deletions backend/btrixcloud/basecrawls.py
@@ -25,7 +25,7 @@

from .models import (
SUCCESSFUL_STATES,
CrawlConfigTags,
TagsResponse,
CrawlFile,
CrawlFileOut,
BaseCrawl,
@@ -984,13 +984,22 @@ async def get_org_last_crawl_finished(self, oid: UUID) -> Optional[datetime]:

return last_crawl_finished

async def get_all_crawls_tag_counts(self, org: Organization):
"""get distinct tags from all archived items for this org"""
async def get_all_crawls_tag_counts(
self,
org: Organization,
only_successful: bool = True,
type_: Optional[str] = None,
):
"""get distinct tags from archived items for this org"""
match_query: Dict[str, Any] = {"oid": org.id}
if only_successful:
match_query["state"] = {"$in": SUCCESSFUL_STATES}
if type_ in ("crawl", "upload"):
match_query["type"] = type_

tags = await self.crawls.aggregate(
[
# Match only against the states of archived items that might be
# displayed in the frontend
{"$match": {"oid": org.id, "state": {"$in": SUCCESSFUL_STATES}}},
{"$match": match_query},
{"$unwind": "$tags"},
{"$group": {"_id": "$tags", "count": {"$sum": 1}}},
{"$project": {"tag": "$_id", "count": "$count", "_id": 0}},
@@ -1094,10 +1103,20 @@ async def get_all_crawls_search_values(
@app.get(
"/orgs/{oid}/all-crawls/tagCounts",
tags=["all-crawls"],
response_model=CrawlConfigTags,
response_model=TagsResponse,
)
async def get_all_crawls_tag_counts(org: Organization = Depends(org_viewer_dep)):
return {"tags": await ops.get_all_crawls_tag_counts(org)}
async def get_all_crawls_tag_counts(
org: Organization = Depends(org_viewer_dep),
onlySuccessful: bool = True,
crawlType: Optional[str] = None,
):
if crawlType and crawlType not in ("crawl", "upload"):
raise HTTPException(status_code=400, detail="invalid_crawl_type")

tags = await ops.get_all_crawls_tag_counts(
org, only_successful=onlySuccessful, type_=crawlType
)
return {"tags": tags}

@app.get(
"/orgs/{oid}/all-crawls/{crawl_id}",
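For reference, the extended all-crawls endpoint can be exercised in the same style as the tests added further down. A minimal sketch, assuming the API_PREFIX, default_org_id, and crawler_auth_headers names from the existing test fixtures:

# Illustrative usage of the new query parameters (a sketch, not part of the diff):
# onlySuccessful=false also counts tags on failed/canceled items, while
# crawlType limits the counts to "crawl" or "upload" items.
import requests

r = requests.get(
    f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/tagCounts",
    params={"onlySuccessful": "false", "crawlType": "upload"},
    headers=crawler_auth_headers,
)
assert r.status_code == 200
for item in r.json()["tags"]:
    print(item["tag"], item["count"])
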
4 changes: 2 additions & 2 deletions backend/btrixcloud/crawlconfigs.py
@@ -26,7 +26,7 @@
ConfigRevision,
CrawlConfig,
CrawlConfigOut,
CrawlConfigTags,
TagsResponse,
CrawlOut,
CrawlOutWithResources,
UpdateCrawlConfig,
@@ -1622,7 +1622,7 @@ async def get_crawl_config_tags(org: Organization = Depends(org_viewer_dep)):
"""
return await ops.get_crawl_config_tags(org)

@router.get("/tagCounts", response_model=CrawlConfigTags)
@router.get("/tagCounts", response_model=TagsResponse)
async def get_crawl_config_tag_counts(org: Organization = Depends(org_viewer_dep)):
return {"tags": await ops.get_crawl_config_tag_counts(org)}

15 changes: 15 additions & 0 deletions backend/btrixcloud/crawls.py
@@ -78,6 +78,7 @@
CrawlQueueResponse,
MatchCrawlQueueResponse,
CrawlLogLine,
TagsResponse,
)


@@ -1355,6 +1356,20 @@ async def delete_crawls(
deleted=count, storageQuotaReached=quota_reached
)

@app.get(
"/orgs/{oid}/crawls/tagCounts",
tags=["crawls"],
response_model=TagsResponse,
)
async def get_crawls_tag_counts(
org: Organization = Depends(org_viewer_dep),
onlySuccessful: bool = True,
):
tags = await ops.get_all_crawls_tag_counts(
org, only_successful=onlySuccessful, type_="crawl"
)
return {"tags": tags}

@app.get("/orgs/all/crawls/stats", tags=["crawls"], response_model=bytes)
async def get_all_orgs_crawl_stats(
user: User = Depends(user_dep),
10 changes: 5 additions & 5 deletions backend/btrixcloud/models.py
@@ -603,18 +603,18 @@ class CrawlConfigAddedResponse(BaseModel):


# ============================================================================
class CrawlConfigTagCount(BaseModel):
"""Response model for crawlconfig tag count"""
class TagCount(BaseModel):
"""Response model for crawlconfig/crawl tag count"""

tag: str
count: int


# ============================================================================
class CrawlConfigTags(BaseModel):
"""Response model for crawlconfig tags"""
class TagsResponse(BaseModel):
"""Response model for crawlconfig/crawl tags"""

tags: List[CrawlConfigTagCount]
tags: List[TagCount]


# ============================================================================
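The renamed models keep the same shape as before; a minimal sketch of constructing the response directly (import path assumed to mirror the rest of the backend package):

# Sketch only: the renamed response models used by the tagCounts endpoints.
from btrixcloud.models import TagCount, TagsResponse

resp = TagsResponse(tags=[TagCount(tag="wr-test-1", count=3)])
assert resp.tags[0].tag == "wr-test-1"
assert resp.tags[0].count == 3
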
14 changes: 14 additions & 0 deletions backend/btrixcloud/uploads.py
@@ -28,6 +28,7 @@
AddedResponseIdQuota,
FilePreparer,
MIN_UPLOAD_PART_SIZE,
TagsResponse,
)
from .pagination import paginated_format, DEFAULT_PAGE_SIZE
from .utils import dt_now
@@ -362,6 +363,19 @@ async def list_uploads(
)
return paginated_format(uploads, total, page, pageSize)

@app.get(
"/orgs/{oid}/uploads/tagCounts",
tags=["uploads"],
response_model=TagsResponse,
)
async def get_uploads_tag_counts(
org: Organization = Depends(org_viewer_dep),
):
tags = await ops.get_all_crawls_tag_counts(
org, only_successful=False, type_="upload"
)
return {"tags": tags}

@app.get(
"/orgs/{oid}/uploads/{crawlid}",
tags=["uploads"],
62 changes: 62 additions & 0 deletions backend/test/conftest.py
@@ -31,6 +31,8 @@
NON_DEFAULT_ORG_NAME = "Non-default org"
NON_DEFAULT_ORG_SLUG = "non-default-org"

RUNNING_STATES = ["running", "pending-wait", "generate-wacz", "uploading-wacz"]

FAILED_STATES = ["canceled", "failed", "skipped_quota_reached"]

SUCCESSFUL_STATES = ["complete", "stopped_by_user", "stopped_quota_reached"]
@@ -266,6 +268,7 @@ def qa_crawl_id(crawler_auth_headers, default_org_id):
"runNow": True,
"name": "Crawler User Crawl for Testing QA",
"description": "crawler test crawl for qa",
"tags": ["qa", "wr-test-1"],
"config": {"seeds": [{"url": "https://old.webrecorder.net/"}], "limit": 1},
"crawlerChannel": "test",
}
@@ -295,6 +298,7 @@ def wr_specs_crawl_id(crawler_auth_headers, default_org_id):
crawl_data = {
"runNow": True,
"name": "Webrecorder Specs sample crawl",
"tags": ["wr-test-1"],
"config": {"seeds": [{"url": "https://specs.webrecorder.net/"}], "limit": 1},
}
r = requests.post(
@@ -358,6 +362,7 @@ def auto_add_crawl_id(crawler_auth_headers, default_org_id, auto_add_collection_
"runNow": True,
"name": "Auto Add",
"description": "For testing auto-adding new workflow crawls to collections",
"tags": ["wr-test-1"],
"autoAddCollections": [auto_add_collection_id],
"config": {
"seeds": [{"url": "https://old.webrecorder.net/"}],
@@ -399,6 +404,7 @@ def all_crawls_crawl_id(crawler_auth_headers, default_org_id):
"runNow": True,
"name": "All Crawls Test Crawl",
"description": "Lorem ipsum",
"tags": ["all-crawls", "wr-test-2"],
"config": {
"seeds": [{"url": "https://old.webrecorder.net/"}],
"exclude": "community",
@@ -458,6 +464,7 @@ def all_crawls_delete_crawl_ids(admin_auth_headers, default_org_id):
"runNow": True,
"name": "All Crawls Delete Test Workflow",
"description": "Lorem ipsum",
"tags": ["wr-test-1", "to-delete"],
"config": {
"seeds": [{"url": "https://old.webrecorder.net/"}],
"exclude": "community",
@@ -520,6 +527,7 @@ def custom_behaviors_crawl_id(admin_auth_headers, default_org_id):
crawl_data = {
"runNow": True,
"name": "Custom Behavior Logs",
"tags": ["behaviors", "wr-test-1"],
"config": {
"seeds": [{"url": "https://specs.webrecorder.net/"}],
"customBehaviors": [
@@ -551,13 +559,67 @@ def custom_behaviors_crawl_id(admin_auth_headers, default_org_id):
return crawl_id


@pytest.fixture(scope="session")
def canceled_crawl_id(admin_auth_headers, default_org_id):
crawl_data = {
"runNow": True,
"name": "Canceled crawl",
"tags": ["canceled"],
"config": {
"seeds": [{"url": "https://old.webrecorder.net/"}],
"limit": 5,
},
"browserWindows": 1,
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=admin_auth_headers,
json=crawl_data,
)
data = r.json()

crawl_id = data["run_now_job"]

# Cancel crawl after it's started
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
headers=admin_auth_headers,
)
data = r.json()
if data["state"] in RUNNING_STATES:
break
time.sleep(5)

r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/cancel",
headers=admin_auth_headers,
)
data = r.json()
assert data["success"] == True

# Wait until crawl finishes
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
headers=admin_auth_headers,
)
data = r.json()
if data["state"] in FINISHED_STATES:
break
time.sleep(5)

return crawl_id


@pytest.fixture(scope="session")
def url_list_config_id(crawler_auth_headers, default_org_id):
# Start crawl.
crawl_data = {
"runNow": False,
"name": "URL List config",
"description": "Contains 3 seeds",
"tags": ["wr-test-1", "seed-list"],
"config": {
"seeds": [
{"url": "https://old.webrecorder.net"},
98 changes: 98 additions & 0 deletions backend/test/test_uploads.py
@@ -1065,6 +1065,104 @@ def test_clear_all_presigned_urls(
assert r.json()["success"]


def test_all_crawls_tag_counts(crawler_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/tagCounts",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json() == {
"tags": [
{"tag": "wr-test-1", "count": 3},
{"tag": "wr-test-2", "count": 2},
{"tag": "all-crawls", "count": 1},
{"tag": "behaviors", "count": 1},
{"tag": "four", "count": 1},
{"tag": "qa", "count": 1},
{"tag": "three", "count": 1},
{"tag": "wr-test-1-updated-again", "count": 1},
{"tag": "wr-test-2-updated-again", "count": 1},
]
}


def test_all_crawls_tag_counts_including_failed(
crawler_auth_headers, default_org_id, canceled_crawl_id
):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/tagCounts?onlySuccessful=false",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json() == {
"tags": [
{"tag": "wr-test-1", "count": 3},
{"tag": "wr-test-2", "count": 2},
{"tag": "all-crawls", "count": 1},
{"tag": "behaviors", "count": 1},
{"tag": "canceled", "count": 1},
{"tag": "four", "count": 1},
{"tag": "qa", "count": 1},
{"tag": "three", "count": 1},
{"tag": "wr-test-1-updated-again", "count": 1},
{"tag": "wr-test-2-updated-again", "count": 1},
]
}


def test_crawls_tag_counts(crawler_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/tagCounts",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json() == {
"tags": [
{"tag": "wr-test-1", "count": 3},
{"tag": "wr-test-2", "count": 2},
{"tag": "all-crawls", "count": 1},
{"tag": "behaviors", "count": 1},
{"tag": "qa", "count": 1},
]
}


def test_crawls_tag_counts_including_failed(
crawler_auth_headers, default_org_id, canceled_crawl_id
):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/tagCounts?onlySuccessful=false",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json() == {
"tags": [
{"tag": "wr-test-1", "count": 3},
{"tag": "wr-test-2", "count": 2},
{"tag": "all-crawls", "count": 1},
{"tag": "behaviors", "count": 1},
{"tag": "canceled", "count": 1},
{"tag": "qa", "count": 1},
]
}


def test_uploads_tag_counts(crawler_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/tagCounts",
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert r.json() == {
"tags": [
{"tag": "four", "count": 1},
{"tag": "three", "count": 1},
{"tag": "wr-test-1-updated-again", "count": 1},
{"tag": "wr-test-2-updated-again", "count": 1},
]
}


def test_delete_form_upload_and_crawls_from_all_crawls(
admin_auth_headers,
crawler_auth_headers,