Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions backend/btrixcloud/auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@

JWT_TOKEN_LIFETIME = int(os.environ.get("JWT_TOKEN_LIFETIME_MINUTES", 60))

# internal token lifetime, in minutes: set to one year
INTERNAL_JWT_TOKEN_LIFETIME = 60 * 24 * 365

BTRIX_SUBS_APP_API_KEY = os.environ.get("BTRIX_SUBS_APP_API_KEY", "")

ALGORITHM = "HS256"
Expand All @@ -39,6 +42,7 @@
PWD_CONTEXT = CryptContext(schemes=["bcrypt"], deprecated="auto")

# Audiences
CUSTOM_AUTH_AUD = "btrix:custom-auth"
AUTH_AUD = "btrix:auth"
RESET_AUD = "btrix:reset"
VERIFY_AUD = "btrix:verify"
Expand Down Expand Up @@ -117,6 +121,26 @@ def create_access_token(user: User) -> str:
return generate_jwt({"sub": str(user.id), "aud": AUTH_AUD}, JWT_TOKEN_LIFETIME)


# ============================================================================
def create_custom_jwt_token(sub: str, data: dict[str, str]) -> str:
    """Mint a long-lived JWT for internal crawler access.

    The token payload contains the entries of *data* plus the subject
    and the custom-auth audience claim, and expires after
    INTERNAL_JWT_TOKEN_LIFETIME minutes (one year).
    """
    claims: dict[str, str] = dict(data)
    claims["sub"] = sub
    claims["aud"] = CUSTOM_AUTH_AUD
    return generate_jwt(claims, INTERNAL_JWT_TOKEN_LIFETIME)


# ============================================================================
def get_custom_jwt_token(request: Request) -> dict[str, str]:
    """Decode the custom JWT passed in the ``auth_bearer`` query param.

    Returns the token's claims on success, or an empty dict when the
    token is missing, expired, or fails audience/signature validation.
    """
    token = request.query_params.get("auth_bearer") or ""
    try:
        return decode_jwt(token, [CUSTOM_AUTH_AUD])
    # narrowed from a bare `except:` — that form also swallowed
    # KeyboardInterrupt/SystemExit; Exception keeps the intended
    # "invalid token means unauthenticated" best-effort semantics
    # pylint: disable=broad-exception-caught
    except Exception:
        return {}


# ============================================================================
def verify_password(plain_password: str, hashed_password: str) -> bool:
"""verify password by hash"""
Expand Down
28 changes: 28 additions & 0 deletions backend/btrixcloud/colls.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@
case_insensitive_collation,
)

from .auth import get_custom_jwt_token

from .crawlmanager import CrawlManager

if TYPE_CHECKING:
Expand Down Expand Up @@ -1287,6 +1289,22 @@ def init_collections_api(
org_viewer_dep = orgs.org_viewer_dep
org_public = orgs.org_public

async def coll_internal_access_dep(
    coll_id: UUID, token_data: dict[str, str] = Depends(get_custom_jwt_token)
) -> UUID:
    """FastAPI dependency authorizing internal access to a collection.

    Access is granted only when the custom JWT's subject matches
    *coll_id* (with sub_type "coll") AND the k8s object the token is
    scoped to still exists; otherwise a 403 is raised.
    """
    # first, check subject matches the collection id and type is collection
    if token_data.get("sub_type") == "coll" and token_data.get("sub") == str(
        coll_id
    ):
        # second, check that the k8s object the token is scoped to still
        # exists -- this revokes the token once the job is deleted
        if await crawl_manager.validate_k8s_obj_exists(
            token_data.get("scope_type", ""), token_data.get("scope", "")
        ):
            return coll_id

    # otherwise, deny access
    raise HTTPException(status_code=403, detail="access_denied")

@app.post(
"/orgs/{oid}/collections",
tags=["collections"],
Expand Down Expand Up @@ -1373,6 +1391,16 @@ async def get_collection_replay(
coll_id, org, resources=True, headers=dict(request.headers)
)

@app.get(
    "/orgs/{oid}/collections/{coll_id}/internal/replay.json",
    tags=["collections"],
    response_model=ResourcesOnly,
)
async def get_internal_replay(
    oid: UUID, coll_id: UUID = Depends(coll_internal_access_dep)
):
    """Return the collection's replay resource list for internal
    crawler use; authorization is enforced entirely by the
    coll_internal_access_dep custom-JWT dependency."""
    return await colls.get_internal_replay_list(coll_id, oid)

@app.get(
"/orgs/{oid}/collections/{coll_id}/public/replay.json",
tags=["collections"],
Expand Down
22 changes: 22 additions & 0 deletions backend/btrixcloud/crawlmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from .utils import dt_now, date_to_str, scale_from_browser_windows
from .k8sapi import K8sAPI, ApiException
from .auth import create_custom_jwt_token

from .models import (
StorageRef,
Expand All @@ -25,6 +26,8 @@

DEFAULT_NAMESPACE: str = os.environ.get("DEFAULT_NAMESPACE", "default")

BACKEND_ORIGIN: str = os.environ.get("BACKEND_ORIGIN", "")


# ============================================================================
# pylint: disable=too-many-public-methods
Expand Down Expand Up @@ -241,6 +244,17 @@ async def run_index_import_job(
else f"purge-index-{coll_id}"
)

if job_type in ("purge", "import"):
auth_bearer = create_custom_jwt_token(
coll_id, {"sub_type": "coll", "scope_type": "job", "scope": name}
)
import_source_url = (
f"{BACKEND_ORIGIN}/api/orgs/{oid}/collections/{coll_id}"
+ f"/internal/replay.json?auth_bearer={auth_bearer}"
)
else:
import_source_url = ""

params = {
"name": name,
"id": coll_id,
Expand All @@ -250,6 +264,7 @@ async def run_index_import_job(
"job_type": job_type,
"redis_url": self.get_redis_url("coll-" + str(coll_id)),
"crawl_id": crawl_id,
"import_source_url": import_source_url,
}

data = self.templates.env.get_template("index-import-job.yaml").render(params)
Expand All @@ -269,6 +284,13 @@ async def run_index_import_job(

return name

async def validate_k8s_obj_exists(self, obj_type: str, name: str) -> bool:
    """Check whether the named k8s object of the given type exists.

    Only the "job" object type is currently supported; any other type
    is reported as not existing.
    """
    if obj_type != "job":
        return False

    return await self.has_job(name)

async def delete_dedupe_index_resources(self, oid: str, coll_id: str) -> None:
"""Delete dedupe index-related jobs and index itself"""
await self._delete_jobs(f"role=index-import-job,oid={oid},coll={coll_id}")
Expand Down
11 changes: 11 additions & 0 deletions backend/btrixcloud/k8sapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,17 @@ async def unsuspend_k8s_job(self, name) -> dict:
traceback.print_exc()
return {"error": str(exc)}

async def has_job(self, name) -> bool:
    """Return True if a k8s Job with the given name exists in our namespace.

    Best-effort: any API failure (not-found, auth, transient network)
    is reported as "job does not exist".
    """
    try:
        await self.batch_api.read_namespaced_job(
            name=name, namespace=self.namespace
        )
        return True
    # narrowed from a bare `except:`, which would also swallow
    # KeyboardInterrupt/SystemExit during the awaited API call
    # pylint: disable=broad-exception-caught
    except Exception:
        return False

async def print_pod_logs(self, pod_names, lines=100):
"""print pod logs"""
for pod in pod_names:
Expand Down
27 changes: 4 additions & 23 deletions backend/btrixcloud/operator/collindexjob.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
"""Operator handler for Dedupe Index Import Job"""

from uuid import UUID

from .models import (
MCBaseRequest,
MCDecoratorSyncData,
Expand Down Expand Up @@ -70,11 +68,9 @@ async def sync_job(self, data: MCDecoratorSyncData):
data, coll_id, oid, allowed_states
)

# keep configmap if already exists, or add if index is ready
# keep configmap if exists or add only if index is ready
if configmap or index_ready:
attachments = await self.load_import_configmap(
coll_id, name, oid, configmap
)
attachments = self.create_configmap(coll_id, name)

# delete succeeded job
if data.object.get("status", {}).get("succeeded", 0) >= 1:
Expand All @@ -83,26 +79,11 @@ async def sync_job(self, data: MCDecoratorSyncData):

return MCDecoratorSyncResponse(attachments=attachments)

async def load_import_configmap(self, coll_id: str, name: str, oid: str, configmap):
"""create configmap for import job, lookup resources only on first init"""
# pylint: disable=duplicate-code
if configmap and not self.is_configmap_update_needed("config.json", configmap):
metadata = configmap["metadata"]
configmap["metadata"] = {
"name": metadata["name"],
"namespace": metadata["namespace"],
"labels": metadata["labels"],
}
return [configmap]

replay_list = await self.coll_ops.get_internal_replay_list(
UUID(coll_id), UUID(oid)
)

def create_configmap(self, coll_id: str, name: str) -> list[str]:
    """Create a data-less configmap acting as a semaphore that marks
    the import job as ready -- it carries no actual config data."""
    params = {
        "name": name,
        "namespace": self.k8s.shared_params["namespace"],
        "id": coll_id,
    }
    return self.load_from_yaml("index-import-configmap.yaml", params)
3 changes: 0 additions & 3 deletions chart/app-templates/index-import-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,3 @@ metadata:
labels:
coll: {{ id }}
role: configmap

data:
config.json: {{ config | tojson }}
4 changes: 2 additions & 2 deletions chart/app-templates/index-import-job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,11 @@ spec:

{% if job_type == "import" %}
- --sourceUrl
- /btrix-config/config.json
- {{ import_source_url }}

{% elif job_type == "purge" %}
- --sourceUrl
- /btrix-config/config.json
- {{ import_source_url }}
- --removing

{% elif job_type == "commit" %}
Expand Down
2 changes: 2 additions & 0 deletions chart/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ data:

CRAWLER_FQDN_SUFFIX: ".{{ .Values.crawler_namespace }}{{ .Values.fqdn_suffix }}"

BACKEND_ORIGIN: "http://browsertrix-cloud-backend.{{ .Release.Namespace }}{{ .Values.fqdn_suffix }}:8000"

DEFAULT_ORG: "{{ .Values.default_org }}"

INVITE_EXPIRE_SECONDS: "{{ .Values.invite_expire_seconds }}"
Expand Down
6 changes: 5 additions & 1 deletion frontend/frontend.conf.template
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ server {
# serve a 404 page for /replay/ path, as that should be taken over by RWP
location /replay/ {
default_type application/json;
return 404 "{\"error\": \"placeholder_for_replay\"}";
return 404 "{\"detail\":\"placeholder_for_replay\"}";
}

# used by docker only: k8s deployment handles /api directly via ingress
Expand All @@ -90,6 +90,10 @@ server {
proxy_read_timeout 300;
}

location ~* /internal/ {
return 403 "{\"detail\":\"access_denied\"}";
}

location ~* /watch/([^/]+)/([^/]+)/([^/]+)/ws {
set $org $1;
set $crawl $2;
Expand Down
Loading