Skip to content

Commit ec4b270

Browse files
committed
wip
1 parent f3f55d3 commit ec4b270

File tree

113 files changed

+6948
-2415
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

113 files changed

+6948
-2415
lines changed

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,7 @@ VOLUME "$DATA_DIR"
393393
EXPOSE 8000
394394

395395
HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
396-
CMD curl --silent 'http://localhost:8000/health/' | grep -q 'OK'
396+
CMD curl --silent 'http://admin.archivebox.localhost:8000/health/' | grep -q 'OK'
397397

398398
ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"]
399399
CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"]

README.md

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,8 @@ archivebox init --setup
104104
curl -fsSL 'https://get.archivebox.io' | bash
105105
</code></pre>
106106
<br/>
107-
<sub>Open <a href="http://localhost:8000"><code>http://localhost:8000</code></a> to see your server's Web UI ➡️</sub>
107+
<sub>Open <a href="http://web.archivebox.localhost:8000"><code>http://web.archivebox.localhost:8000</code></a> for the public UI and <a href="http://admin.archivebox.localhost:8000"><code>http://admin.archivebox.localhost:8000</code></a> for the admin UI ➡️</sub><br/>
108+
<sub>Set <code>LISTEN_HOST</code> to change the base domain; <code>web.</code> and <code>admin.</code> subdomains are used automatically.</sub>
108109
</details>
109110
<br/>
110111

@@ -469,6 +470,7 @@ For more discussion on managed and paid hosting options see here: <a href="https
469470
#### ➡️&nbsp; Next Steps
470471

471472
- Import URLs from some of the supported [Input Formats](#input-formats) or view the supported [Output Formats](#output-formats)...
473+
- (Optional) Create a persona and import browser cookies to archive logged-in sites: `archivebox persona create --import=chrome personal`
472474
- Tweak your UI or archiving behavior [Configuration](#configuration), read about some of the [Caveats](#caveats), or [Troubleshoot](https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting)
473475
- Read about the [Dependencies](#dependencies) used for archiving, the [Upgrading Process](https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives), or the [Archive Layout](#archive-layout) on disk...
474476
- Or check out our full [Documentation](#documentation) or [Community Wiki](#internet-archiving-ecosystem)...
@@ -495,6 +497,11 @@ docker compose run archivebox help
495497

496498
# equivalent: docker run -it -v $PWD:/data archivebox/archivebox [subcommand] [--help]
497499
docker run -it -v $PWD:/data archivebox/archivebox help
500+
501+
# optional: import your browser cookies into a persona for logged-in archiving
502+
archivebox persona create --import=chrome personal
503+
# supported: chrome/chromium/brave/edge (Chromium-based only)
504+
# re-running import merges/dedupes cookies.txt (by domain/path/name) but replaces chrome_user_data
498505
```
499506

500507
#### ArchiveBox Subcommands
@@ -587,7 +594,8 @@ docker run -v $PWD:/data -it archivebox/archivebox archivebox manage createsuper
587594
docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox
588595
</code></pre>
589596

590-
<sup>Open <a href="http://localhost:8000"><code>http://localhost:8000</code></a> to see your server's Web UI ➡️</sup>
597+
<sup>Open <a href="http://web.archivebox.localhost:8000"><code>http://web.archivebox.localhost:8000</code></a> for the public UI and <a href="http://admin.archivebox.localhost:8000"><code>http://admin.archivebox.localhost:8000</code></a> for the admin UI ➡️</sup><br/>
598+
<sup>Set <code>LISTEN_HOST</code> to change the base domain; <code>web.</code> and <code>admin.</code> subdomains are used automatically.</sup>
591599
<br/><br/>
592600
<i>For more info, see our <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#ui-usage">Usage: Web UI</a> wiki. ➡️</i>
593601
<br/><br/>

archivebox/api/auth.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,12 +127,25 @@ class UsernameAndPasswordAuth(UserPassAuthCheck, HttpBasicAuth):
127127
"""Allow authenticating by passing username & password via HTTP Basic Authentication (not recommended)"""
128128
pass
129129

130+
class DjangoSessionAuth:
    """Allow authenticating with existing Django session cookies (same-origin only)."""

    def __call__(self, request: HttpRequest) -> Optional[AbstractBaseUser]:
        # django-ninja invokes auth objects as callables; delegate to authenticate().
        return self.authenticate(request)

    def authenticate(self, request: HttpRequest, **kwargs) -> Optional[AbstractBaseUser]:
        # Only trust a user that Django's session middleware already attached to the request.
        user = getattr(request, 'user', None)
        if user and user.is_authenticated:
            # Record which auth method succeeded so downstream code can inspect it.
            request._api_auth_method = self.__class__.__name__
            if not user.is_superuser:
                # Authenticated but not authorized: fail hard with 403 rather than
                # returning None (which would let another auth method be tried).
                raise HttpError(403, 'Valid session but User does not have permission (make sure user.is_superuser=True)')
            return cast(AbstractBaseUser, user)
        # No valid session: return None so the next method in API_AUTH_METHODS is tried.
        return None
143+
130144
### Enabled Auth Methods
131145

132146
API_AUTH_METHODS = [
133147
HeaderTokenAuth(),
134148
BearerTokenAuth(),
135149
QueryParamTokenAuth(),
136150
# django_auth_superuser, # django admin cookie auth, not secure to use with csrf=False
137-
UsernameAndPasswordAuth(),
138151
]

archivebox/api/middleware.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
__package__ = 'archivebox.api'
2+
3+
from django.http import HttpResponse
4+
5+
6+
class ApiCorsMiddleware:
    """Attach permissive CORS headers for API routes (token-based auth)."""

    def __init__(self, get_response):
        self.get_response = get_response

    def __call__(self, request):
        # Non-API routes pass through completely untouched.
        if not request.path.startswith('/api/'):
            return self.get_response(request)

        is_preflight = (
            request.method == 'OPTIONS'
            and request.META.get('HTTP_ACCESS_CONTROL_REQUEST_METHOD')
        )
        if is_preflight:
            # Answer the CORS preflight directly, without invoking the view layer.
            return self._add_cors_headers(request, HttpResponse(status=204))

        return self._add_cors_headers(request, self.get_response(request))

    def _add_cors_headers(self, request, response):
        # Only decorate cross-origin requests (browsers send the Origin header);
        # same-origin requests are returned unchanged.
        if request.META.get('HTTP_ORIGIN'):
            response['Access-Control-Allow-Origin'] = '*'
            response['Access-Control-Allow-Methods'] = 'GET, POST, PUT, PATCH, DELETE, OPTIONS'
            response['Access-Control-Allow-Headers'] = (
                'Authorization, X-ArchiveBox-API-Key, Content-Type, X-CSRFToken'
            )
            response['Access-Control-Max-Age'] = '600'
        return response

archivebox/api/v1_core.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,11 @@ def resolve_archiveresults(obj, context):
188188
return ArchiveResult.objects.none()
189189

190190

191+
class SnapshotUpdateSchema(Schema):
    """Fields a client may PATCH on a Snapshot; unset fields are left unchanged."""
    status: str | None = None
    retry_at: datetime | None = None
194+
195+
191196
class SnapshotFilterSchema(FilterSchema):
192197
id: Optional[str] = Field(None, q=['id__icontains', 'timestamp__startswith'])
193198
created_by_id: str = Field(None, q='crawl__created_by_id')
@@ -225,6 +230,31 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool = True):
225230
return Snapshot.objects.get(Q(id__icontains=snapshot_id))
226231

227232

233+
@router.patch("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="patch_snapshot")
def patch_snapshot(request, snapshot_id: str, data: SnapshotUpdateSchema):
    """Update a snapshot (e.g., set status=sealed to cancel queued work)."""
    # Resolve by id prefix or timestamp prefix first; fall back to a looser
    # icontains match so short id fragments still resolve.
    try:
        snapshot = Snapshot.objects.get(Q(id__startswith=snapshot_id) | Q(timestamp__startswith=snapshot_id))
    except Snapshot.DoesNotExist:
        snapshot = Snapshot.objects.get(Q(id__icontains=snapshot_id))

    # exclude_unset=True: only fields the client actually sent are applied.
    payload = data.dict(exclude_unset=True)

    if 'status' in payload:
        if payload['status'] not in Snapshot.StatusChoices.values:
            raise HttpError(400, f'Invalid status: {payload["status"]}')
        snapshot.status = payload['status']
        # Sealing cancels any pending retry, unless the client explicitly set
        # retry_at in the same request (handled below).
        if snapshot.status == Snapshot.StatusChoices.SEALED and 'retry_at' not in payload:
            snapshot.retry_at = None

    if 'retry_at' in payload:
        snapshot.retry_at = payload['retry_at']

    # NOTE(review): save() runs even when payload is empty, which still bumps
    # modified_at — confirm that's intended for no-op PATCHes.
    snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
    # Skip serializing nested archiveresults in the response payload.
    request.with_archiveresults = False
    return snapshot
257+
228258
### Tag #########################################################################
229259

230260
class TagSchema(Schema):

archivebox/api/v1_crawls.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@
33
from uuid import UUID
44
from typing import List
55
from datetime import datetime
6+
from django.utils import timezone
67

78
from django.db.models import Q
89
from django.contrib.auth import get_user_model
910

1011
from ninja import Router, Schema
12+
from ninja.errors import HttpError
1113

1214
from archivebox.core.models import Snapshot
1315
from archivebox.crawls.models import Crawl
@@ -54,6 +56,11 @@ def resolve_snapshots(obj, context):
5456
return Snapshot.objects.none()
5557

5658

59+
class CrawlUpdateSchema(Schema):
    """Fields a client may PATCH on a Crawl; unset fields are left unchanged."""
    status: str | None = None
    retry_at: datetime | None = None
62+
63+
5764
@router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls")
5865
def get_crawls(request):
5966
return Crawl.objects.all().distinct()
@@ -79,3 +86,32 @@ def get_crawl(request, crawl_id: str, as_rss: bool=False, with_snapshots: bool=F
7986

8087
return crawl
8188

89+
90+
@router.patch("/crawl/{crawl_id}", response=CrawlSchema, url_name="patch_crawl")
def patch_crawl(request, crawl_id: str, data: CrawlUpdateSchema):
    """Update a crawl (e.g., set status=sealed to cancel queued work).

    Sealing a crawl also seals any of its snapshots that are still
    queued/started, so no further work is scheduled for them.
    """
    # Consistency fix: mirror patch_snapshot's lookup — try an exact id-prefix
    # match first, then fall back to the looser icontains match. The original
    # used icontains only, so a short fragment matching the middle of several
    # ids raised MultipleObjectsReturned (HTTP 500) even when it was an exact
    # prefix of exactly one crawl.
    try:
        crawl = Crawl.objects.get(id__startswith=crawl_id)
    except Crawl.DoesNotExist:
        crawl = Crawl.objects.get(id__icontains=crawl_id)

    # exclude_unset=True: only fields the client actually sent are applied.
    payload = data.dict(exclude_unset=True)

    if 'status' in payload:
        if payload['status'] not in Crawl.StatusChoices.values:
            raise HttpError(400, f'Invalid status: {payload["status"]}')
        crawl.status = payload['status']
        # Sealing cancels any pending retry, unless the client explicitly set
        # retry_at in the same request (handled below).
        if crawl.status == Crawl.StatusChoices.SEALED and 'retry_at' not in payload:
            crawl.retry_at = None

    if 'retry_at' in payload:
        crawl.retry_at = payload['retry_at']

    crawl.save(update_fields=['status', 'retry_at', 'modified_at'])

    # Cascade the cancel: seal this crawl's in-flight snapshots too.
    if payload.get('status') == Crawl.StatusChoices.SEALED:
        Snapshot.objects.filter(
            crawl=crawl,
            status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED],
        ).update(
            status=Snapshot.StatusChoices.SEALED,
            retry_at=None,
            modified_at=timezone.now(),
        )
    return crawl

0 commit comments

Comments
 (0)