Skip to content

Commit 7ceaeae

Browse files
committed
rename archive_org to archivedotorg, add BinaryWorker, fix config pass-through
1 parent 456aaee commit 7ceaeae

32 files changed: 1111 additions (+1111) and 110 deletions (-110)

.claude/settings.local.json

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,9 @@
11
{
22
"permissions": {
33
"allow": [
4+
"Read(**)",
5+
"Glob(**)",
6+
"Grep(**)",
47
"Bash(python -m archivebox:*)",
58
"Bash(ls:*)",
69
"Bash(xargs:*)",
@@ -29,5 +32,19 @@
2932
"Bash(done)",
3033
"Bash(coverage erase:*)"
3134
]
35+
},
36+
"hooks": {
37+
"PreToolUse": [
38+
{
39+
"matcher": "Bash",
40+
"hooks": [
41+
{
42+
"type": "command",
43+
"command": "REPO_ROOT=$(git rev-parse --show-toplevel 2>/dev/null); if [ -n \"$REPO_ROOT\" ] && [ \"$PWD\" != \"$REPO_ROOT\" ]; then echo \"ERROR: Not in repo root ($REPO_ROOT). Current dir: $PWD\" >&2; exit 1; fi",
44+
"statusMessage": "Checking working directory..."
45+
}
46+
]
47+
}
48+
]
3249
}
3350
}

archivebox/base_models/models.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -128,4 +128,4 @@ def output_dir_str(self) -> str:
128128

129129
@property
130130
def output_dir(self) -> Path:
131-
raise NotImplementedError(f'{self.__class__.__name__} must implement output_dir property')
131+
raise NotImplementedError(f"{self.__class__.__name__} must implement output_dir property")

archivebox/cli/archivebox_run.py

Lines changed: 40 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -59,10 +59,11 @@ def process_stdin_records() -> int:
5959
"""
6060
from django.utils import timezone
6161

62-
from archivebox.misc.jsonl import read_stdin, write_record, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
62+
from archivebox.misc.jsonl import read_stdin, write_record, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_BINARY
6363
from archivebox.base_models.models import get_or_create_system_user_pk
6464
from archivebox.core.models import Snapshot, ArchiveResult
6565
from archivebox.crawls.models import Crawl
66+
from archivebox.machine.models import Binary
6667
from archivebox.workers.orchestrator import Orchestrator
6768

6869
records = list(read_stdin())
@@ -137,6 +138,26 @@ def process_stdin_records() -> int:
137138
output_records.append(archiveresult.to_json())
138139
queued_count += 1
139140

141+
elif record_type == TYPE_BINARY:
142+
# Binary records - create or update and queue for installation
143+
if record_id:
144+
# Existing binary - re-queue
145+
try:
146+
binary = Binary.objects.get(id=record_id)
147+
except Binary.DoesNotExist:
148+
binary = Binary.from_json(record)
149+
else:
150+
# New binary - create it
151+
binary = Binary.from_json(record)
152+
153+
if binary:
154+
binary.retry_at = timezone.now()
155+
if binary.status != Binary.StatusChoices.INSTALLED:
156+
binary.status = Binary.StatusChoices.QUEUED
157+
binary.save()
158+
output_records.append(binary.to_json())
159+
queued_count += 1
160+
140161
else:
141162
# Unknown type - pass through
142163
output_records.append(record)
@@ -222,7 +243,8 @@ def run_snapshot_worker(snapshot_id: str) -> int:
222243
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
223244
@click.option('--crawl-id', help="Run orchestrator for specific crawl only")
224245
@click.option('--snapshot-id', help="Run worker for specific snapshot only")
225-
def main(daemon: bool, crawl_id: str, snapshot_id: str):
246+
@click.option('--binary-id', help="Run worker for specific binary only")
247+
def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str):
226248
"""
227249
Process queued work.
228250
@@ -231,11 +253,27 @@ def main(daemon: bool, crawl_id: str, snapshot_id: str):
231253
- No args + TTY: Run orchestrator for all work
232254
- --crawl-id: Run orchestrator for that crawl only
233255
- --snapshot-id: Run worker for that snapshot only
256+
- --binary-id: Run worker for that binary only
234257
"""
235258
# Snapshot worker mode
236259
if snapshot_id:
237260
sys.exit(run_snapshot_worker(snapshot_id))
238261

262+
# Binary worker mode
263+
if binary_id:
264+
from archivebox.workers.worker import BinaryWorker
265+
try:
266+
worker = BinaryWorker(binary_id=binary_id, worker_id=0)
267+
worker.runloop()
268+
sys.exit(0)
269+
except KeyboardInterrupt:
270+
sys.exit(0)
271+
except Exception as e:
272+
rprint(f'[red]Worker error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
273+
import traceback
274+
traceback.print_exc()
275+
sys.exit(1)
276+
239277
# Crawl worker mode
240278
if crawl_id:
241279
from archivebox.workers.worker import CrawlWorker

archivebox/config/configset.py

Lines changed: 24 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -123,6 +123,7 @@ def get_config(
123123
user: Any = None,
124124
crawl: Any = None,
125125
snapshot: Any = None,
126+
archiveresult: Any = None,
126127
machine: Any = None,
127128
) -> Dict[str, Any]:
128129
"""
@@ -145,11 +146,26 @@ def get_config(
145146
user: User object with config JSON field
146147
crawl: Crawl object with config JSON field
147148
snapshot: Snapshot object with config JSON field
149+
archiveresult: ArchiveResult object (auto-fetches snapshot)
148150
machine: Machine object with config JSON field (defaults to Machine.current())
149151
152+
Note: Objects are auto-fetched from relationships if not provided:
153+
- snapshot auto-fetched from archiveresult.snapshot
154+
- crawl auto-fetched from snapshot.crawl
155+
- user auto-fetched from crawl.created_by
156+
150157
Returns:
151158
Merged config dict
152159
"""
160+
# Auto-fetch related objects from relationships
161+
if snapshot is None and archiveresult and hasattr(archiveresult, "snapshot"):
162+
snapshot = archiveresult.snapshot
163+
164+
if crawl is None and snapshot and hasattr(snapshot, "crawl"):
165+
crawl = snapshot.crawl
166+
167+
if user is None and crawl and hasattr(crawl, "created_by"):
168+
user = crawl.created_by
153169
from archivebox.config.constants import CONSTANTS
154170
from archivebox.config.common import (
155171
SHELL_CONFIG,
@@ -197,12 +213,18 @@ def get_config(
197213
if machine and hasattr(machine, "config") and machine.config:
198214
config.update(machine.config)
199215

200-
# Override with environment variables
216+
# Override with environment variables (for keys that exist in config)
201217
for key in config:
202218
env_val = os.environ.get(key)
203219
if env_val is not None:
204220
config[key] = _parse_env_value(env_val, config.get(key))
205221

222+
# Also add NEW environment variables (not yet in config)
223+
# This is important for worker subprocesses that receive config via Process.env
224+
for key, value in os.environ.items():
225+
if key.isupper() and key not in config: # Only uppercase keys (config convention)
226+
config[key] = _parse_env_value(value, None)
227+
206228
# Also check plugin config aliases in environment
207229
try:
208230
from archivebox.hooks import discover_plugin_configs
@@ -335,7 +357,7 @@ def _parse_env_value(value: str, default: Any = None) -> Any:
335357
"title": 5,
336358
"favicon": 5,
337359
"headers": 5,
338-
"archive_org": 2,
360+
"archivedotorg": 2,
339361
"readability": 3,
340362
"mercury": 3,
341363
"git": 2,

archivebox/core/forms.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -147,7 +147,7 @@ def __init__(self, *args, **kwargs):
147147
'screenshot', 'seo', 'singlefile', 'ssl', 'staticfile', 'title'
148148
}
149149
archiving = {
150-
'archive_org', 'favicon', 'forumdl', 'gallerydl', 'git',
150+
'archivedotorg', 'favicon', 'forumdl', 'gallerydl', 'git',
151151
'htmltotext', 'media', 'mercury', 'papersdl', 'readability', 'wget'
152152
}
153153
parsing = {

archivebox/core/migrations/0007_archiveresult.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -120,7 +120,7 @@ class Migration(migrations.Migration):
120120
('output', models.CharField(max_length=512)),
121121
('start_ts', models.DateTimeField()),
122122
('end_ts', models.DateTimeField()),
123-
('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archive_org', 'archive_org')], max_length=32)),
123+
('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archivedotorg', 'archivedotorg')], max_length=32)),
124124
('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
125125
],
126126
),

archivebox/core/migrations/0011_auto_20210216_1331.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,6 @@ class Migration(migrations.Migration):
1919
migrations.AlterField(
2020
model_name='archiveresult',
2121
name='extractor',
22-
field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
22+
field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32),
2323
),
2424
]

archivebox/core/migrations/0021_auto_20220914_0934.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,6 @@ class Migration(migrations.Migration):
1313
migrations.AlterField(
1414
model_name='archiveresult',
1515
name='extractor',
16-
field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
16+
field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32),
1717
),
1818
]

archivebox/core/migrations/0022_auto_20231023_2008.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,6 @@ class Migration(migrations.Migration):
1313
migrations.AlterField(
1414
model_name='archiveresult',
1515
name='extractor',
16-
field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('htmltotext', 'htmltotext'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
16+
field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('htmltotext', 'htmltotext'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32),
1717
),
1818
]

archivebox/core/models.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1973,7 +1973,7 @@ def find_best_output_in_dir(dir_path: Path, plugin_name: str) -> Optional[str]:
19731973
canonical = {
19741974
'index_path': 'index.html',
19751975
'google_favicon_path': FAVICON_PROVIDER.format(self.domain),
1976-
'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
1976+
'archivedotorg_path': f'https://web.archive.org/web/{self.base_url}',
19771977
}
19781978

19791979
# Scan each ArchiveResult's output directory for the best file

0 commit comments

Comments
 (0)