Skip to content

Commit 6fadcf5

Browse files
committed
remove model health stats from models that don't need it
1 parent e903fa1 commit 6fadcf5

File tree

5 files changed

+46
-1016
lines changed

5 files changed

+46
-1016
lines changed

archivebox/core/migrations/0023_upgrade_to_0_9_0.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,11 @@ def upgrade_core_tables(apps, schema_editor):
143143
if has_added and not has_bookmarked_at:
144144
# Migrating from v0.7.2 (has added/updated, no bookmarked_at/created_at/modified_at)
145145
print('Migrating Snapshot from v0.7.2 schema...')
146+
# Debug: Check what data we're about to copy
147+
cursor.execute("SELECT id, added, updated FROM core_snapshot LIMIT 3")
148+
sample_data = cursor.fetchall()
149+
print(f'DEBUG 0023: Sample Snapshot data before migration: {sample_data}')
150+
146151
cursor.execute("""
147152
INSERT OR IGNORE INTO core_snapshot_new (
148153
id, url, timestamp, title, bookmarked_at, created_at, modified_at
@@ -154,6 +159,11 @@ def upgrade_core_tables(apps, schema_editor):
154159
COALESCE(updated, added, CURRENT_TIMESTAMP) as modified_at
155160
FROM core_snapshot;
156161
""")
162+
163+
# Debug: Check what was inserted
164+
cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot_new LIMIT 3")
165+
inserted_data = cursor.fetchall()
166+
print(f'DEBUG 0023: Sample Snapshot data after INSERT: {inserted_data}')
157167
elif has_bookmarked_at and not has_added:
158168
# Migrating from v0.8.6rc0 (already has bookmarked_at/created_at/modified_at)
159169
print('Migrating Snapshot from v0.8.6rc0 schema...')
@@ -298,12 +308,15 @@ class Migration(migrations.Migration):
298308
),
299309
],
300310
state_operations=[
301-
# Remove old ArchiveResult fields
302-
migrations.RemoveField(model_name='archiveresult', name='extractor'),
303-
migrations.RemoveField(model_name='archiveresult', name='output'),
304-
# Remove old Snapshot fields
311+
# NOTE: We do NOT remove extractor/output here for ArchiveResult!
312+
# They are still in the database and will be removed by migration 0025
313+
# after copying their data to the new field names (plugin, output_str).
314+
315+
# However, for Snapshot, we DO remove added/updated here because
316+
# the database operations above already renamed them to bookmarked_at/created_at/modified_at.
305317
migrations.RemoveField(model_name='snapshot', name='added'),
306318
migrations.RemoveField(model_name='snapshot', name='updated'),
319+
307320
# SnapshotTag table already exists from v0.7.2, just declare it in state
308321
migrations.CreateModel(
309322
name='SnapshotTag',

archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def copy_old_fields_to_new(apps, schema_editor):
2525
count = cursor.fetchone()[0]
2626
print(f'DEBUG 0025: Updated {count} rows with plugin data')
2727
else:
28-
print(f'DEBUG 0025: NOT copying - extractor in cols: {extractor" in cols}, plugin in cols: {"plugin" in cols}')
28+
print(f'DEBUG 0025: NOT copying - extractor in cols: {"extractor" in cols}, plugin in cols: {"plugin" in cols}')
2929

3030
if 'output' in cols and 'output_str' in cols:
3131
# Copy output -> output_str
@@ -239,6 +239,16 @@ class Migration(migrations.Migration):
239239
copy_old_fields_to_new,
240240
reverse_code=migrations.RunPython.noop,
241241
),
242+
# Now remove the old ArchiveResult fields after data has been copied
243+
migrations.RemoveField(
244+
model_name='archiveresult',
245+
name='extractor',
246+
),
247+
migrations.RemoveField(
248+
model_name='archiveresult',
249+
name='output',
250+
),
251+
# NOTE: Snapshot's added/updated fields were already removed by migration 0023
242252
migrations.AlterField(
243253
model_name='archiveresult',
244254
name='end_ts',

archivebox/core/models.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
get_plugins, get_plugin_name, get_plugin_icon,
3030
)
3131
from archivebox.base_models.models import (
32-
ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
32+
ModelWithUUID, ModelWithOutputDir,
3333
ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
3434
get_or_create_system_user_pk,
3535
)
@@ -40,7 +40,7 @@
4040

4141

4242

43-
class Tag(ModelWithSerializers):
43+
class Tag(ModelWithUUID):
4444
# Keep AutoField for compatibility with main branch migrations
4545
# Don't use UUIDField here - requires complex FK transformation
4646
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
@@ -2254,7 +2254,7 @@ def enter_sealed(self):
22542254
)
22552255

22562256

2257-
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
2257+
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithStateMachine):
22582258
class StatusChoices(models.TextChoices):
22592259
QUEUED = 'queued', 'Queued'
22602260
STARTED = 'started', 'Started'
@@ -2551,11 +2551,20 @@ def save_search_index(self):
25512551
pass
25522552

25532553
def cascade_health_update(self, success: bool):
2554-
"""Update health stats for self, parent Snapshot, and grandparent Crawl."""
2555-
self.increment_health_stats(success)
2554+
"""Update health stats for parent Snapshot, Crawl, and execution infrastructure (Binary, Machine, NetworkInterface)."""
2555+
# Update archival hierarchy
25562556
self.snapshot.increment_health_stats(success)
25572557
self.snapshot.crawl.increment_health_stats(success)
25582558

2559+
# Update execution infrastructure
2560+
if self.binary:
2561+
self.binary.increment_health_stats(success)
2562+
if self.binary.machine:
2563+
self.binary.machine.increment_health_stats(success)
2564+
2565+
if self.iface:
2566+
self.iface.increment_health_stats(success)
2567+
25592568
def run(self):
25602569
"""
25612570
Execute this ArchiveResult's hook and update status.

archivebox/crawls/models.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,14 @@
1616
from rich import print
1717

1818
from archivebox.config import CONSTANTS
19-
from archivebox.base_models.models import ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, get_or_create_system_user_pk
19+
from archivebox.base_models.models import ModelWithUUID, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, get_or_create_system_user_pk
2020
from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
2121

2222
if TYPE_CHECKING:
2323
from archivebox.core.models import Snapshot, ArchiveResult
2424

2525

26-
class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
26+
class CrawlSchedule(ModelWithUUID, ModelWithNotes):
2727
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
2828
created_at = models.DateTimeField(default=timezone.now, db_index=True)
2929
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
@@ -197,9 +197,9 @@ def from_json(record: dict, overrides: dict = None):
197197

198198
@property
199199
def output_dir_parent(self) -> str:
200-
"""Construct parent directory: users/{user_id}/crawls/{YYYYMMDD}"""
200+
"""Construct parent directory: users/{username}/crawls/{YYYYMMDD}"""
201201
date_str = self.created_at.strftime('%Y%m%d')
202-
return f'users/{self.created_by_id}/crawls/{date_str}'
202+
return f'users/{self.created_by.username}/crawls/{date_str}'
203203

204204
@property
205205
def output_dir_name(self) -> str:

0 commit comments

Comments
 (0)