Skip to content

Commit 057b49a

Browse files
committed
Update status command to use DB as source of truth
Remove imports of the deleted folder utility functions and rewrite the status command to query the Snapshot model directly. This aligns with the fs_version refactor, where the DB is the single source of truth.

- Use Snapshot.objects queries for indexed/archived/unarchived counts
- Scan the filesystem directly for present/orphaned directory counts
- Simplify output to focus on essential status information
1 parent 767458e commit 057b49a

File tree

1 file changed

+27
-41
lines changed

1 file changed

+27
-41
lines changed

archivebox/cli/archivebox_status.py

Lines changed: 27 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,6 @@
1111
from archivebox.config import DATA_DIR, CONSTANTS, ARCHIVE_DIR
1212
from archivebox.config.common import SHELL_CONFIG
1313
from archivebox.misc.legacy import parse_json_links_details
14-
from archivebox.misc.folders import (
15-
get_indexed_folders,
16-
get_archived_folders,
17-
get_invalid_folders,
18-
get_unarchived_folders,
19-
get_present_folders,
20-
get_valid_folders,
21-
get_duplicate_folders,
22-
get_orphaned_folders,
23-
get_corrupted_folders,
24-
get_unrecognized_folders,
25-
)
2614
from archivebox.misc.system import get_dir_size
2715
from archivebox.misc.logging_util import printable_filesize
2816

@@ -55,42 +43,40 @@ def status(out_dir: Path=DATA_DIR) -> None:
5543
size = printable_filesize(num_bytes)
5644
print(f' Size: {size} across {num_files} files in {num_dirs} directories')
5745

58-
num_indexed = len(get_indexed_folders(links, out_dir=out_dir))
59-
num_archived = len(get_archived_folders(links, out_dir=out_dir))
60-
num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir))
61-
print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
62-
print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
63-
print(f' > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')
64-
65-
num_present = len(get_present_folders(links, out_dir=out_dir))
66-
num_valid = len(get_valid_folders(links, out_dir=out_dir))
46+
# Use DB as source of truth for snapshot status
47+
num_indexed = links.count()
48+
num_archived = links.filter(status='archived').count() or links.exclude(downloaded_at=None).count()
49+
num_unarchived = links.filter(status='queued').count() or links.filter(downloaded_at=None).count()
50+
print(f' > indexed: {num_indexed}'.ljust(36), '(total snapshots in DB)')
51+
print(f' > archived: {num_archived}'.ljust(36), '(snapshots with archived content)')
52+
print(f' > unarchived: {num_unarchived}'.ljust(36), '(snapshots pending archiving)')
53+
54+
# Count directories on filesystem
55+
num_present = 0
56+
orphaned_dirs = []
57+
if ARCHIVE_DIR.exists():
58+
for entry in ARCHIVE_DIR.iterdir():
59+
if entry.is_dir():
60+
num_present += 1
61+
if not links.filter(timestamp=entry.name).exists():
62+
orphaned_dirs.append(str(entry))
63+
64+
num_valid = min(num_present, num_indexed) # approximate
6765
print()
68-
print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
69-
print(f' > [green]valid:[/green] {num_valid}'.ljust(36), f' ({get_valid_folders.__doc__})')
70-
71-
duplicate = get_duplicate_folders(links, out_dir=out_dir)
72-
orphaned = get_orphaned_folders(links, out_dir=out_dir)
73-
corrupted = get_corrupted_folders(links, out_dir=out_dir)
74-
unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
75-
num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
76-
print(f' > [red]invalid:[/red] {num_invalid}'.ljust(36), f' ({get_invalid_folders.__doc__})')
77-
print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
78-
print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
79-
print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
80-
print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')
66+
print(f' > present: {num_present}'.ljust(36), '(directories in archive/)')
67+
print(f' > [green]valid:[/green] {num_valid}'.ljust(36), ' (directories with matching DB entry)')
68+
69+
num_orphaned = len(orphaned_dirs)
70+
print(f' > [red]orphaned:[/red] {num_orphaned}'.ljust(36), ' (directories without matching DB entry)')
8171

8272
if num_indexed:
83-
print(' [violet]Hint:[/violet] You can list link data directories by status like so:')
84-
print(' [green]archivebox list --status=<status> (e.g. indexed, corrupted, archived, etc.)[/green]')
73+
print(' [violet]Hint:[/violet] You can list snapshots by status like so:')
74+
print(' [green]archivebox list --status=<status> (e.g. archived, queued, etc.)[/green]')
8575

86-
if orphaned:
76+
if orphaned_dirs:
8777
print(' [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:')
8878
print(' [green]archivebox init[/green]')
8979

90-
if num_invalid:
91-
print(' [violet]Hint:[/violet] You may need to manually remove or fix some invalid data directories, afterwards make sure to run:')
92-
print(' [green]archivebox init[/green]')
93-
9480
print()
9581
print('[green]\\[*] Scanning recent archive changes and user logins:[/green]')
9682
print(f'[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]')

0 commit comments

Comments
 (0)