Skip to content

Commit 8f8d64e

Browse files
committed
Added command
1 parent fb09bbc commit 8f8d64e

File tree

1 file changed

+157
-0
lines changed

1 file changed

+157
-0
lines changed
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
"""
2+
docker-compose -f local.yml run --rm django python manage.py export_urls_to_csv \
3+
--output physics_of_the_cosmos.csv --collections physics_of_the_cosmos
4+
5+
"""
6+
7+
import csv
8+
import json
9+
import os
10+
from pathlib import Path
11+
12+
from django.apps import apps
13+
from django.core.management.base import BaseCommand
14+
15+
from sde_collections.models.collection import Collection
16+
from sde_collections.models.collection_choice_fields import Divisions, DocumentTypes
17+
18+
19+
class Command(BaseCommand):
    """Export URL records from one of the URL models to a CSV file.

    The model is chosen with ``--model`` (DumpUrl, DeltaUrl or CuratedUrl) and
    the rows can be restricted to specific collections with ``--collections``.
    The file is always written inside the ``csv_exports`` directory (relative
    to the current working directory); only the base name of ``--output`` is
    used.
    """

    help = "Export URLs from DumpUrl, DeltaUrl, or CuratedUrl models to CSV"

    def add_arguments(self, parser):
        """Register the command-line options understood by this command."""
        parser.add_argument(
            "--model",
            type=str,
            default="CuratedUrl",
            choices=["DumpUrl", "DeltaUrl", "CuratedUrl"],
            help="Model to export (default: CuratedUrl)",
        )
        parser.add_argument(
            "--collections", nargs="+", type=str, help="Collection config_folders to filter by (default: all)"
        )
        parser.add_argument(
            "--output", type=str, default="exported_urls.csv", help="Output CSV file path (default: exported_urls.csv)"
        )
        parser.add_argument(
            "--batch-size", type=int, default=1000, help="Number of records to process in each batch (default: 1000)"
        )

    @staticmethod
    def _build_row(obj, fields, tag_fields, choice_labels):
        """Return the list of CSV cell values for ``obj``, one per entry in ``fields``.

        ``choice_labels`` maps a field name to a ``{stored value: label}`` dict;
        values without a label fall back to ``str(value)`` and ``None`` becomes
        an empty string. Fields listed in ``tag_fields`` are JSON-encoded when
        truthy and written as an empty string otherwise.
        """
        row = []
        for field in fields:
            if field == "collection__name":
                value = obj.collection.name
            elif field == "collection__config_folder":
                value = obj.collection.config_folder
            elif field in choice_labels:
                raw = getattr(obj, field)
                value = "" if raw is None else choice_labels[field].get(raw, str(raw))
            elif field in tag_fields:
                tags = getattr(obj, field)
                value = json.dumps(tags) if tags else ""
            else:
                value = getattr(obj, field, "")
            row.append(value)
        return row

    def handle(self, *args, **options):
        """Run the export.

        Writes a header row followed by one row per matching record and prints
        progress after every batch. Any exception raised while writing is
        reported on stderr and then re-raised.
        """
        model_name = options["model"]
        collection_folders = options["collections"]
        batch_size = options["batch_size"]

        # A zero step makes range() raise and a negative one silently exports
        # nothing, so reject both before touching the filesystem.
        if batch_size < 1:
            self.stderr.write(self.style.ERROR("--batch-size must be a positive integer"))
            return

        csv_exports_dir = Path("csv_exports")
        csv_exports_dir.mkdir(parents=True, exist_ok=True)

        # Only the base name of --output is honoured; the file always lands in
        # csv_exports/.
        output_file = os.path.join(csv_exports_dir, os.path.basename(options["output"]))

        self.stdout.write(f"Exporting {model_name} to {output_file}")

        # Get the model class
        model_class = apps.get_model("sde_collections", model_name)

        # Join the collection up front so reading obj.collection.* below does
        # not issue one extra query per exported row.
        queryset = model_class.objects.select_related("collection")

        # Filter by collections if specified
        if collection_folders:
            collections = Collection.objects.filter(config_folder__in=collection_folders)
            collection_count = collections.count()
            if collection_count == 0:
                self.stderr.write(self.style.ERROR("No collections found with the specified folder names"))
                return
            queryset = queryset.filter(collection__in=collections)
            self.stdout.write(f"Filtering by {collection_count} collections")

        # Get total count for progress reporting
        total_count = queryset.count()
        if total_count == 0:
            self.stdout.write(self.style.WARNING(f"No {model_name} records found matching the criteria"))
            return

        self.stdout.write(f"Found {total_count} records to export")

        # Slicing an unordered queryset gives no guarantee that consecutive
        # slices are disjoint, so rows could be duplicated or skipped. Keep any
        # ordering the model already defines; otherwise page by primary key.
        if not queryset.ordered:
            queryset = queryset.order_by("pk")

        # Define all fields to export
        base_fields = [
            "url",
            "scraped_title",
            "scraped_text",
            "generated_title",
            "visited",
            "document_type",
            "division",
        ]

        # Add paired field tags separately
        tag_fields = ["tdamm_tag_manual", "tdamm_tag_ml"]

        # Add model-specific fields
        model_specific_fields = ["to_delete"] if model_name == "DeltaUrl" else []

        # Add collection field
        collection_fields = ["collection__name", "collection__config_folder"]

        # Combine all fields
        fields = base_fields + tag_fields + model_specific_fields + collection_fields

        # Get choice dictionaries for lookup
        choice_labels = {
            "document_type": dict(DocumentTypes.choices),
            "division": dict(Divisions.choices),
        }

        # Write to CSV
        try:
            with open(output_file, "w", newline="", encoding="utf-8") as csv_file:
                writer = csv.writer(csv_file)

                # Write header
                writer.writerow(fields)

                # Write data rows in batches
                processed_count = 0
                for start in range(0, total_count, batch_size):
                    for obj in queryset[start : start + batch_size]:
                        writer.writerow(self._build_row(obj, fields, tag_fields, choice_labels))
                        processed_count += 1

                    # Report progress
                    progress_pct = processed_count / total_count * 100
                    self.stdout.write(f"Processed {processed_count}/{total_count} records ({progress_pct:.1f}%)")

            self.stdout.write(self.style.SUCCESS(f"Successfully exported {processed_count} URLs to {output_file}"))

        except Exception as e:
            self.stderr.write(self.style.ERROR(f"Error exporting to CSV: {e}"))
            raise

0 commit comments

Comments
 (0)