Skip to content

Commit 0b3f61b

Browse files
authored
Merge pull request #1301 from NASA-IMPACT/staging
Staging to dev - CSV Export
2 parents fb09bbc + 4410186 commit 0b3f61b

File tree

2 files changed

+176
-0
lines changed

2 files changed

+176
-0
lines changed

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,3 +183,14 @@ For each PR made, an entry should be added to this changelog. It should contain
183183
- physics_of_the_cosmos
184184
- stsci_space_telescope_science_institute
185185
- Once the front end has been updated to allow for tag edits, all astrophysics collections will be marked to be run through the pipeline
186+
187+
- 1298-csv-export-command-for-urls
188+
- Description: Added a new Django management command to export URLs (DumpUrl, DeltaUrl, or CuratedUrl) to CSV files for analysis or backup purposes. The command allows filtering by collection and provides configurable export options.
189+
- Changes:
190+
- Created a new management command `export_urls_to_csv.py` to extract URL data to CSV format
191+
- Implemented options to filter exports by model type and specific collections
192+
- Added support for excluding full text content with the `--full_text` flag to reduce file size
193+
- Included proper handling for paired fields (tdamm_tag_manual, tdamm_tag_ml)
194+
- Added automatic creation of a dedicated `csv_exports` directory for storing export files
195+
- Implemented batched processing to efficiently handle large datasets
196+
- Added progress reporting during export operations
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
""""
2+
docker-compose -f local.yml run --rm django python manage.py export_urls_to_csv \
3+
--output physics_of_the_cosmos.csv --collections physics_of_the_cosmos
4+
5+
"""
6+
7+
import csv
8+
import json
9+
import os
10+
from pathlib import Path
11+
12+
from django.apps import apps
13+
from django.core.management.base import BaseCommand
14+
15+
from sde_collections.models.collection import Collection
16+
from sde_collections.models.collection_choice_fields import Divisions, DocumentTypes
17+
18+
19+
class Command(BaseCommand):
    """Management command that exports URL records to a CSV file.

    Supports the DumpUrl, DeltaUrl, and CuratedUrl models, optional filtering
    by collection config_folder, batched reads for large tables, and optional
    inclusion of the (potentially very large) scraped_text field.
    """

    help = "Export URLs from DumpUrl, DeltaUrl, or CuratedUrl models to CSV"

    def add_arguments(self, parser):
        """Register CLI options: model choice, collection filter, output path, batching, full text."""
        parser.add_argument(
            "--model",
            type=str,
            default="CuratedUrl",
            choices=["DumpUrl", "DeltaUrl", "CuratedUrl"],
            help="Model to export (default: CuratedUrl)",
        )
        parser.add_argument(
            "--collections", nargs="+", type=str, help="Collection config_folders to filter by (default: all)"
        )
        parser.add_argument(
            "--output", type=str, default="exported_urls.csv", help="Output CSV file path (default: exported_urls.csv)"
        )
        parser.add_argument(
            "--batch-size", type=int, default=1000, help="Number of records to process in each batch (default: 1000)"
        )
        parser.add_argument(
            "--full_text", action="store_true", default=False, help="Include full text in export (default: False)"
        )

    def handle(self, *args, **options):
        """Run the export: resolve the model, apply filters, and stream rows to CSV."""
        model_name = options["model"]
        collection_folders = options["collections"]
        output_file = options["output"]
        batch_size = options["batch_size"]

        # All exports land in a dedicated directory; only the basename of the
        # user-supplied path is kept so the file cannot escape csv_exports/.
        csv_exports_dir = Path("csv_exports")
        csv_exports_dir.mkdir(parents=True, exist_ok=True)
        output_file = os.path.join(csv_exports_dir, os.path.basename(output_file))

        self.stdout.write(f"Exporting {model_name} to {output_file}")

        # Resolve the model class dynamically from the app registry.
        model_class = apps.get_model("sde_collections", model_name)

        # FIX: order by primary key so the OFFSET/LIMIT batching below is
        # deterministic -- slicing an unordered queryset may repeat or skip
        # rows between batches.
        queryset = model_class.objects.all().order_by("pk")

        # Filter by collections if specified.
        if collection_folders:
            collections = Collection.objects.filter(config_folder__in=collection_folders)
            if not collections.exists():
                self.stderr.write(self.style.ERROR("No collections found with the specified folder names"))
                return
            queryset = queryset.filter(collection__in=collections)
            # FIX: count() instead of len() -- len() would fetch every
            # Collection row just to report a number.
            self.stdout.write(f"Filtering by {collections.count()} collections")

        # Total count drives progress reporting and the empty-result guard.
        total_count = queryset.count()
        if total_count == 0:
            self.stdout.write(self.style.WARNING(f"No {model_name} records found matching the criteria"))
            return

        self.stdout.write(f"Found {total_count} records to export")

        fields = self._build_field_list(model_name, options)

        # Choice dictionaries map stored values to human-readable labels.
        document_type_choices = dict(DocumentTypes.choices)
        division_choices = dict(Divisions.choices)
        tag_fields = ["tdamm_tag_manual", "tdamm_tag_ml"]

        try:
            with open(output_file, "w", newline="", encoding="utf-8") as csv_file:
                writer = csv.writer(csv_file)
                writer.writerow(fields)

                # Slice the queryset in batches so very large tables are never
                # loaded into memory at once.
                processed_count = 0
                for start in range(0, total_count, batch_size):
                    for obj in queryset[start : start + batch_size]:
                        writer.writerow(
                            [
                                self._format_value(obj, field, tag_fields, document_type_choices, division_choices)
                                for field in fields
                            ]
                        )
                        processed_count += 1

                    # Report progress once per batch.
                    progress_pct = processed_count / total_count * 100
                    self.stdout.write(f"Processed {processed_count}/{total_count} records ({progress_pct:.1f}%)")

            self.stdout.write(self.style.SUCCESS(f"Successfully exported {processed_count} URLs to {output_file}"))

        except Exception as e:
            self.stderr.write(self.style.ERROR(f"Error exporting to CSV: {e}"))
            raise

    def _build_field_list(self, model_name, options):
        """Return the ordered list of field names to export for *model_name*."""
        fields = [
            "url",
            "scraped_title",
            "generated_title",
            "document_type",
            "division",
        ]

        # scraped_text can be huge, so it is opt-in via --full_text.
        if options["full_text"]:
            fields.append("scraped_text")
            self.stdout.write("Including full text content in export")
        else:
            self.stdout.write("Excluding full text content from export (use --full_text to include)")

        # Paired tag fields (manual vs. ML-generated) are exported separately.
        fields += ["tdamm_tag_manual", "tdamm_tag_ml"]

        # DeltaUrl additionally tracks pending deletions.
        if model_name == "DeltaUrl":
            fields.append("to_delete")

        # Collection metadata gives each row traceability back to its source.
        fields += ["collection__name", "collection__config_folder"]
        return fields

    def _format_value(self, obj, field, tag_fields, document_type_choices, division_choices):
        """Convert one model field on *obj* into its CSV cell value."""
        if field == "collection__name":
            return obj.collection.name
        if field == "collection__config_folder":
            return obj.collection.config_folder
        if field == "document_type":
            doc_type = getattr(obj, field)
            # Prefer the human-readable label; fall back to the raw value.
            return document_type_choices.get(doc_type, str(doc_type)) if doc_type is not None else ""
        if field == "division":
            div = getattr(obj, field)
            return division_choices.get(div, str(div)) if div is not None else ""
        if field in tag_fields:
            tags = getattr(obj, field)
            # Serialize tag lists as JSON so they survive the CSV round trip.
            return json.dumps(tags) if tags else ""
        return getattr(obj, field, "")

0 commit comments

Comments
 (0)