
Commit e518189

Merge pull request #1303 from NASA-IMPACT/url_dump_script
add script to dump curated url list with excludes
2 parents 0b3f61b + c0e017e commit e518189

File tree

1 file changed: +168 −0 lines changed

@@ -0,0 +1,168 @@
"""
This script is meant to be run from within a shell on the server. A typical workflow looks like this:

Establish a coding container. Start, reattach to, or tear down the tmux session as needed (so the run
survives a dropped connection):

```shell
tmux new -s docker_django
tmux attach -t docker_django
tmux kill-session -t docker_django
```

Then open a shell inside the container:

```bash
dmshell
```

Copy and paste this script into that shell and run it.

Get the output file out of the container (the container ID shown here will differ; check `docker ps`):

```bash
docker cp 593dab064a15:/tmp/curated_urls_status.json ./curated_urls_status.json
```

Move the file from the server onto your local machine:

```bash
scp sde:/home/ec2-user/sde_indexing_helper/curated_urls_status.json .
```
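
The output is a single JSON object keyed by each collection's config folder, listing every curated URL
with its inclusion status. A small illustrative sample of the structure (the folder name and URLs below
are placeholders, not real data):

```json
{
    "example_collection_folder": [
        {"url": "https://example.nasa.gov/dataset/1", "included": true},
        {"url": "https://example.nasa.gov/internal/admin", "included": false}
    ]
}
```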
"""

import concurrent.futures
import json
import os
from collections import defaultdict

from django.db import connection

from sde_collections.models.delta_url import CuratedUrl


def process_chunk(chunk_start, chunk_size, total_count):
    """Process a chunk of curated URLs and return data grouped by collection"""
    # Close any existing DB connections to avoid sharing connections between processes
    connection.close()

    # Get the chunk of data with collection information
    curated_urls_chunk = (
        CuratedUrl.objects.select_related("collection")
        .all()
        .with_exclusion_status()
        .order_by("url")[chunk_start : chunk_start + chunk_size]
    )

    # Group URLs by collection folder name
    collection_data = defaultdict(list)
    for url in curated_urls_chunk:
        collection_folder = url.collection.config_folder
        included = not url.excluded  # Convert to boolean inclusion status

        collection_data[collection_folder].append({"url": url.url, "included": included})

    # Save to a temporary file
    temp_path = f"/tmp/chunk{chunk_start}.json"
    with open(temp_path, "w") as f:
        json.dump(dict(collection_data), f)

    processed = min(chunk_start + chunk_size, total_count)
    print(f"Processed {processed}/{total_count} URLs")

    return temp_path


def export_curated_urls_with_status():
    """Export all curated URLs with their inclusion status, grouped by collection"""
    output_path = "/tmp/curated_urls_status.json"

    # Get the total count and status statistics
    curated_urls = CuratedUrl.objects.all().with_exclusion_status()
    total_count = curated_urls.count()
    excluded_count = curated_urls.filter(excluded=True).count()
    included_count = curated_urls.filter(excluded=False).count()

    print(f"Total URLs: {total_count}")
    print(f" Excluded: {excluded_count}")
    print(f" Included: {included_count}")

    # Define chunk size and calculate number of chunks
    chunk_size = 10000
    chunk_starts = list(range(0, total_count, chunk_size))
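    # e.g. with total_count = 25_000, chunk_starts == [0, 10000, 20000]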

    # Process chunks in parallel
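    # Each worker runs process_chunk in a separate process; after process_chunk closes
    # any connection inherited from the parent, Django opens a fresh one in that process.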
    temp_files = []
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        # Submit all tasks
        future_to_chunk = {
            executor.submit(process_chunk, chunk_start, chunk_size, total_count): chunk_start
            for chunk_start in chunk_starts
        }

        # Collect results as they complete
        for future in concurrent.futures.as_completed(future_to_chunk):
            chunk_start = future_to_chunk[future]
            try:
                temp_file = future.result()
                temp_files.append(temp_file)
            except Exception as e:
                print(f"Chunk starting at {chunk_start} generated an exception: {e}")

    # Combine all temp files into final output
    combined_data = {}

    # Sort temp files by chunk start position
    temp_files.sort(key=lambda x: int(os.path.basename(x).replace("chunk", "").split(".")[0]))
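    # e.g. "/tmp/chunk10000.json" parses to 10000, so chunks are merged back in ascending start order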

    for temp_file in temp_files:
        with open(temp_file) as infile:
            chunk_data = json.load(infile)
            # Merge chunk data into combined data
            for collection_folder, urls in chunk_data.items():
                if collection_folder not in combined_data:
                    combined_data[collection_folder] = []
                combined_data[collection_folder].extend(urls)

        # Clean up temp file
        os.unlink(temp_file)

    # Write the final combined data
    with open(output_path, "w") as outfile:
        json.dump(combined_data, outfile, indent=2)

    # Verify export completed successfully
    if os.path.exists(output_path):
        file_size_mb = os.path.getsize(output_path) / (1024 * 1024)
        print(f"Export complete. File saved to: {output_path}")
        print(f"File size: {file_size_mb:.2f} MB")

        # Sanity check: Count the total included and excluded URLs in the final file
        final_included = 0
        final_excluded = 0

        # Read the file back and count
        with open(output_path) as infile:
            file_data = json.load(infile)
            for collection_folder, urls in file_data.items():
                for url_data in urls:
                    if url_data["included"]:
                        final_included += 1
                    else:
                        final_excluded += 1

        print("\nSanity check on final file:")
        print(f"Total URLs in file: {final_included + final_excluded}")
        print(f" Included: {final_included}")
        print(f" Excluded: {final_excluded}")

        # Check if counts match
        if final_included == included_count and final_excluded == excluded_count:
            print("✅ Counts match database query results!")
        else:
            print("⚠️ Warning: Final counts don't match initial database query!")
            print(f" Database included: {included_count}, File included: {final_included}")
            print(f" Database excluded: {excluded_count}, File excluded: {final_excluded}")
    else:
        print("ERROR: Output file was not created!")


# Run the export function
export_curated_urls_with_status()
