
Commit 71cf024

Merge pull request #1312 from NASA-IMPACT/dev
Merge dev into staging
2 parents 4410186 + 54e7b08 commit 71cf024

File tree: 8 files changed (+629, −6 lines)

.github/ISSUE_TEMPLATE/config.yml

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+blank_issues_enabled: false

.pre-commit-config.yaml

Lines changed: 6 additions & 6 deletions

@@ -61,18 +61,18 @@ repos:
           - types-requests

   - repo: https://github.com/PyCQA/bandit
-    rev: '1.7.0'
+    rev: "1.7.0"
     hooks:
       - id: bandit
-        args: ['-r', '--configfile=bandit-config.yml']
+        args: ["-r", "--configfile=bandit-config.yml"]
+        additional_dependencies:
+          - pbr

   - repo: https://github.com/zricethezav/gitleaks
-    rev: 'v8.0.4'
+    rev: "v8.0.4"
     hooks:
       - id: gitleaks
-        args: ['--config=gitleaks-config.toml']
-
-
+        args: ["--config=gitleaks-config.toml"]

 ci:
   autoupdate_schedule: weekly

CHANGELOG.md

Lines changed: 6 additions & 0 deletions

@@ -12,7 +12,13 @@ For each PR made, an entry should be added to this changelog. It should contain
 - etc.

 ## Changelog
+### 3.1.??
+- 1232-process-the-full-text-dump
+  - Description: A script was added, `/scripts/sde_dump_processing/clean_text_dump.py`, which cleans text dumps from Sinequa. The Sinequa dump does not respect normal CSV newline formatting, so a dump of 1.8 million records becomes a CSV of 900 million lines. The script can detect the headers and process the dump for the three possible sources (TDAMM, SDE, and scripts) to create a final, clean CSV. It has a simple CLI for setting the input and output, the verbosity of the logs, etc. Because the input files can be very large, the script streams them instead of holding them in memory.
+  - Changes:
+    - add file `/scripts/sde_dump_processing/clean_text_dump.py`
+
 ### 3.1.0
 - 1209-bug-fix-document-type-creator-form
   - Description: The dropdown on the pattern creation form needs to default to the multi option, since the doc type creator form is used for the majority of multi-URL pattern creations. This should be applied to doc types, division types, and titles as well.
   - Changes:
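The new `clean_text_dump.py` itself is not part of this diff, so as a rough illustration of the streaming approach described in the changelog entry above, here is a minimal sketch. The record-boundary heuristic (a new record starts at a line beginning with a known source prefix), the prefix values, and the CLI flags are assumptions for illustration, not the real script's logic.

```python
# Illustrative sketch only: the real clean_text_dump.py is not shown in this commit.
# The source prefixes and the record-boundary rule below are assumptions.
import argparse


def clean_dump(input_path: str, output_path: str,
               record_prefixes: tuple[str, ...] = ("SDE", "TDAMM", "scripts")) -> None:
    """Stream the raw dump and re-join records whose text spills across many lines."""
    with open(input_path, encoding="utf-8") as infile, \
         open(output_path, "w", encoding="utf-8") as outfile:
        record: list[str] = []
        for raw_line in infile:  # streamed: the dump is never loaded into memory at once
            line = raw_line.rstrip("\n")
            # Assumed boundary: a new record begins when a line starts with a known source header.
            if line.startswith(record_prefixes) and record:
                outfile.write(" ".join(record) + "\n")
                record = []
            record.append(line)
        if record:
            outfile.write(" ".join(record) + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Clean a Sinequa text dump (illustrative sketch).")
    parser.add_argument("input", help="path to the raw Sinequa dump")
    parser.add_argument("output", help="path for the cleaned CSV")
    parser.add_argument("-v", "--verbose", action="store_true", help="print a summary when done")
    args = parser.parse_args()
    clean_dump(args.input, args.output)
    if args.verbose:
        print(f"Cleaned dump written to {args.output}")
```

Streaming line by line keeps memory usage flat regardless of input size, which is the point the changelog entry makes about very large dumps.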

config/settings/base.py

Lines changed: 1 addition & 0 deletions

@@ -102,6 +102,7 @@
     "http://sciencediscoveryengine.nasa.gov",
     "https://localhost:4200",
     "http://localhost:4200",
+    "https://science.data.nasa.gov/",
 ]

 # MIGRATIONS
Lines changed: 168 additions & 0 deletions

@@ -0,0 +1,168 @@
"""
This is meant to be run from within a shell. You can do it in the following way.

Establish a coding container:

```shell
tmux new -s docker_django
tmux attach -t docker_django
tmux kill-session -t docker_django
```

```bash
dmshell
```

Copy and paste this code into the shell and run it.

Getting the info out of the container:

```bash
docker cp 593dab064a15:/tmp/curated_urls_status.json ./curated_urls_status.json
```

Move it onto your local machine:

```bash
scp sde:/home/ec2-user/sde_indexing_helper/curated_urls_status.json .
```
"""

import concurrent.futures
import json
import os
from collections import defaultdict

from django.db import connection

from sde_collections.models.delta_url import CuratedUrl


def process_chunk(chunk_start, chunk_size, total_count):
    """Process a chunk of curated URLs and return data grouped by collection"""
    # Close any existing DB connections to avoid sharing connections between processes
    connection.close()

    # Get the chunk of data with collection information
    curated_urls_chunk = (
        CuratedUrl.objects.select_related("collection")
        .all()
        .with_exclusion_status()
        .order_by("url")[chunk_start : chunk_start + chunk_size]
    )

    # Group URLs by collection folder name
    collection_data = defaultdict(list)
    for url in curated_urls_chunk:
        collection_folder = url.collection.config_folder
        included = not url.excluded  # Convert to boolean inclusion status

        collection_data[collection_folder].append({"url": url.url, "included": included})

    # Save to a temporary file
    temp_path = f"/tmp/chunk{chunk_start}.json"
    with open(temp_path, "w") as f:
        json.dump(dict(collection_data), f)

    processed = min(chunk_start + chunk_size, total_count)
    print(f"Processed {processed}/{total_count} URLs")

    return temp_path


def export_curated_urls_with_status():
    """Export all curated URLs with their inclusion status, grouped by collection"""
    output_path = "/tmp/curated_urls_status.json"

    # Get the total count and status statistics
    curated_urls = CuratedUrl.objects.all().with_exclusion_status()
    total_count = curated_urls.count()
    excluded_count = curated_urls.filter(excluded=True).count()
    included_count = curated_urls.filter(excluded=False).count()

    print(f"Total URLs: {total_count}")
    print(f"  Excluded: {excluded_count}")
    print(f"  Included: {included_count}")

    # Define chunk size and calculate number of chunks
    chunk_size = 10000
    chunk_starts = list(range(0, total_count, chunk_size))

    # Process chunks in parallel
    temp_files = []
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        # Submit all tasks
        future_to_chunk = {
            executor.submit(process_chunk, chunk_start, chunk_size, total_count): chunk_start
            for chunk_start in chunk_starts
        }

        # Collect results as they complete
        for future in concurrent.futures.as_completed(future_to_chunk):
            chunk_start = future_to_chunk[future]
            try:
                temp_file = future.result()
                temp_files.append(temp_file)
            except Exception as e:
                print(f"Chunk starting at {chunk_start} generated an exception: {e}")

    # Combine all temp files into final output
    combined_data = {}

    # Sort temp files by chunk start position
    temp_files.sort(key=lambda x: int(os.path.basename(x).replace("chunk", "").split(".")[0]))

    for temp_file in temp_files:
        with open(temp_file) as infile:
            chunk_data = json.load(infile)
            # Merge chunk data into combined data
            for collection_folder, urls in chunk_data.items():
                if collection_folder not in combined_data:
                    combined_data[collection_folder] = []
                combined_data[collection_folder].extend(urls)

        # Clean up temp file
        os.unlink(temp_file)

    # Write the final combined data
    with open(output_path, "w") as outfile:
        json.dump(combined_data, outfile, indent=2)

    # Verify export completed successfully
    if os.path.exists(output_path):
        file_size_mb = os.path.getsize(output_path) / (1024 * 1024)
        print(f"Export complete. File saved to: {output_path}")
        print(f"File size: {file_size_mb:.2f} MB")

        # Sanity check: Count the total included and excluded URLs in the final file
        final_included = 0
        final_excluded = 0

        # Read the file back and count
        with open(output_path) as infile:
            file_data = json.load(infile)
            for collection_folder, urls in file_data.items():
                for url_data in urls:
                    if url_data["included"]:
                        final_included += 1
                    else:
                        final_excluded += 1

        print("\nSanity check on final file:")
        print(f"Total URLs in file: {final_included + final_excluded}")
        print(f"  Included: {final_included}")
        print(f"  Excluded: {final_excluded}")

        # Check if counts match
        if final_included == included_count and final_excluded == excluded_count:
            print("✅ Counts match database query results!")
        else:
            print("⚠️ Warning: Final counts don't match initial database query!")
            print(f"  Database included: {included_count}, File included: {final_included}")
            print(f"  Database excluded: {excluded_count}, File excluded: {final_excluded}")
    else:
        print("ERROR: Output file was not created!")


# Run the export function
export_curated_urls_with_status()
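For reference, the exported `/tmp/curated_urls_status.json` maps each collection's `config_folder` to a list of `{"url": ..., "included": ...}` entries. A minimal sample of that shape (the folder names and URLs here are hypothetical, not real data) looks like:

```json
{
  "example_collection_folder": [
    {"url": "https://example.nasa.gov/page-1", "included": true},
    {"url": "https://example.nasa.gov/page-2", "included": false}
  ],
  "another_collection_folder": [
    {"url": "https://example.nasa.gov/data", "included": true}
  ]
}
```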
