
Commit 71cf024

Merge pull request #1312 from NASA-IMPACT/dev
Merge dev into staging
2 parents 4410186 + 54e7b08 commit 71cf024

File tree: 8 files changed (+629, −6 lines)

.github/ISSUE_TEMPLATE/config.yml

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+blank_issues_enabled: false

.pre-commit-config.yaml

Lines changed: 6 additions & 6 deletions

@@ -61,18 +61,18 @@ repos:
           - types-requests

   - repo: https://github.com/PyCQA/bandit
-    rev: '1.7.0'
+    rev: "1.7.0"
     hooks:
       - id: bandit
-        args: ['-r', '--configfile=bandit-config.yml']
+        args: ["-r", "--configfile=bandit-config.yml"]
+        additional_dependencies:
+          - pbr

   - repo: https://github.com/zricethezav/gitleaks
-    rev: 'v8.0.4'
+    rev: "v8.0.4"
     hooks:
       - id: gitleaks
-        args: ['--config=gitleaks-config.toml']
-
-
+        args: ["--config=gitleaks-config.toml"]

 ci:
   autoupdate_schedule: weekly

CHANGELOG.md

Lines changed: 6 additions & 0 deletions

@@ -12,7 +12,13 @@ For each PR made, an entry should be added to this changelog. It should contain
 - etc.

 ## Changelog
+### 3.1.??
+- 1232-process-the-full-text-dump
+  - Description: A script was added, `/scripts/sde_dump_processing/clean_text_dump.py`, which cleans text dumps from Sinequa. The Sinequa dump does not respect normal CSV newline formatting, so a dump of 1.8 million records becomes a CSV of 900 million lines. The script can detect the headers and process the dump for the three possible sources (TDAMM, SDE, and scripts) to create a final, clean CSV. It has a simple CLI for setting the input and output, the verbosity of the logs, etc. Because the input files can be very large, the script streams them instead of holding them in memory.
+  - Changes:
+    - add file `/scripts/sde_dump_processing/clean_text_dump.py`
+
 ### 3.1.0
 - 1209-bug-fix-document-type-creator-form
   - Description: The dropdown on the pattern creation form needs to default to the multi option, since the doc type creator form is used for the majority of multi-URL pattern creations. This should be applied to doc types, division types, and titles as well.
   - Changes:
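The new `clean_text_dump.py` itself is not part of this diff, so as a rough illustration of the streaming approach described in the changelog entry above, here is a minimal sketch. The record-boundary heuristic (a new record starts at a line beginning with a known source prefix), the prefix values, and the CLI flags are assumptions for illustration, not the real script's logic.

```python
# Illustrative sketch only: the real clean_text_dump.py is not shown in this commit.
# The source prefixes and the record-boundary rule below are assumptions.
import argparse


def clean_dump(input_path: str, output_path: str,
               record_prefixes: tuple[str, ...] = ("SDE", "TDAMM", "scripts")) -> None:
    """Stream the raw dump and re-join records whose text spills across many lines."""
    with open(input_path, encoding="utf-8") as infile, \
         open(output_path, "w", encoding="utf-8") as outfile:
        record: list[str] = []
        for raw_line in infile:  # streamed: the dump is never loaded into memory at once
            line = raw_line.rstrip("\n")
            # Assumed boundary: a new record begins when a line starts with a known source header.
            if line.startswith(record_prefixes) and record:
                outfile.write(" ".join(record) + "\n")
                record = []
            record.append(line)
        if record:
            outfile.write(" ".join(record) + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Clean a Sinequa text dump (illustrative sketch).")
    parser.add_argument("input", help="path to the raw Sinequa dump")
    parser.add_argument("output", help="path for the cleaned CSV")
    parser.add_argument("-v", "--verbose", action="store_true", help="print a summary when done")
    args = parser.parse_args()
    clean_dump(args.input, args.output)
    if args.verbose:
        print(f"Cleaned dump written to {args.output}")
```

Streaming line by line keeps memory usage flat regardless of input size, which is the point the changelog entry makes about very large dumps.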

config/settings/base.py

Lines changed: 1 addition & 0 deletions

@@ -102,6 +102,7 @@
     "http://sciencediscoveryengine.nasa.gov",
     "https://localhost:4200",
     "http://localhost:4200",
+    "https://science.data.nasa.gov/",
 ]

 # MIGRATIONS
Lines changed: 168 additions & 0 deletions

@@ -0,0 +1,168 @@
"""
This is meant to be run from within a shell. You can do it in the following way.

Establish a coding container:

```shell
tmux new -s docker_django
tmux attach -t docker_django
tmux kill-session -t docker_django
```

```bash
dmshell
```

Copy and paste this code into the shell and run it.

Getting the info out of the container:

```bash
docker cp 593dab064a15:/tmp/curated_urls_status.json ./curated_urls_status.json
```

Move it onto your local machine:

```bash
scp sde:/home/ec2-user/sde_indexing_helper/curated_urls_status.json .
```
"""

import concurrent.futures
import json
import os
from collections import defaultdict

from django.db import connection

from sde_collections.models.delta_url import CuratedUrl


def process_chunk(chunk_start, chunk_size, total_count):
    """Process a chunk of curated URLs and return data grouped by collection"""
    # Close any existing DB connections to avoid sharing connections between processes
    connection.close()

    # Get the chunk of data with collection information
    curated_urls_chunk = (
        CuratedUrl.objects.select_related("collection")
        .all()
        .with_exclusion_status()
        .order_by("url")[chunk_start : chunk_start + chunk_size]
    )

    # Group URLs by collection folder name
    collection_data = defaultdict(list)
    for url in curated_urls_chunk:
        collection_folder = url.collection.config_folder
        included = not url.excluded  # Convert to boolean inclusion status

        collection_data[collection_folder].append({"url": url.url, "included": included})

    # Save to a temporary file
    temp_path = f"/tmp/chunk{chunk_start}.json"
    with open(temp_path, "w") as f:
        json.dump(dict(collection_data), f)

    processed = min(chunk_start + chunk_size, total_count)
    print(f"Processed {processed}/{total_count} URLs")

    return temp_path


def export_curated_urls_with_status():
    """Export all curated URLs with their inclusion status, grouped by collection"""
    output_path = "/tmp/curated_urls_status.json"

    # Get the total count and status statistics
    curated_urls = CuratedUrl.objects.all().with_exclusion_status()
    total_count = curated_urls.count()
    excluded_count = curated_urls.filter(excluded=True).count()
    included_count = curated_urls.filter(excluded=False).count()

    print(f"Total URLs: {total_count}")
    print(f"  Excluded: {excluded_count}")
    print(f"  Included: {included_count}")

    # Define chunk size and calculate number of chunks
    chunk_size = 10000
    chunk_starts = list(range(0, total_count, chunk_size))

    # Process chunks in parallel
    temp_files = []
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        # Submit all tasks
        future_to_chunk = {
            executor.submit(process_chunk, chunk_start, chunk_size, total_count): chunk_start
            for chunk_start in chunk_starts
        }

        # Collect results as they complete
        for future in concurrent.futures.as_completed(future_to_chunk):
            chunk_start = future_to_chunk[future]
            try:
                temp_file = future.result()
                temp_files.append(temp_file)
            except Exception as e:
                print(f"Chunk starting at {chunk_start} generated an exception: {e}")

    # Combine all temp files into final output
    combined_data = {}

    # Sort temp files by chunk start position
    temp_files.sort(key=lambda x: int(os.path.basename(x).replace("chunk", "").split(".")[0]))

    for temp_file in temp_files:
        with open(temp_file) as infile:
            chunk_data = json.load(infile)
            # Merge chunk data into combined data
            for collection_folder, urls in chunk_data.items():
                if collection_folder not in combined_data:
                    combined_data[collection_folder] = []
                combined_data[collection_folder].extend(urls)

        # Clean up temp file
        os.unlink(temp_file)

    # Write the final combined data
    with open(output_path, "w") as outfile:
        json.dump(combined_data, outfile, indent=2)

    # Verify export completed successfully
    if os.path.exists(output_path):
        file_size_mb = os.path.getsize(output_path) / (1024 * 1024)
        print(f"Export complete. File saved to: {output_path}")
        print(f"File size: {file_size_mb:.2f} MB")

        # Sanity check: Count the total included and excluded URLs in the final file
        final_included = 0
        final_excluded = 0

        # Read the file back and count
        with open(output_path) as infile:
            file_data = json.load(infile)
            for collection_folder, urls in file_data.items():
                for url_data in urls:
                    if url_data["included"]:
                        final_included += 1
                    else:
                        final_excluded += 1

        print("\nSanity check on final file:")
        print(f"Total URLs in file: {final_included + final_excluded}")
        print(f"  Included: {final_included}")
        print(f"  Excluded: {final_excluded}")

        # Check if counts match
        if final_included == included_count and final_excluded == excluded_count:
            print("✅ Counts match database query results!")
        else:
            print("⚠️ Warning: Final counts don't match initial database query!")
            print(f"  Database included: {included_count}, File included: {final_included}")
            print(f"  Database excluded: {excluded_count}, File excluded: {final_excluded}")
    else:
        print("ERROR: Output file was not created!")


# Run the export function
export_curated_urls_with_status()
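For reference, the exported `/tmp/curated_urls_status.json` maps each collection's `config_folder` to a list of `{"url": ..., "included": ...}` entries. A minimal sample of that shape (the folder names and URLs here are hypothetical, not real data) looks like:

```json
{
  "example_collection_folder": [
    {"url": "https://example.nasa.gov/page-1", "included": true},
    {"url": "https://example.nasa.gov/page-2", "included": false}
  ],
  "another_collection_folder": [
    {"url": "https://example.nasa.gov/data", "included": true}
  ]
}
```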
