
Commit 495bd5e

Merge pull request #4531 from fedspendingtransparency/ftr/dev-12528-spark-download-zipping
[DEV-12528] Remove Hadoop copy merge step from Spark downloads
2 parents: f2e3b82 + b5e23fb

6 files changed (+141, -181 lines)

usaspending_api/common/etl/spark.py

Lines changed: 51 additions & 112 deletions
@@ -6,17 +6,18 @@
 """
 
 import logging
+import math
 import time
 from collections import namedtuple
 from itertools import chain
 from typing import List
 
-from py4j.protocol import Py4JError
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.functions import col, concat, concat_ws, expr, lit, regexp_replace, to_date, transform, when
 from pyspark.sql.types import ArrayType, DecimalType, StringType, StructType
 
 from usaspending_api.accounts.models import AppropriationAccountBalances, FederalAccount, TreasuryAppropriationAccount
+from usaspending_api.common.helpers.s3_helpers import rename_s3_object, retrieve_s3_bucket_object_list
 from usaspending_api.common.helpers.spark_helpers import (
     get_broker_jdbc_url,
     get_jdbc_connection_properties,
@@ -97,7 +98,6 @@ def extract_db_data_frame(
     is_date_partitioning_col: bool = False,
     custom_schema: StructType = None,
 ) -> DataFrame:
-
     logger.info(f"Getting partition bounds using SQL:\n{min_max_sql}")
 
     data_df = None
@@ -427,7 +427,7 @@ def diff(
     cols_to_show = (
         ["diff"]
         + [f"l.{unique_key_col}", f"r.{unique_key_col}"]
-        + list(chain(*zip([f"l.{c}" for c in compare_cols], [f"r.{c}" for c in compare_cols])))
+        + list(chain(*zip([f"l.{c}" for c in compare_cols], [f"r.{c}" for c in compare_cols], strict=False)))
     )
     differences = differences.select(*cols_to_show)
     if not include_unchanged_rows:
@@ -588,7 +588,6 @@ def write_csv_file(
     spark: SparkSession,
     df: DataFrame,
     parts_dir: str,
-    num_partitions: int,
     max_records_per_file=EXCEL_ROW_LIMIT,
     overwrite=True,
     logger=None,
@@ -599,7 +598,6 @@ def write_csv_file(
         spark: passed-in active SparkSession
         df: the DataFrame wrapping the data source to be dumped to CSV.
         parts_dir: Path to dir that will contain the outputted parts files from partitions
-        num_partitions: Indicates the number of partitions to use when writing the Dataframe
        overwrite: Whether to replace the file CSV files if they already exist by that name
        max_records_per_file: Suggestion to Spark of how many records to put in each written CSV file part,
            if it will end up writing multiple files.
@@ -617,12 +615,13 @@
     start = time.time()
     logger.info(f"Writing source data DataFrame to csv part files for file {parts_dir}...")
     df_record_count = df.count()
+    num_partitions = math.ceil(df_record_count / max_records_per_file) or 1
     df.repartition(num_partitions).write.options(
         # NOTE: this is a suggestion, to be used by Spark if partitions yield multiple files
         maxRecordsPerFile=max_records_per_file,
     ).csv(
         path=parts_dir,
-        header=False,
+        header=True,
         emptyValue="",  # "" creates the output of ,,, for null values to match behavior of previous Postgres job
         escape='"',  # " is used to escape the 'quote' character setting (which defaults to "). Escaped quote = ""
         ignoreLeadingWhiteSpace=False,  # must set for CSV write, as it defaults to true
@@ -636,112 +635,6 @@
     return df_record_count
 
 
-def hadoop_copy_merge(
-    spark: SparkSession,
-    parts_dir: str,
-    header: str,
-    part_merge_group_size: int,
-    logger=None,
-    file_format="csv",
-) -> List[str]:
-    """PySpark impl of Hadoop 2.x copyMerge() (deprecated in Hadoop 3.x)
-    Merges files from a provided input directory and then redivides them
-    into multiple files based on merge group size.
-    Args:
-        spark: passed-in active SparkSession
-        parts_dir: Path to the dir that contains the input parts files. The parts dir name
-            determines the name of the merged files. Parts_dir cannot have a trailing slash.
-        header: A comma-separated list of field names, to be placed as the first row of every final CSV file.
-            Individual part files must NOT therefore be created with their own header.
-        part_merge_group_size: Final CSV data will be subdivided into numbered files. This indicates how many part files
-            should be combined into a numbered file.
-        logger: The logger to use. If one note provided (e.g. to log to console or stdout) the underlying JVM-based
-            Logger will be extracted from the ``spark`` ``SparkSession`` and used as the logger.
-        file_format: The format of the part files and the format of the final merged file, e.g. "csv"
-
-    Returns:
-        A list of file paths where each element in the list denotes a path to
-        a merged file that was generated during the copy merge.
-    """
-    overwrite = True
-    hadoop = spark.sparkContext._jvm.org.apache.hadoop
-    conf = spark.sparkContext._jsc.hadoopConfiguration()
-
-    # Guard against incorrectly formatted argument value
-    parts_dir = parts_dir.rstrip("/")
-
-    parts_dir_path = hadoop.fs.Path(parts_dir)
-
-    fs = parts_dir_path.getFileSystem(conf)
-
-    if not fs.exists(parts_dir_path):
-        raise ValueError("Source directory {} does not exist".format(parts_dir))
-
-    file = parts_dir
-    file_path = hadoop.fs.Path(file)
-
-    # Don't delete first if disallowing overwrite.
-    if not overwrite and fs.exists(file_path):
-        raise Py4JError(
-            spark._jvm.org.apache.hadoop.fs.FileAlreadyExistsException(f"{str(file_path)} " f"already exists")
-        )
-    part_files = []
-
-    for f in fs.listStatus(parts_dir_path):
-        if f.isFile():
-            # Sometimes part files can be empty, we need to ignore them
-            if f.getLen() == 0:
-                continue
-            file_path = f.getPath()
-            if file_path.getName().startswith("_"):
-                logger.debug(f"Skipping non-part file: {file_path.getName()}")
-                continue
-            logger.debug(f"Including part file: {file_path.getName()}")
-            part_files.append(f.getPath())
-    if not part_files:
-        logger.warning("Source directory is empty with no part files. Attempting creation of file with CSV header only")
-        out_stream = None
-        try:
-            merged_file_path = f"{parts_dir}.{file_format}"
-            out_stream = fs.create(hadoop.fs.Path(merged_file_path), overwrite)
-            out_stream.writeBytes(header + "\n")
-        finally:
-            if out_stream is not None:
-                out_stream.close()
-        return [merged_file_path]
-
-    part_files.sort(key=lambda f: str(f))  # put parts in order by part number for merging
-    paths_to_merged_files = []
-    for parts_file_group in _merge_grouper(part_files, part_merge_group_size):
-        part_suffix = f"_{str(parts_file_group.part).zfill(2)}" if parts_file_group.part else ""
-        partial_merged_file = f"{parts_dir}.partial{part_suffix}"
-        partial_merged_file_path = hadoop.fs.Path(partial_merged_file)
-        merged_file_path = f"{parts_dir}{part_suffix}.{file_format}"
-        paths_to_merged_files.append(merged_file_path)
-        # Make path a hadoop path because we are working with a hadoop file system
-        merged_file_path = hadoop.fs.Path(merged_file_path)
-        if overwrite and fs.exists(merged_file_path):
-            fs.delete(merged_file_path, True)
-        out_stream = None
-        try:
-            if fs.exists(partial_merged_file_path):
-                fs.delete(partial_merged_file_path, True)
-            out_stream = fs.create(partial_merged_file_path)
-            out_stream.writeBytes(header + "\n")
-            _merge_file_parts(fs, out_stream, conf, hadoop, partial_merged_file_path, parts_file_group.file_list)
-        finally:
-            if out_stream is not None:
-                out_stream.close()
-        try:
-            fs.rename(partial_merged_file_path, merged_file_path)
-        except Exception:
-            if fs.exists(partial_merged_file_path):
-                fs.delete(partial_merged_file_path, True)
-            logger.exception("Exception encountered. See logs")
-            raise
-    return paths_to_merged_files
-
-
 def _merge_file_parts(fs, out_stream, conf, hadoop, partial_merged_file_path, part_file_list):
     """Read-in files in alphabetical order and append them one by one to the merged file"""
 
@@ -767,3 +660,49 @@ def _merge_grouper(items, group_size):
     group_generator = (items[i : i + group_size] for i in range(0, len(items), group_size))
     for i, group in enumerate(group_generator, start=1):
         yield FileMergeGroup(i, group)
+
+
+def rename_part_files(
+    bucket_name: str,
+    destination_file_name: str,
+    logger: logging.Logger,
+    temp_download_dir_name: str = "temp_download",
+    file_format: str = "csv",
+) -> list[str]:
+    """Renames the part-000.csv files to match the zip filename structure.
+
+    Args:
+        bucket_name: S3 bucket that contains the file to be renamed and will contain the renamed file.
+        destination_file_name: Timestamped download file name. This is used to find the correct folder within the
+            bucket.
+        logger: Logger instance.
+        temp_download_dir_name: Name of the folder to used to store the renamed CSV files before they are downloaded.
+            Defaults to "temp_download".
+        file_format: What file format to save the files in.
+            Defaults to "csv".
+
+    Returns:
+        A list of the full S3 paths for the CSV files.
+    """
+    list_of_part_files = sorted(
+        [
+            file.key
+            for file in retrieve_s3_bucket_object_list(bucket_name)
+            if (
+                file.key.startswith(f"{temp_download_dir_name}/{destination_file_name}/part-")
+                and file.key.endswith(file_format)
+            )
+        ]
+    )
+
+    full_file_paths = []
+
+    for index, part_file in enumerate(list_of_part_files):
+        old_key = f"{bucket_name}/{part_file}"
+        new_key = f"{temp_download_dir_name}/{destination_file_name}_{str(index + 1).zfill(2)}.{file_format}"
+        logger.info(f"Renaming {old_key} to {bucket_name}/{new_key}")
+
+        rename_s3_object(bucket_name=bucket_name, old_key=old_key, new_key=new_key)
+        full_file_paths.append(f"s3a://{bucket_name}/{new_key}")
+
+    return full_file_paths
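For reference, here is a minimal, hedged sketch (not code from the commit) of the two rules the rewritten `write_csv_file` and the new `rename_part_files` rely on: the partition count is now derived from the DataFrame's record count instead of being passed in, and each part file ends up renamed to a numbered `<destination_file_name>_NN.<format>` key. The `EXCEL_ROW_LIMIT` value and the sample file name below are assumptions for illustration only.

```python
import math

EXCEL_ROW_LIMIT = 1_000_000  # assumed stand-in for the constant imported from download_generation


def expected_partitions(df_record_count: int, max_records_per_file: int = EXCEL_ROW_LIMIT) -> int:
    # Mirrors the new line in write_csv_file: one partition per max_records_per_file records,
    # and never zero partitions, even for an empty DataFrame.
    return math.ceil(df_record_count / max_records_per_file) or 1


def renamed_key(temp_dir: str, destination_file_name: str, index: int, file_format: str = "csv") -> str:
    # Mirrors the key pattern built inside rename_part_files for the (index + 1)-th part file.
    return f"{temp_dir}/{destination_file_name}_{str(index + 1).zfill(2)}.{file_format}"


print(expected_partitions(0))          # 1  (at least one partition, even with no records)
print(expected_partitions(2_500_000))  # 3
print(renamed_key("temp_download", "All_PrimeTransactions_2024-01-01_H01M00S00", 0))
# temp_download/All_PrimeTransactions_2024-01-01_H01M00S00_01.csv
```

Because every part file is now written with `header=True`, each renamed file is a complete CSV on its own, which is what makes the copy-merge (and its header-prepending) step removable.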

usaspending_api/common/helpers/download_csv_strategies.py

Lines changed: 24 additions & 16 deletions
@@ -8,14 +8,19 @@
 
 from django.conf import settings
 from pyspark.sql import DataFrame
+
 from usaspending_api.common.csv_helpers import count_rows_in_delimited_file
-from usaspending_api.common.helpers.s3_helpers import delete_s3_objects, download_s3_object
+from usaspending_api.common.etl.spark import rename_part_files
+from usaspending_api.common.helpers.s3_helpers import (
+    delete_s3_objects,
+    download_s3_object,
+)
 from usaspending_api.download.filestreaming.download_generation import (
     EXCEL_ROW_LIMIT,
-    split_and_zip_data_files,
-    wait_for_process,
     execute_psql,
     generate_export_query_temp_file,
+    split_and_zip_data_files,
+    wait_for_process,
 )
 from usaspending_api.download.filestreaming.zip_file import append_files_to_zip_file
 from usaspending_api.download.lookups import FILE_FORMATS
@@ -139,7 +144,7 @@ def download_to_csv(
             # we do not want to force all containers where
             # other strategies run to have pyspark installed when the strategy
             # doesn't require it.
-            from usaspending_api.common.etl.spark import hadoop_copy_merge, write_csv_file
+            from usaspending_api.common.etl.spark import write_csv_file
             from usaspending_api.common.helpers.spark_helpers import configure_spark_session, get_active_spark_session
 
             self.spark = None
@@ -172,26 +177,24 @@ def download_to_csv(
                 self.spark,
                 df,
                 parts_dir=s3_destination_path,
-                num_partitions=1,
                 max_records_per_file=EXCEL_ROW_LIMIT,
                 logger=self._logger,
                 delimiter=delimiter,
             )
             column_count = len(df.columns)
-            # When combining these later, will prepend the extracted header to each resultant file.
-            # The parts therefore must NOT have headers or the headers will show up in the data when combined.
-            header = ",".join([_.name for _ in df.schema.fields])
             self._logger.info("Concatenating partitioned output files ...")
-            merged_file_paths = hadoop_copy_merge(
-                spark=self.spark,
-                parts_dir=s3_destination_path,
-                header=header,
+            merged_file_paths = rename_part_files(
+                bucket_name=s3_bucket_name,
+                destination_file_name=destination_file_name,
                 logger=self._logger,
-                part_merge_group_size=1,
                 file_format=file_format,
             )
             final_csv_data_file_locations = self._move_data_csv_s3_to_local(
-                s3_bucket_name, merged_file_paths, s3_bucket_path, s3_bucket_sub_path, destination_path_dir
+                s3_bucket_name,
+                merged_file_paths,
+                s3_bucket_path,
+                s3_bucket_sub_path,
+                destination_path_dir,
             )
         except Exception:
             self._logger.exception("Exception encountered. See logs")
@@ -205,7 +208,12 @@ def download_to_csv(
         return CSVDownloadMetadata(final_csv_data_file_locations, record_count, column_count)
 
     def _move_data_csv_s3_to_local(
-        self, bucket_name, s3_file_paths, s3_bucket_path, s3_bucket_sub_path, destination_path_dir
+        self,
+        bucket_name: str,
+        s3_file_paths: list[str],
+        s3_bucket_path: str,
+        s3_bucket_sub_path: str,
+        destination_path_dir: str,
     ) -> List[str]:
         """Moves files from s3 data csv location to a location on the local machine.
 
@@ -216,7 +224,7 @@ def _move_data_csv_s3_to_local(
             s3_bucket_path: The bucket path, e.g. s3a:// + bucket name
             s3_bucket_sub_path: The path to the s3 files in the bucket, exluding s3a:// + bucket name, e.g. temp_directory/files
             destination_path_dir: The location to move those files from s3 to, must not include the
-                file name in the path. This path should be a diretory.
+                file name in the path. This path should be a directory.
 
         Returns:
             A list of the final location on the local machine that the
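A hedged sketch of how the Spark download strategy composes these pieces after this change: part files are written with headers and then renamed in S3 instead of being copy-merged. The wrapper function below and the exact construction of `parts_dir` are assumptions for illustration; `write_csv_file`, `rename_part_files`, and the `temp_download/<name>/part-*` key layout come from the diffs above.

```python
import logging

from pyspark.sql import DataFrame, SparkSession

from usaspending_api.common.etl.spark import rename_part_files, write_csv_file
from usaspending_api.download.filestreaming.download_generation import EXCEL_ROW_LIMIT


def spark_download_to_s3(
    spark: SparkSession,
    df: DataFrame,
    s3_bucket_name: str,
    destination_file_name: str,
    logger: logging.Logger,
    temp_download_dir_name: str = "temp_download",
) -> list[str]:
    # Assumed layout: the part files land under the same key prefix that rename_part_files
    # later scans for ("<temp_download_dir_name>/<destination_file_name>/part-").
    parts_dir = f"s3a://{s3_bucket_name}/{temp_download_dir_name}/{destination_file_name}"

    # Each part file now carries its own header row (header=True inside write_csv_file),
    # so no copy-merge step is needed to prepend one afterwards.
    write_csv_file(
        spark,
        df,
        parts_dir=parts_dir,
        max_records_per_file=EXCEL_ROW_LIMIT,
        logger=logger,
    )

    # The part-*.csv objects are renamed in place to "<destination_file_name>_01.csv",
    # "<destination_file_name>_02.csv", ... and their s3a:// paths are returned for the
    # existing download-to-local and zip steps.
    return rename_part_files(
        bucket_name=s3_bucket_name,
        destination_file_name=destination_file_name,
        logger=logger,
        temp_download_dir_name=temp_download_dir_name,
    )
```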

usaspending_api/common/helpers/s3_helpers.py

Lines changed: 23 additions & 5 deletions
@@ -1,15 +1,15 @@
-import boto3
 import io
 import logging
 import math
 import time
-
-from boto3.s3.transfer import TransferConfig, S3Transfer
-from botocore.exceptions import ClientError
-from django.conf import settings
 from pathlib import Path
 from typing import Optional
+
+import boto3
+from boto3.s3.transfer import S3Transfer, TransferConfig
 from botocore.client import BaseClient
+from botocore.exceptions import ClientError
+from django.conf import settings
 
 from usaspending_api.config import CONFIG
 
@@ -167,3 +167,21 @@ def delete_s3_objects(
     resp = s3_client.delete_objects(Bucket=bucket_name, Delete={"Objects": object_list})
 
     return len(resp.get("Deleted", []))
+
+
+def rename_s3_object(bucket_name: str, old_key: str, new_key: str, region_name: str = settings.USASPENDING_AWS_REGION):
+    """Rename an existing S3 object by:
+    1) Copying the file (old_key) to a new file with the new name (new_key)
+    2) If the copy was successful, delete the old file (old_key)
+    Args:
+        bucket_name: The name of the bucket where the current object is located.
+        old_key: The current name of the key to be renamed.
+        new_key: The new name of the key.
+        region_name: AWS region to use; defaults to the settings provided region.
+    """
+
+    s3 = _get_boto3("client", "s3", region_name=region_name)
+    response = s3.copy_object(Bucket=bucket_name, CopySource=old_key, Key=new_key)
+
+    if response["ResponseMetadata"]["HTTPStatusCode"] == 200:
+        s3.delete_object(Bucket=bucket_name, Key=old_key)
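S3 has no native rename operation, so the new helper renames by copying the object and then deleting the original. Below is a standalone sketch of the same copy-then-delete pattern with plain boto3; the bucket and key names are hypothetical, and unlike the real helper (which goes through this module's `_get_boto3` factory and passes `CopySource` as a string), it passes `CopySource` in dict form.

```python
import boto3


def rename_object_sketch(bucket: str, old_key: str, new_key: str) -> None:
    # Copy the object to its new key, then remove the old key only if the copy succeeded.
    s3 = boto3.client("s3")
    response = s3.copy_object(
        Bucket=bucket,
        CopySource={"Bucket": bucket, "Key": old_key},
        Key=new_key,
    )
    if response["ResponseMetadata"]["HTTPStatusCode"] == 200:
        s3.delete_object(Bucket=bucket, Key=old_key)


# Example (hypothetical keys, mirroring the renaming done in rename_part_files):
# rename_object_sketch(
#     "my-download-bucket",
#     "temp_download/My_Download/part-00000-abc123.csv",
#     "temp_download/My_Download_01.csv",
# )
```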
