From 4288026a8cea99e1b9ad17ef5ae42584668c6f87 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Fri, 31 Jan 2025 16:07:28 -0600 Subject: [PATCH 01/43] [DEV-11770] - Add account_download create and load commands --- .../download/delta_models/account_download.py | 383 ++++++++++++++++++ .../commands/load_query_to_delta.py | 25 ++ 2 files changed, 408 insertions(+) create mode 100644 usaspending_api/download/delta_models/account_download.py diff --git a/usaspending_api/download/delta_models/account_download.py b/usaspending_api/download/delta_models/account_download.py new file mode 100644 index 0000000000..094b9e06f4 --- /dev/null +++ b/usaspending_api/download/delta_models/account_download.py @@ -0,0 +1,383 @@ +ACCOUNT_DOWNLOAD_COLUMNS_TEST = {"award_id_piid": "STRING"} + +ACCOUNT_DOWNLOAD_COLUMNS = { + "owning_agency_name": "STRING", + "federal_account_symbol": "STRING", + "federal_account_name": "STRING", + "agency_identifier_name": "STRING", + "program_activity_code": "STRING", + "program_activity_name": "STRING", + "object_class_code": "STRING", + "object_class_name": "STRING", + "direct_or_reimbursable_funding_source": "STRING", + "disaster_emergency_fund_code": "STRING", + "disaster_emergency_fund_name": "STRING", + "award_unique_key": "STRING", + "award_id_piid": "STRING", + "parent_award_id_piid": "STRING", + "award_id_fain": "STRING", + "award_id_uri": "STRING", + "award_base_action_date": "DATE", + "award_latest_action_date": "DATE", + "period_of_performance_start_date": "DATE", + "period_of_performance_current_end_date": "DATE", + "ordering_period_end_date": "DATE", + "idv_type_code": "STRING", + "idv_type": "STRING", + "prime_award_base_transaction_description": "STRING", + "awarding_agency_code": "STRING", + "awarding_agency_name": "STRING", + "awarding_subagency_code": "STRING", + "awarding_subagency_name": "STRING", + "awarding_office_code": "STRING", + "awarding_office_name": "STRING", + "funding_agency_code": "STRING", + "funding_agency_name": "STRING", + "funding_sub_agency_code": "STRING", + "funding_sub_agency_name": "STRING", + "funding_office_code": "STRING", + "funding_office_name": "STRING", + "recipient_uei": "STRING", + "recipient_duns": "STRING", + "recipient_name": "STRING", + "recipient_name_raw": "STRING", + "recipient_parent_uei": "STRING", + "recipient_parent_duns": "STRING", + "recipient_parent_name": "STRING", + "recipient_parent_name_raw": "STRING", + "recipient_country": "STRING", + "recipient_state": "STRING", + "recipient_county": "STRING", + "recipient_city": "STRING", + "primary_place_of_performance_country": "STRING", + "primary_place_of_performance_state": "STRING", + "primary_place_of_performance_county": "STRING", + "primary_place_of_performance_zip_code": "STRING", + "cfda_number": "STRING", + "cfda_title": "STRING", + "product_or_service_code": "STRING", + "product_or_service_code_description": "STRING", + "naics_code": "STRING", + "naics_description": "STRING", + "national_interest_action_code": "STRING", + "national_interest_action": "STRING", + "reporting_agency_name": "STRING", + "submission_period": "STRING", + "budget_function": "STRING", + "budget_subfunction": "STRING", + "transaction_obligated_amount": "NUMERIC(23,2)", + "gross_outlay_amount_fyb_to_period_end": "NUMERIC(23,2)", + "ussgl487200_downward_adj_prior_year_prepaid_undeliv_order_oblig": "NUMERIC(23,2)", + "ussgl497200_downward_adj_of_prior_year_paid_deliv_orders_oblig": "NUMERIC(23,2)", + "award_base_action_date_fiscal_year": "INTEGER", + 
"award_latest_action_date_fiscal_year": "INTEGER", + "award_type_code": "STRING", + "award_type": "STRING", + "prime_award_summary_recipient_cd_original": "STRING", + "prime_award_summary_recipient_cd_current": "STRING", + "recipient_zip_code": "STRING", + "prime_award_summary_place_of_performance_cd_original": "STRING", + "prime_award_summary_place_of_performance_cd_current": "STRING", + "usaspending_permalink": "STRING", + "last_modified_date": "DATE", +} + +account_download_create_sql_string = rf""" + CREATE OR REPLACE TABLE {{DESTINATION_TABLE}} ( + {", ".join([f'{key} {val}' for key, val in ACCOUNT_DOWNLOAD_COLUMNS.items()])} + ) + USING DELTA + LOCATION 's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}' + """ + +account_download_load_sql_string = rf""" + INSERT OVERWRITE {{DESTINATION_DATABASE}}.{{DESTINATION_TABLE}} ( + {",".join(list(ACCOUNT_DOWNLOAD_COLUMNS))} + ) + SELECT + toptier_agency.name AS owning_agency_name, + federal_account.federal_account_code AS federal_account_symbol, + federal_account.account_title AS federal_account_name, + cgac_aid.agency_name AS agency_identifier_name, + ref_program_activity.program_activity_code, + ref_program_activity.program_activity_name, + object_class.object_class AS object_class_code, + object_class.object_class_name, + object_class.direct_reimbursable AS direct_or_reimbursable_funding_source, + financial_accounts_by_awards.disaster_emergency_fund_code, + disaster_emergency_fund_code.title AS disaster_emergency_fund_name, + award_search.generated_unique_award_id AS award_unique_key, + financial_accounts_by_awards.piid AS award_id_piid, + financial_accounts_by_awards.parent_award_id AS parent_award_id_piid, + financial_accounts_by_awards.fain AS award_id_fain, + financial_accounts_by_awards.uri AS award_id_uri, + award_search.date_signed AS award_base_action_date, + award_search.certified_date AS award_latest_action_date, + award_search.period_of_performance_start_date, + award_search.period_of_performance_current_end_date, + transaction_search.ordering_period_end_date, + transaction_search.idv_type AS idv_type_code, + transaction_search.idv_type_description AS idv_type, + award_search.description AS prime_award_base_transaction_description, + transaction_search.awarding_agency_code, + transaction_search.awarding_toptier_agency_name_raw AS awarding_agency_name, + transaction_search.awarding_sub_tier_agency_c AS awarding_subagency_code, + transaction_search.awarding_subtier_agency_name_raw AS awarding_subagency_name, + transaction_search.awarding_office_code, + transaction_search.awarding_office_name, + transaction_search.funding_agency_code, + transaction_search.funding_toptier_agency_name_raw AS funding_agency_name, + transaction_search.funding_sub_tier_agency_co AS funding_sub_agency_code, + transaction_search.funding_subtier_agency_name_raw AS funding_sub_agency_name, + transaction_search.funding_office_code, + transaction_search.funding_office_name, + transaction_search.recipient_uei, + transaction_search.recipient_unique_id AS recipient_duns, + transaction_search.recipient_name, + transaction_search.recipient_name_raw, + transaction_search.parent_uei AS recipient_parent_uei, + transaction_search.parent_uei AS recipient_parent_duns, + transaction_search.parent_recipient_name AS recipient_parent_name, + transaction_search.parent_recipient_name_raw AS recipient_parent_name_raw, + transaction_search.recipient_location_country_code AS recipient_country, + 
transaction_search.recipient_location_state_code AS recipient_state, + transaction_search.recipient_location_county_name AS recipient_county, + transaction_search.recipient_location_city_name AS recipient_city, + transaction_search.pop_country_name AS primary_place_of_performance_country, + transaction_search.pop_state_name AS primary_place_of_performance_state, + transaction_search.pop_county_name AS primary_place_of_performance_county, + transaction_search.place_of_performance_zip4a AS primary_place_of_performance_zip_code, + transaction_search.cfda_number, + transaction_search.cfda_title, + transaction_search.product_or_service_code, + transaction_search.product_or_service_description AS product_or_service_code_description, + transaction_search.naics_code, + transaction_search.naics_description, + transaction_search.national_interest_action AS national_interest_action_code, + transaction_search.national_interest_desc AS national_interest_action, + submission_attributes.reporting_agency_name AS reporting_agency_name, + CASE + WHEN submission_attributes.quarter_format_flag = TRUE + THEN + CONCAT( + CAST('FY' AS STRING), + CAST(submission_attributes.reporting_fiscal_year AS STRING), + CAST('Q' AS STRING), + CAST( + submission_attributes.reporting_fiscal_quarter AS STRING + ) + ) + ELSE + CONCAT( + CAST('FY' AS STRING), + CAST(submission_attributes.reporting_fiscal_year AS STRING), + CAST('P' AS STRING), + LPAD( + CAST( + submission_attributes.reporting_fiscal_period AS STRING + ), + 2, + '0' + ) + ) + END AS submission_period, + treasury_appropriation_account.budget_function_title AS budget_function, + treasury_appropriation_account.budget_subfunction_title AS budget_subfunction, + financial_accounts_by_awards.transaction_obligated_amount AS transaction_obligated_amount, + CASE + WHEN + ( + ( + submission_attributes.quarter_format_flag = TRUE + AND submission_attributes.reporting_fiscal_quarter = 4 + ) + OR ( + submission_attributes.quarter_format_flag = FALSE + AND submission_attributes.reporting_fiscal_period = 12 + ) + ) AND submission_attributes.reporting_fiscal_year = 2021 + THEN + financial_accounts_by_awards.gross_outlay_amount_by_award_cpe + ELSE CAST(NULL AS NUMERIC(23, 2)) + END AS gross_outlay_amount_fyb_to_period_end, + CASE + WHEN + ( + ( + submission_attributes.quarter_format_flag = TRUE + AND submission_attributes.reporting_fiscal_quarter = 4 + ) + OR ( + submission_attributes.quarter_format_flag = FALSE + AND submission_attributes.reporting_fiscal_period = 12 + ) + ) AND submission_attributes.reporting_fiscal_year = 2021 + THEN + financial_accounts_by_awards.ussgl487200_down_adj_pri_ppaid_undel_orders_oblig_refund_cpe + ELSE CAST(NULL AS NUMERIC(23, 2)) + END AS ussgl487200_downward_adj_prior_year_prepaid_undeliv_order_oblig, + CASE + WHEN + ( + ( + submission_attributes.quarter_format_flag = TRUE + AND submission_attributes.reporting_fiscal_quarter = 4 + ) + OR ( + submission_attributes.quarter_format_flag = FALSE + AND submission_attributes.reporting_fiscal_period = 12 + ) + ) AND submission_attributes.reporting_fiscal_year = 2021 + THEN + financial_accounts_by_awards.ussgl497200_down_adj_pri_paid_deliv_orders_oblig_refund_cpe + ELSE CAST(NULL AS NUMERIC(23, 2)) + END AS ussgl497200_downward_adj_of_prior_year_paid_deliv_orders_oblig, + EXTRACT( + YEAR FROM (award_search.date_signed) + INTERVAL '3 months' + ) AS award_base_action_date_fiscal_year, + EXTRACT( + YEAR FROM (award_search.certified_date) + INTERVAL '3 months' + ) AS award_latest_action_date_fiscal_year, + 
COALESCE( + transaction_search.contract_award_type, + transaction_search.type + ) AS award_type_code, + COALESCE( + transaction_search.contract_award_type_desc, + transaction_search.type_description + ) AS award_type, + CASE + WHEN + transaction_search.recipient_location_state_code IS NOT NULL + AND transaction_search.recipient_location_congressional_code IS NOT NULL + AND NOT ( + transaction_search.recipient_location_state_code = '' + AND transaction_search.recipient_location_state_code IS NOT NULL + ) + THEN + CONCAT( + transaction_search.recipient_location_state_code, '-', + transaction_search.recipient_location_congressional_code + ) + ELSE transaction_search.recipient_location_congressional_code + END AS prime_award_summary_recipient_cd_original, + CASE + WHEN + transaction_search.recipient_location_state_code IS NOT NULL + AND transaction_search.recipient_location_congressional_code_current IS NOT NULL + AND NOT ( + transaction_search.recipient_location_state_code = '' + AND transaction_search.recipient_location_state_code IS NOT NULL + ) + THEN + CONCAT( + transaction_search.recipient_location_state_code, '-', + transaction_search.recipient_location_congressional_code_current + ) + ELSE transaction_search.recipient_location_congressional_code_current + END AS prime_award_summary_recipient_cd_current, + COALESCE( + transaction_search.legal_entity_zip4, + CONCAT( + CAST(transaction_search.recipient_location_zip5 AS STRING), + CAST(transaction_search.legal_entity_zip_last4 AS STRING) + ) + ) AS recipient_zip_code, + CASE + WHEN + transaction_search.pop_state_code IS NOT NULL + AND transaction_search.pop_congressional_code IS NOT NULL + AND NOT ( + transaction_search.pop_state_code = '' + AND transaction_search.pop_state_code IS NOT NULL + ) + THEN + CONCAT( + transaction_search.pop_state_code, + '-', + transaction_search.pop_congressional_code + ) + ELSE transaction_search.pop_congressional_code + END AS prime_award_summary_place_of_performance_cd_original, + CASE + WHEN + transaction_search.pop_state_code IS NOT NULL + AND transaction_search.pop_congressional_code_current IS NOT NULL + AND NOT ( + transaction_search.pop_state_code = '' + AND transaction_search.pop_state_code IS NOT NULL + ) + THEN + CONCAT( + transaction_search.pop_state_code, + '-', + transaction_search.pop_congressional_code_current + ) + ELSE transaction_search.pop_congressional_code_current + END AS prime_award_summary_place_of_performance_cd_current, + CASE + WHEN award_search.generated_unique_award_id IS NOT NULL + THEN + CONCAT( + 'localhost:3000/award/', + URL_ENCODE(award_search.generated_unique_award_id), + '/' + ) + ELSE '' + END AS usaspending_permalink, + CAST( + submission_attributes.published_date AS DATE + ) AS last_modified_date + FROM raw.financial_accounts_by_awards + INNER JOIN global_temp.submission_attributes AS submission_attributes + ON ( + financial_accounts_by_awards.submission_id + = submission_attributes.submission_id + ) + LEFT OUTER JOIN global_temp.treasury_appropriation_account + ON ( + financial_accounts_by_awards.treasury_account_id + = treasury_appropriation_account.treasury_account_identifier + ) + LEFT OUTER JOIN award_search + ON ( + financial_accounts_by_awards.award_id = award_search.award_id + ) + LEFT OUTER JOIN transaction_search + ON ( + award_search.latest_transaction_search_id + = transaction_search.transaction_id + ) + LEFT OUTER JOIN global_temp.ref_program_activity + ON ( + financial_accounts_by_awards.program_activity_id + = ref_program_activity.id + ) + LEFT OUTER 
JOIN global_temp.object_class + ON ( + financial_accounts_by_awards.object_class_id = object_class.id + ) + LEFT OUTER JOIN global_temp.disaster_emergency_fund_code + ON ( + financial_accounts_by_awards.disaster_emergency_fund_code + = disaster_emergency_fund_code.code + ) + LEFT OUTER JOIN global_temp.federal_account + ON ( + treasury_appropriation_account.federal_account_id = federal_account.id + ) + LEFT OUTER JOIN global_temp.toptier_agency + ON ( + federal_account.parent_toptier_agency_id + = toptier_agency.toptier_agency_id + ) + LEFT OUTER JOIN global_temp.cgac AS cgac_aid + ON ( + treasury_appropriation_account.agency_id = cgac_aid.cgac_code + ) + LEFT OUTER JOIN global_temp.cgac AS cgac_ata + ON ( + treasury_appropriation_account.allocation_transfer_agency_id + = cgac_ata.cgac_code + ); + """ diff --git a/usaspending_api/etl/management/commands/load_query_to_delta.py b/usaspending_api/etl/management/commands/load_query_to_delta.py index 64ee0158fd..45c58f63fc 100644 --- a/usaspending_api/etl/management/commands/load_query_to_delta.py +++ b/usaspending_api/etl/management/commands/load_query_to_delta.py @@ -17,6 +17,11 @@ covid_faba_spending_load_sql_strings, ) from usaspending_api.disaster.models import CovidFABASpending +from usaspending_api.download.delta_models.account_download import ( + ACCOUNT_DOWNLOAD_COLUMNS, + account_download_create_sql_string, + account_download_load_sql_string, +) from usaspending_api.recipient.delta_models import ( RECIPIENT_LOOKUP_POSTGRES_COLUMNS, RECIPIENT_PROFILE_POSTGRES_COLUMNS, @@ -291,6 +296,26 @@ "tsvectors": None, "postgres_partition_spec": None, }, + "account_download": { + "model": None, + "is_from_broker": False, + "source_query": [account_download_load_sql_string], + "source_database": None, + "source_table": None, + "destination_database": "rpt", + "swap_table": "account_download", + "swap_schema": "rpt", + "partition_column": "id", + "partition_column_type": "numeric", + "is_partition_column_unique": False, + "delta_table_create_sql": account_download_create_sql_string, + "source_schema": ACCOUNT_DOWNLOAD_COLUMNS, + "custom_schema": None, + "column_names": list(ACCOUNT_DOWNLOAD_COLUMNS), + "postgres_seq_name": None, + "tsvectors": None, + "postgres_partition_spec": None, + }, } From 38231229d8dbe02e0e862061b6f5ecb632ad592f Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Fri, 7 Feb 2025 08:51:20 -0600 Subject: [PATCH 02/43] [DEV-11771] - add generate spark download command --- .../common/helpers/download_csv_strategies.py | 27 +- usaspending_api/common/helpers/s3_helpers.py | 70 ++++- .../common/helpers/spark_helpers.py | 7 +- .../download/delta_models/account_download.py | 28 +- .../filestreaming/download_generation.py | 69 +++-- .../download/management/__init__.py | 0 .../download/management/commands/__init__.py | 0 .../commands/delta_downloads/__init__.py | 0 .../award_financial/__init__.py | 0 .../award_financial/federal_account.py | 203 ++++++++++++++ .../commands/generate_spark_download.py | 254 ++++++++++++++++++ .../management/commands/drop_delta_table.py | 0 12 files changed, 600 insertions(+), 58 deletions(-) create mode 100644 usaspending_api/download/management/__init__.py create mode 100644 usaspending_api/download/management/commands/__init__.py create mode 100644 usaspending_api/download/management/commands/delta_downloads/__init__.py create mode 100644 usaspending_api/download/management/commands/delta_downloads/award_financial/__init__.py create mode 100644 
usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py create mode 100644 usaspending_api/download/management/commands/generate_spark_download.py create mode 100644 usaspending_api/etl/management/commands/drop_delta_table.py diff --git a/usaspending_api/common/helpers/download_csv_strategies.py b/usaspending_api/common/helpers/download_csv_strategies.py index 04560eb0cd..288ea0ffa2 100644 --- a/usaspending_api/common/helpers/download_csv_strategies.py +++ b/usaspending_api/common/helpers/download_csv_strategies.py @@ -1,13 +1,14 @@ from abc import ABC, abstractmethod +from dataclasses import dataclass import multiprocessing import time import logging from pathlib import Path -from typing import Tuple +from typing import Optional, Tuple from django.conf import settings from usaspending_api.common.csv_helpers import count_rows_in_delimited_file -from usaspending_api.common.helpers.s3_helpers import delete_s3_object, download_s3_object +from usaspending_api.common.helpers.s3_helpers import delete_s3_objects, download_s3_object from usaspending_api.common.helpers.sql_helpers import read_sql_file_to_text from usaspending_api.download.filestreaming.download_generation import ( EXCEL_ROW_LIMIT, @@ -21,6 +22,13 @@ from typing import List +@dataclass +class CSVDownloadMetadata: + filepaths: List[str] + number_of_rows: int + number_of_columns: Optional[int] + + class AbstractToCSVStrategy(ABC): """A composable class that can be used according to the Strategy software design pattern. The "to csv" strategy establishes the interface for a suite of download @@ -43,7 +51,7 @@ def download_to_csv( working_dir_path: Path, download_zip_path: Path, source_df=None, - ) -> Tuple[List[str], int]: + ) -> CSVDownloadMetadata: """ Args: source_sql: Some string that can be used as the source sql @@ -53,7 +61,7 @@ def download_to_csv( download_zip_path: The path (as a string) to the download zip file Returns: - Returns a list of paths to the downloaded csv files and the total record count of all those files. + Returns a CSVDownloadMetadata object (a dataclass containing metadata about the download) """ pass @@ -86,8 +94,8 @@ def download_to_csv( # Log how many rows we have self._logger.info(f"Counting rows in delimited text file {temp_data_file_name}") try: - count = count_rows_in_delimited_file(filename=temp_data_file_name, has_header=True, delimiter=delim) - self._logger.info(f"{destination_path} contains {count:,} rows of data") + row_count = count_rows_in_delimited_file(filename=temp_data_file_name, has_header=True, delimiter=delim) + self._logger.info(f"{destination_path} contains {row_count:,} rows of data") except Exception: self._logger.exception("Unable to obtain delimited text file line count") @@ -108,7 +116,7 @@ def download_to_csv( raise e finally: Path(temp_file_path).unlink() - return [destination_path], count + return CSVDownloadMetadata([destination_path], row_count) class SparkToCSVStrategy(AbstractToCSVStrategy): @@ -161,6 +169,7 @@ def download_to_csv( max_records_per_file=EXCEL_ROW_LIMIT, logger=self._logger, ) + column_count = len(df.columns) # When combining these later, will prepend the extracted header to each resultant file. # The parts therefore must NOT have headers or the headers will show up in the data when combined. header = ",".join([_.name for _ in df.schema.fields]) @@ -179,12 +188,12 @@ def download_to_csv( self._logger.exception("Exception encountered. 
See logs") raise finally: - delete_s3_object(s3_bucket_name, s3_destination_path) + delete_s3_objects(s3_bucket_name, key_prefix=f"{s3_bucket_sub_path}/{destination_file_name}") if self.spark_created_by_command: self.spark.stop() append_files_to_zip_file(final_csv_data_file_locations, download_zip_path) self._logger.info(f"Generated the following data csv files {final_csv_data_file_locations}") - return final_csv_data_file_locations, record_count + return CSVDownloadMetadata(final_csv_data_file_locations, record_count, column_count) def _move_data_csv_s3_to_local( self, bucket_name, s3_file_paths, s3_bucket_path, s3_bucket_sub_path, destination_path_dir diff --git a/usaspending_api/common/helpers/s3_helpers.py b/usaspending_api/common/helpers/s3_helpers.py index 958a11e608..07366375ad 100644 --- a/usaspending_api/common/helpers/s3_helpers.py +++ b/usaspending_api/common/helpers/s3_helpers.py @@ -4,11 +4,12 @@ import math import time +from boto3.resources.base import ServiceResource from boto3.s3.transfer import TransferConfig, S3Transfer from botocore.exceptions import ClientError from django.conf import settings from pathlib import Path -from typing import List +from typing import List, Optional from botocore.client import BaseClient from usaspending_api.config import CONFIG @@ -43,6 +44,32 @@ def _get_boto3_s3_client(region_name=CONFIG.AWS_REGION) -> BaseClient: return s3_client +def _get_boto3_s3_resource(region_name=CONFIG.AWS_REGION) -> ServiceResource: + """Returns the correct boto3 resource based on the environment. + + Returns: + ServiceResource: Boto3 resource + """ + if not CONFIG.USE_AWS: + boto3_session = boto3.session.Session( + region_name=region_name, + aws_access_key_id=CONFIG.AWS_ACCESS_KEY.get_secret_value(), + aws_secret_access_key=CONFIG.AWS_SECRET_KEY.get_secret_value(), + ) + s3_resource = boto3_session.resource( + service_name="s3", + region_name=region_name, + endpoint_url=f"http://{CONFIG.AWS_S3_ENDPOINT}", + ) + else: + s3_resource = boto3.resource( + service_name="s3", + region_name=region_name, + endpoint_url=f"https://{CONFIG.AWS_S3_ENDPOINT}", + ) + return s3_resource + + def retrieve_s3_bucket_object_list(bucket_name: str) -> List["boto3.resources.factory.s3.ObjectSummary"]: try: bucket = get_s3_bucket(bucket_name=bucket_name) @@ -60,7 +87,7 @@ def retrieve_s3_bucket_object_list(bucket_name: str) -> List["boto3.resources.fa def get_s3_bucket( bucket_name: str, region_name: str = settings.USASPENDING_AWS_REGION ) -> "boto3.resources.factory.s3.Instance": - s3 = boto3.resource("s3", region_name=region_name) + s3 = _get_boto3_s3_resource(region_name) return s3.Bucket(bucket_name) @@ -80,13 +107,13 @@ def upload_download_file_to_s3(file_path, sub_dir=None): multipart_upload(bucket, region, str(file_path), keyname, sub_dir) -def multipart_upload(bucketname, regionname, source_path, keyname, sub_dir=None): - s3client = boto3.client("s3", region_name=regionname) +def multipart_upload(bucketname, region_name, source_path, keyname, sub_dir=None): + s3_client = _get_boto3_s3_client(region_name) source_size = Path(source_path).stat().st_size # Sets the chunksize at minimum ~5MB to sqrt(5MB) * sqrt(source size) bytes_per_chunk = max(int(math.sqrt(5242880) * math.sqrt(source_size)), 5242880) config = TransferConfig(multipart_chunksize=bytes_per_chunk) - transfer = S3Transfer(s3client, config) + transfer = S3Transfer(s3_client, config) file_name = Path(keyname).name if sub_dir is not None: file_name = f"{sub_dir}/{file_name}" @@ -136,3 +163,36 @@ def 
delete_s3_object(bucket_name: str, key: str, region_name: str = settings.USA """ s3 = _get_boto3_s3_client(region_name) s3.delete_object(Bucket=bucket_name, Key=key) + + +def delete_s3_objects( + bucket_name: str, + *, + key_list: Optional[List[str]] = None, + key_prefix: Optional[str] = None, + region_name: Optional[str] = settings.USASPENDING_AWS_REGION, +) -> int: + """Deletes all objects based on a list of keys + Args: + bucket_name: The name of the bucket where the objects are located + key_list: A list of keys representing objects in the bucket to delete + key_prefix: A prefix in the bucket used to generate a list of objects to delete + region_name: AWS region to use; defaults to the settings provided region + + Returns: + Number of objects delete + """ + object_list = [] + + if key_prefix: + bucket = get_s3_bucket(bucket_name, region_name) + objects = bucket.objects.filter(Prefix=key_prefix) + object_list.extend([{"Key": obj.key} for obj in objects]) + + if key_list: + object_list.extend([{"Key": key} for key in key_list]) + + s3_client = _get_boto3_s3_client(region_name) + resp = s3_client.delete_objects(Bucket=bucket_name, Delete={"Objects": object_list}) + + return len(resp.get("Deleted", [])) \ No newline at end of file diff --git a/usaspending_api/common/helpers/spark_helpers.py b/usaspending_api/common/helpers/spark_helpers.py index 98a72533e1..d75c83e6ba 100644 --- a/usaspending_api/common/helpers/spark_helpers.py +++ b/usaspending_api/common/helpers/spark_helpers.py @@ -167,6 +167,10 @@ def configure_spark_session( conf.set("spark.hadoop.fs.s3a.endpoint", CONFIG.AWS_S3_ENDPOINT) if not CONFIG.USE_AWS: # i.e. running in a "local" [development] environment + conf.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + conf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") + conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") + conf.set("hive.metastore.disallow.incompatible.col.type.changes", "false") # Set configs to allow the S3AFileSystem to work against a local MinIO object storage proxy conf.set("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") # "Enable S3 path style access ie disabling the default virtual hosting behaviour. 
@@ -230,6 +234,7 @@ def configure_spark_session( # Build the SparkSession based on args provided builder = SparkSession.builder + builder = builder.config(conf=conf) if spark_context: builder = builder._sparkContext(spark_context) if java_gateway: @@ -242,7 +247,7 @@ def configure_spark_session( builder = builder.appName(app_name) if enable_hive_support: builder = builder.enableHiveSupport() - spark = builder.config(conf=conf).getOrCreate() + spark = builder.getOrCreate() # Now that the SparkSession was created, check whether certain provided config values were ignored if given a # pre-existing SparkContext, and error-out if so diff --git a/usaspending_api/download/delta_models/account_download.py b/usaspending_api/download/delta_models/account_download.py index 094b9e06f4..31902aacbb 100644 --- a/usaspending_api/download/delta_models/account_download.py +++ b/usaspending_api/download/delta_models/account_download.py @@ -1,6 +1,7 @@ ACCOUNT_DOWNLOAD_COLUMNS_TEST = {"award_id_piid": "STRING"} ACCOUNT_DOWNLOAD_COLUMNS = { + "submission_id": "INTEGER NOT NULL", "owning_agency_name": "STRING", "federal_account_symbol": "STRING", "federal_account_name": "STRING", @@ -80,6 +81,10 @@ "prime_award_summary_place_of_performance_cd_current": "STRING", "usaspending_permalink": "STRING", "last_modified_date": "DATE", + "reporting_fiscal_period": "INTEGER", + "reporting_fiscal_quarter": "INTEGER", + "reporting_fiscal_year": "INTEGER", + "quarter_format_flag": "BOOLEAN", } account_download_create_sql_string = rf""" @@ -90,11 +95,16 @@ LOCATION 's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}' """ +account_download_drop_sql_string = rf""" + DROP TABLE {{DESTINATION_TABLE}} + """ + account_download_load_sql_string = rf""" INSERT OVERWRITE {{DESTINATION_DATABASE}}.{{DESTINATION_TABLE}} ( {",".join(list(ACCOUNT_DOWNLOAD_COLUMNS))} ) SELECT + financial_accounts_by_awards.submission_id, toptier_agency.name AS owning_agency_name, federal_account.federal_account_code AS federal_account_symbol, federal_account.account_title AS federal_account_name, @@ -111,11 +121,11 @@ financial_accounts_by_awards.parent_award_id AS parent_award_id_piid, financial_accounts_by_awards.fain AS award_id_fain, financial_accounts_by_awards.uri AS award_id_uri, - award_search.date_signed AS award_base_action_date, - award_search.certified_date AS award_latest_action_date, - award_search.period_of_performance_start_date, - award_search.period_of_performance_current_end_date, - transaction_search.ordering_period_end_date, + TRY_CAST(award_search.date_signed AS DATE) AS award_base_action_date, + TRY_CAST(award_search.certified_date AS DATE) AS award_latest_action_date, + TRY_CAST(award_search.period_of_performance_start_date AS DATE), + TRY_CAST(award_search.period_of_performance_current_end_date AS DATE), + TRY_CAST(transaction_search.ordering_period_end_date AS DATE), transaction_search.idv_type AS idv_type_code, transaction_search.idv_type_description AS idv_type, award_search.description AS prime_award_base_transaction_description, @@ -325,9 +335,11 @@ ) ELSE '' END AS usaspending_permalink, - CAST( - submission_attributes.published_date AS DATE - ) AS last_modified_date + TRY_CAST(submission_attributes.published_date AS DATE) AS last_modified_date, + submission_attributes.reporting_fiscal_period, + submission_attributes.reporting_fiscal_quarter, + submission_attributes.reporting_fiscal_year, + submission_attributes.quarter_format_flag FROM raw.financial_accounts_by_awards INNER JOIN 
global_temp.submission_attributes AS submission_attributes ON ( diff --git a/usaspending_api/download/filestreaming/download_generation.py b/usaspending_api/download/filestreaming/download_generation.py index dedc5b587e..0992a141a9 100644 --- a/usaspending_api/download/filestreaming/download_generation.py +++ b/usaspending_api/download/filestreaming/download_generation.py @@ -130,47 +130,46 @@ def generate_download(download_job: DownloadJob, origination: Optional[str] = No # push file to S3 bucket, if not local if not settings.IS_LOCAL: - with ( - tracer.trace( - name=f"job.{JOB_TYPE}.download.s3", - service="bulk-download", - resource=f"s3://{settings.BULK_DOWNLOAD_S3_BUCKET_NAME}", - span_type=SpanTypes.WORKER, - ) as span, - tracer.trace( + with tracer.trace( + name=f"job.{JOB_TYPE}.download.s3", + service="bulk-download", + resource=f"s3://{settings.BULK_DOWNLOAD_S3_BUCKET_NAME}", + span_type=SpanTypes.WORKER, + ) as span: + with tracer.trace( name="s3.command", service="aws.s3", resource=".".join( [multipart_upload.__module__, (multipart_upload.__qualname__ or multipart_upload.__name__)] ), span_type=SpanTypes.WEB, - ) as s3_span, - ): - # NOTE: Traces still not auto-picking-up aws.s3 service upload activity - # Could be that the patches for boto and botocore don't cover the newer boto3 S3Transfer upload approach - span.set_tag("file_name", file_name) - try: - bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME - region = settings.USASPENDING_AWS_REGION - s3_span.set_tags({"bucket": bucket, "region": region, "file": zip_file_path}) - start_uploading = time.perf_counter() - multipart_upload(bucket, region, zip_file_path, os.path.basename(zip_file_path)) - write_to_log( - message=f"Uploading took {time.perf_counter() - start_uploading:.2f}s", download_job=download_job - ) - except Exception as e: - # Set error message; job_status_id will be set in download_sqs_worker.handle() - exc_msg = "An exception was raised while attempting to upload the file" - fail_download(download_job, e, exc_msg) - if isinstance(e, InvalidParameterException): - raise InvalidParameterException(e) - else: - raise Exception(download_job.error_message) from e - finally: - # Remove generated file - if os.path.exists(zip_file_path): - os.remove(zip_file_path) - _kill_spawned_processes(download_job) + ) as s3_span: + # NOTE: Traces still not auto-picking-up aws.s3 service upload activity + # Could be that the patches for boto and botocore don't cover the newer boto3 S3Transfer upload approach + span.set_tag("file_name", file_name) + try: + bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME + region = settings.USASPENDING_AWS_REGION + s3_span.set_tags({"bucket": bucket, "region": region, "file": zip_file_path}) + start_uploading = time.perf_counter() + multipart_upload(bucket, region, zip_file_path, os.path.basename(zip_file_path)) + write_to_log( + message=f"Uploading took {time.perf_counter() - start_uploading:.2f}s", + download_job=download_job, + ) + except Exception as e: + # Set error message; job_status_id will be set in download_sqs_worker.handle() + exc_msg = "An exception was raised while attempting to upload the file" + fail_download(download_job, e, exc_msg) + if isinstance(e, InvalidParameterException): + raise InvalidParameterException(e) + else: + raise Exception(download_job.error_message) from e + finally: + # Remove generated file + if os.path.exists(zip_file_path): + os.remove(zip_file_path) + _kill_spawned_processes(download_job) return finish_download(download_job) diff --git 
a/usaspending_api/download/management/__init__.py b/usaspending_api/download/management/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/usaspending_api/download/management/commands/__init__.py b/usaspending_api/download/management/commands/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/usaspending_api/download/management/commands/delta_downloads/__init__.py b/usaspending_api/download/management/commands/delta_downloads/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/__init__.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py new file mode 100644 index 0000000000..2a3701f401 --- /dev/null +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py @@ -0,0 +1,203 @@ +DOWNLOAD_QUERY = """ + SELECT + owning_agency_name, + CONCAT_WS('; ', COLLECT_SET(reporting_agency_name)) AS reporting_agency_name, + submission_period, + federal_account_symbol, + federal_account_name, + agency_identifier_name, + CONCAT_WS('; ', COLLECT_SET(budget_function)) AS budget_function, + CONCAT_WS('; ', COLLECT_SET(budget_subfunction)) AS budget_subfunction, + program_activity_code, + program_activity_name, + object_class_code, + object_class_name, + direct_or_reimbursable_funding_source, + disaster_emergency_fund_code, + disaster_emergency_fund_name, + SUM(transaction_obligated_amount) AS transaction_obligated_amount, + SUM(gross_outlay_amount_FYB_to_period_end) AS gross_outlay_amount_FYB_to_period_end, + SUM(USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig) AS USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig, + SUM(USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig) AS USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig, + award_unique_key, + award_id_piid, + parent_award_id_piid, + award_id_fain, + award_id_uri, + award_base_action_date, + award_base_action_date_fiscal_year, + award_latest_action_date, + award_latest_action_date_fiscal_year, + period_of_performance_start_date, + period_of_performance_current_end_date, + ordering_period_end_date, + award_type_code, + award_type, + idv_type_code, + idv_type_code, + idv_type, + prime_award_base_transaction_description, + awarding_agency_code, + awarding_agency_name, + awarding_subagency_code, + awarding_subagency_name, + awarding_office_code, + awarding_office_name, + funding_agency_code, + funding_agency_name, + funding_sub_agency_code, + funding_sub_agency_name, + funding_office_code, + funding_office_name, + recipient_uei, + recipient_duns, + recipient_name, + recipient_name_raw, + recipient_parent_uei, + recipient_parent_duns, + recipient_parent_name, + recipient_parent_name_raw, + recipient_country, + recipient_state, + recipient_county, + recipient_city, + prime_award_summary_recipient_cd_original, + prime_award_summary_recipient_cd_current, + recipient_zip_code, + primary_place_of_performance_country, + primary_place_of_performance_state, + primary_place_of_performance_county, + prime_award_summary_place_of_performance_cd_original, + prime_award_summary_place_of_performance_cd_current, + 
primary_place_of_performance_zip_code, + cfda_number, + cfda_title, + product_or_service_code, + product_or_service_code_description, + naics_code, + naics_description, + national_interest_action_code, + national_interest_action, + usaspending_permalink, + MAX(last_modified_date) + FROM rpt.account_download + WHERE + ( + submission_id IN {} + OR ( + ( + ( + reporting_fiscal_period <= 12 + AND NOT quarter_format_flag) + OR ( + reporting_fiscal_quarter <= 4 + AND quarter_format_flag + ) + ) + AND reporting_fiscal_year = 2021 + ) + ) + GROUP BY + owning_agency_name, + federal_account_symbol, + federal_account_name, + agency_identifier_name, + program_activity_code, + program_activity_name, + object_class_code, + object_class_name, + direct_or_reimbursable_funding_source, + disaster_emergency_fund_code, + disaster_emergency_fund_name, + award_unique_key, + award_id_piid, + parent_award_id_piid, + award_id_fain, + award_id_uri, + award_base_action_date, + award_latest_action_date, + period_of_performance_start_date, + period_of_performance_current_end_date, + ordering_period_end_date, + idv_type_code, + idv_type, + prime_award_base_transaction_description, + awarding_agency_code, + awarding_agency_name, + awarding_subagency_code, + awarding_subagency_name, + awarding_office_code, + awarding_office_name, + funding_agency_code, + funding_agency_name, + funding_sub_agency_code, + funding_sub_agency_name, + funding_office_code, + funding_office_name, + recipient_uei, + recipient_duns, + recipient_name, + recipient_name_raw, + recipient_parent_uei, + recipient_parent_duns, + recipient_parent_name, + recipient_parent_name_raw, + recipient_country, + recipient_state, + recipient_county, + recipient_city, + primary_place_of_performance_country, + primary_place_of_performance_state, + primary_place_of_performance_county, + primary_place_of_performance_zip_code, + cfda_number, + cfda_title, + product_or_service_code, + product_or_service_code_description, + naics_code, + naics_description, + national_interest_action_code, + national_interest_action, + submission_period, + award_type_code, + award_type, + recipient_zip_code, + award_base_action_date_fiscal_year, + award_latest_action_date_fiscal_year, + usaspending_permalink, + prime_award_summary_recipient_cd_original, + prime_award_summary_recipient_cd_current, + prime_award_summary_place_of_performance_cd_original, + prime_award_summary_place_of_performance_cd_current + HAVING + -- All of the HAVING statements below ensure we return only non-zero sum records + SUM(gross_outlay_amount_fyb_to_period_end) > 0 + OR SUM(gross_outlay_amount_fyb_to_period_end) < 0 + OR SUM(ussgl487200_downward_adj_prior_year_prepaid_undeliv_order_oblig) < 0 + OR SUM(ussgl487200_downward_adj_prior_year_prepaid_undeliv_order_oblig) > 0 + OR SUM(ussgl497200_downward_adj_of_prior_year_paid_deliv_orders_oblig) < 0 + OR SUM(ussgl497200_downward_adj_of_prior_year_paid_deliv_orders_oblig) > 0 + OR SUM(transaction_obligated_amount) > 0 + OR SUM(transaction_obligated_amount) < 0 +""" + + + +SUBMISSION_ID_QUERY = """ + SELECT submission_id + FROM global_temp.submission_attributes + WHERE (toptier_code, reporting_fiscal_year, reporting_fiscal_period) IN ( + SELECT toptier_code, reporting_fiscal_year, reporting_fiscal_period + FROM global_temp.submission_attributes + WHERE reporting_fiscal_year = 2021 AND + ( + (reporting_fiscal_quarter <= 4 AND quarter_format_flag is true) OR + (reporting_fiscal_period <= 12 AND quarter_format_flag is false) + ) + ORDER BY toptier_code, 
reporting_fiscal_period desc + ) AND + ( + (reporting_fiscal_quarter = 4 AND quarter_format_flag IS TRUE) OR + (reporting_fiscal_period = 12 AND quarter_format_flag IS FALSE) + ) +""" \ No newline at end of file diff --git a/usaspending_api/download/management/commands/generate_spark_download.py b/usaspending_api/download/management/commands/generate_spark_download.py new file mode 100644 index 0000000000..7f9ddad98c --- /dev/null +++ b/usaspending_api/download/management/commands/generate_spark_download.py @@ -0,0 +1,254 @@ +import json +import os +import traceback +from logging import Logger +from pathlib import Path +from typing import Optional, Dict, Tuple, Type, List, Union + +from django.conf import settings +from django.core.management.base import BaseCommand +from django.utils.functional import cached_property +from pyspark.sql import SparkSession + +from usaspending_api.common.etl.spark import create_ref_temp_views +from usaspending_api.common.exceptions import InvalidParameterException +from usaspending_api.common.helpers.dict_helpers import order_nested_object +from usaspending_api.common.helpers.download_csv_strategies import SparkToCSVStrategy +from usaspending_api.common.helpers.s3_helpers import upload_download_file_to_s3 +from usaspending_api.common.helpers.spark_helpers import ( + configure_spark_session, + get_active_spark_session, + get_jdbc_connection_properties, + get_jvm_logger, + get_usas_jdbc_url, +) +from usaspending_api.download.filestreaming.download_generation import build_data_file_name +from usaspending_api.download.filestreaming.download_source import DownloadSource +from usaspending_api.download.management.commands.delta_downloads.award_financial import federal_account +from usaspending_api.download.download_utils import create_unique_filename +from usaspending_api.download.lookups import JOB_STATUS_DICT, FILE_FORMATS, VALUE_MAPPINGS +from usaspending_api.download.models import DownloadJob +from usaspending_api.download.v2.request_validations import AccountDownloadValidator, DownloadValidatorBase + +DOWNLOAD_SPEC = { + "award_financial": { + "federal_account": { + "query": federal_account.DOWNLOAD_QUERY, + "select_in_formats": [("submission_id", federal_account.SUBMISSION_ID_QUERY)], + "validator_type": AccountDownloadValidator, + } + } +} + + +class Command(BaseCommand): + + help = "Generate a download zip file based on the provided type and level." + + download_job: DownloadJob + download_level: str + download_query: str + download_source: DownloadSource + download_spec: Dict + download_type: str + download_validator_type: Type[DownloadValidatorBase] + file_format_spec: Dict + file_prefix: str + jdbc_properties: Dict + jdbc_url: str + logger: Logger + should_cleanup: bool + spark: SparkSession + working_dir_path: Path + + def add_arguments(self, parser): + parser.add_argument("--download-type", type=str, required=True, choices=list(DOWNLOAD_SPEC)) + parser.add_argument( + "--download-level", + type=str, + required=True, + choices=set( + download_level + for download_level_list in [DOWNLOAD_SPEC[key] for key in DOWNLOAD_SPEC] + for download_level in download_level_list + ), + ) + parser.add_argument("--file-format", type=str, required=False, choices=list(FILE_FORMATS), default="csv") + parser.add_argument("--file-prefix", type=str, required=False, default="") + parser.add_argument("--skip-local-cleanup", action="store_true") + + def handle(self, *args, **options): + extra_conf = { + # Config for Delta Lake tables and SQL. 
Need these to keep Dela table metadata in the metastore + "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension", + "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog", + # See comment below about old date and time values cannot parsed without these + "spark.sql.legacy.parquet.datetimeRebaseModeInWrite": "LEGACY", # for dates at/before 1900 + "spark.sql.legacy.parquet.int96RebaseModeInWrite": "LEGACY", # for timestamps at/before 1900 + "spark.sql.jsonGenerator.ignoreNullFields": "false", # keep nulls in our json + } + + self.spark = get_active_spark_session() + spark_created_by_command = False + if not self.spark: + spark_created_by_command = True + self.spark = configure_spark_session(**extra_conf, spark_context=self.spark) + + # Setup Logger + self.logger = get_jvm_logger(self.spark, __name__) + + # Resolve Parameters + self.download_type = options["download_type"] + self.download_level = options["download_level"] + self.file_prefix = options["file_prefix"] + self.should_cleanup = not options["skip_local_cleanup"] + + if self.download_level not in DOWNLOAD_SPEC[self.download_type].keys(): + raise ValueError( + f'Provided download level of "{self.download_level}" is not supported ' + f'for download type of "{self.download_type}".' + ) + + download_spec = DOWNLOAD_SPEC[self.download_type][self.download_level] + self.file_format_spec = FILE_FORMATS[options["file_format"]] + self.download_query = download_spec["query"] + self.download_validator_type = download_spec["validator_type"] + self.jdbc_properties = get_jdbc_connection_properties() + self.jdbc_url = get_usas_jdbc_url() + + self.working_dir_path = Path(settings.CSV_LOCAL_PATH) + if not self.working_dir_path.exists(): + self.working_dir_path.mkdir() + + create_ref_temp_views(self.spark) + + self.download_job, self.download_source = self.create_download_job() + self.modify_download_query(download_spec["select_in_formats"] or []) + self.process_download() + + if spark_created_by_command: + self.spark.stop() + + def modify_download_query(self, select_in_formats: List[Tuple[str, str]]) -> None: + formats_to_apply = [] + for select_col, query in select_in_formats: + formats_to_apply.append(tuple(val[select_col] for val in self.spark.sql(query).collect())) + self.download_query = self.download_query.format(*formats_to_apply) + + @cached_property + def json_request(self) -> Dict: + request_data = { + "account_level": "federal_account", + "download_types": ["award_financial"], + "file_format": "csv", + "filters": { + "agency": "all", + "budget_function": "all", + "budget_subfunction": "all", + "federal_account": "all", + "fy": 2021, + "period": 12, + "submission_types": ["award_financial"], + }, + "request_type": "account", + } + validator = self.download_validator_type(request_data) + processed_request = order_nested_object(validator.json_request) + + return processed_request + + @cached_property + def json_request_string(self) -> str: + return json.dumps(self.json_request) + + @cached_property + def download_name(self) -> str: + return self.download_job.file_name.replace(".zip", "") + + def create_download_job(self) -> Tuple[DownloadJob, DownloadSource]: + self.logger.info(f"Creating Download Job for {self.download_type} -> {self.download_level}") + + final_output_zip_name = f"{self.file_prefix}{create_unique_filename(self.json_request)}" + download_job_ready_status = JOB_STATUS_DICT["ready"] + + # Create a download_job object for use by the application + download_job = DownloadJob.objects.create( 
+ job_status_id=download_job_ready_status, + file_name=final_output_zip_name, + json_request=self.json_request_string, + ) + + # TODO: This should be updated to be more dynamic to the download type + download_source = DownloadSource( + VALUE_MAPPINGS[self.download_type]["table_name"], + self.download_level, + self.download_type, + self.json_request.get("agency", "all"), + # TODO: Is this necessary for Spark downloads? It was originally added to File C downloads for performance. + extra_file_type="", + ) + download_source.file_name = build_data_file_name(download_source, download_job, piid=None, assistance_id=None) + + return download_job, download_source + + def process_download(self): + self.start_download() + files_to_cleanup = [] + try: + spark_to_csv_strategy = SparkToCSVStrategy(self.logger) + + zip_file_path = self.working_dir_path / f"{self.download_name}.zip" + + csv_metadata = spark_to_csv_strategy.download_to_csv( + self.download_query, + self.working_dir_path / self.download_name, + self.download_name, + self.working_dir_path, + zip_file_path, + ) + files_to_cleanup.extend(csv_metadata.filepaths) + + self.download_job.file_size = os.stat(zip_file_path).st_size + self.download_job.number_of_rows = csv_metadata.number_of_rows + self.download_job.number_of_columns = csv_metadata.number_of_columns + upload_download_file_to_s3(zip_file_path) + except InvalidParameterException as e: + exc_msg = "InvalidParameterException was raised while attempting to process the DownloadJob" + self.fail_download(exc_msg, e) + raise + except Exception as e: + exc_msg = "An exception was raised while attempting to process the DownloadJob" + self.fail_download(exc_msg, e) + raise + finally: + if self.should_cleanup: + self.cleanup(files_to_cleanup) + + self.finish_download() + + def start_download(self) -> None: + self.download_job.job_status_id = JOB_STATUS_DICT["running"] + self.download_job.save() + self.logger.info(f"Starting DownloadJob {self.download_job.download_job_id}") + + def fail_download(self, msg: str, e: Optional[Exception] = None) -> None: + if e: + stack_trace = "".join(traceback.format_exception(type(e), value=e, tb=e.__traceback__)) + self.download_job.error_message = f"{msg}:\n{stack_trace}" + else: + self.download_job.error_message = msg + self.logger.error(msg) + self.download_job.job_status_id = JOB_STATUS_DICT["failed"] + self.download_job.save() + + def finish_download(self) -> None: + self.download_job.job_status_id = JOB_STATUS_DICT["finished"] + self.download_job.save() + self.logger.info(f"Finished processing DownloadJob {self.download_job.download_job_id}") + + def cleanup(self, path_list: List[Union[Path, str]]) -> None: + for path in path_list: + if isinstance(path, str): + path = Path(path) + self.logger.info(f"Removing {path}") + path.unlink() \ No newline at end of file diff --git a/usaspending_api/etl/management/commands/drop_delta_table.py b/usaspending_api/etl/management/commands/drop_delta_table.py new file mode 100644 index 0000000000..e69de29bb2 From d7da705578ee4007d67991444250eb23da7558ac Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Mon, 10 Feb 2025 17:07:34 -0600 Subject: [PATCH 03/43] [DEV-11771] - Update generate download job --- .../common/helpers/download_csv_strategies.py | 2 +- .../common/helpers/spark_helpers.py | 7 +- usaspending_api/config/envs/local.py | 2 +- .../download/delta_models/account_download.py | 4 -- .../filestreaming/download_generation.py | 70 ++++++++++--------- .../award_financial/federal_account.py | 1 - 6 files changed, 39 
insertions(+), 47 deletions(-) diff --git a/usaspending_api/common/helpers/download_csv_strategies.py b/usaspending_api/common/helpers/download_csv_strategies.py index 288ea0ffa2..99218efb9f 100644 --- a/usaspending_api/common/helpers/download_csv_strategies.py +++ b/usaspending_api/common/helpers/download_csv_strategies.py @@ -4,7 +4,7 @@ import time import logging from pathlib import Path -from typing import Optional, Tuple +from typing import Optional from django.conf import settings from usaspending_api.common.csv_helpers import count_rows_in_delimited_file diff --git a/usaspending_api/common/helpers/spark_helpers.py b/usaspending_api/common/helpers/spark_helpers.py index d75c83e6ba..98a72533e1 100644 --- a/usaspending_api/common/helpers/spark_helpers.py +++ b/usaspending_api/common/helpers/spark_helpers.py @@ -167,10 +167,6 @@ def configure_spark_session( conf.set("spark.hadoop.fs.s3a.endpoint", CONFIG.AWS_S3_ENDPOINT) if not CONFIG.USE_AWS: # i.e. running in a "local" [development] environment - conf.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") - conf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") - conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") - conf.set("hive.metastore.disallow.incompatible.col.type.changes", "false") # Set configs to allow the S3AFileSystem to work against a local MinIO object storage proxy conf.set("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") # "Enable S3 path style access ie disabling the default virtual hosting behaviour. @@ -234,7 +230,6 @@ def configure_spark_session( # Build the SparkSession based on args provided builder = SparkSession.builder - builder = builder.config(conf=conf) if spark_context: builder = builder._sparkContext(spark_context) if java_gateway: @@ -247,7 +242,7 @@ def configure_spark_session( builder = builder.appName(app_name) if enable_hive_support: builder = builder.enableHiveSupport() - spark = builder.getOrCreate() + spark = builder.config(conf=conf).getOrCreate() # Now that the SparkSession was created, check whether certain provided config values were ignored if given a # pre-existing SparkContext, and error-out if so diff --git a/usaspending_api/config/envs/local.py b/usaspending_api/config/envs/local.py index f42639ca80..9a33385917 100644 --- a/usaspending_api/config/envs/local.py +++ b/usaspending_api/config/envs/local.py @@ -93,7 +93,7 @@ class LocalConfig(DefaultConfig): AWS_PROFILE: str = None AWS_REGION: str = "" SPARK_S3_BUCKET: str = "data" - BULK_DOWNLOAD_S3_BUCKET_NAME: str = "bulk_download" + BULK_DOWNLOAD_S3_BUCKET_NAME: str = "bulk-download" DATABASE_DOWNLOAD_S3_BUCKET_NAME = "dti-usaspending-db" # Since this config values is built by composing others, we want to late/lazily-evaluate their values, diff --git a/usaspending_api/download/delta_models/account_download.py b/usaspending_api/download/delta_models/account_download.py index 31902aacbb..6535b2bd21 100644 --- a/usaspending_api/download/delta_models/account_download.py +++ b/usaspending_api/download/delta_models/account_download.py @@ -95,10 +95,6 @@ LOCATION 's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}' """ -account_download_drop_sql_string = rf""" - DROP TABLE {{DESTINATION_TABLE}} - """ - account_download_load_sql_string = rf""" INSERT OVERWRITE {{DESTINATION_DATABASE}}.{{DESTINATION_TABLE}} ( {",".join(list(ACCOUNT_DOWNLOAD_COLUMNS))} diff --git 
a/usaspending_api/download/filestreaming/download_generation.py b/usaspending_api/download/filestreaming/download_generation.py index 0992a141a9..78a75ad200 100644 --- a/usaspending_api/download/filestreaming/download_generation.py +++ b/usaspending_api/download/filestreaming/download_generation.py @@ -130,46 +130,48 @@ def generate_download(download_job: DownloadJob, origination: Optional[str] = No # push file to S3 bucket, if not local if not settings.IS_LOCAL: - with tracer.trace( - name=f"job.{JOB_TYPE}.download.s3", - service="bulk-download", - resource=f"s3://{settings.BULK_DOWNLOAD_S3_BUCKET_NAME}", - span_type=SpanTypes.WORKER, - ) as span: - with tracer.trace( + with ( + tracer.trace( + name=f"job.{JOB_TYPE}.download.s3", + service="bulk-download", + resource=f"s3://{settings.BULK_DOWNLOAD_S3_BUCKET_NAME}", + span_type=SpanTypes.WORKER, + ) as span, + tracer.trace( name="s3.command", service="aws.s3", resource=".".join( [multipart_upload.__module__, (multipart_upload.__qualname__ or multipart_upload.__name__)] ), span_type=SpanTypes.WEB, - ) as s3_span: - # NOTE: Traces still not auto-picking-up aws.s3 service upload activity - # Could be that the patches for boto and botocore don't cover the newer boto3 S3Transfer upload approach - span.set_tag("file_name", file_name) - try: - bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME - region = settings.USASPENDING_AWS_REGION - s3_span.set_tags({"bucket": bucket, "region": region, "file": zip_file_path}) - start_uploading = time.perf_counter() - multipart_upload(bucket, region, zip_file_path, os.path.basename(zip_file_path)) - write_to_log( - message=f"Uploading took {time.perf_counter() - start_uploading:.2f}s", - download_job=download_job, - ) - except Exception as e: - # Set error message; job_status_id will be set in download_sqs_worker.handle() - exc_msg = "An exception was raised while attempting to upload the file" - fail_download(download_job, e, exc_msg) - if isinstance(e, InvalidParameterException): - raise InvalidParameterException(e) - else: - raise Exception(download_job.error_message) from e - finally: - # Remove generated file - if os.path.exists(zip_file_path): - os.remove(zip_file_path) - _kill_spawned_processes(download_job) + ) as s3_span, + ): + # NOTE: Traces still not auto-picking-up aws.s3 service upload activity + # Could be that the patches for boto and botocore don't cover the newer boto3 S3Transfer upload approach + span.set_tag("file_name", file_name) + try: + bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME + region = settings.USASPENDING_AWS_REGION + s3_span.set_tags({"bucket": bucket, "region": region, "file": zip_file_path}) + start_uploading = time.perf_counter() + multipart_upload(bucket, region, zip_file_path, os.path.basename(zip_file_path)) + write_to_log( + message=f"Uploading took {time.perf_counter() - start_uploading:.2f}s", + download_job=download_job, + ) + except Exception as e: + # Set error message; job_status_id will be set in download_sqs_worker.handle() + exc_msg = "An exception was raised while attempting to upload the file" + fail_download(download_job, e, exc_msg) + if isinstance(e, InvalidParameterException): + raise InvalidParameterException(e) + else: + raise Exception(download_job.error_message) from e + finally: + # Remove generated file + if os.path.exists(zip_file_path): + os.remove(zip_file_path) + _kill_spawned_processes(download_job) return finish_download(download_job) diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py 
b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py index 2a3701f401..5171fbd617 100644 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py @@ -34,7 +34,6 @@ award_type_code, award_type, idv_type_code, - idv_type_code, idv_type, prime_award_base_transaction_description, awarding_agency_code, From 06aa46002b5da12ce2148e2dc723d544db62496b Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Mon, 10 Feb 2025 17:11:12 -0600 Subject: [PATCH 04/43] [DEV-11771] - Add empty lines at ends of files --- usaspending_api/common/helpers/s3_helpers.py | 2 +- .../commands/delta_downloads/award_financial/federal_account.py | 2 +- .../download/management/commands/generate_spark_download.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/usaspending_api/common/helpers/s3_helpers.py b/usaspending_api/common/helpers/s3_helpers.py index 07366375ad..117e120257 100644 --- a/usaspending_api/common/helpers/s3_helpers.py +++ b/usaspending_api/common/helpers/s3_helpers.py @@ -195,4 +195,4 @@ def delete_s3_objects( s3_client = _get_boto3_s3_client(region_name) resp = s3_client.delete_objects(Bucket=bucket_name, Delete={"Objects": object_list}) - return len(resp.get("Deleted", [])) \ No newline at end of file + return len(resp.get("Deleted", [])) diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py index 5171fbd617..cda5c52535 100644 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py @@ -199,4 +199,4 @@ (reporting_fiscal_quarter = 4 AND quarter_format_flag IS TRUE) OR (reporting_fiscal_period = 12 AND quarter_format_flag IS FALSE) ) -""" \ No newline at end of file +""" diff --git a/usaspending_api/download/management/commands/generate_spark_download.py b/usaspending_api/download/management/commands/generate_spark_download.py index 7f9ddad98c..74a676f4e2 100644 --- a/usaspending_api/download/management/commands/generate_spark_download.py +++ b/usaspending_api/download/management/commands/generate_spark_download.py @@ -251,4 +251,4 @@ def cleanup(self, path_list: List[Union[Path, str]]) -> None: if isinstance(path, str): path = Path(path) self.logger.info(f"Removing {path}") - path.unlink() \ No newline at end of file + path.unlink() From c1976971d06e1666b354873344b757fa76d34769 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Tue, 11 Feb 2025 12:44:00 -0600 Subject: [PATCH 05/43] [DEV-11770] - Add fields to account download table --- .../download/delta_models/account_download.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/usaspending_api/download/delta_models/account_download.py b/usaspending_api/download/delta_models/account_download.py index 094b9e06f4..ddca2e1bec 100644 --- a/usaspending_api/download/delta_models/account_download.py +++ b/usaspending_api/download/delta_models/account_download.py @@ -1,6 +1,7 @@ ACCOUNT_DOWNLOAD_COLUMNS_TEST = {"award_id_piid": "STRING"} ACCOUNT_DOWNLOAD_COLUMNS = { + "submission_id": "INTEGER NOT NULL", "owning_agency_name": "STRING", "federal_account_symbol": "STRING", "federal_account_name": "STRING", @@ -80,6 +81,10 @@ 
"prime_award_summary_place_of_performance_cd_current": "STRING", "usaspending_permalink": "STRING", "last_modified_date": "DATE", + "reporting_fiscal_period": "INTEGER", + "reporting_fiscal_quarter": "INTEGER", + "reporting_fiscal_year": "INTEGER", + "quarter_format_flag": "BOOLEAN", } account_download_create_sql_string = rf""" @@ -111,11 +116,11 @@ financial_accounts_by_awards.parent_award_id AS parent_award_id_piid, financial_accounts_by_awards.fain AS award_id_fain, financial_accounts_by_awards.uri AS award_id_uri, - award_search.date_signed AS award_base_action_date, - award_search.certified_date AS award_latest_action_date, - award_search.period_of_performance_start_date, - award_search.period_of_performance_current_end_date, - transaction_search.ordering_period_end_date, + TRY_CAST(award_search.date_signed AS DATE) AS award_base_action_date, + TRY_CAST(award_search.certified_date AS DATE) AS award_latest_action_date, + TRY_CAST(award_search.period_of_performance_start_date AS DATE), + TRY_CAST(award_search.period_of_performance_current_end_date AS DATE), + TRY_CAST(transaction_search.ordering_period_end_date AS DATE), transaction_search.idv_type AS idv_type_code, transaction_search.idv_type_description AS idv_type, award_search.description AS prime_award_base_transaction_description, @@ -325,9 +330,11 @@ ) ELSE '' END AS usaspending_permalink, - CAST( - submission_attributes.published_date AS DATE - ) AS last_modified_date + TRY_CAST(submission_attributes.published_date AS DATE) AS last_modified_date, + submission_attributes.reporting_fiscal_period, + submission_attributes.reporting_fiscal_quarter, + submission_attributes.reporting_fiscal_year, + submission_attributes.quarter_format_flag FROM raw.financial_accounts_by_awards INNER JOIN global_temp.submission_attributes AS submission_attributes ON ( From 451e4c9fe9b1e05640556f4e86aaa3c65a8958e5 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Tue, 11 Feb 2025 13:52:45 -0600 Subject: [PATCH 06/43] [DEV-11770] - Add submission_id to account download. --- usaspending_api/download/delta_models/account_download.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/usaspending_api/download/delta_models/account_download.py b/usaspending_api/download/delta_models/account_download.py index ddca2e1bec..5387466832 100644 --- a/usaspending_api/download/delta_models/account_download.py +++ b/usaspending_api/download/delta_models/account_download.py @@ -99,7 +99,8 @@ INSERT OVERWRITE {{DESTINATION_DATABASE}}.{{DESTINATION_TABLE}} ( {",".join(list(ACCOUNT_DOWNLOAD_COLUMNS))} ) - SELECT + SELECT\ + financial_accounts_by_awards.submission_id, toptier_agency.name AS owning_agency_name, federal_account.federal_account_code AS federal_account_symbol, federal_account.account_title AS federal_account_name, From 44ba832a7729574be3c497ed543b645c0ceead28 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Tue, 11 Feb 2025 14:16:43 -0600 Subject: [PATCH 07/43] [DEV-11770] - Fix syntax error. 
--- usaspending_api/download/delta_models/account_download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usaspending_api/download/delta_models/account_download.py b/usaspending_api/download/delta_models/account_download.py index 5387466832..6535b2bd21 100644 --- a/usaspending_api/download/delta_models/account_download.py +++ b/usaspending_api/download/delta_models/account_download.py @@ -99,7 +99,7 @@ INSERT OVERWRITE {{DESTINATION_DATABASE}}.{{DESTINATION_TABLE}} ( {",".join(list(ACCOUNT_DOWNLOAD_COLUMNS))} ) - SELECT\ + SELECT financial_accounts_by_awards.submission_id, toptier_agency.name AS owning_agency_name, federal_account.federal_account_code AS federal_account_symbol, From 97a39326587418b7344da1c379c57ca3ef381506 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Fri, 14 Feb 2025 11:07:04 -0600 Subject: [PATCH 08/43] [DEV-11770- - update account download table load query to remove fiscal year filter --- .../download/delta_models/account_download.py | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/usaspending_api/download/delta_models/account_download.py b/usaspending_api/download/delta_models/account_download.py index 6535b2bd21..40f43ca196 100644 --- a/usaspending_api/download/delta_models/account_download.py +++ b/usaspending_api/download/delta_models/account_download.py @@ -1,5 +1,3 @@ -ACCOUNT_DOWNLOAD_COLUMNS_TEST = {"award_id_piid": "STRING"} - ACCOUNT_DOWNLOAD_COLUMNS = { "submission_id": "INTEGER NOT NULL", "owning_agency_name": "STRING", @@ -117,11 +115,11 @@ financial_accounts_by_awards.parent_award_id AS parent_award_id_piid, financial_accounts_by_awards.fain AS award_id_fain, financial_accounts_by_awards.uri AS award_id_uri, - TRY_CAST(award_search.date_signed AS DATE) AS award_base_action_date, - TRY_CAST(award_search.certified_date AS DATE) AS award_latest_action_date, - TRY_CAST(award_search.period_of_performance_start_date AS DATE), - TRY_CAST(award_search.period_of_performance_current_end_date AS DATE), - TRY_CAST(transaction_search.ordering_period_end_date AS DATE), + CAST(award_search.date_signed AS DATE) AS award_base_action_date, + CAST(award_search.certified_date AS DATE) AS award_latest_action_date, + CAST(award_search.period_of_performance_start_date AS DATE), + CAST(award_search.period_of_performance_current_end_date AS DATE), + CAST(transaction_search.ordering_period_end_date AS DATE), transaction_search.idv_type AS idv_type_code, transaction_search.idv_type_description AS idv_type, award_search.description AS prime_award_base_transaction_description, @@ -201,7 +199,7 @@ submission_attributes.quarter_format_flag = FALSE AND submission_attributes.reporting_fiscal_period = 12 ) - ) AND submission_attributes.reporting_fiscal_year = 2021 + ) THEN financial_accounts_by_awards.gross_outlay_amount_by_award_cpe ELSE CAST(NULL AS NUMERIC(23, 2)) @@ -217,7 +215,7 @@ submission_attributes.quarter_format_flag = FALSE AND submission_attributes.reporting_fiscal_period = 12 ) - ) AND submission_attributes.reporting_fiscal_year = 2021 + ) THEN financial_accounts_by_awards.ussgl487200_down_adj_pri_ppaid_undel_orders_oblig_refund_cpe ELSE CAST(NULL AS NUMERIC(23, 2)) @@ -233,7 +231,7 @@ submission_attributes.quarter_format_flag = FALSE AND submission_attributes.reporting_fiscal_period = 12 ) - ) AND submission_attributes.reporting_fiscal_year = 2021 + ) THEN financial_accounts_by_awards.ussgl497200_down_adj_pri_paid_deliv_orders_oblig_refund_cpe ELSE CAST(NULL AS NUMERIC(23, 2)) @@ -331,7 +329,7 @@ ) ELSE '' END 
AS usaspending_permalink, - TRY_CAST(submission_attributes.published_date AS DATE) AS last_modified_date, + CAST(submission_attributes.published_date AS DATE) AS last_modified_date, submission_attributes.reporting_fiscal_period, submission_attributes.reporting_fiscal_quarter, submission_attributes.reporting_fiscal_year, From ad2839b2fadeecaf15297311340e29f498e0129b Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Fri, 14 Feb 2025 15:26:29 -0600 Subject: [PATCH 09/43] [DEV-11770] - update partition_column --- usaspending_api/download/delta_models/account_download.py | 2 ++ usaspending_api/etl/management/commands/load_query_to_delta.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/usaspending_api/download/delta_models/account_download.py b/usaspending_api/download/delta_models/account_download.py index 40f43ca196..a1d316a756 100644 --- a/usaspending_api/download/delta_models/account_download.py +++ b/usaspending_api/download/delta_models/account_download.py @@ -1,4 +1,5 @@ ACCOUNT_DOWNLOAD_COLUMNS = { + "financial_accounts_by_awards_id": "INTEGER NOT NULL", "submission_id": "INTEGER NOT NULL", "owning_agency_name": "STRING", "federal_account_symbol": "STRING", @@ -98,6 +99,7 @@ {",".join(list(ACCOUNT_DOWNLOAD_COLUMNS))} ) SELECT + financial_accounts_by_awards.financial_accounts_by_awards_id, financial_accounts_by_awards.submission_id, toptier_agency.name AS owning_agency_name, federal_account.federal_account_code AS federal_account_symbol, diff --git a/usaspending_api/etl/management/commands/load_query_to_delta.py b/usaspending_api/etl/management/commands/load_query_to_delta.py index 45c58f63fc..f237a4f92c 100644 --- a/usaspending_api/etl/management/commands/load_query_to_delta.py +++ b/usaspending_api/etl/management/commands/load_query_to_delta.py @@ -305,7 +305,7 @@ "destination_database": "rpt", "swap_table": "account_download", "swap_schema": "rpt", - "partition_column": "id", + "partition_column": "financial_accounts_by_awards_id", "partition_column_type": "numeric", "is_partition_column_unique": False, "delta_table_create_sql": account_download_create_sql_string, From ffe97c128544e08d15e5c1ebbebdb22555d2cca2 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Fri, 14 Feb 2025 15:58:13 -0600 Subject: [PATCH 10/43] [DEV-11772] - Update load query to filter by year --- .../award_financial/federal_account.py | 98 +++++++++++++++++-- .../management/commands/drop_delta_table.py | 0 2 files changed, 88 insertions(+), 10 deletions(-) delete mode 100644 usaspending_api/etl/management/commands/drop_delta_table.py diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py index cda5c52535..f027811be0 100644 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py @@ -16,9 +16,35 @@ disaster_emergency_fund_code, disaster_emergency_fund_name, SUM(transaction_obligated_amount) AS transaction_obligated_amount, - SUM(gross_outlay_amount_FYB_to_period_end) AS gross_outlay_amount_FYB_to_period_end, - SUM(USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig) AS USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig, - SUM(USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig) AS USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig, + SUM( + CASE + WHEN + 
gross_outlay_amount_FYB_to_period_end and reporting_fiscal_year = 2021 + THEN + gross_outlay_amount_FYB_to_period_end + ElSE CAST(NULL as NUMERIC(23, 2)) + END + ) AS gross_outlay_amount_FYB_to_period_end, + SUM( + CASE + WHEN + USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig + and reporting_fiscal_year = 2021 + THEN + USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig + ElSE CAST(NULL as NUMERIC(23, 2)) + END + ) AS USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig, + SUM( + CASE + WHEN + USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig + and reporting_fiscal_year = 2021 + THEN + USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig + ElSE CAST(NULL as NUMERIC(23, 2)) + END + ) AS USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig, award_unique_key, award_id_piid, parent_award_id_piid, @@ -170,14 +196,66 @@ prime_award_summary_place_of_performance_cd_current HAVING -- All of the HAVING statements below ensure we return only non-zero sum records - SUM(gross_outlay_amount_fyb_to_period_end) > 0 - OR SUM(gross_outlay_amount_fyb_to_period_end) < 0 - OR SUM(ussgl487200_downward_adj_prior_year_prepaid_undeliv_order_oblig) < 0 - OR SUM(ussgl487200_downward_adj_prior_year_prepaid_undeliv_order_oblig) > 0 - OR SUM(ussgl497200_downward_adj_of_prior_year_paid_deliv_orders_oblig) < 0 - OR SUM(ussgl497200_downward_adj_of_prior_year_paid_deliv_orders_oblig) > 0 + SUM( + CASE + WHEN + gross_outlay_amount_FYB_to_period_end and reporting_fiscal_year = 2021 + THEN + gross_outlay_amount_FYB_to_period_end + ElSE CAST(NULL as NUMERIC(23, 2)) + END + ) > 0 + OR SUM( + CASE + WHEN + gross_outlay_amount_FYB_to_period_end and reporting_fiscal_year = 2021 + THEN + gross_outlay_amount_FYB_to_period_end + ElSE CAST(NULL as NUMERIC(23, 2)) + END + ) < 0 + OR SUM( + CASE + WHEN + USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig + and reporting_fiscal_year = 2021 + THEN + USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig + ElSE CAST(NULL as NUMERIC(23, 2)) + END + ) < 0 + OR SUM( + CASE + WHEN + USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig + and reporting_fiscal_year = 2021 + THEN + USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig + ElSE CAST(NULL as NUMERIC(23, 2)) + END + ) > 0 + OR SUM( + CASE + WHEN + USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig + and reporting_fiscal_year = 2021 + THEN + USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig + ElSE CAST(NULL as NUMERIC(23, 2)) + END + ) < 0 + OR SUM( + CASE + WHEN + USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig + and reporting_fiscal_year = 2021 + THEN + USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig + ElSE CAST(NULL as NUMERIC(23, 2)) + END + ) > 0 OR SUM(transaction_obligated_amount) > 0 - OR SUM(transaction_obligated_amount) < 0 + OR SUM(transaction_obligated_amount) < 0 """ diff --git a/usaspending_api/etl/management/commands/drop_delta_table.py b/usaspending_api/etl/management/commands/drop_delta_table.py deleted file mode 100644 index e69de29bb2..0000000000 From 5eea3467fa57082d543e291e1a8a4154d286b2cb Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Tue, 18 Feb 2025 09:18:37 -0600 Subject: [PATCH 11/43] [DEV-11771] - Fix merge conflict and whitespace --- usaspending_api/download/delta_models/account_download.py | 8 -------- .../delta_downloads/award_financial/federal_account.py | 2 +- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git 
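The year filter added above repeats the same CASE expression in the SELECT list and again in every HAVING branch, and its WHEN clause folds the amount column itself into the boolean test (WHEN gross_outlay_amount_FYB_to_period_end and reporting_fiscal_year = 2021); a later patch in this series rewrites the condition to test only reporting_fiscal_year. As an illustration only, not how the repository builds these strings, the expression could be generated once in Python and interpolated wherever it is needed:

    # Illustrative sketch, not repository code: build the filtered SUM(...) once
    # and reuse it, so the SELECT and HAVING copies cannot drift apart.
    def filtered_sum(column: str, predicate: str = "reporting_fiscal_year = 2021") -> str:
        return (
            f"SUM(CASE WHEN {predicate} THEN {column} "
            f"ELSE CAST(NULL AS NUMERIC(23, 2)) END)"
        )

    outlay_sum = filtered_sum("gross_outlay_amount_FYB_to_period_end")
    select_item = f"{outlay_sum} AS gross_outlay_amount_FYB_to_period_end"
    having_item = f"{outlay_sum} > 0 OR {outlay_sum} < 0"
    print(select_item)
    print(having_item)
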
a/usaspending_api/download/delta_models/account_download.py b/usaspending_api/download/delta_models/account_download.py index bca14261be..a1d316a756 100644 --- a/usaspending_api/download/delta_models/account_download.py +++ b/usaspending_api/download/delta_models/account_download.py @@ -117,19 +117,11 @@ financial_accounts_by_awards.parent_award_id AS parent_award_id_piid, financial_accounts_by_awards.fain AS award_id_fain, financial_accounts_by_awards.uri AS award_id_uri, -<<<<<<< HEAD - TRY_CAST(award_search.date_signed AS DATE) AS award_base_action_date, - TRY_CAST(award_search.certified_date AS DATE) AS award_latest_action_date, - TRY_CAST(award_search.period_of_performance_start_date AS DATE), - TRY_CAST(award_search.period_of_performance_current_end_date AS DATE), - TRY_CAST(transaction_search.ordering_period_end_date AS DATE), -======= CAST(award_search.date_signed AS DATE) AS award_base_action_date, CAST(award_search.certified_date AS DATE) AS award_latest_action_date, CAST(award_search.period_of_performance_start_date AS DATE), CAST(award_search.period_of_performance_current_end_date AS DATE), CAST(transaction_search.ordering_period_end_date AS DATE), ->>>>>>> ftr/dev-11770-unflitered-account-download-delta-table transaction_search.idv_type AS idv_type_code, transaction_search.idv_type_description AS idv_type, award_search.description AS prime_award_base_transaction_description, diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py index f027811be0..286f1f72d4 100644 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py @@ -255,7 +255,7 @@ END ) > 0 OR SUM(transaction_obligated_amount) > 0 - OR SUM(transaction_obligated_amount) < 0 + OR SUM(transaction_obligated_amount) < 0 """ From cfc27c4c134f5a92928bb9af628a6864624cc62c Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Tue, 18 Feb 2025 10:48:14 -0600 Subject: [PATCH 12/43] [DEV-11771] - Fix federal account sql --- .../award_financial/federal_account.py | 24 +++++++------------ 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py index 286f1f72d4..b42dda783d 100644 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py @@ -19,7 +19,7 @@ SUM( CASE WHEN - gross_outlay_amount_FYB_to_period_end and reporting_fiscal_year = 2021 + reporting_fiscal_year = 2021 THEN gross_outlay_amount_FYB_to_period_end ElSE CAST(NULL as NUMERIC(23, 2)) @@ -28,8 +28,7 @@ SUM( CASE WHEN - USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig - and reporting_fiscal_year = 2021 + reporting_fiscal_year = 2021 THEN USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig ElSE CAST(NULL as NUMERIC(23, 2)) @@ -38,8 +37,7 @@ SUM( CASE WHEN - USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig - and reporting_fiscal_year = 2021 + reporting_fiscal_year = 2021 THEN USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig ElSE CAST(NULL as NUMERIC(23, 2)) @@ -199,7 +197,7 @@ SUM( CASE 
WHEN - gross_outlay_amount_FYB_to_period_end and reporting_fiscal_year = 2021 + reporting_fiscal_year = 2021 THEN gross_outlay_amount_FYB_to_period_end ElSE CAST(NULL as NUMERIC(23, 2)) @@ -208,7 +206,7 @@ OR SUM( CASE WHEN - gross_outlay_amount_FYB_to_period_end and reporting_fiscal_year = 2021 + reporting_fiscal_year = 2021 THEN gross_outlay_amount_FYB_to_period_end ElSE CAST(NULL as NUMERIC(23, 2)) @@ -217,8 +215,7 @@ OR SUM( CASE WHEN - USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig - and reporting_fiscal_year = 2021 + reporting_fiscal_year = 2021 THEN USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig ElSE CAST(NULL as NUMERIC(23, 2)) @@ -227,8 +224,7 @@ OR SUM( CASE WHEN - USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig - and reporting_fiscal_year = 2021 + reporting_fiscal_year = 2021 THEN USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig ElSE CAST(NULL as NUMERIC(23, 2)) @@ -237,8 +233,7 @@ OR SUM( CASE WHEN - USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig - and reporting_fiscal_year = 2021 + reporting_fiscal_year = 2021 THEN USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig ElSE CAST(NULL as NUMERIC(23, 2)) @@ -247,8 +242,7 @@ OR SUM( CASE WHEN - USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig - and reporting_fiscal_year = 2021 + reporting_fiscal_year = 2021 THEN USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig ElSE CAST(NULL as NUMERIC(23, 2)) From 71ee6d0a9a63cf359db827aa27fab73e84dc39a3 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Mon, 24 Feb 2025 12:17:03 -0600 Subject: [PATCH 13/43] [DEV-11771] - Move filters to download query --- .../download/delta_models/account_download.py | 51 +-------- .../award_financial/federal_account.py | 108 +++++++++++++++--- 2 files changed, 97 insertions(+), 62 deletions(-) diff --git a/usaspending_api/download/delta_models/account_download.py b/usaspending_api/download/delta_models/account_download.py index a1d316a756..a0242f7cbb 100644 --- a/usaspending_api/download/delta_models/account_download.py +++ b/usaspending_api/download/delta_models/account_download.py @@ -190,54 +190,9 @@ treasury_appropriation_account.budget_function_title AS budget_function, treasury_appropriation_account.budget_subfunction_title AS budget_subfunction, financial_accounts_by_awards.transaction_obligated_amount AS transaction_obligated_amount, - CASE - WHEN - ( - ( - submission_attributes.quarter_format_flag = TRUE - AND submission_attributes.reporting_fiscal_quarter = 4 - ) - OR ( - submission_attributes.quarter_format_flag = FALSE - AND submission_attributes.reporting_fiscal_period = 12 - ) - ) - THEN - financial_accounts_by_awards.gross_outlay_amount_by_award_cpe - ELSE CAST(NULL AS NUMERIC(23, 2)) - END AS gross_outlay_amount_fyb_to_period_end, - CASE - WHEN - ( - ( - submission_attributes.quarter_format_flag = TRUE - AND submission_attributes.reporting_fiscal_quarter = 4 - ) - OR ( - submission_attributes.quarter_format_flag = FALSE - AND submission_attributes.reporting_fiscal_period = 12 - ) - ) - THEN - financial_accounts_by_awards.ussgl487200_down_adj_pri_ppaid_undel_orders_oblig_refund_cpe - ELSE CAST(NULL AS NUMERIC(23, 2)) - END AS ussgl487200_downward_adj_prior_year_prepaid_undeliv_order_oblig, - CASE - WHEN - ( - ( - submission_attributes.quarter_format_flag = TRUE - AND submission_attributes.reporting_fiscal_quarter = 4 - ) - OR ( - submission_attributes.quarter_format_flag = FALSE - AND submission_attributes.reporting_fiscal_period 
= 12 - ) - ) - THEN - financial_accounts_by_awards.ussgl497200_down_adj_pri_paid_deliv_orders_oblig_refund_cpe - ELSE CAST(NULL AS NUMERIC(23, 2)) - END AS ussgl497200_downward_adj_of_prior_year_paid_deliv_orders_oblig, + financial_accounts_by_awards.gross_outlay_amount_by_award_cpe as gross_outlay_amount_fyb_to_period_end, + financial_accounts_by_awards.ussgl487200_down_adj_pri_ppaid_undel_orders_oblig_refund_cpe as ussgl487200_downward_adj_prior_year_prepaid_undeliv_order_oblig, + financial_accounts_by_awards.ussgl497200_down_adj_pri_paid_deliv_orders_oblig_refund_cpe as ussgl497200_downward_adj_of_prior_year_paid_deliv_orders_oblig, EXTRACT( YEAR FROM (award_search.date_signed) + INTERVAL '3 months' ) AS award_base_action_date_fiscal_year, diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py index b42dda783d..de9fc35208 100644 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py @@ -18,8 +18,17 @@ SUM(transaction_obligated_amount) AS transaction_obligated_amount, SUM( CASE - WHEN - reporting_fiscal_year = 2021 + WHEN + ( + ( + quarter_format_flag = TRUE + AND reporting_fiscal_quarter = 4 + ) + OR ( + quarter_format_flag = FALSE + AND reporting_fiscal_period = 12 + ) + ) AND reporting_fiscal_year = 2021 THEN gross_outlay_amount_FYB_to_period_end ElSE CAST(NULL as NUMERIC(23, 2)) @@ -28,7 +37,16 @@ SUM( CASE WHEN - reporting_fiscal_year = 2021 + ( + ( + quarter_format_flag = TRUE + AND reporting_fiscal_quarter = 4 + ) + OR ( + quarter_format_flag = FALSE + AND reporting_fiscal_period = 12 + ) + ) AND reporting_fiscal_year = 2021 THEN USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig ElSE CAST(NULL as NUMERIC(23, 2)) @@ -37,7 +55,16 @@ SUM( CASE WHEN - reporting_fiscal_year = 2021 + ( + ( + quarter_format_flag = TRUE + AND reporting_fiscal_quarter = 4 + ) + OR ( + quarter_format_flag = FALSE + AND reporting_fiscal_period = 12 + ) + ) AND reporting_fiscal_year = 2021 THEN USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig ElSE CAST(NULL as NUMERIC(23, 2)) @@ -197,7 +224,16 @@ SUM( CASE WHEN - reporting_fiscal_year = 2021 + ( + ( + quarter_format_flag = TRUE + AND reporting_fiscal_quarter = 4 + ) + OR ( + quarter_format_flag = FALSE + AND reporting_fiscal_period = 12 + ) + ) AND reporting_fiscal_year = 2021 THEN gross_outlay_amount_FYB_to_period_end ElSE CAST(NULL as NUMERIC(23, 2)) @@ -205,8 +241,17 @@ ) > 0 OR SUM( CASE - WHEN - reporting_fiscal_year = 2021 + WHEN + ( + ( + quarter_format_flag = TRUE + AND reporting_fiscal_quarter = 4 + ) + OR ( + quarter_format_flag = FALSE + AND reporting_fiscal_period = 12 + ) + ) AND reporting_fiscal_year = 2021 THEN gross_outlay_amount_FYB_to_period_end ElSE CAST(NULL as NUMERIC(23, 2)) @@ -214,8 +259,17 @@ ) < 0 OR SUM( CASE - WHEN - reporting_fiscal_year = 2021 + WHEN + ( + ( + quarter_format_flag = TRUE + AND reporting_fiscal_quarter = 4 + ) + OR ( + quarter_format_flag = FALSE + AND reporting_fiscal_period = 12 + ) + ) AND reporting_fiscal_year = 2021 THEN USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig ElSE CAST(NULL as NUMERIC(23, 2)) @@ -224,7 +278,16 @@ OR SUM( CASE WHEN - reporting_fiscal_year = 2021 + ( + ( + quarter_format_flag = TRUE + AND reporting_fiscal_quarter = 4 + ) + OR ( + 
quarter_format_flag = FALSE + AND reporting_fiscal_period = 12 + ) + ) AND reporting_fiscal_year = 2021 THEN USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig ElSE CAST(NULL as NUMERIC(23, 2)) @@ -232,8 +295,17 @@ ) > 0 OR SUM( CASE - WHEN - reporting_fiscal_year = 2021 + WHEN + ( + ( + quarter_format_flag = TRUE + AND reporting_fiscal_quarter = 4 + ) + OR ( + quarter_format_flag = FALSE + AND reporting_fiscal_period = 12 + ) + ) AND reporting_fiscal_year = 2021 THEN USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig ElSE CAST(NULL as NUMERIC(23, 2)) @@ -242,7 +314,16 @@ OR SUM( CASE WHEN - reporting_fiscal_year = 2021 + ( + ( + quarter_format_flag = TRUE + AND reporting_fiscal_quarter = 4 + ) + OR ( + quarter_format_flag = FALSE + AND reporting_fiscal_period = 12 + ) + ) AND reporting_fiscal_year = 2021 THEN USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig ElSE CAST(NULL as NUMERIC(23, 2)) @@ -253,7 +334,6 @@ """ - SUBMISSION_ID_QUERY = """ SELECT submission_id FROM global_temp.submission_attributes From e9691a86a8f82e2359c5d74174fa0974249b2348 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Tue, 25 Feb 2025 09:20:39 -0600 Subject: [PATCH 14/43] [DEV-11770] - Fix white space --- .../download/delta_models/account_download.py | 2 +- .../award_financial/federal_account.py | 60 +++++++++---------- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/usaspending_api/download/delta_models/account_download.py b/usaspending_api/download/delta_models/account_download.py index a0242f7cbb..2414c0bcb3 100644 --- a/usaspending_api/download/delta_models/account_download.py +++ b/usaspending_api/download/delta_models/account_download.py @@ -192,7 +192,7 @@ financial_accounts_by_awards.transaction_obligated_amount AS transaction_obligated_amount, financial_accounts_by_awards.gross_outlay_amount_by_award_cpe as gross_outlay_amount_fyb_to_period_end, financial_accounts_by_awards.ussgl487200_down_adj_pri_ppaid_undel_orders_oblig_refund_cpe as ussgl487200_downward_adj_prior_year_prepaid_undeliv_order_oblig, - financial_accounts_by_awards.ussgl497200_down_adj_pri_paid_deliv_orders_oblig_refund_cpe as ussgl497200_downward_adj_of_prior_year_paid_deliv_orders_oblig, + financial_accounts_by_awards.ussgl497200_down_adj_pri_paid_deliv_orders_oblig_refund_cpe as ussgl497200_downward_adj_of_prior_year_paid_deliv_orders_oblig, EXTRACT( YEAR FROM (award_search.date_signed) + INTERVAL '3 months' ) AS award_base_action_date_fiscal_year, diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py index de9fc35208..546bf328d2 100644 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py @@ -17,7 +17,7 @@ disaster_emergency_fund_name, SUM(transaction_obligated_amount) AS transaction_obligated_amount, SUM( - CASE + CASE WHEN ( ( @@ -29,14 +29,14 @@ AND reporting_fiscal_period = 12 ) ) AND reporting_fiscal_year = 2021 - THEN + THEN gross_outlay_amount_FYB_to_period_end ElSE CAST(NULL as NUMERIC(23, 2)) - END + END ) AS gross_outlay_amount_FYB_to_period_end, SUM( - CASE - WHEN + CASE + WHEN ( ( quarter_format_flag = TRUE @@ -47,14 +47,14 @@ AND reporting_fiscal_period = 12 ) ) AND reporting_fiscal_year = 2021 - THEN + THEN 
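The quarter_format_flag condition repeated throughout the two queries above encodes "this row comes from the final submission period of its fiscal year": quarterly reporters close the year at quarter 4, monthly reporters at period 12. Restated as plain Python for readability; the field names mirror the submission_attributes columns, and the function itself is illustrative, not repository code.

    # Illustrative sketch, not repository code.
    def is_final_period_of_fiscal_year(
        quarter_format_flag: bool,
        reporting_fiscal_quarter: int,
        reporting_fiscal_period: int,
    ) -> bool:
        # Quarterly reporters close the year at Q4; monthly reporters at period 12.
        return (quarter_format_flag and reporting_fiscal_quarter == 4) or (
            not quarter_format_flag and reporting_fiscal_period == 12
        )

    assert is_final_period_of_fiscal_year(True, 4, 7)
    assert is_final_period_of_fiscal_year(False, 3, 12)
    assert not is_final_period_of_fiscal_year(False, 2, 6)
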
USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig ElSE CAST(NULL as NUMERIC(23, 2)) - END + END ) AS USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig, SUM( - CASE - WHEN + CASE + WHEN ( ( quarter_format_flag = TRUE @@ -65,7 +65,7 @@ AND reporting_fiscal_period = 12 ) ) AND reporting_fiscal_year = 2021 - THEN + THEN USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig ElSE CAST(NULL as NUMERIC(23, 2)) END @@ -129,7 +129,7 @@ national_interest_action_code, national_interest_action, usaspending_permalink, - MAX(last_modified_date) + MAX(last_modified_date) FROM rpt.account_download WHERE ( @@ -146,9 +146,9 @@ ) AND reporting_fiscal_year = 2021 ) - ) + ) GROUP BY - owning_agency_name, + owning_agency_name, federal_account_symbol, federal_account_name, agency_identifier_name, @@ -222,8 +222,8 @@ HAVING -- All of the HAVING statements below ensure we return only non-zero sum records SUM( - CASE - WHEN + CASE + WHEN ( ( quarter_format_flag = TRUE @@ -234,13 +234,13 @@ AND reporting_fiscal_period = 12 ) ) AND reporting_fiscal_year = 2021 - THEN + THEN gross_outlay_amount_FYB_to_period_end ElSE CAST(NULL as NUMERIC(23, 2)) END ) > 0 OR SUM( - CASE + CASE WHEN ( ( @@ -252,13 +252,13 @@ AND reporting_fiscal_period = 12 ) ) AND reporting_fiscal_year = 2021 - THEN + THEN gross_outlay_amount_FYB_to_period_end ElSE CAST(NULL as NUMERIC(23, 2)) END ) < 0 OR SUM( - CASE + CASE WHEN ( ( @@ -270,14 +270,14 @@ AND reporting_fiscal_period = 12 ) ) AND reporting_fiscal_year = 2021 - THEN + THEN USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig ElSE CAST(NULL as NUMERIC(23, 2)) - END + END ) < 0 OR SUM( - CASE - WHEN + CASE + WHEN ( ( quarter_format_flag = TRUE @@ -288,13 +288,13 @@ AND reporting_fiscal_period = 12 ) ) AND reporting_fiscal_year = 2021 - THEN + THEN USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig ElSE CAST(NULL as NUMERIC(23, 2)) - END + END ) > 0 OR SUM( - CASE + CASE WHEN ( ( @@ -306,14 +306,14 @@ AND reporting_fiscal_period = 12 ) ) AND reporting_fiscal_year = 2021 - THEN + THEN USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig ElSE CAST(NULL as NUMERIC(23, 2)) END ) < 0 OR SUM( - CASE - WHEN + CASE + WHEN ( ( quarter_format_flag = TRUE @@ -324,7 +324,7 @@ AND reporting_fiscal_period = 12 ) ) AND reporting_fiscal_year = 2021 - THEN + THEN USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig ElSE CAST(NULL as NUMERIC(23, 2)) END From ac480b3d08b0d3f79e160b4b8313f45828649763 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Tue, 15 Apr 2025 09:32:24 -0500 Subject: [PATCH 15/43] [DEV-12234] - Update account_download schema and table spec --- .../download/delta_models/account_download.py | 181 +++++++++--------- .../commands/load_query_to_delta.py | 10 +- 2 files changed, 100 insertions(+), 91 deletions(-) diff --git a/usaspending_api/download/delta_models/account_download.py b/usaspending_api/download/delta_models/account_download.py index 2414c0bcb3..86a3a5b678 100644 --- a/usaspending_api/download/delta_models/account_download.py +++ b/usaspending_api/download/delta_models/account_download.py @@ -1,94 +1,103 @@ ACCOUNT_DOWNLOAD_COLUMNS = { - "financial_accounts_by_awards_id": "INTEGER NOT NULL", - "submission_id": "INTEGER NOT NULL", - "owning_agency_name": "STRING", - "federal_account_symbol": "STRING", - "federal_account_name": "STRING", - "agency_identifier_name": "STRING", - "program_activity_code": "STRING", - "program_activity_name": "STRING", - "object_class_code": "STRING", - 
"object_class_name": "STRING", - "direct_or_reimbursable_funding_source": "STRING", - "disaster_emergency_fund_code": "STRING", - "disaster_emergency_fund_name": "STRING", - "award_unique_key": "STRING", - "award_id_piid": "STRING", - "parent_award_id_piid": "STRING", - "award_id_fain": "STRING", - "award_id_uri": "STRING", - "award_base_action_date": "DATE", - "award_latest_action_date": "DATE", - "period_of_performance_start_date": "DATE", - "period_of_performance_current_end_date": "DATE", - "ordering_period_end_date": "DATE", - "idv_type_code": "STRING", - "idv_type": "STRING", - "prime_award_base_transaction_description": "STRING", - "awarding_agency_code": "STRING", - "awarding_agency_name": "STRING", - "awarding_subagency_code": "STRING", - "awarding_subagency_name": "STRING", - "awarding_office_code": "STRING", - "awarding_office_name": "STRING", - "funding_agency_code": "STRING", - "funding_agency_name": "STRING", - "funding_sub_agency_code": "STRING", - "funding_sub_agency_name": "STRING", - "funding_office_code": "STRING", - "funding_office_name": "STRING", - "recipient_uei": "STRING", - "recipient_duns": "STRING", - "recipient_name": "STRING", - "recipient_name_raw": "STRING", - "recipient_parent_uei": "STRING", - "recipient_parent_duns": "STRING", - "recipient_parent_name": "STRING", - "recipient_parent_name_raw": "STRING", - "recipient_country": "STRING", - "recipient_state": "STRING", - "recipient_county": "STRING", - "recipient_city": "STRING", - "primary_place_of_performance_country": "STRING", - "primary_place_of_performance_state": "STRING", - "primary_place_of_performance_county": "STRING", - "primary_place_of_performance_zip_code": "STRING", - "cfda_number": "STRING", - "cfda_title": "STRING", - "product_or_service_code": "STRING", - "product_or_service_code_description": "STRING", - "naics_code": "STRING", - "naics_description": "STRING", - "national_interest_action_code": "STRING", - "national_interest_action": "STRING", - "reporting_agency_name": "STRING", - "submission_period": "STRING", - "budget_function": "STRING", - "budget_subfunction": "STRING", - "transaction_obligated_amount": "NUMERIC(23,2)", - "gross_outlay_amount_fyb_to_period_end": "NUMERIC(23,2)", - "ussgl487200_downward_adj_prior_year_prepaid_undeliv_order_oblig": "NUMERIC(23,2)", - "ussgl497200_downward_adj_of_prior_year_paid_deliv_orders_oblig": "NUMERIC(23,2)", - "award_base_action_date_fiscal_year": "INTEGER", - "award_latest_action_date_fiscal_year": "INTEGER", - "award_type_code": "STRING", - "award_type": "STRING", - "prime_award_summary_recipient_cd_original": "STRING", - "prime_award_summary_recipient_cd_current": "STRING", - "recipient_zip_code": "STRING", - "prime_award_summary_place_of_performance_cd_original": "STRING", - "prime_award_summary_place_of_performance_cd_current": "STRING", - "usaspending_permalink": "STRING", - "last_modified_date": "DATE", - "reporting_fiscal_period": "INTEGER", - "reporting_fiscal_quarter": "INTEGER", - "reporting_fiscal_year": "INTEGER", - "quarter_format_flag": "BOOLEAN", + "financial_accounts_by_awards_id": {"delta": "INTEGER NOT NULL", "postgres": "INTEGER NOT NULL"}, + "submission_id": {"delta": "INTEGER NOT NULL", "postgres": "INTEGER NOT NULL"}, + "owning_agency_name": {"delta": "STRING", "postgres": "TEXT"}, + "federal_account_symbol": {"delta": "STRING", "postgres": "TEXT"}, + "federal_account_name": {"delta": "STRING", "postgres": "TEXT"}, + "agency_identifier_name": {"delta": "STRING", "postgres": "TEXT"}, + "program_activity_code": {"delta": 
"STRING", "postgres": "TEXT"}, + "program_activity_name": {"delta": "STRING", "postgres": "TEXT"}, + "object_class_code": {"delta": "STRING", "postgres": "TEXT"}, + "object_class_name": {"delta": "STRING", "postgres": "TEXT"}, + "direct_or_reimbursable_funding_source": {"delta": "STRING", "postgres": "TEXT"}, + "disaster_emergency_fund_code": {"delta": "STRING", "postgres": "TEXT"}, + "disaster_emergency_fund_name": {"delta": "STRING", "postgres": "TEXT"}, + "award_unique_key": {"delta": "STRING", "postgres": "TEXT"}, + "award_id_piid": {"delta": "STRING", "postgres": "TEXT"}, + "parent_award_id_piid": {"delta": "STRING", "postgres": "TEXT"}, + "award_id_fain": {"delta": "STRING", "postgres": "TEXT"}, + "award_id_uri": {"delta": "STRING", "postgres": "TEXT"}, + "award_base_action_date": {"delta": "DATE", "postgres": "DATE"}, + "award_latest_action_date": {"delta": "DATE", "postgres": "DATE"}, + "period_of_performance_start_date": {"delta": "DATE", "postgres": "DATE"}, + "period_of_performance_current_end_date": {"delta": "DATE", "postgres": "DATE"}, + "ordering_period_end_date": {"delta": "DATE", "postgres": "DATE"}, + "idv_type_code": {"delta": "STRING", "postgres": "TEXT"}, + "idv_type": {"delta": "STRING", "postgres": "TEXT"}, + "prime_award_base_transaction_description": {"delta": "STRING", "postgres": "TEXT"}, + "awarding_agency_code": {"delta": "STRING", "postgres": "TEXT"}, + "awarding_agency_name": {"delta": "STRING", "postgres": "TEXT"}, + "awarding_subagency_code": {"delta": "STRING", "postgres": "TEXT"}, + "awarding_subagency_name": {"delta": "STRING", "postgres": "TEXT"}, + "awarding_office_code": {"delta": "STRING", "postgres": "TEXT"}, + "awarding_office_name": {"delta": "STRING", "postgres": "TEXT"}, + "funding_agency_code": {"delta": "STRING", "postgres": "TEXT"}, + "funding_agency_name": {"delta": "STRING", "postgres": "TEXT"}, + "funding_sub_agency_code": {"delta": "STRING", "postgres": "TEXT"}, + "funding_sub_agency_name": {"delta": "STRING", "postgres": "TEXT"}, + "funding_office_code": {"delta": "STRING", "postgres": "TEXT"}, + "funding_office_name": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_uei": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_duns": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_name": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_name_raw": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_parent_uei": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_parent_duns": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_parent_name": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_parent_name_raw": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_country": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_state": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_county": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_city": {"delta": "STRING", "postgres": "TEXT"}, + "primary_place_of_performance_country": {"delta": "STRING", "postgres": "TEXT"}, + "primary_place_of_performance_state": {"delta": "STRING", "postgres": "TEXT"}, + "primary_place_of_performance_county": {"delta": "STRING", "postgres": "TEXT"}, + "primary_place_of_performance_zip_code": {"delta": "STRING", "postgres": "TEXT"}, + "cfda_number": {"delta": "STRING", "postgres": "TEXT"}, + "cfda_title": {"delta": "STRING", "postgres": "TEXT"}, + "product_or_service_code": {"delta": "STRING", "postgres": "TEXT"}, + "product_or_service_code_description": {"delta": "STRING", "postgres": "TEXT"}, + "naics_code": {"delta": "STRING", 
"postgres": "TEXT"}, + "naics_description": {"delta": "STRING", "postgres": "TEXT"}, + "national_interest_action_code": {"delta": "STRING", "postgres": "TEXT"}, + "national_interest_action": {"delta": "STRING", "postgres": "TEXT"}, + "reporting_agency_name": {"delta": "STRING", "postgres": "TEXT"}, + "submission_period": {"delta": "STRING", "postgres": "TEXT"}, + "budget_function": {"delta": "STRING", "postgres": "TEXT"}, + "budget_subfunction": {"delta": "STRING", "postgres": "TEXT"}, + "transaction_obligated_amount": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)"}, + "gross_outlay_amount_fyb_to_period_end": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)"}, + "ussgl487200_downward_adj_prior_year_prepaid_undeliv_order_oblig": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + }, + "ussgl497200_downward_adj_of_prior_year_paid_deliv_orders_oblig": { + "delta": "NUMERIC(23},2)", + "postgres": "NUMERIC(23,2)", + }, + "award_base_action_date_fiscal_year": {"delta": "INTEGER", "postgres": "INTEGER"}, + "award_latest_action_date_fiscal_year": {"delta": "INTEGER", "postgres": "INTEGER"}, + "award_type_code": {"delta": "STRING", "postgres": "TEXT"}, + "award_type": {"delta": "STRING", "postgres": "TEXT"}, + "prime_award_summary_recipient_cd_original": {"delta": "STRING", "postgres": "TEXT"}, + "prime_award_summary_recipient_cd_current": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_zip_code": {"delta": "STRING", "postgres": "TEXT"}, + "prime_award_summary_place_of_performance_cd_original": {"delta": "STRING", "postgres": "TEXT"}, + "prime_award_summary_place_of_performance_cd_current": {"delta": "STRING", "postgres": "TEXT"}, + "usaspending_permalink": {"delta": "STRING", "postgres": "TEXT"}, + "last_modified_date": {"delta": "DATE", "postgres": "DATE"}, + "reporting_fiscal_period": {"delta": "INTEGER", "postgres": "INTEGER"}, + "reporting_fiscal_quarter": {"delta": "INTEGER", "postgres": "INTEGER"}, + "reporting_fiscal_year": {"delta": "INTEGER", "postgres": "INTEGER"}, + "quarter_format_flag": {"delta": "BOOLEAN", "postgres": "BOOLEAN"}, } +ACCOUNT_DOWNLOAD_DELTA_COLUMNS = {k: v["delta"] for k, v in ACCOUNT_DOWNLOAD_COLUMNS.items()} +ACCOUNT_DOWNLOAD_POSTGRES_COLUMNS = {k: v["postgres"] for k, v in ACCOUNT_DOWNLOAD_COLUMNS.items()} + account_download_create_sql_string = rf""" CREATE OR REPLACE TABLE {{DESTINATION_TABLE}} ( - {", ".join([f'{key} {val}' for key, val in ACCOUNT_DOWNLOAD_COLUMNS.items()])} + {", ".join([f'{key} {val}' for key, val in ACCOUNT_DOWNLOAD_DELTA_COLUMNS.items()])} ) USING DELTA LOCATION 's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}' diff --git a/usaspending_api/etl/management/commands/load_query_to_delta.py b/usaspending_api/etl/management/commands/load_query_to_delta.py index f237a4f92c..d39071975c 100644 --- a/usaspending_api/etl/management/commands/load_query_to_delta.py +++ b/usaspending_api/etl/management/commands/load_query_to_delta.py @@ -18,7 +18,7 @@ ) from usaspending_api.disaster.models import CovidFABASpending from usaspending_api.download.delta_models.account_download import ( - ACCOUNT_DOWNLOAD_COLUMNS, + ACCOUNT_DOWNLOAD_POSTGRES_COLUMNS, account_download_create_sql_string, account_download_load_sql_string, ) @@ -303,15 +303,15 @@ "source_database": None, "source_table": None, "destination_database": "rpt", - "swap_table": "account_download", - "swap_schema": "rpt", + "swap_table": None, + "swap_schema": None, "partition_column": "financial_accounts_by_awards_id", 
"partition_column_type": "numeric", "is_partition_column_unique": False, "delta_table_create_sql": account_download_create_sql_string, - "source_schema": ACCOUNT_DOWNLOAD_COLUMNS, + "source_schema": ACCOUNT_DOWNLOAD_POSTGRES_COLUMNS, "custom_schema": None, - "column_names": list(ACCOUNT_DOWNLOAD_COLUMNS), + "column_names": list(ACCOUNT_DOWNLOAD_POSTGRES_COLUMNS), "postgres_seq_name": None, "tsvectors": None, "postgres_partition_spec": None, From 6f99dbcbb13a0e8fe370030237eda5c34f7477ea Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Wed, 16 Apr 2025 14:58:23 -0500 Subject: [PATCH 16/43] [DEV-12234] - fix type in account download sql --- usaspending_api/download/delta_models/account_download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usaspending_api/download/delta_models/account_download.py b/usaspending_api/download/delta_models/account_download.py index 86a3a5b678..4259ec908d 100644 --- a/usaspending_api/download/delta_models/account_download.py +++ b/usaspending_api/download/delta_models/account_download.py @@ -72,7 +72,7 @@ "postgres": "NUMERIC(23,2)", }, "ussgl497200_downward_adj_of_prior_year_paid_deliv_orders_oblig": { - "delta": "NUMERIC(23},2)", + "delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)", }, "award_base_action_date_fiscal_year": {"delta": "INTEGER", "postgres": "INTEGER"}, From 5798b2f9e9a34c730dfedd8d4638ccddf07b4b24 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Thu, 17 Apr 2025 13:19:54 -0500 Subject: [PATCH 17/43] [DEV-12235] - Add generate_postgres_download command --- .../common/helpers/download_csv_strategies.py | 9 +- usaspending_api/common/helpers/sql_helpers.py | 6 + .../award_financial/federal_account.py | 333 +++++++++++++++++- .../commands/generate_postgres_download.py | 193 ++++++++++ .../commands/generate_spark_download.py | 2 +- 5 files changed, 536 insertions(+), 7 deletions(-) create mode 100644 usaspending_api/download/management/commands/generate_postgres_download.py diff --git a/usaspending_api/common/helpers/download_csv_strategies.py b/usaspending_api/common/helpers/download_csv_strategies.py index 99218efb9f..64b0e177ab 100644 --- a/usaspending_api/common/helpers/download_csv_strategies.py +++ b/usaspending_api/common/helpers/download_csv_strategies.py @@ -9,7 +9,7 @@ from usaspending_api.common.csv_helpers import count_rows_in_delimited_file from usaspending_api.common.helpers.s3_helpers import delete_s3_objects, download_s3_object -from usaspending_api.common.helpers.sql_helpers import read_sql_file_to_text +from usaspending_api.common.helpers.sql_helpers import strip_sql_whitespace from usaspending_api.download.filestreaming.download_generation import ( EXCEL_ROW_LIMIT, split_and_zip_data_files, @@ -74,12 +74,11 @@ def __init__(self, logger: logging.Logger, *args, **kwargs): def download_to_csv( self, source_sql, destination_path, destination_file_name, working_dir_path, download_zip_path, source_df=None ): - source_sql = Path(source_sql) start_time = time.perf_counter() self._logger.info(f"Downloading data to {destination_path}") temp_data_file_name = destination_path.parent / (destination_path.name + "_temp") options = FILE_FORMATS[self.file_format]["options"] - export_query = r"\COPY ({}) TO STDOUT {}".format(read_sql_file_to_text(source_sql), options) + export_query = r"\COPY ({}) TO STDOUT {}".format(strip_sql_whitespace(source_sql), options) try: temp_file, temp_file_path = generate_export_query_temp_file(export_query, None, working_dir_path) # Create a separate process to run the PSQL command; wait @@ 
-115,8 +114,8 @@ def download_to_csv( except Exception as e: raise e finally: - Path(temp_file_path).unlink() - return CSVDownloadMetadata([destination_path], row_count) + Path(temp_data_file_name).unlink() + return CSVDownloadMetadata([destination_path], row_count, None) class SparkToCSVStrategy(AbstractToCSVStrategy): diff --git a/usaspending_api/common/helpers/sql_helpers.py b/usaspending_api/common/helpers/sql_helpers.py index eb53872cac..429fad6685 100644 --- a/usaspending_api/common/helpers/sql_helpers.py +++ b/usaspending_api/common/helpers/sql_helpers.py @@ -41,6 +41,12 @@ def read_sql_file_to_text(file_path: Path) -> str: return p.sub(" ", str(file_path.read_text().replace("\n", " "))) +def strip_sql_whitespace(query: str) -> str: + """Open file and return text with most whitespace removed""" + p = re.compile(r"\s\s+") + return p.sub(" ", str(query.replace("\n", " "))) + + def read_sql_file(file_path): # Read in SQL file and extract commands into a list _, file_extension = os.path.splitext(file_path) diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py index 546bf328d2..1ad127195d 100644 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py @@ -1,4 +1,4 @@ -DOWNLOAD_QUERY = """ +DELTA_DOWNLOAD_QUERY = """ SELECT owning_agency_name, CONCAT_WS('; ', COLLECT_SET(reporting_agency_name)) AS reporting_agency_name, @@ -352,3 +352,334 @@ (reporting_fiscal_period = 12 AND quarter_format_flag IS FALSE) ) """ + +POSTGRES_DOWNLOAD_QUERY = """ + SELECT + owning_agency_name, + STRING_AGG(reporting_agency_name, '; ') AS reporting_agency_name, + submission_period, + federal_account_symbol, + federal_account_name, + agency_identifier_name, + STRING_AGG(budget_function, '; ') AS budget_function, + STRING_AGG(budget_subfunction, '; ') AS budget_subfunction, + program_activity_code, + program_activity_name, + object_class_code, + object_class_name, + direct_or_reimbursable_funding_source, + disaster_emergency_fund_code, + disaster_emergency_fund_name, + SUM(transaction_obligated_amount) AS transaction_obligated_amount, + SUM( + CASE + WHEN + ( + ( + quarter_format_flag = TRUE + AND reporting_fiscal_quarter = 4 + ) + OR ( + quarter_format_flag = FALSE + AND reporting_fiscal_period = 12 + ) + ) AND reporting_fiscal_year = 2021 + THEN + gross_outlay_amount_FYB_to_period_end + ElSE CAST(NULL as NUMERIC(23, 2)) + END + ) AS gross_outlay_amount_FYB_to_period_end, + SUM( + CASE + WHEN + ( + ( + quarter_format_flag = TRUE + AND reporting_fiscal_quarter = 4 + ) + OR ( + quarter_format_flag = FALSE + AND reporting_fiscal_period = 12 + ) + ) AND reporting_fiscal_year = 2021 + THEN + USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig + ElSE CAST(NULL as NUMERIC(23, 2)) + END + ) AS USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig, + SUM( + CASE + WHEN + ( + ( + quarter_format_flag = TRUE + AND reporting_fiscal_quarter = 4 + ) + OR ( + quarter_format_flag = FALSE + AND reporting_fiscal_period = 12 + ) + ) AND reporting_fiscal_year = 2021 + THEN + USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig + ElSE CAST(NULL as NUMERIC(23, 2)) + END + ) AS USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig, + award_unique_key, + award_id_piid, + parent_award_id_piid, + 
award_id_fain, + award_id_uri, + award_base_action_date, + award_base_action_date_fiscal_year, + award_latest_action_date, + award_latest_action_date_fiscal_year, + period_of_performance_start_date, + period_of_performance_current_end_date, + ordering_period_end_date, + award_type_code, + award_type, + idv_type_code, + idv_type, + prime_award_base_transaction_description, + awarding_agency_code, + awarding_agency_name, + awarding_subagency_code, + awarding_subagency_name, + awarding_office_code, + awarding_office_name, + funding_agency_code, + funding_agency_name, + funding_sub_agency_code, + funding_sub_agency_name, + funding_office_code, + funding_office_name, + recipient_uei, + recipient_duns, + recipient_name, + recipient_name_raw, + recipient_parent_uei, + recipient_parent_duns, + recipient_parent_name, + recipient_parent_name_raw, + recipient_country, + recipient_state, + recipient_county, + recipient_city, + prime_award_summary_recipient_cd_original, + prime_award_summary_recipient_cd_current, + recipient_zip_code, + primary_place_of_performance_country, + primary_place_of_performance_state, + primary_place_of_performance_county, + prime_award_summary_place_of_performance_cd_original, + prime_award_summary_place_of_performance_cd_current, + primary_place_of_performance_zip_code, + cfda_number, + cfda_title, + product_or_service_code, + product_or_service_code_description, + naics_code, + naics_description, + national_interest_action_code, + national_interest_action, + usaspending_permalink, + MAX(last_modified_date) + FROM temp.account_download_temp + WHERE + ( + ( + ( + reporting_fiscal_period <= 12 + AND NOT quarter_format_flag) + OR ( + reporting_fiscal_quarter <= 4 + AND quarter_format_flag + ) + ) + AND reporting_fiscal_year = 2021 + ) + GROUP BY + owning_agency_name, + federal_account_symbol, + federal_account_name, + agency_identifier_name, + program_activity_code, + program_activity_name, + object_class_code, + object_class_name, + direct_or_reimbursable_funding_source, + disaster_emergency_fund_code, + disaster_emergency_fund_name, + award_unique_key, + award_id_piid, + parent_award_id_piid, + award_id_fain, + award_id_uri, + award_base_action_date, + award_latest_action_date, + period_of_performance_start_date, + period_of_performance_current_end_date, + ordering_period_end_date, + idv_type_code, + idv_type, + prime_award_base_transaction_description, + awarding_agency_code, + awarding_agency_name, + awarding_subagency_code, + awarding_subagency_name, + awarding_office_code, + awarding_office_name, + funding_agency_code, + funding_agency_name, + funding_sub_agency_code, + funding_sub_agency_name, + funding_office_code, + funding_office_name, + recipient_uei, + recipient_duns, + recipient_name, + recipient_name_raw, + recipient_parent_uei, + recipient_parent_duns, + recipient_parent_name, + recipient_parent_name_raw, + recipient_country, + recipient_state, + recipient_county, + recipient_city, + primary_place_of_performance_country, + primary_place_of_performance_state, + primary_place_of_performance_county, + primary_place_of_performance_zip_code, + cfda_number, + cfda_title, + product_or_service_code, + product_or_service_code_description, + naics_code, + naics_description, + national_interest_action_code, + national_interest_action, + submission_period, + award_type_code, + award_type, + recipient_zip_code, + award_base_action_date_fiscal_year, + award_latest_action_date_fiscal_year, + usaspending_permalink, + prime_award_summary_recipient_cd_original, + 
prime_award_summary_recipient_cd_current, + prime_award_summary_place_of_performance_cd_original, + prime_award_summary_place_of_performance_cd_current + HAVING + SUM( + CASE + WHEN + ( + ( + quarter_format_flag = TRUE + AND reporting_fiscal_quarter = 4 + ) + OR ( + quarter_format_flag = FALSE + AND reporting_fiscal_period = 12 + ) + ) AND reporting_fiscal_year = 2021 + THEN + gross_outlay_amount_FYB_to_period_end + ElSE CAST(NULL as NUMERIC(23, 2)) + END + ) > 0 + OR SUM( + CASE + WHEN + ( + ( + quarter_format_flag = TRUE + AND reporting_fiscal_quarter = 4 + ) + OR ( + quarter_format_flag = FALSE + AND reporting_fiscal_period = 12 + ) + ) AND reporting_fiscal_year = 2021 + THEN + gross_outlay_amount_FYB_to_period_end + ElSE CAST(NULL as NUMERIC(23, 2)) + END + ) < 0 + OR SUM( + CASE + WHEN + ( + ( + quarter_format_flag = TRUE + AND reporting_fiscal_quarter = 4 + ) + OR ( + quarter_format_flag = FALSE + AND reporting_fiscal_period = 12 + ) + ) AND reporting_fiscal_year = 2021 + THEN + USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig + ElSE CAST(NULL as NUMERIC(23, 2)) + END + ) < 0 + OR SUM( + CASE + WHEN + ( + ( + quarter_format_flag = TRUE + AND reporting_fiscal_quarter = 4 + ) + OR ( + quarter_format_flag = FALSE + AND reporting_fiscal_period = 12 + ) + ) AND reporting_fiscal_year = 2021 + THEN + USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig + ElSE CAST(NULL as NUMERIC(23, 2)) + END + ) > 0 + OR SUM( + CASE + WHEN + ( + ( + quarter_format_flag = TRUE + AND reporting_fiscal_quarter = 4 + ) + OR ( + quarter_format_flag = FALSE + AND reporting_fiscal_period = 12 + ) + ) AND reporting_fiscal_year = 2021 + THEN + USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig + ElSE CAST(NULL as NUMERIC(23, 2)) + END + ) < 0 + OR SUM( + CASE + WHEN + ( + ( + quarter_format_flag = TRUE + AND reporting_fiscal_quarter = 4 + ) + OR ( + quarter_format_flag = FALSE + AND reporting_fiscal_period = 12 + ) + ) AND reporting_fiscal_year = 2021 + THEN + USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig + ElSE CAST(NULL as NUMERIC(23, 2)) + END + ) > 0 + OR SUM(transaction_obligated_amount) > 0 + OR SUM(transaction_obligated_amount) < 0 +""" diff --git a/usaspending_api/download/management/commands/generate_postgres_download.py b/usaspending_api/download/management/commands/generate_postgres_download.py new file mode 100644 index 0000000000..0340b9d4bc --- /dev/null +++ b/usaspending_api/download/management/commands/generate_postgres_download.py @@ -0,0 +1,193 @@ +import json +import logging +import os +import traceback +from logging import Logger +from pathlib import Path +from typing import Optional, Dict, Tuple, Type, List, Union + +from django.conf import settings +from django.core.management.base import BaseCommand +from django.utils.functional import cached_property + +from usaspending_api.common.exceptions import InvalidParameterException +from usaspending_api.common.helpers.dict_helpers import order_nested_object +from usaspending_api.common.helpers.download_csv_strategies import PostgresToCSVStrategy +from usaspending_api.common.helpers.s3_helpers import upload_download_file_to_s3 +from usaspending_api.download.filestreaming.download_generation import build_data_file_name +from usaspending_api.download.filestreaming.download_source import DownloadSource +from usaspending_api.download.management.commands.delta_downloads.award_financial import federal_account +from usaspending_api.download.download_utils import create_unique_filename +from 
usaspending_api.download.lookups import JOB_STATUS_DICT, FILE_FORMATS, VALUE_MAPPINGS +from usaspending_api.download.models import DownloadJob +from usaspending_api.download.v2.request_validations import AccountDownloadValidator, DownloadValidatorBase + +DOWNLOAD_SPEC = { + "award_financial": { + "federal_account": { + "query": federal_account.POSTGRES_DOWNLOAD_QUERY, + "validator_type": AccountDownloadValidator, + } + } +} + + +class Command(BaseCommand): + + help = "Generate a download zip file based on the provided type and level." + + download_job: DownloadJob + download_level: str + download_query: str + download_source: DownloadSource + download_spec: Dict + download_type: str + download_validator_type: Type[DownloadValidatorBase] + file_format_spec: Dict + file_prefix: str + logger: Logger + should_cleanup: bool + working_dir_path: Path + + def add_arguments(self, parser): + parser.add_argument("--download-type", type=str, required=True, choices=list(DOWNLOAD_SPEC)) + parser.add_argument( + "--download-level", + type=str, + required=True, + choices=set( + download_level + for download_level_list in [DOWNLOAD_SPEC[key] for key in DOWNLOAD_SPEC] + for download_level in download_level_list + ), + ) + parser.add_argument("--file-format", type=str, required=False, choices=list(FILE_FORMATS), default="csv") + parser.add_argument("--file-prefix", type=str, required=False, default="") + + def handle(self, *args, **options): + self.logger = logging.getLogger(__name__) + + # Resolve Parameters + self.download_type = options["download_type"] + self.download_level = options["download_level"] + self.file_prefix = options["file_prefix"] + + if self.download_level not in DOWNLOAD_SPEC[self.download_type].keys(): + raise ValueError( + f'Provided download level of "{self.download_level}" is not supported ' + f'for download type of "{self.download_type}".' 
+ ) + + download_spec = DOWNLOAD_SPEC[self.download_type][self.download_level] + self.file_format_spec = FILE_FORMATS[options["file_format"]] + self.download_query = download_spec["query"] + self.download_validator_type = download_spec["validator_type"] + self.working_dir_path = Path(settings.CSV_LOCAL_PATH) + if not self.working_dir_path.exists(): + self.working_dir_path.mkdir() + self.download_job, self.download_source = self.create_download_job() + self.process_download() + + @cached_property + def json_request(self) -> Dict: + request_data = { + "account_level": "federal_account", + "download_types": ["award_financial"], + "file_format": "csv", + "filters": { + "agency": "all", + "budget_function": "all", + "budget_subfunction": "all", + "federal_account": "all", + "fy": 2021, + "period": 12, + "submission_types": ["award_financial"], + }, + "request_type": "account", + } + validator = self.download_validator_type(request_data) + processed_request = order_nested_object(validator.json_request) + + return processed_request + + @cached_property + def json_request_string(self) -> str: + return json.dumps(self.json_request) + + @cached_property + def download_name(self) -> str: + return self.download_job.file_name.replace(".zip", "") + + def create_download_job(self) -> Tuple[DownloadJob, DownloadSource]: + final_output_zip_name = f"{self.file_prefix}{create_unique_filename(self.json_request)}" + download_job_ready_status = JOB_STATUS_DICT["ready"] + + # Create a download_job object for use by the application + download_job = DownloadJob.objects.create( + job_status_id=download_job_ready_status, + file_name=final_output_zip_name, + json_request=self.json_request_string, + ) + + # TODO: This should be updated to be more dynamic to the download type + download_source = DownloadSource( + VALUE_MAPPINGS[self.download_type]["table_name"], + self.download_level, + self.download_type, + self.json_request.get("agency", "all"), + extra_file_type="", + ) + download_source.file_name = build_data_file_name(download_source, download_job, piid=None, assistance_id=None) + + return download_job, download_source + + def process_download(self): + self.start_download() + files_to_cleanup = [] + try: + to_csv_strategy = PostgresToCSVStrategy(self.logger) + + zip_file_path = self.working_dir_path / f"{self.download_name}.zip" + + csv_metadata = to_csv_strategy.download_to_csv( + self.download_query, + self.working_dir_path / self.download_name, + self.download_name, + self.working_dir_path, + zip_file_path, + ) + files_to_cleanup.extend(csv_metadata.filepaths) + + self.download_job.file_size = os.stat(zip_file_path).st_size + self.download_job.number_of_rows = csv_metadata.number_of_rows + self.download_job.number_of_columns = csv_metadata.number_of_columns + upload_download_file_to_s3(zip_file_path) + except InvalidParameterException as e: + exc_msg = "InvalidParameterException was raised while attempting to process the DownloadJob" + self.fail_download(exc_msg, e) + raise + except Exception as e: + exc_msg = "An exception was raised while attempting to process the DownloadJob" + self.fail_download(exc_msg, e) + raise + self.finish_download() + + def start_download(self) -> None: + self.download_job.job_status_id = JOB_STATUS_DICT["running"] + self.download_job.save() + self.logger.info(f"Starting DownloadJob {self.download_job.download_job_id}") + + def fail_download(self, msg: str, e: Optional[Exception] = None) -> None: + if e: + stack_trace = "".join(traceback.format_exception(type(e), value=e, 
tb=e.__traceback__)) + self.download_job.error_message = f"{msg}:\n{stack_trace}" + else: + self.download_job.error_message = msg + self.logger.error(msg) + self.download_job.job_status_id = JOB_STATUS_DICT["failed"] + self.download_job.save() + + def finish_download(self) -> None: + self.download_job.job_status_id = JOB_STATUS_DICT["finished"] + self.download_job.save() + self.logger.info(f"Finished processing DownloadJob {self.download_job.download_job_id}") diff --git a/usaspending_api/download/management/commands/generate_spark_download.py b/usaspending_api/download/management/commands/generate_spark_download.py index 74a676f4e2..b1a16a4856 100644 --- a/usaspending_api/download/management/commands/generate_spark_download.py +++ b/usaspending_api/download/management/commands/generate_spark_download.py @@ -33,7 +33,7 @@ DOWNLOAD_SPEC = { "award_financial": { "federal_account": { - "query": federal_account.DOWNLOAD_QUERY, + "query": federal_account.DELTA_DOWNLOAD_QUERY, "select_in_formats": [("submission_id", federal_account.SUBMISSION_ID_QUERY)], "validator_type": AccountDownloadValidator, } From c47003e155a60f01fe72a1a2133c2cb6cfe55938 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Thu, 17 Apr 2025 13:59:41 -0500 Subject: [PATCH 18/43] [DEV-12235] - Handle case where downloadjob is None] --- .../filestreaming/download_generation.py | 108 +++++++++--------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/usaspending_api/download/filestreaming/download_generation.py b/usaspending_api/download/filestreaming/download_generation.py index 4853a16a2e..ee49fa9c47 100644 --- a/usaspending_api/download/filestreaming/download_generation.py +++ b/usaspending_api/download/filestreaming/download_generation.py @@ -848,63 +848,63 @@ def execute_psql(temp_sql_file_path, source_path, download_job): kind=SpanKind.INTERNAL, service="bulk-download", ) - - with subprocess_trace as span: - span.set_attributes( - { - "service": "bulk-download", - "resource": str(download_sql), - "span_type": "Internal", - "source_path": str(source_path), - # download job details - "download_job_id": str(download_job.download_job_id), - "download_job_status": str(download_job.job_status.name), - "download_file_name": str(download_job.file_name), - "download_file_size": download_job.file_size if download_job.file_size is not None else 0, - "number_of_rows": download_job.number_of_rows if download_job.number_of_rows is not None else 0, - "number_of_columns": ( - download_job.number_of_columns if download_job.number_of_columns is not None else 0 - ), - "error_message": download_job.error_message if download_job.error_message else "", - "monthly_download": str(download_job.monthly_download), - "json_request": str(download_job.json_request) if download_job.json_request else "", - } - ) - - try: - log_time = time.perf_counter() - temp_env = os.environ.copy() - if download_job and not download_job.monthly_download: - # Since terminating the process isn't guaranteed to end the DB statement, add timeout to client connection - temp_env["PGOPTIONS"] = ( - f"--statement-timeout={settings.DOWNLOAD_DB_TIMEOUT_IN_HOURS}h " - f"--work-mem={settings.DOWNLOAD_DB_WORK_MEM_IN_MB}MB" - ) - - cat_command = subprocess.Popen(["cat", temp_sql_file_path], stdout=subprocess.PIPE) - subprocess.check_output( - ["psql", "-q", "-o", source_path, retrieve_db_string(), "-v", "ON_ERROR_STOP=1"], - stdin=cat_command.stdout, - stderr=subprocess.STDOUT, - env=temp_env, + if download_job: + with subprocess_trace as span: + span.set_attributes( + 
{ + "service": "bulk-download", + "resource": str(download_sql), + "span_type": "Internal", + "source_path": str(source_path), + # download job details + "download_job_id": str(download_job.download_job_id), + "download_job_status": str(download_job.job_status.name), + "download_file_name": str(download_job.file_name), + "download_file_size": download_job.file_size if download_job.file_size is not None else 0, + "number_of_rows": download_job.number_of_rows if download_job.number_of_rows is not None else 0, + "number_of_columns": ( + download_job.number_of_columns if download_job.number_of_columns is not None else 0 + ), + "error_message": download_job.error_message if download_job.error_message else "", + "monthly_download": str(download_job.monthly_download), + "json_request": str(download_job.json_request) if download_job.json_request else "", + } ) - duration = time.perf_counter() - log_time - write_to_log( - message=f"Wrote {os.path.basename(source_path)}, took {duration:.4f} seconds", - download_job=download_job, + try: + log_time = time.perf_counter() + temp_env = os.environ.copy() + if download_job and not download_job.monthly_download: + # Since terminating the process isn't guaranteed to end the DB statement, add timeout to client connection + temp_env["PGOPTIONS"] = ( + f"--statement-timeout={settings.DOWNLOAD_DB_TIMEOUT_IN_HOURS}h " + f"--work-mem={settings.DOWNLOAD_DB_WORK_MEM_IN_MB}MB" ) - except subprocess.CalledProcessError as e: - write_to_log(message=f"PSQL Error: {e.output.decode()}", is_error=True, download_job=download_job) - raise e - except Exception as e: - if not settings.IS_LOCAL: - # Not logging the command as it can contain the database connection string - e.cmd = "[redacted psql command]" - write_to_log(message=e, is_error=True, download_job=download_job) - sql = subprocess.check_output(["cat", temp_sql_file_path]).decode() - write_to_log(message=f"Faulty SQL: {sql}", is_error=True, download_job=download_job) - raise e + + cat_command = subprocess.Popen(["cat", temp_sql_file_path], stdout=subprocess.PIPE) + subprocess.check_output( + ["psql", "-q", "-o", source_path, retrieve_db_string(), "-v", "ON_ERROR_STOP=1"], + stdin=cat_command.stdout, + stderr=subprocess.STDOUT, + env=temp_env, + ) + + duration = time.perf_counter() - log_time + write_to_log( + message=f"Wrote {os.path.basename(source_path)}, took {duration:.4f} seconds", + download_job=download_job, + ) + except subprocess.CalledProcessError as e: + write_to_log(message=f"PSQL Error: {e.output.decode()}", is_error=True, download_job=download_job) + raise e + except Exception as e: + if not settings.IS_LOCAL: + # Not logging the command as it can contain the database connection string + e.cmd = "[redacted psql command]" + write_to_log(message=e, is_error=True, download_job=download_job) + sql = subprocess.check_output(["cat", temp_sql_file_path]).decode() + write_to_log(message=f"Faulty SQL: {sql}", is_error=True, download_job=download_job) + raise e def retrieve_db_string(): From 7b78432b982900958f330a0529e8db3cc90af024 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Wed, 7 May 2025 13:09:44 -0500 Subject: [PATCH 19/43] [DEV-12235] - code style fixes --- .../delta_downloads/award_financial/federal_account.py | 4 ++-- .../management/commands/generate_postgres_download.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py 
b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py index 1ad127195d..6b1dadf74f 100644 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py @@ -486,7 +486,7 @@ usaspending_permalink, MAX(last_modified_date) FROM temp.account_download_temp - WHERE + WHERE ( ( ( @@ -571,7 +571,7 @@ prime_award_summary_recipient_cd_current, prime_award_summary_place_of_performance_cd_original, prime_award_summary_place_of_performance_cd_current - HAVING + HAVING SUM( CASE WHEN diff --git a/usaspending_api/download/management/commands/generate_postgres_download.py b/usaspending_api/download/management/commands/generate_postgres_download.py index 0340b9d4bc..9aad4c476e 100644 --- a/usaspending_api/download/management/commands/generate_postgres_download.py +++ b/usaspending_api/download/management/commands/generate_postgres_download.py @@ -4,7 +4,7 @@ import traceback from logging import Logger from pathlib import Path -from typing import Optional, Dict, Tuple, Type, List, Union +from typing import Optional, Dict, Tuple, Type from django.conf import settings from django.core.management.base import BaseCommand From 689109bf3f3997e6a6f2ecee0a05cca850008d18 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Wed, 7 May 2025 13:22:22 -0500 Subject: [PATCH 20/43] [DEV-12235] - code style fixes --- usaspending_api/common/helpers/s3_helpers.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/usaspending_api/common/helpers/s3_helpers.py b/usaspending_api/common/helpers/s3_helpers.py index d6e1e9b3f9..6a28091628 100644 --- a/usaspending_api/common/helpers/s3_helpers.py +++ b/usaspending_api/common/helpers/s3_helpers.py @@ -4,7 +4,6 @@ import math import time -from boto3.resources.base import ServiceResource from boto3.s3.transfer import TransferConfig, S3Transfer from botocore.exceptions import ClientError from django.conf import settings @@ -61,13 +60,6 @@ def retrieve_s3_bucket_object_list(bucket_name: str) -> List["boto3.resources.fa return bucket_objects -def get_s3_bucket( - bucket_name: str, region_name: str = settings.USASPENDING_AWS_REGION -) -> "boto3.resources.factory.s3.Instance": - s3 = _get_boto3("client", "s3", region_name=region_name) - return s3.Bucket(bucket_name) - - def access_s3_object(bucket_name: str, obj: "boto3.resources.factory.s3.ObjectSummary") -> io.BytesIO: """Return the Bytes of an S3 object""" bucket = get_s3_bucket(bucket_name=bucket_name) From 505e24785ee1c0480cd127096daf9b0e5fdeec3c Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Wed, 4 Jun 2025 16:36:21 -0500 Subject: [PATCH 21/43] [DEV-12574] - WIP - add AccountDownloadDataFrameBuilder --- .../award_financial/federal_account.py | 578 +++++++----------- .../commands/generate_spark_download.py | 85 +-- 2 files changed, 251 insertions(+), 412 deletions(-) diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py index 546bf328d2..79dda391f3 100644 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py @@ -1,354 +1,236 @@ -DOWNLOAD_QUERY = """ - SELECT - owning_agency_name, - CONCAT_WS('; ', COLLECT_SET(reporting_agency_name)) AS 
reporting_agency_name, - submission_period, - federal_account_symbol, - federal_account_name, - agency_identifier_name, - CONCAT_WS('; ', COLLECT_SET(budget_function)) AS budget_function, - CONCAT_WS('; ', COLLECT_SET(budget_subfunction)) AS budget_subfunction, - program_activity_code, - program_activity_name, - object_class_code, - object_class_name, - direct_or_reimbursable_funding_source, - disaster_emergency_fund_code, - disaster_emergency_fund_name, - SUM(transaction_obligated_amount) AS transaction_obligated_amount, - SUM( - CASE - WHEN - ( - ( - quarter_format_flag = TRUE - AND reporting_fiscal_quarter = 4 - ) - OR ( - quarter_format_flag = FALSE - AND reporting_fiscal_period = 12 - ) - ) AND reporting_fiscal_year = 2021 - THEN - gross_outlay_amount_FYB_to_period_end - ElSE CAST(NULL as NUMERIC(23, 2)) - END - ) AS gross_outlay_amount_FYB_to_period_end, - SUM( - CASE - WHEN - ( - ( - quarter_format_flag = TRUE - AND reporting_fiscal_quarter = 4 - ) - OR ( - quarter_format_flag = FALSE - AND reporting_fiscal_period = 12 - ) - ) AND reporting_fiscal_year = 2021 - THEN - USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig - ElSE CAST(NULL as NUMERIC(23, 2)) - END - ) AS USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig, - SUM( - CASE - WHEN - ( - ( - quarter_format_flag = TRUE - AND reporting_fiscal_quarter = 4 - ) - OR ( - quarter_format_flag = FALSE - AND reporting_fiscal_period = 12 - ) - ) AND reporting_fiscal_year = 2021 - THEN - USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig - ElSE CAST(NULL as NUMERIC(23, 2)) - END - ) AS USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig, - award_unique_key, - award_id_piid, - parent_award_id_piid, - award_id_fain, - award_id_uri, - award_base_action_date, - award_base_action_date_fiscal_year, - award_latest_action_date, - award_latest_action_date_fiscal_year, - period_of_performance_start_date, - period_of_performance_current_end_date, - ordering_period_end_date, - award_type_code, - award_type, - idv_type_code, - idv_type, - prime_award_base_transaction_description, - awarding_agency_code, - awarding_agency_name, - awarding_subagency_code, - awarding_subagency_name, - awarding_office_code, - awarding_office_name, - funding_agency_code, - funding_agency_name, - funding_sub_agency_code, - funding_sub_agency_name, - funding_office_code, - funding_office_name, - recipient_uei, - recipient_duns, - recipient_name, - recipient_name_raw, - recipient_parent_uei, - recipient_parent_duns, - recipient_parent_name, - recipient_parent_name_raw, - recipient_country, - recipient_state, - recipient_county, - recipient_city, - prime_award_summary_recipient_cd_original, - prime_award_summary_recipient_cd_current, - recipient_zip_code, - primary_place_of_performance_country, - primary_place_of_performance_state, - primary_place_of_performance_county, - prime_award_summary_place_of_performance_cd_original, - prime_award_summary_place_of_performance_cd_current, - primary_place_of_performance_zip_code, - cfda_number, - cfda_title, - product_or_service_code, - product_or_service_code_description, - naics_code, - naics_description, - national_interest_action_code, - national_interest_action, - usaspending_permalink, - MAX(last_modified_date) - FROM rpt.account_download - WHERE - ( - submission_id IN {} - OR ( +from typing import Any + +from pyspark.sql import DataFrame, SparkSession +from pyspark.sql import functions as sf, Column + + +class AccountDownloadDataFrameBuilder: + + def __init__( + self, + 
reporting_fiscal_year: int, + reporting_fiscal_quarter: int, + reporting_fiscal_period: int, + spark: SparkSession, + ): + self.reporting_fiscal_year = reporting_fiscal_year + self.reporting_fiscal_quarter = reporting_fiscal_quarter + self.reporting_fiscal_period = reporting_fiscal_period + self.df = spark.table("rpt.account_download") + self.groupby_cols = [ + "owning_agency_name", + "federal_account_symbol", + "federal_account_name", + "agency_identifier_name", + "program_activity_code", + "program_activity_name", + "object_class_code", + "object_class_name", + "direct_or_reimbursable_funding_source", + "disaster_emergency_fund_code", + "disaster_emergency_fund_name", + "award_unique_key", + "award_id_piid", + "parent_award_id_piid", + "award_id_fain", + "award_id_uri", + "award_base_action_date", + "award_latest_action_date", + "period_of_performance_start_date", + "period_of_performance_current_end_date", + "ordering_period_end_date", + "idv_type_code", + "idv_type", + "prime_award_base_transaction_description", + "awarding_agency_code", + "awarding_agency_name", + "awarding_subagency_code", + "awarding_subagency_name", + "awarding_office_code", + "awarding_office_name", + "funding_agency_code", + "funding_agency_name", + "funding_sub_agency_code", + "funding_sub_agency_name", + "funding_office_code", + "funding_office_name", + "recipient_uei", + "recipient_duns", + "recipient_name", + "recipient_name_raw", + "recipient_parent_uei", + "recipient_parent_duns", + "recipient_parent_name", + "recipient_parent_name_raw", + "recipient_country", + "recipient_state", + "recipient_county", + "recipient_city", + "primary_place_of_performance_country", + "primary_place_of_performance_state", + "primary_place_of_performance_county", + "primary_place_of_performance_zip_code", + "cfda_number", + "cfda_title", + "product_or_service_code", + "product_or_service_code_description", + "naics_code", + "naics_description", + "national_interest_action_code", + "national_interest_action", + "submission_period", + "award_type_code", + "award_type", + "recipient_zip_code", + "award_base_action_date_fiscal_year", + "award_latest_action_date_fiscal_year", + "usaspending_permalink", + "prime_award_summary_recipient_cd_original", + "prime_award_summary_recipient_cd_current", + "prime_award_summary_place_of_performance_cd_original", + "prime_award_summary_place_of_performance_cd_current", + ] + self.select_cols = [ + "owning_agency_name", + "reporting_agency_name", + "submission_period", + "federal_account_symbol", + "federal_account_name", + "agency_identifier_name", + "budget_function", + "budget_subfunction", + "program_activity_code", + "program_activity_name", + "object_class_code", + "object_class_name", + "direct_or_reimbursable_funding_source", + "disaster_emergency_fund_code", + "disaster_emergency_fund_name", + "transaction_obligated_amount", + "gross_outlay_amount_FYB_to_period_end", + "USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig", + "USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig", + "award_unique_key", + "award_id_piid", + "parent_award_id_piid", + "award_id_fain", + "award_id_uri", + "award_base_action_date", + "award_base_action_date_fiscal_year", + "award_latest_action_date", + "award_latest_action_date_fiscal_year", + "period_of_performance_start_date", + "period_of_performance_current_end_date", + "ordering_period_end_date", + "award_type_code", + "award_type", + "idv_type_code", + "idv_type", + "prime_award_base_transaction_description", + 
"awarding_agency_code", + "awarding_agency_name", + "awarding_subagency_code", + "awarding_subagency_name", + "awarding_office_code", + "awarding_office_name", + "funding_agency_code", + "funding_agency_name", + "funding_sub_agency_code", + "funding_sub_agency_name", + "funding_office_code", + "funding_office_name", + "recipient_uei", + "recipient_duns", + "recipient_name", + "recipient_name_raw", + "recipient_parent_uei", + "recipient_parent_duns", + "recipient_parent_name", + "recipient_parent_name_raw", + "recipient_country", + "recipient_state", + "recipient_county", + "recipient_city", + "prime_award_summary_recipient_cd_original", + "prime_award_summary_recipient_cd_current", + "recipient_zip_code", + "primary_place_of_performance_country", + "primary_place_of_performance_state", + "primary_place_of_performance_county", + "prime_award_summary_place_of_performance_cd_original", + "prime_award_summary_place_of_performance_cd_current", + "primary_place_of_performance_zip_code", + "cfda_number", + "cfda_title", + "product_or_service_code", + "product_or_service_code_description", + "naics_code", + "naics_description", + "national_interest_action_code", + "national_interest_action", + "usaspending_permalink", + "last_modified_date", + ] + + @staticmethod + def collect_concat(col_name: str, concat_str: str = "; ") -> Column: + return sf.concat_ws(concat_str, sf.collect_set(col_name)).alias(col_name) + + def handle_quarter_period(self, col_name: str, otherwise: Any = None) -> Column: + return ( + sf.when( ( - ( - reporting_fiscal_period <= 12 - AND NOT quarter_format_flag) - OR ( - reporting_fiscal_quarter <= 4 - AND quarter_format_flag - ) + (sf.col("quarter_format_flag") == True) + & (sf.col("reporting_fiscal_quarter") == self.reporting_fiscal_quarter) ) - AND reporting_fiscal_year = 2021 + | ( + (sf.col("quarter_format_flag") == False) + & (sf.col("reporting_fiscal_period") == self.reporting_fiscal_period) + ), + sf.col(col_name), ) + .otherwise(otherwise) + .alias(col_name) ) - GROUP BY - owning_agency_name, - federal_account_symbol, - federal_account_name, - agency_identifier_name, - program_activity_code, - program_activity_name, - object_class_code, - object_class_name, - direct_or_reimbursable_funding_source, - disaster_emergency_fund_code, - disaster_emergency_fund_name, - award_unique_key, - award_id_piid, - parent_award_id_piid, - award_id_fain, - award_id_uri, - award_base_action_date, - award_latest_action_date, - period_of_performance_start_date, - period_of_performance_current_end_date, - ordering_period_end_date, - idv_type_code, - idv_type, - prime_award_base_transaction_description, - awarding_agency_code, - awarding_agency_name, - awarding_subagency_code, - awarding_subagency_name, - awarding_office_code, - awarding_office_name, - funding_agency_code, - funding_agency_name, - funding_sub_agency_code, - funding_sub_agency_name, - funding_office_code, - funding_office_name, - recipient_uei, - recipient_duns, - recipient_name, - recipient_name_raw, - recipient_parent_uei, - recipient_parent_duns, - recipient_parent_name, - recipient_parent_name_raw, - recipient_country, - recipient_state, - recipient_county, - recipient_city, - primary_place_of_performance_country, - primary_place_of_performance_state, - primary_place_of_performance_county, - primary_place_of_performance_zip_code, - cfda_number, - cfda_title, - product_or_service_code, - product_or_service_code_description, - naics_code, - naics_description, - national_interest_action_code, - national_interest_action, - 
submission_period, - award_type_code, - award_type, - recipient_zip_code, - award_base_action_date_fiscal_year, - award_latest_action_date_fiscal_year, - usaspending_permalink, - prime_award_summary_recipient_cd_original, - prime_award_summary_recipient_cd_current, - prime_award_summary_place_of_performance_cd_original, - prime_award_summary_place_of_performance_cd_current - HAVING - -- All of the HAVING statements below ensure we return only non-zero sum records - SUM( - CASE - WHEN - ( - ( - quarter_format_flag = TRUE - AND reporting_fiscal_quarter = 4 - ) - OR ( - quarter_format_flag = FALSE - AND reporting_fiscal_period = 12 - ) - ) AND reporting_fiscal_year = 2021 - THEN - gross_outlay_amount_FYB_to_period_end - ElSE CAST(NULL as NUMERIC(23, 2)) - END - ) > 0 - OR SUM( - CASE - WHEN - ( - ( - quarter_format_flag = TRUE - AND reporting_fiscal_quarter = 4 - ) - OR ( - quarter_format_flag = FALSE - AND reporting_fiscal_period = 12 - ) - ) AND reporting_fiscal_year = 2021 - THEN - gross_outlay_amount_FYB_to_period_end - ElSE CAST(NULL as NUMERIC(23, 2)) - END - ) < 0 - OR SUM( - CASE - WHEN - ( - ( - quarter_format_flag = TRUE - AND reporting_fiscal_quarter = 4 - ) - OR ( - quarter_format_flag = FALSE - AND reporting_fiscal_period = 12 - ) - ) AND reporting_fiscal_year = 2021 - THEN - USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig - ElSE CAST(NULL as NUMERIC(23, 2)) - END - ) < 0 - OR SUM( - CASE - WHEN - ( - ( - quarter_format_flag = TRUE - AND reporting_fiscal_quarter = 4 - ) - OR ( - quarter_format_flag = FALSE - AND reporting_fiscal_period = 12 - ) - ) AND reporting_fiscal_year = 2021 - THEN - USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig - ElSE CAST(NULL as NUMERIC(23, 2)) - END - ) > 0 - OR SUM( - CASE - WHEN - ( - ( - quarter_format_flag = TRUE - AND reporting_fiscal_quarter = 4 - ) - OR ( - quarter_format_flag = FALSE - AND reporting_fiscal_period = 12 - ) - ) AND reporting_fiscal_year = 2021 - THEN - USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig - ElSE CAST(NULL as NUMERIC(23, 2)) - END - ) < 0 - OR SUM( - CASE - WHEN - ( - ( - quarter_format_flag = TRUE - AND reporting_fiscal_quarter = 4 - ) - OR ( - quarter_format_flag = FALSE - AND reporting_fiscal_period = 12 - ) - ) AND reporting_fiscal_year = 2021 - THEN - USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig - ElSE CAST(NULL as NUMERIC(23, 2)) - END - ) > 0 - OR SUM(transaction_obligated_amount) > 0 - OR SUM(transaction_obligated_amount) < 0 -""" - -SUBMISSION_ID_QUERY = """ - SELECT submission_id - FROM global_temp.submission_attributes - WHERE (toptier_code, reporting_fiscal_year, reporting_fiscal_period) IN ( - SELECT toptier_code, reporting_fiscal_year, reporting_fiscal_period - FROM global_temp.submission_attributes - WHERE reporting_fiscal_year = 2021 AND - ( - (reporting_fiscal_quarter <= 4 AND quarter_format_flag is true) OR - (reporting_fiscal_period <= 12 AND quarter_format_flag is false) - ) - ORDER BY toptier_code, reporting_fiscal_period desc - ) AND - ( - (reporting_fiscal_quarter = 4 AND quarter_format_flag IS TRUE) OR - (reporting_fiscal_period = 12 AND quarter_format_flag IS FALSE) + @property + def source_df(self): + return ( + self.df.filter( + ( + ( + (sf.col("reporting_fiscal_period") <= self.reporting_fiscal_period) + & (sf.col("quarter_format_flag") == False) + ) + | ( + (sf.col("reporting_fiscal_quarter") <= self.reporting_fiscal_quarter) + & (sf.col("quarter_format_flag") == True) + ) + ) + & (sf.col("reporting_fiscal_year") == 
self.reporting_fiscal_year) ) -""" + .groupBy(self.groupby_cols) + .agg( + *[ + self.collect_concat(col) + for col in ["reporting_agency_name", "budget_function", "budget_subfunction"] + ], + sf.sum("transaction_obligated_amount").alias("transaction_obligated_amount"), + *[ + sf.sum(self.handle_quarter_period(col)).alias(col) + for col in [ + "gross_outlay_amount_FYB_to_period_end", + "USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig", + "USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig", + ] + ], + sf.max(sf.col("last_modified_date")).alias("last_modified_date"), + ) + .filter( + (sf.col("gross_outlay_amount_FYB_to_period_end") != 0) + | (sf.col("USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig") != 0) + | (sf.col("USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig") != 0) + | (sf.col("transaction_obligated_amount") != 0) + ) + .select(self.select_cols) + ) diff --git a/usaspending_api/download/management/commands/generate_spark_download.py b/usaspending_api/download/management/commands/generate_spark_download.py index 74a676f4e2..42be8e6a03 100644 --- a/usaspending_api/download/management/commands/generate_spark_download.py +++ b/usaspending_api/download/management/commands/generate_spark_download.py @@ -12,7 +12,6 @@ from usaspending_api.common.etl.spark import create_ref_temp_views from usaspending_api.common.exceptions import InvalidParameterException -from usaspending_api.common.helpers.dict_helpers import order_nested_object from usaspending_api.common.helpers.download_csv_strategies import SparkToCSVStrategy from usaspending_api.common.helpers.s3_helpers import upload_download_file_to_s3 from usaspending_api.common.helpers.spark_helpers import ( @@ -24,8 +23,9 @@ ) from usaspending_api.download.filestreaming.download_generation import build_data_file_name from usaspending_api.download.filestreaming.download_source import DownloadSource -from usaspending_api.download.management.commands.delta_downloads.award_financial import federal_account -from usaspending_api.download.download_utils import create_unique_filename +from usaspending_api.download.management.commands.delta_downloads.award_financial.federal_account import ( + AccountDownloadDataFrameBuilder, +) from usaspending_api.download.lookups import JOB_STATUS_DICT, FILE_FORMATS, VALUE_MAPPINGS from usaspending_api.download.models import DownloadJob from usaspending_api.download.v2.request_validations import AccountDownloadValidator, DownloadValidatorBase @@ -33,8 +33,7 @@ DOWNLOAD_SPEC = { "award_financial": { "federal_account": { - "query": federal_account.DOWNLOAD_QUERY, - "select_in_formats": [("submission_id", federal_account.SUBMISSION_ID_QUERY)], + "df_builder": AccountDownloadDataFrameBuilder, "validator_type": AccountDownloadValidator, } } @@ -45,6 +44,7 @@ class Command(BaseCommand): help = "Generate a download zip file based on the provided type and level." 
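The lines added just below switch this command from creating its own DownloadJob to looking one up by a new --download-job-id argument and reading the request filters from that row's json_request. A rough sketch of driving the reworked command end to end, assuming the job row is created ahead of time (for example by the download API) and that the file name and filter values shown here are purely hypothetical:

import json

from django.core.management import call_command

from usaspending_api.download.lookups import JOB_STATUS_DICT
from usaspending_api.download.models import DownloadJob

# Create a "ready" job up front; the command moves it to running/finished/failed.
job = DownloadJob.objects.create(
    job_status_id=JOB_STATUS_DICT["ready"],
    file_name="FY2021P12_Account_Breakdown_by_Award.zip",  # hypothetical name
    json_request=json.dumps(
        {
            "account_level": "federal_account",
            "download_types": ["award_financial"],
            "file_format": "csv",
            "filters": {"fy": 2021, "period": 12},  # minimal, hypothetical filter set
            "request_type": "account",
        }
    ),
)

call_command(
    "generate_spark_download",
    "--download-type=award_financial",
    "--download-level=federal_account",
    f"--download-job-id={job.download_job_id}",
)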
+ download_job_id: int download_job: DownloadJob download_level: str download_query: str @@ -73,6 +73,7 @@ def add_arguments(self, parser): for download_level in download_level_list ), ) + parser.add_argument("--download-job-id", type=int, required=True) parser.add_argument("--file-format", type=str, required=False, choices=list(FILE_FORMATS), default="csv") parser.add_argument("--file-prefix", type=str, required=False, default="") parser.add_argument("--skip-local-cleanup", action="store_true") @@ -100,6 +101,7 @@ def handle(self, *args, **options): # Resolve Parameters self.download_type = options["download_type"] self.download_level = options["download_level"] + self.download_job_id = options["download_job_id"] self.file_prefix = options["file_prefix"] self.should_cleanup = not options["skip_local_cleanup"] @@ -111,7 +113,7 @@ def handle(self, *args, **options): download_spec = DOWNLOAD_SPEC[self.download_type][self.download_level] self.file_format_spec = FILE_FORMATS[options["file_format"]] - self.download_query = download_spec["query"] + self.df_builder = download_spec["df_builder"] self.download_validator_type = download_spec["validator_type"] self.jdbc_properties = get_jdbc_connection_properties() self.jdbc_url = get_usas_jdbc_url() @@ -122,68 +124,26 @@ def handle(self, *args, **options): create_ref_temp_views(self.spark) - self.download_job, self.download_source = self.create_download_job() - self.modify_download_query(download_spec["select_in_formats"] or []) + self.download_job, self.download_source = self.get_download_job() self.process_download() if spark_created_by_command: self.spark.stop() - def modify_download_query(self, select_in_formats: List[Tuple[str, str]]) -> None: - formats_to_apply = [] - for select_col, query in select_in_formats: - formats_to_apply.append(tuple(val[select_col] for val in self.spark.sql(query).collect())) - self.download_query = self.download_query.format(*formats_to_apply) - - @cached_property - def json_request(self) -> Dict: - request_data = { - "account_level": "federal_account", - "download_types": ["award_financial"], - "file_format": "csv", - "filters": { - "agency": "all", - "budget_function": "all", - "budget_subfunction": "all", - "federal_account": "all", - "fy": 2021, - "period": 12, - "submission_types": ["award_financial"], - }, - "request_type": "account", - } - validator = self.download_validator_type(request_data) - processed_request = order_nested_object(validator.json_request) - - return processed_request - - @cached_property - def json_request_string(self) -> str: - return json.dumps(self.json_request) - @cached_property def download_name(self) -> str: return self.download_job.file_name.replace(".zip", "") - def create_download_job(self) -> Tuple[DownloadJob, DownloadSource]: - self.logger.info(f"Creating Download Job for {self.download_type} -> {self.download_level}") - - final_output_zip_name = f"{self.file_prefix}{create_unique_filename(self.json_request)}" - download_job_ready_status = JOB_STATUS_DICT["ready"] - - # Create a download_job object for use by the application - download_job = DownloadJob.objects.create( - job_status_id=download_job_ready_status, - file_name=final_output_zip_name, - json_request=self.json_request_string, - ) - - # TODO: This should be updated to be more dynamic to the download type + def get_download_job(self) -> Tuple[DownloadJob, DownloadSource]: + download_job = DownloadJob.objects.get(download_job_id=self.download_job_id) + if download_job.job_status != JOB_STATUS_DICT["ready"]: + raise 
InvalidParameterException(f"Download Job {self.download_job_id} is not ready.") + json_request = json.loads(download_job.json_request) download_source = DownloadSource( VALUE_MAPPINGS[self.download_type]["table_name"], self.download_level, self.download_type, - self.json_request.get("agency", "all"), + json_request.get("agency", "all"), # TODO: Is this necessary for Spark downloads? It was originally added to File C downloads for performance. extra_file_type="", ) @@ -196,18 +156,16 @@ def process_download(self): files_to_cleanup = [] try: spark_to_csv_strategy = SparkToCSVStrategy(self.logger) - zip_file_path = self.working_dir_path / f"{self.download_name}.zip" - csv_metadata = spark_to_csv_strategy.download_to_csv( - self.download_query, - self.working_dir_path / self.download_name, - self.download_name, - self.working_dir_path, - zip_file_path, + source_sql=None, + destination_path=self.working_dir_path / self.download_name, + destination_file_name=self.download_name, + working_dir_path=self.working_dir_path, + download_zip_path=zip_file_path, + source_df=self.df_builder(2021, 4, 12).source_df, ) files_to_cleanup.extend(csv_metadata.filepaths) - self.download_job.file_size = os.stat(zip_file_path).st_size self.download_job.number_of_rows = csv_metadata.number_of_rows self.download_job.number_of_columns = csv_metadata.number_of_columns @@ -223,7 +181,6 @@ def process_download(self): finally: if self.should_cleanup: self.cleanup(files_to_cleanup) - self.finish_download() def start_download(self) -> None: From 7cbda732aaca7bb9b1cbcacc9e756a42cef9769d Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Thu, 5 Jun 2025 16:45:48 -0500 Subject: [PATCH 22/43] [DEV-12574] - update spark download dataframe builder --- .../award_financial/federal_account.py | 40 +++++++++---------- .../commands/generate_spark_download.py | 11 ++++- 2 files changed, 29 insertions(+), 22 deletions(-) diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py index 79dda391f3..7f76ba8cde 100644 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py @@ -1,21 +1,23 @@ -from typing import Any +from typing import Any, Literal from pyspark.sql import DataFrame, SparkSession from pyspark.sql import functions as sf, Column +from usaspending_api.submissions.helpers import get_submission_ids_for_periods + class AccountDownloadDataFrameBuilder: def __init__( self, - reporting_fiscal_year: int, - reporting_fiscal_quarter: int, - reporting_fiscal_period: int, spark: SparkSession, + year: int, + period: int, + period_type: Literal["month", "quarter"] = "month", ): - self.reporting_fiscal_year = reporting_fiscal_year - self.reporting_fiscal_quarter = reporting_fiscal_quarter - self.reporting_fiscal_period = reporting_fiscal_period + self.reporting_fiscal_year = year + self.reporting_fiscal_quarter = period if period_type == "quarter" else period // 3 + self.reporting_fiscal_period = period if period_type == "month" else period * 3 self.df = spark.table("rpt.account_download") self.groupby_cols = [ "owning_agency_name", @@ -172,20 +174,14 @@ def __init__( "last_modified_date", ] - @staticmethod - def collect_concat(col_name: str, concat_str: str = "; ") -> Column: - return sf.concat_ws(concat_str, sf.collect_set(col_name)).alias(col_name) - 
- def handle_quarter_period(self, col_name: str, otherwise: Any = None) -> Column: + def filter_to_latest_submissions_for_agencies(self, col_name: str, otherwise: Any = None) -> Column: + """Filter to the latest submission regardless of whether the agency submitted on a monthly or quarterly basis""" return ( sf.when( - ( - (sf.col("quarter_format_flag") == True) - & (sf.col("reporting_fiscal_quarter") == self.reporting_fiscal_quarter) - ) - | ( - (sf.col("quarter_format_flag") == False) - & (sf.col("reporting_fiscal_period") == self.reporting_fiscal_period) + sf.col("submission_id").isin( + get_submission_ids_for_periods( + self.reporting_fiscal_year, self.reporting_fiscal_quarter, self.reporting_fiscal_period + ) ), sf.col(col_name), ) @@ -193,6 +189,10 @@ def handle_quarter_period(self, col_name: str, otherwise: Any = None) -> Column: .alias(col_name) ) + @staticmethod + def collect_concat(col_name: str, concat_str: str = "; ") -> Column: + return sf.concat_ws(concat_str, sf.collect_set(col_name)).alias(col_name) + @property def source_df(self): return ( @@ -217,7 +217,7 @@ def source_df(self): ], sf.sum("transaction_obligated_amount").alias("transaction_obligated_amount"), *[ - sf.sum(self.handle_quarter_period(col)).alias(col) + sf.sum(self.filter_to_latest_submissions_for_agencies(col)).alias(col) for col in [ "gross_outlay_amount_FYB_to_period_end", "USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig", diff --git a/usaspending_api/download/management/commands/generate_spark_download.py b/usaspending_api/download/management/commands/generate_spark_download.py index 42be8e6a03..59932b83aa 100644 --- a/usaspending_api/download/management/commands/generate_spark_download.py +++ b/usaspending_api/download/management/commands/generate_spark_download.py @@ -136,7 +136,7 @@ def download_name(self) -> str: def get_download_job(self) -> Tuple[DownloadJob, DownloadSource]: download_job = DownloadJob.objects.get(download_job_id=self.download_job_id) - if download_job.job_status != JOB_STATUS_DICT["ready"]: + if download_job.job_status_id != JOB_STATUS_DICT["ready"]: raise InvalidParameterException(f"Download Job {self.download_job_id} is not ready.") json_request = json.loads(download_job.json_request) download_source = DownloadSource( @@ -157,13 +157,20 @@ def process_download(self): try: spark_to_csv_strategy = SparkToCSVStrategy(self.logger) zip_file_path = self.working_dir_path / f"{self.download_name}.zip" + download_request = json.loads(self.download_job.json_request) + year = download_request["filters"]["fy"] + period, period_type = ( + (download_request["filters"]["period"], "month") + if download_request["filters"]["period"] + else (download_request["filters"]["quarter"], "quarter") + ) csv_metadata = spark_to_csv_strategy.download_to_csv( source_sql=None, destination_path=self.working_dir_path / self.download_name, destination_file_name=self.download_name, working_dir_path=self.working_dir_path, download_zip_path=zip_file_path, - source_df=self.df_builder(2021, 4, 12).source_df, + source_df=self.df_builder(self.spark, year, period, period_type).source_df, ) files_to_cleanup.extend(csv_metadata.filepaths) self.download_job.file_size = os.stat(zip_file_path).st_size From bb31aae272869dc3db56deea66ae753c0d9a3419 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Fri, 6 Jun 2025 15:53:19 -0500 Subject: [PATCH 23/43] [DEV-12574] - adding dynamic filters for def codes, agency, account id --- .../download/delta_models/account_download.py | 2 + 
.../award_financial/federal_account.py | 62 +++++++++++++++---- .../commands/generate_spark_download.py | 17 ++++- 3 files changed, 66 insertions(+), 15 deletions(-) diff --git a/usaspending_api/download/delta_models/account_download.py b/usaspending_api/download/delta_models/account_download.py index 2414c0bcb3..d8ea24ca70 100644 --- a/usaspending_api/download/delta_models/account_download.py +++ b/usaspending_api/download/delta_models/account_download.py @@ -2,6 +2,7 @@ "financial_accounts_by_awards_id": "INTEGER NOT NULL", "submission_id": "INTEGER NOT NULL", "owning_agency_name": "STRING", + "federal_account_id": "INTEGER", "federal_account_symbol": "STRING", "federal_account_name": "STRING", "agency_identifier_name": "STRING", @@ -102,6 +103,7 @@ financial_accounts_by_awards.financial_accounts_by_awards_id, financial_accounts_by_awards.submission_id, toptier_agency.name AS owning_agency_name, + federal_account.id AS federal_account_id, federal_account.federal_account_code AS federal_account_symbol, federal_account.account_title AS federal_account_name, cgac_aid.agency_name AS agency_identifier_name, diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py index 7f76ba8cde..a61aad9c4e 100644 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py @@ -1,3 +1,5 @@ +from dataclasses import dataclass +from functools import reduce from typing import Any, Literal from pyspark.sql import DataFrame, SparkSession @@ -14,10 +16,16 @@ def __init__( year: int, period: int, period_type: Literal["month", "quarter"] = "month", + agency: int | None = None, + federal_account_id: int | None = None, + def_codes: list[str] | None = None, ): self.reporting_fiscal_year = year self.reporting_fiscal_quarter = period if period_type == "quarter" else period // 3 self.reporting_fiscal_period = period if period_type == "month" else period * 3 + self.agency = agency + self.federal_account_id = federal_account_id + self.def_codes = def_codes self.df = spark.table("rpt.account_download") self.groupby_cols = [ "owning_agency_name", @@ -189,6 +197,46 @@ def filter_to_latest_submissions_for_agencies(self, col_name: str, otherwise: An .alias(col_name) ) + @property + def combined_filters(self): + + @dataclass + class Condition: + name: str + condition: Column + apply: bool + + conditions = [ + Condition(name="year", condition=sf.col("reporting_fiscal_year") == self.reporting_fiscal_year, apply=True), + Condition( + name="quarter or month", + condition=( + (sf.col("reporting_fiscal_period") <= self.reporting_fiscal_period) + & (sf.col("quarter_format_flag") == False) + ) + | ( + (sf.col("reporting_fiscal_quarter") <= self.reporting_fiscal_quarter) + & (sf.col("quarter_format_flag") == True) + ), + apply=True, + ), + Condition(name="agency", condition=sf.col("agency_code") == self.agency, apply=bool(self.agency)), + Condition( + name="federal account", + condition=sf.col("federal_account_id") == self.federal_account_id, + apply=bool(self.federal_account_id), + ), + Condition( + name="def_codes", + condition=sf.col("disaster_emergency_fund_code").isin(self.def_codes), + apply=bool(self.def_codes), + ), + ] + return reduce( + lambda x, y: x & y, + [condition.condition for condition in conditions if condition.apply], + ) + 
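The reduce at the end of combined_filters above folds every active Condition into a single pyspark Column with &, so filters that were not requested (no agency, no federal account, no DEF codes) simply drop out of the conjunction. A standalone sketch of the same composition pattern, using a made-up DataFrame rather than the rpt.account_download table:

from functools import reduce

from pyspark.sql import SparkSession
from pyspark.sql import functions as sf

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(2021, 12, "L"), (2021, 3, "M"), (2020, 12, None)],
    ["reporting_fiscal_year", "reporting_fiscal_period", "disaster_emergency_fund_code"],
)

conditions = [
    sf.col("reporting_fiscal_year") == 2021,
    sf.col("reporting_fiscal_period") <= 12,
    sf.col("disaster_emergency_fund_code").isin(["L", "M"]),  # omitted when no DEF codes are requested
]
combined = reduce(lambda x, y: x & y, conditions)  # (cond1) AND (cond2) AND (cond3)
df.filter(combined).show()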
@staticmethod def collect_concat(col_name: str, concat_str: str = "; ") -> Column: return sf.concat_ws(concat_str, sf.collect_set(col_name)).alias(col_name) @@ -196,19 +244,7 @@ def collect_concat(col_name: str, concat_str: str = "; ") -> Column: @property def source_df(self): return ( - self.df.filter( - ( - ( - (sf.col("reporting_fiscal_period") <= self.reporting_fiscal_period) - & (sf.col("quarter_format_flag") == False) - ) - | ( - (sf.col("reporting_fiscal_quarter") <= self.reporting_fiscal_quarter) - & (sf.col("quarter_format_flag") == True) - ) - ) - & (sf.col("reporting_fiscal_year") == self.reporting_fiscal_year) - ) + self.df.filter(self.combined_filters) .groupBy(self.groupby_cols) .agg( *[ diff --git a/usaspending_api/download/management/commands/generate_spark_download.py b/usaspending_api/download/management/commands/generate_spark_download.py index 59932b83aa..be6a898798 100644 --- a/usaspending_api/download/management/commands/generate_spark_download.py +++ b/usaspending_api/download/management/commands/generate_spark_download.py @@ -10,6 +10,7 @@ from django.utils.functional import cached_property from pyspark.sql import SparkSession +from usaspending_api.accounts.urls_federal_accounts_v2 import federal_account from usaspending_api.common.etl.spark import create_ref_temp_views from usaspending_api.common.exceptions import InvalidParameterException from usaspending_api.common.helpers.download_csv_strategies import SparkToCSVStrategy @@ -161,16 +162,28 @@ def process_download(self): year = download_request["filters"]["fy"] period, period_type = ( (download_request["filters"]["period"], "month") - if download_request["filters"]["period"] + if "period" in download_request["filters"] else (download_request["filters"]["quarter"], "quarter") ) + agency = download_request["filters"].get("agency") + federal_account = download_request["filters"].get("federal_account") + def_codes = download_request["filters"].get("def_codes") + source_df = self.df_builder( + spark=self.spark, + year=int(year), + period=int(period), + period_type=period_type, + agency=int(agency), + federal_account_id=int(federal_account), + def_codes=def_codes, + ).source_df csv_metadata = spark_to_csv_strategy.download_to_csv( source_sql=None, destination_path=self.working_dir_path / self.download_name, destination_file_name=self.download_name, working_dir_path=self.working_dir_path, download_zip_path=zip_file_path, - source_df=self.df_builder(self.spark, year, period, period_type).source_df, + source_df=source_df, ) files_to_cleanup.extend(csv_metadata.filepaths) self.download_job.file_size = os.stat(zip_file_path).st_size From 6e4434dc5cde6e2af3de7fe279c1226444e0b7b5 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Mon, 9 Jun 2025 09:41:02 -0500 Subject: [PATCH 24/43] [DEV-12574] - cleanup --- .../delta_downloads/award_financial/federal_account.py | 9 ++++----- .../management/commands/generate_spark_download.py | 1 - 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py index a61aad9c4e..7b1161cead 100644 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py @@ -198,7 +198,7 @@ def filter_to_latest_submissions_for_agencies(self, col_name: str, otherwise: An ) 
@property - def combined_filters(self): + def combined_filters(self) -> Column: @dataclass class Condition: @@ -211,12 +211,11 @@ class Condition: Condition( name="quarter or month", condition=( - (sf.col("reporting_fiscal_period") <= self.reporting_fiscal_period) - & (sf.col("quarter_format_flag") == False) + (sf.col("reporting_fiscal_period") <= self.reporting_fiscal_period) & ~sf.col("quarter_format_flag") ) | ( (sf.col("reporting_fiscal_quarter") <= self.reporting_fiscal_quarter) - & (sf.col("quarter_format_flag") == True) + & sf.col("quarter_format_flag") ), apply=True, ), @@ -242,7 +241,7 @@ def collect_concat(col_name: str, concat_str: str = "; ") -> Column: return sf.concat_ws(concat_str, sf.collect_set(col_name)).alias(col_name) @property - def source_df(self): + def source_df(self) -> DataFrame: return ( self.df.filter(self.combined_filters) .groupBy(self.groupby_cols) diff --git a/usaspending_api/download/management/commands/generate_spark_download.py b/usaspending_api/download/management/commands/generate_spark_download.py index be6a898798..1698ad2cde 100644 --- a/usaspending_api/download/management/commands/generate_spark_download.py +++ b/usaspending_api/download/management/commands/generate_spark_download.py @@ -10,7 +10,6 @@ from django.utils.functional import cached_property from pyspark.sql import SparkSession -from usaspending_api.accounts.urls_federal_accounts_v2 import federal_account from usaspending_api.common.etl.spark import create_ref_temp_views from usaspending_api.common.exceptions import InvalidParameterException from usaspending_api.common.helpers.download_csv_strategies import SparkToCSVStrategy From facdf9ec8dc476e3c4a18f64fb17f1d665f22279 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Mon, 9 Jun 2025 11:18:03 -0500 Subject: [PATCH 25/43] [DEV-12574] - Cleanup and refactoring --- .../common/helpers/download_csv_strategies.py | 17 +- .../award_financial/columns.py | 155 ++++++++++++++ .../award_financial/federal_account.py | 197 +++--------------- .../commands/generate_spark_download.py | 28 +-- 4 files changed, 205 insertions(+), 192 deletions(-) create mode 100644 usaspending_api/download/management/commands/delta_downloads/award_financial/columns.py diff --git a/usaspending_api/common/helpers/download_csv_strategies.py b/usaspending_api/common/helpers/download_csv_strategies.py index 99218efb9f..a36f6d7bea 100644 --- a/usaspending_api/common/helpers/download_csv_strategies.py +++ b/usaspending_api/common/helpers/download_csv_strategies.py @@ -1,12 +1,13 @@ -from abc import ABC, abstractmethod -from dataclasses import dataclass +import logging import multiprocessing import time -import logging +from abc import ABC, abstractmethod +from dataclasses import dataclass from pathlib import Path -from typing import Optional -from django.conf import settings +from typing import List, Optional +from django.conf import settings +from pyspark.sql import DataFrame from usaspending_api.common.csv_helpers import count_rows_in_delimited_file from usaspending_api.common.helpers.s3_helpers import delete_s3_objects, download_s3_object from usaspending_api.common.helpers.sql_helpers import read_sql_file_to_text @@ -19,7 +20,6 @@ ) from usaspending_api.download.filestreaming.zip_file import append_files_to_zip_file from usaspending_api.download.lookups import FILE_FORMATS -from typing import List @dataclass @@ -45,12 +45,12 @@ def __init__(self, *args, **kwargs): @abstractmethod def download_to_csv( self, - source_sql: str, + source_sql: str | None, destination_path: Path, 
destination_file_name: str, working_dir_path: Path, download_zip_path: Path, - source_df=None, + source_df: DataFrame | None = None, ) -> CSVDownloadMetadata: """ Args: @@ -59,6 +59,7 @@ def download_to_csv( destination_file_name: The name of the file in destination path without a file extension working_dir_path: The working directory path as a string download_zip_path: The path (as a string) to the download zip file + source_df: A pyspark DataFrame that contains the data to be downloaded Returns: Returns a CSVDownloadMetadata object (a dataclass containing metadata about the download) diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/columns.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/columns.py new file mode 100644 index 0000000000..6affa89622 --- /dev/null +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/columns.py @@ -0,0 +1,155 @@ +groupby_cols = [ + "owning_agency_name", + "federal_account_symbol", + "federal_account_name", + "agency_identifier_name", + "program_activity_code", + "program_activity_name", + "object_class_code", + "object_class_name", + "direct_or_reimbursable_funding_source", + "disaster_emergency_fund_code", + "disaster_emergency_fund_name", + "award_unique_key", + "award_id_piid", + "parent_award_id_piid", + "award_id_fain", + "award_id_uri", + "award_base_action_date", + "award_latest_action_date", + "period_of_performance_start_date", + "period_of_performance_current_end_date", + "ordering_period_end_date", + "idv_type_code", + "idv_type", + "prime_award_base_transaction_description", + "awarding_agency_code", + "awarding_agency_name", + "awarding_subagency_code", + "awarding_subagency_name", + "awarding_office_code", + "awarding_office_name", + "funding_agency_code", + "funding_agency_name", + "funding_sub_agency_code", + "funding_sub_agency_name", + "funding_office_code", + "funding_office_name", + "recipient_uei", + "recipient_duns", + "recipient_name", + "recipient_name_raw", + "recipient_parent_uei", + "recipient_parent_duns", + "recipient_parent_name", + "recipient_parent_name_raw", + "recipient_country", + "recipient_state", + "recipient_county", + "recipient_city", + "primary_place_of_performance_country", + "primary_place_of_performance_state", + "primary_place_of_performance_county", + "primary_place_of_performance_zip_code", + "cfda_number", + "cfda_title", + "product_or_service_code", + "product_or_service_code_description", + "naics_code", + "naics_description", + "national_interest_action_code", + "national_interest_action", + "submission_period", + "award_type_code", + "award_type", + "recipient_zip_code", + "award_base_action_date_fiscal_year", + "award_latest_action_date_fiscal_year", + "usaspending_permalink", + "prime_award_summary_recipient_cd_original", + "prime_award_summary_recipient_cd_current", + "prime_award_summary_place_of_performance_cd_original", + "prime_award_summary_place_of_performance_cd_current", +] + +select_cols = [ + "owning_agency_name", + "reporting_agency_name", + "submission_period", + "federal_account_symbol", + "federal_account_name", + "agency_identifier_name", + "budget_function", + "budget_subfunction", + "program_activity_code", + "program_activity_name", + "object_class_code", + "object_class_name", + "direct_or_reimbursable_funding_source", + "disaster_emergency_fund_code", + "disaster_emergency_fund_name", + "transaction_obligated_amount", + "gross_outlay_amount_FYB_to_period_end", + 
"USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig", + "USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig", + "award_unique_key", + "award_id_piid", + "parent_award_id_piid", + "award_id_fain", + "award_id_uri", + "award_base_action_date", + "award_base_action_date_fiscal_year", + "award_latest_action_date", + "award_latest_action_date_fiscal_year", + "period_of_performance_start_date", + "period_of_performance_current_end_date", + "ordering_period_end_date", + "award_type_code", + "award_type", + "idv_type_code", + "idv_type", + "prime_award_base_transaction_description", + "awarding_agency_code", + "awarding_agency_name", + "awarding_subagency_code", + "awarding_subagency_name", + "awarding_office_code", + "awarding_office_name", + "funding_agency_code", + "funding_agency_name", + "funding_sub_agency_code", + "funding_sub_agency_name", + "funding_office_code", + "funding_office_name", + "recipient_uei", + "recipient_duns", + "recipient_name", + "recipient_name_raw", + "recipient_parent_uei", + "recipient_parent_duns", + "recipient_parent_name", + "recipient_parent_name_raw", + "recipient_country", + "recipient_state", + "recipient_county", + "recipient_city", + "prime_award_summary_recipient_cd_original", + "prime_award_summary_recipient_cd_current", + "recipient_zip_code", + "primary_place_of_performance_country", + "primary_place_of_performance_state", + "primary_place_of_performance_county", + "prime_award_summary_place_of_performance_cd_original", + "prime_award_summary_place_of_performance_cd_current", + "primary_place_of_performance_zip_code", + "cfda_number", + "cfda_title", + "product_or_service_code", + "product_or_service_code_description", + "naics_code", + "naics_description", + "national_interest_action_code", + "national_interest_action", + "usaspending_permalink", + "last_modified_date", +] diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py index 7b1161cead..6c7968f23c 100644 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py @@ -1,11 +1,31 @@ from dataclasses import dataclass from functools import reduce -from typing import Any, Literal +from typing import Any from pyspark.sql import DataFrame, SparkSession from pyspark.sql import functions as sf, Column from usaspending_api.submissions.helpers import get_submission_ids_for_periods +from usaspending_api.download.management.commands.delta_downloads.award_financial.columns import ( + groupby_cols, + select_cols, +) + + +@dataclass +class AccountDownloadFilter: + year: int + month: int | None = None + quarter: int | None = None + agency: int | None = None + federal_account_id: int | None = None + def_codes: list[str] | None = None + + def __post_init__(self): + if self.month is None and self.quarter is None: + raise ValueError("Must define month or quarter.") + elif self.month is not None and self.quarter is not None: + raise ValueError("Month and quarter are mutually exclusive.") class AccountDownloadDataFrameBuilder: @@ -13,174 +33,17 @@ class AccountDownloadDataFrameBuilder: def __init__( self, spark: SparkSession, - year: int, - period: int, - period_type: Literal["month", "quarter"] = "month", - agency: int | None = None, - federal_account_id: int | None = None, - def_codes: list[str] | 
None = None, + account_download_filter: AccountDownloadFilter, ): - self.reporting_fiscal_year = year - self.reporting_fiscal_quarter = period if period_type == "quarter" else period // 3 - self.reporting_fiscal_period = period if period_type == "month" else period * 3 - self.agency = agency - self.federal_account_id = federal_account_id - self.def_codes = def_codes + self.reporting_fiscal_year = account_download_filter.year + self.reporting_fiscal_quarter = account_download_filter.quarter + self.reporting_fiscal_period = account_download_filter.month + self.agency = account_download_filter.agency + self.federal_account_id = account_download_filter.federal_account_id + self.def_codes = account_download_filter.def_codes self.df = spark.table("rpt.account_download") - self.groupby_cols = [ - "owning_agency_name", - "federal_account_symbol", - "federal_account_name", - "agency_identifier_name", - "program_activity_code", - "program_activity_name", - "object_class_code", - "object_class_name", - "direct_or_reimbursable_funding_source", - "disaster_emergency_fund_code", - "disaster_emergency_fund_name", - "award_unique_key", - "award_id_piid", - "parent_award_id_piid", - "award_id_fain", - "award_id_uri", - "award_base_action_date", - "award_latest_action_date", - "period_of_performance_start_date", - "period_of_performance_current_end_date", - "ordering_period_end_date", - "idv_type_code", - "idv_type", - "prime_award_base_transaction_description", - "awarding_agency_code", - "awarding_agency_name", - "awarding_subagency_code", - "awarding_subagency_name", - "awarding_office_code", - "awarding_office_name", - "funding_agency_code", - "funding_agency_name", - "funding_sub_agency_code", - "funding_sub_agency_name", - "funding_office_code", - "funding_office_name", - "recipient_uei", - "recipient_duns", - "recipient_name", - "recipient_name_raw", - "recipient_parent_uei", - "recipient_parent_duns", - "recipient_parent_name", - "recipient_parent_name_raw", - "recipient_country", - "recipient_state", - "recipient_county", - "recipient_city", - "primary_place_of_performance_country", - "primary_place_of_performance_state", - "primary_place_of_performance_county", - "primary_place_of_performance_zip_code", - "cfda_number", - "cfda_title", - "product_or_service_code", - "product_or_service_code_description", - "naics_code", - "naics_description", - "national_interest_action_code", - "national_interest_action", - "submission_period", - "award_type_code", - "award_type", - "recipient_zip_code", - "award_base_action_date_fiscal_year", - "award_latest_action_date_fiscal_year", - "usaspending_permalink", - "prime_award_summary_recipient_cd_original", - "prime_award_summary_recipient_cd_current", - "prime_award_summary_place_of_performance_cd_original", - "prime_award_summary_place_of_performance_cd_current", - ] - self.select_cols = [ - "owning_agency_name", - "reporting_agency_name", - "submission_period", - "federal_account_symbol", - "federal_account_name", - "agency_identifier_name", - "budget_function", - "budget_subfunction", - "program_activity_code", - "program_activity_name", - "object_class_code", - "object_class_name", - "direct_or_reimbursable_funding_source", - "disaster_emergency_fund_code", - "disaster_emergency_fund_name", - "transaction_obligated_amount", - "gross_outlay_amount_FYB_to_period_end", - "USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig", - "USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig", - "award_unique_key", - "award_id_piid", - 
"parent_award_id_piid", - "award_id_fain", - "award_id_uri", - "award_base_action_date", - "award_base_action_date_fiscal_year", - "award_latest_action_date", - "award_latest_action_date_fiscal_year", - "period_of_performance_start_date", - "period_of_performance_current_end_date", - "ordering_period_end_date", - "award_type_code", - "award_type", - "idv_type_code", - "idv_type", - "prime_award_base_transaction_description", - "awarding_agency_code", - "awarding_agency_name", - "awarding_subagency_code", - "awarding_subagency_name", - "awarding_office_code", - "awarding_office_name", - "funding_agency_code", - "funding_agency_name", - "funding_sub_agency_code", - "funding_sub_agency_name", - "funding_office_code", - "funding_office_name", - "recipient_uei", - "recipient_duns", - "recipient_name", - "recipient_name_raw", - "recipient_parent_uei", - "recipient_parent_duns", - "recipient_parent_name", - "recipient_parent_name_raw", - "recipient_country", - "recipient_state", - "recipient_county", - "recipient_city", - "prime_award_summary_recipient_cd_original", - "prime_award_summary_recipient_cd_current", - "recipient_zip_code", - "primary_place_of_performance_country", - "primary_place_of_performance_state", - "primary_place_of_performance_county", - "prime_award_summary_place_of_performance_cd_original", - "prime_award_summary_place_of_performance_cd_current", - "primary_place_of_performance_zip_code", - "cfda_number", - "cfda_title", - "product_or_service_code", - "product_or_service_code_description", - "naics_code", - "naics_description", - "national_interest_action_code", - "national_interest_action", - "usaspending_permalink", - "last_modified_date", - ] + self.groupby_cols = groupby_cols + self.select_cols = select_cols def filter_to_latest_submissions_for_agencies(self, col_name: str, otherwise: Any = None) -> Column: """Filter to the latest submission regardless of whether the agency submitted on a monthly or quarterly basis""" diff --git a/usaspending_api/download/management/commands/generate_spark_download.py b/usaspending_api/download/management/commands/generate_spark_download.py index 1698ad2cde..dd5e1e0358 100644 --- a/usaspending_api/download/management/commands/generate_spark_download.py +++ b/usaspending_api/download/management/commands/generate_spark_download.py @@ -25,6 +25,7 @@ from usaspending_api.download.filestreaming.download_source import DownloadSource from usaspending_api.download.management.commands.delta_downloads.award_financial.federal_account import ( AccountDownloadDataFrameBuilder, + AccountDownloadFilter, ) from usaspending_api.download.lookups import JOB_STATUS_DICT, FILE_FORMATS, VALUE_MAPPINGS from usaspending_api.download.models import DownloadJob @@ -158,24 +159,17 @@ def process_download(self): spark_to_csv_strategy = SparkToCSVStrategy(self.logger) zip_file_path = self.working_dir_path / f"{self.download_name}.zip" download_request = json.loads(self.download_job.json_request) - year = download_request["filters"]["fy"] - period, period_type = ( - (download_request["filters"]["period"], "month") - if "period" in download_request["filters"] - else (download_request["filters"]["quarter"], "quarter") + account_download_filter = AccountDownloadFilter( + year=int(download_request["filters"]["fy"]), + month=int(download_request["filters"]["period"]) if "period" in download_request["filters"] else None, + quarter=( + int(download_request["filters"]["quarter"]) if "quarter" in download_request["filters"] else None + ), + 
agency=download_request["filters"].get("agency"), + federal_account_id=download_request["filters"].get("federal_account"), + def_codes=download_request["filters"].get("def_codes"), ) - agency = download_request["filters"].get("agency") - federal_account = download_request["filters"].get("federal_account") - def_codes = download_request["filters"].get("def_codes") - source_df = self.df_builder( - spark=self.spark, - year=int(year), - period=int(period), - period_type=period_type, - agency=int(agency), - federal_account_id=int(federal_account), - def_codes=def_codes, - ).source_df + source_df = self.df_builder(spark=self.spark, account_download_filter=account_download_filter).source_df csv_metadata = spark_to_csv_strategy.download_to_csv( source_sql=None, destination_path=self.working_dir_path / self.download_name, From 141db7a79c5f4401ce3eb4c4091c1eba1836b7ad Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Mon, 9 Jun 2025 13:53:51 -0500 Subject: [PATCH 26/43] [DEV-12574] - Fix table spec --- usaspending_api/etl/management/commands/load_query_to_delta.py | 1 + 1 file changed, 1 insertion(+) diff --git a/usaspending_api/etl/management/commands/load_query_to_delta.py b/usaspending_api/etl/management/commands/load_query_to_delta.py index 88e0058b7b..789bb21e63 100644 --- a/usaspending_api/etl/management/commands/load_query_to_delta.py +++ b/usaspending_api/etl/management/commands/load_query_to_delta.py @@ -317,6 +317,7 @@ "model": None, "is_from_broker": False, "source_query": [account_download_load_sql_string], + "source_query_incremental": None, "source_database": None, "source_table": None, "destination_database": "rpt", From 2b2325fb6fe3bcb0b706964b349fed03c44d1468 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Tue, 10 Jun 2025 13:55:05 -0500 Subject: [PATCH 27/43] [DEV-12574] - Adding tests --- .../award_financial/federal_account.py | 6 +- ...test_account_download_dataframe_builder.py | 103 ++++++++++++++++++ 2 files changed, 106 insertions(+), 3 deletions(-) create mode 100644 usaspending_api/download/tests/unit/test_account_download_dataframe_builder.py diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py index 6c7968f23c..5fa4614f2f 100644 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py @@ -36,8 +36,8 @@ def __init__( account_download_filter: AccountDownloadFilter, ): self.reporting_fiscal_year = account_download_filter.year - self.reporting_fiscal_quarter = account_download_filter.quarter - self.reporting_fiscal_period = account_download_filter.month + self.reporting_fiscal_quarter = account_download_filter.quarter or account_download_filter.month // 3 + self.reporting_fiscal_period = account_download_filter.month or account_download_filter.quarter * 3 self.agency = account_download_filter.agency self.federal_account_id = account_download_filter.federal_account_id self.def_codes = account_download_filter.def_codes @@ -101,7 +101,7 @@ class Condition: @staticmethod def collect_concat(col_name: str, concat_str: str = "; ") -> Column: - return sf.concat_ws(concat_str, sf.collect_set(col_name)).alias(col_name) + return sf.concat_ws(concat_str, sf.sort_array(sf.collect_set(col_name))).alias(col_name) @property def source_df(self) -> DataFrame: diff --git 
a/usaspending_api/download/tests/unit/test_account_download_dataframe_builder.py b/usaspending_api/download/tests/unit/test_account_download_dataframe_builder.py new file mode 100644 index 0000000000..54cab28bb5 --- /dev/null +++ b/usaspending_api/download/tests/unit/test_account_download_dataframe_builder.py @@ -0,0 +1,103 @@ +from unittest.mock import patch + +import pandas as pd +import pytest +from usaspending_api.download.management.commands.delta_downloads.award_financial.columns import ( + select_cols, + groupby_cols, +) +from usaspending_api.download.management.commands.delta_downloads.award_financial.federal_account import ( + AccountDownloadDataFrameBuilder, + AccountDownloadFilter, +) + + +@pytest.fixture(scope="module") +def account_download_table(spark): + spark.sql("CREATE DATABASE IF NOT EXISTS rpt") + columns = list(set(select_cols + groupby_cols)) + [ + "reporting_fiscal_year", + "reporting_fiscal_quarter", + "reporting_fiscal_period", + "quarter_format_flag", + "submission_id", + "agency_code", + "federal_account_id", + ] + test_data_df = pd.DataFrame( + data={ + "reporting_fiscal_year": [2018, 2018, 2018, 2018, 2019], + "quarter_format_flag": [True, True, False, True, True], + "reporting_fiscal_quarter": [1, 2, None, 4, 2], + "reporting_fiscal_period": [None, None, 5, None, None], + "transaction_obligated_amount": [100, 100, 100, 100, 100], + "submission_id": [1, 2, 3, 4, 5], + "owning_agency_name": ["test1", "test2", "test2", "test2", "test3"], + "reporting_agency_name": ["A", "B", "C", "D", "E"], + "budget_function": ["A", "B", "C", "D", "E"], + "budget_subfunction": ["A", "B", "C", "D", "E"], + "gross_outlay_amount_FYB_to_period_end": [100, 100, 100, 100, 100], + "agency_code": [1, 2, 2, 2, 3], + "federal_account_id": [1, 2, 2, 2, 3], + }, + columns=columns, + ).fillna("dummy_text") + spark.createDataFrame(test_data_df).write.format("delta").mode("overwrite").saveAsTable("rpt.account_download") + yield + spark.sql("DROP TABLE IF EXISTS rpt.account_download") + + +@patch( + "usaspending_api.download.management.commands.delta_downloads.award_financial.federal_account.get_submission_ids_for_periods" +) +def test_account_download_dataframe_builder(mock_get_submission_ids_for_periods, spark, account_download_table): + mock_get_submission_ids_for_periods.return_value = [1, 2, 4, 5] + + account_download_filter = AccountDownloadFilter( + year=2018, + quarter=4, + ) + builder = AccountDownloadDataFrameBuilder(spark, account_download_filter) + result = builder.source_df + for col in ["reporting_agency_name", "budget_function", "budget_subfunction"]: + assert sorted(result.toPandas()[col].to_list()) == ["A", "B; C; D"] + assert sorted(result.toPandas().transaction_obligated_amount.to_list()) == [100, 300] + assert sorted(result.toPandas().gross_outlay_amount_FYB_to_period_end.to_list()) == [100, 200] + + +@patch( + "usaspending_api.download.management.commands.delta_downloads.award_financial.federal_account.get_submission_ids_for_periods" +) +def test_filter_by_agency(mock_get_submission_ids_for_periods, spark, account_download_table): + mock_get_submission_ids_for_periods.return_value = [1, 2, 4, 5] + + account_download_filter = AccountDownloadFilter( + year=2018, + quarter=4, + agency=2, + ) + builder = AccountDownloadDataFrameBuilder(spark, account_download_filter) + result = builder.source_df + for col in ["reporting_agency_name", "budget_function", "budget_subfunction"]: + assert sorted(result.toPandas()[col].to_list()) == ["B; C; D"] + assert 
sorted(result.toPandas().transaction_obligated_amount.to_list()) == [300] + assert sorted(result.toPandas().gross_outlay_amount_FYB_to_period_end.to_list()) == [200] + + +@patch( + "usaspending_api.download.management.commands.delta_downloads.award_financial.federal_account.get_submission_ids_for_periods" +) +def test_filter_by_federal_account_id(mock_get_submission_ids_for_periods, spark, account_download_table): + mock_get_submission_ids_for_periods.return_value = [1, 2, 4, 5] + + account_download_filter = AccountDownloadFilter( + year=2018, + quarter=4, + federal_account_id=1, + ) + builder = AccountDownloadDataFrameBuilder(spark, account_download_filter) + result = builder.source_df + for col in ["reporting_agency_name", "budget_function", "budget_subfunction"]: + assert sorted(result.toPandas()[col].to_list()) == ["A"] + assert sorted(result.toPandas().transaction_obligated_amount.to_list()) == [100] + assert sorted(result.toPandas().gross_outlay_amount_FYB_to_period_end.to_list()) == [100] From da8d9c70e11bcd9b2d8387239e7cb189daa462a9 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Thu, 12 Jun 2025 09:25:25 -0500 Subject: [PATCH 28/43] [DEV-12574] - Move test file to integration tests --- .../test_account_download_dataframe_builder.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename usaspending_api/download/tests/{unit => integration}/test_account_download_dataframe_builder.py (100%) diff --git a/usaspending_api/download/tests/unit/test_account_download_dataframe_builder.py b/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py similarity index 100% rename from usaspending_api/download/tests/unit/test_account_download_dataframe_builder.py rename to usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py From 34a5706e5b385a9567430bb59baf27a0574e9915 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Thu, 12 Jun 2025 11:09:58 -0500 Subject: [PATCH 29/43] [DEV-12574] - update source of select columns --- .../award_financial/columns.py | 83 +------------------ .../award_financial/federal_account.py | 6 +- 2 files changed, 7 insertions(+), 82 deletions(-) diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/columns.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/columns.py index 6affa89622..1fabb6d967 100644 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/columns.py +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/columns.py @@ -1,3 +1,5 @@ +from usaspending_api.download.v2.download_column_historical_lookups import query_paths + groupby_cols = [ "owning_agency_name", "federal_account_symbol", @@ -73,83 +75,6 @@ ] select_cols = [ - "owning_agency_name", - "reporting_agency_name", - "submission_period", - "federal_account_symbol", - "federal_account_name", - "agency_identifier_name", - "budget_function", - "budget_subfunction", - "program_activity_code", - "program_activity_name", - "object_class_code", - "object_class_name", - "direct_or_reimbursable_funding_source", - "disaster_emergency_fund_code", - "disaster_emergency_fund_name", - "transaction_obligated_amount", - "gross_outlay_amount_FYB_to_period_end", - "USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig", - "USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig", - "award_unique_key", - "award_id_piid", - "parent_award_id_piid", - "award_id_fain", - "award_id_uri", - "award_base_action_date", - 
"award_base_action_date_fiscal_year", - "award_latest_action_date", - "award_latest_action_date_fiscal_year", - "period_of_performance_start_date", - "period_of_performance_current_end_date", - "ordering_period_end_date", - "award_type_code", - "award_type", - "idv_type_code", - "idv_type", - "prime_award_base_transaction_description", - "awarding_agency_code", - "awarding_agency_name", - "awarding_subagency_code", - "awarding_subagency_name", - "awarding_office_code", - "awarding_office_name", - "funding_agency_code", - "funding_agency_name", - "funding_sub_agency_code", - "funding_sub_agency_name", - "funding_office_code", - "funding_office_name", - "recipient_uei", - "recipient_duns", - "recipient_name", - "recipient_name_raw", - "recipient_parent_uei", - "recipient_parent_duns", - "recipient_parent_name", - "recipient_parent_name_raw", - "recipient_country", - "recipient_state", - "recipient_county", - "recipient_city", - "prime_award_summary_recipient_cd_original", - "prime_award_summary_recipient_cd_current", - "recipient_zip_code", - "primary_place_of_performance_country", - "primary_place_of_performance_state", - "primary_place_of_performance_county", - "prime_award_summary_place_of_performance_cd_original", - "prime_award_summary_place_of_performance_cd_current", - "primary_place_of_performance_zip_code", - "cfda_number", - "cfda_title", - "product_or_service_code", - "product_or_service_code_description", - "naics_code", - "naics_description", - "national_interest_action_code", - "national_interest_action", - "usaspending_permalink", - "last_modified_date", + col if not col.startswith("last_modified_date") else "last_modified_date" + for col in query_paths["award_financial"]["federal_account"].keys() ] diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py index 5fa4614f2f..3dbc98d05f 100644 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py @@ -41,9 +41,9 @@ def __init__( self.agency = account_download_filter.agency self.federal_account_id = account_download_filter.federal_account_id self.def_codes = account_download_filter.def_codes - self.df = spark.table("rpt.account_download") - self.groupby_cols = groupby_cols - self.select_cols = select_cols + self.df: str = spark.table("rpt.account_download") + self.groupby_cols: list[str] = groupby_cols + self.select_cols: list[str] = select_cols def filter_to_latest_submissions_for_agencies(self, col_name: str, otherwise: Any = None) -> Column: """Filter to the latest submission regardless of whether the agency submitted on a monthly or quarterly basis""" From 961983b80510f60c4489fc64ddec50d3493dc353 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Fri, 20 Jun 2025 18:01:54 -0500 Subject: [PATCH 30/43] [DEV-12574] - Update fixtures to ensure cleanup of delta tables --- .../test_account_download_dataframe_builder.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py b/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py index 54cab28bb5..15056d1d3a 100644 --- a/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py +++ 
b/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py @@ -2,6 +2,7 @@ import pandas as pd import pytest +from django.core.management import call_command from usaspending_api.download.management.commands.delta_downloads.award_financial.columns import ( select_cols, groupby_cols, @@ -12,9 +13,15 @@ ) -@pytest.fixture(scope="module") -def account_download_table(spark): +@pytest.fixture(scope="function") +def account_download_table(spark, s3_unittest_data_bucket, hive_unittest_metastore_db): spark.sql("CREATE DATABASE IF NOT EXISTS rpt") + call_command( + "create_delta_table", + f"--destination-table=account_download", + "--alt-db=int", + f"--spark-s3-bucket={s3_unittest_data_bucket}", + ) columns = list(set(select_cols + groupby_cols)) + [ "reporting_fiscal_year", "reporting_fiscal_quarter", @@ -44,7 +51,6 @@ def account_download_table(spark): ).fillna("dummy_text") spark.createDataFrame(test_data_df).write.format("delta").mode("overwrite").saveAsTable("rpt.account_download") yield - spark.sql("DROP TABLE IF EXISTS rpt.account_download") @patch( From 0c54c6acf7684b1de4dd8f146a44f607d25b9eab Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Sat, 21 Jun 2025 11:47:58 -0500 Subject: [PATCH 31/43] [DEV-12574] - Update fixtures to ensure cleanup of delta tables pt. 2 --- .../award_financial/federal_account.py | 3 ++- .../test_account_download_dataframe_builder.py | 14 +++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py index 3dbc98d05f..3cea41c7e4 100644 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py @@ -34,6 +34,7 @@ def __init__( self, spark: SparkSession, account_download_filter: AccountDownloadFilter, + table_name: str = "rpt.account_download", ): self.reporting_fiscal_year = account_download_filter.year self.reporting_fiscal_quarter = account_download_filter.quarter or account_download_filter.month // 3 @@ -41,7 +42,7 @@ def __init__( self.agency = account_download_filter.agency self.federal_account_id = account_download_filter.federal_account_id self.def_codes = account_download_filter.def_codes - self.df: str = spark.table("rpt.account_download") + self.df: str = spark.table(table_name) self.groupby_cols: list[str] = groupby_cols self.select_cols: list[str] = select_cols diff --git a/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py b/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py index 15056d1d3a..0d564e33b1 100644 --- a/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py +++ b/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py @@ -3,6 +3,7 @@ import pandas as pd import pytest from django.core.management import call_command +from pyspark.sql.functions import col, to_date from usaspending_api.download.management.commands.delta_downloads.award_financial.columns import ( select_cols, groupby_cols, @@ -15,11 +16,9 @@ @pytest.fixture(scope="function") def account_download_table(spark, s3_unittest_data_bucket, hive_unittest_metastore_db): - spark.sql("CREATE DATABASE IF NOT EXISTS rpt") call_command( "create_delta_table", 
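        # create_delta_table only lays down the empty Delta schema; the fixture then
        # overwrites the table (overwriteSchema=true) with the pandas test rows below.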
f"--destination-table=account_download", - "--alt-db=int", f"--spark-s3-bucket={s3_unittest_data_bucket}", ) columns = list(set(select_cols + groupby_cols)) + [ @@ -49,7 +48,13 @@ def account_download_table(spark, s3_unittest_data_bucket, hive_unittest_metasto }, columns=columns, ).fillna("dummy_text") - spark.createDataFrame(test_data_df).write.format("delta").mode("overwrite").saveAsTable("rpt.account_download") + ( + spark.createDataFrame(test_data_df) + .write.format("delta") + .mode("overwrite") + .option("overwriteSchema", "true") + .saveAsTable("rpt.account_download") + ) yield @@ -58,12 +63,11 @@ def account_download_table(spark, s3_unittest_data_bucket, hive_unittest_metasto ) def test_account_download_dataframe_builder(mock_get_submission_ids_for_periods, spark, account_download_table): mock_get_submission_ids_for_periods.return_value = [1, 2, 4, 5] - account_download_filter = AccountDownloadFilter( year=2018, quarter=4, ) - builder = AccountDownloadDataFrameBuilder(spark, account_download_filter) + builder = AccountDownloadDataFrameBuilder(spark, account_download_filter, "rpt.account_download") result = builder.source_df for col in ["reporting_agency_name", "budget_function", "budget_subfunction"]: assert sorted(result.toPandas()[col].to_list()) == ["A", "B; C; D"] From dafa1b2c507c4e5c41ec13ac0fa882dc2abd71ca Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Sat, 21 Jun 2025 11:53:18 -0500 Subject: [PATCH 32/43] [DEV-12574] - Remove unused import --- .../tests/integration/test_account_download_dataframe_builder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py b/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py index 0d564e33b1..9a35d17c42 100644 --- a/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py +++ b/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py @@ -3,7 +3,6 @@ import pandas as pd import pytest from django.core.management import call_command -from pyspark.sql.functions import col, to_date from usaspending_api.download.management.commands.delta_downloads.award_financial.columns import ( select_cols, groupby_cols, From 3e75bc5ecce45334b68a8fd22c0dae3c201d2d46 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Wed, 25 Jun 2025 09:05:11 -0500 Subject: [PATCH 33/43] [DEV-12772] - Adding treasury account downloads WIP --- .../download/delta_models/account_download.py | 79 ++--- .../download/delta_models/treasury_account.py | 321 ++++++++++++++++++ .../commands/load_query_to_delta.py | 26 ++ 3 files changed, 374 insertions(+), 52 deletions(-) create mode 100644 usaspending_api/download/delta_models/treasury_account.py diff --git a/usaspending_api/download/delta_models/account_download.py b/usaspending_api/download/delta_models/account_download.py index 4259ec908d..988576cb4d 100644 --- a/usaspending_api/download/delta_models/account_download.py +++ b/usaspending_api/download/delta_models/account_download.py @@ -114,6 +114,7 @@ federal_account.federal_account_code AS federal_account_symbol, federal_account.account_title AS federal_account_name, cgac_aid.agency_name AS agency_identifier_name, + cgac_ata.agency_name AS allocation_transfer_agency_identifier_name, ref_program_activity.program_activity_code, ref_program_activity.program_activity_name, object_class.object_class AS object_class_code, @@ -196,6 +197,8 @@ ) ) END AS submission_period, + 
treasury_appropriation_account.allocation_transfer_agency_id AS allocation_transfer_agency_identifier_code, + treasury_appropriation_account.agency_id AS agency_identifier_code, treasury_appropriation_account.budget_function_title AS budget_function, treasury_appropriation_account.budget_subfunction_title AS budget_subfunction, financial_accounts_by_awards.transaction_obligated_amount AS transaction_obligated_amount, @@ -300,56 +303,28 @@ submission_attributes.reporting_fiscal_quarter, submission_attributes.reporting_fiscal_year, submission_attributes.quarter_format_flag - FROM raw.financial_accounts_by_awards - INNER JOIN global_temp.submission_attributes AS submission_attributes - ON ( - financial_accounts_by_awards.submission_id - = submission_attributes.submission_id - ) - LEFT OUTER JOIN global_temp.treasury_appropriation_account - ON ( - financial_accounts_by_awards.treasury_account_id - = treasury_appropriation_account.treasury_account_identifier - ) - LEFT OUTER JOIN award_search - ON ( - financial_accounts_by_awards.award_id = award_search.award_id - ) - LEFT OUTER JOIN transaction_search - ON ( - award_search.latest_transaction_search_id - = transaction_search.transaction_id - ) - LEFT OUTER JOIN global_temp.ref_program_activity - ON ( - financial_accounts_by_awards.program_activity_id - = ref_program_activity.id - ) - LEFT OUTER JOIN global_temp.object_class - ON ( - financial_accounts_by_awards.object_class_id = object_class.id - ) - LEFT OUTER JOIN global_temp.disaster_emergency_fund_code - ON ( - financial_accounts_by_awards.disaster_emergency_fund_code - = disaster_emergency_fund_code.code - ) - LEFT OUTER JOIN global_temp.federal_account - ON ( - treasury_appropriation_account.federal_account_id = federal_account.id - ) - LEFT OUTER JOIN global_temp.toptier_agency - ON ( - federal_account.parent_toptier_agency_id - = toptier_agency.toptier_agency_id - ) - LEFT OUTER JOIN global_temp.cgac AS cgac_aid - ON ( - treasury_appropriation_account.agency_id = cgac_aid.cgac_code - ) - LEFT OUTER JOIN global_temp.cgac AS cgac_ata - ON ( - treasury_appropriation_account.allocation_transfer_agency_id - = cgac_ata.cgac_code - ); + FROM + raw.financial_accounts_by_awards + INNER JOIN global_temp.submission_attributes AS submission_attributes + ON (financial_accounts_by_awards.submission_id = submission_attributes.submission_id) + LEFT OUTER JOIN global_temp.treasury_appropriation_account + ON (financial_accounts_by_awards.treasury_account_id = treasury_appropriation_account.treasury_account_identifier) + LEFT OUTER JOIN award_search + ON (financial_accounts_by_awards.award_id = award_search.award_id) + LEFT OUTER JOIN transaction_search + ON (award_search.latest_transaction_search_id = transaction_search.transaction_id) + LEFT OUTER JOIN global_temp.ref_program_activity + ON (financial_accounts_by_awards.program_activity_id = ref_program_activity.id) + LEFT OUTER JOIN global_temp.object_class + ON (financial_accounts_by_awards.object_class_id = object_class.id) + LEFT OUTER JOIN global_temp.disaster_emergency_fund_code + ON (financial_accounts_by_awards.disaster_emergency_fund_code = disaster_emergency_fund_code.code) + LEFT OUTER JOIN global_temp.federal_account + ON (treasury_appropriation_account.federal_account_id = federal_account.id) + LEFT OUTER JOIN global_temp.toptier_agency + ON (federal_account.parent_toptier_agency_id = toptier_agency.toptier_agency_id) + LEFT OUTER JOIN global_temp.cgac AS cgac_aid + ON (treasury_appropriation_account.agency_id = cgac_aid.cgac_code) + LEFT 
OUTER JOIN global_temp.cgac AS cgac_ata + ON (treasury_appropriation_account.allocation_transfer_agency_id = cgac_ata.cgac_code); """ diff --git a/usaspending_api/download/delta_models/treasury_account.py b/usaspending_api/download/delta_models/treasury_account.py new file mode 100644 index 0000000000..6541686790 --- /dev/null +++ b/usaspending_api/download/delta_models/treasury_account.py @@ -0,0 +1,321 @@ +TREASURY_ACCOUNT_DOWNLOAD_COLUMNS = { + "financial_accounts_by_awards_id", + "submission_id" "owning_agency_name", + "reporting_agency_name", + "submission_period", + "allocation_transfer_agency_identifier_code", + "agency_identifier_code", + "beginning_period_of_availability", + "ending_period_of_availability", + "availability_type_code", + "main_account_code", + "sub_account_code", + "treasury_account_symbol", + "treasury_account_name", + "agency_identifier_name", + "allocation_transfer_agency_identifier_name", + "budget_function", + "budget_subfunction", + "federal_account_symbol", + "federal_account_name", + "program_activity_code", + "program_activity_name", + "object_class_code", + "object_class_name", + "direct_or_reimbursable_funding_source", + "disaster_emergency_fund_code", + "disaster_emergency_fund_name", + "transaction_obligated_amount", + "gross_outlay_amount_fyb_to_period_end", + "ussgl487200_downward_adj_prior_year_prepaid_undeliv_order_oblig", + "ussgl497200_downward_adj_of_prior_year_paid_deliv_orders_oblig", + "award_unique_key", + "award_id_piid", + "parent_award_id_piid", + "award_id_fain", + "award_id_uri", + "award_base_action_date", + "award_base_action_date_fiscal_year", + "award_latest_action_date", + "award_latest_action_date_fiscal_year", + "period_of_performance_start_date", + "period_of_performance_current_end_date", + "ordering_period_end_date", + "award_type_code", + "award_type", + "idv_type_code", + "idv_type", + "prime_award_base_transaction_description", + "awarding_agency_code", + "awarding_agency_name", + "awarding_subagency_code", + "awarding_subagency_name", + "awarding_office_code", + "awarding_office_name", + "funding_agency_code", + "funding_agency_name", + "funding_sub_agency_code", + "funding_sub_agency_name", + "funding_office_code", + "funding_office_name", + "recipient_uei", + "recipient_duns", + "recipient_name", + "recipient_name_raw", + "recipient_parent_uei", + "recipient_parent_duns", + "recipient_parent_name", + "recipient_parent_name_raw", + "recipient_country", + "recipient_state", + "recipient_county", + "recipient_city", + "prime_award_summary_recipient_cd_original", + "prime_award_summary_recipient_cd_current", + "recipient_zip_code", + "primary_place_of_performance_country", + "primary_place_of_performance_state", + "primary_place_of_performance_county", + "prime_award_summary_place_of_performance_cd_original", + "prime_award_summary_place_of_performance_cd_current", + "primary_place_of_performance_zip_code", + "cfda_number", + "cfda_title", + "product_or_service_code", + "product_or_service_code_description", + "naics_code", + "naics_description", + "national_interest_action_code", + "national_interest_action", + "usaspending_permalink", + "last_modified_date", +} + +TREASURY_ACCOUNT_DOWNLOAD_DELTA_COLUMNS = {} + +account_download_create_sql_string = rf""" + CREATE OR REPLACE TABLE {{DESTINATION_TABLE}} ( + {", ".join([f'{key} {val}' for key, val in TREASURY_ACCOUNT_DOWNLOAD_DELTA_COLUMNS.items()])} + ) + USING DELTA + LOCATION 's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}' + 
""" + + +treasury_account_download_load_sql_string = rf""" + INSERT OVERWRITE {{DESTINATION_DATABASE}}.{{DESTINATION_TABLE}} ( + {",".join(list(TREASURY_ACCOUNT_DOWNLOAD_COLUMNS))} + ) + SELECT + financial_accounts_by_awards.financial_accounts_by_awards_id, + financial_accounts_by_awards.submission_id, + toptier_agency.name AS owning_agency_name, + submission_attributes.reporting_agency_name AS reporting_agency_name, + CASE + WHEN submission_attributes.quarter_format_flag = TRUE + THEN + CONCAT( + CAST('FY' AS STRING), + CAST(submission_attributes.reporting_fiscal_year AS STRING), + CAST('Q' AS STRING), + CAST( + submission_attributes.reporting_fiscal_quarter AS STRING + ) + ) + ELSE + CONCAT( + CAST('FY' AS STRING), + CAST(submission_attributes.reporting_fiscal_year AS STRING), + CAST('P' AS STRING), + LPAD( + CAST( + submission_attributes.reporting_fiscal_period AS STRING + ), + 2, + '0' + ) + ) + END AS submission_period, + treasury_appropriation_account.allocation_transfer_agency_id AS allocation_transfer_agency_identifier_code, + treasury_appropriation_account.agency_id AS agency_identifier_code, + treasury_appropriation_account.beginning_period_of_availability AS beginning_period_of_availability, + treasury_appropriation_account.ending_period_of_availability AS ending_period_of_availability, + treasury_appropriation_account.availability_type_code AS availability_type_code, + treasury_appropriation_account.main_account_code AS main_account_code, + treasury_appropriation_account.sub_account_code AS sub_account_code, + treasury_appropriation_account.tas_rendering_label AS treasury_account_symbol, + treasury_appropriation_account.account_title AS treasury_account_name, + CGAC_AID.agency_name AS agency_identifier_name, + CGAC_ATA.agency_name AS allocation_transfer_agency_identifier_name, + treasury_appropriation_account.budget_function_title AS budget_function, + treasury_appropriation_account.budget_subfunction_title AS budget_subfunction, + federal_account.federal_account_code AS federal_account_symbol, + federal_account.account_title AS federal_account_name, + ref_program_activity.program_activity_code AS program_activity_code, + ref_program_activity.program_activity_name AS program_activity_name, + object_class.object_class AS object_class_code, + object_class.object_class_name AS object_class_name, + object_class.direct_reimbursable AS direct_or_reimbursable_funding_source, + financial_accounts_by_awards.disaster_emergency_fund_code AS disaster_emergency_fund_code, + disaster_emergency_fund_code.title AS disaster_emergency_fund_name, + financial_accounts_by_awards.transaction_obligated_amount AS transaction_obligated_amount, + financial_accounts_by_awards.gross_outlay_amount_by_award_cpe AS gross_outlay_amount_fyb_to_period_end, + financial_accounts_by_awards.ussgl487200_down_adj_pri_ppaid_undel_orders_oblig_refund_cpe AS ussgl487200_downward_adj_prior_year_prepaid_undeliv_order_oblig, + financial_accounts_by_awards.ussgl497200_down_adj_pri_paid_deliv_orders_oblig_refund_cpe AS ussgl497200_downward_adj_of_prior_year_paid_deliv_orders_oblig, + award_search.generated_unique_award_id AS award_unique_key, + financial_accounts_by_awards.piid AS award_id_piid, + financial_accounts_by_awards.parent_award_id AS parent_award_id_piid, + financial_accounts_by_awards.fain AS award_id_fain, + financial_accounts_by_awards.uri AS award_id_uri, + award_search.date_signed AS award_base_action_date, + EXTRACT(YEAR from (award_search.date_signed) + INTERVAL '3 months') AS 
award_base_action_date_fiscal_year, + award_search.certified_date AS award_latest_action_date, + EXTRACT(YEAR from (award_search.certified_date) + INTERVAL '3 months') AS award_latest_action_date_fiscal_year, + award_search.period_of_performance_start_date AS period_of_performance_start_date, + award_search.period_of_performance_current_end_date AS period_of_performance_current_end_date, + transaction_search.ordering_period_end_date AS ordering_period_end_date, + COALESCE(transaction_search.contract_award_type, transaction_search.type) AS award_type_code, + COALESCE(transaction_search.contract_award_type_desc, transaction_search.type_description) AS award_type, + transaction_search.idv_type AS idv_type_code, + transaction_search.idv_type_description AS idv_type, + award_search.description AS prime_award_base_transaction_description, + transaction_search.awarding_agency_code AS awarding_agency_code, + transaction_search.awarding_toptier_agency_name_raw AS awarding_agency_name, + transaction_search.awarding_sub_tier_agency_c AS awarding_subagency_code, + transaction_search.awarding_subtier_agency_name_raw AS awarding_subagency_name, + transaction_search.awarding_office_code AS awarding_office_code, + transaction_search.awarding_office_name AS awarding_office_name, + transaction_search.funding_agency_code AS funding_agency_code, + transaction_search.funding_toptier_agency_name_raw AS funding_agency_name, + transaction_search.funding_sub_tier_agency_co AS funding_sub_agency_code, + transaction_search.funding_subtier_agency_name_raw AS funding_sub_agency_name, + transaction_search.funding_office_code AS funding_office_code, + transaction_search.funding_office_name AS funding_office_name, + transaction_search.recipient_uei AS recipient_uei, + transaction_search.recipient_unique_id AS recipient_duns, + transaction_search.recipient_name AS recipient_name, + transaction_search.recipient_name_raw AS recipient_name_raw, + transaction_search.parent_uei AS recipient_parent_uei, + transaction_search.parent_uei AS recipient_parent_duns, + transaction_search.parent_recipient_name AS recipient_parent_name, + transaction_search.parent_recipient_name_raw AS recipient_parent_name_raw, + transaction_search.recipient_location_country_code AS recipient_country, + transaction_search.recipient_location_state_code AS recipient_state, + transaction_search.recipient_location_county_name AS recipient_county, + transaction_search.recipient_location_city_name AS recipient_city, + CASE + WHEN ( + transaction_search.recipient_location_state_code IS NOT NULL + AND transaction_search.recipient_location_congressional_code IS NOT NULL + AND NOT ( + transaction_search.recipient_location_state_code = '' + AND transaction_search.recipient_location_state_code IS NOT NULL + )) + THEN + CONCAT( + transaction_search.recipient_location_state_code, '-', + transaction_search.recipient_location_congressional_code + ) + ELSE transaction_search.recipient_location_congressional_code + END AS prime_award_summary_recipient_cd_original, + CASE + WHEN ( + transaction_search.recipient_location_state_code IS NOT NULL + AND transaction_search.recipient_location_congressional_code_current IS NOT NULL + AND NOT ( + transaction_search.recipient_location_state_code = '' + AND transaction_search.recipient_location_state_code IS NOT NULL + )) + THEN + CONCAT( + transaction_search.recipient_location_state_code, + '-', + transaction_search.recipient_location_congressional_code_current + ) + ELSE 
transaction_search.recipient_location_congressional_code_current + END AS prime_award_summary_recipient_cd_current, + COALESCE( + transaction_search.legal_entity_zip4, + CONCAT( + CAST(transaction_search.recipient_location_zip5 AS STRING), + CAST(transaction_search.legal_entity_zip_last4 AS STRING) + ) + ) AS recipient_zip_code, + transaction_search.pop_country_name AS primary_place_of_performance_country, + transaction_search.pop_state_name AS primary_place_of_performance_state, + transaction_search.pop_county_name AS primary_place_of_performance_county, + CASE + WHEN + transaction_search.pop_state_code IS NOT NULL + AND transaction_search.pop_congressional_code IS NOT NULL + AND NOT ( + transaction_search.pop_state_code = '' + AND transaction_search.pop_state_code IS NOT NULL + ) + THEN + CONCAT( + transaction_search.pop_state_code, + '-', + transaction_search.pop_congressional_code + ) + ELSE transaction_search.pop_congressional_code + END AS prime_award_summary_place_of_performance_cd_original, + CASE + WHEN + transaction_search.pop_state_code IS NOT NULL + AND transaction_search.pop_congressional_code_current IS NOT NULL + AND NOT ( + transaction_search.pop_state_code = '' + AND transaction_search.pop_state_code IS NOT NULL + ) + THEN + CONCAT( + transaction_search.pop_state_code, + '-', + transaction_search.pop_congressional_code_current + ) + ELSE transaction_search.pop_congressional_code_current + END AS prime_award_summary_place_of_performance_cd_current, + transaction_search.place_of_performance_zip4a AS primary_place_of_performance_zip_code, + transaction_search.cfda_number AS cfda_number, + transaction_search.cfda_title AS cfda_title, + transaction_search.product_or_service_code AS product_or_service_code, + transaction_search.product_or_service_description AS product_or_service_code_description, + transaction_search.naics_code AS naics_code, + transaction_search.naics_description AS naics_description, + transaction_search.national_interest_action AS national_interest_action_code, + transaction_search.national_interest_desc AS national_interest_action, + CASE + WHEN award_search.generated_unique_award_id IS NOT NULL + THEN CONCAT('localhost:3000/award/', URL_ENCODE(award_search.generated_unique_award_id), '/') + ELSE '/' END AS usaspending_permalink, + CAST(submission_attributes.published_date AS DATE) AS last_modified_date, + submission_attributes.reporting_fiscal_period, + submission_attributes.reporting_fiscal_quarter, + submission_attributes.reporting_fiscal_year, + submission_attributes.quarter_format_flag + FROM + raw.financial_accounts_by_awards + INNER JOIN global_temp.submission_attributes + ON (financial_accounts_by_awards.submission_id = submission_attributes.submission_id) + LEFT OUTER JOIN global_temp.treasury_appropriation_account + ON (financial_accounts_by_awards.treasury_account_id = treasury_appropriation_account.treasury_account_identifier) + LEFT OUTER JOIN global_temp.cgac AS CGAC_AID + ON (treasury_appropriation_account.agency_id = CGAC_AID.cgac_code) + LEFT OUTER JOIN global_temp.cgac AS CGAC_ATA + ON (treasury_appropriation_account.allocation_transfer_agency_id = CGAC_ATA.cgac_code) + INNER JOIN award_search + ON (financial_accounts_by_awards.award_id = award_search.award_id) + INNER JOIN transaction_search + ON (award_search.latest_transaction_search_id = transaction_search.transaction_id) + LEFT OUTER JOIN global_temp.toptier_agency + ON (treasury_appropriation_account.funding_toptier_agency_id = toptier_agency.toptier_agency_id) + LEFT OUTER JOIN 
global_temp.federal_account + ON (treasury_appropriation_account.federal_account_id = federal_account.id) + LEFT OUTER JOIN global_temp.ref_program_activity + ON (financial_accounts_by_awards.program_activity_id = ref_program_activity.id) + LEFT OUTER JOIN global_temp.object_class + ON (financial_accounts_by_awards.object_class_id = object_class.id) + LEFT OUTER JOIN global_temp.disaster_emergency_fund_code + ON (financial_accounts_by_awards.disaster_emergency_fund_code = disaster_emergency_fund_code.code) +""" diff --git a/usaspending_api/etl/management/commands/load_query_to_delta.py b/usaspending_api/etl/management/commands/load_query_to_delta.py index 248cccb424..5a876a50c1 100644 --- a/usaspending_api/etl/management/commands/load_query_to_delta.py +++ b/usaspending_api/etl/management/commands/load_query_to_delta.py @@ -24,6 +24,11 @@ account_download_create_sql_string, account_download_load_sql_string, ) +from usaspending_api.download.delta_models.treasury_account import ( + TREASURY_ACCOUNT_DOWNLOAD_POSTGRES_COLUMNS, + treasury_account_download_create_sql_string, + treasury_account_download_load_sql_string, +) from usaspending_api.recipient.delta_models import ( RECIPIENT_LOOKUP_POSTGRES_COLUMNS, RECIPIENT_PROFILE_POSTGRES_COLUMNS, @@ -334,6 +339,27 @@ "tsvectors": None, "postgres_partition_spec": None, }, + "treasury_account_download": { + "model": None, + "is_from_broker": False, + "source_query": [treasury_account_download_load_sql_string], + "source_query_incremental": None, + "source_database": None, + "source_table": None, + "destination_database": "rpt", + "swap_table": None, + "swap_schema": None, + "partition_column": "financial_accounts_by_awards_id", + "partition_column_type": "numeric", + "is_partition_column_unique": False, + "delta_table_create_sql": treasury_account_download_create_sql_string, + "source_schema": TREASURY_ACCOUNT_DOWNLOAD_POSTGRES_COLUMNS, + "custom_schema": None, + "column_names": list(TREASURY_ACCOUNT_DOWNLOAD_POSTGRES_COLUMNS), + "postgres_seq_name": None, + "tsvectors": None, + "postgres_partition_spec": None, + }, } From 6a0fe2f44af92f36d2839d899dfa61d2c2a8810d Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Mon, 30 Jun 2025 16:55:00 -0500 Subject: [PATCH 34/43] [DEV-12772] - WIP adding treasury account downloads --- .../download/delta_models/account_download.py | 34 ++++- .../download/delta_models/treasury_account.py | 3 +- .../award_financial/builders.py | 99 +++++++++---- .../award_financial/columns.py | 16 ++- .../award_financial/federal_account.py | 135 ------------------ .../commands/generate_spark_download.py | 11 +- .../commands/load_query_to_delta.py | 26 ---- 7 files changed, 119 insertions(+), 205 deletions(-) delete mode 100644 usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py diff --git a/usaspending_api/download/delta_models/account_download.py b/usaspending_api/download/delta_models/account_download.py index 67d0b6b685..df61836d11 100644 --- a/usaspending_api/download/delta_models/account_download.py +++ b/usaspending_api/download/delta_models/account_download.py @@ -1,10 +1,12 @@ ACCOUNT_DOWNLOAD_COLUMNS = { "financial_accounts_by_awards_id": {"delta": "INTEGER NOT NULL", "postgres": "INTEGER NOT NULL"}, "submission_id": {"delta": "INTEGER NOT NULL", "postgres": "INTEGER NOT NULL"}, - "owning_agency_name": {"delta": "STRING", "postgres": "TEXT"}, + "federal_owning_agency_name": {"delta": "STRING", "postgres": "TEXT"}, + "treasury_owning_agency_name": {"delta": "STRING", "postgres": 
"TEXT"}, "federal_account_symbol": {"delta": "STRING", "postgres": "TEXT"}, "federal_account_name": {"delta": "STRING", "postgres": "TEXT"}, "agency_identifier_name": {"delta": "STRING", "postgres": "TEXT"}, + "allocation_transfer_agency_identifier_name": {"delta": "STRING", "postgres": "TEXT"}, "program_activity_code": {"delta": "STRING", "postgres": "TEXT"}, "program_activity_name": {"delta": "STRING", "postgres": "TEXT"}, "object_class_code": {"delta": "STRING", "postgres": "TEXT"}, @@ -63,6 +65,15 @@ "national_interest_action": {"delta": "STRING", "postgres": "TEXT"}, "reporting_agency_name": {"delta": "STRING", "postgres": "TEXT"}, "submission_period": {"delta": "STRING", "postgres": "TEXT"}, + "allocation_transfer_agency_identifier_code": {"delta": "STRING", "postgres": "TEXT"}, + "agency_identifier_code": {"delta": "STRING", "postgres": "TEXT"}, + "beginning_period_of_availability": {"delta": "DATE", "postgres": "DATE"}, + "ending_period_of_availability": {"delta": "DATE", "postgres": "DATE"}, + "availability_type_code": {"delta": "STRING", "postgres": "TEXT"}, + "main_account_code": {"delta": "STRING", "postgres": "TEXT"}, + "sub_account_code": {"delta": "STRING", "postgres": "TEXT"}, + "treasury_account_symbol": {"delta": "STRING", "postgres": "TEXT"}, + "treasury_account_name": {"delta": "STRING", "postgres": "TEXT"}, "funding_toptier_agency_id": {"delta": "INTEGER", "postgres": "INTEGER"}, "federal_account_id": {"delta": "INTEGER", "postgres": "INTEGER"}, "budget_function": {"delta": "STRING", "postgres": "TEXT"}, @@ -114,7 +125,8 @@ SELECT financial_accounts_by_awards.financial_accounts_by_awards_id, financial_accounts_by_awards.submission_id, - toptier_agency.name AS owning_agency_name, + federal_toptier_agency.name AS federal_owning_agency_name, + treasury_toptier_agency.name AS treasury_owning_agency_name, federal_account.federal_account_code AS federal_account_symbol, federal_account.account_title AS federal_account_name, cgac_aid.agency_name AS agency_identifier_name, @@ -202,14 +214,20 @@ ) END AS submission_period, treasury_appropriation_account.allocation_transfer_agency_id AS allocation_transfer_agency_identifier_code, - treasury_appropriation_account.agency_id AS agency_identifier_code, + treasury_appropriation_account.agency_id AS agency_identifier_code, + treasury_appropriation_account.beginning_period_of_availability AS beginning_period_of_availability, + treasury_appropriation_account.ending_period_of_availability AS ending_period_of_availability, + treasury_appropriation_account.availability_type_code AS availability_type_code, + treasury_appropriation_account.main_account_code AS main_account_code, + treasury_appropriation_account.sub_account_code AS sub_account_code, + treasury_appropriation_account.tas_rendering_label AS treasury_account_symbol, + treasury_appropriation_account.account_title AS treasury_account_name, treasury_appropriation_account.funding_toptier_agency_id AS funding_toptier_agency_id, treasury_appropriation_account.federal_account_id AS federal_account_id, treasury_appropriation_account.budget_function_title AS budget_function, treasury_appropriation_account.budget_function_code AS budget_function_code, treasury_appropriation_account.budget_subfunction_title AS budget_subfunction, treasury_appropriation_account.budget_subfunction_code AS budget_subfunction_code, - financial_accounts_by_awards.transaction_obligated_amount AS transaction_obligated_amount, financial_accounts_by_awards.gross_outlay_amount_by_award_cpe as 
gross_outlay_amount_fyb_to_period_end, financial_accounts_by_awards.ussgl487200_down_adj_pri_ppaid_undel_orders_oblig_refund_cpe as ussgl487200_downward_adj_prior_year_prepaid_undeliv_order_oblig, @@ -314,7 +332,7 @@ submission_attributes.quarter_format_flag FROM raw.financial_accounts_by_awards - INNER JOIN global_temp.submission_attributes AS submission_attributes + INNER JOIN global_temp.submission_attributes ON (financial_accounts_by_awards.submission_id = submission_attributes.submission_id) LEFT OUTER JOIN global_temp.treasury_appropriation_account ON (financial_accounts_by_awards.treasury_account_id = treasury_appropriation_account.treasury_account_identifier) @@ -330,8 +348,10 @@ ON (financial_accounts_by_awards.disaster_emergency_fund_code = disaster_emergency_fund_code.code) LEFT OUTER JOIN global_temp.federal_account ON (treasury_appropriation_account.federal_account_id = federal_account.id) - LEFT OUTER JOIN global_temp.toptier_agency - ON (federal_account.parent_toptier_agency_id = toptier_agency.toptier_agency_id) + LEFT OUTER JOIN global_temp.toptier_agency as federal_toptier_agency + ON (federal_account.parent_toptier_agency_id = federal_toptier_agency.toptier_agency_id) + LEFT OUTER JOIN global_temp.toptier_agency as treasury_toptier_agency + ON (treasury_appropriation_account.funding_toptier_agency_id = treasury_toptier_agency.toptier_agency_id) LEFT OUTER JOIN global_temp.cgac AS cgac_aid ON (treasury_appropriation_account.agency_id = cgac_aid.cgac_code) LEFT OUTER JOIN global_temp.cgac AS cgac_ata diff --git a/usaspending_api/download/delta_models/treasury_account.py b/usaspending_api/download/delta_models/treasury_account.py index 6541686790..05e28ecb47 100644 --- a/usaspending_api/download/delta_models/treasury_account.py +++ b/usaspending_api/download/delta_models/treasury_account.py @@ -1,6 +1,7 @@ TREASURY_ACCOUNT_DOWNLOAD_COLUMNS = { "financial_accounts_by_awards_id", - "submission_id" "owning_agency_name", + "submission_id", + "owning_agency_name", "reporting_agency_name", "submission_period", "allocation_transfer_agency_identifier_code", diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/builders.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/builders.py index fcf871a86a..be062372aa 100644 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/builders.py +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/builders.py @@ -1,3 +1,4 @@ +from abc import ABC, abstractmethod from dataclasses import dataclass from functools import reduce from typing import Any @@ -8,12 +9,13 @@ from usaspending_api.download.management.commands.delta_downloads.award_financial.filters import AccountDownloadFilter from usaspending_api.download.management.commands.delta_downloads.award_financial.columns import ( federal_account_groupby_cols, - federal_account_select_cols, + treasury_account_select_cols, ) +from usaspending_api.download.v2.download_column_historical_lookups import query_paths from usaspending_api.submissions.helpers import get_submission_ids_for_periods -class AccountDownloadDataFrameBuilder: +class AbstractAccountDownloadDataFrameBuilder(ABC): def __init__( self, @@ -29,27 +31,10 @@ def __init__( self.budget_function = account_download_filter.budget_function self.budget_subfunction = account_download_filter.budget_subfunction self.def_codes = account_download_filter.def_codes - self.df: str = spark.table(table_name) - self.groupby_cols: 
list[str] = federal_account_groupby_cols - self.select_cols: list[str] = federal_account_select_cols - - def filter_to_latest_submissions_for_agencies(self, col_name: str, otherwise: Any = None) -> Column: - """Filter to the latest submission regardless of whether the agency submitted on a monthly or quarterly basis""" - return ( - sf.when( - sf.col("submission_id").isin( - get_submission_ids_for_periods( - self.reporting_fiscal_year, self.reporting_fiscal_quarter, self.reporting_fiscal_period - ) - ), - sf.col(col_name), - ) - .otherwise(otherwise) - .alias(col_name) - ) + self.df: DataFrame = spark.table(table_name) @property - def combined_filters(self) -> Column: + def dynamic_filters(self) -> Column: @dataclass class Condition: @@ -99,14 +84,54 @@ class Condition: [condition.condition for condition in conditions if condition.apply], ) + @property + def non_zero_filters(self) -> Column: + return ( + (sf.col("gross_outlay_amount_FYB_to_period_end") != 0) + | (sf.col("USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig") != 0) + | (sf.col("USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig") != 0) + | (sf.col("transaction_obligated_amount") != 0) + ) + @staticmethod def collect_concat(col_name: str, concat_str: str = "; ") -> Column: return sf.concat_ws(concat_str, sf.sort_array(sf.collect_set(col_name))).alias(col_name) + @property + @abstractmethod + def source_df(self) -> DataFrame: ... + + +class FederalAccountDownloadDataFrameBuilder(AbstractAccountDownloadDataFrameBuilder): + + def __init__( + self, + spark: SparkSession, + account_download_filter: AccountDownloadFilter, + table_name: str = "rpt.account_download", + ): + super().__init__(spark, account_download_filter, table_name) + self.groupby_cols: list[str] = federal_account_groupby_cols + + def filter_to_latest_submissions_for_agencies(self, col_name: str, otherwise: Any = None) -> Column: + """Filter to the latest submission regardless of whether the agency submitted on a monthly or quarterly basis""" + return ( + sf.when( + sf.col("submission_id").isin( + get_submission_ids_for_periods( + self.reporting_fiscal_year, self.reporting_fiscal_quarter, self.reporting_fiscal_period + ) + ), + sf.col(col_name), + ) + .otherwise(otherwise) + .alias(col_name) + ) + @property def source_df(self) -> DataFrame: return ( - self.df.filter(self.combined_filters) + self.df.filter(self.dynamic_filters) .groupBy(self.groupby_cols) .agg( *[ @@ -124,11 +149,29 @@ def source_df(self) -> DataFrame: ], sf.max(sf.col("last_modified_date")).alias("last_modified_date"), ) - .filter( - (sf.col("gross_outlay_amount_FYB_to_period_end") != 0) - | (sf.col("USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig") != 0) - | (sf.col("USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig") != 0) - | (sf.col("transaction_obligated_amount") != 0) + .filter(self.non_zero_filters) + .select( + [sf.col("federal_owning_agency_name").alias("owning_agency_name")] + + [ + col + for col in query_paths["award_financial"]["federal_account"].keys() + if col != "owning_agency_name" and not col.startswith("last_modified_date") + ] + + ["last_modified_date"] ) - .select(self.select_cols) + ) + + +class TreasuryAccountDownloadDataFrameBuilder(AbstractAccountDownloadDataFrameBuilder): + + @property + def source_df(self) -> DataFrame: + return self.df.filter(self.dynamic_filters & self.non_zero_filters).select( + [sf.col("treasury_owning_agency_name").alias("owning_agency_name")] + + [ + col + for col in 
query_paths["award_financial"]["treasury_account"].keys() + if col != "owning_agency_name" and not col.startswith("last_modified_date") + ] + + ["last_modified_date"] ) diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/columns.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/columns.py index b36e956503..7300a17d19 100644 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/columns.py +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/columns.py @@ -1,7 +1,7 @@ from usaspending_api.download.v2.download_column_historical_lookups import query_paths federal_account_groupby_cols = [ - "owning_agency_name", + "federal_owning_agency_name", "federal_account_symbol", "federal_account_name", "agency_identifier_name", @@ -74,7 +74,13 @@ "prime_award_summary_place_of_performance_cd_current", ] -federal_account_select_cols = [ - col if not col.startswith("last_modified_date") else "last_modified_date" - for col in query_paths["award_financial"]["federal_account"].keys() -] + +treasury_account_select_cols = ( + ["treasury_owning_agency_name"] + + [ + col + for col in query_paths["award_financial"]["treasury_account"].keys() + if col != "owning_agency_name" and not col.startswith("last_modified_date") + ] + + ["last_modified_date"] +) diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py deleted file mode 100644 index 3cea41c7e4..0000000000 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/federal_account.py +++ /dev/null @@ -1,135 +0,0 @@ -from dataclasses import dataclass -from functools import reduce -from typing import Any - -from pyspark.sql import DataFrame, SparkSession -from pyspark.sql import functions as sf, Column - -from usaspending_api.submissions.helpers import get_submission_ids_for_periods -from usaspending_api.download.management.commands.delta_downloads.award_financial.columns import ( - groupby_cols, - select_cols, -) - - -@dataclass -class AccountDownloadFilter: - year: int - month: int | None = None - quarter: int | None = None - agency: int | None = None - federal_account_id: int | None = None - def_codes: list[str] | None = None - - def __post_init__(self): - if self.month is None and self.quarter is None: - raise ValueError("Must define month or quarter.") - elif self.month is not None and self.quarter is not None: - raise ValueError("Month and quarter are mutually exclusive.") - - -class AccountDownloadDataFrameBuilder: - - def __init__( - self, - spark: SparkSession, - account_download_filter: AccountDownloadFilter, - table_name: str = "rpt.account_download", - ): - self.reporting_fiscal_year = account_download_filter.year - self.reporting_fiscal_quarter = account_download_filter.quarter or account_download_filter.month // 3 - self.reporting_fiscal_period = account_download_filter.month or account_download_filter.quarter * 3 - self.agency = account_download_filter.agency - self.federal_account_id = account_download_filter.federal_account_id - self.def_codes = account_download_filter.def_codes - self.df: str = spark.table(table_name) - self.groupby_cols: list[str] = groupby_cols - self.select_cols: list[str] = select_cols - - def filter_to_latest_submissions_for_agencies(self, col_name: str, otherwise: Any = None) -> Column: - """Filter to the latest submission 
regardless of whether the agency submitted on a monthly or quarterly basis""" - return ( - sf.when( - sf.col("submission_id").isin( - get_submission_ids_for_periods( - self.reporting_fiscal_year, self.reporting_fiscal_quarter, self.reporting_fiscal_period - ) - ), - sf.col(col_name), - ) - .otherwise(otherwise) - .alias(col_name) - ) - - @property - def combined_filters(self) -> Column: - - @dataclass - class Condition: - name: str - condition: Column - apply: bool - - conditions = [ - Condition(name="year", condition=sf.col("reporting_fiscal_year") == self.reporting_fiscal_year, apply=True), - Condition( - name="quarter or month", - condition=( - (sf.col("reporting_fiscal_period") <= self.reporting_fiscal_period) & ~sf.col("quarter_format_flag") - ) - | ( - (sf.col("reporting_fiscal_quarter") <= self.reporting_fiscal_quarter) - & sf.col("quarter_format_flag") - ), - apply=True, - ), - Condition(name="agency", condition=sf.col("agency_code") == self.agency, apply=bool(self.agency)), - Condition( - name="federal account", - condition=sf.col("federal_account_id") == self.federal_account_id, - apply=bool(self.federal_account_id), - ), - Condition( - name="def_codes", - condition=sf.col("disaster_emergency_fund_code").isin(self.def_codes), - apply=bool(self.def_codes), - ), - ] - return reduce( - lambda x, y: x & y, - [condition.condition for condition in conditions if condition.apply], - ) - - @staticmethod - def collect_concat(col_name: str, concat_str: str = "; ") -> Column: - return sf.concat_ws(concat_str, sf.sort_array(sf.collect_set(col_name))).alias(col_name) - - @property - def source_df(self) -> DataFrame: - return ( - self.df.filter(self.combined_filters) - .groupBy(self.groupby_cols) - .agg( - *[ - self.collect_concat(col) - for col in ["reporting_agency_name", "budget_function", "budget_subfunction"] - ], - sf.sum("transaction_obligated_amount").alias("transaction_obligated_amount"), - *[ - sf.sum(self.filter_to_latest_submissions_for_agencies(col)).alias(col) - for col in [ - "gross_outlay_amount_FYB_to_period_end", - "USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig", - "USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig", - ] - ], - sf.max(sf.col("last_modified_date")).alias("last_modified_date"), - ) - .filter( - (sf.col("gross_outlay_amount_FYB_to_period_end") != 0) - | (sf.col("USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig") != 0) - | (sf.col("USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig") != 0) - | (sf.col("transaction_obligated_amount") != 0) - ) - .select(self.select_cols) - ) diff --git a/usaspending_api/download/management/commands/generate_spark_download.py b/usaspending_api/download/management/commands/generate_spark_download.py index 8b5d5e43e5..469a86c3a6 100644 --- a/usaspending_api/download/management/commands/generate_spark_download.py +++ b/usaspending_api/download/management/commands/generate_spark_download.py @@ -25,7 +25,8 @@ from usaspending_api.download.filestreaming.download_source import DownloadSource from usaspending_api.download.lookups import JOB_STATUS_DICT, FILE_FORMATS, VALUE_MAPPINGS from usaspending_api.download.management.commands.delta_downloads.award_financial.builders import ( - AccountDownloadDataFrameBuilder, + FederalAccountDownloadDataFrameBuilder, + TreasuryAccountDownloadDataFrameBuilder, ) from usaspending_api.download.management.commands.delta_downloads.award_financial.filters import AccountDownloadFilter from usaspending_api.download.models import DownloadJob @@ -36,9 
+37,13 @@ DOWNLOAD_SPEC = { "award_financial": { "federal_account": { - "df_builder": AccountDownloadDataFrameBuilder, + "df_builder": FederalAccountDownloadDataFrameBuilder, "validator_type": AccountDownloadValidator, - } + }, + "treasury_account": { + "df_builder": TreasuryAccountDownloadDataFrameBuilder, + "validator_type": AccountDownloadValidator, + }, } } diff --git a/usaspending_api/etl/management/commands/load_query_to_delta.py b/usaspending_api/etl/management/commands/load_query_to_delta.py index ff3c95a640..210b4ff40d 100644 --- a/usaspending_api/etl/management/commands/load_query_to_delta.py +++ b/usaspending_api/etl/management/commands/load_query_to_delta.py @@ -24,11 +24,6 @@ account_download_create_sql_string, account_download_load_sql_string, ) -from usaspending_api.download.delta_models.treasury_account import ( - TREASURY_ACCOUNT_DOWNLOAD_POSTGRES_COLUMNS, - treasury_account_download_create_sql_string, - treasury_account_download_load_sql_string, -) from usaspending_api.recipient.delta_models import ( RECIPIENT_LOOKUP_POSTGRES_COLUMNS, RECIPIENT_PROFILE_POSTGRES_COLUMNS, @@ -342,27 +337,6 @@ "tsvectors": None, "postgres_partition_spec": None, }, - "treasury_account_download": { - "model": None, - "is_from_broker": False, - "source_query": [treasury_account_download_load_sql_string], - "source_query_incremental": None, - "source_database": None, - "source_table": None, - "destination_database": "rpt", - "swap_table": None, - "swap_schema": None, - "partition_column": "financial_accounts_by_awards_id", - "partition_column_type": "numeric", - "is_partition_column_unique": False, - "delta_table_create_sql": treasury_account_download_create_sql_string, - "source_schema": TREASURY_ACCOUNT_DOWNLOAD_POSTGRES_COLUMNS, - "custom_schema": None, - "column_names": list(TREASURY_ACCOUNT_DOWNLOAD_POSTGRES_COLUMNS), - "postgres_seq_name": None, - "tsvectors": None, - "postgres_partition_spec": None, - }, } From fcc6401c2ab0f0a821f0f989853ae0eb9e0c3f83 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Tue, 1 Jul 2025 13:07:48 -0500 Subject: [PATCH 35/43] [DEV-12772] - WIP adding treasury account downloads - updating tests --- .../award_financial/builders.py | 73 +++++++++------- .../award_financial/columns.py | 86 ------------------- ...test_account_download_dataframe_builder.py | 24 +++--- 3 files changed, 57 insertions(+), 126 deletions(-) delete mode 100644 usaspending_api/download/management/commands/delta_downloads/award_financial/columns.py diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/builders.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/builders.py index be062372aa..9f83151521 100644 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/builders.py +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/builders.py @@ -7,10 +7,6 @@ from pyspark.sql import functions as sf, Column from usaspending_api.download.management.commands.delta_downloads.award_financial.filters import AccountDownloadFilter -from usaspending_api.download.management.commands.delta_downloads.award_financial.columns import ( - federal_account_groupby_cols, - treasury_account_select_cols, -) from usaspending_api.download.v2.download_column_historical_lookups import query_paths from usaspending_api.submissions.helpers import get_submission_ids_for_periods @@ -111,7 +107,44 @@ def __init__( table_name: str = "rpt.account_download", ): super().__init__(spark, 
account_download_filter, table_name) - self.groupby_cols: list[str] = federal_account_groupby_cols + self.agg_cols = { + "reporting_agency_name": self.collect_concat, + "budget_function": self.collect_concat, + "budget_subfunction": self.collect_concat, + "transaction_obligated_amount": lambda col: sf.sum(col).alias(col), + "gross_outlay_amount_FYB_to_period_end": self.filter_and_sum, + "USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig": self.filter_and_sum, + "USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig": self.filter_and_sum, + "last_modified_date": lambda col: sf.max(col).alias(col), + } + self.select_cols = ( + [sf.col("federal_owning_agency_name").alias("owning_agency_name")] + + [ + col + for col in query_paths["award_financial"]["federal_account"].keys() + if col != "owning_agency_name" and not col.startswith("last_modified_date") + ] + + ["last_modified_date"] + ) + self.groupby_cols = [ + col + for col in self.df.columns + if col + not in ( + list(self.agg_cols) + + [ + "submission_id", + "federal_account_id", + "funding_toptier_agency_id", + "budget_function_code", + "budget_subfunction_code", + "reporting_fiscal_period", + "reporting_fiscal_quarter", + "reporting_fiscal_year", + "quarter_format_flag", + ] + ) + ] def filter_to_latest_submissions_for_agencies(self, col_name: str, otherwise: Any = None) -> Column: """Filter to the latest submission regardless of whether the agency submitted on a monthly or quarterly basis""" @@ -128,37 +161,17 @@ def filter_to_latest_submissions_for_agencies(self, col_name: str, otherwise: An .alias(col_name) ) + def filter_and_sum(self, col_name: str) -> Column: + return sf.sum(self.filter_to_latest_submissions_for_agencies(col_name)).alias(col_name) + @property def source_df(self) -> DataFrame: return ( self.df.filter(self.dynamic_filters) .groupBy(self.groupby_cols) - .agg( - *[ - self.collect_concat(col) - for col in ["reporting_agency_name", "budget_function", "budget_subfunction"] - ], - sf.sum("transaction_obligated_amount").alias("transaction_obligated_amount"), - *[ - sf.sum(self.filter_to_latest_submissions_for_agencies(col)).alias(col) - for col in [ - "gross_outlay_amount_FYB_to_period_end", - "USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig", - "USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig", - ] - ], - sf.max(sf.col("last_modified_date")).alias("last_modified_date"), - ) + .agg(*[agg_func(col) for col, agg_func in self.agg_cols.items()]) .filter(self.non_zero_filters) - .select( - [sf.col("federal_owning_agency_name").alias("owning_agency_name")] - + [ - col - for col in query_paths["award_financial"]["federal_account"].keys() - if col != "owning_agency_name" and not col.startswith("last_modified_date") - ] - + ["last_modified_date"] - ) + .select(self.select_cols) ) diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/columns.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/columns.py deleted file mode 100644 index 7300a17d19..0000000000 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/columns.py +++ /dev/null @@ -1,86 +0,0 @@ -from usaspending_api.download.v2.download_column_historical_lookups import query_paths - -federal_account_groupby_cols = [ - "federal_owning_agency_name", - "federal_account_symbol", - "federal_account_name", - "agency_identifier_name", - "program_activity_code", - "program_activity_name", - "object_class_code", - "object_class_name", - 
"direct_or_reimbursable_funding_source", - "disaster_emergency_fund_code", - "disaster_emergency_fund_name", - "award_unique_key", - "award_id_piid", - "parent_award_id_piid", - "award_id_fain", - "award_id_uri", - "award_base_action_date", - "award_latest_action_date", - "period_of_performance_start_date", - "period_of_performance_current_end_date", - "ordering_period_end_date", - "idv_type_code", - "idv_type", - "prime_award_base_transaction_description", - "awarding_agency_code", - "awarding_agency_name", - "awarding_subagency_code", - "awarding_subagency_name", - "awarding_office_code", - "awarding_office_name", - "funding_agency_code", - "funding_agency_name", - "funding_sub_agency_code", - "funding_sub_agency_name", - "funding_office_code", - "funding_office_name", - "recipient_uei", - "recipient_duns", - "recipient_name", - "recipient_name_raw", - "recipient_parent_uei", - "recipient_parent_duns", - "recipient_parent_name", - "recipient_parent_name_raw", - "recipient_country", - "recipient_state", - "recipient_county", - "recipient_city", - "primary_place_of_performance_country", - "primary_place_of_performance_state", - "primary_place_of_performance_county", - "primary_place_of_performance_zip_code", - "cfda_number", - "cfda_title", - "product_or_service_code", - "product_or_service_code_description", - "naics_code", - "naics_description", - "national_interest_action_code", - "national_interest_action", - "submission_period", - "award_type_code", - "award_type", - "recipient_zip_code", - "award_base_action_date_fiscal_year", - "award_latest_action_date_fiscal_year", - "usaspending_permalink", - "prime_award_summary_recipient_cd_original", - "prime_award_summary_recipient_cd_current", - "prime_award_summary_place_of_performance_cd_original", - "prime_award_summary_place_of_performance_cd_current", -] - - -treasury_account_select_cols = ( - ["treasury_owning_agency_name"] - + [ - col - for col in query_paths["award_financial"]["treasury_account"].keys() - if col != "owning_agency_name" and not col.startswith("last_modified_date") - ] - + ["last_modified_date"] -) diff --git a/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py b/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py index c8953bc390..3b406bee11 100644 --- a/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py +++ b/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py @@ -4,14 +4,11 @@ import pytest from django.core.management import call_command from model_bakery import baker -from usaspending_api.download.management.commands.delta_downloads.award_financial.columns import ( - federal_account_select_cols, - federal_account_groupby_cols, -) from usaspending_api.download.management.commands.delta_downloads.award_financial.builders import ( - AccountDownloadDataFrameBuilder, + FederalAccountDownloadDataFrameBuilder, ) from usaspending_api.download.management.commands.delta_downloads.award_financial.filters import AccountDownloadFilter +from usaspending_api.download.v2.download_column_historical_lookups import query_paths @pytest.fixture(scope="function") @@ -21,7 +18,14 @@ def account_download_table(spark, s3_unittest_data_bucket, hive_unittest_metasto f"--destination-table=account_download", f"--spark-s3-bucket={s3_unittest_data_bucket}", ) - columns = list(set(federal_account_select_cols + federal_account_groupby_cols)) + [ + columns = [ + col + for col in 
query_paths["award_financial"]["federal_account"].keys() + if col != "owning_agency_name" and not col.startswith("last_modified_date") + ] + [ + "federal_owning_agency_name", + "treasury_owning_agency_name", + "last_modified_date", "reporting_fiscal_year", "reporting_fiscal_quarter", "reporting_fiscal_period", @@ -40,7 +44,7 @@ def account_download_table(spark, s3_unittest_data_bucket, hive_unittest_metasto "reporting_fiscal_period": [None, None, 5, None, None], "transaction_obligated_amount": [100, 100, 100, 100, 100], "submission_id": [1, 2, 3, 4, 5], - "owning_agency_name": ["test1", "test2", "test2", "test2", "test3"], + "federal_owning_agency_name": ["test1", "test2", "test2", "test2", "test3"], "reporting_agency_name": ["A", "B", "C", "D", "E"], "budget_function": ["A", "B", "C", "D", "E"], "budget_subfunction": ["A", "B", "C", "D", "E"], @@ -83,7 +87,7 @@ def test_account_download_dataframe_builder(mock_get_submission_ids_for_periods, fy=2018, quarter=4, ) - builder = AccountDownloadDataFrameBuilder(spark, account_download_filter, "rpt.account_download") + builder = FederalAccountDownloadDataFrameBuilder(spark, account_download_filter, "rpt.account_download") result = builder.source_df for col in ["reporting_agency_name", "budget_function", "budget_subfunction"]: assert sorted(result.toPandas()[col].to_list()) == ["A", "B; C; D"] @@ -102,7 +106,7 @@ def test_filter_by_agency(mock_get_submission_ids_for_periods, spark, account_do quarter=4, agency=2, ) - builder = AccountDownloadDataFrameBuilder(spark, account_download_filter) + builder = FederalAccountDownloadDataFrameBuilder(spark, account_download_filter) result = builder.source_df for col in ["reporting_agency_name", "budget_function", "budget_subfunction"]: assert sorted(result.toPandas()[col].to_list()) == ["B; C; D"] @@ -123,7 +127,7 @@ def test_filter_by_federal_account_id( quarter=4, federal_account=1, ) - builder = AccountDownloadDataFrameBuilder(spark, account_download_filter) + builder = FederalAccountDownloadDataFrameBuilder(spark, account_download_filter) result = builder.source_df for col in ["reporting_agency_name", "budget_function", "budget_subfunction"]: assert sorted(result.toPandas()[col].to_list()) == ["A"] From 4b50fc70c1d8625737ad8f072a697abb014ee639 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Tue, 1 Jul 2025 15:36:31 -0500 Subject: [PATCH 36/43] [DEV-12772] - WIP adding treasury account downloads - updating tests --- .../award_financial/builders.py | 32 +++--- ...test_account_download_dataframe_builder.py | 104 +++++++++++++----- 2 files changed, 87 insertions(+), 49 deletions(-) diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/builders.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/builders.py index 9f83151521..32e2bab218 100644 --- a/usaspending_api/download/management/commands/delta_downloads/award_financial/builders.py +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/builders.py @@ -126,25 +126,18 @@ def __init__( ] + ["last_modified_date"] ) - self.groupby_cols = [ - col - for col in self.df.columns - if col - not in ( - list(self.agg_cols) - + [ - "submission_id", - "federal_account_id", - "funding_toptier_agency_id", - "budget_function_code", - "budget_subfunction_code", - "reporting_fiscal_period", - "reporting_fiscal_quarter", - "reporting_fiscal_year", - "quarter_format_flag", - ] - ) + filter_cols = [ + "submission_id", + "federal_account_id", + "funding_toptier_agency_id", + 
"budget_function_code", + "budget_subfunction_code", + "reporting_fiscal_period", + "reporting_fiscal_quarter", + "reporting_fiscal_year", + "quarter_format_flag", ] + self.groupby_cols = [col for col in self.df.columns if col not in list(self.agg_cols) + filter_cols] def filter_to_latest_submissions_for_agencies(self, col_name: str, otherwise: Any = None) -> Column: """Filter to the latest submission regardless of whether the agency submitted on a monthly or quarterly basis""" @@ -179,7 +172,7 @@ class TreasuryAccountDownloadDataFrameBuilder(AbstractAccountDownloadDataFrameBu @property def source_df(self) -> DataFrame: - return self.df.filter(self.dynamic_filters & self.non_zero_filters).select( + select_cols = ( [sf.col("treasury_owning_agency_name").alias("owning_agency_name")] + [ col @@ -188,3 +181,4 @@ def source_df(self) -> DataFrame: ] + ["last_modified_date"] ) + return self.df.filter(self.dynamic_filters & self.non_zero_filters).select(select_cols) diff --git a/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py b/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py index 3b406bee11..3804b7614b 100644 --- a/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py +++ b/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py @@ -6,6 +6,7 @@ from model_bakery import baker from usaspending_api.download.management.commands.delta_downloads.award_financial.builders import ( FederalAccountDownloadDataFrameBuilder, + TreasuryAccountDownloadDataFrameBuilder, ) from usaspending_api.download.management.commands.delta_downloads.award_financial.filters import AccountDownloadFilter from usaspending_api.download.v2.download_column_historical_lookups import query_paths @@ -18,24 +19,34 @@ def account_download_table(spark, s3_unittest_data_bucket, hive_unittest_metasto f"--destination-table=account_download", f"--spark-s3-bucket={s3_unittest_data_bucket}", ) - columns = [ - col - for col in query_paths["award_financial"]["federal_account"].keys() - if col != "owning_agency_name" and not col.startswith("last_modified_date") - ] + [ - "federal_owning_agency_name", - "treasury_owning_agency_name", - "last_modified_date", - "reporting_fiscal_year", - "reporting_fiscal_quarter", - "reporting_fiscal_period", - "quarter_format_flag", - "submission_id", - "federal_account_id", - "funding_toptier_agency_id", - "budget_function_code", - "budget_subfunction_code", - ] + columns = list( + set( + [ + col + for col in query_paths["award_financial"]["federal_account"].keys() + if col != "owning_agency_name" and not col.startswith("last_modified_date") + ] + + [ + col + for col in query_paths["award_financial"]["treasury_account"].keys() + if col != "owning_agency_name" and not col.startswith("last_modified_date") + ] + + [ + "federal_owning_agency_name", + "treasury_owning_agency_name", + "last_modified_date", + "reporting_fiscal_year", + "reporting_fiscal_quarter", + "reporting_fiscal_period", + "quarter_format_flag", + "submission_id", + "federal_account_id", + "funding_toptier_agency_id", + "budget_function_code", + "budget_subfunction_code", + ] + ) + ) test_data_df = pd.DataFrame( data={ "reporting_fiscal_year": [2018, 2018, 2018, 2018, 2019], @@ -81,7 +92,7 @@ def federal_account_models(db): @patch( "usaspending_api.download.management.commands.delta_downloads.award_financial.builders.get_submission_ids_for_periods" ) -def 
test_account_download_dataframe_builder(mock_get_submission_ids_for_periods, spark, account_download_table): +def test_federal_account_download_dataframe_builder(mock_get_submission_ids_for_periods, spark, account_download_table): mock_get_submission_ids_for_periods.return_value = [1, 2, 4, 5] account_download_filter = AccountDownloadFilter( fy=2018, @@ -89,16 +100,17 @@ def test_account_download_dataframe_builder(mock_get_submission_ids_for_periods, ) builder = FederalAccountDownloadDataFrameBuilder(spark, account_download_filter, "rpt.account_download") result = builder.source_df + result_df = result.toPandas() for col in ["reporting_agency_name", "budget_function", "budget_subfunction"]: - assert sorted(result.toPandas()[col].to_list()) == ["A", "B; C; D"] - assert sorted(result.toPandas().transaction_obligated_amount.to_list()) == [100, 300] - assert sorted(result.toPandas().gross_outlay_amount_FYB_to_period_end.to_list()) == [100, 200] + assert sorted(result_df[col].to_list()) == ["A", "B; C; D"] + assert sorted(result_df.transaction_obligated_amount.to_list()) == [100, 300] + assert sorted(result_df.gross_outlay_amount_FYB_to_period_end.to_list()) == [100, 200] @patch( "usaspending_api.download.management.commands.delta_downloads.award_financial.builders.get_submission_ids_for_periods" ) -def test_filter_by_agency(mock_get_submission_ids_for_periods, spark, account_download_table, agency_models): +def test_filter_federal_by_agency(mock_get_submission_ids_for_periods, spark, account_download_table, agency_models): mock_get_submission_ids_for_periods.return_value = [1, 2, 4, 5] account_download_filter = AccountDownloadFilter( @@ -108,16 +120,17 @@ def test_filter_by_agency(mock_get_submission_ids_for_periods, spark, account_do ) builder = FederalAccountDownloadDataFrameBuilder(spark, account_download_filter) result = builder.source_df + result_df = result.toPandas() for col in ["reporting_agency_name", "budget_function", "budget_subfunction"]: - assert sorted(result.toPandas()[col].to_list()) == ["B; C; D"] - assert sorted(result.toPandas().transaction_obligated_amount.to_list()) == [300] - assert sorted(result.toPandas().gross_outlay_amount_FYB_to_period_end.to_list()) == [200] + assert sorted(result_df[col].to_list()) == ["B; C; D"] + assert result_df.transaction_obligated_amount.to_list() == [300] + assert result_df.gross_outlay_amount_FYB_to_period_end.to_list() == [200] @patch( "usaspending_api.download.management.commands.delta_downloads.award_financial.builders.get_submission_ids_for_periods" ) -def test_filter_by_federal_account_id( +def test_filter_federal_by_federal_account_id( mock_get_submission_ids_for_periods, spark, account_download_table, federal_account_models ): mock_get_submission_ids_for_periods.return_value = [1, 2, 4, 5] @@ -129,7 +142,38 @@ def test_filter_by_federal_account_id( ) builder = FederalAccountDownloadDataFrameBuilder(spark, account_download_filter) result = builder.source_df + result_df = result.toPandas() + for col in ["reporting_agency_name", "budget_function", "budget_subfunction"]: + assert sorted(result_df[col].to_list()) == ["A"] + assert sorted(result_df.transaction_obligated_amount.to_list()) == [100] + assert sorted(result_df.gross_outlay_amount_FYB_to_period_end.to_list()) == [100] + + +def test_treasury_account_download_dataframe_builder(spark, account_download_table): + account_download_filter = AccountDownloadFilter( + fy=2018, + quarter=4, + ) + builder = TreasuryAccountDownloadDataFrameBuilder(spark, account_download_filter) + result = 
builder.source_df + result_df = result.toPandas() + for col in ["reporting_agency_name", "budget_function", "budget_subfunction"]: + assert sorted(result_df[col].to_list()) == ["A", "B", "C", "D"] + assert result_df.transaction_obligated_amount.to_list() == [100] * 4 + assert result_df.gross_outlay_amount_FYB_to_period_end.to_list() == [100] * 4 + + +def test_filter_treasury_by_agency(spark, account_download_table, agency_models): + + account_download_filter = AccountDownloadFilter( + fy=2018, + quarter=4, + agency=2, + ) + builder = TreasuryAccountDownloadDataFrameBuilder(spark, account_download_filter) + result = builder.source_df + result_df = result.toPandas() for col in ["reporting_agency_name", "budget_function", "budget_subfunction"]: - assert sorted(result.toPandas()[col].to_list()) == ["A"] - assert sorted(result.toPandas().transaction_obligated_amount.to_list()) == [100] - assert sorted(result.toPandas().gross_outlay_amount_FYB_to_period_end.to_list()) == [100] + assert sorted(result_df[col].to_list()) == ["B", "C", "D"] + assert result_df.transaction_obligated_amount.to_list() == [100] * 3 + assert result_df.gross_outlay_amount_FYB_to_period_end.to_list() == [100] * 3 From e808e6e4392003089951ceb4ba9d9c8289756a08 Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Tue, 1 Jul 2025 15:39:10 -0500 Subject: [PATCH 37/43] [DEV-12772] - Removing treasury account download load sql --- .../download/delta_models/treasury_account.py | 322 ------------------ 1 file changed, 322 deletions(-) delete mode 100644 usaspending_api/download/delta_models/treasury_account.py diff --git a/usaspending_api/download/delta_models/treasury_account.py b/usaspending_api/download/delta_models/treasury_account.py deleted file mode 100644 index 05e28ecb47..0000000000 --- a/usaspending_api/download/delta_models/treasury_account.py +++ /dev/null @@ -1,322 +0,0 @@ -TREASURY_ACCOUNT_DOWNLOAD_COLUMNS = { - "financial_accounts_by_awards_id", - "submission_id", - "owning_agency_name", - "reporting_agency_name", - "submission_period", - "allocation_transfer_agency_identifier_code", - "agency_identifier_code", - "beginning_period_of_availability", - "ending_period_of_availability", - "availability_type_code", - "main_account_code", - "sub_account_code", - "treasury_account_symbol", - "treasury_account_name", - "agency_identifier_name", - "allocation_transfer_agency_identifier_name", - "budget_function", - "budget_subfunction", - "federal_account_symbol", - "federal_account_name", - "program_activity_code", - "program_activity_name", - "object_class_code", - "object_class_name", - "direct_or_reimbursable_funding_source", - "disaster_emergency_fund_code", - "disaster_emergency_fund_name", - "transaction_obligated_amount", - "gross_outlay_amount_fyb_to_period_end", - "ussgl487200_downward_adj_prior_year_prepaid_undeliv_order_oblig", - "ussgl497200_downward_adj_of_prior_year_paid_deliv_orders_oblig", - "award_unique_key", - "award_id_piid", - "parent_award_id_piid", - "award_id_fain", - "award_id_uri", - "award_base_action_date", - "award_base_action_date_fiscal_year", - "award_latest_action_date", - "award_latest_action_date_fiscal_year", - "period_of_performance_start_date", - "period_of_performance_current_end_date", - "ordering_period_end_date", - "award_type_code", - "award_type", - "idv_type_code", - "idv_type", - "prime_award_base_transaction_description", - "awarding_agency_code", - "awarding_agency_name", - "awarding_subagency_code", - "awarding_subagency_name", - "awarding_office_code", - "awarding_office_name", 
- "funding_agency_code", - "funding_agency_name", - "funding_sub_agency_code", - "funding_sub_agency_name", - "funding_office_code", - "funding_office_name", - "recipient_uei", - "recipient_duns", - "recipient_name", - "recipient_name_raw", - "recipient_parent_uei", - "recipient_parent_duns", - "recipient_parent_name", - "recipient_parent_name_raw", - "recipient_country", - "recipient_state", - "recipient_county", - "recipient_city", - "prime_award_summary_recipient_cd_original", - "prime_award_summary_recipient_cd_current", - "recipient_zip_code", - "primary_place_of_performance_country", - "primary_place_of_performance_state", - "primary_place_of_performance_county", - "prime_award_summary_place_of_performance_cd_original", - "prime_award_summary_place_of_performance_cd_current", - "primary_place_of_performance_zip_code", - "cfda_number", - "cfda_title", - "product_or_service_code", - "product_or_service_code_description", - "naics_code", - "naics_description", - "national_interest_action_code", - "national_interest_action", - "usaspending_permalink", - "last_modified_date", -} - -TREASURY_ACCOUNT_DOWNLOAD_DELTA_COLUMNS = {} - -account_download_create_sql_string = rf""" - CREATE OR REPLACE TABLE {{DESTINATION_TABLE}} ( - {", ".join([f'{key} {val}' for key, val in TREASURY_ACCOUNT_DOWNLOAD_DELTA_COLUMNS.items()])} - ) - USING DELTA - LOCATION 's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}' - """ - - -treasury_account_download_load_sql_string = rf""" - INSERT OVERWRITE {{DESTINATION_DATABASE}}.{{DESTINATION_TABLE}} ( - {",".join(list(TREASURY_ACCOUNT_DOWNLOAD_COLUMNS))} - ) - SELECT - financial_accounts_by_awards.financial_accounts_by_awards_id, - financial_accounts_by_awards.submission_id, - toptier_agency.name AS owning_agency_name, - submission_attributes.reporting_agency_name AS reporting_agency_name, - CASE - WHEN submission_attributes.quarter_format_flag = TRUE - THEN - CONCAT( - CAST('FY' AS STRING), - CAST(submission_attributes.reporting_fiscal_year AS STRING), - CAST('Q' AS STRING), - CAST( - submission_attributes.reporting_fiscal_quarter AS STRING - ) - ) - ELSE - CONCAT( - CAST('FY' AS STRING), - CAST(submission_attributes.reporting_fiscal_year AS STRING), - CAST('P' AS STRING), - LPAD( - CAST( - submission_attributes.reporting_fiscal_period AS STRING - ), - 2, - '0' - ) - ) - END AS submission_period, - treasury_appropriation_account.allocation_transfer_agency_id AS allocation_transfer_agency_identifier_code, - treasury_appropriation_account.agency_id AS agency_identifier_code, - treasury_appropriation_account.beginning_period_of_availability AS beginning_period_of_availability, - treasury_appropriation_account.ending_period_of_availability AS ending_period_of_availability, - treasury_appropriation_account.availability_type_code AS availability_type_code, - treasury_appropriation_account.main_account_code AS main_account_code, - treasury_appropriation_account.sub_account_code AS sub_account_code, - treasury_appropriation_account.tas_rendering_label AS treasury_account_symbol, - treasury_appropriation_account.account_title AS treasury_account_name, - CGAC_AID.agency_name AS agency_identifier_name, - CGAC_ATA.agency_name AS allocation_transfer_agency_identifier_name, - treasury_appropriation_account.budget_function_title AS budget_function, - treasury_appropriation_account.budget_subfunction_title AS budget_subfunction, - federal_account.federal_account_code AS federal_account_symbol, - federal_account.account_title AS 
federal_account_name, - ref_program_activity.program_activity_code AS program_activity_code, - ref_program_activity.program_activity_name AS program_activity_name, - object_class.object_class AS object_class_code, - object_class.object_class_name AS object_class_name, - object_class.direct_reimbursable AS direct_or_reimbursable_funding_source, - financial_accounts_by_awards.disaster_emergency_fund_code AS disaster_emergency_fund_code, - disaster_emergency_fund_code.title AS disaster_emergency_fund_name, - financial_accounts_by_awards.transaction_obligated_amount AS transaction_obligated_amount, - financial_accounts_by_awards.gross_outlay_amount_by_award_cpe AS gross_outlay_amount_fyb_to_period_end, - financial_accounts_by_awards.ussgl487200_down_adj_pri_ppaid_undel_orders_oblig_refund_cpe AS ussgl487200_downward_adj_prior_year_prepaid_undeliv_order_oblig, - financial_accounts_by_awards.ussgl497200_down_adj_pri_paid_deliv_orders_oblig_refund_cpe AS ussgl497200_downward_adj_of_prior_year_paid_deliv_orders_oblig, - award_search.generated_unique_award_id AS award_unique_key, - financial_accounts_by_awards.piid AS award_id_piid, - financial_accounts_by_awards.parent_award_id AS parent_award_id_piid, - financial_accounts_by_awards.fain AS award_id_fain, - financial_accounts_by_awards.uri AS award_id_uri, - award_search.date_signed AS award_base_action_date, - EXTRACT(YEAR from (award_search.date_signed) + INTERVAL '3 months') AS award_base_action_date_fiscal_year, - award_search.certified_date AS award_latest_action_date, - EXTRACT(YEAR from (award_search.certified_date) + INTERVAL '3 months') AS award_latest_action_date_fiscal_year, - award_search.period_of_performance_start_date AS period_of_performance_start_date, - award_search.period_of_performance_current_end_date AS period_of_performance_current_end_date, - transaction_search.ordering_period_end_date AS ordering_period_end_date, - COALESCE(transaction_search.contract_award_type, transaction_search.type) AS award_type_code, - COALESCE(transaction_search.contract_award_type_desc, transaction_search.type_description) AS award_type, - transaction_search.idv_type AS idv_type_code, - transaction_search.idv_type_description AS idv_type, - award_search.description AS prime_award_base_transaction_description, - transaction_search.awarding_agency_code AS awarding_agency_code, - transaction_search.awarding_toptier_agency_name_raw AS awarding_agency_name, - transaction_search.awarding_sub_tier_agency_c AS awarding_subagency_code, - transaction_search.awarding_subtier_agency_name_raw AS awarding_subagency_name, - transaction_search.awarding_office_code AS awarding_office_code, - transaction_search.awarding_office_name AS awarding_office_name, - transaction_search.funding_agency_code AS funding_agency_code, - transaction_search.funding_toptier_agency_name_raw AS funding_agency_name, - transaction_search.funding_sub_tier_agency_co AS funding_sub_agency_code, - transaction_search.funding_subtier_agency_name_raw AS funding_sub_agency_name, - transaction_search.funding_office_code AS funding_office_code, - transaction_search.funding_office_name AS funding_office_name, - transaction_search.recipient_uei AS recipient_uei, - transaction_search.recipient_unique_id AS recipient_duns, - transaction_search.recipient_name AS recipient_name, - transaction_search.recipient_name_raw AS recipient_name_raw, - transaction_search.parent_uei AS recipient_parent_uei, - transaction_search.parent_uei AS recipient_parent_duns, - transaction_search.parent_recipient_name AS 
recipient_parent_name, - transaction_search.parent_recipient_name_raw AS recipient_parent_name_raw, - transaction_search.recipient_location_country_code AS recipient_country, - transaction_search.recipient_location_state_code AS recipient_state, - transaction_search.recipient_location_county_name AS recipient_county, - transaction_search.recipient_location_city_name AS recipient_city, - CASE - WHEN ( - transaction_search.recipient_location_state_code IS NOT NULL - AND transaction_search.recipient_location_congressional_code IS NOT NULL - AND NOT ( - transaction_search.recipient_location_state_code = '' - AND transaction_search.recipient_location_state_code IS NOT NULL - )) - THEN - CONCAT( - transaction_search.recipient_location_state_code, '-', - transaction_search.recipient_location_congressional_code - ) - ELSE transaction_search.recipient_location_congressional_code - END AS prime_award_summary_recipient_cd_original, - CASE - WHEN ( - transaction_search.recipient_location_state_code IS NOT NULL - AND transaction_search.recipient_location_congressional_code_current IS NOT NULL - AND NOT ( - transaction_search.recipient_location_state_code = '' - AND transaction_search.recipient_location_state_code IS NOT NULL - )) - THEN - CONCAT( - transaction_search.recipient_location_state_code, - '-', - transaction_search.recipient_location_congressional_code_current - ) - ELSE transaction_search.recipient_location_congressional_code_current - END AS prime_award_summary_recipient_cd_current, - COALESCE( - transaction_search.legal_entity_zip4, - CONCAT( - CAST(transaction_search.recipient_location_zip5 AS STRING), - CAST(transaction_search.legal_entity_zip_last4 AS STRING) - ) - ) AS recipient_zip_code, - transaction_search.pop_country_name AS primary_place_of_performance_country, - transaction_search.pop_state_name AS primary_place_of_performance_state, - transaction_search.pop_county_name AS primary_place_of_performance_county, - CASE - WHEN - transaction_search.pop_state_code IS NOT NULL - AND transaction_search.pop_congressional_code IS NOT NULL - AND NOT ( - transaction_search.pop_state_code = '' - AND transaction_search.pop_state_code IS NOT NULL - ) - THEN - CONCAT( - transaction_search.pop_state_code, - '-', - transaction_search.pop_congressional_code - ) - ELSE transaction_search.pop_congressional_code - END AS prime_award_summary_place_of_performance_cd_original, - CASE - WHEN - transaction_search.pop_state_code IS NOT NULL - AND transaction_search.pop_congressional_code_current IS NOT NULL - AND NOT ( - transaction_search.pop_state_code = '' - AND transaction_search.pop_state_code IS NOT NULL - ) - THEN - CONCAT( - transaction_search.pop_state_code, - '-', - transaction_search.pop_congressional_code_current - ) - ELSE transaction_search.pop_congressional_code_current - END AS prime_award_summary_place_of_performance_cd_current, - transaction_search.place_of_performance_zip4a AS primary_place_of_performance_zip_code, - transaction_search.cfda_number AS cfda_number, - transaction_search.cfda_title AS cfda_title, - transaction_search.product_or_service_code AS product_or_service_code, - transaction_search.product_or_service_description AS product_or_service_code_description, - transaction_search.naics_code AS naics_code, - transaction_search.naics_description AS naics_description, - transaction_search.national_interest_action AS national_interest_action_code, - transaction_search.national_interest_desc AS national_interest_action, - CASE - WHEN award_search.generated_unique_award_id IS NOT 
NULL - THEN CONCAT('localhost:3000/award/', URL_ENCODE(award_search.generated_unique_award_id), '/') - ELSE '/' END AS usaspending_permalink, - CAST(submission_attributes.published_date AS DATE) AS last_modified_date, - submission_attributes.reporting_fiscal_period, - submission_attributes.reporting_fiscal_quarter, - submission_attributes.reporting_fiscal_year, - submission_attributes.quarter_format_flag - FROM - raw.financial_accounts_by_awards - INNER JOIN global_temp.submission_attributes - ON (financial_accounts_by_awards.submission_id = submission_attributes.submission_id) - LEFT OUTER JOIN global_temp.treasury_appropriation_account - ON (financial_accounts_by_awards.treasury_account_id = treasury_appropriation_account.treasury_account_identifier) - LEFT OUTER JOIN global_temp.cgac AS CGAC_AID - ON (treasury_appropriation_account.agency_id = CGAC_AID.cgac_code) - LEFT OUTER JOIN global_temp.cgac AS CGAC_ATA - ON (treasury_appropriation_account.allocation_transfer_agency_id = CGAC_ATA.cgac_code) - INNER JOIN award_search - ON (financial_accounts_by_awards.award_id = award_search.award_id) - INNER JOIN transaction_search - ON (award_search.latest_transaction_search_id = transaction_search.transaction_id) - LEFT OUTER JOIN global_temp.toptier_agency - ON (treasury_appropriation_account.funding_toptier_agency_id = toptier_agency.toptier_agency_id) - LEFT OUTER JOIN global_temp.federal_account - ON (treasury_appropriation_account.federal_account_id = federal_account.id) - LEFT OUTER JOIN global_temp.ref_program_activity - ON (financial_accounts_by_awards.program_activity_id = ref_program_activity.id) - LEFT OUTER JOIN global_temp.object_class - ON (financial_accounts_by_awards.object_class_id = object_class.id) - LEFT OUTER JOIN global_temp.disaster_emergency_fund_code - ON (financial_accounts_by_awards.disaster_emergency_fund_code = disaster_emergency_fund_code.code) -""" From febccc592dedd9636037bb0650b57ff02f3de0ac Mon Sep 17 00:00:00 2001 From: Zach Flanders Date: Tue, 1 Jul 2025 15:49:03 -0500 Subject: [PATCH 38/43] [DEV-12772] - Removing treasury account download load sql --- usaspending_api/common/helpers/sql_helpers.py | 6 - .../filestreaming/download_generation.py | 111 +++++----- .../commands/generate_postgres_download.py | 193 ------------------ 3 files changed, 55 insertions(+), 255 deletions(-) delete mode 100644 usaspending_api/download/management/commands/generate_postgres_download.py diff --git a/usaspending_api/common/helpers/sql_helpers.py b/usaspending_api/common/helpers/sql_helpers.py index 429fad6685..eb53872cac 100644 --- a/usaspending_api/common/helpers/sql_helpers.py +++ b/usaspending_api/common/helpers/sql_helpers.py @@ -41,12 +41,6 @@ def read_sql_file_to_text(file_path: Path) -> str: return p.sub(" ", str(file_path.read_text().replace("\n", " "))) -def strip_sql_whitespace(query: str) -> str: - """Open file and return text with most whitespace removed""" - p = re.compile(r"\s\s+") - return p.sub(" ", str(query.replace("\n", " "))) - - def read_sql_file(file_path): # Read in SQL file and extract commands into a list _, file_extension = os.path.splitext(file_path) diff --git a/usaspending_api/download/filestreaming/download_generation.py b/usaspending_api/download/filestreaming/download_generation.py index ee49fa9c47..8badb283d7 100644 --- a/usaspending_api/download/filestreaming/download_generation.py +++ b/usaspending_api/download/filestreaming/download_generation.py @@ -236,8 +236,7 @@ def generate_download(download_job: DownloadJob, origination: Optional[str] = 
No start_uploading = time.perf_counter() multipart_upload(bucket, region, zip_file_path, os.path.basename(zip_file_path)) write_to_log( - message=f"Uploading took {time.perf_counter() - start_uploading:.2f}s", - download_job=download_job, + message=f"Uploading took {time.perf_counter() - start_uploading:.2f}s", download_job=download_job ) except Exception as e: exc_msg = "An exception was raised while attempting to upload the file" @@ -848,63 +847,63 @@ def execute_psql(temp_sql_file_path, source_path, download_job): kind=SpanKind.INTERNAL, service="bulk-download", ) - if download_job: - with subprocess_trace as span: - span.set_attributes( - { - "service": "bulk-download", - "resource": str(download_sql), - "span_type": "Internal", - "source_path": str(source_path), - # download job details - "download_job_id": str(download_job.download_job_id), - "download_job_status": str(download_job.job_status.name), - "download_file_name": str(download_job.file_name), - "download_file_size": download_job.file_size if download_job.file_size is not None else 0, - "number_of_rows": download_job.number_of_rows if download_job.number_of_rows is not None else 0, - "number_of_columns": ( - download_job.number_of_columns if download_job.number_of_columns is not None else 0 - ), - "error_message": download_job.error_message if download_job.error_message else "", - "monthly_download": str(download_job.monthly_download), - "json_request": str(download_job.json_request) if download_job.json_request else "", - } - ) - - try: - log_time = time.perf_counter() - temp_env = os.environ.copy() - if download_job and not download_job.monthly_download: - # Since terminating the process isn't guaranteed to end the DB statement, add timeout to client connection - temp_env["PGOPTIONS"] = ( - f"--statement-timeout={settings.DOWNLOAD_DB_TIMEOUT_IN_HOURS}h " - f"--work-mem={settings.DOWNLOAD_DB_WORK_MEM_IN_MB}MB" - ) - cat_command = subprocess.Popen(["cat", temp_sql_file_path], stdout=subprocess.PIPE) - subprocess.check_output( - ["psql", "-q", "-o", source_path, retrieve_db_string(), "-v", "ON_ERROR_STOP=1"], - stdin=cat_command.stdout, - stderr=subprocess.STDOUT, - env=temp_env, + with subprocess_trace as span: + span.set_attributes( + { + "service": "bulk-download", + "resource": str(download_sql), + "span_type": "Internal", + "source_path": str(source_path), + # download job details + "download_job_id": str(download_job.download_job_id), + "download_job_status": str(download_job.job_status.name), + "download_file_name": str(download_job.file_name), + "download_file_size": download_job.file_size if download_job.file_size is not None else 0, + "number_of_rows": download_job.number_of_rows if download_job.number_of_rows is not None else 0, + "number_of_columns": ( + download_job.number_of_columns if download_job.number_of_columns is not None else 0 + ), + "error_message": download_job.error_message if download_job.error_message else "", + "monthly_download": str(download_job.monthly_download), + "json_request": str(download_job.json_request) if download_job.json_request else "", + } ) - duration = time.perf_counter() - log_time - write_to_log( - message=f"Wrote {os.path.basename(source_path)}, took {duration:.4f} seconds", - download_job=download_job, - ) - except subprocess.CalledProcessError as e: - write_to_log(message=f"PSQL Error: {e.output.decode()}", is_error=True, download_job=download_job) - raise e - except Exception as e: - if not settings.IS_LOCAL: - # Not logging the command as it can contain the database 
connection string
-            e.cmd = "[redacted psql command]"
-        write_to_log(message=e, is_error=True, download_job=download_job)
-        sql = subprocess.check_output(["cat", temp_sql_file_path]).decode()
-        write_to_log(message=f"Faulty SQL: {sql}", is_error=True, download_job=download_job)
-        raise e
+    try:
+        log_time = time.perf_counter()
+        temp_env = os.environ.copy()
+        if download_job and not download_job.monthly_download:
+            # Since terminating the process isn't guaranteed to end the DB statement, add timeout to client connection
+            temp_env["PGOPTIONS"] = (
+                f"--statement-timeout={settings.DOWNLOAD_DB_TIMEOUT_IN_HOURS}h "
+                f"--work-mem={settings.DOWNLOAD_DB_WORK_MEM_IN_MB}MB"
+            )
+
+        cat_command = subprocess.Popen(["cat", temp_sql_file_path], stdout=subprocess.PIPE)
+        subprocess.check_output(
+            ["psql", "-q", "-o", source_path, retrieve_db_string(), "-v", "ON_ERROR_STOP=1"],
+            stdin=cat_command.stdout,
+            stderr=subprocess.STDOUT,
+            env=temp_env,
+        )
+
+        duration = time.perf_counter() - log_time
+        write_to_log(
+            message=f"Wrote {os.path.basename(source_path)}, took {duration:.4f} seconds",
+            download_job=download_job,
+        )
+    except subprocess.CalledProcessError as e:
+        write_to_log(message=f"PSQL Error: {e.output.decode()}", is_error=True, download_job=download_job)
+        raise e
+    except Exception as e:
+        if not settings.IS_LOCAL:
+            # Not logging the command as it can contain the database connection string
+            e.cmd = "[redacted psql command]"
+        write_to_log(message=e, is_error=True, download_job=download_job)
+        sql = subprocess.check_output(["cat", temp_sql_file_path]).decode()
+        write_to_log(message=f"Faulty SQL: {sql}", is_error=True, download_job=download_job)
+        raise e


 def retrieve_db_string():
diff --git a/usaspending_api/download/management/commands/generate_postgres_download.py b/usaspending_api/download/management/commands/generate_postgres_download.py
deleted file mode 100644
index 9aad4c476e..0000000000
--- a/usaspending_api/download/management/commands/generate_postgres_download.py
+++ /dev/null
@@ -1,193 +0,0 @@
-import json
-import logging
-import os
-import traceback
-from logging import Logger
-from pathlib import Path
-from typing import Optional, Dict, Tuple, Type
-
-from django.conf import settings
-from django.core.management.base import BaseCommand
-from django.utils.functional import cached_property
-
-from usaspending_api.common.exceptions import InvalidParameterException
-from usaspending_api.common.helpers.dict_helpers import order_nested_object
-from usaspending_api.common.helpers.download_csv_strategies import PostgresToCSVStrategy
-from usaspending_api.common.helpers.s3_helpers import upload_download_file_to_s3
-from usaspending_api.download.filestreaming.download_generation import build_data_file_name
-from usaspending_api.download.filestreaming.download_source import DownloadSource
-from usaspending_api.download.management.commands.delta_downloads.award_financial import federal_account
-from usaspending_api.download.download_utils import create_unique_filename
-from usaspending_api.download.lookups import JOB_STATUS_DICT, FILE_FORMATS, VALUE_MAPPINGS
-from usaspending_api.download.models import DownloadJob
-from usaspending_api.download.v2.request_validations import AccountDownloadValidator, DownloadValidatorBase
-
-DOWNLOAD_SPEC = {
-    "award_financial": {
-        "federal_account": {
-            "query": federal_account.POSTGRES_DOWNLOAD_QUERY,
-            "validator_type": AccountDownloadValidator,
-        }
-    }
-}
-
-
-class Command(BaseCommand):
-
-    help = "Generate a download zip file based on the provided type and level."
-
-    download_job: DownloadJob
-    download_level: str
-    download_query: str
-    download_source: DownloadSource
-    download_spec: Dict
-    download_type: str
-    download_validator_type: Type[DownloadValidatorBase]
-    file_format_spec: Dict
-    file_prefix: str
-    logger: Logger
-    should_cleanup: bool
-    working_dir_path: Path
-
-    def add_arguments(self, parser):
-        parser.add_argument("--download-type", type=str, required=True, choices=list(DOWNLOAD_SPEC))
-        parser.add_argument(
-            "--download-level",
-            type=str,
-            required=True,
-            choices=set(
-                download_level
-                for download_level_list in [DOWNLOAD_SPEC[key] for key in DOWNLOAD_SPEC]
-                for download_level in download_level_list
-            ),
-        )
-        parser.add_argument("--file-format", type=str, required=False, choices=list(FILE_FORMATS), default="csv")
-        parser.add_argument("--file-prefix", type=str, required=False, default="")
-
-    def handle(self, *args, **options):
-        self.logger = logging.getLogger(__name__)
-
-        # Resolve Parameters
-        self.download_type = options["download_type"]
-        self.download_level = options["download_level"]
-        self.file_prefix = options["file_prefix"]
-
-        if self.download_level not in DOWNLOAD_SPEC[self.download_type].keys():
-            raise ValueError(
-                f'Provided download level of "{self.download_level}" is not supported '
-                f'for download type of "{self.download_type}".'
-            )
-
-        download_spec = DOWNLOAD_SPEC[self.download_type][self.download_level]
-        self.file_format_spec = FILE_FORMATS[options["file_format"]]
-        self.download_query = download_spec["query"]
-        self.download_validator_type = download_spec["validator_type"]
-        self.working_dir_path = Path(settings.CSV_LOCAL_PATH)
-        if not self.working_dir_path.exists():
-            self.working_dir_path.mkdir()
-        self.download_job, self.download_source = self.create_download_job()
-        self.process_download()
-
-    @cached_property
-    def json_request(self) -> Dict:
-        request_data = {
-            "account_level": "federal_account",
-            "download_types": ["award_financial"],
-            "file_format": "csv",
-            "filters": {
-                "agency": "all",
-                "budget_function": "all",
-                "budget_subfunction": "all",
-                "federal_account": "all",
-                "fy": 2021,
-                "period": 12,
-                "submission_types": ["award_financial"],
-            },
-            "request_type": "account",
-        }
-        validator = self.download_validator_type(request_data)
-        processed_request = order_nested_object(validator.json_request)
-
-        return processed_request
-
-    @cached_property
-    def json_request_string(self) -> str:
-        return json.dumps(self.json_request)
-
-    @cached_property
-    def download_name(self) -> str:
-        return self.download_job.file_name.replace(".zip", "")
-
-    def create_download_job(self) -> Tuple[DownloadJob, DownloadSource]:
-        final_output_zip_name = f"{self.file_prefix}{create_unique_filename(self.json_request)}"
-        download_job_ready_status = JOB_STATUS_DICT["ready"]
-
-        # Create a download_job object for use by the application
-        download_job = DownloadJob.objects.create(
-            job_status_id=download_job_ready_status,
-            file_name=final_output_zip_name,
-            json_request=self.json_request_string,
-        )
-
-        # TODO: This should be updated to be more dynamic to the download type
-        download_source = DownloadSource(
-            VALUE_MAPPINGS[self.download_type]["table_name"],
-            self.download_level,
-            self.download_type,
-            self.json_request.get("agency", "all"),
-            extra_file_type="",
-        )
-        download_source.file_name = build_data_file_name(download_source, download_job, piid=None, assistance_id=None)
-
-        return download_job, download_source
-
-    def process_download(self):
-        self.start_download()
-        files_to_cleanup = []
-        try:
-            to_csv_strategy = PostgresToCSVStrategy(self.logger)
-
-            zip_file_path = self.working_dir_path / f"{self.download_name}.zip"
-
-            csv_metadata = to_csv_strategy.download_to_csv(
-                self.download_query,
-                self.working_dir_path / self.download_name,
-                self.download_name,
-                self.working_dir_path,
-                zip_file_path,
-            )
-            files_to_cleanup.extend(csv_metadata.filepaths)
-
-            self.download_job.file_size = os.stat(zip_file_path).st_size
-            self.download_job.number_of_rows = csv_metadata.number_of_rows
-            self.download_job.number_of_columns = csv_metadata.number_of_columns
-            upload_download_file_to_s3(zip_file_path)
-        except InvalidParameterException as e:
-            exc_msg = "InvalidParameterException was raised while attempting to process the DownloadJob"
-            self.fail_download(exc_msg, e)
-            raise
-        except Exception as e:
-            exc_msg = "An exception was raised while attempting to process the DownloadJob"
-            self.fail_download(exc_msg, e)
-            raise
-        self.finish_download()
-
-    def start_download(self) -> None:
-        self.download_job.job_status_id = JOB_STATUS_DICT["running"]
-        self.download_job.save()
-        self.logger.info(f"Starting DownloadJob {self.download_job.download_job_id}")
-
-    def fail_download(self, msg: str, e: Optional[Exception] = None) -> None:
-        if e:
-            stack_trace = "".join(traceback.format_exception(type(e), value=e, tb=e.__traceback__))
-            self.download_job.error_message = f"{msg}:\n{stack_trace}"
-        else:
-            self.download_job.error_message = msg
-        self.logger.error(msg)
-        self.download_job.job_status_id = JOB_STATUS_DICT["failed"]
-        self.download_job.save()
-
-    def finish_download(self) -> None:
-        self.download_job.job_status_id = JOB_STATUS_DICT["finished"]
-        self.download_job.save()
-        self.logger.info(f"Finished processing DownloadJob {self.download_job.download_job_id}")

From 7989bc5edc0fefbeb7298b2fadd062d3f648e06c Mon Sep 17 00:00:00 2001
From: Zach Flanders
Date: Wed, 2 Jul 2025 13:13:39 -0500
Subject: [PATCH 39/43] [DEV-12772] - Update Flake8 rules to bring in alignment with Black

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index 9b4258c628..8f8a292587 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [flake8]
 select=C,E,F,W,B,B950
-ignore=E501,W503,E203,F541
+ignore=E501,W503,E203,F541,E701
 exclude=.venv,build,usaspending_api.egg-info,usaspending_api/*/migrations/*
 max-line-length=120
\ No newline at end of file

From 59a632110e32eb706d76fff3f5494b37aef4a157 Mon Sep 17 00:00:00 2001
From: Zach Flanders
Date: Wed, 2 Jul 2025 13:25:13 -0500
Subject: [PATCH 40/43] [DEV-12772] - Update Flake8 rules to bring in alignment with Black

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index 8f8a292587..3fc11249c1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [flake8]
 select=C,E,F,W,B,B950
-ignore=E501,W503,E203,F541,E701
+ignore=E501,W503,E203,F541,E704
 exclude=.venv,build,usaspending_api.egg-info,usaspending_api/*/migrations/*
 max-line-length=120
\ No newline at end of file
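
For context on the two setup.cfg changes above (this note and snippet are illustrative, not part of the patches): E701 flags a compound statement after a colon and stays enforced, while E704 flags a statement on the same line as a def and is now ignored, since Black keeps one-line "..." stubs like the abstract properties touched later in this series.

    # E701 - still reported under the final config
    if job_is_ready: start_download()

    # E704 - now ignored, matching Black's one-line stub formatting
    def source_df(self) -> DataFrame: ...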

From e8e4554f3a54d9fd189e9454de275c884d3229fc Mon Sep 17 00:00:00 2001
From: Zach Flanders
Date: Mon, 14 Jul 2025 13:01:20 -0500
Subject: [PATCH 41/43] [DEV-12772] - Update filter to add submission types and command and builder to handle multiple submission types

---
 .../award_financial/__init__.py                |   0
 .../{award_financial => }/builders.py          |  15 +-
 .../{award_financial => }/filters.py           |   3 +-
 .../commands/generate_spark_download.py        | 138 ++++++------------
 ...test_account_download_dataframe_builder.py |  19 ++-
 .../test_account_download_filter.py            |  21 ++-
 6 files changed, 83 insertions(+), 113 deletions(-)
 delete mode 100644 usaspending_api/download/management/commands/delta_downloads/award_financial/__init__.py
 rename usaspending_api/download/management/commands/delta_downloads/{award_financial => }/builders.py (92%)
 rename usaspending_api/download/management/commands/delta_downloads/{award_financial => }/filters.py (95%)

diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/__init__.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/builders.py b/usaspending_api/download/management/commands/delta_downloads/builders.py
similarity index 92%
rename from usaspending_api/download/management/commands/delta_downloads/award_financial/builders.py
rename to usaspending_api/download/management/commands/delta_downloads/builders.py
index 32e2bab218..c2c084a135 100644
--- a/usaspending_api/download/management/commands/delta_downloads/award_financial/builders.py
+++ b/usaspending_api/download/management/commands/delta_downloads/builders.py
@@ -6,7 +6,7 @@
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql import functions as sf, Column
 
-from usaspending_api.download.management.commands.delta_downloads.award_financial.filters import AccountDownloadFilter
+from usaspending_api.download.management.commands.delta_downloads.filters import AccountDownloadFilter
 from usaspending_api.download.v2.download_column_historical_lookups import query_paths
 from usaspending_api.submissions.helpers import get_submission_ids_for_periods
 
@@ -20,6 +20,7 @@ def __init__(
         self,
         table_name: str = "rpt.account_download",
     ):
         self.reporting_fiscal_year = account_download_filter.fy
+        self.submission_types = account_download_filter.submission_types
         self.reporting_fiscal_quarter = account_download_filter.quarter or account_download_filter.period // 3
         self.reporting_fiscal_period = account_download_filter.period or account_download_filter.quarter * 3
         self.agency = account_download_filter.agency
@@ -95,7 +96,11 @@ def collect_concat(col_name: str, concat_str: str = "; ") -> Column:
 
     @property
     @abstractmethod
-    def source_df(self) -> DataFrame: ...
+    def award_financial(self) -> DataFrame: ...
+
+    @property
+    def source_dfs(self) -> list[DataFrame]:
+        return [getattr(self, submission_type) for submission_type in self.submission_types]
 
 
 class FederalAccountDownloadDataFrameBuilder(AbstractAccountDownloadDataFrameBuilder):
@@ -158,11 +163,13 @@ def filter_and_sum(self, col_name: str) -> Column:
         return sf.sum(self.filter_to_latest_submissions_for_agencies(col_name)).alias(col_name)
 
     @property
-    def source_df(self) -> DataFrame:
+    def award_financial(self) -> DataFrame:
         return (
             self.df.filter(self.dynamic_filters)
             .groupBy(self.groupby_cols)
            .agg(*[agg_func(col) for col, agg_func in self.agg_cols.items()])
+            # drop original agg columns from the dataframe to avoid ambiguous column names
+            .drop(*[sf.col(f"account_download.{col}") for col in self.agg_cols])
             .filter(self.non_zero_filters)
             .select(self.select_cols)
         )
@@ -171,7 +178,7 @@ def source_df(self) -> DataFrame:
 
 class TreasuryAccountDownloadDataFrameBuilder(AbstractAccountDownloadDataFrameBuilder):
     @property
-    def source_df(self) -> DataFrame:
+    def award_financial(self) -> DataFrame:
         select_cols = (
             [sf.col("treasury_owning_agency_name").alias("owning_agency_name")]
             + [
diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/filters.py b/usaspending_api/download/management/commands/delta_downloads/filters.py
similarity index 95%
rename from usaspending_api/download/management/commands/delta_downloads/award_financial/filters.py
rename to usaspending_api/download/management/commands/delta_downloads/filters.py
index dbe21264e8..79f60af5cf 100644
--- a/usaspending_api/download/management/commands/delta_downloads/award_financial/filters.py
+++ b/usaspending_api/download/management/commands/delta_downloads/filters.py
@@ -1,5 +1,5 @@
 import warnings
-from typing import Any
+from typing import Any, Literal
 
 from pydantic import BaseModel, root_validator, validator
 from pydantic.fields import ModelField
@@ -11,6 +11,7 @@
 
 class AccountDownloadFilter(BaseModel):
     fy: int
+    submission_types: list[Literal["account_balances", "object_class_program_activity", "award_financial"]]
     period: int | None = None
     quarter: int | None = None
     agency: int | None = None
diff --git a/usaspending_api/download/management/commands/generate_spark_download.py b/usaspending_api/download/management/commands/generate_spark_download.py
index 469a86c3a6..ebac2990d6 100644
--- a/usaspending_api/download/management/commands/generate_spark_download.py
+++ b/usaspending_api/download/management/commands/generate_spark_download.py
@@ -3,7 +3,7 @@
 import os
 import traceback
 from pathlib import Path
-from typing import Optional, Dict, Tuple, Type, List, Union
+from typing import Optional, Union
 
 from django.conf import settings
 from django.core.management.base import BaseCommand
@@ -17,34 +17,21 @@
 from usaspending_api.common.helpers.spark_helpers import (
     configure_spark_session,
     get_active_spark_session,
-    get_jdbc_connection_properties,
-    get_usas_jdbc_url,
 )
 from usaspending_api.common.spark.configs import DEFAULT_EXTRA_CONF
-from usaspending_api.download.filestreaming.download_generation import build_data_file_name
-from usaspending_api.download.filestreaming.download_source import DownloadSource
-from usaspending_api.download.lookups import JOB_STATUS_DICT, FILE_FORMATS, VALUE_MAPPINGS
-from usaspending_api.download.management.commands.delta_downloads.award_financial.builders import (
+from usaspending_api.download.lookups import JOB_STATUS_DICT, FILE_FORMATS
+from usaspending_api.download.management.commands.delta_downloads.builders import (
     FederalAccountDownloadDataFrameBuilder,
     TreasuryAccountDownloadDataFrameBuilder,
 )
-from usaspending_api.download.management.commands.delta_downloads.award_financial.filters import AccountDownloadFilter
+from usaspending_api.download.management.commands.delta_downloads.filters import AccountDownloadFilter
 from usaspending_api.download.models import DownloadJob
-from usaspending_api.download.v2.request_validations import AccountDownloadValidator, DownloadValidatorBase
 
 logger = logging.getLogger(__name__)
 
-DOWNLOAD_SPEC = {
-    "award_financial": {
-        "federal_account": {
-            "df_builder": FederalAccountDownloadDataFrameBuilder,
-            "validator_type": AccountDownloadValidator,
-        },
-        "treasury_account": {
-            "df_builder": TreasuryAccountDownloadDataFrameBuilder,
-            "validator_type": AccountDownloadValidator,
-        },
-    }
+dataframe_builders = {
+    "federal_account": FederalAccountDownloadDataFrameBuilder,
+    "treasury_account": TreasuryAccountDownloadDataFrameBuilder,
 }
@@ -52,96 +39,52 @@ class Command(BaseCommand):
 
     help = "Generate a download zip file based on the provided type and level."
 
-    download_job_id: int
     download_job: DownloadJob
-    download_level: str
-    download_query: str
-    download_source: DownloadSource
-    download_spec: Dict
-    download_type: str
-    download_validator_type: Type[DownloadValidatorBase]
-    file_format_spec: Dict
     file_prefix: str
-    jdbc_properties: Dict
+    jdbc_properties: dict
     jdbc_url: str
     should_cleanup: bool
     spark: SparkSession
     working_dir_path: Path
 
     def add_arguments(self, parser):
-        parser.add_argument("--download-type", type=str, required=True, choices=list(DOWNLOAD_SPEC))
-        parser.add_argument(
-            "--download-level",
-            type=str,
-            required=True,
-            choices=set(
-                download_level
-                for download_level_list in [DOWNLOAD_SPEC[key] for key in DOWNLOAD_SPEC]
-                for download_level in download_level_list
-            ),
-        )
         parser.add_argument("--download-job-id", type=int, required=True)
         parser.add_argument("--file-format", type=str, required=False, choices=list(FILE_FORMATS), default="csv")
         parser.add_argument("--file-prefix", type=str, required=False, default="")
        parser.add_argument("--skip-local-cleanup", action="store_true")
 
     def handle(self, *args, **options):
-        self.spark = get_active_spark_session()
-        spark_created_by_command = False
-        if not self.spark:
-            spark_created_by_command = True
-            self.spark = configure_spark_session(**DEFAULT_EXTRA_CONF, spark_context=self.spark)
-
-        # Resolve Parameters
-        self.download_type = options["download_type"]
-        self.download_level = options["download_level"]
-        self.download_job_id = options["download_job_id"]
+        self.spark, spark_created_by_command = self.setup_spark_session()
         self.file_prefix = options["file_prefix"]
         self.should_cleanup = not options["skip_local_cleanup"]
-
-        if self.download_level not in DOWNLOAD_SPEC[self.download_type].keys():
-            raise ValueError(
-                f'Provided download level of "{self.download_level}" is not supported '
-                f'for download type of "{self.download_type}".'
-            )
-
-        download_spec = DOWNLOAD_SPEC[self.download_type][self.download_level]
-        self.file_format_spec = FILE_FORMATS[options["file_format"]]
-        self.df_builder = download_spec["df_builder"]
-        self.download_validator_type = download_spec["validator_type"]
-        self.jdbc_properties = get_jdbc_connection_properties()
-        self.jdbc_url = get_usas_jdbc_url()
-
+        self.download_job = self.get_download_job(options["download_job_id"])
         self.working_dir_path = Path(settings.CSV_LOCAL_PATH)
         if not self.working_dir_path.exists():
             self.working_dir_path.mkdir()
-        create_ref_temp_views(self.spark)
-
-        self.download_job, self.download_source = self.get_download_job()
         self.process_download()
-
         if spark_created_by_command:
             self.spark.stop()
 
+    @staticmethod
+    def setup_spark_session() -> tuple[SparkSession, bool]:
+        spark = get_active_spark_session()
+        spark_created_by_command = False
+        if not spark:
+            spark_created_by_command = True
+            spark = configure_spark_session(**DEFAULT_EXTRA_CONF, spark_context=spark)
+        return spark, spark_created_by_command
+
     @cached_property
     def download_name(self) -> str:
         return self.download_job.file_name.replace(".zip", "")
 
-    def get_download_job(self) -> Tuple[DownloadJob, DownloadSource]:
-        download_job = DownloadJob.objects.get(download_job_id=self.download_job_id)
+    @staticmethod
+    def get_download_job(download_job_id) -> DownloadJob:
+        download_job = DownloadJob.objects.get(download_job_id=download_job_id)
         if download_job.job_status_id != JOB_STATUS_DICT["ready"]:
-            raise InvalidParameterException(f"Download Job {self.download_job_id} is not ready.")
-        json_request = json.loads(download_job.json_request)
-        download_source = DownloadSource(
-            VALUE_MAPPINGS[self.download_type]["table_name"],
-            self.download_level,
-            self.download_type,
-            json_request.get("agency", "all"),
-        )
-        download_source.file_name = build_data_file_name(download_source, download_job, piid=None, assistance_id=None)
-
-        return download_job, download_source
+            raise InvalidParameterException(f"Download Job {download_job_id} is not ready.")
+        return download_job
 
     def process_download(self):
         self.start_download()
@@ -150,20 +93,27 @@ def process_download(self):
             spark_to_csv_strategy = SparkToCSVStrategy(logger)
             zip_file_path = self.working_dir_path / f"{self.download_name}.zip"
             download_request = json.loads(self.download_job.json_request)
+            df_builder = dataframe_builders[download_request["account_level"]]
             account_download_filter = AccountDownloadFilter(**download_request["filters"])
-            source_df = self.df_builder(spark=self.spark, account_download_filter=account_download_filter).source_df
-            csv_metadata = spark_to_csv_strategy.download_to_csv(
-                source_sql=None,
-                destination_path=self.working_dir_path / self.download_name,
-                destination_file_name=self.download_name,
-                working_dir_path=self.working_dir_path,
-                download_zip_path=zip_file_path,
-                source_df=source_df,
-            )
-            files_to_cleanup.extend(csv_metadata.filepaths)
+            source_dfs = df_builder(spark=self.spark, account_download_filter=account_download_filter).source_dfs
+            csvs_metadata = [
+                spark_to_csv_strategy.download_to_csv(
+                    source_sql=None,
+                    destination_path=self.working_dir_path / self.download_name,
+                    destination_file_name=self.download_name,
+                    working_dir_path=self.working_dir_path,
+                    download_zip_path=zip_file_path,
+                    source_df=source_df,
+                )
+                for source_df in source_dfs
+            ]
+            for csv_metadata in csvs_metadata:
+                files_to_cleanup.extend(csv_metadata.filepaths)
             self.download_job.file_size = os.stat(zip_file_path).st_size
-            self.download_job.number_of_rows = csv_metadata.number_of_rows
-            self.download_job.number_of_columns = csv_metadata.number_of_columns
+            self.download_job.number_of_rows = sum([csv_metadata.number_of_rows for csv_metadata in csvs_metadata])
+            self.download_job.number_of_columns = sum(
+                [csv_metadata.number_of_columns for csv_metadata in csvs_metadata]
+            )
             upload_download_file_to_s3(zip_file_path)
         except InvalidParameterException as e:
             exc_msg = "InvalidParameterException was raised while attempting to process the DownloadJob"
@@ -198,7 +148,7 @@ def finish_download(self) -> None:
         self.download_job.save()
         logger.info(f"Finished processing DownloadJob {self.download_job.download_job_id}")
 
-    def cleanup(self, path_list: List[Union[Path, str]]) -> None:
+    def cleanup(self, path_list: list[Union[Path, str]]) -> None:
         for path in path_list:
             if isinstance(path, str):
                 path = Path(path)
diff --git a/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py b/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py
index 3804b7614b..946dfa1247 100644
--- a/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py
+++ b/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py
@@ -4,11 +4,11 @@
 import pytest
 from django.core.management import call_command
 from model_bakery import baker
-from usaspending_api.download.management.commands.delta_downloads.award_financial.builders import (
+from usaspending_api.download.management.commands.delta_downloads.builders import (
     FederalAccountDownloadDataFrameBuilder,
     TreasuryAccountDownloadDataFrameBuilder,
 )
-from usaspending_api.download.management.commands.delta_downloads.award_financial.filters import AccountDownloadFilter
+from usaspending_api.download.management.commands.delta_downloads.filters import AccountDownloadFilter
 from usaspending_api.download.v2.download_column_historical_lookups import query_paths
 
 
@@ -96,10 +96,11 @@ def test_federal_account_download_dataframe_builder(mock_get_submission_ids_for_
     mock_get_submission_ids_for_periods.return_value = [1, 2, 4, 5]
     account_download_filter = AccountDownloadFilter(
         fy=2018,
+        submission_types=["award_financial"],
         quarter=4,
     )
     builder = FederalAccountDownloadDataFrameBuilder(spark, account_download_filter, "rpt.account_download")
-    result = builder.source_df
+    result = builder.source_dfs[0]
     result_df = result.toPandas()
     for col in ["reporting_agency_name", "budget_function", "budget_subfunction"]:
         assert sorted(result_df[col].to_list()) == ["A", "B; C; D"]
@@ -115,11 +116,12 @@ def test_filter_federal_by_agency(mock_get_submission_ids_for_periods, spark, ac
 
     account_download_filter = AccountDownloadFilter(
         fy=2018,
+        submission_types=["award_financial"],
         quarter=4,
         agency=2,
     )
     builder = FederalAccountDownloadDataFrameBuilder(spark, account_download_filter)
-    result = builder.source_df
+    result = builder.source_dfs[0]
     result_df = result.toPandas()
     for col in ["reporting_agency_name", "budget_function", "budget_subfunction"]:
         assert sorted(result_df[col].to_list()) == ["B; C; D"]
@@ -137,11 +139,12 @@ def test_filter_federal_by_federal_account_id(
 
     account_download_filter = AccountDownloadFilter(
         fy=2018,
+        submission_types=["award_financial"],
         quarter=4,
         federal_account=1,
     )
     builder = FederalAccountDownloadDataFrameBuilder(spark, account_download_filter)
-    result = builder.source_df
+    result = builder.source_dfs[0]
     result_df = result.toPandas()
     for col in ["reporting_agency_name", "budget_function", "budget_subfunction"]:
         assert sorted(result_df[col].to_list()) == ["A"]
@@ -152,10 +155,11 @@ def test_filter_federal_by_federal_account_id(
 def test_treasury_account_download_dataframe_builder(spark, account_download_table):
     account_download_filter = AccountDownloadFilter(
         fy=2018,
+        submission_types=["award_financial"],
         quarter=4,
     )
     builder = TreasuryAccountDownloadDataFrameBuilder(spark, account_download_filter)
-    result = builder.source_df
+    result = builder.source_dfs[0]
     result_df = result.toPandas()
     for col in ["reporting_agency_name", "budget_function", "budget_subfunction"]:
         assert sorted(result_df[col].to_list()) == ["A", "B", "C", "D"]
@@ -167,11 +171,12 @@ def test_filter_treasury_by_agency(spark, account_download_table, agency_models)
 
     account_download_filter = AccountDownloadFilter(
         fy=2018,
+        submission_types=["award_financial"],
         quarter=4,
         agency=2,
     )
     builder = TreasuryAccountDownloadDataFrameBuilder(spark, account_download_filter)
-    result = builder.source_df
+    result = builder.source_dfs[0]
     result_df = result.toPandas()
     for col in ["reporting_agency_name", "budget_function", "budget_subfunction"]:
         assert sorted(result_df[col].to_list()) == ["B", "C", "D"]
diff --git a/usaspending_api/download/tests/integration/test_account_download_filter.py b/usaspending_api/download/tests/integration/test_account_download_filter.py
index 73eafc9c58..6f992c7142 100644
--- a/usaspending_api/download/tests/integration/test_account_download_filter.py
+++ b/usaspending_api/download/tests/integration/test_account_download_filter.py
@@ -2,7 +2,7 @@
 from model_bakery import baker
 
 from usaspending_api.common.exceptions import InvalidParameterException
-from usaspending_api.download.management.commands.delta_downloads.award_financial.filters import AccountDownloadFilter
+from usaspending_api.download.management.commands.delta_downloads.filters import AccountDownloadFilter
 
 
 @pytest.fixture
@@ -20,7 +20,13 @@ def federal_account_models(db):
 
 
 def test_account_download_filter_cast_to_int(agency_models, federal_account_models):
-    test_data = {"fy": "2018", "quarter": "4", "agency": "2", "federal_account": "3"}
+    test_data = {
+        "fy": "2018",
+        "submission_types": ["award_financial"],
+        "quarter": "4",
+        "agency": "2",
+        "federal_account": "3",
+    }
     result = AccountDownloadFilter(**test_data)
     assert result.fy == 2018
     assert result.quarter == 4
@@ -31,6 +37,7 @@
 def test_account_download_handle_all(agency_models, federal_account_models):
     test_data = {
         "fy": "2018",
+        "submission_types": ["award_financial"],
         "quarter": "4",
         "agency": "all",
         "federal_account": "all",
@@ -47,7 +54,7 @@
 
 
 def test_account_download_both_period_quarter(agency_models, federal_account_models):
-    test_data = {"fy": "2018", "period": "12", "quarter": "4"}
+    test_data = {"fy": "2018", "submission_types": ["award_financial"], "period": "12", "quarter": "4"}
     with pytest.warns() as warnings:
         result = AccountDownloadFilter(**test_data)
     assert result.fy == 2018
@@ -58,13 +65,13 @@
 
 
 def test_account_download_none_period_quarter(agency_models, federal_account_models):
-    test_data = {"fy": "2018"}
+    test_data = {"fy": "2018", "submission_types": ["award_financial"]}
     with pytest.raises(InvalidParameterException, match="Must define period or quarter."):
         AccountDownloadFilter(**test_data)
 
 
 def test_account_download_no_agency(agency_models, federal_account_models):
-    test_data = {"fy": "2018", "period": 2, "agency": 3}
+    test_data = {"fy": "2018", "submission_types": ["award_financial"], "period": 2, "agency": 3}
     result = AccountDownloadFilter(**test_data)
     assert result.agency == 3
     test_data = {"fy": "2018", "period": 2, "agency": 4}
@@ -73,9 +80,9 @@ def test_account_download_no_agency(agency_models, federal_account_models):
 
 
 def test_account_download_no_federal_account(agency_models, federal_account_models):
-    test_data = {"fy": "2018", "period": 2, "federal_account": 3}
+    test_data = {"fy": "2018", "submission_types": ["award_financial"], "period": 2, "federal_account": 3}
     result = AccountDownloadFilter(**test_data)
     assert result.federal_account == 3
-    test_data = {"fy": "2018", "period": 2, "federal_account": 4}
+    test_data = {"fy": "2018", "submission_types": ["award_financial"], "period": 2, "federal_account": 4}
     with pytest.raises(InvalidParameterException, match="Federal Account with that ID does not exist"):
         result = AccountDownloadFilter(**test_data)

From 93dc6fd3f0712c257493681ba551ad4770411e04 Mon Sep 17 00:00:00 2001
From: Zach Flanders
Date: Mon, 14 Jul 2025 13:49:01 -0500
Subject: [PATCH 42/43] [DEV-12772] - Update patch path

---
 .../integration/test_account_download_dataframe_builder.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py b/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py
index 946dfa1247..310264f04a 100644
--- a/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py
+++ b/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py
@@ -90,7 +90,7 @@ def federal_account_models(db):
 
 @patch(
-    "usaspending_api.download.management.commands.delta_downloads.award_financial.builders.get_submission_ids_for_periods"
+    "usaspending_api.download.management.commands.delta_downloads.builders.get_submission_ids_for_periods"
 )
 def test_federal_account_download_dataframe_builder(mock_get_submission_ids_for_periods, spark, account_download_table):
     mock_get_submission_ids_for_periods.return_value = [1, 2, 4, 5]
     account_download_filter = AccountDownloadFilter(
@@ -109,7 +109,7 @@ def test_federal_account_download_dataframe_builder(mock_get_submission_ids_for_
 
 @patch(
-    "usaspending_api.download.management.commands.delta_downloads.award_financial.builders.get_submission_ids_for_periods"
+    "usaspending_api.download.management.commands.delta_downloads.builders.get_submission_ids_for_periods"
 )
 def test_filter_federal_by_agency(mock_get_submission_ids_for_periods, spark, account_download_table, agency_models):
     mock_get_submission_ids_for_periods.return_value = [1, 2, 4, 5]
 
@@ -130,7 +130,7 @@ def test_filter_federal_by_agency(mock_get_submission_ids_for_periods, spark, ac
 
 @patch(
-    "usaspending_api.download.management.commands.delta_downloads.award_financial.builders.get_submission_ids_for_periods"
+    "usaspending_api.download.management.commands.delta_downloads.builders.get_submission_ids_for_periods"
 )
 def test_filter_federal_by_federal_account_id(
     mock_get_submission_ids_for_periods, spark, account_download_table, federal_account_models

From 41312d332ef10542acf3375cf3ba0000a4979ec1 Mon Sep 17 00:00:00 2001
From: Zach Flanders
Date: Mon, 14 Jul 2025 13:54:41 -0500
Subject: [PATCH 43/43] [DEV-12772] - Update patch path formatting

---
 .../test_account_download_dataframe_builder.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py b/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py
index 310264f04a..a1b56181c0 100644
--- a/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py
+++ b/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py
@@ -89,9 +89,7 @@ def federal_account_models(db):
     baker.make("accounts.FederalAccount", pk=3, agency_identifier="345", main_account_code="0333")
 
 
-@patch(
-    "usaspending_api.download.management.commands.delta_downloads.builders.get_submission_ids_for_periods"
-)
+@patch("usaspending_api.download.management.commands.delta_downloads.builders.get_submission_ids_for_periods")
 def test_federal_account_download_dataframe_builder(mock_get_submission_ids_for_periods, spark, account_download_table):
     mock_get_submission_ids_for_periods.return_value = [1, 2, 4, 5]
     account_download_filter = AccountDownloadFilter(
@@ -108,9 +106,7 @@ def test_federal_account_download_dataframe_builder(mock_get_submission_ids_for_
     assert sorted(result_df.gross_outlay_amount_FYB_to_period_end.to_list()) == [100, 200]
 
 
-@patch(
-    "usaspending_api.download.management.commands.delta_downloads.builders.get_submission_ids_for_periods"
-)
+@patch("usaspending_api.download.management.commands.delta_downloads.builders.get_submission_ids_for_periods")
 def test_filter_federal_by_agency(mock_get_submission_ids_for_periods, spark, account_download_table, agency_models):
     mock_get_submission_ids_for_periods.return_value = [1, 2, 4, 5]
 
@@ -129,9 +125,7 @@ def test_filter_federal_by_agency(mock_get_submission_ids_for_periods, spark, ac
     assert result_df.gross_outlay_amount_FYB_to_period_end.to_list() == [200]
 
 
-@patch(
-    "usaspending_api.download.management.commands.delta_downloads.builders.get_submission_ids_for_periods"
-)
+@patch("usaspending_api.download.management.commands.delta_downloads.builders.get_submission_ids_for_periods")
 def test_filter_federal_by_federal_account_id(
     mock_get_submission_ids_for_periods, spark, account_download_table, federal_account_models
 ):
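
Taken together, the series leaves generate_spark_download driven entirely by an existing DownloadJob row. A hypothetical end-to-end invocation (the job values below are made up; only the arguments kept by patch 41 are passed):

    import json

    from django.core.management import call_command

    from usaspending_api.download.lookups import JOB_STATUS_DICT
    from usaspending_api.download.models import DownloadJob

    # The command requires a job in the "ready" state and reads account_level
    # and filters (including submission_types) from its json_request.
    job = DownloadJob.objects.create(
        job_status_id=JOB_STATUS_DICT["ready"],
        file_name="example_account_download.zip",
        json_request=json.dumps(
            {
                "account_level": "federal_account",
                "filters": {"fy": 2018, "quarter": 4, "submission_types": ["award_financial"]},
            }
        ),
    )
    call_command("generate_spark_download", "--download-job-id", str(job.download_job_id))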