diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index f95f739dc4..d581feff06 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,27 +1,27 @@ -**Description:** -High level description of what the PR addresses should be put here. Should be detailed enough to communicate to a PO what this PR addresses without diving into the technical nuances +## Description: + -**Technical details:** -The technical details for the knowledge of other developers. Any detailed caveats or specific deployment steps should be outlined here. -**Requirements for PR merge:** + +## Technical Details: + + + + +## Requirements for PR Merge: + 1. [ ] Unit & integration tests updated -2. [ ] API documentation updated -3. [ ] Necessary PR reviewers: - - [ ] Backend - - [ ] Frontend - - [ ] Operations - - [ ] Domain Expert -4. [ ] Matview impact assessment completed -5. [ ] Frontend impact assessment completed -6. [ ] Data validation completed -7. [ ] Appropriate Operations ticket(s) created -8. [ ] Jira Ticket [DEV-123](https://federal-spending-transparency.atlassian.net/browse/DEV-123): - - [ ] Link to this Pull-Request - - [ ] Performance evaluation of affected (API | Script | Download) - - [ ] Before / After data comparison - -**Area for explaining above N/A when needed:** -``` -``` +2. [ ] API documentation updated (examples listed below) + 1. API Contracts + 2. API UI + 3. Comments +3. [ ] Data validation completed (examples listed below) + 1. Does this work well with the current frontend? Or is the frontend aware of a needed change? + 2. Is performance impacted in the changes (e.g., API, pipeline, downloads, etc.)? + 3. Is the expected data returned with the expected format? +4. [ ] Appropriate Operations ticket(s) created +5. [ ] Jira Ticket(s) + 1. [DEV-0](https://federal-spending-transparency.atlassian.net/browse/DEV-0) + +### Explain N/A in above checklist: diff --git a/.github/pull_request_template_future.md b/.github/pull_request_template_future.md deleted file mode 100644 index bc25a8a5f5..0000000000 --- a/.github/pull_request_template_future.md +++ /dev/null @@ -1,18 +0,0 @@ -**Description:** -High level description of what the PR addresses should be put here. Should be detailed enough to communicate to a PO what this PR addresses without diving into the technical nuances - -**Technical details:** -The technical details for the knowledge of other developers. Any detailed caveats or specific deployment steps should be outlined here. - -**Requirements for PR merge:** - -1. [ ] Definition of Done - Development section appropriately satisfied -2. [ ] Necessary PR reviewers: - - [ ] Backend - - [ ] Frontend - - [ ] Operations -3. 
[ ] Jira Ticket(s) - - [DEV-0](https://federal-spending-transparency.atlassian.net/browse/DEV-0): - - -Click [here](https://github.com/fedspendingtransparency/data-act-documentation/blob/master/agile_practices/story_definition_of_done.md) for Definition of Done diff --git a/.github/workflows/pull-request-and-review-updates.yaml b/.github/workflows/pull-request-and-review-updates.yaml new file mode 100644 index 0000000000..94465c9f6f --- /dev/null +++ b/.github/workflows/pull-request-and-review-updates.yaml @@ -0,0 +1,27 @@ +name: Pull Request and Review Updates + +on: + pull_request: + types: [opened] + pull_request_review: + types: [submitted] + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number }}-${{ github.actor_id }} + cancel-in-progress: true + +jobs: + Update-Pull-Request-Assignees: + name: Update Pull Request Assignees + runs-on: ${{ vars.RUNNER_VERSION }} + steps: + - name: Update Assignee + uses: actions/github-script@v7 + with: + script: | + github.rest.issues.addAssignees({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + assignees: [context.actor] + }); diff --git a/README.md b/README.md index 6e084f8dc4..ad9534d54a 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ #

USAspending API

-[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black) [![Pull Request Checks](https://github.com/fedspendingtransparency/usaspending-api/actions/workflows/pull-request-checks.yaml/badge.svg)](https://github.com/fedspendingtransparency/usaspending-api/actions/workflows/pull-request-checks.yaml) [![Test Coverage](https://codeclimate.com/github/fedspendingtransparency/usaspending-api/badges/coverage.svg)](https://codeclimate.com/github/fedspendingtransparency/usaspending-api/coverage) [![Code Climate](https://codeclimate.com/github/fedspendingtransparency/usaspending-api/badges/gpa.svg)](https://codeclimate.com/github/fedspendingtransparency/usaspending-api) +[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black) [![Pull Request Checks](https://github.com/fedspendingtransparency/usaspending-api/actions/workflows/pull-request-checks.yaml/badge.svg?branch=staging)](https://github.com/fedspendingtransparency/usaspending-api/actions/workflows/pull-request-checks.yaml) [![Test Coverage](https://codeclimate.com/github/fedspendingtransparency/usaspending-api/badges/coverage.svg)](https://codeclimate.com/github/fedspendingtransparency/usaspending-api/coverage) [![Code Climate](https://codeclimate.com/github/fedspendingtransparency/usaspending-api/badges/gpa.svg)](https://codeclimate.com/github/fedspendingtransparency/usaspending-api) _This API is utilized by USAspending.gov to obtain all federal spending data which is open source and provided to the public as part of the DATA Act._ diff --git a/requirements/requirements-app.txt b/requirements/requirements-app.txt index ae885c5c8f..159eeaa09f 100644 --- a/requirements/requirements-app.txt +++ b/requirements/requirements-app.txt @@ -2,6 +2,7 @@ asyncpg==0.29.* attrs==23.2.* boto3==1.34.* certifi==2024.7.4 +databricks-sdk==0.44.1 # Pinned because newer versions use v2.2 of the API which is not supported by PVC dataclasses-json==0.6.* dj-database-url==2.1.0 django-cors-headers==4.3.* @@ -28,6 +29,7 @@ psutil==5.9.* psycopg2==2.9.9 # Pinning exact version because this package will drop support for Python versions in patches py-gfm==2.0.0 pydantic[dotenv]==1.9.* +python-dateutil==2.9.* python-json-logger==2.0.7 requests==2.31.* retrying==1.3.4 diff --git a/usaspending_api/api_contracts/contracts/v2/download/accounts.md b/usaspending_api/api_contracts/contracts/v2/download/accounts.md index d24acb595c..f63dce40d4 100644 --- a/usaspending_api/api_contracts/contracts/v2/download/accounts.md +++ b/usaspending_api/api_contracts/contracts/v2/download/accounts.md @@ -21,8 +21,8 @@ Generate files and return metadata using filters on custom account + `account_level` (required, enum[string]) The account level is used to filter for a specific type of file. + Members - + `treasury_account` + `federal_account` + + `treasury_account` + `file_format` (optional, enum[string]) The format of the file(s) in the zip file containing the data. + Default: `csv` @@ -87,9 +87,20 @@ Generate files and return metadata using filters on custom account + `agency` (optional, string) The agency on which to filter. This field expects an internal toptier agency identifier also known as the `toptier_agency_id`. + Default: `all` ++ `budget_function` (optional, string) + The budget function code on which to filter. 
++ `budget_subfunction` (optional, string) + The budget subfunction code on which to filter. + `federal_account`(optional, string) This field is an internal id. -+ `submission_types` (required, array) ++ `submission_type` (optional, enum[string]) + Either `submission_type` or `submission_types` is required. + + Members + + `account_balances` + + `object_class_program_activity` + + `award_financial` ++ `submission_types` (optional, array) + Either `submission_type` or `submission_types` is required. + (enum[string]) + `account_balances` + `object_class_program_activity` diff --git a/usaspending_api/awards/management/commands/generate_unlinked_awards_download.py b/usaspending_api/awards/management/commands/generate_unlinked_awards_download.py index 5dd53d4953..ac490d67fb 100644 --- a/usaspending_api/awards/management/commands/generate_unlinked_awards_download.py +++ b/usaspending_api/awards/management/commands/generate_unlinked_awards_download.py @@ -144,10 +144,10 @@ def process_data_copy_jobs(self, zip_file_path): sql_file = None final_path = self._create_data_csv_dest_path(final_name) intermediate_data_file_path = final_path.parent / (final_path.name + "_temp") - data_file_names, count = self.download_to_csv( + download_metadata = self.download_to_csv( sql_file, final_path, final_name, str(intermediate_data_file_path), zip_file_path, df ) - if count <= 0: + if download_metadata.number_of_rows <= 0: logger.warning(f"Empty data file generated: {final_path}!") self.filepaths_to_delete.extend(self.working_dir_path.glob(f"{final_path.stem}*")) @@ -159,7 +159,7 @@ def complete_zip_and_upload(self, zip_file_path): upload_download_file_to_s3(zip_file_path, settings.UNLINKED_AWARDS_DOWNLOAD_REDIRECT_DIR) logger.info("Marking zip file for deletion in cleanup") else: - logger.warn("Not uploading zip file to S3. Leaving file locally") + logger.warning("Not uploading zip file to S3. Leaving file locally") self.filepaths_to_delete.remove(zip_file_path) @property diff --git a/usaspending_api/broker/management/commands/update_table_value_from_broker.py b/usaspending_api/broker/management/commands/update_table_value_from_broker.py index 92ab6e9ec8..b8dc25229e 100644 --- a/usaspending_api/broker/management/commands/update_table_value_from_broker.py +++ b/usaspending_api/broker/management/commands/update_table_value_from_broker.py @@ -30,7 +30,7 @@ def add_arguments(self, parser): parser.add_argument( "--load-field-type", type=str, - required=True, + required=False, default="text", help="Postgres data type of the field that will be copied from Broker", ) @@ -122,12 +122,16 @@ def run_update(self, min_id: int, max_id: int) -> None: 'broker_server','( SELECT {self.broker_match_field}, {self.broker_load_field} FROM {self.broker_table_name} + WHERE + {self.broker_match_field} >= {chunk_min_id} + AND {self.broker_match_field} <= {chunk_max_id} )') AS broker_table ( lookup_id bigint, load_field {self.load_field_type} ) - WHERE usas_table.{self.usas_match_field} = broker_table.lookup_id + WHERE + usas_table.{self.usas_match_field} = broker_table.lookup_id ; """ ) @@ -135,11 +139,11 @@ def run_update(self, min_id: int, max_id: int) -> None: row_count = cursor.rowcount total_row_count += row_count ratio = (chunk_max_id - min_id + 1) / estimated_id_count - logging.info( + logger.info( f'Updated {row_count:,d} rows with "{self.usas_match_field}" between {chunk_min_id:,d} and {chunk_max_id:,d}.' 
f" Estimated time remaining: {timer.estimated_remaining_runtime(ratio)}" ) - logging.info( + logger.info( f'Finished updating {total_row_count:,d} rows for "{self.usas_table_name}"."{self.usas_load_field}" ' f"in {timer}" ) diff --git a/usaspending_api/common/etl/spark.py b/usaspending_api/common/etl/spark.py index ef606dce6e..85067c1c5c 100644 --- a/usaspending_api/common/etl/spark.py +++ b/usaspending_api/common/etl/spark.py @@ -6,46 +6,47 @@ """ import logging - -from itertools import chain -from typing import List -from pyspark.sql.functions import to_date, lit, expr, concat, concat_ws, col, regexp_replace, transform, when -from pyspark.sql.types import StructType, DecimalType, StringType, ArrayType -from pyspark.sql import DataFrame, SparkSession import time from collections import namedtuple +from itertools import chain +from typing import List + from py4j.protocol import Py4JError +from pyspark.sql import DataFrame, SparkSession +from pyspark.sql.functions import col, concat, concat_ws, expr, lit, regexp_replace, to_date, transform, when +from pyspark.sql.types import ArrayType, DecimalType, StringType, StructType from usaspending_api.accounts.models import FederalAccount, TreasuryAppropriationAccount -from usaspending_api.config import CONFIG from usaspending_api.common.helpers.spark_helpers import ( + get_broker_jdbc_url, get_jdbc_connection_properties, get_usas_jdbc_url, - get_broker_jdbc_url, ) +from usaspending_api.config import CONFIG +from usaspending_api.download.filestreaming.download_generation import EXCEL_ROW_LIMIT from usaspending_api.financial_activities.models import FinancialAccountsByProgramActivityObjectClass from usaspending_api.recipient.models import StateData from usaspending_api.references.models import ( - Cfda, - Agency, - ToptierAgency, - SubtierAgency, + CGAC, NAICS, - Office, PSC, - RefCountryCode, + Agency, + Cfda, CityCountyStateCode, - PopCounty, - PopCongressionalDistrict, DisasterEmergencyFundCode, - RefProgramActivity, - ObjectClass, GTASSF133Balances, - CGAC, + ObjectClass, + Office, + PopCongressionalDistrict, + PopCounty, + RefCountryCode, + RefProgramActivity, + SubtierAgency, + ToptierAgency, + ZipsGrouped, ) from usaspending_api.reporting.models import ReportingAgencyMissingTas, ReportingAgencyOverview -from usaspending_api.submissions.models import SubmissionAttributes, DABSSubmissionWindowSchedule -from usaspending_api.download.filestreaming.download_generation import EXCEL_ROW_LIMIT +from usaspending_api.submissions.models import DABSSubmissionWindowSchedule, SubmissionAttributes MAX_PARTITIONS = CONFIG.SPARK_MAX_PARTITIONS _USAS_RDS_REF_TABLES = [ @@ -73,9 +74,10 @@ TreasuryAppropriationAccount, ReportingAgencyOverview, ReportingAgencyMissingTas, + ZipsGrouped, ] -_BROKER_REF_TABLES = ["zips_grouped", "cd_state_grouped", "cd_zips_grouped", "cd_county_grouped", "cd_city_grouped"] +_BROKER_REF_TABLES = ["cd_state_grouped", "cd_zips_grouped", "cd_county_grouped", "cd_city_grouped"] logger = logging.getLogger(__name__) @@ -444,7 +446,7 @@ def convert_array_cols_to_string( is_postgres_array_format=False, is_for_csv_export=False, ) -> DataFrame: - """For each column that is an Array of ANYTHING, transfrom it to a string-ified representation of that Array. + """For each column that is an Array of ANYTHING, transform it to a string-ified representation of that Array. This will: 1. 
cast each array element to a STRING representation @@ -576,7 +578,7 @@ def create_ref_temp_views(spark: SparkSession, create_broker_views: bool = False for sql_statement in broker_sql_strings: spark.sql(sql_statement) - logger.info(f"Created the reference views in the global_temp database") + logger.info("Created the reference views in the global_temp database") def write_csv_file( @@ -691,7 +693,7 @@ def hadoop_copy_merge( logger.debug(f"Including part file: {file_path.getName()}") part_files.append(f.getPath()) if not part_files: - logger.warn("Source directory is empty with no part files. Attempting creation of file with CSV header only") + logger.warning("Source directory is empty with no part files. Attempting creation of file with CSV header only") out_stream = None try: merged_file_path = f"{parts_dir}.{file_format}" diff --git a/usaspending_api/common/helpers/download_csv_strategies.py b/usaspending_api/common/helpers/download_csv_strategies.py index 04560eb0cd..0e8b44fab5 100644 --- a/usaspending_api/common/helpers/download_csv_strategies.py +++ b/usaspending_api/common/helpers/download_csv_strategies.py @@ -1,14 +1,15 @@ -from abc import ABC, abstractmethod +import logging import multiprocessing import time -import logging +from abc import ABC, abstractmethod +from dataclasses import dataclass from pathlib import Path -from typing import Tuple -from django.conf import settings +from typing import List, Optional +from django.conf import settings +from pyspark.sql import DataFrame from usaspending_api.common.csv_helpers import count_rows_in_delimited_file -from usaspending_api.common.helpers.s3_helpers import delete_s3_object, download_s3_object -from usaspending_api.common.helpers.sql_helpers import read_sql_file_to_text +from usaspending_api.common.helpers.s3_helpers import delete_s3_objects, download_s3_object from usaspending_api.download.filestreaming.download_generation import ( EXCEL_ROW_LIMIT, split_and_zip_data_files, @@ -18,7 +19,13 @@ ) from usaspending_api.download.filestreaming.zip_file import append_files_to_zip_file from usaspending_api.download.lookups import FILE_FORMATS -from typing import List + + +@dataclass +class CSVDownloadMetadata: + filepaths: list[str] + number_of_rows: int + number_of_columns: Optional[int] = None class AbstractToCSVStrategy(ABC): @@ -37,13 +44,13 @@ def __init__(self, *args, **kwargs): @abstractmethod def download_to_csv( self, - source_sql: str, + source_sql: str | None, destination_path: Path, destination_file_name: str, working_dir_path: Path, download_zip_path: Path, - source_df=None, - ) -> Tuple[List[str], int]: + source_df: DataFrame | None = None, + ) -> CSVDownloadMetadata: """ Args: source_sql: Some string that can be used as the source sql @@ -51,9 +58,10 @@ def download_to_csv( destination_file_name: The name of the file in destination path without a file extension working_dir_path: The working directory path as a string download_zip_path: The path (as a string) to the download zip file + source_df: A pyspark DataFrame that contains the data to be downloaded Returns: - Returns a list of paths to the downloaded csv files and the total record count of all those files. 
+ Returns a CSVDownloadMetadata object (a dataclass containing metadata about the download) """ pass @@ -66,12 +74,11 @@ def __init__(self, logger: logging.Logger, *args, **kwargs): def download_to_csv( self, source_sql, destination_path, destination_file_name, working_dir_path, download_zip_path, source_df=None ): - source_sql = Path(source_sql) start_time = time.perf_counter() self._logger.info(f"Downloading data to {destination_path}") temp_data_file_name = destination_path.parent / (destination_path.name + "_temp") options = FILE_FORMATS[self.file_format]["options"] - export_query = r"\COPY ({}) TO STDOUT {}".format(read_sql_file_to_text(source_sql), options) + export_query = r"\COPY ({}) TO STDOUT {}".format(source_sql, options) try: temp_file, temp_file_path = generate_export_query_temp_file(export_query, None, working_dir_path) # Create a separate process to run the PSQL command; wait @@ -86,8 +93,8 @@ def download_to_csv( # Log how many rows we have self._logger.info(f"Counting rows in delimited text file {temp_data_file_name}") try: - count = count_rows_in_delimited_file(filename=temp_data_file_name, has_header=True, delimiter=delim) - self._logger.info(f"{destination_path} contains {count:,} rows of data") + row_count = count_rows_in_delimited_file(filename=temp_data_file_name, has_header=True, delimiter=delim) + self._logger.info(f"{destination_path} contains {row_count:,} rows of data") except Exception: self._logger.exception("Unable to obtain delimited text file line count") @@ -108,7 +115,7 @@ def download_to_csv( raise e finally: Path(temp_file_path).unlink() - return [destination_path], count + return CSVDownloadMetadata([destination_path], row_count) class SparkToCSVStrategy(AbstractToCSVStrategy): @@ -161,6 +168,7 @@ def download_to_csv( max_records_per_file=EXCEL_ROW_LIMIT, logger=self._logger, ) + column_count = len(df.columns) # When combining these later, will prepend the extracted header to each resultant file. # The parts therefore must NOT have headers or the headers will show up in the data when combined. header = ",".join([_.name for _ in df.schema.fields]) @@ -179,12 +187,12 @@ def download_to_csv( self._logger.exception("Exception encountered. 
See logs") raise finally: - delete_s3_object(s3_bucket_name, s3_destination_path) + delete_s3_objects(s3_bucket_name, key_prefix=f"{s3_bucket_sub_path}/{destination_file_name}") if self.spark_created_by_command: self.spark.stop() append_files_to_zip_file(final_csv_data_file_locations, download_zip_path) self._logger.info(f"Generated the following data csv files {final_csv_data_file_locations}") - return final_csv_data_file_locations, record_count + return CSVDownloadMetadata(final_csv_data_file_locations, record_count, column_count) def _move_data_csv_s3_to_local( self, bucket_name, s3_file_paths, s3_bucket_path, s3_bucket_sub_path, destination_path_dir diff --git a/usaspending_api/common/helpers/s3_helpers.py b/usaspending_api/common/helpers/s3_helpers.py index 0a48df14bf..e289e34027 100644 --- a/usaspending_api/common/helpers/s3_helpers.py +++ b/usaspending_api/common/helpers/s3_helpers.py @@ -8,7 +8,7 @@ from botocore.exceptions import ClientError from django.conf import settings from pathlib import Path -from typing import List +from typing import Optional from botocore.client import BaseClient from usaspending_api.config import CONFIG @@ -46,7 +46,7 @@ def get_s3_bucket(bucket_name: str, region_name: str = CONFIG.AWS_REGION) -> "bo return s3.Bucket(bucket_name) -def retrieve_s3_bucket_object_list(bucket_name: str) -> List["boto3.resources.factory.s3.ObjectSummary"]: +def retrieve_s3_bucket_object_list(bucket_name: str) -> list["boto3.resources.factory.s3.ObjectSummary"]: try: bucket = get_s3_bucket(bucket_name=bucket_name) bucket_objects = list(bucket.objects.all()) @@ -77,12 +77,12 @@ def upload_download_file_to_s3(file_path, sub_dir=None): def multipart_upload(bucketname, regionname, source_path, keyname, sub_dir=None): - s3client = boto3.client("s3", region_name=regionname) + s3_client = _get_boto3("client", "s3", region_name=regionname) source_size = Path(source_path).stat().st_size # Sets the chunksize at minimum ~5MB to sqrt(5MB) * sqrt(source size) bytes_per_chunk = max(int(math.sqrt(5242880) * math.sqrt(source_size)), 5242880) config = TransferConfig(multipart_chunksize=bytes_per_chunk) - transfer = S3Transfer(s3client, config) + transfer = S3Transfer(s3_client, config) file_name = Path(keyname).name if sub_dir is not None: file_name = f"{sub_dir}/{file_name}" @@ -132,3 +132,36 @@ def delete_s3_object(bucket_name: str, key: str, region_name: str = settings.USA """ s3 = _get_boto3("client", "s3", region_name=region_name) s3.delete_object(Bucket=bucket_name, Key=key) + + +def delete_s3_objects( + bucket_name: str, + *, + key_list: Optional[list[str]] = None, + key_prefix: Optional[str] = None, + region_name: Optional[str] = settings.USASPENDING_AWS_REGION, +) -> int: + """Deletes all objects based on a list of keys + Args: + bucket_name: The name of the bucket where the objects are located + key_list: A list of keys representing objects in the bucket to delete + key_prefix: A prefix in the bucket used to generate a list of objects to delete + region_name: AWS region to use; defaults to the settings provided region + + Returns: + Number of objects delete + """ + object_list = [] + + if key_prefix: + bucket = get_s3_bucket(bucket_name, region_name) + objects = bucket.objects.filter(Prefix=key_prefix) + object_list.extend([{"Key": obj.key} for obj in objects]) + + if key_list: + object_list.extend([{"Key": key} for key in key_list]) + + s3_client = _get_boto3("client", "s3", region_name=region_name) + resp = s3_client.delete_objects(Bucket=bucket_name, Delete={"Objects": 
object_list}) + + return len(resp.get("Deleted", [])) diff --git a/usaspending_api/common/spark/__init__.py b/usaspending_api/common/spark/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/usaspending_api/common/spark/configs.py b/usaspending_api/common/spark/configs.py new file mode 100644 index 0000000000..4fa409785d --- /dev/null +++ b/usaspending_api/common/spark/configs.py @@ -0,0 +1,69 @@ +from usaspending_api.config import CONFIG + +# The versions below are determined by the current version of Databricks in use +_SCALA_VERSION = "2.12" +_HADOOP_VERSION = "3.3.4" +_SPARK_VERSION = "3.5.0" +_DELTA_VERSION = "3.1.0" + +# List of Maven coordinates for required JAR files used by running code, which can be added to the driver and +# executor class paths +SPARK_SESSION_JARS = [ + # "com.amazonaws:aws-java-sdk:1.12.31", + # hadoop-aws is an add-on to hadoop with Classes that allow hadoop to interface with an S3A (AWS S3) FileSystem + # NOTE that in order to work, the version number should be the same as the Hadoop version used by your Spark runtime + # It SHOULD pull in (via Ivy package manager from maven repo) the version of com.amazonaws:aws-java-sdk that is + # COMPATIBLE with it (so that should not be set as a dependent package by us) + f"org.apache.hadoop:hadoop-aws:{_HADOOP_VERSION}", + "org.postgresql:postgresql:42.2.23", + f"io.delta:delta-spark_{_SCALA_VERSION}:{_DELTA_VERSION}", +] + +# TODO: This should be used more widely across our different commands +DEFAULT_EXTRA_CONF = { + # Config for Delta Lake tables and SQL. Need these to keep Delta table metadata in the metastore + "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension", + "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog", + # See comments below: old date and time values cannot be parsed without these + "spark.sql.legacy.parquet.datetimeRebaseModeInWrite": "LEGACY", # for dates at/before 1900 + "spark.sql.legacy.parquet.int96RebaseModeInWrite": "LEGACY", # for timestamps at/before 1900 + "spark.sql.jsonGenerator.ignoreNullFields": "false", # keep nulls in our json +} + +LOCAL_BASIC_EXTRA_CONF = { + **DEFAULT_EXTRA_CONF, + # This is the default, but being explicit + "spark.master": "local[*]", + "spark.driver.host": "127.0.0.1", # if not set fails in local envs, trying to use network IP instead + # Client deploy mode is the default, but being explicit. + # Means the driver node is the place where the SparkSession is instantiated (and/or where spark-submit + # process is started from, even if started under the hood of a Py4J JavaGateway). With a "standalone" (not + # YARN or Mesos or Kubernetes) cluster manager, only client mode is supported. + # Default of 1g (1GiB) for Driver. 
Increase here if the Java process is crashing with memory errors + "spark.driver.memory": "1g", + "spark.executor.memory": "1g", + "spark.ui.enabled": "false", # Does the same as setting SPARK_TESTING=true env var + "spark.jars.packages": ",".join(SPARK_SESSION_JARS), +} + + +LOCAL_EXTENDED_EXTRA_CONF = { + **LOCAL_BASIC_EXTRA_CONF, + "spark.hadoop.fs.s3a.endpoint": getattr(CONFIG, "MINIO_HOST", ""), + "spark.hadoop.fs.s3a.connection.ssl.enabled": False, + "spark.hadoop.fs.s3a.path.style.access": True, + "spark.sql.catalogImplementation": "hive", + "spark.sql.warehouse.dir": getattr(CONFIG, "SPARK_SQL_WAREHOUSE_DIR", ""), +} + +if getattr(CONFIG, "MINIO_ACCESS_KEY", False) and getattr(CONFIG.MINIO_ACCESS_KEY, "get_secret_value", False): + LOCAL_EXTENDED_EXTRA_CONF["spark.hadoop.fs.s3a.access.key"] = CONFIG.MINIO_ACCESS_KEY.get_secret_value() + +if getattr(CONFIG, "MINIO_SECRET_KEY", False): + LOCAL_EXTENDED_EXTRA_CONF["spark.hadoop.fs.s3a.secret.key"] = CONFIG.MINIO_SECRET_KEY.get_secret_value() + +if getattr(CONFIG, "HIVE_METASTORE_DERBY_DB_DIR", False): + LOCAL_EXTENDED_EXTRA_CONF["spark.hadoop.javax.jdo.option.ConnectionURL"] = ( + f"jdbc:derby:;databaseName={CONFIG.HIVE_METASTORE_DERBY_DB_DIR};create=true" + ) diff --git a/usaspending_api/common/spark/jobs.py b/usaspending_api/common/spark/jobs.py new file mode 100644 index 0000000000..0090a7e2d8 --- /dev/null +++ b/usaspending_api/common/spark/jobs.py @@ -0,0 +1,130 @@ +import logging +from abc import ABC, abstractmethod +from contextlib import contextmanager +from typing import TYPE_CHECKING, Generator + +from databricks.sdk import WorkspaceClient +from django.core.management import call_command + +from usaspending_api.common.helpers.spark_helpers import configure_spark_session, get_active_spark_session +from usaspending_api.common.spark.configs import LOCAL_EXTENDED_EXTRA_CONF + +if TYPE_CHECKING: + from pyspark.sql import SparkSession + +logger = logging.getLogger(__name__) + + +class _AbstractStrategy(ABC): + + @property + @abstractmethod + def name(self) -> str: + pass + + @abstractmethod + def handle_start( + self, job_name: str, command_name: str, command_options: list[str], **kwargs + ) -> dict[str, str] | None: + pass + + +class DatabricksStrategy(_AbstractStrategy): + + _client: WorkspaceClient = None + + @property + def name(self) -> str: + return "DATABRICKS" + + @property + def client(self) -> WorkspaceClient: + if not self._client: + self._client = WorkspaceClient() + return self._client + + def handle_start(self, job_name: str, command_name: str, command_options: list[str], **kwargs) -> dict[str, str]: + job_id = self.get_job_id(job_name) + try: + job_run = self.client.jobs.run_now(job_id, python_params=[command_name, *command_options]) + except Exception: + logger.exception(f'Failed to run job "{job_name}" with ID "{job_id}"') + raise + return {"job_id": job_id, "run_id": job_run.bind()["run_id"]} + + def get_job_id(self, job_name: str) -> int: + run_list = list(self.client.jobs.list(name=job_name)) + if len(run_list) == 0: + raise ValueError(f"No job found with name: {job_name}") + if len(run_list) > 1: + raise ValueError(f"More than one job found that match name: {job_name}") + return run_list[0].job_id + + +class EmrServerlessStrategy(_AbstractStrategy): + + @property + def name(self) -> str: + return "EMR_SERVERLESS" + + def handle_start(self, job_name: str, command_name: str, command_options: list[str], **kwargs) -> dict[str, str]: + # TODO: This will be implemented as we migrate, but added as a placeholder for 
now + pass + + +class LocalStrategy(_AbstractStrategy): + + @property + def name(self) -> str: + return "LOCAL" + + @staticmethod + @contextmanager + def _get_spark_session() -> Generator["SparkSession", None, None]: + extra_conf = { + **LOCAL_EXTENDED_EXTRA_CONF, + # Overwrite to allow more memory given this will process more data than test cases + "spark.driver.memory": "2g", + "spark.executor.memory": "2g", + } + spark = get_active_spark_session() + spark_created_for_job = False + if not spark: + spark_created_for_job = True + spark = configure_spark_session(**extra_conf, spark_context=spark, enable_hive_support=True) + + yield spark + + if spark_created_for_job: + spark.stop() + + def handle_start(self, job_name: str, command_name: str, command_options: list[str], **kwargs) -> None: + try: + with self._get_spark_session(): + call_command(command_name, *command_options) + except Exception: + logger.exception(f"Failed on command: {command_name} {' '.join(command_options)}") + raise + + +class SparkJobs: + def __init__(self, strategy: _AbstractStrategy): + self._strategy = strategy + + @property + def strategy(self) -> _AbstractStrategy: + return self._strategy + + @strategy.setter + def strategy(self, strategy: _AbstractStrategy) -> None: + self._strategy = strategy + + def start(self, job_name: str, command_name: str, command_options: list[str], **kwargs) -> dict[str, str] | None: + logger.info(f'Starting {job_name} on {self.strategy.name}: "{command_name} {" ".join(command_options)}"') + run_details = self.strategy.handle_start(job_name, command_name, command_options, **kwargs) + if run_details is None: + success_msg = "Job completed successfully" + else: + success_msg = f"Job run successfully started; {run_details}" + logger.info(success_msg) + return run_details diff --git a/usaspending_api/common/tests/integration/test_spark_jobs.py b/usaspending_api/common/tests/integration/test_spark_jobs.py new file mode 100644 index 0000000000..389d41371e --- /dev/null +++ b/usaspending_api/common/tests/integration/test_spark_jobs.py @@ -0,0 +1,21 @@ +from usaspending_api.common.spark.jobs import LocalStrategy, SparkJobs +from usaspending_api.etl.management.commands.load_query_to_delta import TABLE_SPEC + + +def test_local_spark_jobs_strategy(spark, s3_unittest_data_bucket, hive_unittest_metastore_db): + expected_table_name = "award_search" + delta_table_spec = TABLE_SPEC[expected_table_name] + expected_db_name = delta_table_spec["destination_database"] + + spark_jobs = SparkJobs(LocalStrategy()) + spark_jobs.start( + job_name="create_delta_table-award_search", + command_name="create_delta_table", + command_options=[f"--destination-table={expected_table_name}", f"--spark-s3-bucket={s3_unittest_data_bucket}"], + ) + + schemas = spark.sql("show schemas").collect() + assert expected_db_name in [s["namespace"] for s in schemas] + + tables = spark.sql("show tables").collect() + assert expected_table_name in [t["tableName"] for t in tables] diff --git a/usaspending_api/common/tests/unit/test_spark_jobs.py b/usaspending_api/common/tests/unit/test_spark_jobs.py new file mode 100644 index 0000000000..09dd77f9b9 --- /dev/null +++ b/usaspending_api/common/tests/unit/test_spark_jobs.py @@ -0,0 +1,44 @@ +from unittest.mock import MagicMock, patch + +from usaspending_api.common.spark.jobs import DatabricksStrategy, EmrServerlessStrategy, LocalStrategy, SparkJobs + + +@patch("usaspending_api.common.spark.jobs.EmrServerlessStrategy.handle_start") 
+@patch("usaspending_api.common.spark.jobs.DatabricksStrategy.handle_start") +@patch("usaspending_api.common.spark.jobs.LocalStrategy.handle_start") +def test_set_strategy(local_strategy_start, databricks_strategy_start, emr_serverless_strategy_start): + spark_job = SparkJobs(LocalStrategy()) + assert spark_job.strategy.name == "LOCAL" + spark_job.start("test", "test", ["test"]) + assert local_strategy_start.call_count == 1 + + spark_job.strategy = DatabricksStrategy() + assert spark_job.strategy.name == "DATABRICKS" + spark_job.start("test", "test", ["test"]) + assert databricks_strategy_start.call_count == 1 + + spark_job.strategy = EmrServerlessStrategy() + assert spark_job.strategy.name == "EMR_SERVERLESS" + spark_job.start("test", "test", ["test"]) + assert emr_serverless_strategy_start.call_count == 1 + + +@patch("usaspending_api.common.spark.jobs.DatabricksStrategy.client") +def test_databricks_strategy_handle_start(databricks_strategy_client): + mock_job = MagicMock() + mock_job.job_id = 1 + + mock_job_run_wait = MagicMock() + mock_job_run_wait.bind = MagicMock(return_value={"run_id": 10}) + mock_job_run = MagicMock(return_value=mock_job_run_wait) + + mock_jobs = MagicMock() + mock_jobs.list = MagicMock(return_value=[mock_job]) + mock_jobs.run_now = mock_job_run + databricks_strategy_client.jobs = mock_jobs + + strategy = DatabricksStrategy() + assert strategy.get_job_id("test_job_name") == 1 + + spark_job = SparkJobs(DatabricksStrategy()) + assert spark_job.start(job_name="", command_name="", command_options=[""]) == {"job_id": 1, "run_id": 10} diff --git a/usaspending_api/config/envs/local.py b/usaspending_api/config/envs/local.py index bb6b1e54c9..7c137962d3 100644 --- a/usaspending_api/config/envs/local.py +++ b/usaspending_api/config/envs/local.py @@ -37,7 +37,7 @@ class LocalConfig(DefaultConfig): ENV_CODE: ClassVar[str] = "lcl" # Common credentials to share across services for convenience / ease on remembering - _USASPENDING_USER: str = "usaspending" + _USASPENDING_USER: SecretStr = "usaspending" _USASPENDING_PASSWORD: SecretStr = "usaspender" # ==== [Postgres USAS] ==== diff --git a/usaspending_api/database_scripts/etl/location_delta_view.sql b/usaspending_api/database_scripts/etl/location_delta_view.sql index cfb0b5ec1c..4a5fd74a08 100644 --- a/usaspending_api/database_scripts/etl/location_delta_view.sql +++ b/usaspending_api/database_scripts/etl/location_delta_view.sql @@ -1,460 +1,238 @@ DROP VIEW IF EXISTS location_delta_view; CREATE VIEW location_delta_view AS - WITH transaction_locations_cte AS ( - SELECT - -- Country - CASE - WHEN pop_country_name = 'UNITED STATES OF AMERICA' - THEN 'UNITED STATES' - ELSE - pop_country_name - END AS pop_country_string, - CASE - WHEN pop_country_name = 'UNITED STATES OF AMERICA' - THEN TO_JSONB( - JSONB_BUILD_OBJECT( - 'country_name', 'UNITED STATES', - 'location_type', 'country' - ) - ) - ELSE +-- Country +WITH + country_cte AS ( + SELECT + UPPER(country_name) AS location, TO_JSONB( JSONB_BUILD_OBJECT( - 'country_name', pop_country_name, + 'country_name', country_name, 'location_type', 'country' ) - ) - END AS pop_country_json, + ) AS location_json + FROM + ref_country_code + ), -- State - CASE - WHEN pop_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') AND pop_state_name IS NOT NULL - THEN CONCAT(UPPER(pop_state_name), ', ', 'UNITED STATES') - END AS pop_state_string, - CASE - WHEN pop_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') AND pop_state_name IS NOT NULL - THEN TO_JSONB( + state_cte AS ( + SELECT 
+ CONCAT(UPPER(name), ', ', 'UNITED STATES') AS location, + TO_JSONB( JSONB_BUILD_OBJECT( - 'state_name', UPPER(pop_state_name), + 'state_name', UPPER(name), 'country_name', 'UNITED STATES', 'location_type', 'state' ) - ) - END AS pop_state_json, - -- City - CASE - WHEN pop_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') AND pop_state_name IS NOT NULL AND pop_city_name IS NOT NULL - THEN CONCAT(UPPER(pop_city_name), ', ', UPPER(pop_state_name), ', ', 'UNITED STATES') - WHEN pop_country_name NOT IN ('UNITED STATES OF AMERICA', 'UNITED STATES') AND pop_country_name IS NOT NULL AND pop_city_name IS NOT NULL - THEN concat(UPPER(pop_city_name), ', ', UPPER(pop_country_name)) - END AS pop_city_string, - CASE - WHEN pop_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') AND pop_state_name IS NOT NULL AND pop_city_name IS NOT NULL - THEN TO_JSONB( + ) AS location_json + FROM + state_data + ), + -- City (domestic) + city_domestic AS ( + SELECT + CONCAT(UPPER(ref_city.feature_name), ', ', UPPER(ref_state.name), ', ', 'UNITED STATES') AS location, + TO_JSONB( JSONB_BUILD_OBJECT( - 'city_name', UPPER(pop_city_name), - 'state_name', UPPER(pop_state_name), + 'city_name', UPPER(ref_city.feature_name), + 'state_name', UPPER(ref_state.name), 'country_name', 'UNITED STATES', 'location_type', 'city' ) - ) - WHEN pop_country_name NOT IN ('UNITED STATES OF AMERICA', 'UNITED STATES') AND pop_country_name IS NOT NULL AND pop_city_name IS NOT NULL - THEN TO_JSONB( + ) AS location_json + FROM + ref_city_county_state_code AS ref_city + JOIN + state_data AS ref_state ON ref_state.code = ref_city.state_alpha + ), + -- City (foreign) + city_foreign_pop_cte AS ( + SELECT + CONCAT(UPPER(pop_city_name), ', ', rcc.country_name) AS location, + TO_JSONB( JSONB_BUILD_OBJECT( 'city_name', UPPER(pop_city_name), 'state_name', NULL, - 'country_name', UPPER(pop_country_name), + 'country_name', rcc.country_name, 'location_type', 'city' ) - ) - END AS pop_city_json, + ) AS location_json + FROM + rpt.transaction_search + JOIN + ref_country_code AS rcc ON pop_country_code = rcc.country_code + WHERE + rcc.country_name NOT IN ('UNITED STATES OF AMERICA', 'UNITED STATES') + AND + rcc.country_name IS NOT NULL + AND + pop_city_name IS NOT NULL + ), + city_foreign_rl_cte AS ( + SELECT + CONCAT(UPPER(recipient_location_city_name), ', ', rcc.country_name) AS location, + TO_JSONB( + JSONB_BUILD_OBJECT( + 'city_name', UPPER(recipient_location_city_name), + 'state_name', NULL, + 'country_name', rcc.country_name, + 'location_type', 'city' + ) + ) AS location_json + FROM + rpt.transaction_search + JOIN ref_country_code AS rcc ON + recipient_location_country_code = rcc.country_code + WHERE + rcc.country_name NOT IN ('UNITED STATES OF AMERICA', 'UNITED STATES') + AND + rcc.country_name IS NOT NULL + AND + recipient_location_city_name IS NOT NULL + ), -- County - CASE - WHEN ( - pop_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') - AND pop_state_name IS NOT NULL - AND pop_state_fips IS NOT NULL - AND pop_state_code IS NOT NULL - AND pop_county_name IS NOT NULL - ) - THEN CONCAT(UPPER(pop_county_name), ' COUNTY, ', UPPER(pop_state_name), ', ', 'UNITED STATES') - END AS pop_county_string, - CASE - WHEN ( - pop_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') - AND pop_state_name IS NOT NULL - AND pop_state_fips IS NOT NULL - AND pop_county_code IS NOT NULL - AND pop_county_name IS NOT NULL - ) - THEN TO_JSONB( + county_cte AS ( + SELECT + CONCAT(UPPER(ref_county.county_name), ' COUNTY, ', 
UPPER(sd.name), ', ', 'UNITED STATES') AS location, + TO_JSONB( JSONB_BUILD_OBJECT( - 'county_name', UPPER(pop_county_name), - 'county_fips', CONCAT(pop_state_fips, pop_county_code), - 'state_name', UPPER(pop_state_name), + 'county_name', UPPER(ref_county.county_name), + 'state_name', UPPER(sd.name), 'country_name', 'UNITED STATES', 'location_type', 'county' ) - ) - END AS pop_county_json, + ) AS location_json + FROM + ref_city_county_state_code AS ref_county + JOIN + state_data AS sd ON sd.code = ref_county.state_alpha + ), -- Zip code - CASE - WHEN pop_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') - AND pop_state_name IS NOT NULL - AND pop_zip5 IS NOT NULL - THEN pop_zip5 - END AS pop_zip_string, - CASE - WHEN ( - pop_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') - AND pop_state_name IS NOT NULL - AND pop_zip5 IS NOT NULL - ) - THEN TO_JSONB( + zip_cte AS ( + SELECT + CONCAT(zips.zip5, ', ', UPPER(sd.name), ', ', 'UNITED STATES') AS location, + TO_JSONB( JSONB_BUILD_OBJECT( - 'zip_code', pop_zip5, - 'state_name', UPPER(pop_state_name), + 'zip_code', zips.zip5, + 'state_name', UPPER(sd.name), 'country_name', 'UNITED STATES', 'location_type', 'zip_code' ) - ) - END AS pop_zip_json, + ) AS location_json + FROM + zips_grouped AS zips + JOIN + state_data AS sd ON sd.code = zips.state_abbreviation + ), -- Current Congressional district - CASE - WHEN ( - pop_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') - AND pop_state_name IS NOT NULL - AND pop_congressional_code_current IS NOT NULL - AND pop_state_code IS NOT NULL - ) - THEN CONCAT(UPPER(pop_state_code), pop_congressional_code_current) - END AS pop_current_cd_string, - CASE - WHEN ( - pop_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') - AND pop_state_name IS NOT NULL - AND pop_congressional_code_current IS NOT NULL - AND pop_state_code IS NOT NULL - ) - THEN TO_JSONB( + current_cd_pop_cte AS ( + SELECT + CONCAT(UPPER(pop_state_code), pop_congressional_code_current) AS location, + TO_JSONB( JSONB_BUILD_OBJECT( - 'current_cd', CONCAT(UPPER(pop_state_code), '-', pop_congressional_code_current), - 'state_name', UPPER(pop_state_name), + 'current_cd', CONCAT(UPPER(pop_state_code), '-', pop_congressional_code_current), + 'state_name', UPPER(sd.name), 'country_name', 'UNITED STATES', 'location_type', 'current_cd' ) - ) - END AS pop_current_cd_json, - -- Original Congressional district - CASE - WHEN ( - pop_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') - AND pop_state_name IS NOT NULL - AND pop_congressional_code IS NOT NULL - AND pop_state_code IS NOT NULL - ) - THEN CONCAT(UPPER(pop_state_code), pop_congressional_code) - END AS pop_original_cd_string, - CASE - WHEN ( - pop_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') - AND pop_state_name IS NOT NULL - AND pop_congressional_code IS NOT NULL - AND pop_state_code IS NOT NULL - ) - THEN TO_JSONB( - JSONB_BUILD_OBJECT( - 'original_cd', CONCAT(UPPER(pop_state_code), '-', pop_congressional_code), - 'state_name', UPPER(pop_state_name), - 'country_name', 'UNITED STATES', - 'location_type', 'original_cd' - ) - ) - END AS pop_original_cd_json, - CASE - WHEN recipient_location_country_name = 'UNITED STATES OF AMERICA' - THEN 'UNITED STATES' - ELSE recipient_location_country_name - END AS recipient_location_country_string, - CASE - WHEN recipient_location_country_name = 'UNITED STATES OF AMERICA' - THEN TO_JSONB( - JSONB_BUILD_OBJECT( - 'country_name', 'UNITED STATES', - 'location_type', 'country' - ) - ) - ELSE + ) 
AS location_json + FROM + rpt.transaction_search + RIGHT JOIN + state_data AS sd ON sd.code = pop_state_code + WHERE + pop_state_code IS NOT NULL + AND + pop_congressional_code_current ~ '^[0-9]{2}$' + ), + current_cd_rl_cte AS ( + SELECT + CONCAT(UPPER(recipient_location_state_code), recipient_location_congressional_code_current) AS location, TO_JSONB( JSONB_BUILD_OBJECT( - 'country_name', recipient_location_country_name, - 'location_type', 'country' - ) - ) - END AS recipient_location_country_json, - -- State - CASE - WHEN recipient_location_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') AND recipient_location_state_name IS NOT NULL - THEN CONCAT(UPPER(recipient_location_state_name), ', ', 'UNITED STATES') - END AS recipient_location_state_string, - CASE - WHEN recipient_location_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') AND recipient_location_state_name IS NOT NULL - THEN TO_JSONB( - JSONB_BUILD_OBJECT( - 'state_name', UPPER(recipient_location_state_name), - 'country_name', 'UNITED STATES', - 'location_type', 'state' - ) - ) - END AS recipient_location_state_json, - -- City - CASE - WHEN ( - recipient_location_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') - AND recipient_location_state_name IS NOT NULL - AND recipient_location_city_name IS NOT NULL - ) - THEN CONCAT(UPPER(recipient_location_city_name), ', ', UPPER(recipient_location_state_name), ', ', 'UNITED STATES') - WHEN ( - recipient_location_country_name NOT IN ('UNITED STATES OF AMERICA', 'UNITED STATES') - AND recipient_location_country_name IS NOT NULL - AND recipient_location_city_name IS NOT NULL - ) - THEN concat(UPPER(recipient_location_city_name), ', ', UPPER(recipient_location_country_name)) - END AS recipient_location_city_string, - CASE - WHEN ( - recipient_location_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') - AND recipient_location_state_name IS NOT NULL - AND recipient_location_city_name IS NOT NULL - ) - THEN TO_JSONB( - JSONB_BUILD_OBJECT( - 'city_name', UPPER(recipient_location_city_name), - 'state_name', UPPER(recipient_location_state_name), - 'country_name', 'UNITED STATES', - 'location_type', 'city' - ) - ) - WHEN ( - recipient_location_country_name NOT IN ('UNITED STATES OF AMERICA', 'UNITED STATES') - AND recipient_location_country_name IS NOT NULL - AND recipient_location_city_name IS NOT NULL - ) - THEN TO_JSONB( - JSONB_BUILD_OBJECT( - 'city_name', UPPER(recipient_location_city_name), - 'state_name', NULL, - 'country_name', UPPER(recipient_location_country_name), - 'location_type', 'city' - ) - ) - END AS recipient_location_city_json, - -- County - CASE - WHEN ( - recipient_location_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') - AND recipient_location_state_name IS NOT NULL - AND recipient_location_state_fips IS NOT NULL - AND recipient_location_state_code IS NOT NULL - AND recipient_location_county_name IS NOT NULL - ) - THEN CONCAT(UPPER(recipient_location_county_name), ' COUNTY, ', UPPER(recipient_location_state_name), ', ', 'UNITED STATES') - END AS recipient_location_county_string, - CASE - WHEN ( - recipient_location_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') - AND recipient_location_state_name IS NOT NULL - AND recipient_location_state_fips IS NOT NULL - AND recipient_location_county_code IS NOT NULL - AND recipient_location_county_name IS NOT NULL - ) - THEN TO_JSONB( - JSONB_BUILD_OBJECT( - 'county_name', UPPER(recipient_location_county_name), - 'county_fips', 
CONCAT(recipient_location_state_fips, recipient_location_county_code), - 'state_name', UPPER(recipient_location_state_name), - 'country_name', 'UNITED STATES', - 'location_type', 'county' - ) - ) - END AS recipient_location_county_json, - -- Zip code - CASE - WHEN ( - recipient_location_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') - AND recipient_location_state_name IS NOT NULL - AND recipient_location_zip5 IS NOT NULL - ) - THEN recipient_location_zip5 - END AS recipient_location_zip_string, - CASE - WHEN ( - recipient_location_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') - AND recipient_location_state_name IS NOT NULL - AND recipient_location_zip5 IS NOT NULL - ) - THEN TO_JSONB( - JSONB_BUILD_OBJECT( - 'zip_code', recipient_location_zip5, - 'state_name', UPPER(recipient_location_state_name), + 'current_cd', + CONCAT(UPPER(recipient_location_state_code), '-', recipient_location_congressional_code_current), + 'state_name', UPPER(sd.name), 'country_name', 'UNITED STATES', - 'location_type', 'zip_code' + 'location_type', 'current_cd' ) ) - END AS recipient_location_zip_json, - -- Current Congressional district - CASE - WHEN ( - recipient_location_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') - AND recipient_location_state_name IS NOT NULL - AND recipient_location_congressional_code_current IS NOT NULL - AND recipient_location_state_code IS NOT NULL - ) - THEN CONCAT(UPPER(recipient_location_state_code), recipient_location_congressional_code_current) - END AS recipient_location_current_cd_string, - CASE - WHEN ( - recipient_location_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') - AND recipient_location_state_name IS NOT NULL - AND recipient_location_congressional_code_current IS NOT NULL - AND recipient_location_state_code IS NOT NULL - ) - THEN TO_JSONB( + FROM + rpt.transaction_search + RIGHT JOIN + state_data AS sd ON sd.code = recipient_location_state_code + WHERE + recipient_location_state_code IS NOT NULL + AND + recipient_location_congressional_code_current ~ '^[0-9]{2}$' + ), + -- Original Congressional district + original_cd_pop_cte AS ( + SELECT + CONCAT(UPPER(pop_state_code), pop_congressional_code) AS location, + TO_JSONB( JSONB_BUILD_OBJECT( - 'current_cd', CONCAT(UPPER(recipient_location_state_code), '-', recipient_location_congressional_code_current), - 'state_name', UPPER(recipient_location_state_name), + 'original_cd', CONCAT(UPPER(pop_state_code), '-', pop_congressional_code), + 'state_name', UPPER(sd.name), 'country_name', 'UNITED STATES', - 'location_type', 'current_cd' + 'location_type', 'original_cd' ) ) - END AS recipient_location_current_cd_json, - -- Original Congressional district - CASE - WHEN ( - recipient_location_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') - AND recipient_location_state_name IS NOT NULL - AND recipient_location_congressional_code IS NOT NULL - AND recipient_location_state_code IS NOT NULL - ) - THEN CONCAT(UPPER(recipient_location_state_code), recipient_location_congressional_code) - END AS recipient_location_original_cd_string, - CASE - WHEN ( - recipient_location_country_name IN ('UNITED STATES OF AMERICA', 'UNITED STATES') - AND recipient_location_state_name IS NOT NULL - AND recipient_location_congressional_code IS NOT NULL - AND recipient_location_state_code IS NOT NULL - ) - THEN TO_JSONB( + FROM + rpt.transaction_search + RIGHT JOIN + state_data AS sd ON sd.code = pop_state_code + WHERE + pop_state_code IS NOT NULL + AND + pop_congressional_code ~ 
'^[0-9]{2}$' + ), + original_cd_rl_cte AS ( + SELECT + CONCAT(UPPER(recipient_location_state_code), recipient_location_congressional_code) AS location, + TO_JSONB( JSONB_BUILD_OBJECT( 'original_cd', CONCAT(UPPER(recipient_location_state_code), '-', recipient_location_congressional_code), - 'state_name', UPPER(recipient_location_state_name), + 'state_name', UPPER(sd.name), 'country_name', 'UNITED STATES', 'location_type', 'original_cd' ) ) - END AS recipient_location_original_cd_json -FROM - rpt.transaction_search -WHERE - pop_country_name IS NOT NULL - OR - recipient_location_country_name IS NOT NULL -), -normalized_column_names_cte AS ( - SELECT - pop_country_string AS location, - pop_country_json AS location_json - FROM - transaction_locations_cte - UNION - SELECT - pop_state_string AS location, - pop_state_json AS location_json - FROM - transaction_locations_cte - UNION - SELECT - pop_city_string AS location, - pop_city_json AS location_json - FROM - transaction_locations_cte - UNION - SELECT - pop_county_string AS location, - pop_county_json AS location_json - FROM - transaction_locations_cte - UNION - SELECT - pop_zip_string AS location, - pop_zip_json AS location_json - FROM - transaction_locations_cte - UNION - SELECT - pop_current_cd_string AS location, - pop_current_cd_json AS location_json - FROM - transaction_locations_cte - UNION - SELECT - pop_original_cd_string AS location, - pop_original_cd_json AS location_json - FROM - transaction_locations_cte - UNION - SELECT - recipient_location_country_string AS location, - recipient_location_country_json AS location_json - FROM - transaction_locations_cte - UNION - SELECT - recipient_location_state_string AS location, - recipient_location_state_json AS location_json - FROM - transaction_locations_cte - UNION - SELECT - recipient_location_city_string AS location, - recipient_location_city_json AS location_json - FROM - transaction_locations_cte - UNION - SELECT - recipient_location_county_string AS location, - recipient_location_county_json AS location_json - FROM - transaction_locations_cte - UNION - SELECT - recipient_location_zip_string AS location, - recipient_location_zip_json AS location_json - FROM - transaction_locations_cte - UNION - SELECT - recipient_location_current_cd_string AS location, - recipient_location_current_cd_json AS location_json - FROM - transaction_locations_cte - UNION - SELECT - recipient_location_original_cd_string AS location, - recipient_location_original_cd_json AS location_json - FROM - transaction_locations_cte -) + FROM + rpt.transaction_search + RIGHT JOIN + state_data AS sd ON sd.code = recipient_location_state_code + WHERE + recipient_location_state_code IS NOT NULL + AND + recipient_location_congressional_code ~ '^[0-9]{2}$' + ) SELECT ROW_NUMBER() OVER (ORDER BY location, location_json) AS id, location, location_json FROM - normalized_column_names_cte -WHERE - -- Only include locations that have at least two characters - location ~ '[A-Z0-9].*[A-Z0-9]' - and - location_json IS NOT NULL + ( + SELECT * FROM country_cte + UNION + SELECT * FROM state_cte + UNION + SELECT * FROM city_domestic + UNION + SELECT * FROM city_foreign_pop_cte + UNION + SELECT * FROM city_foreign_rl_cte + UNION + SELECT * FROM county_cte + UNION + SELECT * FROM zip_cte + UNION + SELECT * FROM current_cd_pop_cte + UNION + SELECT * FROM current_cd_rl_cte + UNION + SELECT * FROM original_cd_pop_cte + UNION + SELECT * FROM original_cd_rl_cte + ) AS union_all diff --git 
a/usaspending_api/disaster/management/commands/generate_covid19_download.py b/usaspending_api/disaster/management/commands/generate_covid19_download.py index 9218debb82..13b793f6e3 100644 --- a/usaspending_api/disaster/management/commands/generate_covid19_download.py +++ b/usaspending_api/disaster/management/commands/generate_covid19_download.py @@ -10,6 +10,7 @@ from usaspending_api.common.helpers.s3_helpers import upload_download_file_to_s3 from usaspending_api.common.helpers.spark_helpers import configure_spark_session, get_active_spark_session +from usaspending_api.common.helpers.sql_helpers import read_sql_file_to_text from usaspending_api.common.helpers.download_csv_strategies import ( PostgresToCSVStrategy, SparkToCSVStrategy, @@ -142,13 +143,15 @@ def process_data_copy_jobs(self): logger.info(f"Creating new COVID-19 download zip file: {self.zip_file_path}") self.filepaths_to_delete.append(self.zip_file_path) - for sql_file, final_name in self.download_file_list: + for source_sql, final_name in self.download_file_list: final_path = self._create_data_csv_dest_path(final_name) intermediate_data_file_path = final_path.parent / (final_path.name + "_temp") - data_file_names, count = self.download_to_csv( - sql_file, final_path, final_name, str(intermediate_data_file_path) + if self.compute_type_arg == ComputeTypeEnum.POSTGRES.value: + source_sql = read_sql_file_to_text(Path(source_sql)) + download_metadata = self.download_to_csv( + source_sql, final_path, final_name, str(intermediate_data_file_path) ) - if count <= 0: + if download_metadata.number_of_rows <= 0: logger.warning(f"Empty data file generated: {final_path}!") self.filepaths_to_delete.extend(self.working_dir_path.glob(f"{final_path.stem}*")) @@ -162,9 +165,9 @@ def complete_zip_and_upload(self): logger.info(f"Created database record {db_id} for future retrieval") logger.info("Marking zip file for deletion in cleanup") else: - logger.warn("Not uploading zip file to S3. Leaving file locally") + logger.warning("Not uploading zip file to S3. 
Leaving file locally") self.filepaths_to_delete.remove(self.zip_file_path) - logger.warn("Not creating database record") + logger.warning("Not creating database record") @property def download_file_list(self): diff --git a/usaspending_api/download/delta_models/account_download.py b/usaspending_api/download/delta_models/account_download.py new file mode 100644 index 0000000000..1381a7b399 --- /dev/null +++ b/usaspending_api/download/delta_models/account_download.py @@ -0,0 +1,363 @@ +ACCOUNT_DOWNLOAD_COLUMNS = { + "financial_accounts_by_awards_id": {"delta": "INTEGER NOT NULL", "postgres": "INTEGER NOT NULL"}, + "submission_id": {"delta": "INTEGER NOT NULL", "postgres": "INTEGER NOT NULL"}, + "owning_agency_name": {"delta": "STRING", "postgres": "TEXT"}, + "federal_account_symbol": {"delta": "STRING", "postgres": "TEXT"}, + "federal_account_name": {"delta": "STRING", "postgres": "TEXT"}, + "agency_identifier_name": {"delta": "STRING", "postgres": "TEXT"}, + "program_activity_code": {"delta": "STRING", "postgres": "TEXT"}, + "program_activity_name": {"delta": "STRING", "postgres": "TEXT"}, + "object_class_code": {"delta": "STRING", "postgres": "TEXT"}, + "object_class_name": {"delta": "STRING", "postgres": "TEXT"}, + "direct_or_reimbursable_funding_source": {"delta": "STRING", "postgres": "TEXT"}, + "disaster_emergency_fund_code": {"delta": "STRING", "postgres": "TEXT"}, + "disaster_emergency_fund_name": {"delta": "STRING", "postgres": "TEXT"}, + "award_unique_key": {"delta": "STRING", "postgres": "TEXT"}, + "award_id_piid": {"delta": "STRING", "postgres": "TEXT"}, + "parent_award_id_piid": {"delta": "STRING", "postgres": "TEXT"}, + "award_id_fain": {"delta": "STRING", "postgres": "TEXT"}, + "award_id_uri": {"delta": "STRING", "postgres": "TEXT"}, + "award_base_action_date": {"delta": "DATE", "postgres": "DATE"}, + "award_latest_action_date": {"delta": "DATE", "postgres": "DATE"}, + "period_of_performance_start_date": {"delta": "DATE", "postgres": "DATE"}, + "period_of_performance_current_end_date": {"delta": "DATE", "postgres": "DATE"}, + "ordering_period_end_date": {"delta": "DATE", "postgres": "DATE"}, + "idv_type_code": {"delta": "STRING", "postgres": "TEXT"}, + "idv_type": {"delta": "STRING", "postgres": "TEXT"}, + "prime_award_base_transaction_description": {"delta": "STRING", "postgres": "TEXT"}, + "awarding_agency_code": {"delta": "STRING", "postgres": "TEXT"}, + "awarding_agency_name": {"delta": "STRING", "postgres": "TEXT"}, + "awarding_subagency_code": {"delta": "STRING", "postgres": "TEXT"}, + "awarding_subagency_name": {"delta": "STRING", "postgres": "TEXT"}, + "awarding_office_code": {"delta": "STRING", "postgres": "TEXT"}, + "awarding_office_name": {"delta": "STRING", "postgres": "TEXT"}, + "funding_agency_code": {"delta": "STRING", "postgres": "TEXT"}, + "funding_agency_name": {"delta": "STRING", "postgres": "TEXT"}, + "funding_sub_agency_code": {"delta": "STRING", "postgres": "TEXT"}, + "funding_sub_agency_name": {"delta": "STRING", "postgres": "TEXT"}, + "funding_office_code": {"delta": "STRING", "postgres": "TEXT"}, + "funding_office_name": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_uei": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_duns": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_name": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_name_raw": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_parent_uei": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_parent_duns": {"delta": "STRING", "postgres": "TEXT"}, + 
"recipient_parent_name": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_parent_name_raw": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_country": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_state": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_county": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_city": {"delta": "STRING", "postgres": "TEXT"}, + "primary_place_of_performance_country": {"delta": "STRING", "postgres": "TEXT"}, + "primary_place_of_performance_state": {"delta": "STRING", "postgres": "TEXT"}, + "primary_place_of_performance_county": {"delta": "STRING", "postgres": "TEXT"}, + "primary_place_of_performance_zip_code": {"delta": "STRING", "postgres": "TEXT"}, + "cfda_number": {"delta": "STRING", "postgres": "TEXT"}, + "cfda_title": {"delta": "STRING", "postgres": "TEXT"}, + "product_or_service_code": {"delta": "STRING", "postgres": "TEXT"}, + "product_or_service_code_description": {"delta": "STRING", "postgres": "TEXT"}, + "naics_code": {"delta": "STRING", "postgres": "TEXT"}, + "naics_description": {"delta": "STRING", "postgres": "TEXT"}, + "national_interest_action_code": {"delta": "STRING", "postgres": "TEXT"}, + "national_interest_action": {"delta": "STRING", "postgres": "TEXT"}, + "reporting_agency_name": {"delta": "STRING", "postgres": "TEXT"}, + "submission_period": {"delta": "STRING", "postgres": "TEXT"}, + "funding_toptier_agency_id": {"delta": "INTEGER", "postgres": "INTEGER"}, + "federal_account_id": {"delta": "INTEGER", "postgres": "INTEGER"}, + "budget_function": {"delta": "STRING", "postgres": "TEXT"}, + "budget_function_code": {"delta": "STRING", "postgres": "TEXT"}, + "budget_subfunction": {"delta": "STRING", "postgres": "TEXT"}, + "budget_subfunction_code": {"delta": "STRING", "postgres": "TEXT"}, + "transaction_obligated_amount": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)"}, + "gross_outlay_amount_fyb_to_period_end": {"delta": "NUMERIC(23,2)", "postgres": "NUMERIC(23,2)"}, + "ussgl487200_downward_adj_prior_year_prepaid_undeliv_order_oblig": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + }, + "ussgl497200_downward_adj_of_prior_year_paid_deliv_orders_oblig": { + "delta": "NUMERIC(23,2)", + "postgres": "NUMERIC(23,2)", + }, + "award_base_action_date_fiscal_year": {"delta": "INTEGER", "postgres": "INTEGER"}, + "award_latest_action_date_fiscal_year": {"delta": "INTEGER", "postgres": "INTEGER"}, + "award_type_code": {"delta": "STRING", "postgres": "TEXT"}, + "award_type": {"delta": "STRING", "postgres": "TEXT"}, + "prime_award_summary_recipient_cd_original": {"delta": "STRING", "postgres": "TEXT"}, + "prime_award_summary_recipient_cd_current": {"delta": "STRING", "postgres": "TEXT"}, + "recipient_zip_code": {"delta": "STRING", "postgres": "TEXT"}, + "prime_award_summary_place_of_performance_cd_original": {"delta": "STRING", "postgres": "TEXT"}, + "prime_award_summary_place_of_performance_cd_current": {"delta": "STRING", "postgres": "TEXT"}, + "usaspending_permalink": {"delta": "STRING", "postgres": "TEXT"}, + "last_modified_date": {"delta": "DATE", "postgres": "DATE"}, + "reporting_fiscal_period": {"delta": "INTEGER", "postgres": "INTEGER"}, + "reporting_fiscal_quarter": {"delta": "INTEGER", "postgres": "INTEGER"}, + "reporting_fiscal_year": {"delta": "INTEGER", "postgres": "INTEGER"}, + "quarter_format_flag": {"delta": "BOOLEAN", "postgres": "BOOLEAN"}, +} + +ACCOUNT_DOWNLOAD_DELTA_COLUMNS = {k: v["delta"] for k, v in ACCOUNT_DOWNLOAD_COLUMNS.items()} +ACCOUNT_DOWNLOAD_POSTGRES_COLUMNS = {k: 
v["postgres"] for k, v in ACCOUNT_DOWNLOAD_COLUMNS.items()} + +account_download_create_sql_string = rf""" + CREATE OR REPLACE TABLE {{DESTINATION_TABLE}} ( + {", ".join([f'{key} {val}' for key, val in ACCOUNT_DOWNLOAD_DELTA_COLUMNS.items()])} + ) + USING DELTA + LOCATION 's3a://{{SPARK_S3_BUCKET}}/{{DELTA_LAKE_S3_PATH}}/{{DESTINATION_DATABASE}}/{{DESTINATION_TABLE}}' + """ + +account_download_load_sql_string = rf""" + INSERT OVERWRITE {{DESTINATION_DATABASE}}.{{DESTINATION_TABLE}} ( + {",".join(list(ACCOUNT_DOWNLOAD_COLUMNS))} + ) + SELECT + financial_accounts_by_awards.financial_accounts_by_awards_id, + financial_accounts_by_awards.submission_id, + toptier_agency.name AS owning_agency_name, + federal_account.federal_account_code AS federal_account_symbol, + federal_account.account_title AS federal_account_name, + cgac_aid.agency_name AS agency_identifier_name, + ref_program_activity.program_activity_code, + ref_program_activity.program_activity_name, + object_class.object_class AS object_class_code, + object_class.object_class_name, + object_class.direct_reimbursable AS direct_or_reimbursable_funding_source, + financial_accounts_by_awards.disaster_emergency_fund_code, + disaster_emergency_fund_code.title AS disaster_emergency_fund_name, + award_search.generated_unique_award_id AS award_unique_key, + financial_accounts_by_awards.piid AS award_id_piid, + financial_accounts_by_awards.parent_award_id AS parent_award_id_piid, + financial_accounts_by_awards.fain AS award_id_fain, + financial_accounts_by_awards.uri AS award_id_uri, + CAST(award_search.date_signed AS DATE) AS award_base_action_date, + CAST(award_search.certified_date AS DATE) AS award_latest_action_date, + CAST(award_search.period_of_performance_start_date AS DATE), + CAST(award_search.period_of_performance_current_end_date AS DATE), + CAST(transaction_search.ordering_period_end_date AS DATE), + transaction_search.idv_type AS idv_type_code, + transaction_search.idv_type_description AS idv_type, + award_search.description AS prime_award_base_transaction_description, + transaction_search.awarding_agency_code, + transaction_search.awarding_toptier_agency_name_raw AS awarding_agency_name, + transaction_search.awarding_sub_tier_agency_c AS awarding_subagency_code, + transaction_search.awarding_subtier_agency_name_raw AS awarding_subagency_name, + transaction_search.awarding_office_code, + transaction_search.awarding_office_name, + transaction_search.funding_agency_code, + transaction_search.funding_toptier_agency_name_raw AS funding_agency_name, + transaction_search.funding_sub_tier_agency_co AS funding_sub_agency_code, + transaction_search.funding_subtier_agency_name_raw AS funding_sub_agency_name, + transaction_search.funding_office_code, + transaction_search.funding_office_name, + transaction_search.recipient_uei, + transaction_search.recipient_unique_id AS recipient_duns, + transaction_search.recipient_name, + transaction_search.recipient_name_raw, + transaction_search.parent_uei AS recipient_parent_uei, + transaction_search.parent_uei AS recipient_parent_duns, + transaction_search.parent_recipient_name AS recipient_parent_name, + transaction_search.parent_recipient_name_raw AS recipient_parent_name_raw, + transaction_search.recipient_location_country_code AS recipient_country, + transaction_search.recipient_location_state_code AS recipient_state, + transaction_search.recipient_location_county_name AS recipient_county, + transaction_search.recipient_location_city_name AS recipient_city, + transaction_search.pop_country_name AS 
primary_place_of_performance_country, + transaction_search.pop_state_name AS primary_place_of_performance_state, + transaction_search.pop_county_name AS primary_place_of_performance_county, + transaction_search.place_of_performance_zip4a AS primary_place_of_performance_zip_code, + transaction_search.cfda_number, + transaction_search.cfda_title, + transaction_search.product_or_service_code, + transaction_search.product_or_service_description AS product_or_service_code_description, + transaction_search.naics_code, + transaction_search.naics_description, + transaction_search.national_interest_action AS national_interest_action_code, + transaction_search.national_interest_desc AS national_interest_action, + submission_attributes.reporting_agency_name AS reporting_agency_name, + CASE + WHEN submission_attributes.quarter_format_flag = TRUE + THEN + CONCAT( + CAST('FY' AS STRING), + CAST(submission_attributes.reporting_fiscal_year AS STRING), + CAST('Q' AS STRING), + CAST( + submission_attributes.reporting_fiscal_quarter AS STRING + ) + ) + ELSE + CONCAT( + CAST('FY' AS STRING), + CAST(submission_attributes.reporting_fiscal_year AS STRING), + CAST('P' AS STRING), + LPAD( + CAST( + submission_attributes.reporting_fiscal_period AS STRING + ), + 2, + '0' + ) + ) + END AS submission_period, + treasury_appropriation_account.funding_toptier_agency_id AS funding_toptier_agency_id, + treasury_appropriation_account.federal_account_id AS federal_account_id, + treasury_appropriation_account.budget_function_title AS budget_function, + treasury_appropriation_account.budget_function_code AS budget_function_code, + treasury_appropriation_account.budget_subfunction_title AS budget_subfunction, + treasury_appropriation_account.budget_subfunction_code AS budget_subfunction_code, + financial_accounts_by_awards.transaction_obligated_amount AS transaction_obligated_amount, + financial_accounts_by_awards.gross_outlay_amount_by_award_cpe as gross_outlay_amount_fyb_to_period_end, + financial_accounts_by_awards.ussgl487200_down_adj_pri_ppaid_undel_orders_oblig_refund_cpe as ussgl487200_downward_adj_prior_year_prepaid_undeliv_order_oblig, + financial_accounts_by_awards.ussgl497200_down_adj_pri_paid_deliv_orders_oblig_refund_cpe as ussgl497200_downward_adj_of_prior_year_paid_deliv_orders_oblig, + EXTRACT( + YEAR FROM (award_search.date_signed) + INTERVAL '3 months' + ) AS award_base_action_date_fiscal_year, + EXTRACT( + YEAR FROM (award_search.certified_date) + INTERVAL '3 months' + ) AS award_latest_action_date_fiscal_year, + COALESCE( + transaction_search.contract_award_type, + transaction_search.type + ) AS award_type_code, + COALESCE( + transaction_search.contract_award_type_desc, + transaction_search.type_description + ) AS award_type, + CASE + WHEN + transaction_search.recipient_location_state_code IS NOT NULL + AND transaction_search.recipient_location_congressional_code IS NOT NULL + AND NOT ( + transaction_search.recipient_location_state_code = '' + AND transaction_search.recipient_location_state_code IS NOT NULL + ) + THEN + CONCAT( + transaction_search.recipient_location_state_code, '-', + transaction_search.recipient_location_congressional_code + ) + ELSE transaction_search.recipient_location_congressional_code + END AS prime_award_summary_recipient_cd_original, + CASE + WHEN + transaction_search.recipient_location_state_code IS NOT NULL + AND transaction_search.recipient_location_congressional_code_current IS NOT NULL + AND NOT ( + transaction_search.recipient_location_state_code = '' + AND 
transaction_search.recipient_location_state_code IS NOT NULL + ) + THEN + CONCAT( + transaction_search.recipient_location_state_code, '-', + transaction_search.recipient_location_congressional_code_current + ) + ELSE transaction_search.recipient_location_congressional_code_current + END AS prime_award_summary_recipient_cd_current, + COALESCE( + transaction_search.legal_entity_zip4, + CONCAT( + CAST(transaction_search.recipient_location_zip5 AS STRING), + CAST(transaction_search.legal_entity_zip_last4 AS STRING) + ) + ) AS recipient_zip_code, + CASE + WHEN + transaction_search.pop_state_code IS NOT NULL + AND transaction_search.pop_congressional_code IS NOT NULL + AND NOT ( + transaction_search.pop_state_code = '' + AND transaction_search.pop_state_code IS NOT NULL + ) + THEN + CONCAT( + transaction_search.pop_state_code, + '-', + transaction_search.pop_congressional_code + ) + ELSE transaction_search.pop_congressional_code + END AS prime_award_summary_place_of_performance_cd_original, + CASE + WHEN + transaction_search.pop_state_code IS NOT NULL + AND transaction_search.pop_congressional_code_current IS NOT NULL + AND NOT ( + transaction_search.pop_state_code = '' + AND transaction_search.pop_state_code IS NOT NULL + ) + THEN + CONCAT( + transaction_search.pop_state_code, + '-', + transaction_search.pop_congressional_code_current + ) + ELSE transaction_search.pop_congressional_code_current + END AS prime_award_summary_place_of_performance_cd_current, + CASE + WHEN award_search.generated_unique_award_id IS NOT NULL + THEN + CONCAT( + '{{AWARD_URL}}', + URL_ENCODE(award_search.generated_unique_award_id), + '/' + ) + ELSE '' + END AS usaspending_permalink, + CAST(submission_attributes.published_date AS DATE) AS last_modified_date, + submission_attributes.reporting_fiscal_period, + submission_attributes.reporting_fiscal_quarter, + submission_attributes.reporting_fiscal_year, + submission_attributes.quarter_format_flag + FROM raw.financial_accounts_by_awards + INNER JOIN global_temp.submission_attributes AS submission_attributes + ON ( + financial_accounts_by_awards.submission_id + = submission_attributes.submission_id + ) + LEFT OUTER JOIN global_temp.treasury_appropriation_account + ON ( + financial_accounts_by_awards.treasury_account_id + = treasury_appropriation_account.treasury_account_identifier + ) + LEFT OUTER JOIN award_search + ON ( + financial_accounts_by_awards.award_id = award_search.award_id + ) + LEFT OUTER JOIN transaction_search + ON ( + award_search.latest_transaction_search_id + = transaction_search.transaction_id + ) + LEFT OUTER JOIN global_temp.ref_program_activity + ON ( + financial_accounts_by_awards.program_activity_id + = ref_program_activity.id + ) + LEFT OUTER JOIN global_temp.object_class + ON ( + financial_accounts_by_awards.object_class_id = object_class.id + ) + LEFT OUTER JOIN global_temp.disaster_emergency_fund_code + ON ( + financial_accounts_by_awards.disaster_emergency_fund_code + = disaster_emergency_fund_code.code + ) + LEFT OUTER JOIN global_temp.federal_account + ON ( + treasury_appropriation_account.federal_account_id = federal_account.id + ) + LEFT OUTER JOIN global_temp.toptier_agency + ON ( + federal_account.parent_toptier_agency_id + = toptier_agency.toptier_agency_id + ) + LEFT OUTER JOIN global_temp.cgac AS cgac_aid + ON ( + treasury_appropriation_account.agency_id = cgac_aid.cgac_code + ) + LEFT OUTER JOIN global_temp.cgac AS cgac_ata + ON ( + treasury_appropriation_account.allocation_transfer_agency_id + = cgac_ata.cgac_code + ); + """ diff 
--git a/usaspending_api/download/management/__init__.py b/usaspending_api/download/management/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/usaspending_api/download/management/commands/__init__.py b/usaspending_api/download/management/commands/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/usaspending_api/download/management/commands/delta_downloads/__init__.py b/usaspending_api/download/management/commands/delta_downloads/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/__init__.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/builders.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/builders.py new file mode 100644 index 0000000000..fcf871a86a --- /dev/null +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/builders.py @@ -0,0 +1,134 @@ +from dataclasses import dataclass +from functools import reduce +from typing import Any + +from pyspark.sql import DataFrame, SparkSession +from pyspark.sql import functions as sf, Column + +from usaspending_api.download.management.commands.delta_downloads.award_financial.filters import AccountDownloadFilter +from usaspending_api.download.management.commands.delta_downloads.award_financial.columns import ( + federal_account_groupby_cols, + federal_account_select_cols, +) +from usaspending_api.submissions.helpers import get_submission_ids_for_periods + + +class AccountDownloadDataFrameBuilder: + + def __init__( + self, + spark: SparkSession, + account_download_filter: AccountDownloadFilter, + table_name: str = "rpt.account_download", + ): + self.reporting_fiscal_year = account_download_filter.fy + self.reporting_fiscal_quarter = account_download_filter.quarter or account_download_filter.period // 3 + self.reporting_fiscal_period = account_download_filter.period or account_download_filter.quarter * 3 + self.agency = account_download_filter.agency + self.federal_account_id = account_download_filter.federal_account + self.budget_function = account_download_filter.budget_function + self.budget_subfunction = account_download_filter.budget_subfunction + self.def_codes = account_download_filter.def_codes + self.df: DataFrame = spark.table(table_name) + self.groupby_cols: list[str] = federal_account_groupby_cols + self.select_cols: list[str] = federal_account_select_cols + + def filter_to_latest_submissions_for_agencies(self, col_name: str, otherwise: Any = None) -> Column: + """Filter to the latest submission regardless of whether the agency submitted on a monthly or quarterly basis""" + return ( + sf.when( + sf.col("submission_id").isin( + get_submission_ids_for_periods( + self.reporting_fiscal_year, self.reporting_fiscal_quarter, self.reporting_fiscal_period + ) + ), + sf.col(col_name), + ) + .otherwise(otherwise) + .alias(col_name) + ) + + @property + def combined_filters(self) -> Column: + + @dataclass + class Condition: + name: str + condition: Column + apply: bool + + conditions = [ + Condition(name="year", condition=sf.col("reporting_fiscal_year") == self.reporting_fiscal_year, apply=True), + Condition( + name="quarter or month", + condition=( + (sf.col("reporting_fiscal_period") <= self.reporting_fiscal_period) & ~sf.col("quarter_format_flag") + ) + | (
(sf.col("reporting_fiscal_quarter") <= self.reporting_fiscal_quarter) + & sf.col("quarter_format_flag") + ), + apply=True, + ), + Condition( + name="agency", condition=sf.col("funding_toptier_agency_id") == self.agency, apply=bool(self.agency) + ), + Condition( + name="federal account", + condition=sf.col("federal_account_id") == self.federal_account_id, + apply=bool(self.federal_account_id), + ), + Condition( + name="budget function", + condition=sf.col("budget_function_code") == self.budget_function, + apply=bool(self.budget_function), + ), + Condition( + name="budget subfunction", + condition=sf.col("budget_subfunction_code") == self.budget_subfunction, + apply=bool(self.budget_subfunction), + ), + Condition( + name="def_codes", + condition=sf.col("disaster_emergency_fund_code").isin(self.def_codes), + apply=bool(self.def_codes), + ), + ] + return reduce( + lambda x, y: x & y, + [condition.condition for condition in conditions if condition.apply], + ) + + @staticmethod + def collect_concat(col_name: str, concat_str: str = "; ") -> Column: + return sf.concat_ws(concat_str, sf.sort_array(sf.collect_set(col_name))).alias(col_name) + + @property + def source_df(self) -> DataFrame: + return ( + self.df.filter(self.combined_filters) + .groupBy(self.groupby_cols) + .agg( + *[ + self.collect_concat(col) + for col in ["reporting_agency_name", "budget_function", "budget_subfunction"] + ], + sf.sum("transaction_obligated_amount").alias("transaction_obligated_amount"), + *[ + sf.sum(self.filter_to_latest_submissions_for_agencies(col)).alias(col) + for col in [ + "gross_outlay_amount_FYB_to_period_end", + "USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig", + "USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig", + ] + ], + sf.max(sf.col("last_modified_date")).alias("last_modified_date"), + ) + .filter( + (sf.col("gross_outlay_amount_FYB_to_period_end") != 0) + | (sf.col("USSGL487200_downward_adj_prior_year_prepaid_undeliv_order_oblig") != 0) + | (sf.col("USSGL497200_downward_adj_of_prior_year_paid_deliv_orders_oblig") != 0) + | (sf.col("transaction_obligated_amount") != 0) + ) + .select(self.select_cols) + ) diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/columns.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/columns.py new file mode 100644 index 0000000000..b36e956503 --- /dev/null +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/columns.py @@ -0,0 +1,80 @@ +from usaspending_api.download.v2.download_column_historical_lookups import query_paths + +federal_account_groupby_cols = [ + "owning_agency_name", + "federal_account_symbol", + "federal_account_name", + "agency_identifier_name", + "program_activity_code", + "program_activity_name", + "object_class_code", + "object_class_name", + "direct_or_reimbursable_funding_source", + "disaster_emergency_fund_code", + "disaster_emergency_fund_name", + "award_unique_key", + "award_id_piid", + "parent_award_id_piid", + "award_id_fain", + "award_id_uri", + "award_base_action_date", + "award_latest_action_date", + "period_of_performance_start_date", + "period_of_performance_current_end_date", + "ordering_period_end_date", + "idv_type_code", + "idv_type", + "prime_award_base_transaction_description", + "awarding_agency_code", + "awarding_agency_name", + "awarding_subagency_code", + "awarding_subagency_name", + "awarding_office_code", + "awarding_office_name", + "funding_agency_code", + "funding_agency_name", + 
"funding_sub_agency_code", + "funding_sub_agency_name", + "funding_office_code", + "funding_office_name", + "recipient_uei", + "recipient_duns", + "recipient_name", + "recipient_name_raw", + "recipient_parent_uei", + "recipient_parent_duns", + "recipient_parent_name", + "recipient_parent_name_raw", + "recipient_country", + "recipient_state", + "recipient_county", + "recipient_city", + "primary_place_of_performance_country", + "primary_place_of_performance_state", + "primary_place_of_performance_county", + "primary_place_of_performance_zip_code", + "cfda_number", + "cfda_title", + "product_or_service_code", + "product_or_service_code_description", + "naics_code", + "naics_description", + "national_interest_action_code", + "national_interest_action", + "submission_period", + "award_type_code", + "award_type", + "recipient_zip_code", + "award_base_action_date_fiscal_year", + "award_latest_action_date_fiscal_year", + "usaspending_permalink", + "prime_award_summary_recipient_cd_original", + "prime_award_summary_recipient_cd_current", + "prime_award_summary_place_of_performance_cd_original", + "prime_award_summary_place_of_performance_cd_current", +] + +federal_account_select_cols = [ + col if not col.startswith("last_modified_date") else "last_modified_date" + for col in query_paths["award_financial"]["federal_account"].keys() +] diff --git a/usaspending_api/download/management/commands/delta_downloads/award_financial/filters.py b/usaspending_api/download/management/commands/delta_downloads/award_financial/filters.py new file mode 100644 index 0000000000..dbe21264e8 --- /dev/null +++ b/usaspending_api/download/management/commands/delta_downloads/award_financial/filters.py @@ -0,0 +1,73 @@ +import warnings +from typing import Any + +from pydantic import BaseModel, root_validator, validator +from pydantic.fields import ModelField + +from usaspending_api.accounts.models import FederalAccount +from usaspending_api.common.exceptions import InvalidParameterException +from usaspending_api.references.models import ToptierAgency + + +class AccountDownloadFilter(BaseModel): + fy: int + period: int | None = None + quarter: int | None = None + agency: int | None = None + federal_account: int | None = None + budget_function: str | None = None + budget_subfunction: str | None = None + def_codes: list[str] | None = None + + @validator("fy", "period", "quarter", "agency", "federal_account", pre=True) + @classmethod + def ensure_int_or_none(cls, value: Any, field: ModelField) -> Any: + if value == "all": + result = None + elif value is None: + result = value + elif not isinstance(value, int): + try: + result = int(value) + except ValueError: + raise InvalidParameterException(f"{field.name} must be an integer.") + else: + result = value + return result + + @validator("budget_function", "budget_subfunction", pre=True) + @classmethod + def check_for_all(cls, value: Any) -> Any: + if value == "all": + return None + else: + return value + + @validator("agency") + @classmethod + def check_agency_exists(cls, value: Any) -> Any: + if value is not None and not ToptierAgency.objects.filter(toptier_agency_id=value).exists(): + raise InvalidParameterException("Agency with that ID does not exist") + return value + + @validator("federal_account") + @classmethod + def check_federal_account_exists(cls, value: Any) -> Any: + if value is not None and not FederalAccount.objects.filter(id=value).exists(): + raise InvalidParameterException("Federal Account with that ID does not exist") + return value + + @root_validator + 
@classmethod + def check_period_quarter(cls, values: dict[str, Any]) -> dict[str, Any]: + period, quarter = values.get("period"), values.get("quarter") + if period is None and quarter is None: + raise InvalidParameterException("Must define period or quarter.") + if period is not None and quarter is not None: + values["quarter"] = quarter = None + warnings.warn("Both quarter and period are set. Only using period.") + if period is not None and period not in range(2, 13): + raise InvalidParameterException("Period must be between 2 and 12") + if quarter is not None and quarter not in range(1, 5): + raise InvalidParameterException("Quarter must be between 1 and 4") + return values diff --git a/usaspending_api/download/management/commands/generate_spark_download.py b/usaspending_api/download/management/commands/generate_spark_download.py new file mode 100644 index 0000000000..8b5d5e43e5 --- /dev/null +++ b/usaspending_api/download/management/commands/generate_spark_download.py @@ -0,0 +1,201 @@ +import json +import logging +import os +import traceback +from pathlib import Path +from typing import Optional, Dict, Tuple, Type, List, Union + +from django.conf import settings +from django.core.management.base import BaseCommand +from django.utils.functional import cached_property +from pyspark.sql import SparkSession + +from usaspending_api.common.etl.spark import create_ref_temp_views +from usaspending_api.common.exceptions import InvalidParameterException +from usaspending_api.common.helpers.download_csv_strategies import SparkToCSVStrategy +from usaspending_api.common.helpers.s3_helpers import upload_download_file_to_s3 +from usaspending_api.common.helpers.spark_helpers import ( + configure_spark_session, + get_active_spark_session, + get_jdbc_connection_properties, + get_usas_jdbc_url, +) +from usaspending_api.common.spark.configs import DEFAULT_EXTRA_CONF +from usaspending_api.download.filestreaming.download_generation import build_data_file_name +from usaspending_api.download.filestreaming.download_source import DownloadSource +from usaspending_api.download.lookups import JOB_STATUS_DICT, FILE_FORMATS, VALUE_MAPPINGS +from usaspending_api.download.management.commands.delta_downloads.award_financial.builders import ( + AccountDownloadDataFrameBuilder, +) +from usaspending_api.download.management.commands.delta_downloads.award_financial.filters import AccountDownloadFilter +from usaspending_api.download.models import DownloadJob +from usaspending_api.download.v2.request_validations import AccountDownloadValidator, DownloadValidatorBase + +logger = logging.getLogger(__name__) + +DOWNLOAD_SPEC = { + "award_financial": { + "federal_account": { + "df_builder": AccountDownloadDataFrameBuilder, + "validator_type": AccountDownloadValidator, + } + } +} + + +class Command(BaseCommand): + + help = "Generate a download zip file based on the provided type and level." 
+ + download_job_id: int + download_job: DownloadJob + download_level: str + download_query: str + download_source: DownloadSource + download_spec: Dict + download_type: str + download_validator_type: Type[DownloadValidatorBase] + file_format_spec: Dict + file_prefix: str + jdbc_properties: Dict + jdbc_url: str + should_cleanup: bool + spark: SparkSession + working_dir_path: Path + + def add_arguments(self, parser): + parser.add_argument("--download-type", type=str, required=True, choices=list(DOWNLOAD_SPEC)) + parser.add_argument( + "--download-level", + type=str, + required=True, + choices=set( + download_level + for download_level_list in [DOWNLOAD_SPEC[key] for key in DOWNLOAD_SPEC] + for download_level in download_level_list + ), + ) + parser.add_argument("--download-job-id", type=int, required=True) + parser.add_argument("--file-format", type=str, required=False, choices=list(FILE_FORMATS), default="csv") + parser.add_argument("--file-prefix", type=str, required=False, default="") + parser.add_argument("--skip-local-cleanup", action="store_true") + + def handle(self, *args, **options): + self.spark = get_active_spark_session() + spark_created_by_command = False + if not self.spark: + spark_created_by_command = True + self.spark = configure_spark_session(**DEFAULT_EXTRA_CONF, spark_context=self.spark) + + # Resolve Parameters + self.download_type = options["download_type"] + self.download_level = options["download_level"] + self.download_job_id = options["download_job_id"] + self.file_prefix = options["file_prefix"] + self.should_cleanup = not options["skip_local_cleanup"] + + if self.download_level not in DOWNLOAD_SPEC[self.download_type].keys(): + raise ValueError( + f'Provided download level of "{self.download_level}" is not supported ' + f'for download type of "{self.download_type}".' 
+ ) + + download_spec = DOWNLOAD_SPEC[self.download_type][self.download_level] + self.file_format_spec = FILE_FORMATS[options["file_format"]] + self.df_builder = download_spec["df_builder"] + self.download_validator_type = download_spec["validator_type"] + self.jdbc_properties = get_jdbc_connection_properties() + self.jdbc_url = get_usas_jdbc_url() + + self.working_dir_path = Path(settings.CSV_LOCAL_PATH) + if not self.working_dir_path.exists(): + self.working_dir_path.mkdir() + + create_ref_temp_views(self.spark) + + self.download_job, self.download_source = self.get_download_job() + self.process_download() + + if spark_created_by_command: + self.spark.stop() + + @cached_property + def download_name(self) -> str: + return self.download_job.file_name.replace(".zip", "") + + def get_download_job(self) -> Tuple[DownloadJob, DownloadSource]: + download_job = DownloadJob.objects.get(download_job_id=self.download_job_id) + if download_job.job_status_id != JOB_STATUS_DICT["ready"]: + raise InvalidParameterException(f"Download Job {self.download_job_id} is not ready.") + json_request = json.loads(download_job.json_request) + download_source = DownloadSource( + VALUE_MAPPINGS[self.download_type]["table_name"], + self.download_level, + self.download_type, + json_request.get("agency", "all"), + ) + download_source.file_name = build_data_file_name(download_source, download_job, piid=None, assistance_id=None) + + return download_job, download_source + + def process_download(self): + self.start_download() + files_to_cleanup = [] + try: + spark_to_csv_strategy = SparkToCSVStrategy(logger) + zip_file_path = self.working_dir_path / f"{self.download_name}.zip" + download_request = json.loads(self.download_job.json_request) + account_download_filter = AccountDownloadFilter(**download_request["filters"]) + source_df = self.df_builder(spark=self.spark, account_download_filter=account_download_filter).source_df + csv_metadata = spark_to_csv_strategy.download_to_csv( + source_sql=None, + destination_path=self.working_dir_path / self.download_name, + destination_file_name=self.download_name, + working_dir_path=self.working_dir_path, + download_zip_path=zip_file_path, + source_df=source_df, + ) + files_to_cleanup.extend(csv_metadata.filepaths) + self.download_job.file_size = os.stat(zip_file_path).st_size + self.download_job.number_of_rows = csv_metadata.number_of_rows + self.download_job.number_of_columns = csv_metadata.number_of_columns + upload_download_file_to_s3(zip_file_path) + except InvalidParameterException as e: + exc_msg = "InvalidParameterException was raised while attempting to process the DownloadJob" + self.fail_download(exc_msg, e) + raise + except Exception as e: + exc_msg = "An exception was raised while attempting to process the DownloadJob" + self.fail_download(exc_msg, e) + raise + finally: + if self.should_cleanup: + self.cleanup(files_to_cleanup) + self.finish_download() + + def start_download(self) -> None: + self.download_job.job_status_id = JOB_STATUS_DICT["running"] + self.download_job.save() + logger.info(f"Starting DownloadJob {self.download_job.download_job_id}") + + def fail_download(self, msg: str, e: Optional[Exception] = None) -> None: + if e: + stack_trace = "".join(traceback.format_exception(type(e), value=e, tb=e.__traceback__)) + self.download_job.error_message = f"{msg}:\n{stack_trace}" + else: + self.download_job.error_message = msg + logger.error(msg) + self.download_job.job_status_id = JOB_STATUS_DICT["failed"] + self.download_job.save() + + def finish_download(self) -> 
None: + self.download_job.job_status_id = JOB_STATUS_DICT["finished"] + self.download_job.save() + logger.info(f"Finished processing DownloadJob {self.download_job.download_job_id}") + + def cleanup(self, path_list: List[Union[Path, str]]) -> None: + for path in path_list: + if isinstance(path, str): + path = Path(path) + logger.info(f"Removing {path}") + path.unlink() diff --git a/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py b/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py new file mode 100644 index 0000000000..c8953bc390 --- /dev/null +++ b/usaspending_api/download/tests/integration/test_account_download_dataframe_builder.py @@ -0,0 +1,131 @@ +from unittest.mock import patch + +import pandas as pd +import pytest +from django.core.management import call_command +from model_bakery import baker +from usaspending_api.download.management.commands.delta_downloads.award_financial.columns import ( + federal_account_select_cols, + federal_account_groupby_cols, +) +from usaspending_api.download.management.commands.delta_downloads.award_financial.builders import ( + AccountDownloadDataFrameBuilder, +) +from usaspending_api.download.management.commands.delta_downloads.award_financial.filters import AccountDownloadFilter + + +@pytest.fixture(scope="function") +def account_download_table(spark, s3_unittest_data_bucket, hive_unittest_metastore_db): + call_command( + "create_delta_table", + f"--destination-table=account_download", + f"--spark-s3-bucket={s3_unittest_data_bucket}", + ) + columns = list(set(federal_account_select_cols + federal_account_groupby_cols)) + [ + "reporting_fiscal_year", + "reporting_fiscal_quarter", + "reporting_fiscal_period", + "quarter_format_flag", + "submission_id", + "federal_account_id", + "funding_toptier_agency_id", + "budget_function_code", + "budget_subfunction_code", + ] + test_data_df = pd.DataFrame( + data={ + "reporting_fiscal_year": [2018, 2018, 2018, 2018, 2019], + "quarter_format_flag": [True, True, False, True, True], + "reporting_fiscal_quarter": [1, 2, None, 4, 2], + "reporting_fiscal_period": [None, None, 5, None, None], + "transaction_obligated_amount": [100, 100, 100, 100, 100], + "submission_id": [1, 2, 3, 4, 5], + "owning_agency_name": ["test1", "test2", "test2", "test2", "test3"], + "reporting_agency_name": ["A", "B", "C", "D", "E"], + "budget_function": ["A", "B", "C", "D", "E"], + "budget_subfunction": ["A", "B", "C", "D", "E"], + "gross_outlay_amount_FYB_to_period_end": [100, 100, 100, 100, 100], + "funding_toptier_agency_id": [1, 2, 2, 2, 3], + "federal_account_id": [1, 2, 2, 2, 3], + }, + columns=columns, + ).fillna("dummy_text") + ( + spark.createDataFrame(test_data_df) + .write.format("delta") + .mode("overwrite") + .option("overwriteSchema", "true") + .saveAsTable("rpt.account_download") + ) + yield + + +@pytest.fixture +def agency_models(db): + baker.make("references.ToptierAgency", pk=1, toptier_code="123") + baker.make("references.ToptierAgency", pk=2, toptier_code="456") + baker.make("references.ToptierAgency", pk=3, toptier_code="789") + + +@pytest.fixture +def federal_account_models(db): + baker.make("accounts.FederalAccount", pk=1, agency_identifier="123", main_account_code="0111") + baker.make("accounts.FederalAccount", pk=2, agency_identifier="234", main_account_code="0222") + baker.make("accounts.FederalAccount", pk=3, agency_identifier="345", main_account_code="0333") + + +@patch( + 
"usaspending_api.download.management.commands.delta_downloads.award_financial.builders.get_submission_ids_for_periods" +) +def test_account_download_dataframe_builder(mock_get_submission_ids_for_periods, spark, account_download_table): + mock_get_submission_ids_for_periods.return_value = [1, 2, 4, 5] + account_download_filter = AccountDownloadFilter( + fy=2018, + quarter=4, + ) + builder = AccountDownloadDataFrameBuilder(spark, account_download_filter, "rpt.account_download") + result = builder.source_df + for col in ["reporting_agency_name", "budget_function", "budget_subfunction"]: + assert sorted(result.toPandas()[col].to_list()) == ["A", "B; C; D"] + assert sorted(result.toPandas().transaction_obligated_amount.to_list()) == [100, 300] + assert sorted(result.toPandas().gross_outlay_amount_FYB_to_period_end.to_list()) == [100, 200] + + +@patch( + "usaspending_api.download.management.commands.delta_downloads.award_financial.builders.get_submission_ids_for_periods" +) +def test_filter_by_agency(mock_get_submission_ids_for_periods, spark, account_download_table, agency_models): + mock_get_submission_ids_for_periods.return_value = [1, 2, 4, 5] + + account_download_filter = AccountDownloadFilter( + fy=2018, + quarter=4, + agency=2, + ) + builder = AccountDownloadDataFrameBuilder(spark, account_download_filter) + result = builder.source_df + for col in ["reporting_agency_name", "budget_function", "budget_subfunction"]: + assert sorted(result.toPandas()[col].to_list()) == ["B; C; D"] + assert sorted(result.toPandas().transaction_obligated_amount.to_list()) == [300] + assert sorted(result.toPandas().gross_outlay_amount_FYB_to_period_end.to_list()) == [200] + + +@patch( + "usaspending_api.download.management.commands.delta_downloads.award_financial.builders.get_submission_ids_for_periods" +) +def test_filter_by_federal_account_id( + mock_get_submission_ids_for_periods, spark, account_download_table, federal_account_models +): + mock_get_submission_ids_for_periods.return_value = [1, 2, 4, 5] + + account_download_filter = AccountDownloadFilter( + fy=2018, + quarter=4, + federal_account=1, + ) + builder = AccountDownloadDataFrameBuilder(spark, account_download_filter) + result = builder.source_df + for col in ["reporting_agency_name", "budget_function", "budget_subfunction"]: + assert sorted(result.toPandas()[col].to_list()) == ["A"] + assert sorted(result.toPandas().transaction_obligated_amount.to_list()) == [100] + assert sorted(result.toPandas().gross_outlay_amount_FYB_to_period_end.to_list()) == [100] diff --git a/usaspending_api/download/tests/integration/test_account_download_filter.py b/usaspending_api/download/tests/integration/test_account_download_filter.py new file mode 100644 index 0000000000..73eafc9c58 --- /dev/null +++ b/usaspending_api/download/tests/integration/test_account_download_filter.py @@ -0,0 +1,81 @@ +import pytest +from model_bakery import baker + +from usaspending_api.common.exceptions import InvalidParameterException +from usaspending_api.download.management.commands.delta_downloads.award_financial.filters import AccountDownloadFilter + + +@pytest.fixture +def agency_models(db): + baker.make("references.ToptierAgency", pk=1, toptier_code="123") + baker.make("references.ToptierAgency", pk=2, toptier_code="456") + baker.make("references.ToptierAgency", pk=3, toptier_code="789") + + +@pytest.fixture +def federal_account_models(db): + baker.make("accounts.FederalAccount", pk=1, agency_identifier="123", main_account_code="0111") + baker.make("accounts.FederalAccount", pk=2, 
agency_identifier="234", main_account_code="0222") + baker.make("accounts.FederalAccount", pk=3, agency_identifier="345", main_account_code="0333") + + +def test_account_download_filter_cast_to_int(agency_models, federal_account_models): + test_data = {"fy": "2018", "quarter": "4", "agency": "2", "federal_account": "3"} + result = AccountDownloadFilter(**test_data) + assert result.fy == 2018 + assert result.quarter == 4 + assert result.agency == 2 + assert result.federal_account == 3 + + +def test_account_download_handle_all(agency_models, federal_account_models): + test_data = { + "fy": "2018", + "quarter": "4", + "agency": "all", + "federal_account": "all", + "budget_function": "all", + "budget_subfunction": "all", + } + result = AccountDownloadFilter(**test_data) + assert result.fy == 2018 + assert result.quarter == 4 + assert result.agency is None + assert result.federal_account is None + assert result.budget_function is None + assert result.budget_subfunction is None + + +def test_account_download_both_period_quarter(agency_models, federal_account_models): + test_data = {"fy": "2018", "period": "12", "quarter": "4"} + with pytest.warns() as warnings: + result = AccountDownloadFilter(**test_data) + assert result.fy == 2018 + assert result.period == 12 + assert result.quarter is None + assert len(warnings) == 1 + assert str(warnings[0].message) == "Both quarter and period are set. Only using period." + + +def test_account_download_none_period_quarter(agency_models, federal_account_models): + test_data = {"fy": "2018"} + with pytest.raises(InvalidParameterException, match="Must define period or quarter."): + AccountDownloadFilter(**test_data) + + +def test_account_download_no_agency(agency_models, federal_account_models): + test_data = {"fy": "2018", "period": 2, "agency": 3} + result = AccountDownloadFilter(**test_data) + assert result.agency == 3 + test_data = {"fy": "2018", "period": 2, "agency": 4} + with pytest.raises(InvalidParameterException, match="Agency with that ID does not exist"): + AccountDownloadFilter(**test_data) + + +def test_account_download_no_federal_account(agency_models, federal_account_models): + test_data = {"fy": "2018", "period": 2, "federal_account": 3} + result = AccountDownloadFilter(**test_data) + assert result.federal_account == 3 + test_data = {"fy": "2018", "period": 2, "federal_account": 4} + with pytest.raises(InvalidParameterException, match="Federal Account with that ID does not exist"): + result = AccountDownloadFilter(**test_data) diff --git a/usaspending_api/etl/elasticsearch_loader_helpers/aggregate_key_functions.py b/usaspending_api/etl/elasticsearch_loader_helpers/aggregate_key_functions.py index 42584bdc21..60cb727fc0 100644 --- a/usaspending_api/etl/elasticsearch_loader_helpers/aggregate_key_functions.py +++ b/usaspending_api/etl/elasticsearch_loader_helpers/aggregate_key_functions.py @@ -1,7 +1,10 @@ +import datetime import json import logging from typing import Optional +from dateutil.relativedelta import relativedelta + from usaspending_api.recipient.models import RecipientProfile logger = logging.getLogger("script") @@ -197,3 +200,15 @@ def location_type_agg_key(record: dict) -> Optional[str]: return json_data.get("location_type") else: raise ValueError("Unable to get the 'location_type' key from the 'location_json' field") + + +def fiscal_action_date(record: dict) -> Optional[datetime.date]: + if record.get("action_date") is None: + return None + return record["action_date"] + relativedelta(months=3) + + +def 
sub_fiscal_action_date(record: dict) -> Optional[datetime.date]: + if record.get("sub_action_date") is None: + return None + return record["sub_action_date"] + relativedelta(months=3) diff --git a/usaspending_api/etl/elasticsearch_loader_helpers/controller_for_spark.py b/usaspending_api/etl/elasticsearch_loader_helpers/controller_for_spark.py index a94b1d4501..51320a0034 100644 --- a/usaspending_api/etl/elasticsearch_loader_helpers/controller_for_spark.py +++ b/usaspending_api/etl/elasticsearch_loader_helpers/controller_for_spark.py @@ -58,12 +58,14 @@ def ensure_view_exists(self, sql_view_name: str, force_recreate=True) -> None: elif self.config["load_type"] == "recipient": identifier_replacements = None elif self.config["load_type"] == "location": - identifier_replacements["array_agg"] = "collect_list" - identifier_replacements["json_agg"] = "collect_list" - # Replace the Postgres regex operator with the Databricks regex operator identifier_replacements["~"] = "rlike" identifier_replacements["jsonb_build_object"] = "map" identifier_replacements["to_jsonb"] = "to_json" + identifier_replacements["state_data"] = "global_temp.state_data" + identifier_replacements["ref_country_code"] = "global_temp.ref_country_code" + identifier_replacements["ref_city_county_state_code"] = "global_temp.ref_city_county_state_code" + identifier_replacements["zips_grouped"] = "global_temp.zips_grouped" + else: raise ValueError( f"Unrecognized load_type {self.config['load_type']}, or this function does not yet support it" diff --git a/usaspending_api/etl/elasticsearch_loader_helpers/transform_data.py b/usaspending_api/etl/elasticsearch_loader_helpers/transform_data.py index 06ecd3074e..5936da5a38 100644 --- a/usaspending_api/etl/elasticsearch_loader_helpers/transform_data.py +++ b/usaspending_api/etl/elasticsearch_loader_helpers/transform_data.py @@ -17,13 +17,13 @@ def transform_award_data(worker: TaskSpec, records: List[dict]) -> List[dict]: - converters = { + replace_fields = { "spending_by_defc": convert_json_data_to_dict, "federal_accounts": convert_json_array_to_list_of_str, "program_activities": convert_json_data_to_dict, } # TODO: Move some of the 1:1 agg_keys that match a field already on Elasticsearch - agg_key_creations = { + insert_fields = { "awarding_subtier_agency_agg_key": lambda x: x["awarding_subtier_agency_code"], "awarding_toptier_agency_agg_key": lambda x: x["awarding_toptier_agency_code"], "funding_subtier_agency_agg_key": lambda x: x["funding_subtier_agency_code"], @@ -32,6 +32,7 @@ def transform_award_data(worker: TaskSpec, records: List[dict]) -> List[dict]: "psc_agg_key": lambda x: x["product_or_service_code"], "defc_agg_key": lambda x: x["disaster_emergency_fund_codes"], "cfda_agg_key": lambda x: x["cfda_number"], + "fiscal_action_date": funcs.fiscal_action_date, "pop_congressional_agg_key": funcs.pop_congressional_agg_key, "pop_congressional_cur_agg_key": funcs.pop_congressional_cur_agg_key, "pop_county_agg_key": funcs.pop_county_agg_key, @@ -59,16 +60,16 @@ def transform_award_data(worker: TaskSpec, records: List[dict]) -> List[dict]: "pop_county_population", "pop_congressional_population", ] - return transform_data(worker, records, converters, agg_key_creations, drop_fields, settings.ES_ROUTING_FIELD) + return transform_data(worker, records, replace_fields, insert_fields, drop_fields, settings.ES_ROUTING_FIELD) def transform_transaction_data(worker: TaskSpec, records: List[dict]) -> List[dict]: - converters = { + replace_fields = { "federal_accounts": 
convert_json_array_to_list_of_str, "program_activities": convert_json_data_to_dict, } # TODO: Move some of the 1:1 agg_keys that match a field already on Elasticsearch - agg_key_creations = { + insert_fields = { "recipient_agg_key": funcs.transaction_recipient_agg_key, "awarding_subtier_agency_agg_key": lambda x: x["awarding_sub_tier_agency_c"], "awarding_toptier_agency_agg_key": lambda x: x["awarding_agency_code"], @@ -103,14 +104,14 @@ def transform_transaction_data(worker: TaskSpec, records: List[dict]) -> List[di "recipient_levels", "funding_toptier_agency_id", ] - return transform_data(worker, records, converters, agg_key_creations, drop_fields, settings.ES_ROUTING_FIELD) + return transform_data(worker, records, replace_fields, insert_fields, drop_fields, settings.ES_ROUTING_FIELD) def transform_subaward_data(worker: TaskSpec, records: List[dict]) -> List[dict]: - converters = { + replace_fields = { "program_activities": convert_json_data_to_dict, } - agg_key_creations = { + insert_fields = { "sub_pop_country_agg_key": lambda x: x["sub_pop_country_code"], "sub_pop_congressional_cur_agg_key": funcs.sub_pop_congressional_cur_agg_key, "sub_pop_county_agg_key": funcs.sub_pop_county_agg_key, @@ -126,27 +127,28 @@ def transform_subaward_data(worker: TaskSpec, records: List[dict]) -> List[dict] "defc_agg_key": lambda x: x["disaster_emergency_fund_codes"], "cfda_agg_key": lambda x: x["cfda_number"], "sub_recipient_agg_key": funcs.subaward_recipient_agg_key, + "sub_fiscal_action_date": funcs.sub_fiscal_action_date, } drop_fields = [] - return transform_data(worker, records, converters, agg_key_creations, drop_fields, None) + return transform_data(worker, records, replace_fields, insert_fields, drop_fields, None) def transform_location_data(worker: TaskSpec, records: List[dict]) -> List[dict]: - converters = { + replace_fields = { "location_json": dump_dict_to_string, } - agg_key_creations = {"location_type": funcs.location_type_agg_key} + insert_fields = {"location_type": funcs.location_type_agg_key} drop_fields = [] - return transform_data(worker, records, converters, agg_key_creations, drop_fields, None) + return transform_data(worker, records, replace_fields, insert_fields, drop_fields, None) def transform_data( worker: TaskSpec, records: List[dict], - converters: Dict[str, Callable], - agg_key_creations: Dict[str, Callable], + replace_fields: Dict[str, Callable], + insert_fields: Dict[str, Callable], drop_fields: List[str], routing_field: Optional[str] = None, ) -> List[dict]: @@ -155,10 +157,20 @@ def transform_data( start = perf_counter() for record in records: - for field, converter in converters.items(): - record[field] = converter(record[field]) - for key, transform_func in agg_key_creations.items(): - record[key] = transform_func(record) + record.update( + { + # Replace fields + **{field: func(record[field]) for field, func in replace_fields.items()}, + # Create new fields + **{field: func(record) for field, func in insert_fields.items()}, + # Explicitly setting the ES _id field to match the postgres PK value allows + # bulk index operations to be upserts without creating duplicate documents + # IF and ONLY IF a routing meta field is not also provided (one whose value differs + # from the doc _id field). If explicit routing is done, UPSERTs may cause duplicates, + # so docs must be deleted before UPSERTed. 
(More info in streaming_post_to_es(...)) + "_id": record[worker.field_for_es_id], + } + ) # Route all documents with the same recipient to the same shard # This allows for accuracy and early-termination of "top N" recipient category aggregation queries @@ -168,16 +180,9 @@ def transform_data( if routing_field: record["routing"] = record[routing_field] - # Explicitly setting the ES _id field to match the postgres PK value allows - # bulk index operations to be upserts without creating duplicate documents - # IF and ONLY IF a routing meta field is not also provided (one whose value differs - # from the doc _id field). If explicit routing is done, UPSERTs may cause duplicates, - # so docs must be deleted before UPSERTed. (More info in streaming_post_to_es(...)) - record["_id"] = record[worker.field_for_es_id] - # Removing data which were used for creating aggregate keys and aren't necessary standalone for key in drop_fields: - record.pop(key) + del record[key] duration = perf_counter() - start logger.info(format_log(f"Transformation operation took {duration:.2f}s", name=worker.name, action="Transform")) diff --git a/usaspending_api/etl/es_award_template.json b/usaspending_api/etl/es_award_template.json index 497e3d4f21..66d01c877e 100644 --- a/usaspending_api/etl/es_award_template.json +++ b/usaspending_api/etl/es_award_template.json @@ -186,6 +186,10 @@ "type": "date", "format": "yyyy-MM-dd" }, + "fiscal_action_date": { + "type": "date", + "format": "yyyy-MM-dd" + }, "fiscal_year": { "type": "integer" }, diff --git a/usaspending_api/etl/es_subaward_template.json b/usaspending_api/etl/es_subaward_template.json index 134374754d..821e80fc23 100644 --- a/usaspending_api/etl/es_subaward_template.json +++ b/usaspending_api/etl/es_subaward_template.json @@ -368,6 +368,10 @@ "type": "date", "format": "yyyy-MM-dd" }, + "sub_fiscal_action_date": { + "type": "date", + "format": "yyyy-MM-dd" + }, "sub_awardee_or_recipient_uniqu": { "type": "text" }, diff --git a/usaspending_api/etl/management/commands/load_query_to_delta.py b/usaspending_api/etl/management/commands/load_query_to_delta.py index e3f3952e96..210b4ff40d 100644 --- a/usaspending_api/etl/management/commands/load_query_to_delta.py +++ b/usaspending_api/etl/management/commands/load_query_to_delta.py @@ -19,6 +19,11 @@ covid_faba_spending_load_sql_strings, ) from usaspending_api.disaster.models import CovidFABASpending +from usaspending_api.download.delta_models.account_download import ( + ACCOUNT_DOWNLOAD_POSTGRES_COLUMNS, + account_download_create_sql_string, + account_download_load_sql_string, +) from usaspending_api.recipient.delta_models import ( RECIPIENT_LOOKUP_POSTGRES_COLUMNS, RECIPIENT_PROFILE_POSTGRES_COLUMNS, @@ -50,6 +55,7 @@ subaward_search_load_sql_string, ) from usaspending_api.search.models import AwardSearch, SubawardSearch, SummaryStateView, TransactionSearch +from usaspending_api.settings import HOST from usaspending_api.transactions.delta_models import ( SUMMARY_STATE_VIEW_COLUMNS, SUMMARY_STATE_VIEW_POSTGRES_COLUMNS, @@ -65,6 +71,8 @@ transaction_search_overwrite_load_sql_string, ) +AWARD_URL = f"{HOST}/award/" if "localhost" in HOST else f"https://{HOST}/award/" + logger = logging.getLogger(__name__) TABLE_SPEC = { @@ -308,6 +316,27 @@ "tsvectors": None, "postgres_partition_spec": None, }, + "account_download": { + "model": None, + "is_from_broker": False, + "source_query": [account_download_load_sql_string], + "source_query_incremental": None, + "source_database": None, + "source_table": None, + "destination_database": 
"rpt", + "swap_table": None, + "swap_schema": None, + "partition_column": "financial_accounts_by_awards_id", + "partition_column_type": "numeric", + "is_partition_column_unique": False, + "delta_table_create_sql": account_download_create_sql_string, + "source_schema": ACCOUNT_DOWNLOAD_POSTGRES_COLUMNS, + "custom_schema": None, + "column_names": list(ACCOUNT_DOWNLOAD_POSTGRES_COLUMNS), + "postgres_seq_name": None, + "tsvectors": None, + "postgres_partition_spec": None, + }, } @@ -410,5 +439,6 @@ def run_spark_sql(self, query): JDBC_DRIVER=jdbc_conn_props["driver"], JDBC_FETCHSIZE=jdbc_conn_props["fetchsize"], JDBC_URL=get_broker_jdbc_url(), + AWARD_URL=AWARD_URL, ) ) diff --git a/usaspending_api/etl/management/commands/load_transactions_in_delta.py b/usaspending_api/etl/management/commands/load_transactions_in_delta.py index 2cc8d96459..1360cbf273 100644 --- a/usaspending_api/etl/management/commands/load_transactions_in_delta.py +++ b/usaspending_api/etl/management/commands/load_transactions_in_delta.py @@ -224,7 +224,7 @@ def award_id_lookup_pre_delete(self): """ # TODO: The values returned here are put into a list in an 'IN' clause in award_id_lookup_post_delete. - # However, there is a limit on the number of values one can manually put into an 'IN' cluase (i.e., not + # However, there is a limit on the number of values one can manually put into an 'IN' clause (i.e., not # returned by a SELECT subquery inside the 'IN'). Thus, this code should return a dataframe directly, # create a temporary view from the dataframe in award_id_lookup_post_delete, and use that temporary # view to either do a subquery in the 'IN' clause or to JOIN against. @@ -1142,11 +1142,11 @@ def prepare_orphaned_award_temp_table(): except AnalysisException as e: if re.match( r"^\[TABLE_OR_VIEW_NOT_FOUND\] The table or view `raw`\.`transaction_normalized` cannot be found\..*$", - e.desc, + str(e), re.MULTILINE, ): # In this case, we just don't populate transaction_id_lookup - logger.warn( + logger.warning( "Skipping population of transaction_id_lookup table; no raw.transaction_normalized table." ) raw_transaction_normalized_exists = False @@ -1170,11 +1170,11 @@ def prepare_orphaned_award_temp_table(): except AnalysisException as e: if re.match( r"^\[TABLE_OR_VIEW_NOT_FOUND\] The table or view `raw`\.`transaction_fabs` cannot be found\..*$", - e.desc, + str(e), re.MULTILINE, ): # In this case, we just skip extending the orphaned transactions with this table - logger.warn( + logger.warning( "Skipping extension of orphaned_transaction_info table using raw.transaction_fabs table." ) @@ -1198,11 +1198,11 @@ def prepare_orphaned_award_temp_table(): except AnalysisException as e: if re.match( r"^\[TABLE_OR_VIEW_NOT_FOUND\] The table or view `raw`\.`transaction_fpds` cannot be found\..*$", - e.desc, + str(e), re.MULTILINE, ): # In this case, we just skip extending the orphaned transactions with this table - logger.warn( + logger.warning( "Skipping extension of orphaned_transaction_info table using raw.transaction_fpds table." 
) @@ -1250,7 +1250,7 @@ def prepare_orphaned_award_temp_table(): """ ) else: - logger.warn( + logger.warning( "No raw.transaction_fabs or raw.transaction_fpds tables, so not finding additional orphaned " "transactions in raw.transaction_normalized" ) @@ -1364,7 +1364,7 @@ def prepare_orphaned_award_temp_table(): if not raw_transaction_normalized_exists: # In this case, we just don't populate award_id_lookup - logger.warn("Skipping population of award_id_lookup table; no raw.transaction_normalized table.") + logger.warning("Skipping population of award_id_lookup table; no raw.transaction_normalized table.") # Without a raw.transaction_normalized table, can't get a maximum award_id from it, either. max_id = None @@ -1510,11 +1510,11 @@ def prepare_orphaned_award_temp_table(): except AnalysisException as e: if re.match( rf"^\[TABLE_OR_VIEW_NOT_FOUND\] The table or view `raw`\.`{destination_table}` cannot be found\..*$", - e.desc, + str(e), re.MULTILINE, ): # In this case, we just don't copy anything over - logger.warn( + logger.warning( f"Skipping copy of {destination_table} table from 'raw' to 'int' database; " f"no raw.{destination_table} table." ) diff --git a/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_lookups.py b/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_lookups.py index ca4a261bb8..978cfa345d 100644 --- a/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_lookups.py +++ b/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_lookups.py @@ -293,7 +293,7 @@ def verify( except pyspark.sql.utils.AnalysisException as e: if re.match( rf"^\[TABLE_OR_VIEW_NOT_FOUND\] The table or view `raw`\.`{table_name}` cannot be found\..*$", - e.desc, + str(e), re.MULTILINE, ): pass diff --git a/usaspending_api/references/management/commands/load_offices.py b/usaspending_api/references/management/commands/load_offices.py index d538062e96..0084fba7e1 100644 --- a/usaspending_api/references/management/commands/load_offices.py +++ b/usaspending_api/references/management/commands/load_offices.py @@ -52,7 +52,7 @@ def process_data(self): @property def broker_fetch_sql(self): - return f""" + return """ SELECT office_code, office_name, @@ -82,15 +82,11 @@ def usas_unlinked_offices_sql(self): ) s; CREATE INDEX awarding_office_code_idx_temp ON temp_unique_office_codes_from_source (awarding_office_code); CREATE INDEX funding_office_code_idx_temp ON temp_unique_office_codes_from_source (funding_office_code); - DELETE FROM office WHERE office_code IN ( - SELECT DISTINCT office_code - FROM office AS o - LEFT JOIN temp_unique_office_codes_from_source s - ON s.awarding_office_code = o.office_code - OR s.funding_office_code = o.office_code - - WHERE s.awarding_office_code IS NULL - AND s.funding_office_code IS NULL + DELETE FROM office + WHERE NOT EXISTS ( + SELECT 1 + FROM temp_unique_office_codes_from_source AS s + WHERE s.awarding_office_code = office.office_code OR s.funding_office_code = office.office_code ); DROP TABLE IF EXISTS temp_unique_office_codes_from_source; """ diff --git a/usaspending_api/references/tests/integration/test_location_autocomplete_v2.py b/usaspending_api/references/tests/integration/test_location_autocomplete_v2.py index 3d12632afc..f2b1597f9d 100644 --- a/usaspending_api/references/tests/integration/test_location_autocomplete_v2.py +++ b/usaspending_api/references/tests/integration/test_location_autocomplete_v2.py @@ -11,138 +11,55 @@ @pytest.fixture def location_data_fixture(db): + denmark = 
baker.make("references.RefCountryCode", country_code="DNK", country_name="DENMARK") + france = baker.make("references.RefCountryCode", country_code="FRA", country_name="FRANCE") + baker.make("references.RefCountryCode", country_code="USA", country_name="UNITED STATES") + + baker.make("recipient.StateData", id="1", code="CO", name="Colorado") + baker.make("recipient.StateData", id="2", code="CA", name="California") + baker.make("recipient.StateData", id="3", code="TX", name="Texas") + baker.make("recipient.StateData", id="4", code="IL", name="Illinois") + baker.make("recipient.StateData", id="5", code="OK", name="Oklahoma") + + baker.make("references.CityCountyStateCode", id=1, feature_name="Denver", state_alpha="CO") + baker.make("references.CityCountyStateCode", id=2, feature_name="Texas A City", state_alpha="TX") + baker.make("references.CityCountyStateCode", id=3, feature_name="Texas B City", state_alpha="TX") + baker.make("references.CityCountyStateCode", id=4, feature_name="Texas C City", state_alpha="IL") + baker.make("references.CityCountyStateCode", id=5, feature_name="Texas D City", state_alpha="OK") + baker.make("references.CityCountyStateCode", id=6, feature_name="Texas E City", state_alpha="TX") + baker.make("references.CityCountyStateCode", id=7, feature_name="Texas F City", state_alpha="TX") + baker.make("references.CityCountyStateCode", id=8, county_name="Los Angeles", state_alpha="CA") + + baker.make("references.ZipsGrouped", zips_grouped_id=1, zip5="90210", state_abbreviation="CA") + baker.make("references.ZipsGrouped", zips_grouped_id=2, zip5="90211", state_abbreviation="CA") + baker.make( "search.TransactionSearch", transaction_id=500, is_fpds=False, transaction_unique_id="TRANSACTION500", - pop_country_name="UNITED STATES", - pop_state_name="CALIFORNIA", - pop_state_code="CA", - pop_city_name="LOS ANGELES", - pop_county_name="LOS ANGELES", - pop_zip5=90001, - pop_congressional_code_current="34", - pop_congressional_code="34", - pop_state_fips="11", - pop_county_code="111", - recipient_location_country_name="UNITED STATES", - recipient_location_state_name="COLORADO", - recipient_location_city_name="DENVER", - recipient_location_county_name="DENVER", - recipient_location_zip5=80012, - recipient_location_congressional_code_current="01", - recipient_location_congressional_code="01", - recipient_location_state_fips="22", - recipient_location_county_code="222", + pop_country_name=denmark.country_name, + pop_country_code=denmark.country_code, + pop_city_name="COPENHAGEN", + recipient_location_country_name=france.country_name, + recipient_location_country_code=france.country_code, + recipient_location_city_name="PARIS", ) baker.make( "search.TransactionSearch", transaction_id=501, is_fpds=False, transaction_unique_id="TRANSACTION501", - pop_country_name="DENMARK", - pop_state_name=None, - pop_city_name=None, - pop_county_name=None, - pop_zip5=None, - pop_congressional_code_current=None, - pop_congressional_code=None, - pop_state_fips="33", - pop_county_code="3333", - recipient_location_country_name="UNITED STATES", - recipient_location_state_name="GEORGIA", - recipient_location_city_name="KINGSLAND", - recipient_location_county_name="CAMDEN", - recipient_location_zip5=31548, - recipient_location_congressional_code_current="01", - recipient_location_congressional_code="01", - recipient_location_state_fips="13", - recipient_location_county_code="444", - ) - baker.make( - "search.TransactionSearch", - transaction_id=502, - is_fpds=False, - transaction_unique_id="TRANSACTION502", - 
pop_country_name="DENMARK", - pop_state_name=None, - pop_city_name=None, - pop_county_name=None, - pop_zip5=None, - pop_congressional_code_current=None, - pop_congressional_code=None, - recipient_location_country_name="UNITED STATES", - recipient_location_state_name="FAKE STATE", - recipient_location_city_name="FAKE CITY", - recipient_location_county_name="FAKE COUNTY", - recipient_location_zip5=75001, - recipient_location_congressional_code_current="30", - recipient_location_congressional_code="30", - ) - - -@pytest.fixture -def location_data_fixture_multiple_locations(db): - baker.make( - "search.TransactionSearch", - transaction_id=100, - is_fpds=False, - transaction_unique_id="TRANSACTION100", pop_country_name="UNITED STATES", - pop_state_name="TEXAS", - pop_city_name="TEXAS A CITY", - pop_county_name=None, - pop_zip5=None, - pop_congressional_code_current=None, - pop_congressional_code=None, - recipient_location_country_name="UNITED STATES", - recipient_location_state_name="TEXAS", - recipient_location_city_name="TEXAS B CITY", - recipient_location_county_name="FAKE COUNTY", - recipient_location_zip5=75001, - recipient_location_congressional_code_current="30", - recipient_location_congressional_code="30", - ) - baker.make( - "search.TransactionSearch", - transaction_id=101, - is_fpds=False, - transaction_unique_id="TRANSACTION101", - pop_country_name="UNITED STATES", - pop_state_name="ILLINOIS", - pop_city_name="TEXAS C CITY", - pop_county_name=None, - pop_zip5=None, - pop_congressional_code_current=None, - pop_congressional_code=None, - recipient_location_country_name="UNITED STATES", - recipient_location_state_name="OKLAHOMA", - recipient_location_city_name="TEXAS D CITY", - recipient_location_county_name=None, - recipient_location_zip5=75001, - recipient_location_congressional_code_current="30", - recipient_location_congressional_code="30", - ) - baker.make( - "search.TransactionSearch", - transaction_id=102, - is_fpds=False, - transaction_unique_id="TRANSACTION102", - pop_country_name="UNITED STATES", - pop_state_name="TEXAS", - pop_city_name="TEXAS E CITY", - pop_county_name=None, - pop_zip5=None, - pop_congressional_code_current=None, - pop_congressional_code=None, + pop_country_code="USA", + pop_state_code="CA", + pop_congressional_code_current="34", + pop_congressional_code="34", recipient_location_country_name="UNITED STATES", - recipient_location_state_name="CALIFORNIA", - recipient_location_city_name="TEXAS F CITY", - recipient_location_county_name=None, - recipient_location_zip5=75001, - recipient_location_congressional_code_current="30", - recipient_location_congressional_code="30", + recipient_location_country_code="USA", + recipient_location_state_code="CA", + recipient_location_congressional_code_current="34", + recipient_location_congressional_code="34", ) @@ -212,6 +129,54 @@ def test_congressional_district_results(client, monkeypatch, location_data_fixtu } +def test_zipcode_results(client, monkeypatch, location_data_fixture, elasticsearch_location_index): + monkeypatch.setattr( + "usaspending_api.common.elasticsearch.search_wrappers.LocationSearch._index_name", + settings.ES_LOCATIONS_QUERY_ALIAS_PREFIX, + ) + elasticsearch_location_index.update_index() + + response = client.post( + "/api/v2/autocomplete/location", + content_type="application/json", + data=json.dumps({"search_text": "90210"}), + ) + + assert response.status_code == status.HTTP_200_OK + assert len(response.data) == 3 + assert response.data["count"] == 1 + assert response.data["messages"] == [""] + 
assert response.data["results"] == { + "zip_codes": [ + {"zip_code": "90210", "state_name": "CALIFORNIA", "country_name": "UNITED STATES"}, + ] + } + + +def test_county_results(client, monkeypatch, location_data_fixture, elasticsearch_location_index): + monkeypatch.setattr( + "usaspending_api.common.elasticsearch.search_wrappers.LocationSearch._index_name", + settings.ES_LOCATIONS_QUERY_ALIAS_PREFIX, + ) + elasticsearch_location_index.update_index() + + response = client.post( + "/api/v2/autocomplete/location", + content_type="application/json", + data=json.dumps({"search_text": "los angeles"}), + ) + + assert response.status_code == status.HTTP_200_OK + assert len(response.data) == 3 + assert response.data["count"] == 1 + assert response.data["messages"] == [""] + assert response.data["results"] == { + "counties": [ + {"county_name": "LOS ANGELES", "state_name": "CALIFORNIA", "country_name": "UNITED STATES"}, + ], + } + + def test_no_results(client, monkeypatch, location_data_fixture, elasticsearch_location_index): monkeypatch.setattr( "usaspending_api.common.elasticsearch.search_wrappers.LocationSearch._index_name", @@ -255,9 +220,7 @@ def test_verify_no_missing_fields(client, monkeypatch, location_data_fixture, el assert len(results.hits) == 0 -def test_limits_by_location_type( - client, monkeypatch, location_data_fixture_multiple_locations, elasticsearch_location_index -): +def test_limits_by_location_type(client, monkeypatch, location_data_fixture, elasticsearch_location_index): """Test that the endpoint returns (at most) 5 results of each `location_type` by default""" monkeypatch.setattr( @@ -274,15 +237,9 @@ def test_limits_by_location_type( assert len(response.data) == 3 assert response.data["count"] == 6 assert response.data["messages"] == [""] + assert 0 < len(response.data["results"]["cities"]) <= 5 assert 0 < len(response.data["results"]["states"]) <= 5 - assert response.data["results"] == { - "cities": [ - {"city_name": "TEXAS A CITY", "state_name": "TEXAS", "country_name": "UNITED STATES"}, - {"city_name": "TEXAS B CITY", "state_name": "TEXAS", "country_name": "UNITED STATES"}, - {"city_name": "TEXAS D CITY", "state_name": "OKLAHOMA", "country_name": "UNITED STATES"}, - {"city_name": "TEXAS E CITY", "state_name": "TEXAS", "country_name": "UNITED STATES"}, - {"city_name": "TEXAS C CITY", "state_name": "ILLINOIS", "country_name": "UNITED STATES"}, - ], - "states": [{"state_name": "TEXAS", "country_name": "UNITED STATES"}], - } + + assert "cities" in response.data["results"].keys() + assert "states" in response.data["results"].keys() diff --git a/usaspending_api/reporting/tests/integration/test_agency_code_overview.py b/usaspending_api/reporting/tests/integration/test_agency_code_overview.py index 806ca326aa..addfa63d7f 100644 --- a/usaspending_api/reporting/tests/integration/test_agency_code_overview.py +++ b/usaspending_api/reporting/tests/integration/test_agency_code_overview.py @@ -1,10 +1,10 @@ import pytest - from django.conf import settings from model_bakery import baker from rest_framework import status from usaspending_api.agency.v2.views.agency_base import AgencyBase +from usaspending_api.reporting.models import ReportingAgencyOverview url = "/api/v2/reporting/agencies/123/overview/" @@ -30,6 +30,7 @@ def setup_test_data(db): reporting_fiscal_period=12, published_date="2021-02-11", ) + agency = baker.make( "references.ToptierAgency", toptier_code="123", abbreviation="ABC", name="Test Agency", _fill_optional=True ) @@ -53,6 +54,12 @@ def setup_test_data(db): 
awarding_toptier_agency_id=agency.toptier_agency_id, tas_rendering_label="tas-3-overview", ), + baker.make( + "accounts.TreasuryAppropriationAccount", + treasury_account_identifier=996, + awarding_toptier_agency_id=agency.toptier_agency_id, + tas_rendering_label="tas-3-overview", + ), ] approps = [ {"sub_id": sub.submission_id, "treasury_account": treas_accounts[0], "total_resources": 50}, @@ -795,3 +802,161 @@ def test_invalid_monthly_period_filter(client): response = resp.json() assert len(response["results"]) == 30 + + +@pytest.mark.django_db +def test_agency_with_only_file_c_or_d_awards(client): + sub = baker.make( + "submissions.SubmissionAttributes", + submission_id=1, + toptier_code="999", + quarter_format_flag=False, + reporting_fiscal_year=2019, + reporting_fiscal_period=9, + published_date="2019-07-03", + ) + agency = baker.make( + "references.ToptierAgency", toptier_code="999", abbreviation="ABC", name="Test Agency", _fill_optional=True + ) + ta = baker.make( + "accounts.TreasuryAppropriationAccount", + treasury_account_identifier=999, + awarding_toptier_agency_id=agency.toptier_agency_id, + tas_rendering_label="tas-999-overview", + ) + baker.make( + "accounts.AppropriationAccountBalances", + submission_id=sub.submission_id, + treasury_account_identifier=ta, + total_budgetary_resources_amount_cpe=10, + ) + baker.make( + "reporting.ReportingAgencyTas", + fiscal_year=sub.reporting_fiscal_year, + fiscal_period=sub.reporting_fiscal_period, + tas_rendering_label=ta.tas_rendering_label, + toptier_code=agency.toptier_code, + diff_approp_ocpa_obligated_amounts=0, + appropriation_obligated_amount=100, + ) + baker.make( + "reporting.ReportingAgencyOverview", + reporting_agency_overview_id=999, + toptier_code=999, + fiscal_year=2019, + fiscal_period=9, + total_budgetary_resources=None, + unlinked_procurement_c_awards=5, + unlinked_procurement_d_awards=None, + unlinked_assistance_c_awards=None, + unlinked_assistance_d_awards=None, + ) + + # Only unlinked procurement awards from File C + response = client.get("/api/v2/reporting/agencies/999/overview/").json() + expected_results = [ + { + "fiscal_year": 2019, + "fiscal_period": 9, + "current_total_budget_authority_amount": None, + "total_budgetary_resources": None, + "percent_of_total_budgetary_resources": None, + "recent_publication_date": "2019-07-03T00:00:00Z", + "recent_publication_date_certified": False, + "tas_account_discrepancies_totals": { + "gtas_obligation_total": None, + "tas_accounts_total": 100, + "tas_obligation_not_in_gtas_total": 0, + "missing_tas_accounts_count": 0, + }, + "obligation_difference": None, + "unlinked_contract_award_count": 5, + "unlinked_assistance_award_count": None, + "assurance_statement_url": f"{settings.FILES_SERVER_BASE_URL}/agency_submissions/2019-P09-999_Test%20Agency%20(ABC)-Agency_Comments.txt", + }, + ] + assert response["results"] == expected_results + + # Only unlinked procurement awards from File D + ReportingAgencyOverview.objects.filter(reporting_agency_overview_id=999).update( + unlinked_procurement_c_awards=None, unlinked_procurement_d_awards=10 + ) + response = client.get("/api/v2/reporting/agencies/999/overview/").json() + expected_results = [ + { + "fiscal_year": 2019, + "fiscal_period": 9, + "current_total_budget_authority_amount": None, + "total_budgetary_resources": None, + "percent_of_total_budgetary_resources": None, + "recent_publication_date": "2019-07-03T00:00:00Z", + "recent_publication_date_certified": False, + "tas_account_discrepancies_totals": { + "gtas_obligation_total": 
None, + "tas_accounts_total": 100, + "tas_obligation_not_in_gtas_total": 0, + "missing_tas_accounts_count": 0, + }, + "obligation_difference": None, + "unlinked_contract_award_count": 10, + "unlinked_assistance_award_count": None, + "assurance_statement_url": f"{settings.FILES_SERVER_BASE_URL}/agency_submissions/2019-P09-999_Test%20Agency%20(ABC)-Agency_Comments.txt", + }, + ] + assert response["results"] == expected_results + + # Only unlinked assistance awards from File C + ReportingAgencyOverview.objects.filter(reporting_agency_overview_id=999).update( + unlinked_procurement_d_awards=None, unlinked_assistance_c_awards=15 + ) + response = client.get("/api/v2/reporting/agencies/999/overview/").json() + expected_results = [ + { + "fiscal_year": 2019, + "fiscal_period": 9, + "current_total_budget_authority_amount": None, + "total_budgetary_resources": None, + "percent_of_total_budgetary_resources": None, + "recent_publication_date": "2019-07-03T00:00:00Z", + "recent_publication_date_certified": False, + "tas_account_discrepancies_totals": { + "gtas_obligation_total": None, + "tas_accounts_total": 100, + "tas_obligation_not_in_gtas_total": 0, + "missing_tas_accounts_count": 0, + }, + "obligation_difference": None, + "unlinked_contract_award_count": None, + "unlinked_assistance_award_count": 15, + "assurance_statement_url": f"{settings.FILES_SERVER_BASE_URL}/agency_submissions/2019-P09-999_Test%20Agency%20(ABC)-Agency_Comments.txt", + }, + ] + assert response["results"] == expected_results + + # Only unlinked assistance awards from File D + ReportingAgencyOverview.objects.filter(reporting_agency_overview_id=999).update( + unlinked_assistance_c_awards=None, unlinked_assistance_d_awards=20 + ) + response = client.get("/api/v2/reporting/agencies/999/overview/").json() + expected_results = [ + { + "fiscal_year": 2019, + "fiscal_period": 9, + "current_total_budget_authority_amount": None, + "total_budgetary_resources": None, + "percent_of_total_budgetary_resources": None, + "recent_publication_date": "2019-07-03T00:00:00Z", + "recent_publication_date_certified": False, + "tas_account_discrepancies_totals": { + "gtas_obligation_total": None, + "tas_accounts_total": 100, + "tas_obligation_not_in_gtas_total": 0, + "missing_tas_accounts_count": 0, + }, + "obligation_difference": None, + "unlinked_contract_award_count": None, + "unlinked_assistance_award_count": 20, + "assurance_statement_url": f"{settings.FILES_SERVER_BASE_URL}/agency_submissions/2019-P09-999_Test%20Agency%20(ABC)-Agency_Comments.txt", + }, + ] + assert response["results"] == expected_results diff --git a/usaspending_api/reporting/v2/views/agencies/toptier_code/overview.py b/usaspending_api/reporting/v2/views/agencies/toptier_code/overview.py index 76eca517ab..89692b2454 100644 --- a/usaspending_api/reporting/v2/views/agencies/toptier_code/overview.py +++ b/usaspending_api/reporting/v2/views/agencies/toptier_code/overview.py @@ -1,13 +1,13 @@ -from django.db.models import Subquery, OuterRef, DecimalField, Func, F, Q, IntegerField +from django.db.models import DecimalField, F, Func, IntegerField, OuterRef, Q, Subquery from rest_framework.response import Response from usaspending_api.agency.v2.views.agency_base import AgencyBase, PaginationMixin +from usaspending_api.common.cache_decorator import cache_response from usaspending_api.common.helpers.generic_helper import get_pagination_metadata -from usaspending_api.references.models import GTASSF133Balances -from usaspending_api.references.models import ToptierAgency -from 
usaspending_api.reporting.models import ReportingAgencyOverview, ReportingAgencyTas, ReportingAgencyMissingTas -from usaspending_api.submissions.models import SubmissionAttributes +from usaspending_api.references.models import GTASSF133Balances, ToptierAgency +from usaspending_api.reporting.models import ReportingAgencyMissingTas, ReportingAgencyOverview, ReportingAgencyTas from usaspending_api.submissions.helpers import is_valid_monthly_period +from usaspending_api.submissions.models import SubmissionAttributes class AgencyOverview(PaginationMixin, AgencyBase): @@ -15,6 +15,7 @@ class AgencyOverview(PaginationMixin, AgencyBase): endpoint_doc = "usaspending_api/api_contracts/contracts/v2/reporting/agencies/toptier_code/overview.md" + @cache_response() def get(self, request, toptier_code): self.sortable_columns = [ "current_total_budget_authority_amount", @@ -192,13 +193,13 @@ def format_result(self, result): else None ) unlinked_assistance_award_count = ( - result["unlinked_assistance_c_awards"] + result["unlinked_assistance_d_awards"] - if result["unlinked_assistance_c_awards"] and result["unlinked_assistance_d_awards"] + (result["unlinked_assistance_c_awards"] or 0) + (result["unlinked_assistance_d_awards"] or 0) + if result["unlinked_assistance_c_awards"] or result["unlinked_assistance_d_awards"] else None ) unlinked_contract_award_count = ( - result["unlinked_procurement_c_awards"] + result["unlinked_procurement_d_awards"] - if result["unlinked_procurement_c_awards"] and result["unlinked_procurement_d_awards"] + (result["unlinked_procurement_c_awards"] or 0) + (result["unlinked_procurement_d_awards"] or 0) + if result["unlinked_procurement_c_awards"] or result["unlinked_procurement_d_awards"] else None ) formatted_result.update( diff --git a/usaspending_api/search/tests/integration/test_spending_over_time_details.py b/usaspending_api/search/tests/integration/test_spending_over_time_details.py index 0f0023dde9..d57a5eee61 100644 --- a/usaspending_api/search/tests/integration/test_spending_over_time_details.py +++ b/usaspending_api/search/tests/integration/test_spending_over_time_details.py @@ -5,7 +5,7 @@ from model_bakery import baker from rest_framework import status -from usaspending_api.common.helpers.fiscal_year_helpers import generate_fiscal_year +from usaspending_api.common.helpers.fiscal_year_helpers import generate_fiscal_year, generate_date_range from usaspending_api.common.helpers.generic_helper import get_time_period_message from usaspending_api.search.tests.data.utilities import setup_elasticsearch_test @@ -114,7 +114,7 @@ def populate_models(db): latest_transaction_id=1, latest_transaction_search=ts1, category="direct payment", - action_date=datetime(2011, 3, 1), + action_date=ts1.action_date, date_signed=datetime(2010, 3, 1), fiscal_year=generate_fiscal_year(ts1.fiscal_action_date), total_outlays=0.00, @@ -126,7 +126,7 @@ def populate_models(db): latest_transaction_id=2, latest_transaction_search=ts2, category="idv", - action_date=datetime(2011, 3, 1), + action_date=ts2.action_date, date_signed=datetime(2010, 3, 1), fiscal_year=generate_fiscal_year(ts2.fiscal_action_date), total_outlays=10.00, @@ -138,7 +138,7 @@ def populate_models(db): latest_transaction_id=3, latest_transaction_search=ts3, category="loans", - action_date=datetime(2012, 3, 1), + action_date=ts3.action_date, date_signed=datetime(2011, 3, 1), fiscal_year=generate_fiscal_year(ts3.fiscal_action_date), total_outlays=20.00, @@ -150,7 +150,7 @@ def populate_models(db): latest_transaction_id=4, 
latest_transaction_search=ts4, category="other", - action_date=datetime(2013, 3, 1), + action_date=ts4.action_date, date_signed=datetime(2012, 3, 1), fiscal_year=generate_fiscal_year(ts4.fiscal_action_date), total_outlays=30.00, @@ -162,7 +162,7 @@ def populate_models(db): latest_transaction_id=5, latest_transaction_search=ts5, category="grant", - action_date=datetime(2014, 3, 1), + action_date=ts5.action_date, date_signed=datetime(2013, 3, 1), fiscal_year=generate_fiscal_year(ts5.fiscal_action_date), total_outlays=40.00, @@ -174,7 +174,7 @@ def populate_models(db): latest_transaction_id=6, latest_transaction_search=ts6, category="grant", - action_date=datetime(2015, 3, 1), + action_date=ts6.action_date, date_signed=datetime(2014, 3, 1), fiscal_year=generate_fiscal_year(ts6.fiscal_action_date), total_outlays=50.00, @@ -186,7 +186,7 @@ def populate_models(db): latest_transaction_id=7, latest_transaction_search=ts7, category="grant", - action_date=datetime(2016, 3, 1), + action_date=ts7.action_date, date_signed=datetime(2015, 3, 1), fiscal_year=generate_fiscal_year(ts7.fiscal_action_date), total_outlays=60.00, @@ -198,7 +198,7 @@ def populate_models(db): latest_transaction_id=8, latest_transaction_search=ts8, category="loans", - action_date=datetime(2017, 3, 1), + action_date=ts8.action_date, date_signed=datetime(2016, 3, 1), fiscal_year=generate_fiscal_year(ts8.fiscal_action_date), total_outlays=70.00, @@ -732,55 +732,55 @@ def test_spending_over_time_fy_ordering_awards(client, monkeypatch, elasticsearc "spending_level": "awards", "results": [ { - "aggregated_amount": 100.00, - "time_period": {"fiscal_year": "2010"}, "Contract_Obligations": 0, + "Contract_Outlays": 0, "Direct_Obligations": 100.0, + "Direct_Outlays": 0.0, "Grant_Obligations": 0, - "Idv_Obligations": 0, - "Loan_Obligations": 0, - "Other_Obligations": 0, - "total_outlays": 0.0, - "Contract_Outlays": 0, - "Direct_Outlays": 0, "Grant_Outlays": 0, + "Idv_Obligations": 0, "Idv_Outlays": 0, + "Loan_Obligations": 0, "Loan_Outlays": 0, + "Other_Obligations": 0, "Other_Outlays": 0, + "aggregated_amount": 100.0, + "time_period": {"fiscal_year": "2010"}, + "total_outlays": 0.0, }, { - "aggregated_amount": 110.00, - "time_period": {"fiscal_year": "2011"}, "Contract_Obligations": 0, - "Direct_Obligations": 0, - "Grant_Obligations": 0, - "Idv_Obligations": 110.0, - "Loan_Obligations": 0, - "Other_Obligations": 0, - "total_outlays": 10.0, "Contract_Outlays": 0, + "Direct_Obligations": 0, "Direct_Outlays": 0, + "Grant_Obligations": 0, "Grant_Outlays": 0, + "Idv_Obligations": 110.0, "Idv_Outlays": 10.0, + "Loan_Obligations": 0, "Loan_Outlays": 0, + "Other_Obligations": 0, "Other_Outlays": 0, + "aggregated_amount": 110.0, + "time_period": {"fiscal_year": "2011"}, + "total_outlays": 10.0, }, { - "aggregated_amount": 120.0, - "time_period": {"fiscal_year": "2012"}, "Contract_Obligations": 0, - "Direct_Obligations": 0, - "Grant_Obligations": 0, - "Idv_Obligations": 0, - "Loan_Obligations": 120.0, - "Other_Obligations": 0, - "total_outlays": 20.0, "Contract_Outlays": 0, + "Direct_Obligations": 0, "Direct_Outlays": 0, + "Grant_Obligations": 0, "Grant_Outlays": 0, + "Idv_Obligations": 0, "Idv_Outlays": 0, + "Loan_Obligations": 120.0, "Loan_Outlays": 20.0, + "Other_Obligations": 0, "Other_Outlays": 0, + "aggregated_amount": 120.0, + "time_period": {"fiscal_year": "2012"}, + "total_outlays": 20.0, }, ], "messages": expected_messages, @@ -793,7 +793,7 @@ def test_spending_over_time_fy_ordering_awards(client, monkeypatch, elasticsearc ) assert 
resp.status_code == status.HTTP_200_OK - assert expected_response == resp.data, "Unexpected or missing content!" + assert expected_response == resp.json(), "Unexpected or missing content!" # ensure ordering is correct confirm_proper_ordering(group, resp.data["results"]) @@ -1812,3 +1812,154 @@ def test_spending_over_time_new_awards_only_filter( # ensure ordering is correct confirm_proper_ordering(group, resp.data["results"]) + + +@pytest.mark.django_db +def test_spending_over_time_month_awards(client, monkeypatch, elasticsearch_award_index, populate_models): + setup_elasticsearch_test(monkeypatch, elasticsearch_award_index) + + group = "month" + + test_payload = { + "group": group, + "spending_level": "awards", + "filters": { + "time_period": [ + {"start_date": "2009-10-01", "end_date": "2010-09-30"}, + ] + }, + } + populated_results = { + (2010, 6): { + "aggregated_amount": 100.0, + "time_period": {"fiscal_year": "2010", "month": "6"}, + "Contract_Obligations": 0, + "Direct_Obligations": 100.0, + "Grant_Obligations": 0, + "Idv_Obligations": 0, + "Loan_Obligations": 0, + "Other_Obligations": 0, + "total_outlays": 0.0, + "Contract_Outlays": 0, + "Direct_Outlays": 0, + "Grant_Outlays": 0, + "Idv_Outlays": 0, + "Loan_Outlays": 0, + "Other_Outlays": 0, + }, + (2011, 6): { + "aggregated_amount": 110.0, + "time_period": {"fiscal_year": "2011", "month": "6"}, + "Contract_Obligations": 0, + "Direct_Obligations": 0, + "Grant_Obligations": 0, + "Idv_Obligations": 110.0, + "Loan_Obligations": 0, + "Other_Obligations": 0, + "total_outlays": 10.0, + "Contract_Outlays": 0, + "Direct_Outlays": 0, + "Grant_Outlays": 0, + "Idv_Outlays": 10.0, + "Loan_Outlays": 0, + "Other_Outlays": 0, + }, + } + expected_response = { + "group": group, + "spending_level": "awards", + "results": [ + populated_results.get( + (date_range["fiscal_year"], date_range["fiscal_month"]), + { + "aggregated_amount": 0, + "time_period": { + "fiscal_year": str(date_range["fiscal_year"]), + "month": str(date_range["fiscal_month"]), + }, + "Contract_Obligations": 0, + "Direct_Obligations": 0, + "Grant_Obligations": 0, + "Idv_Obligations": 0, + "Loan_Obligations": 0, + "Other_Obligations": 0, + "total_outlays": 0.0, + "Contract_Outlays": 0, + "Direct_Outlays": 0, + "Grant_Outlays": 0, + "Idv_Outlays": 0, + "Loan_Outlays": 0, + "Other_Outlays": 0, + }, + ) + for date_range in generate_date_range(datetime(2009, 10, 1), datetime(2011, 6, 1), group) + ], + "messages": expected_messages, + } + + resp = client.post( + get_spending_over_time_url(), + content_type="application/json", + data=json.dumps(test_payload), + ) + + assert resp.status_code == status.HTTP_200_OK + assert expected_response == resp.json(), "Unexpected or missing content!" 
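# A condensed sketch of how the month-grouped expectation in the test above is assembled:
# every fiscal period returned by generate_date_range gets a zero-valued placeholder unless
# the sparse populated_results mapping overrides it. build_expected_months is a hypothetical
# helper and the placeholder is abbreviated here; generate_date_range and its
# fiscal_year/fiscal_month keys are taken from the test above.
from usaspending_api.common.helpers.fiscal_year_helpers import generate_date_range  # import shown earlier in this diff


def build_expected_months(populated_results, start, end, group="month"):
    return [
        populated_results.get(
            (period["fiscal_year"], period["fiscal_month"]),
            {
                "aggregated_amount": 0,
                "time_period": {
                    "fiscal_year": str(period["fiscal_year"]),
                    "month": str(period["fiscal_month"]),
                },
            },
        )
        for period in generate_date_range(start, end, group)
    ]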
+ + # ensure ordering is correct + confirm_proper_ordering(group, resp.data["results"]) + + +@pytest.mark.django_db +def test_spending_over_time_month_subawards(client, monkeypatch, elasticsearch_subaward_index, populate_models): + baker.make( + "search.SubawardSearch", + broker_subaward_id=1, + action_date="2010-11-01", + sub_action_date="2010-11-01", + subaward_amount=500.00, + subaward_type="grant", + ) + setup_elasticsearch_test(monkeypatch, elasticsearch_subaward_index) + + group = "month" + + test_payload = { + "group": group, + "spending_level": "subawards", + "filters": { + "time_period": [ + {"start_date": "2010-10-01", "end_date": "2011-09-30"}, + ] + }, + } + + expected_response = { + "group": group, + "spending_level": "subawards", + "results": [ + { + "aggregated_amount": 500.0 if month == 2 else 0, + "time_period": {"fiscal_year": "2011", "month": str(month)}, + "Contract_Obligations": 0, + "Grant_Obligations": 500.0 if month == 2 else 0, + "total_outlays": None, + "Contract_Outlays": None, + "Grant_Outlays": None, + } + for month in range(1, 13) + ], + "messages": expected_messages, + } + + resp = client.post( + get_spending_over_time_url(), + content_type="application/json", + data=json.dumps(test_payload), + ) + + assert resp.status_code == status.HTTP_200_OK + assert expected_response == resp.json(), "Unexpected or missing content!" + + # ensure ordering is correct + confirm_proper_ordering(group, resp.data["results"]) diff --git a/usaspending_api/search/v2/views/spending_by_award.py b/usaspending_api/search/v2/views/spending_by_award.py index bf4c270bdd..9652cbb02e 100644 --- a/usaspending_api/search/v2/views/spending_by_award.py +++ b/usaspending_api/search/v2/views/spending_by_award.py @@ -12,6 +12,7 @@ from django.db.models import F from django.utils.text import slugify from elasticsearch_dsl import Q as ES_Q +from elasticsearch_dsl.response import Response as ES_Response from rest_framework.response import Response from rest_framework.views import APIView @@ -28,12 +29,12 @@ non_loan_assist_mapping, ) from usaspending_api.awards.v2.lookups.lookups import ( + SUBAWARD_MAPPING_LOOKUP, contract_type_mapping, idv_type_mapping, loan_type_mapping, non_loan_assistance_type_mapping, subaward_mapping, - SUBAWARD_MAPPING_LOOKUP, ) from usaspending_api.common.api_versioning import API_TRANSFORM_FUNCTIONS, api_transformations from usaspending_api.common.cache_decorator import cache_response @@ -51,7 +52,7 @@ from usaspending_api.common.validator.tinyshield import TinyShield from usaspending_api.recipient.models import RecipientProfile from usaspending_api.references.helpers import get_def_codes_by_group -from usaspending_api.references.models import Agency, ToptierAgencyPublishedDABSView +from usaspending_api.references.models import Agency from usaspending_api.search.filters.elasticsearch.filter import QueryType from usaspending_api.search.filters.time_period.decorators import NewAwardsOnlyTimePeriod from usaspending_api.search.filters.time_period.query_types import AwardSearchTimePeriod, SubawardSearchTimePeriod @@ -367,7 +368,7 @@ def populate_response(self, results: list, has_next: bool, models: List[dict]) - def query_elasticsearch( self, base_search: AwardSearch | SubawardSearch, filter_query: ES_Q, sorts: list[dict[str, str]] - ) -> Response: + ) -> ES_Response: record_num = (self.pagination["page"] - 1) * self.pagination["limit"] # random page jumping was removed due to performance concerns if (self.last_record_sort_value is None and self.last_record_unique_id is not 
None) or ( @@ -403,7 +404,7 @@ def query_elasticsearch( return response - def query_elasticsearch_awards(self) -> Response: + def query_elasticsearch_awards(self) -> ES_Response: filter_options = {} time_period_obj = AwardSearchTimePeriod( default_end_date=settings.API_MAX_DATE, default_start_date=settings.API_SEARCH_MIN_DATE @@ -450,7 +451,7 @@ def query_elasticsearch_awards(self) -> Response: return self.query_elasticsearch(AwardSearch(), filter_query, sorts) - def query_elasticsearch_subawards(self) -> Response: + def query_elasticsearch_subawards(self) -> ES_Response: filter_options = {} time_period_obj = SubawardSearchTimePeriod( default_end_date=settings.API_MAX_DATE, default_start_date=settings.API_SEARCH_MIN_DATE @@ -466,25 +467,31 @@ def query_elasticsearch_subawards(self) -> Response: return self.query_elasticsearch(SubawardSearch(), filter_query, sorts) - # For an unknown reason, ES tends to return the awarding agency toptier codes as integers or floats, instead of as - # text. This function casts the code back to a string and appends any leading zeroes that were lost. - def get_agency_database_id(self, code): - code = str(code).zfill(3) - agency_id = Agency.objects.filter(toptier_agency__toptier_code=code, toptier_flag=True).first() - submission = SubmissionAttributes.objects.filter(toptier_code=code).first() - if submission is None or agency_id is None: - return None - return agency_id.id + def _get_lookup_data(self) -> dict[str, dict[str, str | int]]: + # { : {"id": , "slug": } } + agency_lookup: dict[str, dict[str, str | int]] = { + ag["code"]: {"id": ag["id"], "slug": slugify(ag["name"])} + for ag in ( + Agency.objects.filter( + toptier_flag=True, + toptier_agency__toptier_code__in=SubmissionAttributes.objects.values("toptier_code"), + ) + .select_related("toptieragencypublisheddabsview") + .annotate(code=F("toptier_agency__toptier_code"), name=F("toptieragencypublisheddabsview__name")) + .values("id", "code", "name") + .all() + ) + } - def get_agency_slug(self, code): - code = str(code).zfill(3) - submission = ToptierAgencyPublishedDABSView.objects.filter(toptier_code=code).first() - if submission is None: - return None - return slugify(submission.name) + return agency_lookup - def construct_es_response_for_prime_awards(self, response) -> dict: + def construct_es_response_for_prime_awards(self, response: ES_Response) -> dict: results = [] + + if len(response) == 0: + return self.construct_es_response(results, response) + + agency_lookup = self._get_lookup_data() should_return_display_award_id = "Award ID" in self.fields should_return_recipient_id = "recipient_id" in self.fields for res in response: @@ -513,9 +520,9 @@ def construct_es_response_for_prime_awards(self, response) -> dict: if row.get("Total Outlays"): row["Total Outlays"] = float(row["Total Outlays"]) if row.get("Awarding Agency"): - code = row.pop("agency_code") - row["awarding_agency_id"] = self.get_agency_database_id(code) - row["agency_slug"] = self.get_agency_slug(code) + code = str(row.pop("agency_code")).zfill(3) + row["awarding_agency_id"] = agency_lookup.get(code, {}).get("id", None) + row["agency_slug"] = agency_lookup.get(code, {}).get("slug", None) if row.get("def_codes"): if self.filters.get("def_codes"): row["def_codes"] = list(filter(lambda x: x in self.filters.get("def_codes"), row["def_codes"])) @@ -588,7 +595,7 @@ def construct_es_response_for_prime_awards(self, response) -> dict: return self.construct_es_response(results, response) - def construct_es_response_for_subawards(self, response: 
Response) -> dict[str, Any]: + def construct_es_response_for_subawards(self, response: ES_Response) -> dict[str, Any]: results = [] for res in response: hit = res.to_dict() @@ -691,7 +698,7 @@ def calculate_complex_fields(self, row, hit): return row - def construct_es_response(self, results: list[dict[str, Any]], response: Response) -> dict[str, Any]: + def construct_es_response(self, results: list[dict[str, Any]] | list, response: ES_Response) -> dict[str, Any]: last_record_unique_id = None last_record_sort_value = None offset = 1 diff --git a/usaspending_api/search/v2/views/spending_over_time.py b/usaspending_api/search/v2/views/spending_over_time.py index 6a833c2a2d..a57cf6e8ac 100644 --- a/usaspending_api/search/v2/views/spending_over_time.py +++ b/usaspending_api/search/v2/views/spending_over_time.py @@ -135,27 +135,16 @@ def validate_request_data(json_data: dict) -> Tuple[dict, dict]: return validated_data, models - def subawards_group_by_time_period_agg(self) -> A: - if self.group == "fiscal_year": - return A("terms", field="sub_fiscal_year") - else: - return A( - "date_histogram", - field="sub_action_date", - interval="year" if (self.group == "calendar_year") else self.group, - format="yyyy-MM-dd", - ) - - def awards_group_by_time_period_agg(self) -> A: - if self.group == "fiscal_year": - return A("terms", field="fiscal_year", size="100", order={"_key": "asc"}) - else: - return A( - "date_histogram", - field="action_date", - interval="year" if (self.group == "calendar_year") else self.group, - format="yyyy-MM-dd", - ) + def get_time_period_aggregation(self): + field_prefix = "sub_" if self.spending_level == SpendingLevel.SUBAWARD else "" + field = "action_date" if self.group == "calendar_year" else "fiscal_action_date" + interval = "year" if self.group in ["calendar_year", "fiscal_year"] else self.group + return A( + "date_histogram", + field=f"{field_prefix}{field}", + interval=interval, + format="yyyy-MM-dd", + ) def apply_elasticsearch_aggregations(self, search: Search) -> None: """ @@ -169,22 +158,14 @@ def apply_elasticsearch_aggregations(self, search: Search) -> None: if isinstance(search, AwardSearch): category_field = "category.keyword" obligation_field = "generated_pragmatic_obligation" - group_by_time_period_agg = self.awards_group_by_time_period_agg() - elif isinstance(search, TransactionSearch): - category_field = "award_category" - obligation_field = "generated_pragmatic_obligation" - - group_by_time_period_agg = A( - "date_histogram", - field="action_date" if self.group == "calendar_year" else "fiscal_action_date", - interval="year" if (self.group == "fiscal_year" or self.group == "calendar_year") else self.group, - format="yyyy-MM-dd", - ) - elif isinstance(search, SubawardSearch): category_field = "subaward_type.keyword" obligation_field = "subaward_amount" - group_by_time_period_agg = self.subawards_group_by_time_period_agg() + else: + category_field = "award_category" + obligation_field = "generated_pragmatic_obligation" + + group_by_time_period_agg = self.get_time_period_aggregation() """ The individual aggregations that are needed; with two different sum aggregations to handle issues with @@ -221,12 +202,7 @@ def apply_elasticsearch_aggregations(self, search: Search) -> None: def set_time_period(self, bucket: dict) -> dict: time_period = {} - - if self.group == "fiscal_year" and self.spending_level != SpendingLevel.TRANSACTION: - key_as_date = datetime.strptime(str(bucket["key"]), "%Y") - else: - key_as_date = datetime.strptime(bucket["key_as_string"], 
"%Y-%m-%d") - + key_as_date = datetime.strptime(bucket["key_as_string"], "%Y-%m-%d") time_period["calendar_year" if self.group == "calendar_year" else "fiscal_year"] = str(key_as_date.year) if self.group == "quarter": quarter = (key_as_date.month - 1) // 3 + 1 @@ -318,9 +294,7 @@ def parse_elasticsearch_bucket(self, bucket: dict) -> dict: return response_object - def build_elasticsearch_result_transactions( - self, agg_response: AggResponse, time_periods: list[TimePeriod] - ) -> list: + def build_elasticsearch_result(self, agg_response: AggResponse, time_periods: list[TimePeriod]) -> list: """ In this function we are just taking the elasticsearch aggregate response and looping through the buckets to create a results object for each time interval. @@ -332,15 +306,19 @@ def build_elasticsearch_result_transactions( * "calendar_year" returns a list of dictionaries containing {calendar year} * "fiscal_year" returns list of dictionaries containing {fiscal year} * "quarter" returns a list of dictionaries containing {fiscal year and quarter} - * "month" returns a list of dictionaries containg {fiscal year and month} + * "month" returns a list of dictionaries containing {fiscal year and month} NOTE the generate_date_range() can also generate non fiscal date range (calendar ranges) as well. """ results = [] min_date, max_date = min_and_max_from_date_ranges([asdict(time_period) for time_period in time_periods]) + date_buckets = agg_response.group_by_time_period.buckets + + if self.spending_level == SpendingLevel.AWARD and date_buckets: + max_date_from_results = datetime.strptime(date_buckets[-1]["key_as_string"], "%Y-%m-%d") + max_date = max(max_date, max_date_from_results) date_range = generate_date_range(min_date, max_date, self.group) - date_buckets = agg_response.group_by_time_period.buckets parsed_bucket = None for fiscal_date in date_range: @@ -361,10 +339,39 @@ def build_elasticsearch_result_transactions( results.append(parsed_bucket) parsed_bucket = None else: - results.append( - { - "aggregated_amount": 0, - "time_period": time_period, + default_value = { + "aggregated_amount": 0, + "time_period": time_period, + } + if self.spending_level == SpendingLevel.AWARD: + default_value = { + **default_value, + "Contract_Obligations": 0, + "Direct_Obligations": 0, + "Grant_Obligations": 0, + "Idv_Obligations": 0, + "Loan_Obligations": 0, + "Other_Obligations": 0, + "total_outlays": 0, + "Contract_Outlays": 0, + "Direct_Outlays": 0, + "Grant_Outlays": 0, + "Idv_Outlays": 0, + "Loan_Outlays": 0, + "Other_Outlays": 0, + } + elif self.spending_level == SpendingLevel.SUBAWARD: + default_value = { + **default_value, + "Contract_Obligations": 0, + "Grant_Obligations": 0, + "total_outlays": None, + "Contract_Outlays": None, + "Grant_Outlays": None, + } + else: + default_value = { + **default_value, "Contract_Obligations": 0, "Direct_Obligations": 0, "Grant_Obligations": 0, @@ -379,65 +386,10 @@ def build_elasticsearch_result_transactions( "Loan_Outlays": None, "Other_Outlays": None, } - ) + results.append(default_value) return results - def build_elasticsearch_result_awards_subawards(self, agg_response: AggResponse) -> list: - """ - In this function we are just taking the elasticsearch aggregate response and looping through the - buckets to create a results object for each time interval. 
- - Using a min_date, max_date, and a frequency indicator generates either a list of dictionaries - containing fiscal year information (fiscal year, fiscal quarter, and fiscal month) or a list - of dictionaries containing calendar year information (calendar year). The following are the format - of date_range based on the frequency: - * "calendar_year" returns a list of dictionaries containing {calendar year} - * "fiscal_year" returns list of dictionaries containing {fiscal year} - * "quarter" returns a list of dictionaries containing {fiscal year and quarter} - * "month" returns a list of dictionaries containg {fiscal year and month} - NOTE the generate_date_range() can also generate non fiscal date range (calendar ranges) as well. - """ - - results = [] - date_buckets = agg_response.group_by_time_period.buckets - parsed_bucket = None - - if date_buckets is not None: - for bucket in date_buckets: - parsed_bucket = self.parse_elasticsearch_bucket(bucket.to_dict()) - results.append(parsed_bucket) - - return results - - def set_default_for_subawards(self, time_periods: list[TimePeriod], overall_results: list) -> list: - """if there is no data for that fiscal year, set default overall_results for that year""" - - min_date, max_date = min_and_max_from_date_ranges([asdict(time_period) for time_period in time_periods]) - date_range = generate_date_range(min_date, max_date, self.group) - if date_range.count != overall_results.count: - for year in date_range: - if not ( - any( - overall_result["time_period"] == {"fiscal_year": str(year["fiscal_year"])} - for overall_result in overall_results - ) - ): - overall_results.append( - { - "aggregated_amount": 0, - "total_outlays": None, - "time_period": {"fiscal_year": str(year["fiscal_year"])}, - "Contract_Obligations": 0, - "Contract_Outlays": None, - "Grant_Obligations": 0, - "Grant_Outlays": None, - } - ) - overall_results = sorted(overall_results, key=lambda x: x["time_period"]["fiscal_year"]) - - return overall_results - @cache_response() def post(self, request: Request) -> Response: self.original_filters = request.data.get("filters") @@ -468,7 +420,6 @@ def post(self, request: Request) -> Response: default_time_period = {"start_date": settings.API_SEARCH_MIN_DATE, "end_date": end_date} # if time periods have been passed in use those, otherwise use the one calculated above - results = None time_periods = [ TimePeriod(**time_period) for time_period in self.filters.get("time_period", [default_time_period]) ] @@ -477,22 +428,17 @@ def post(self, request: Request) -> Response: query_type = QueryType(self.spending_level.value) query_with_filters = QueryWithFilters(query_type) filter_query = query_with_filters.generate_elasticsearch_query(self.filters) - if self.spending_level == SpendingLevel.SUBAWARD: - search = SubawardSearch().filter(filter_query) - self.apply_elasticsearch_aggregations(search) - response = search.handle_execute() - results_with_values = self.build_elasticsearch_result_awards_subawards(response.aggs) - results = self.set_default_for_subawards(time_periods, results_with_values) - elif self.spending_level == SpendingLevel.TRANSACTION: - search = TransactionSearch().filter(filter_query) - self.apply_elasticsearch_aggregations(search) - response = search.handle_execute() - results = self.build_elasticsearch_result_transactions(response.aggs, time_periods) - elif self.spending_level == SpendingLevel.AWARD: - search = AwardSearch().filter(filter_query) - self.apply_elasticsearch_aggregations(search) - response = search.handle_execute() - 
results = self.build_elasticsearch_result_awards_subawards(response.aggs) + match self.spending_level: + case SpendingLevel.AWARD: + search = AwardSearch + case SpendingLevel.SUBAWARD: + search = SubawardSearch + case _: + search = TransactionSearch + search = search().filter(filter_query) + self.apply_elasticsearch_aggregations(search) + response = search.handle_execute() + results = self.build_elasticsearch_result(response.aggs, time_periods) raw_response = OrderedDict( [ diff --git a/usaspending_api/static_doc_files/css/main.css b/usaspending_api/static_doc_files/css/main.css index 539ae9a945..7f66a3c69b 100644 --- a/usaspending_api/static_doc_files/css/main.css +++ b/usaspending_api/static_doc_files/css/main.css @@ -7786,6 +7786,10 @@ button.close { position: fixed } +.request-info { + clear: both +} + @-ms-viewport { width: device-width } diff --git a/usaspending_api/tests/conftest_spark.py b/usaspending_api/tests/conftest_spark.py index 7cc9625439..31a628cc74 100644 --- a/usaspending_api/tests/conftest_spark.py +++ b/usaspending_api/tests/conftest_spark.py @@ -3,6 +3,7 @@ import uuid from datetime import datetime from pathlib import Path +from typing import TYPE_CHECKING, Generator import boto3 import pytest @@ -10,7 +11,6 @@ from django.db import connections from model_bakery import baker from psycopg2.extensions import AsIs -from pyspark.sql import SparkSession from usaspending_api import settings from usaspending_api.common.etl.spark import create_ref_temp_views from usaspending_api.common.helpers.spark_helpers import ( @@ -18,10 +18,14 @@ is_spark_context_stopped, stop_spark_context, ) +from usaspending_api.common.spark.configs import LOCAL_BASIC_EXTRA_CONF from usaspending_api.config import CONFIG from usaspending_api.etl.award_helpers import update_awards from usaspending_api.etl.management.commands.create_delta_table import LOAD_QUERY_TABLE_SPEC, LOAD_TABLE_TABLE_SPEC +if TYPE_CHECKING: + from pyspark.sql import SparkSession + # ==== Spark Automated Integration Test Fixtures ==== # # How to determine a working dependency set: @@ -37,26 +41,6 @@ # uncertain of compatibility, see what working version-sets are aligned to an Amazon EMR release here: # https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-release-app-versions-6.x.html -# The versions below are determined by the current version of Databricks in use -_SCALA_VERSION = "2.12" -_HADOOP_VERSION = "3.3.4" -_SPARK_VERSION = "3.5.0" -_DELTA_VERSION = "3.1.0" - - -# List of Maven coordinates for required JAR files used by running code, which can be added to the driver and -# executor class paths -SPARK_SESSION_JARS = [ - # "com.amazonaws:aws-java-sdk:1.12.31", - # hadoop-aws is an add-on to hadoop with Classes that allow hadoop to interface with an S3A (AWS S3) FileSystem - # NOTE That in order to work, the version number should be the same as the Hadoop version used by your Spark runtime - # It SHOULD pull in (via Ivy package manager from maven repo) the version of com.amazonaws:aws-java-sdk that is - # COMPATIBLE with it (so that should not be set as a dependent package by us) - f"org.apache.hadoop:hadoop-aws:{_HADOOP_VERSION}", - "org.postgresql:postgresql:42.2.23", - f"io.delta:delta-spark_{_SCALA_VERSION}:{_DELTA_VERSION}", -] - DELTA_LAKE_UNITTEST_SCHEMA_NAME = "unittest" @@ -138,7 +122,7 @@ def s3_unittest_data_bucket(s3_unittest_data_bucket_setup_and_teardown): @pytest.fixture(scope="session") -def spark(tmp_path_factory) -> SparkSession: +def spark(tmp_path_factory) -> Generator["SparkSession", None, None]: 
"""Throw an error if coming into a test using this fixture which needs to create a NEW SparkContext (i.e. new JVM invocation to run Spark in a java process) AND, proactively cleanup any SparkContext created by this test after it completes @@ -157,31 +141,10 @@ def spark(tmp_path_factory) -> SparkSession: # another test-scoped fixture should be created, pulling this in, and blowing away all schemas and tables as part # of each run spark_sql_warehouse_dir = str(tmp_path_factory.mktemp(basename="spark-warehouse", numbered=False)) - extra_conf = { - # This is the default, but being explicit - "spark.master": "local[*]", - "spark.driver.host": "127.0.0.1", # if not set fails in local envs, trying to use network IP instead - # Client deploy mode is the default, but being explicit. - # Means the driver node is the place where the SparkSession is instantiated (and/or where spark-submit - # process is started from, even if started under the hood of a Py4J JavaGateway). With a "standalone" (not - # YARN or Mesos or Kubernetes) cluster manager, only client mode is supported. - "spark.submit.deployMode": "client", - # Default of 1g (1GiB) for Driver. Increase here if the Java process is crashing with memory errors - "spark.driver.memory": "1g", - "spark.executor.memory": "1g", - "spark.ui.enabled": "false", # Does the same as setting SPARK_TESTING=true env var - "spark.jars.packages": ",".join(SPARK_SESSION_JARS), - # Delta Lake config for Delta tables and SQL. Need these to keep Delta table metadata in the metastore - "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension", - "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog", - # See comment below about old date and time values cannot parsed without these - "spark.sql.legacy.parquet.datetimeRebaseModeInWrite": "LEGACY", # for dates at/before 1900 - "spark.sql.legacy.parquet.int96RebaseModeInWrite": "LEGACY", # for timestamps at/before 1900 - # For Spark SQL warehouse dir and Hive metastore_db + **LOCAL_BASIC_EXTRA_CONF, "spark.sql.warehouse.dir": spark_sql_warehouse_dir, "spark.hadoop.javax.jdo.option.ConnectionURL": f"jdbc:derby:;databaseName={spark_sql_warehouse_dir}/metastore_db;create=true", - "spark.sql.jsonGenerator.ignoreNullFields": "false", # keep nulls in our json } spark = configure_spark_session( app_name="Unit Test Session", @@ -197,7 +160,7 @@ def spark(tmp_path_factory) -> SparkSession: @pytest.fixture -def hive_unittest_metastore_db(spark: SparkSession): +def hive_unittest_metastore_db(spark: "SparkSession"): """A fixture that WIPES all of the schemas (aka databases) and tables in each schema from the hive metastore_db at the end of each test run, so that the metastore is fresh. 
@@ -234,7 +197,7 @@ def hive_unittest_metastore_db(spark: SparkSession): @pytest.fixture -def delta_lake_unittest_schema(spark: SparkSession, hive_unittest_metastore_db): +def delta_lake_unittest_schema(spark: "SparkSession", hive_unittest_metastore_db): """Specify which Delta 'SCHEMA' to use (NOTE: 'SCHEMA' and 'DATABASE' are interchangeable in Delta Spark SQL), and cleanup any objects created in the schema after the test run.""" @@ -1534,7 +1497,7 @@ def populate_usas_data_and_recipients_from_broker(db, populate_usas_data, popula yield -def create_all_delta_tables(spark: SparkSession, s3_bucket: str, tables_to_load: list): +def create_all_delta_tables(spark: "SparkSession", s3_bucket: str, tables_to_load: list): load_query_tables = [val for val in tables_to_load if val in LOAD_QUERY_TABLE_SPEC] load_table_tables = [val for val in tables_to_load if val in LOAD_TABLE_TABLE_SPEC] for dest_table in load_table_tables + load_query_tables: @@ -1555,7 +1518,7 @@ def create_all_delta_tables(spark: SparkSession, s3_bucket: str, tables_to_load: call_command("create_delta_table", f"--destination-table={dest_table}", f"--spark-s3-bucket={s3_bucket}") -def create_and_load_all_delta_tables(spark: SparkSession, s3_bucket: str, tables_to_load: list): +def create_and_load_all_delta_tables(spark: "SparkSession", s3_bucket: str, tables_to_load: list): create_all_delta_tables(spark, s3_bucket, tables_to_load) load_query_tables = [val for val in tables_to_load if val in LOAD_QUERY_TABLE_SPEC] diff --git a/usaspending_api/transactions/management/commands/delete_procurement_records.py b/usaspending_api/transactions/management/commands/delete_procurement_records.py index dd55a1611d..96afce0feb 100644 --- a/usaspending_api/transactions/management/commands/delete_procurement_records.py +++ b/usaspending_api/transactions/management/commands/delete_procurement_records.py @@ -42,7 +42,7 @@ def fetch_deleted_transactions(self) -> Optional[dict]: logger.info(f"Obtained {len(numeric_ids)} IDs for {date_}") ids_to_delete[date_].extend(numeric_ids) else: - logger.warn(f"No {'valid ' if bool(string_ids) else ''}IDs for {date_}!") + logger.warning(f"No {'valid ' if bool(string_ids) else ''}IDs for {date_}!") total_ids = sum([len(v) for v in ids_to_delete.values()]) logger.info(f"Total number of delete records to process: {total_ids}")
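# The repeated logger.warn -> logger.warning substitutions throughout this diff follow the
# standard library: Logger.warn has been a deprecated alias of Logger.warning since
# Python 3.3 and emits a DeprecationWarning on current interpreters. Example values below
# are illustrative.
import logging

logger = logging.getLogger(__name__)
logger.warning("No valid IDs for %s!", "2024-10-01")   # preferred spelling
# logger.warn("...")                                   # deprecated alias of warning()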