Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 29 additions & 9 deletions process_report/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,23 @@
]


SUPPLEMENTAL_START_DATE = "Start Date"
SUPPLEMENTAL_END_DATE = "End Date"


@functools.lru_cache
def get_rates_info():
    """Return whatever ``load_from_url()`` produces.

    The function is decorated with ``functools.lru_cache`` and takes no
    arguments, so repeated calls reuse the first cached result.
    """
    rates_info = load_from_url()
    return rates_info


def _is_in_time_range(start, end) -> bool:
    """Check whether the configured invoice month falls between two bounds.

    Args:
        start: Lower bound, compared with ``<=`` against
            ``invoice_settings.invoice_month``.
        end: Upper bound, compared with ``<=`` in the same way.

    Returns:
        The result of ``start <= invoice_month <= end``; both bounds are
        inclusive.
    """
    # YYYY-MM strings sort lexicographically in chronological order, so a
    # plain comparison is enough and no date parsing is performed.
    invoice_month = invoice_settings.invoice_month
    return start <= invoice_month <= end


class Loader:
@functools.lru_cache
def get_csv_invoice_filepath_list(self) -> list[str]:
Expand Down Expand Up @@ -126,13 +138,6 @@ def get_nonbillable_projects(self) -> pandas.DataFrame:
3. Is Timed: Boolean indicating if the nonbillable status is time-bound
"""

def _is_in_time_range(timed_object) -> bool:
# Leveraging inherent lexicographical order of YYYY-MM strings
return (
timed_object["start"] <= invoice_settings.invoice_month
and invoice_settings.invoice_month <= timed_object["end"]
)

project_list = []
with open(invoice_settings.nonbillable_projects_filepath) as file:
projects_dict = yaml.safe_load(file)
Expand All @@ -142,7 +147,7 @@ def _is_in_time_range(timed_object) -> bool:
cluster_list = project.get("clusters")

if project.get("start"):
if not _is_in_time_range(project):
if not _is_in_time_range(project["start"], project["end"]):
continue

if cluster_list:
Expand All @@ -154,7 +159,7 @@ def _is_in_time_range(timed_object) -> bool:
for cluster in cluster_list:
cluster_start_time = cluster.get("start")
if cluster_start_time:
if _is_in_time_range(cluster):
if _is_in_time_range(cluster["start"], cluster["end"]):
project_list.append((project_name, cluster["name"], True))
elif not cluster_start_time:
project_list.append((project_name, cluster["name"], False))
Expand All @@ -179,5 +184,20 @@ def get_nonbillable_timed_projects(self) -> list[tuple[str, str]]:
].itertuples(index=False, name=None)
)

def get_supplement_api_data(self) -> pandas.DataFrame:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a docstring here and in general going forward?

The reason I ask is that I was wondering why supplemental data such as PI name and institution name is time-bound, but after digging around it turns out to be a design decision. So the docstring could convey that useful information (even linking to the issue or comment where we made this non-obvious choice).

supplemental_df = pandas.DataFrame()
if invoice_settings.supplement_api_data_filepath:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you should log when a config file is loaded and when it's not loaded.

supplemental_df = pandas.read_csv(
invoice_settings.supplement_api_data_filepath
)
in_time_range_mask = supplemental_df.apply(
lambda row: _is_in_time_range(
row[SUPPLEMENTAL_START_DATE], row[SUPPLEMENTAL_END_DATE]
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The supplemental data as of now [1] has no start or end date, so this would just error out.

What is the behaviour supposed to be when no start or end date is found? Error out, log, or assume the data is applicable? (cc: @joachimweyl)

[1] https://github.com/CCI-MOC/invoicing-private-data/blob/main/project_api_data.csv

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Log, and we should add dates. We will need to work with RH to obtain dates. The template is supposed to be gathering this, but all of these projects predate the template.

),
axis=1,
)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion: not that what you are doing is incorrect, but you could do the following, which is more pandas-like:

in_time_range_mask = ( 
(supplemental_df[SUPPLEMENTAL_START_DATE] <= invoice_settings.invoice_month) &
(invoice_settings.invoice_month <= supplemental_df[SUPPLEMENTAL_END_DATE]) 
)

This way we don't have to operate row by row. Not that we are running into performance issues but this is more of a vector operation.

supplemental_df = supplemental_df[in_time_range_mask]
return supplemental_df


loader = Loader()
22 changes: 22 additions & 0 deletions process_report/processors/coldfront_fetch_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,21 @@
CF_ATTR_INSTITUTION_SPECIFIC_CODE = "Institution-Specific Code"
CF_ATTR_IS_COURSE = "Is Course?"

SUPPLEMENTAL_PROJECT_ID = "Project - Allocation Name"
SUPPLEMENTAL_PROJECT_NAME = "Project - Title"
SUPPLEMENTAL_PI = "Manager (PI)"
SUPPLEMENTAL_CLUSTER_NAME = "Cluster Name"


@dataclass
class ColdfrontFetchProcessor(processor.Processor):
nonbillable_projects: pandas.DataFrame = field(
default_factory=loader.get_nonbillable_projects
)
coldfront_data_filepath: str = invoice_settings.coldfront_api_filepath
supplement_api_data: pandas.DataFrame = field(
default_factory=lambda: loader.get_supplement_api_data()
)

@functools.cached_property
def coldfront_client(self):
Expand Down Expand Up @@ -125,6 +133,20 @@ def _get_allocation_data(self, coldfront_api_data):
except KeyError:
continue

for _, row in self.supplement_api_data.iterrows():
project_id = row[SUPPLEMENTAL_PROJECT_ID]
project_name = row[SUPPLEMENTAL_PROJECT_NAME]
pi_name = row[SUPPLEMENTAL_PI]
cluster_name = row[SUPPLEMENTAL_CLUSTER_NAME]

allocation_data[(project_id, cluster_name)] = {
invoice.PROJECT_FIELD: project_name,
invoice.PI_FIELD: pi_name,
invoice.INSTITUTION_ID_FIELD: "N/A",
invoice.CLUSTER_NAME_FIELD: cluster_name,
invoice.IS_COURSE_FIELD: False, # (TODO) Quan Assuming supplemental data does not contain course info?
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you mean by "Assuming supplemental data does not contain course info?"

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@joachimweyl The supplemental data's purpose is to provide data that the Coldfront API would normally have (an allocation's name, PI, whether it belongs to a course). The supplemental data file does not currently have a column to indicate whether a project belongs to a course. I wanted to ask whether we can assume that projects listed in this file are never in courses.

@joachimweyl Adding the extra column to indicate course-membership is simple.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please check the supplemental data file against the original; it looks like it is out of date.
I don't know that we can assume they will never be courses, but so far none of them have been. I would say it is worth adding the column just in case.

}

return allocation_data

def _validate_allocation_data(self, allocation_data):
Expand Down
1 change: 1 addition & 0 deletions process_report/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
class Settings(BaseSettings):
# Coldfront info
coldfront_api_filepath: str | None = None
supplement_api_data_filepath: str | None = None
keycloak_client_id: str | None = None
keycloak_client_secret: str | None = None

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -257,3 +257,47 @@ def test_missing_project_cluster_tuples(self, mock_get_allocation_data):
assert str(cm.value) == (
f"Projects {expected_missing} not found in Coldfront and are billable! Please check the project names"
)

@mock.patch(
    "process_report.processors.coldfront_fetch_processor.ColdfrontFetchProcessor._fetch_coldfront_allocation_api",
)
def test_supplement_api_data_used_when_coldfront_missing(
    self, mock_get_allocation_data
):
    """A project missing from Coldfront is filled in from the supplement data.

    The mocked Coldfront response only describes P1, while the supplement
    frame describes P2 on the same cluster. After processing, the invoice is
    expected to carry P2's title and PI from the supplement frame, with
    "N/A" as its institution code and a False course flag.
    """
    # Coldfront side: a single allocation, P1 on cluster "stack".
    coldfront_response = self._get_mock_allocation_data(
        ["P1"],
        ["PI1"],
        ["IC1"],
        ["stack"],
    )
    mock_get_allocation_data.return_value = coldfront_response

    # Supplement side: one row for P2, keyed by the supplemental column names.
    supplement_columns = {
        "Project - Allocation Name": ["P2"],
        "Project - Title": ["P2-supplement-name"],
        "Manager (PI)": ["PI2"],
        "Cluster Name": ["stack"],
    }
    supplement_rows = pandas.DataFrame(supplement_columns)

    input_invoice = self._get_test_invoice(
        ["P1", "P2"], cluster_name=["stack", "stack"]
    )

    expected_invoice = self._get_test_invoice(
        ["P1", "P2"],
        ["P1-name", "P2-supplement-name"],
        ["PI1", "PI2"],
        ["IC1", "N/A"],
        ["stack", "stack"],
        [False, False],
    )

    fetch_proc = test_utils.new_coldfront_fetch_processor(
        data=input_invoice, supplement_api_data=supplement_rows
    )
    fetch_proc.process()

    assert fetch_proc.data.equals(expected_invoice)
10 changes: 9 additions & 1 deletion process_report/tests/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,15 +63,23 @@ def new_coldfront_fetch_processor(
data=None,
nonbillable_projects=None,
coldfront_data_filepath=None,
supplement_api_data=None,
):
if data is None:
data = pandas.DataFrame()
if nonbillable_projects is None:
nonbillable_projects = pandas.DataFrame(
columns=["Project Name", "Cluster", "Is Timed"]
)
if supplement_api_data is None:
supplement_api_data = pandas.DataFrame()
return coldfront_fetch_processor.ColdfrontFetchProcessor(
invoice_month, data, name, nonbillable_projects, coldfront_data_filepath
invoice_month,
data,
name,
nonbillable_projects,
coldfront_data_filepath,
supplement_api_data,
)


Expand Down
Loading