invoicing/process_report/processors/validate_billable_pi_processor.py at ac65c506b95decee8d8ba358f249f950c985055d · CCI-MOC/invoicing · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
from dataclasses import dataclass, field
import logging

import pandas

from process_report.loader import loader
from process_report.invoices import invoice
from process_report.processors import processor
from process_report import util

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# NOTE(review): this configures the root logger as a side effect of importing
# the module (it is a no-op if the root logger already has handlers) — confirm
# that import-time configuration is intended here.
logging.basicConfig(level=logging.INFO)


# Cluster names for which every project is treated as nonbillable by
# `find_billable_projects`. Matched against the cluster column as-is.
NONBILLABLE_CLUSTERS = ["ocp-test", "barcelona"]


def find_billable_projects(
    data: pandas.DataFrame, nonbillable_projects: pandas.DataFrame
) -> pandas.Series:
    """
    Return a boolean Series telling, for each row of `data`, whether its project is billable.

    Takes as input:
    - `data`: DataFrame containing invoice data with project and cluster columns
    - `nonbillable_projects`: DataFrame output from `loader.get_nonbillable_projects()`

    A row of `data` is reported as nonbillable when its project is:
    - Always nonbillable (listed in `nonbillable_projects` without a cluster)
    - Listed in `nonbillable_projects` for the row's cluster
    - On one of the clusters in `NONBILLABLE_CLUSTERS`
    If the `nonbillable_projects` entry matched on (project, cluster) has its
    billable-override flag set, the row is reported as billable regardless of
    the checks above.
    Project names are compared in a case-insensitive manner; cluster names are
    compared as-is.

    The returned Series carries the same index as `data`, so it can be combined
    with other masks computed from `data`. The only exception is when
    `nonbillable_projects` holds duplicate (project, cluster) entries: the join
    then yields more rows than `data`, a warning is logged, and the result keeps
    the positional index produced by the join.

    There is a convoluted reason why the `Project - Allocation` column is checked:
    Input invoices to this pipeline are expected to have the `Project - Allocation`
    and `Project - Allocation ID` columns both populated by the project ID.
    `nonbillable_projects` usually identify projects by names, which are more
    human-readable. However, we found it acceptable to use IDs for non-Coldfront projects
    `ColdfrontFetchProcessor` attempts to populate `Project - Allocation` with project
    names, then checks if all non-Coldfront projects are nonbillable
    This check works because we allow `nonbillable_projects` to contain project IDs for non-Coldfront projects.

    Ultimately, it is important to note that `Project - Allocation` may contain the project name or ID.
    """
    data_key_cols = [invoice.PROJECT_FIELD, invoice.CLUSTER_NAME_FIELD]
    nonbillable_key_cols = [
        invoice.NONBILLABLE_PROJECT_NAME,
        invoice.NONBILLABLE_CLUSTER_NAME,
    ]

    # Only the join keys are needed from `data`. Leaving the other invoice
    # columns out avoids copying the whole frame and prevents unrelated columns
    # from clashing with the indicator column or with `nonbillable_projects`
    # column names during the merge.
    data_keys = data[data_key_cols].copy()
    data_keys[invoice.PROJECT_FIELD] = data_keys[invoice.PROJECT_FIELD].str.lower()

    # Work on a copy so the caller's frame is never modified.
    nonbillable_lower = nonbillable_projects.copy()
    nonbillable_lower[invoice.NONBILLABLE_PROJECT_NAME] = nonbillable_lower[
        invoice.NONBILLABLE_PROJECT_NAME
    ].str.lower()

    # Entries without a cluster apply to the project on every cluster.
    cluster_agnostic_projects = nonbillable_lower.loc[
        nonbillable_lower[invoice.NONBILLABLE_CLUSTER_NAME].isna(),
        invoice.NONBILLABLE_PROJECT_NAME,
    ].unique()

    # Use left join and filter on `source` column to find billable projects
    # https://pandas.pydata.org/docs/reference/api/pandas.merge.html
    merged_data = pandas.merge(
        data_keys,
        nonbillable_lower,
        how="left",
        left_on=data_key_cols,
        right_on=nonbillable_key_cols,
        indicator="source",
    )

    cluster_agnostic_mask = ~merged_data[invoice.PROJECT_FIELD].isin(
        cluster_agnostic_projects
    )
    # "both" means the (project, cluster) pair was found in `nonbillable_projects`.
    cluster_specific_mask = ~merged_data["source"].eq("both")
    nonbillable_cluster_mask = ~merged_data[invoice.CLUSTER_NAME_FIELD].isin(
        NONBILLABLE_CLUSTERS
    )
    # Rows without a matching entry have a missing override value.
    # NOTE(review): pandas is expected to treat those missing values as False
    # when OR-ed with a boolean mask — confirm for the override column's dtype.
    billable_override_mask = merged_data[invoice.NONBILLABLE_IS_BILLABLE_OVERRIDE]

    is_billable = (
        cluster_agnostic_mask & cluster_specific_mask & nonbillable_cluster_mask
    ) | billable_override_mask

    # `pandas.merge` discards the left frame's index. A left join keeps the left
    # rows in order, so when the row count is unchanged the original index can
    # be restored, keeping the mask aligned with other masks built from `data`.
    if len(is_billable) == len(data):
        is_billable.index = data.index
    else:
        logger.warning(
            "nonbillable_projects contains duplicate project/cluster entries: "
            "billable mask has %d rows for %d invoice rows",
            len(is_billable),
            len(data),
        )
    return is_billable


@dataclass
class ValidateBillablePIsProcessor(processor.Processor):
    """
    Flags every invoice row as billable or not, and flags rows with no PI.

    A row is marked as not billable when at least one of these holds:

    - Its PI appears in `nonbillable_pis`
    - `find_billable_projects` reports its project as not billable
    - It is a course whose institution is in the nonbillable course list
    """

    # PIs whose rows are always marked as not billable.
    nonbillable_pis: list[str] = field(default_factory=loader.get_nonbillable_pis)
    # Nonbillable project table handed to `find_billable_projects`.
    nonbillable_projects: pandas.DataFrame = field(
        default_factory=loader.get_nonbillable_projects
    )

    @staticmethod
    def _validate_pi_names(data: pandas.DataFrame):
        """
        Return a boolean mask of the rows whose PI field is empty.

        A warning is logged for each of those rows that is marked billable.
        """
        missing_pi = pandas.isna(data[invoice.PI_FIELD])
        for _, entry in data[missing_pi].iterrows():
            if not entry[invoice.IS_BILLABLE_FIELD]:
                continue
            logger.warning(
                f"Billable project {entry[invoice.PROJECT_FIELD]} has empty PI field"
            )
        return missing_pi

    @staticmethod
    def _get_billables(
        data: pandas.DataFrame,
        nonbillable_pis: list[str],
        nonbillable_projects: pandas.DataFrame,
    ):
        """
        Return a boolean mask that is True for the rows of `data` that are billable.

        The mask combines the PI check, the project check and the course check.
        """
        institutes = util.load_institute_list()

        has_billable_pi = ~data[invoice.PI_FIELD].isin(nonbillable_pis)
        has_billable_project = find_billable_projects(data, nonbillable_projects)

        # A course is nonbillable only when its institution is on the list.
        is_course = data[invoice.IS_COURSE_FIELD]
        at_nonbillable_institution = data[invoice.INSTITUTION_FIELD].isin(
            institutes.nonbillable_course_list
        )
        is_nonbillable_course = is_course & at_nonbillable_institution

        return has_billable_pi & has_billable_project & ~is_nonbillable_course

    def _process(self):
        """Write the billable flag first, then the missing-PI flag, into `self.data`."""
        billable = self._get_billables(
            self.data, self.nonbillable_pis, self.nonbillable_projects
        )
        self.data[invoice.IS_BILLABLE_FIELD] = billable

        # Must run after the billable flag is set: the warning reads that column.
        missing_pi = self._validate_pi_names(self.data)
        self.data[invoice.MISSING_PI_FIELD] = missing_pi