Skip to content

Commit 3992ed1

Browse files
committed
added context to new qa indicators for app volumes
1 parent de89dc7 commit 3992ed1

File tree

2 files changed

+100
-54
lines changed

2 files changed

+100
-54
lines changed

notebooks/qa_checks.ipynb

Lines changed: 52 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 1,
5+
"execution_count": 9,
66
"id": "fd2d413a-666f-49d4-b86c-7d6898a496a5",
77
"metadata": {},
88
"outputs": [],
@@ -28,7 +28,7 @@
2828
},
2929
{
3030
"cell_type": "code",
31-
"execution_count": 2,
31+
"execution_count": 10,
3232
"id": "be959c79",
3333
"metadata": {},
3434
"outputs": [],
@@ -54,7 +54,7 @@
5454
},
5555
{
5656
"cell_type": "code",
57-
"execution_count": 3,
57+
"execution_count": 11,
5858
"id": "0a4c5d52",
5959
"metadata": {},
6060
"outputs": [],
@@ -112,16 +112,12 @@
112112
},
113113
{
114114
"cell_type": "code",
115-
"execution_count": 23,
115+
"execution_count": 12,
116116
"id": "080b9a53",
117117
"metadata": {},
118118
"outputs": [],
119119
"source": [
120-
"\n",
121-
"# QA check: Service volumes vary by a higher than expected amount\n",
122-
"\n",
123-
"# Returns the year in which the fiscal year ends, as a number.\n",
124-
"def fy_to_num(fiscal_yr):\n",
120+
"def fy_to_num(fiscal_yr): # Returns the year in which the fiscal year ends, as a number.\n",
125121
" return pd.to_numeric(fiscal_yr.split('-')[-1])\n",
126122
"\n",
127123
"# Select fields used for analysis\n",
@@ -136,6 +132,9 @@
136132
" suffixes = ['', '_max']\n",
137133
")\n",
138134
"\n",
135+
"# Identify the rows belonging to the latest fiscal year\n",
136+
"si_variance_qa['latest_fy_bool'] = si_variance_qa['fy_num'] == si_variance_qa['fy_num_max']\n",
137+
"\n",
139138
"# Only consider records with at least 4 years of reported non-zero values (latest + 3)\n",
140139
"# Remove records without any application volume\n",
141140
"si_variance_qa = si_variance_qa.loc[si_variance_qa['num_applications_total']>0]\n",
@@ -152,10 +151,6 @@
152151
"# Then only keep records with 4 or more years\n",
153152
"si_variance_qa = si_variance_qa.loc[si_variance_qa['years_reported']>=4]\n",
154153
"\n",
155-
"\n",
156-
"# Identify the rows belonging to the latest fiscal year\n",
157-
"si_variance_qa['latest_fy_bool'] = si_variance_qa['fy_num'] == si_variance_qa['fy_num_max']\n",
158-
"\n",
159154
"# Determine the average number of applications and their standard deviation\n",
160155
"# by service and fiscal year, excluding the latest fiscal year\n",
161156
"si_variance_qa = pd.merge(\n",
@@ -173,27 +168,56 @@
173168
"# each number of applications is.\n",
174169
"si_variance_qa['apps_stdevs_away_from_mean'] = np.abs(si_variance_qa['num_applications_total']-si_variance_qa['mean'])/si_variance_qa['std_dev']\n",
175170
"\n",
176-
"# Reveal results only for latest fiscal year\n",
177-
"si_variance_qa.loc[si_variance_qa['latest_fy_bool']].sort_values(['apps_stdevs_away_from_mean', 'service_id'], ascending=False)\n",
178-
"\n",
179-
"# si_variance_qa.loc[(si_variance_qa['service_id'] == 'SRV02936')]\n",
180-
"\n",
181171
"# Issues to identify:\n",
182-
"# std_dev = 0; this is because for all years (except the latest), the num_applications_total is the same\n",
183-
"\n",
184-
"si_variance_qa['qa_no_volume_variation'] = (si_variance_qa['std_dev'] == 0)\n",
185-
"\n",
186-
"# apps_stdevs_away_from_mean > some threshhold\n",
172+
"# 1. The difference between the number of applications and the mean, in units of standard deviation, is greater than some threshold\n",
187173
"# this is for big swings that would need to be investigated.\n",
188174
"stdevs_away_from_mean_threshold = 20\n",
189-
"si_variance_qa['qa_extreme_volume_variation'] = ((si_variance_qa['apps_stdevs_away_from_mean'] > stdevs_away_from_mean_threshold) & ~si_variance_qa['qa_no_volume_variation'])\n",
175+
"si_variance_qa['qa_extreme_volume_variation'] = (si_variance_qa['apps_stdevs_away_from_mean'] > stdevs_away_from_mean_threshold)\n",
176+
"\n",
177+
"# 2. Standard deviation is 0 (std_dev = 0)\n",
178+
"# this is when for all years (except the latest) the num_applications_total is the same\n",
179+
"si_variance_qa['qa_no_volume_variation'] = ((si_variance_qa['std_dev'] == 0) & ~si_variance_qa['qa_extreme_volume_variation'])\n",
180+
"\n",
181+
"# Add these checks into the si dataframe\n",
182+
"# The merge is there to generate an indicator (true/false) that describes\n",
183+
"# whether the service in the si is part of the si_variance_qa dataframe, filtered for the\n",
184+
"# check in question\n",
185+
"si = pd.merge(\n",
186+
" si,\n",
187+
" si_variance_qa.loc[\n",
188+
" (si_variance_qa['latest_fy_bool'] & \n",
189+
" si_variance_qa['qa_no_volume_variation']),\n",
190+
" ['fiscal_yr', 'service_id', 'org_id']\n",
191+
" ],\n",
192+
" on=['fiscal_yr', 'service_id', 'org_id'], \n",
193+
" how='left',\n",
194+
" indicator='qa_no_volume_variation'\n",
195+
")\n",
196+
"\n",
197+
"si['qa_no_volume_variation'] = (si['qa_no_volume_variation'] == 'both')\n",
198+
"\n",
199+
"si = pd.merge(\n",
200+
" si,\n",
201+
" si_variance_qa.loc[\n",
202+
" (si_variance_qa['latest_fy_bool'] & \n",
203+
" si_variance_qa['qa_extreme_volume_variation']),\n",
204+
" ['fiscal_yr', 'service_id', 'org_id']\n",
205+
" ],\n",
206+
" on=['fiscal_yr', 'service_id', 'org_id'], \n",
207+
" how='left',\n",
208+
" indicator='qa_extreme_volume_variation'\n",
209+
")\n",
190210
"\n",
211+
"si['qa_extreme_volume_variation'] = (si['qa_extreme_volume_variation'] == 'both')\n",
191212
"\n",
213+
"si['fy_num_applications_total'] = \"(\"+si['fiscal_yr']+\": \"+si['num_applications_total'].astype('str')+\")\"\n",
214+
"si_apps_by_fy = si.groupby(['org_id', 'service_id'], as_index=False).agg({'fy_num_applications_total': lambda x: ', '.join(sorted(x))})\n",
192215
"\n",
193-
"si_variance_qa = si_variance_qa.loc[si_variance_qa['latest_fy_bool'] & \n",
194-
" (si_variance_qa['qa_no_volume_variation'] | si_variance_qa['qa_extreme_volume_variation']),\n",
195-
" ['fiscal_yr', 'service_id', 'org_id', 'qa_no_volume_variation', 'qa_extreme_volume_variation']]\n",
196-
"\n"
216+
"si = pd.merge(\n",
217+
" si.drop(columns=['fy_num_applications_total']),\n",
218+
" si_apps_by_fy,\n",
219+
" on=['org_id', 'service_id']\n",
220+
")"
197221
]
198222
}
199223
],

src/qa.py

Lines changed: 48 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -64,23 +64,23 @@ def qa_check(si, ss, config, snapshot=False):
6464
si = pd.merge(si, sid_registry[['service_id', 'org_id']], how='left', on='service_id', suffixes=['', '_sid_registry'])
6565
si = pd.merge(si, dept.rename(columns={'org_id': 'org_id_sid_registry'}), how='left', on='org_id_sid_registry', suffixes=['', '_sid_registry'])
6666

67-
# QA check: unregistered service ID
67+
# === QA check: unregistered service ID
6868
# This service id is not registered in the service id registry
6969
si['qa_unregistered_sid'] = si['org_id_sid_registry'].isna()
7070

71-
# QA check: reused service ID
71+
# === QA check: reused service ID
7272
# This service id is registered to a different organization
7373
si['qa_reused_sid'] = (si['org_id'] != si['org_id_sid_registry']) & ~(si['qa_unregistered_sid'])
7474
si['reused_sid_correct_org'] = si['org_id'].astype(str) +' : ' + si['department_en_sid_registry'] + ' | ' + si['department_fr_sid_registry']
7575

76-
# QA check: Record is reported for a fiscal year that is incomplete or in the future.
76+
# === QA check: Record is reported for a fiscal year that is incomplete or in the future.
7777
si['fiscal_yr_end_date'] = pd.to_datetime(si['fiscal_yr'].str.split('-').str[1]+'-04-01')
7878
si['qa_si_fiscal_yr_out_of_scope'] = si['fiscal_yr_end_date'].dt.date >= current_date
7979

8080
ss['fiscal_yr_end_date'] = pd.to_datetime(ss['fiscal_yr'].str.split('-').str[1]+'-04-01')
8181
ss['qa_ss_fiscal_yr_out_of_scope'] = ss['fiscal_yr_end_date'].dt.date >= current_date
8282

83-
# QA check: Record has contradiction between client feedback channels and online interaction points for feedback
83+
# === QA check: Record has contradiction between client feedback channels and online interaction points for feedback
8484
si['qa_client_feedback_contradiction'] = (
8585

8686
# Service accepts client feedback via the online channel (ONL) but online issue resolution or feedback is not applicable or not activated
@@ -98,7 +98,7 @@ def qa_check(si, ss, config, snapshot=False):
9898
)
9999
)
100100

101-
# QA check: Service reports no volume, but associated Service standards have volume
101+
# === QA check: Service reports no volume, but associated Service standards have volume
102102
ss_vol_by_service = (
103103
ss.groupby(['fiscal_yr', 'service_id'])['total_volume']
104104
.sum()
@@ -114,16 +114,16 @@ def qa_check(si, ss, config, snapshot=False):
114114

115115
si['qa_no_si_app_volume'] = (si['num_applications_total'] == 0)
116116

117-
# QA check: Service standard reports no volume
117+
# === QA check: Service standard reports no volume
118118
ss['qa_no_ss_volume'] = (ss['total_volume'] == 0)
119119

120-
# QA check: Services where 'econom' (business) are a client type should not be 'NA' for CRA BN as ID
120+
# === QA check: Services where 'econom' (business) are a client type should not be 'NA' for CRA BN as ID
121121
si['qa_use_of_cra_bn_applicable'] = (
122122
(si['client_target_groups'].str.contains('ECONOM')) &
123123
(si['cra_bn_identifier_usage'] == 'NA')
124124
)
125125

126-
# QA check for programs
126+
# === QA check: for programs
127127
# Prepare a dataframe that splits service inventory into one-program-per-row: si_prog
128128
si['org_id'] = si['org_id'].astype(str)
129129
program['org_id'] = program['org_id'].astype(str)
@@ -141,12 +141,12 @@ def qa_check(si, ss, config, snapshot=False):
141141
# Join si_prog with program_list on program_id and org_id
142142
si_prog = si_prog.merge(program, on=['program_id', 'org_id'], how='left', suffixes=('_si', '_prog'), indicator=True)
143143

144-
# qa check: program id belongs to different department
144+
# === QA check: program id belongs to different department
145145
si_prog_wrong_org = si_prog[si_prog['_merge'] == 'left_only'] # Keep only mismatched rows
146146
si_prog_wrong_org = si_prog_wrong_org.groupby(['fiscal_yr', 'service_id', 'org_id'], as_index=False).agg({'program_id': lambda x: '<>'.join(sorted(map(str, x.dropna())))})
147147
si_prog_wrong_org.rename(columns={'program_id':'mismatched_program_ids'}, inplace=True)
148148

149-
# qa check: program id is old/expired
149+
# === QA check: program id is old/expired
150150
si_prog['latest_valid_fy_ending_in'] = pd.to_numeric(si_prog['latest_valid_fy'].str.split('-').str[1].fillna(0), errors = 'coerce').astype(int)
151151
si_prog['reported_fy_ending_in'] = pd.to_numeric(si_prog['fiscal_yr'].str.split('-').str[1].fillna(0), errors = 'coerce').astype(int)
152152
si_prog['program_id_latest_valid_fy'] = si_prog['program_id']+': '+si_prog['latest_valid_fy']
@@ -161,10 +161,10 @@ def qa_check(si, ss, config, snapshot=False):
161161
si = pd.merge(si, si_prog_wrong_org, on=['fiscal_yr', 'service_id', 'org_id'], how='left')
162162
si['qa_program_id_wrong_org'] = ~(si['mismatched_program_ids'].isnull())
163163

164-
# QA check: Service standard performance is greater than 100%
164+
# === QA check: Service standard performance is greater than 100%
165165
ss['qa_performance_over_100'] = ss['volume_meeting_target']>ss['total_volume']
166166

167-
# QA check: Service volumes vary by a higher than expected amount
167+
# === QA check: Service volumes vary by a higher than expected amount
168168
def fy_to_num(fiscal_yr): # Returns the year in which the fiscal year ends, as a number.
169169
return pd.to_numeric(fiscal_yr.split('-')[-1])
170170

@@ -180,6 +180,9 @@ def fy_to_num(fiscal_yr): # Returns the year in which the fiscal year ends, as a
180180
suffixes = ['', '_max']
181181
)
182182

183+
# Identify the rows belonging to the latest fiscal year
184+
si_variance_qa['latest_fy_bool'] = si_variance_qa['fy_num'] == si_variance_qa['fy_num_max']
185+
183186
# Only consider records with at least 4 years of reported non-zero values (latest + 3)
184187
# Remove records without any application volume
185188
si_variance_qa = si_variance_qa.loc[si_variance_qa['num_applications_total']>0]
@@ -196,10 +199,6 @@ def fy_to_num(fiscal_yr): # Returns the year in which the fiscal year ends, as a
196199
# Then only keep records with 4 or more years
197200
si_variance_qa = si_variance_qa.loc[si_variance_qa['years_reported']>=4]
198201

199-
200-
# Identify the rows belonging to the latest fiscal year
201-
si_variance_qa['latest_fy_bool'] = si_variance_qa['fy_num'] == si_variance_qa['fy_num_max']
202-
203202
# Determine the average number of applications and their standard deviation
204203
# by service and fiscal year, excluding the latest fiscal year
205204
si_variance_qa = pd.merge(
@@ -218,16 +217,19 @@ def fy_to_num(fiscal_yr): # Returns the year in which the fiscal year ends, as a
218217
si_variance_qa['apps_stdevs_away_from_mean'] = np.abs(si_variance_qa['num_applications_total']-si_variance_qa['mean'])/si_variance_qa['std_dev']
219218

220219
# Issues to identify:
221-
# 1. Standard deviation is 0 (std_dev = 0)
222-
# this is when for all years (except the latest) the num_applications_total is the same
223-
si_variance_qa['qa_no_volume_variation'] = (si_variance_qa['std_dev'] == 0)
224-
225-
# 2. The difference between the number of applications and the mean, in units of standard deviation, is greater than some threshold
220+
# 1. The difference between the number of applications and the mean, in units of standard deviation, is greater than some threshold
226221
# this is for big swings that would need to be investigated.
227222
stdevs_away_from_mean_threshold = 20
228-
si_variance_qa['qa_extreme_volume_variation'] = ((si_variance_qa['apps_stdevs_away_from_mean'] > stdevs_away_from_mean_threshold) & ~si_variance_qa['qa_no_volume_variation'])
223+
si_variance_qa['qa_extreme_volume_variation'] = (si_variance_qa['apps_stdevs_away_from_mean'] > stdevs_away_from_mean_threshold)
224+
225+
# 2. Standard deviation is 0 (std_dev = 0)
226+
# this occurs when num_applications_total is the same for all years (except the latest)
227+
si_variance_qa['qa_no_volume_variation'] = ((si_variance_qa['std_dev'] == 0) & ~si_variance_qa['qa_extreme_volume_variation'])
229228

230229
# Add these checks into the si dataframe
230+
# The merge is there to generate an indicator (true/false) that describes
231+
# whether the service in the si is part of the si_variance_qa dataframe, filtered for the
232+
# check in question
231233
si = pd.merge(
232234
si,
233235
si_variance_qa.loc[
@@ -246,7 +248,7 @@ def fy_to_num(fiscal_yr): # Returns the year in which the fiscal year ends, as a
246248
si,
247249
si_variance_qa.loc[
248250
(si_variance_qa['latest_fy_bool'] &
249-
si_variance_qa['qa_no_volume_variation']),
251+
si_variance_qa['qa_extreme_volume_variation']),
250252
['fiscal_yr', 'service_id', 'org_id']
251253
],
252254
on=['fiscal_yr', 'service_id', 'org_id'],
@@ -256,6 +258,22 @@ def fy_to_num(fiscal_yr): # Returns the year in which the fiscal year ends, as a
256258

257259
si['qa_extreme_volume_variation'] = (si['qa_extreme_volume_variation'] == 'both')
258260

261+
# Generate context for qa_report:
262+
# Display a field with all the reported application volumes and their fiscal years
263+
264+
# Create the field in the si
265+
si['fy_num_applications_total'] = "("+si['fiscal_yr']+": "+si['num_applications_total'].astype('str')+")"
266+
267+
# Create a grouped version with the contents of each field concatenated (joined)
268+
si_apps_by_fy = si.groupby(['org_id', 'service_id'], as_index=False).agg({'fy_num_applications_total': lambda x: ', '.join(sorted(x))})
269+
270+
# Merge the concatenated values back into si, while dropping the original column.
271+
si = pd.merge(
272+
si.drop(columns=['fy_num_applications_total']),
273+
si_apps_by_fy,
274+
on=['org_id', 'service_id']
275+
)
276+
259277

260278
# === EXPORT DATA TO CSV ===
261279
# Define the DataFrames to export to csv and their corresponding names
@@ -292,7 +310,9 @@ def generate_context(row):
292310
'qa_program_id_old': f"{row['program_id_latest_valid_fy']}",
293311
'qa_ss_vol_without_si_vol': f"service applications: {row['num_applications_total']}, standard volumes: {row['total_volume_ss']}",
294312
'qa_si_fiscal_yr_out_of_scope': f"{row['fiscal_yr']}",
295-
'qa_ss_fiscal_yr_out_of_scope': f"{row['fiscal_yr']}"
313+
'qa_ss_fiscal_yr_out_of_scope': f"{row['fiscal_yr']}",
314+
'qa_extreme_volume_variation': f"{row['fy_num_applications_total']}",
315+
'qa_no_volume_variation': f"{row['fy_num_applications_total']}"
296316
}
297317

298318
return issue_messages.get(row['qa_field_name'])
@@ -343,7 +363,8 @@ def generate_context(row):
343363
'reused_sid_correct_org',
344364
'program_id',
345365
'program_id_latest_valid_fy',
346-
'mismatched_program_ids'
366+
'mismatched_program_ids',
367+
'fy_num_applications_total'
347368
]
348369

349370
# Transform data to have all qa issues in a single column
@@ -388,7 +409,8 @@ def generate_context(row):
388409
'reused_sid_correct_org', # replaced by context field
389410
'program_id', # replaced by context field
390411
'program_id_latest_valid_fy', # replaced by context field
391-
'mismatched_program_ids' # replaced by context field
412+
'mismatched_program_ids', # replaced by context field
413+
'fy_num_applications_total' # replaced by context field
392414
])
393415

394416
si_qa_report = si_qa_report.sort_values(by=['org_id', 'severity_en', 'service_id'])

0 commit comments

Comments
 (0)