refactoring the bulk up app and minor updates. (#387)

faridseifi · web-flow · commit 995aeebea493 · 2024-05-13T14:08:48.000-04:00
diff --git a/bulk-up/src/bulk_up/req.py b/bulk-up/src/bulk_up/req.py
@@ -98,18 +98,20 @@ def post_json_message(filename):
 
             with lock:
                 add_response(response, response_data)
-                add_candidates(response_data)
+                add_candidates(response_data, data["performance_month"])
 
     except Exception as e:
         print(f"Error processing {filename}: {e}")
 
 
-def add_candidates(response_data: dict):
+def add_candidates(response_data: dict, performance_month: str):
     global candidate_df
     data = response_data.get("candidates", None)
     if data:
+        candidates = pd.DataFrame(data[1:], columns=data[0])
+        candidates["performance_month"] = performance_month
         candidate_df = pd.concat(
-            [candidate_df, pd.DataFrame(data[1:], columns=data[0])], ignore_index=True
+            [candidate_df, candidates], ignore_index=True
         )
 
 
@@ -126,7 +128,7 @@ def add_response(response: requests.Response, response_data):
     response_df = pd.concat(
         [response_df, pd.DataFrame(response_dict)], ignore_index=True
     )
-    print(response_dict)
+    print(response_dict,  end='\r')
 
 
 def analyse_responses():
@@ -157,72 +159,70 @@ def analyse_candidates():
     if OUTPUT:
         candidate_df.to_csv(OUTPUT, index=False)
 
-    # causal pathways
+    
     candidate_df.rename(columns={"acceptable_by": "causal_pathway"}, inplace=True)
-    causal_pathway = (
-        candidate_df.groupby("causal_pathway")["selected"]
+    candidate_df["score"] = candidate_df["score"].astype(float)
+    candidate_df.rename(columns={"name": "message"}, inplace=True)
+
+    # causal pathways
+    causal_pathway_report = build_table("causal_pathway")
+    print(causal_pathway_report, "\n")
+
+    # messages
+    message_report = build_table("message")
+    print(message_report, "\n")
+    
+    # measures
+    measure_report = build_table("measure")
+    print(measure_report, "\n")    
+
+
+def build_table(grouping_column):
+    report_table = (
+        candidate_df.groupby(grouping_column)["selected"]
         .agg(acceptable=("count"), selected=("sum"))
         .reset_index()
     )
-    candidate_df["score"] = candidate_df["score"].astype(float)
-    scores = (
-        candidate_df.groupby("causal_pathway")["score"]
+    scores = round(
+        candidate_df.groupby(grouping_column)["score"]
         .agg(acceptable_score=("mean"))
         .reset_index()
-    )
-    causal_pathway = pd.merge(causal_pathway, scores, on="causal_pathway", how="left")
+    ,2)
+    report_table = pd.merge(report_table, scores, on=grouping_column, how="left")
 
-    causal_pathway["% acceptable"] = round(
-        causal_pathway["acceptable"] / causal_pathway["acceptable"].sum() * 100, 1
+    report_table["% acceptable"] = round(
+        report_table["acceptable"] / report_table["acceptable"].sum() * 100, 1
     )
-    causal_pathway["% selected"] = round(
-        causal_pathway["selected"] / causal_pathway["acceptable"] * 100, 1
+    report_table["% selected"] = round(
+        report_table["selected"] / report_table["selected"].sum() * 100, 1
     )
-    selected_scores = (
+    report_table["% of acceptable selected"] = round(
+        report_table["selected"] / report_table["acceptable"] * 100, 1
+    )
+    selected_scores = round(
         candidate_df[candidate_df["selected"]]
-        .groupby("causal_pathway")["score"]
+        .groupby(grouping_column)["score"]
         .agg(selected_score=("mean"))
         .reset_index()
-    )
-    causal_pathway = pd.merge(
-        causal_pathway, selected_scores, on="causal_pathway", how="left"
+    ,2)
+    report_table = pd.merge(
+        report_table, selected_scores, on=grouping_column, how="left"
     )
 
-    causal_pathway = causal_pathway[
+    report_table = report_table[
         [
-            "causal_pathway",
+            grouping_column,
             "acceptable",
             "% acceptable",
             "acceptable_score",
             "selected",
             "% selected",
             "selected_score",
+            "% of acceptable selected",
         ]
     ]
-    print(causal_pathway, "\n")
-
-    # messages
-    candidate_df.rename(columns={"name": "message"}, inplace=True)
-    message = (
-        candidate_df.groupby("message")["selected"]
-        .agg(total=("count"), selected=("sum"))
-        .reset_index()
-    )
-    message["%"] = round(message["total"] / message["total"].sum() * 100, 1)
-    message["% selected"] = round(message["selected"] / message["total"] * 100, 1)
-    message = message[["message", "%", "total", "selected", "% selected"]]
-    print(message, "\n")
-
-    # measures
-    measure = (
-        candidate_df.groupby("measure")["selected"]
-        .agg(total=("count"), selected=("sum"))
-        .reset_index()
-    )
-    measure["%"] = round(measure["total"] / measure["total"].sum() * 100, 1)
-    measure["% selected"] = round(measure["selected"] / measure["total"] * 100, 1)
-    measure = measure[["measure", "%", "total", "selected", "% selected"]]
-    print(measure, "\n")
+    
+    return report_table
 
 
 def extract_number(filename):
@@ -246,7 +246,7 @@ def main():
     if SAMPLE:
         n = min(SAMPLE, len(input_files))
         input_files = sorted(random.sample(input_files, n), key=extract_number)
-
+    
     with ThreadPoolExecutor(WORKERS) as executor:
         executor.map(post_json_message, input_files)