Skip to content

Commit 88acb81

Browse files
authored
455 add a new command to the cli for batch processing 2 (#456)
* Adding batch command with detailed stats. removing single command * factored out individual data structure preparation from pipeline.
1 parent 9b6de29 commit 88acb81

File tree

8 files changed

+208
-71
lines changed

8 files changed

+208
-71
lines changed

.python-version

Lines changed: 0 additions & 1 deletion
This file was deleted.

scaffold/api.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from fastapi import FastAPI, Request
22
from fastapi.responses import RedirectResponse
33

4-
from scaffold.pipeline import pipeline
4+
from scaffold.pipeline import run_pipeline
55
from scaffold.startup import startup
66
from scaffold.utils.settings import settings
77

@@ -30,5 +30,6 @@ async def template():
3030
@app.post("/createprecisionfeedback/")
3131
async def createprecisionfeedback(info: Request):
3232
req_info = await info.json()
33+
full_message = run_pipeline(req_info)
3334

34-
return pipeline(req_info)
35+
return full_message

scaffold/bitstomach/bitstomach.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from rdflib import RDF, BNode, Graph, Literal
33

44
from scaffold.bitstomach.signals import SIGNALS
5+
from scaffold.utils import settings
56
from scaffold.utils.namespace import PSDO, SLOWMO
67

78

@@ -33,6 +34,8 @@ def extract_signals(perf_df: pd.DataFrame) -> Graph:
3334

3435

3536
def prepare(req_info):
37+
if settings.settings.performance_month:
38+
req_info["performance_month"] = settings.settings.performance_month
3639
performance_data = req_info["Performance_data"]
3740
performance_df = pd.DataFrame(performance_data[1:], columns=performance_data[0])
3841

scaffold/cli.py

Lines changed: 24 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -7,35 +7,19 @@
77
import uvicorn
88
from loguru import logger
99

10-
from scaffold.pipeline import pipeline
10+
from scaffold.pipeline import run_pipeline
1111
from scaffold.startup import startup
12+
from scaffold.utils.utils import (
13+
add_candidates,
14+
add_response,
15+
analyse_candidates,
16+
analyse_responses,
17+
extract_number,
18+
)
1219

1320
cli = typer.Typer(no_args_is_help=True)
1421

1522

16-
@cli.command()
17-
def single(
18-
file_path: Annotated[
19-
pathlib.Path, typer.Argument(help="Path to input data in JSON format")
20-
],
21-
) -> None:
22-
startup()
23-
24-
input = orjson.loads(file_path.read_bytes())
25-
result = pipeline(input)
26-
27-
directory = file_path.parent / "messages"
28-
os.makedirs(directory, exist_ok=True)
29-
30-
new_filename = f"{file_path.stem} - message for {input['performance_month']}.json"
31-
32-
output_path = directory / new_filename
33-
34-
output_path.write_bytes(orjson.dumps(result, option=orjson.OPT_INDENT_2))
35-
36-
logger.info(f"Message created at {output_path}")
37-
38-
3923
@cli.command()
4024
def batch(
4125
file_path: Annotated[
@@ -45,11 +29,11 @@ def batch(
4529
max_files: Annotated[
4630
int, typer.Option("--max-files", help="Maximum number of files to process")
4731
] = None,
48-
count_only: Annotated[
32+
stats_only: Annotated[
4933
bool,
5034
typer.Option(
51-
"--count-only",
52-
help="Only simulate processing; count successes and failures",
35+
"--stats-only",
36+
help="Only simulate processing; count successes and failures and additional stats",
5337
),
5438
] = False,
5539
) -> None:
@@ -58,7 +42,7 @@ def batch(
5842
if file_path.is_file() and file_path.suffix == ".json":
5943
input_files = [file_path]
6044
elif file_path.is_dir():
61-
input_files = sorted(file_path.glob("*.json"))
45+
input_files = sorted(file_path.glob("*.json"), key=extract_number)
6246
else:
6347
logger.error(
6448
f"Invalid input: {file_path} is neither a .json file nor a directory containing .json files."
@@ -74,9 +58,10 @@ def batch(
7458
for input_file in input_files:
7559
try:
7660
input_data = orjson.loads(input_file.read_bytes())
77-
result = pipeline(input_data)
7861

79-
if not count_only:
62+
response_data = run_pipeline(input_data)
63+
64+
if not stats_only:
8065
directory = input_file.parent / "messages"
8166
os.makedirs(directory, exist_ok=True)
8267

@@ -87,7 +72,7 @@ def batch(
8772
output_path = directory / new_filename
8873

8974
output_path.write_bytes(
90-
orjson.dumps(result, option=orjson.OPT_INDENT_2)
75+
orjson.dumps(response_data, option=orjson.OPT_INDENT_2)
9176
)
9277
logger.info(f"Message created at {output_path}")
9378
else:
@@ -97,9 +82,17 @@ def batch(
9782
except Exception as e:
9883
logger.error(f"✘ Failed to process {input_file}: {e}")
9984
failure_count += 1
85+
response_data = e.detail
86+
87+
add_response(response_data)
88+
if not stats_only:
89+
add_candidates(response_data, input_data["performance_month"])
10090

10191
logger.info(f"Total files scanned: {len(input_files)}")
10292
logger.info(f"Successful: {success_count}, Failed: {failure_count}")
93+
analyse_responses()
94+
if not stats_only:
95+
analyse_candidates(file_path / "messages" / "candidates.csv")
10396

10497

10598
@cli.command()

scaffold/pictoralist/pictoralist.py

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,12 @@ class Pictoralist:
1818
def __init__(
1919
self,
2020
performance_dataframe,
21-
serialized_perf_df,
2221
selected_candidate,
2322
settings,
24-
message_instance_id,
2523
):
2624
## Setup variables to process selected message
2725
# Needs cleanup to stop redundant var declaration (those passed directly to prepare_selected_message)
2826
self.performance_data = performance_dataframe # Dataframe of recipient perf data (performance_data_df)
29-
self.performance_block = str(
30-
serialized_perf_df
31-
) # Pull un-altered performance (serialized JSON) data to append output message with
3227

3328
# Need refactor
3429
self.selected_measure = str(
@@ -57,7 +52,6 @@ def __init__(
5752
self.acceptable_by.append(
5853
pathway
5954
) # Add string value of rdflib literal to list
60-
self.message_instance_id = message_instance_id
6155
self.base64_image = [] # Initialize as empty key to later fill image into
6256
self.staff_ID = int(
6357
performance_dataframe["staff_number"].iloc[0]
@@ -168,8 +162,10 @@ def fill_missing_months(self):
168162
) # reset col name from index to month
169163

170164
# Forward fill 'measure' and percent-scale version of 'MPOG_goal' columns with the previous valid values
171-
self.performance_data["measure"].fillna(method="ffill", inplace=True)
172-
self.performance_data["goal_percent"].fillna(method="ffill", inplace=True)
165+
self.performance_data["measure"] = self.performance_data["measure"].ffill()
166+
self.performance_data["goal_percent"] = self.performance_data[
167+
"goal_percent"
168+
].ffill()
173169

174170
# Debugging statement
175171
# logger.debug(f"After gap fill, dataframe is:")
@@ -505,11 +501,8 @@ def prepare_selected_message(self):
505501
"performance_month": self.performance_data["month"]
506502
.iloc[-1]
507503
.strftime("%B %Y"), # Becomes string in response, format here
508-
"performance_data": self.performance_block,
509504
"message_generated_datetime": self.init_time,
510505
"message": message,
511506
}
512-
if self.message_instance_id is not None:
513-
full_message["message_instance_id"] = self.message_instance_id
514507

515508
return full_message

scaffold/pipeline.py

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import os
22

3+
import pandas as pd
34
import psutil
45
from fastapi import HTTPException
56
from loguru import logger
@@ -12,36 +13,32 @@
1213
from scaffold.pictoralist.pictoralist import Pictoralist
1314
from scaffold.utils.namespace import PSDO, SLOWMO
1415
from scaffold.utils.settings import settings
16+
from scaffold.utils.utils import set_logger
1517

18+
set_logger()
1619

17-
def pipeline(req_info):
18-
if settings.performance_month:
19-
req_info["performance_month"] = settings.performance_month
20-
21-
preferences = startup.set_preferences(req_info)
2220

21+
def pipeline(preferences: dict, history: dict, performance_df: pd.DataFrame):
2322
cool_new_super_graph = Graph()
2423
cool_new_super_graph += startup.base_graph
2524

2625
# BitStomach
2726
logger.debug("Calling BitStomach from main...")
2827

29-
performance_data_df = bitstomach.prepare(req_info)
30-
# TODO: find a place for measures to live...mabe move these two line into prepare or make a measurees class
28+
# TODO: find a place for measures to live... maybe move these two lines into prepare or make a measures class
3129
measures = set(cool_new_super_graph[: RDF.type : PSDO.performance_measure_content])
3230

33-
performance_data_df.attrs["valid_measures"] = [
34-
m for m in performance_data_df.attrs["valid_measures"] if BNode(m) in measures
31+
performance_df.attrs["valid_measures"] = [
32+
m for m in performance_df.attrs["valid_measures"] if BNode(m) in measures
3533
]
36-
g: Graph = bitstomach.extract_signals(performance_data_df)
34+
g: Graph = bitstomach.extract_signals(performance_df)
3735

3836
performance_content = g.resource(BNode("performance_content"))
3937
if len(list(performance_content[PSDO.motivating_information])) == 0:
4038
cool_new_super_graph.close()
4139
detail = {
4240
"message": "Insufficient significant data found for providing feedback, process aborted.",
43-
"message_instance_id": req_info["message_instance_id"],
44-
"staff_number": performance_data_df.attrs["staff_number"],
41+
"staff_number": performance_df.attrs["staff_number"],
4542
}
4643
raise HTTPException(
4744
status_code=400,
@@ -57,11 +54,11 @@ def pipeline(req_info):
5754

5855
# #Esteemer
5956
logger.debug("Calling Esteemer from main...")
60-
history: dict = req_info.get("History", {})
57+
6158
history = {
6259
key: value
6360
for key, value in history.items()
64-
if key < performance_data_df.attrs["performance_month"]
61+
if key < performance_df.attrs["performance_month"]
6562
}
6663

6764
measures: set[BNode] = set(
@@ -90,11 +87,9 @@ def pipeline(req_info):
9087
if selected_message["message_text"] != "No message selected":
9188
## Initialize and run message and display generation:
9289
pc = Pictoralist(
93-
performance_data_df,
94-
req_info["Performance_data"],
90+
performance_df,
9591
selected_message,
9692
settings,
97-
req_info["message_instance_id"],
9893
)
9994
pc.prep_data_for_graphing() # Setup dataframe of one measure, cleaned for graphing
10095
pc.fill_missing_months() # Fill holes in dataframe where they exist
@@ -120,11 +115,25 @@ def pipeline(req_info):
120115
(
121116
BNode("p1"),
122117
URIRef("http://example.com/slowmo#IsAboutPerformer"),
123-
Literal(int(performance_data_df["staff_number"].iloc[0])),
118+
Literal(int(performance_df["staff_number"].iloc[0])),
124119
)
125120
)
126121
response["candidates"] = utils.candidates_records(cool_new_super_graph)
127122

128123
response.update(full_selected_message)
129124

130125
return response
126+
127+
128+
def run_pipeline(req_info):
129+
preferences = startup.set_preferences(req_info)
130+
history: dict = req_info.get("History", {})
131+
performance_df = bitstomach.prepare(req_info)
132+
try:
133+
full_message = pipeline(preferences, history, performance_df)
134+
full_message["message_instance_id"] = req_info["message_instance_id"]
135+
full_message["performance_data"] = req_info["Performance_data"]
136+
except HTTPException as e:
137+
e.detail["message_instance_id"] = req_info["message_instance_id"]
138+
raise e
139+
return full_message

scaffold/startup.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import csv
22
import json
3-
import sys
43
from io import StringIO
54

65
import matplotlib
@@ -11,14 +10,10 @@
1110

1211
from scaffold.utils.graph_operations import manifest_to_graph
1312
from scaffold.utils.settings import settings
13+
from scaffold.utils.utils import set_logger
14+
15+
set_logger()
1416

15-
logger.remove()
16-
logger.add(
17-
sys.stdout, colorize=True, format="{level}| {message}", level=settings.log_level
18-
)
19-
logger.at_least = (
20-
lambda lvl: logger.level(lvl).no >= logger.level(settings.log_level).no
21-
)
2217
matplotlib.use("Agg")
2318
mpm: dict = {}
2419
default_preferences: dict = {}

0 commit comments

Comments
 (0)