Skip to content

Commit a64466d

Browse files
authored
457 add csv input processing to scaffold (#458)
* initial csv processing command added * script to generate performance_data, history and preferences in csv format using a bunch of json input files added to bulkup * history and preferences are programmed to be loaded from csv input for batch_csv command.
1 parent 88acb81 commit a64466d

File tree

6 files changed

+266
-12
lines changed

6 files changed

+266
-12
lines changed

README.md

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,13 +93,27 @@ curl --data "@tests/test_cases/input_message.json" http://localhost:8000/createp
9393
```
9494

9595
Run SCAFFOLD CLI
96-
First install the python app. Then use the following command to run the pipeline on one input file
96+
First install the python app. Then use the following command to run the pipeline on one json input file
9797

9898
```zsh
99-
ENV_PATH=/user/.../dev.env pipeline single '/path/to/input/file.json'
99+
ENV_PATH=/user/.../dev.env pipeline batch '/path/to/input/file.json'
100100
```
101101

102-
or use the following command to run the pipeline api
102+
Use the following command to run the pipeline on some or all json input files in a folder
103+
104+
```zsh
105+
ENV_PATH=/user/.../dev.env pipeline batch '/path/to/input/folder/' --max-files 500
106+
```
107+
Use --max-files if you need to limit the number of files to process.
108+
109+
Use the following command to run the pipeline passing performance_data, history and preferences as separate CSV files
110+
111+
```zsh
112+
ENV_PATH=/user/.../dev.env pipeline batch_csv '/path/to/performance/data/file.csv' '/path/to/preferences/file.csv' '/path/to/history/file.csv' --performance-month {performance month i.e. 2024-05-01} --max-files 500
113+
```
114+
Use --performance-month to set the performance month for batch_csv command.
115+
116+
Use the following command to run the pipeline api
103117

104118
```zsh
105119
ENV_PATH=/user/.../dev.env pipeline web
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
import csv
2+
import re
3+
from pathlib import Path
4+
5+
import orjson
6+
7+
8+
def extract_number(filename):
    """Return the first integer that follows an underscore in *filename*.

    Used as a sort key for input files named like ``input_12.json``; files
    with no numeric part sort last (key is ``inf``).
    """
    found = re.search(r"_(\d+)", str(filename))
    return int(found.group(1)) if found else float("inf")
14+
15+
16+
# One-off "bulkup" script: flatten a directory of per-provider JSON request
# files into the three CSV files consumed by `pipeline batch_csv`.
# NOTE(review): the input directory is hard-coded — adjust before running.
file_path = Path("/home/faridsei/dev/test/2024-06-24/2024-05-01/")
input_files = sorted(file_path.glob("*.json"), key=extract_number)

# performance_data.csv: concatenate every file's Performance_data table,
# keeping the header row only from the first file.
with open("performance_data.csv", "w", newline="") as file:
    writer = csv.writer(file)
    for index, input_file in enumerate(input_files):
        input_data = orjson.loads(input_file.read_bytes())
        if index == 0:
            writer.writerows(input_data["Performance_data"])
        else:
            writer.writerows(input_data["Performance_data"][1:])


# preferences.csv: one row per provider that has Message_Format preferences.
fieldnames = [
    "staff_number",
    "Social gain",
    "Social stayed better",
    "Worsening",
    "Improving",
    "Social loss",
    "Social stayed worse",
    "Social better",
    "Social worse",
    "Social approach",
    "Goal gain",
    "Goal approach",
    "Display_Format",
]
with open("preferences.csv", "w", newline="") as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    for input_file in input_files:
        input_data = orjson.loads(input_file.read_bytes())
        if input_data["Preferences"].get("Utilities", {}).get("Message_Format", {}):
            # staff number comes from the first data row of the performance table
            preferences = {"staff_number": input_data["Performance_data"][1][0]}
            preferences.update(
                input_data["Preferences"].get("Utilities", {}).get("Message_Format", {})
            )
            # Display_Format is one-hot in the JSON; store the selected key (or None)
            preferences["Display_Format"] = next(
                (
                    k
                    for k, v in input_data["Preferences"]
                    .get("Utilities", {})
                    .get("Display_Format", {})
                    .items()
                    if v == 1
                ),
                None,
            )
            writer.writerows([preferences])

# history.csv: union of all history keys across files.
all_keys = set(["staff_number"])
for input_file in input_files:
    input_data = orjson.loads(input_file.read_bytes())
    all_keys.update(input_data["History"].keys())
# FIX: passing the raw set to DictWriter made the column order nondeterministic
# between runs; sort with staff_number first for a stable, diffable layout.
history_fieldnames = ["staff_number"] + sorted(all_keys - {"staff_number"})
with open("history.csv", "w", newline="") as file:
    writer = csv.DictWriter(file, fieldnames=history_fieldnames)
    writer.writeheader()
    for input_file in input_files:
        input_data = orjson.loads(input_file.read_bytes())
        if input_data["History"]:
            history = {"staff_number": input_data["Performance_data"][1][0]}
            history.update(input_data["History"])
            writer.writerows([history])

scaffold/bitstomach/bitstomach.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,18 +34,22 @@ def extract_signals(perf_df: pd.DataFrame) -> Graph:
3434

3535

3636
def prepare(req_info):
    """Build the performance DataFrame for one request.

    Reads the tabular ``Performance_data`` (first row is the header) from
    *req_info* and delegates to ``prepare_performance_df``.  The performance
    month comes from the request when present, but an environment-level
    ``settings.performance_month`` overrides it.
    """
    # FIX: use .get — requests without an explicit performance_month raised
    # KeyError; prepare_performance_df falls back to the data's max month.
    performance_month = req_info.get("performance_month")
    if settings.settings.performance_month:
        performance_month = settings.settings.performance_month

    performance_data = req_info["Performance_data"]
    performance_df = pd.DataFrame(performance_data[1:], columns=performance_data[0])

    return prepare_performance_df(performance_month, performance_df)
46+
47+
def prepare_performance_df(performance_month, performance_df):
4248
performance_df.attrs["staff_number"] = int(performance_df.at[0, "staff_number"])
4349

4450
performance_df["goal_comparator_content"] = performance_df["MPOG_goal"]
4551

46-
performance_df.attrs["performance_month"] = req_info.get(
47-
"performance_month", performance_df["month"].max()
48-
)
52+
performance_df.attrs["performance_month"] = performance_month if performance_month else performance_df["month"].max()
4953

5054
performance_df = performance_df[
5155
performance_df["month"] <= performance_df.attrs["performance_month"]

scaffold/cli.py

Lines changed: 108 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,24 @@
11
import os
22
import pathlib
3+
import subprocess
34
from typing import Annotated
45

56
import orjson
7+
import pandas as pd
68
import typer
7-
import uvicorn
89
from loguru import logger
910

10-
from scaffold.pipeline import run_pipeline
11-
from scaffold.startup import startup
11+
from scaffold.bitstomach.bitstomach import prepare_performance_df
12+
from scaffold.pipeline import pipeline, run_pipeline
13+
from scaffold.startup import set_preferences, startup
1214
from scaffold.utils.utils import (
1315
add_candidates,
1416
add_response,
1517
analyse_candidates,
1618
analyse_responses,
1719
extract_number,
20+
get_history,
21+
get_preferences,
1822
)
1923

2024
cli = typer.Typer(no_args_is_help=True)
@@ -96,8 +100,107 @@ def batch(
96100

97101

98102
@cli.command()
99-
def web():
100-
uvicorn.run("scaffold.api:app", reload=False, use_colors=True)
103+
@cli.command()
def batch_csv(
    performance_data_path: Annotated[
        pathlib.Path,
        typer.Argument(help="Path to a CSV file containing performance data"),
    ],
    preferences_path: Annotated[
        pathlib.Path,
        typer.Argument(help="Path to a CSV file containing the preferences"),
    ],
    history_path: Annotated[
        pathlib.Path,
        typer.Argument(help="Path to a CSV file containing the history"),
    ],
    max_files: Annotated[
        int, typer.Option("--max-files", help="Maximum number of files to process")
    ] = None,
    performance_month: Annotated[
        str, typer.Option("--performance-month", help="Performance month")
    ] = None,
    stats_only: Annotated[
        bool,
        typer.Option(
            "--stats-only",
            help="Only simulate processing; count successes and failures and additional stats",
        ),
    ] = False,
):
    """Run the pipeline once per provider found in three CSV inputs.

    Performance data, preferences and history are joined on ``staff_number``;
    unless --stats-only is given, one message JSON per provider is written to
    a ``messages/`` folder next to the performance CSV.
    """
    startup()

    all_performance_data = pd.read_csv(performance_data_path, parse_dates=["month"])
    all_preferences = pd.read_csv(preferences_path)
    all_history = pd.read_csv(history_path)

    if max_files is not None:
        # Limit to the first N distinct providers, keeping all of their rows.
        first_n_staff = (
            all_performance_data["staff_number"].drop_duplicates().head(max_files)
        )
        performance_data = all_performance_data[
            all_performance_data["staff_number"].isin(first_n_staff)
        ].reset_index(drop=True)
    else:
        # FIX: without --max-files, performance_data was never assigned and the
        # loop below raised NameError.
        performance_data = all_performance_data

    success_count = 0
    failure_count = 0
    for provider_id in performance_data["staff_number"].unique().tolist():
        try:
            preferences = set_preferences(
                get_preferences(
                    all_preferences[all_preferences["staff_number"] == provider_id]
                )
            )
            history = get_history(
                all_history[all_history["staff_number"] == provider_id]
            )

            performance_df = prepare_performance_df(
                performance_month,
                performance_data[
                    performance_data["staff_number"] == provider_id
                ].reset_index(drop=True),
            )
            result = pipeline(preferences, history, performance_df)
            if not stats_only:
                directory = performance_data_path.parent / "messages"
                os.makedirs(directory, exist_ok=True)

                new_filename = (
                    f"Provider_{provider_id} - message for {performance_month}.json"
                )
                output_path = directory / new_filename

                output_path.write_bytes(
                    orjson.dumps(result, option=orjson.OPT_INDENT_2)
                )
                logger.info(f"Message created at {output_path}")
            else:
                logger.info(f"✔ Would process: Provider_{provider_id}")

            success_count += 1

        except Exception as e:
            logger.error(f"✘ Failed to process Provider_{provider_id}: {e}")
            failure_count += 1
            # FIX: only HTTPException-like errors carry .detail; a generic
            # exception raised AttributeError inside the handler.
            result = getattr(e, "detail", str(e))

        add_response(result)
        if not stats_only:
            add_candidates(result, performance_month)

    logger.info(f"Successful: {success_count}, Failed: {failure_count}")
    analyse_responses()
    if not stats_only:
        analyse_candidates(performance_data_path.parent / "messages" / "candidates.csv")
198+
@cli.command()
def web(workers: int = 5):
    """Serve the pipeline API via the uvicorn CLI with *workers* processes."""
    command = [
        "uvicorn",
        "scaffold.api:app",
        "--workers",
        str(workers),
        "--use-colors",
    ]
    subprocess.run(command)
101204

102205

103206
if __name__ == "__main__":

scaffold/utils/graph_operations.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
from loguru import logger
66
from rdflib import Graph
77

8+
from scaffold.utils.utils import set_logger
9+
10+
set_logger()
811

912
def manifest_to_graph(manifest_path: str) -> Graph:
1013
g: Graph = Graph()

scaffold/utils/utils.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1+
import ast
12
import re
23
import sys
34

5+
import numpy as np
46
import pandas as pd
57
from loguru import logger
68

@@ -142,3 +144,53 @@ def set_logger():
142144
logger.at_least = (
143145
lambda lvl: logger.level(lvl).no >= logger.level(settings.log_level).no
144146
)
147+
148+
149+
def get_preferences(preferences_row):
    """Convert a preferences DataFrame row into the nested Preferences dict.

    Only the first row of *preferences_row* is used; returns {} when the
    frame is empty.  ``Display_Format`` is expanded into a one-hot mapping,
    every other column (except ``staff_number``) becomes a
    ``Message_Format`` entry with numpy scalars unwrapped to builtins.
    """
    if preferences_row.empty:
        return {}

    first = preferences_row.iloc[0]
    message_format = {}
    display_format = {}

    for column in preferences_row.columns:
        if column == "staff_number":
            # identifier column, not a preference
            continue
        cell = first[column]
        if column == "Display_Format":
            # one-hot encode the selected display format
            display_format = {
                "Bar chart": 0,
                "Line chart": 0,
                "Text-only": 0,
                "System-generated": "0",
            }
            display_format[cell] = 1
        else:
            if isinstance(cell, (np.float64, np.int64)):
                cell = cell.item()
            message_format[column] = cell

    return {
        "Utilities": {
            "Message_Format": message_format,
            "Display_Format": display_format,
        }
    }
177+
178+
179+
def get_history(history_row):
    """Convert a history DataFrame row into a history dict.

    Only the first row of *history_row* is used; returns {} when the frame
    is empty.  String cells are parsed with ``ast.literal_eval`` so that
    list/dict values serialized into the CSV come back as Python objects;
    strings that are not Python literals are skipped, and null cells are
    ignored.
    """
    if history_row.empty:
        return {}

    history = {}
    row = history_row.iloc[0]
    for col in history_row.columns:
        if col == "staff_number":
            continue
        value = row[col]
        if pd.isna(value):
            continue
        if isinstance(value, str):
            try:
                value = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                continue  # not a Python literal; skip (original behavior)
        elif isinstance(value, (np.floating, np.integer)):
            # FIX: literal_eval rejects non-strings, so numeric columns read
            # by pandas were silently dropped; keep them, unwrapped to builtins.
            value = value.item()
        history[col] = value

    return history

0 commit comments

Comments
 (0)