Full implementation of gsheet/group-attendance endpoint with a test verifying dataframe shapes.

sezavala · sezavala · commit 6032c8026753 · 2026-02-01T13:13:36.000-08:00
diff --git a/src/api.py b/src/api.py
@@ -12,6 +12,7 @@
 from src.students.withdrawal_processing.router import router as student_withdrawal_router
 
 from src.gsheet.refresh.router import router as gsheet_refresh_router
+from src.gsheet.group_attendance.router import router as gsheet_router
 from src.utils.authorization import verify_api_key
 
 api_router = APIRouter(dependencies=[Depends(verify_api_key)])
@@ -86,6 +87,13 @@
     tags=["Students"],
 )
 
+# /api/gsheet/... (group-attendance routes; the /api/gsheet/refresh/... routes are mounted separately below)
+api_router.include_router(
+    gsheet_router,
+    prefix="/gsheet",
+    tags=["GSheet"],
+)
+
 # /api/gsheet/refresh/...
 api_router.include_router(
     gsheet_refresh_router,
diff --git a/src/database/postgres/core.py b/src/database/postgres/core.py
@@ -13,7 +13,7 @@ class Base(DeclarativeBase):
 # Default False (cloud environment) TODO: Move this elsewhere that's more universal
 env_required = False
 if env_required:
-    load_dotenv(dotenv_path="./../../.env")
+    load_dotenv(dotenv_path=".env")
 
 # Engine & Session Configuration
 # Note that currently, sessions are the only way to interface with the database
diff --git a/src/gsheet/group_attendance/router.py b/src/gsheet/group_attendance/router.py
@@ -0,0 +1,46 @@
+from typing import Any, Dict, Optional
+from fastapi import APIRouter, Depends, HTTPException, status
+from sqlalchemy.exc import SQLAlchemyError
+from sqlalchemy.orm import Session
+import gspread
+from datetime import date, timedelta
+
+from src.config import settings
+from src.database.postgres.core import make_session
+from src.database.postgres.core import engine as CONN
+import src.gsheet.utils as utils
+import src.gsheet.group_attendance.service as service
+
+router = APIRouter()
+
+@router.post("/group-attendance",
+    description="Load in attendance information given a group of IDs and a date range in the associated Google Sheet",
+    response_description="Updated group attendance roster",
+    status_code=status.HTTP_201_CREATED)
+def refresh_group_attendance(spreadsheet_id: str, spreadsheet_name: str, start_date: Optional[date] = None, end_date: Optional[date] = None, db: Session = Depends(make_session)) -> Dict[str, Any]:
+    """
+    Read CTI IDs from the given worksheet, build their attendance matrix, and write it out via utils.write_to_gsheet.
+
+    A missing end_date defaults to today; a missing start_date defaults to
+    settings.default_attendance_lookback_days days before end_date.
+    Raises HTTPException(500) when a SQLAlchemyError is raised.
+    """
+    # Default each bound on its own so a caller-supplied date is never discarded.
+    if end_date is None:
+        end_date = date.today()
+    if start_date is None:
+        start_date = end_date - timedelta(days=settings.default_attendance_lookback_days)
+
+    try:
+        # Production uses utils.create_credentials() and the roster sheet key; any other environment uses a local key file and the test sheet key.
+        if settings.app_env == "production":
+            gc, key = utils.create_credentials(), settings.roster_sheet_key
+        else:
+            gc, key = gspread.service_account(filename='gspread_credentials.json'), settings.test_sheet_key
+        cti_ids = service.fetch_cti_ids_from_sheet(spreadsheet_id, spreadsheet_name, gc)
+        attendance_data = service.fetch_group_attendance(CONN, start_date, end_date, cti_ids)
+        return utils.write_to_gsheet(attendance_data, spreadsheet_name, gc, key)
+    except SQLAlchemyError as e:
+        db.rollback()
+        raise HTTPException(status_code=500, detail=str(e)) from e
+    
diff --git a/src/gsheet/group_attendance/service.py b/src/gsheet/group_attendance/service.py
@@ -0,0 +1,130 @@
+from fastapi import HTTPException
+from sqlalchemy import select, func, cast, and_, Date
+from sqlalchemy.orm import Session
+from sqlalchemy.engine import Engine
+from sqlalchemy.dialects.postgresql import array_agg
+
+from src.database.postgres.models import Attendance, StudentAttendance, StudentEmail
+import gspread
+import pandas
+import numpy as np
+from typing import List, Dict
+from datetime import date
+from src.config import settings
+
+def fetch_group_attendance(eng: Engine, start_date: date, end_date: date, cti_ids: List[int]):
+    """
+    Build an attendance matrix for the given CTI IDs over an inclusive date range.
+
+    The returned DataFrame has one row per entry in cti_ids, with the columns
+    "cti_id" and "email" followed by one "YYYY-MM-DD" column per day from
+    start_date to end_date. A day cell is "True" when the student has an
+    attendance record for a session starting that day, otherwise "False".
+    Every value is a string.
+
+    @param eng: A SQLAlchemy Engine object that connects to the database
+    @param start_date: First day of the range
+    @param end_date: Last day of the range
+    @param cti_ids: CTI IDs to report on; when empty, a frame holding only the
+        "cti_id" and "email" headers is returned
+    """
+    if not cti_ids:
+        return pandas.DataFrame(columns=["cti_id", "email"])
+
+    # 1) Build cti_id -> email mapping, defaulting to "NOT FOUND"
+    id_to_email = fetch_cti_emails(eng, cti_ids)
+
+    # 2) Build base CTI/email frame
+    cti_data = pandas.DataFrame(
+        [{"cti_id": cid, "email": id_to_email.get(cid, "NOT FOUND")} for cid in cti_ids]
+    ).set_index("cti_id")
+
+    # 3) Build date columns, initially all False
+    dates = pandas.date_range(start_date, end_date)
+    date_grid = np.zeros((len(cti_ids), len(dates)), dtype=bool)
+    pandas_grid = pandas.DataFrame(date_grid, index=cti_data.index, columns=dates)
+
+    result_grid = pandas.concat([cti_data, pandas_grid], axis=1)
+
+    # 4) Fetch attendance for those CTI IDs; session_start is cast to a date so all of end_date is included
+    session_day = cast(Attendance.session_start, Date)
+    attendance_query = (
+        select(StudentAttendance.cti_id, session_day.label("session_date"))
+        .join(Attendance, Attendance.session_id == StudentAttendance.session_id)
+        .where(and_(StudentAttendance.cti_id.in_(cti_ids), session_day.between(start_date, end_date)))
+    )
+
+    attendance_frame = pandas.read_sql(attendance_query, eng)
+    if not attendance_frame.empty:
+        attendance_frame["session_date"] = pandas.to_datetime(attendance_frame["session_date"])
+
+        # 5) Mark True where there was attendance
+        for row in attendance_frame.itertuples(index=False):
+            if row.cti_id in result_grid.index and row.session_date in result_grid.columns:
+                result_grid.loc[row.cti_id, row.session_date] = True
+
+    # From here on, ALWAYS normalize before returning
+    final_df = result_grid.reset_index()
+
+    # Normalize headers
+    final_df.columns = [
+        col.strftime("%Y-%m-%d") if hasattr(col, "strftime") else str(col)
+        for col in final_df.columns
+    ]
+
+    # Simple integer index
+    final_df.index = range(len(final_df))
+
+    # Everything as string so gspread/JSON is happy
+    return final_df.astype(str)
+
+def fetch_cti_ids_from_sheet(spreadsheet_id: str, worksheet_name: str, gc: gspread.client.Client) -> List[int]:
+    """
+    Read the integer CTI IDs listed under the "cti_id" header of a worksheet.
+
+    Headers are matched case-insensitively after stripping whitespace; blank and non-integer cells are skipped.
+    A missing header returns [] and leaves the sheet untouched; otherwise the whole worksheet is CLEARED after the read.
+
+    @param spreadsheet_id: Key of the spreadsheet to open
+    @param worksheet_name: Title of the worksheet inside that spreadsheet
+    @param gc: gspread client used to open the spreadsheet
+    """
+    worksheet = gc.open_by_key(spreadsheet_id).worksheet(worksheet_name)
+    headers = [header.strip().lower() for header in worksheet.row_values(1)]
+
+    if "cti_id" not in headers:
+        print("Column name not found")
+        return []  # falsy like the previous bare return, but honours the List[int] contract
+
+    data = []
+    for value in worksheet.col_values(headers.index("cti_id") + 1)[1:]:  # +1: sheet columns are 1-based; [1:] drops the header cell
+        if value:
+            try:
+                data.append(int(value))
+            except ValueError:
+                continue  # non-integer cell
+
+    worksheet.clear()
+    return data
+
+def fetch_cti_emails(eng: Engine, cti_ids: List[int]) -> Dict[int, str]:
+    """
+    Map each CTI ID to the email returned for it, using "NOT FOUND" where the query returns none.
+
+    @param eng: A SQLAlchemy Engine object that connects to the database
+    @param cti_ids: CTI IDs to look up
+    @return: A dict with one key per entry of cti_ids
+    """
+    # Only rows flagged is_primary are considered.
+    email_query = (
+        select(StudentEmail.cti_id, StudentEmail.email)
+        .where(and_(StudentEmail.cti_id.in_(cti_ids), StudentEmail.is_primary))
+    )
+    found = pandas.read_sql(email_query, eng)
+
+    # Start with every requested ID marked as missing, then overlay the matches.
+    lookup = {cid: "NOT FOUND" for cid in cti_ids}
+    for record in found.itertuples(index=False):
+        lookup[record.cti_id] = record.email
+
+    return lookup
diff --git a/src/gsheet/refresh/attendance/router.py b/src/gsheet/refresh/attendance/router.py
@@ -35,35 +35,3 @@ def refresh_attendance(db: Session = Depends(make_session)) -> Dict[str, Any]:
     except SQLAlchemyError as e:
         db.rollback()
         raise HTTPException(status_code=500, detail=str(e))
-
-@router.post("/group-attendance",
-    description="Load in attendance information given a group of IDs and a date range in the associated Google Sheet",
-    response_description="Updated group attendance roster",
-    status_code=status.HTTP_201_CREATED)
-def refresh_group_attendance(spreadsheet_id: str, spreadsheet_name: str, start_date: Optional[date] = None, end_date: Optional[date] = None, db: Session = Depends(make_session)) -> Dict[str, Any]:
-    """
-    Copy database records of select students from a specified Google Sheet and record them onto the same Sheet given a date range
-    """
-    try:
-        if not start_date or not end_date:
-            end_date = date.today()
-            start_date = end_date - timedelta(days=settings.default_attendance_lookback_days)
-
-        if settings.app_env == "production":
-            gc = utils.create_credentials()
-            cti_ids = service.fetch_cti_ids_from_sheet(spreadsheet_id, spreadsheet_name, gc)
-            key = settings.roster_sheet_key
-        else:
-            gc = gspread.service_account(filename='gspread_credentials.json')
-            cti_ids = service.fetch_cti_ids_from_sheet(spreadsheet_id, spreadsheet_name, gc)
-            key = settings.test_sheet_key
-
-        # pass CTI IDs into the group attendance service
-        attendance_data = service.fetch_group_attendance(CONN, start_date, end_date, cti_ids)
-
-        return utils.write_to_gsheet(attendance_data, spreadsheet_name, gc, key)
-
-    except SQLAlchemyError as e:
-        db.rollback()
-        raise HTTPException(status_code=500, detail=str(e))
-    
diff --git a/src/gsheet/refresh/attendance/service.py b/src/gsheet/refresh/attendance/service.py
@@ -1,18 +1,14 @@
 from fastapi import HTTPException
-from sqlalchemy import select, func, cast
+from sqlalchemy import select, func
 from sqlalchemy.orm import Session
 from sqlalchemy.engine import Engine
 from sqlalchemy.dialects.postgresql import array_agg
 
-from src.database.postgres.models import Attendance, StudentAttendance, StudentEmail
+from src.database.postgres.models import Attendance
 import gspread
 import pandas
-import numpy as np
-from typing import List, Dict
-from datetime import date
 from src.config import settings
 
-
 def fetch_attendance(eng: Engine):
     """
     Fetch roster from associated Accelerate tables, and return it as a pd dataframe
@@ -57,99 +53,4 @@ def fetch_attendance(eng: Engine):
         "End Date": str,
         "Processed On": str}) # Date objects not allowed
     attendance_frame = attendance_frame.fillna('') # Empty cells (na) not allowed, replaced with empty strings
-    return attendance_frame
-
-def fetch_group_attendance(eng: Engine, start_date: date, end_date: date, cti_ids: Dict[int, str]):
-    """
-    Fetch attendance records and create an attendance matrix of select cti_ids and a date range,
-    given the associated Accelerate tables
-    @param eng: A SQLAlchemy Engine object that connects to the database
-    """
-    if not cti_ids:
-        return pandas.DataFrame(columns=["cti_id", "email"])
-
-    # 1) Build cti_id -> email mapping, defaulting to "NOT FOUND"
-    id_to_email = fetch_cti_emails(eng, cti_ids)
-
-    # 2) Build base CTI/email frame
-    cti_data = pandas.DataFrame(
-        [{"cti_id": cid, "email": id_to_email.get(cid, "NOT FOUND")} for cid in cti_ids]
-    ).set_index("cti_id")
-
-    # 3) Build date columns
-    dates = pandas.date_range(start_date, end_date)
-    date_grid = np.zeros((len(cti_ids), len(dates)), dtype=bool)
-    pandas_grid = pandas.DataFrame(date_grid, index=cti_data.index, columns=dates)
-
-    result_grid = pandas.concat([cti_data, pandas_grid], axis=1)
-
-    # 4) Fetch attendance for those CTI IDs and date range
-    attendance_query = (
-        select(
-            StudentAttendance.cti_id,
-            cast(Attendance.session_start, date).label("session_date"),
-        )
-        .join(Attendance, Attendance.session_id == StudentAttendance.session_id)
-        .where(StudentAttendance.cti_id.in_(cti_ids))
-        .where(Attendance.session_start.between(start_date, end_date))
-    )
-
-    attendance_frame = pandas.read_sql(attendance_query, eng)
-    if attendance_frame.empty:
-        return result_grid
-
-    attendance_frame["session_date"] = pandas.to_datetime(attendance_frame["session_date"])
-
-    # 5) Mark True where there was attendance
-    for row in attendance_frame.itertuples(index=False):
-        # row.cti_id, row.session_date
-        if row.cti_id in result_grid.index and row.session_date in result_grid.columns:
-            result_grid.loc[row.cti_id, row.session_date] = True
-
-    result_grid.columns = result_grid.columns.map(lambda x: x.strftime('%Y-%m-%d') if hasattr(x, 'strftime') else str(x))
-
-    return result_grid
-
-def fetch_cti_ids_from_sheet(spreadsheet_id: str, worksheet_name: str, gc: gspread.client.Client) -> List[int]:
-    sh = gc.open_by_key(spreadsheet_id)
-    worksheet = sh.worksheet(worksheet_name)
-
-    headers = worksheet.row_values(1)
-    headers = [header.strip().lower() for header in headers]
-
-    try:
-        column_index = headers.index("cti_id") + 1
-    except ValueError:
-        print("Column name not found")
-        return
-    
-    column_values = worksheet.col_values(column_index)
-
-    data = []
-
-    for value in column_values[1:]:
-        if value:
-            try:
-                data.append(int(value))
-            except ValueError:
-                # Skip
-                continue
-    
-    return data
-
-def fetch_cti_emails(eng: Engine, cti_ids: List[int]) -> Dict[int, str]:
-    ids_to_email = dict.fromkeys(cti_ids, "NOT FOUND")
-
-    attendance_query = (
-        select(
-            StudentEmail.cti_id,
-            StudentEmail.email
-        )
-        .where(StudentEmail.cti_id.in_(cti_ids))
-    )
-
-    email_frame = pandas.read_sql(attendance_query, eng)
-    for row in email_frame.iterrows():
-        ids_to_email[row.cti_id] = row.email
-
-    return ids_to_email
+    return attendance_frame
diff --git a/tests/gsheet/test_gsheet.py b/tests/gsheet/test_gsheet.py