py-spectrabrainz/upload_to_gdrive.py at main · brain-image-library/py-spectrabrainz · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
#!/usr/bin/env python3
"""
upload_to_gdrive.py

Copyright (c) 2026 Pittsburgh Supercomputing Center (PSC),
Brain Image Library (BIL)

Author: icaoberg

Description:
    Generate an Excel report from YYYYMMDD.tsv files and upload it to Google Drive
    using rclone.

    The script performs the following steps:
        1. Find input TSV files matching the pattern YYYYMMDD.tsv
        2. Create or update 'spectrabrainz-report.xlsx'
        3. For each TSV file:
            - Load data into a worksheet named after the date
            - Sort rows by the 'completion' column (descending), if present
        4. Apply status-based row coloring using the 'state' column:
            - Completed -> green
            - Failed    -> red
            - Canceled  -> yellow
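        5. Autosize all columns for readability
        6. Add a 'Histogram of states' sheet (stacked bar chart of state counts
           per day) and move it to the first position in the workbook
        7. Upload the resulting spreadsheet to Google Drive via rclone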

Usage:
    python ./upload_to_gdrive.py

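Requirements:
    - Python 3
    - pandas
    - openpyxl
    - matplotlib (renders the state histogram)
    - Pillow (needed by openpyxl to embed the histogram image)
    - rclone installed and configured
    - TSV files named like YYYYMMDD.tsv in the working directory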

Notes:
    - The remote path is configured via RCLONE_REMOTE_PATH.
    - Existing sheets with the same name will be replaced.
"""

import io
import re
import subprocess
from pathlib import Path

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import pandas as pd
from openpyxl import load_workbook, Workbook
from openpyxl.drawing.image import Image as XLImage
from openpyxl.styles import PatternFill, Font
from openpyxl.utils import get_column_letter

RCLONE_REMOTE_PATH = "PSC:Brain_Image_Library/spectrabrainz/"


# -----------------------------------------------------------
# Find input *.tsv files
# -----------------------------------------------------------
def find_tsv_files(base_dir: Path):
    """Return the YYYYMMDD.tsv files directly inside base_dir, newest name first.

    Only regular files whose entire name is eight digits followed by '.tsv'
    are returned; sub-directories are not searched. The result is a list of
    Path objects sorted in descending order, so the most recent date string
    comes first.
    """
    # fullmatch() instead of match() + '$': '$' also matches just before a
    # trailing newline, which would let a name like '20240101.tsv\n' through.
    pattern = re.compile(r"\d{8}\.tsv")
    candidates = [
        entry
        for entry in base_dir.iterdir()
        # Test the name first so non-matching entries are never stat()ed.
        if pattern.fullmatch(entry.name) and entry.is_file()
    ]
    return sorted(candidates, reverse=True)


# -----------------------------------------------------------
# Write Excel from TSV (pandas 2.x safe)
# -----------------------------------------------------------
def write_excel_from_tsv(tsv_files, excel_path: Path):
    """Write one worksheet per TSV file into the workbook at excel_path.

    If the workbook already exists it is opened in append mode and sheets
    with the same name are replaced; otherwise a new workbook is created.
    Each sheet is named after the TSV file's stem. When a 'completion'
    column is present, rows are ordered by it (most recent first, unparsable
    values last) and the column is rewritten as 'YYYY-MM-DD HH:MM:SS' text,
    with an empty string wherever the value could not be parsed.
    """
    appending = excel_path.exists()
    if appending:
        print(f"Appending to existing Excel file: {excel_path}")
    else:
        print(f"Creating new Excel file: {excel_path}")

    writer_kwargs = {"engine": "openpyxl", "mode": "a" if appending else "w"}
    if appending:
        # Only valid in append mode: overwrite a same-named sheet.
        writer_kwargs["if_sheet_exists"] = "replace"

    with pd.ExcelWriter(excel_path, **writer_kwargs) as writer:
        for tsv_file in tsv_files:
            sheet_name = tsv_file.stem
            print(f"Processing {tsv_file.name} → sheet '{sheet_name}'")

            frame = pd.read_csv(tsv_file, sep="\t")

            if "completion" not in frame.columns:
                print("  Note: column 'completion' not found; skipping sort.")
            else:
                # Sort on parsed timestamps via a temporary helper column.
                parsed = pd.to_datetime(frame["completion"], errors="coerce")
                keyed = frame.assign(_completion_dt=parsed)
                ordered = keyed.sort_values(
                    by="_completion_dt",
                    ascending=False,
                    na_position="last",
                )
                frame = ordered.drop(columns=["_completion_dt"])

                # Store a uniform text timestamp; rows that failed to parse
                # become blank. Assignment aligns on the row index, so the
                # sorted order is kept.
                as_text = parsed.dt.strftime("%Y-%m-%d %H:%M:%S")
                frame["completion"] = as_text.where(parsed.notna(), "")

            frame.to_excel(writer, sheet_name=sheet_name, index=False)


# -----------------------------------------------------------
# Autosize columns
# -----------------------------------------------------------
def autosize_columns(ws):
    """Set each column's width to its longest cell text plus two characters.

    Every cell from row 1 to ws.max_row is inspected for each column up to
    ws.max_column; empty cells count as zero length and other values are
    measured by the length of their str() form.
    """
    row_numbers = range(1, ws.max_row + 1)

    for col_idx in range(1, ws.max_column + 1):
        widest = max(
            (
                len("" if value is None else str(value))
                for value in (
                    ws.cell(row=row, column=col_idx).value for row in row_numbers
                )
            ),
            default=0,
        )
        ws.column_dimensions[get_column_letter(col_idx)].width = widest + 2


# -----------------------------------------------------------
# Ensure workbook exists
# -----------------------------------------------------------
def ensure_workbook_exists(excel_path: Path):
    """Save a fresh, empty workbook at excel_path unless a file is already there."""
    if excel_path.exists():
        return

    print(f"Excel file missing; creating: {excel_path}")
    Workbook().save(excel_path)


# -----------------------------------------------------------
# Apply formatting (colors) + autosize
# -----------------------------------------------------------
def apply_backup_status_formatting(excel_path: Path, sheet_names):
    """Color data rows by their 'state' value and autosize each listed sheet.

    The workbook at excel_path is loaded (an empty one is created first if
    the file is missing), the sheets named in sheet_names are formatted, and
    the workbook is saved back to the same path. Names that are not in the
    workbook are skipped with a warning. A sheet without a 'state' header in
    row 1 is autosized but not colored.
    """
    print("Applying formatting and autosizing columns...")

    ensure_workbook_exists(excel_path)
    wb = load_workbook(excel_path)

    def solid(rgb):
        return PatternFill(start_color=rgb, end_color=rgb, fill_type="solid")

    # state -> (row fill, font): white text on green/red, black on yellow.
    light_text = Font(color="FFFFFF")
    dark_text = Font(color="000000")
    row_styles = {
        "Completed": (solid("228B22"), light_text),
        "Failed": (solid("B22222"), light_text),
        "Canceled": (solid("FFD700"), dark_text),
    }

    for sheet_name in sheet_names:
        if sheet_name not in wb.sheetnames:
            print(f"Warning: Sheet '{sheet_name}' missing; skipping.")
            continue

        ws = wb[sheet_name]
        print(f"Formatting sheet '{sheet_name}'")

        columns = range(1, ws.max_column + 1)

        # First header cell in row 1 whose value is exactly 'state'.
        state_col = next(
            (c for c in columns if ws.cell(row=1, column=c).value == "state"),
            None,
        )

        if state_col is None:
            print("  Warning: column 'state' not found; skipping coloring.")
        else:
            for row in range(2, ws.max_row + 1):
                style = row_styles.get(ws.cell(row=row, column=state_col).value)
                if not style:
                    # Any other state (or an empty cell) keeps default styling.
                    continue

                fill, font = style
                for c in columns:
                    cell = ws.cell(row=row, column=c)
                    cell.fill = fill
                    cell.font = font

        autosize_columns(ws)

    wb.save(excel_path)
    print("Formatting + autosizing complete.")


# -----------------------------------------------------------
# State histogram chart sheet
# -----------------------------------------------------------
# Title of the worksheet that holds the embedded histogram image.
HIST_SHEET = "Histogram of states"
# States that _count_states() tallies under their own name; any other cell
# value, including an empty cell, is tallied as "Queued".
_KNOWN_STATES = ("Completed", "Failed", "Canceled")
# Bar color of each series in the stacked chart. The first three hex values
# are the same as the row fills in apply_backup_status_formatting().
_STATE_COLORS = {
    "Completed": "#228B22",
    "Failed":    "#B22222",
    "Canceled":  "#FFD700",
    "Queued":    "#808080",
}


def _count_states(ws):
    """Tally the 'state' values of a worksheet's data rows.

    Returns a dict with one key per entry of _KNOWN_STATES plus "Queued".
    Rows 2..ws.max_row are counted; a value that is not a known state (or
    any row at all when the sheet has no 'state' header) goes to "Queued".
    """
    tally = dict.fromkeys((*_KNOWN_STATES, "Queued"), 0)

    # 1-based index of the first header cell equal to 'state', else None.
    state_col = next(
        (
            col
            for col in range(1, ws.max_column + 1)
            if ws.cell(row=1, column=col).value == "state"
        ),
        None,
    )

    for row in range(2, ws.max_row + 1):
        if state_col is None:
            value = None
        else:
            value = ws.cell(row=row, column=state_col).value

        bucket = value if value in _KNOWN_STATES else "Queued"
        tally[bucket] += 1

    return tally


def add_histogram_sheet(excel_path: Path, sheet_names):
    """Embed a stacked bar chart of per-sheet state counts as the first sheet.

    For every name in sheet_names the states of that worksheet are counted
    (a name missing from the workbook contributes zeros). The counts are
    drawn as one stacked bar per sheet, rendered to an in-memory PNG, and
    placed at cell A1 of a sheet titled HIST_SHEET. Any existing sheet with
    that title is removed first, and the new one is moved to the front
    before the workbook is saved back to excel_path.
    """
    print(f"Building '{HIST_SHEET}'...")

    wb = load_workbook(excel_path)
    series = (*_KNOWN_STATES, "Queued")

    # The x axis uses sheet_names in reverse order (main() passes the names
    # newest-first, so the chart reads chronologically).
    labels = list(reversed(sheet_names))
    per_sheet = [
        _count_states(wb[name]) if name in wb.sheetnames else dict.fromkeys(series, 0)
        for name in labels
    ]
    state_data = {state: [counts[state] for counts in per_sheet] for state in series}

    # Draw each state's bars on top of the running total of earlier states.
    positions = range(len(labels))
    fig, ax = plt.subplots(figsize=(max(10, len(labels) * 1.4), 6))

    baseline = [0] * len(labels)
    for state in series:
        heights = state_data[state]
        ax.bar(positions, heights, bottom=baseline, label=state, color=_STATE_COLORS[state])
        baseline = [base + height for base, height in zip(baseline, heights)]

    ax.set_xticks(list(positions))
    ax.set_xticklabels(labels, rotation=45, ha="right")
    ax.set_xlabel("Date")
    ax.set_ylabel("Count")
    ax.set_title("Histogram of States per Day")
    ax.legend()
    plt.tight_layout()

    # Render to memory; the buffer is handed to openpyxl as the image source.
    png = io.BytesIO()
    fig.savefig(png, format="png", dpi=150)
    png.seek(0)
    plt.close(fig)

    # Drop a chart sheet left over from an earlier run.
    if HIST_SHEET in wb.sheetnames:
        del wb[HIST_SHEET]

    chart_ws = wb.create_sheet(HIST_SHEET)
    chart_ws.add_image(XLImage(png), "A1")

    # create_sheet() appended the sheet last; shift it left to index 0.
    wb.move_sheet(HIST_SHEET, offset=1 - len(wb.sheetnames))

    wb.save(excel_path)
    print(f"Sheet '{HIST_SHEET}' added as first sheet.")


# -----------------------------------------------------------
# rclone upload
# -----------------------------------------------------------
def upload_with_rclone(excel_path: Path, remote_path: str):
    """Copy the Excel file to an rclone remote and report whether it worked.

    Runs ``rclone copy <excel_path> <remote_path> --progress``. Failures are
    printed rather than raised, so the caller is never interrupted.

    Args:
        excel_path: Local file to upload.
        remote_path: rclone destination, e.g. "remote:folder/".

    Returns:
        True if rclone exited with status 0; False if the rclone executable
        could not be started or it exited with a non-zero status.
    """
    print(f"Sorting done. Uploading '{excel_path}' → '{remote_path}'")

    # Argument list (no shell) so paths with spaces or metacharacters are safe.
    command = ["rclone", "copy", str(excel_path), remote_path, "--progress"]
    try:
        subprocess.run(command, check=True)
    except FileNotFoundError:
        print("ERROR: rclone not found.")
        return False
    except subprocess.CalledProcessError as e:
        print(f"rclone failed: exit {e.returncode}")
        return False

    print("Upload OK.")
    return True


# -----------------------------------------------------------
# Main
# -----------------------------------------------------------
def main():
    """Build the report from the TSV files in the current directory and upload it.

    Does nothing beyond printing a message when no YYYYMMDD.tsv file is found.
    """
    workdir = Path(".").resolve()

    tsv_files = find_tsv_files(workdir)
    if not tsv_files:
        print("No YYYYMMDD.tsv files found.")
        return

    report_path = workdir / "spectrabrainz-report.xlsx"
    sheet_names = [tsv.stem for tsv in tsv_files]

    # Order matters: sheets must exist before they are formatted and counted,
    # and the upload must see the finished workbook.
    write_excel_from_tsv(tsv_files, report_path)
    apply_backup_status_formatting(report_path, sheet_names)
    add_histogram_sheet(report_path, sheet_names)
    upload_with_rclone(report_path, RCLONE_REMOTE_PATH)


if __name__ == "__main__":
    main()