NMAquiferMigrationStats/copy_amp_review_data.py at master · DataIntegrationGroup/NMAquiferMigrationStats · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
from __future__ import print_function
import os
from googleapiclient.discovery import build
from google.oauth2.service_account import Credentials

# --- CONFIGURATION ---
SPREADSHEET_ID = "1iQzeKqRWHIKbnNptH_wRQEpJ_pt1rI00ax9d5BhDAhU"
SOURCE_SHEET = "Copy of AMP_review"  # where we copy FROM
TARGET_SHEET = "AMP_review"          # where we copy TO

# Columns used to match rows
MATCH_COLS = ["NMAquifer_Table.Field", "PointID", "Error"]

# Columns to copy from source to target
COPY_COLS = ["Reviewed (yes/no)", "Fixed (yes/no)", "AMP_Reviewer", "Notes"]

# Path to your service account JSON key (adjust as needed)
SERVICE_ACCOUNT_FILE = "transfermetrics_service_account.json"

SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]


def col_index_to_a1(col_index_0_based):
    """
    Convert a 0-based column index (0 -> A, 1 -> B, ...) to A1-style column letters.
    """
    col = col_index_0_based + 1  # make it 1-based
    result = ""
    while col > 0:
        col, remainder = divmod(col - 1, 26)
        result = chr(65 + remainder) + result
    return result


def get_service():
    """
    Build and return the Sheets API service using a service account.
    """
    creds = Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=SCOPES
    )
    service = build("sheets", "v4", credentials=creds)
    return service


def fetch_sheet_values(service, sheet_name):
    """
    Fetch all values from a sheet (A1:ZZZ range) and return as list of lists.
    """
    range_name = f"'{sheet_name}'!A1:ZZZ"
    result = (
        service.spreadsheets()
        .values()
        .get(spreadsheetId=SPREADSHEET_ID, range=range_name)
        .execute()
    )
    values = result.get("values", [])
    return values


def build_header_index_map(header_row):
    """
    Given a header row (list of column names), return a dict: name -> index.
    """
    return {name: idx for idx, name in enumerate(header_row)}


def main():
    service = get_service()

    # --- Fetch both sheets ---
    source_values = fetch_sheet_values(service, SOURCE_SHEET)
    target_values = fetch_sheet_values(service, TARGET_SHEET)

    if not source_values:
        print(f"No data found in {SOURCE_SHEET}.")
        return
    if not target_values:
        print(f"No data found in {TARGET_SHEET}.")
        return

    # First row assumed to be headers
    source_header = source_values[0]
    target_header = target_values[0]

    source_header_map = build_header_index_map(source_header)
    target_header_map = build_header_index_map(target_header)

    # Ensure all required columns exist
    for col in MATCH_COLS + COPY_COLS:
        if col not in source_header_map:
            raise ValueError(f"Column '{col}' not found in source sheet header.")
    for col in MATCH_COLS + COPY_COLS:
        if col not in target_header_map:
            raise ValueError(f"Column '{col}' not found in target sheet header.")

    # --- Build lookup from SOURCE SHEET keyed by (NMAquifer_Table.Field, PointID, Error) ---
    source_lookup = {}
    for row in source_values[1:]:  # skip header
        row_extended = row + [""] * (len(source_header) - len(row))
        key = tuple(row_extended[source_header_map[col]] for col in MATCH_COLS)

        copy_data = {col: row_extended[source_header_map[col]] for col in COPY_COLS}
        # If duplicate keys exist, the last one wins; you could log this if it matters
        source_lookup[key] = copy_data

    print(f"Built source lookup with {len(source_lookup)} unique keys.")

    # --- Prepare batch update for TARGET SHEET ---
    data_updates = []
    matched_count = 0
    updated_rows_count = 0
    updated_keys = []

    # Precompute min/max col indices for the span we’ll write per row
    copy_col_indices = [target_header_map[c] for c in COPY_COLS]
    min_copy_idx = min(copy_col_indices)
    max_copy_idx = max(copy_col_indices)

    # Row index in A1 is 1-based; target_values[0] is header, so row 2 is first data row
    for i, row in enumerate(target_values[1:], start=2):
        row_extended = row + [""] * (len(target_header) - len(row))
        key = tuple(row_extended[target_header_map[col]] for col in MATCH_COLS)

        if key in source_lookup:
            matched_count += 1
            src_vals = source_lookup[key]

            # Capture old & new values for the COPY_COLS
            old_vals = {}
            new_vals = {}
            changed = False

            for col_name in COPY_COLS:
                col_idx = target_header_map[col_name]
                current_val = row_extended[col_idx]
                new_val = src_vals[col_name]

                old_vals[col_name] = current_val
                new_vals[col_name] = new_val

                if current_val != new_val:
                    changed = True
                    row_extended[col_idx] = new_val  # update in memory

            if changed:
                updated_rows_count += 1
                updated_keys.append(key)

                # Build the range for the span of columns from min_copy_idx to max_copy_idx
                start_col_letter = col_index_to_a1(min_copy_idx)
                end_col_letter = col_index_to_a1(max_copy_idx)
                cell_range = f"'{TARGET_SHEET}'!{start_col_letter}{i}:{end_col_letter}{i}"

                # Values for that entire span (we preserve intermediate columns)
                span_values = row_extended[min_copy_idx : max_copy_idx + 1]

                data_updates.append(
                    {
                        "range": cell_range,
                        "values": [span_values],
                    }
                )

                # Log details for this row
                print(f"\nUpdating row {i} (key={key}):")
                print("  Old values:")
                for c in COPY_COLS:
                    print(f"    {c}: {old_vals[c]!r}")
                print("  New values:")
                for c in COPY_COLS:
                    print(f"    {c}: {new_vals[c]!r}")

    print(f"\nTotal matches found: {matched_count}")
    print(f"Rows with changes to apply: {updated_rows_count}")

    # --- Execute batch update ---
    if not data_updates:
        print("No matching rows with changes. Nothing to update.")
        return

    body = {
        "valueInputOption": "USER_ENTERED",
        "data": data_updates,
    }

    result = (
        service.spreadsheets()
        .values()
        .batchUpdate(
            spreadsheetId=SPREADSHEET_ID,
            body=body,
        )
        .execute()
    )

    total_cells = sum(len(r["values"]) * len(r["values"][0]) for r in data_updates)
    print(f"\nUpdate complete. {total_cells} cells updated across {updated_rows_count} rows.")


if __name__ == "__main__":
    main()