population db update

sayakpaul · sayakpaul · commit e45e4ebc832d · 2025-06-06T11:52:06.000+05:30
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -36,6 +36,9 @@ jobs:
           nvidia-smi
       - name: Install dependencies
         run: |
+          # apt-get rather than apt: apt's CLI is not guaranteed stable for use in scripts
+          apt-get update
+          apt-get install -y libpq-dev postgresql-client
           python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
           python -m uv pip install -e [quality,test]
           python -m uv pip install -r benchmarks/requirements.txt
diff --git a/benchmarks/populate_into_db.py b/benchmarks/populate_into_db.py
@@ -1,40 +1,31 @@
+import datetime
 import os
+import uuid
 
 import pandas as pd
 import psycopg2
 import psycopg2.extras
 
 
-FINAL_CSV_FILENAME = "benchmark_outputs/collated_results.csv"
-TABLE_NAME = "diffusers_benchmarks"
+# FINAL_CSV_FILENAME = "benchmark_outputs/collated_results.csv"
+# https://github.com/huggingface/transformers/blob/593e29c5e2a9b17baec010e8dc7c1431fed6e841/benchmark/init_db.sql#L27
+TABLE_NAME = "model_measurements"
 
 if __name__ == "__main__":
-    conn = psycopg2.connect(
-        host=os.getenv("PGHOST"),
-        database=os.getenv("PGDATABASE"),
-        user=os.getenv("PGUSER"),
-        password=os.getenv("PGPASSWORD"),
-    )
+    try:
+        conn = psycopg2.connect(
+            host=os.getenv("PGHOST"),
+            database=os.getenv("PGDATABASE"),
+            user=os.getenv("PGUSER"),
+            password=os.getenv("PGPASSWORD"),
+        )
+        print("DB connection established successfully.")
+    except Exception:
+        raise
     cur = conn.cursor()
 
-    cur.execute(f"""
-    CREATE TABLE IF NOT EXISTS {TABLE_NAME} (
-        scenario       TEXT,
-        model_cls      TEXT,
-        num_params_M   REAL,
-        flops_M        REAL,
-        time_plain_s   REAL,
-        mem_plain_GB   REAL,
-        time_compile_s REAL,
-        mem_compile_GB REAL,
-        fullgraph      BOOLEAN,
-        mode           TEXT,
-        github_sha     TEXT
-    );
-    """)
-    conn.commit()
-
-    df = pd.read_csv(FINAL_CSV_FILENAME)
+    # df = pd.read_csv(FINAL_CSV_FILENAME)
+    df = pd.read_csv("collated_results.csv")
 
     # Helper to cast values (or None) given a dtype
     def _cast_value(val, dtype: str):
@@ -64,61 +55,66 @@ def _cast_value(val, dtype: str):
 
         return val
 
-    rows_to_insert = []
-    for _, row in df.iterrows():
-        scenario = _cast_value(row.get("scenario"), "text")
-        model_cls = _cast_value(row.get("model_cls"), "text")
-        num_params_M = _cast_value(row.get("num_params_M"), "float")
-        flops_M = _cast_value(row.get("flops_M"), "float")
-        time_plain_s = _cast_value(row.get("time_plain_s"), "float")
-        mem_plain_GB = _cast_value(row.get("mem_plain_GB"), "float")
-        time_compile_s = _cast_value(row.get("time_compile_s"), "float")
-        mem_compile_GB = _cast_value(row.get("mem_compile_GB"), "float")
-        fullgraph = _cast_value(row.get("fullgraph"), "bool")
-        mode = _cast_value(row.get("mode"), "text")
-
-        # If "github_sha" column exists in the CSV, cast it; else default to None
-        if "github_sha" in df.columns:
-            github_sha = _cast_value(row.get("github_sha"), "text")
-        else:
-            github_sha = None
-
-        rows_to_insert.append(
-            (
-                scenario,
-                model_cls,
-                num_params_M,
-                flops_M,
-                time_plain_s,
-                mem_plain_GB,
-                time_compile_s,
-                mem_compile_GB,
-                fullgraph,
-                mode,
-                github_sha,
-            )
+    try:
+        rows_to_insert = []
+        # Fallback run identifier, used in benchmark_id when a row carries no github_sha
+        id_for_benchmark = str(uuid.uuid4()) + "_" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+        for _, row in df.iterrows():
+            scenario = _cast_value(row.get("scenario"), "text")
+            model_cls = _cast_value(row.get("model_cls"), "text")
+            num_params_M = _cast_value(row.get("num_params_M"), "float")
+            flops_M = _cast_value(row.get("flops_M"), "float")
+            time_plain_s = _cast_value(row.get("time_plain_s"), "float")
+            mem_plain_GB = _cast_value(row.get("mem_plain_GB"), "float")
+            time_compile_s = _cast_value(row.get("time_compile_s"), "float")
+            mem_compile_GB = _cast_value(row.get("mem_compile_GB"), "float")
+            fullgraph = _cast_value(row.get("fullgraph"), "bool")
+            mode = _cast_value(row.get("mode"), "text")
+
+            # If "github_sha" column exists in the CSV, cast it; else default to None
+            if "github_sha" in df.columns:
+                github_sha = _cast_value(row.get("github_sha"), "text")
+            else:
+                github_sha = None
+
+            if github_sha:
+                benchmark_id = f"{model_cls}-{scenario}-{github_sha}"
+            else:
+                benchmark_id = f"{model_cls}-{scenario}-{id_for_benchmark}"
+
+            measurements = {
+                "scenario": scenario,
+                "model_cls": model_cls,
+                "num_params_M": num_params_M,
+                "flops_M": flops_M,
+                "time_plain_s": time_plain_s,
+                "mem_plain_GB": mem_plain_GB,
+                "time_compile_s": time_compile_s,
+                "mem_compile_GB": mem_compile_GB,
+                "fullgraph": fullgraph,
+                "mode": mode,
+                "github_sha": github_sha,
+            }
+            # psycopg2 does not adapt a plain dict as a query parameter; wrapping it in
+            # Json serializes it for the `measurements` column.
+            rows_to_insert.append((benchmark_id, psycopg2.extras.Json(measurements)))
+
+        # Batch-insert all rows
+        insert_sql = f"""
+        INSERT INTO {TABLE_NAME} (
+            benchmark_id,
+            measurements
         )
-
-    # Batch-insert all rows (with NULL for any None)
-    insert_sql = """
-    INSERT INTO benchmarks (
-        scenario,
-        model_cls,
-        num_params_M,
-        flops_M,
-        time_plain_s,
-        mem_plain_GB,
-        time_compile_s,
-        mem_compile_GB,
-        fullgraph,
-        mode,
-        github_sha
-    )
-    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
-    """
-
-    psycopg2.extras.execute_batch(cur, insert_sql, rows_to_insert)
-    conn.commit()
-
-    cur.close()
-    conn.close()
+        VALUES (%s, %s);
+        """
+
+        psycopg2.extras.execute_batch(cur, insert_sql, rows_to_insert)
+        conn.commit()
+    except Exception as e:
+        # Failures are reported but not re-raised, as before.
+        print(f"Exception: {e}")
+    finally:
+        # Release the cursor and connection on both success and failure; closing
+        # without a commit discards any uncommitted inserts.
+        cur.close()
+        conn.close()