+import os
+import zipfile
+import gzip
+import shutil
+import pandas as pd
 import streamlit as st
 import pandas_gbq
 from google.oauth2 import service_account
+from kaggle.api.kaggle_api_extended import KaggleApi
+from google.cloud import storage
+from google.cloud import bigquery
 import folium
 from folium.plugins import MarkerCluster
 from streamlit_folium import st_folium
 from streamlit_extras.let_it_rain import rain

-# bigquery
+# ------------------ SETUP ------------------
+bucket_name = "run-sources-sipa-adv-c-alexa-giulio-us-central1"
+kaggle_dataset = "asaniczka/top-spotify-songs-in-73-countries-daily-updated"
+local_zip = "top-spotify-songs-in-73-countries-daily-updated.zip"
+local_csv = "universal_top_spotify_songs.csv"
+
 credentials = service_account.Credentials.from_service_account_info(
     st.secrets["gcp_service_account"]
 )
 project_id = st.secrets["gcp_service_account"]["project_id"]
-spotify_data = "spotify"
-table = "universal_top_spotify_songs"
+dataset_id = "spotify"
+table_id = "universal_top_spotify_songs"
+table_ref = f"{project_id}.{dataset_id}.{table_id}"
+
+# ------------------ CACHED KAGGLE → BQ UPDATE ------------------
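+# st.cache_data(ttl=3600) caches the returned snapshot date for up to an hour,
+# so the Kaggle download and BigQuery load run at most once per hour per process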
+@st.cache_data(ttl=3600)
+def update_bigquery_from_kaggle():
+    try:
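+        # authenticate() reads ~/.kaggle/kaggle.json or the KAGGLE_USERNAME /
+        # KAGGLE_KEY environment variables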
+        api = KaggleApi()
+        api.authenticate()
+
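+        # download the full dataset archive, then extract the CSV locally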
+        api.dataset_download_files(kaggle_dataset, path=".", unzip=False)
+        with zipfile.ZipFile(local_zip, 'r') as zip_ref:
+            zip_ref.extractall(".")
+
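+        # parse snapshot_date as datetime so .max() returns the newest daily snapshot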
+        df = pd.read_csv(local_csv, parse_dates=['snapshot_date'])
+        latest_snapshot = df['snapshot_date'].max()
+
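+        # keep only the newest snapshot for the five countries the app displays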
+        df_latest = df[
+            (df['snapshot_date'] == latest_snapshot) &
+            (df['country'].isin(['US', 'FR', 'IT', 'ES', 'MX']))
+        ]
+
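+        # gzip the slice so the upload to GCS and the BigQuery load move less data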
+        compressed_csv = "latest_snapshot.csv.gz"
+        df_latest.to_csv(compressed_csv, index=False, compression='gzip')
+
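+        # stage the file in GCS; BigQuery load jobs read from gs:// URIs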
+        storage_client = storage.Client(credentials=credentials, project=project_id)
+        bucket = storage_client.bucket(bucket_name)
+        blob = bucket.blob(compressed_csv)
+        blob.upload_from_filename(compressed_csv)
+
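+        # note: WRITE_APPEND adds rows on every refresh, so loading the same
+        # snapshot twice duplicates it (the DISTINCT in the query below hides
+        # this; a MERGE or WRITE_TRUNCATE would avoid it)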
+        bq_client = bigquery.Client(credentials=credentials, project=project_id)
+        job_config = bigquery.LoadJobConfig(
+            source_format=bigquery.SourceFormat.CSV,
+            skip_leading_rows=1,
+            autodetect=True,
+            write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
+            # no compression option here: LoadJobConfig has no compression_type
+            # parameter, and BigQuery detects gzip from the .gz extension
+        )
+
+        uri = f"gs://{bucket_name}/{compressed_csv}"
+        load_job = bq_client.load_table_from_uri(uri, table_ref, job_config=job_config)
+        load_job.result()
+
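+        # clean up local temp files (downloaded zip, extracted CSV, gzipped slice)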
+        os.remove(local_zip)
+        os.remove(local_csv)
+        os.remove(compressed_csv)
+
+        return latest_snapshot
+
+    except Exception as e:
+        # report the failure and let the caller fall back to existing BigQuery data
+        st.warning(f"⚠️ Kaggle → BigQuery update failed: {e}")
+        return None
+
+# ------------------ DATA LOAD ------------------
+with st.spinner("⏳ Updating dataset from Kaggle to BigQuery..."):
+    latest_snapshot = update_bigquery_from_kaggle()
+
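+# if the refresh failed (returned None), fall back to the newest snapshot
+# already in BigQuery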
+if latest_snapshot is None:
+    latest_date_query = f"SELECT MAX(snapshot_date) AS latest_date FROM `{table_ref}`"
+    latest_date_df = pandas_gbq.read_gbq(latest_date_query, project_id=project_id, credentials=credentials)
+    latest_snapshot = latest_date_df['latest_date'].iloc[0]
+
+st.info(f"📅 Latest data in BigQuery: {latest_snapshot}")
+
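+# pull one DISTINCT row per artist string, country, and track for the latest
+# snapshot; the audio features feed the visuals below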
+# DATE() needs a plain YYYY-MM-DD string; str(latest_snapshot) can carry a
+# time component, which DATE() rejects
+snapshot_str = pd.Timestamp(latest_snapshot).strftime('%Y-%m-%d')
 query = f"""
 SELECT DISTINCT artists, country, name, is_explicit, speechiness, danceability, acousticness, liveness
-FROM `{project_id}.{spotify_data}.{table}`
+FROM `{table_ref}`
 WHERE country IN ('IT','US','FR','ES','MX')
-"""
+  AND snapshot_date = DATE('{snapshot_str}')
+"""
 spotify_data = pandas_gbq.read_gbq(query, project_id=project_id, credentials=credentials)

+# ------------------ CLEANING & VISUALS ------------------
+
 # cleaning the data
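+# "artists" holds comma-separated names; splitting then exploding yields one
+# row per artist/track pair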
 spotify_data["artists"] = spotify_data["artists"].astype(str).str.split(", ")
 spotify_data2 = spotify_data.explode("artists")