Commit b6aa3ea

fixed bigquery

1 parent f6cc095 · commit b6aa3ea

File tree: Spotify_Dashboard.py, requirements.txt

2 files changed (+86, -6 lines)


Spotify_Dashboard.py

Lines changed: 82 additions & 5 deletions
@@ -1,25 +1,102 @@
+import os
+import zipfile
+import gzip
+import shutil
+import pandas as pd
 import streamlit as st
 import pandas_gbq
 from google.oauth2 import service_account
+from kaggle.api.kaggle_api_extended import KaggleApi
+from google.cloud import storage
+from google.cloud import bigquery
 import folium
 from folium.plugins import MarkerCluster
 from streamlit_folium import st_folium
 from streamlit_extras.let_it_rain import rain
 
-# bigquery
+# ------------------ SETUP ------------------
+bucket_name = "run-sources-sipa-adv-c-alexa-giulio-us-central1"
+kaggle_dataset = "asaniczka/top-spotify-songs-in-73-countries-daily-updated"
+local_zip = "top-spotify-songs-in-73-countries-daily-updated.zip"
+local_csv = "universal_top_spotify_songs.csv"
+
 credentials = service_account.Credentials.from_service_account_info(
     st.secrets["gcp_service_account"]
 )
 project_id = st.secrets["gcp_service_account"]["project_id"]
-spotify_data = "spotify"
-table = "universal_top_spotify_songs"
+dataset_id = "spotify"
+table_id = "universal_top_spotify_songs"
+table_ref = f"{project_id}.{dataset_id}.{table_id}"
+
+# ------------------ CACHED KAGGLE → BQ UPDATE ------------------
+@st.cache_data(ttl=3600)
+def update_bigquery_from_kaggle():
+    try:
+        api = KaggleApi()
+        api.authenticate()
+
+        api.dataset_download_files(kaggle_dataset, path=".", unzip=False)
+        with zipfile.ZipFile(local_zip, 'r') as zip_ref:
+            zip_ref.extractall(".")
+
+        df = pd.read_csv(local_csv, parse_dates=['snapshot_date'])
+        latest_snapshot = df['snapshot_date'].max()
+
+        df_latest = df[
+            (df['snapshot_date'] == latest_snapshot) &
+            (df['country'].isin(['US', 'FR', 'IT', 'ES', 'MX']))
+        ]
+
+        compressed_csv = "latest_snapshot.csv.gz"
+        df_latest.to_csv(compressed_csv, index=False, compression='gzip')
+
+        storage_client = storage.Client(credentials=credentials, project=project_id)
+        bucket = storage_client.bucket(bucket_name)
+        blob = bucket.blob(compressed_csv)
+        blob.upload_from_filename(compressed_csv)
+
+        bq_client = bigquery.Client(credentials=credentials, project=project_id)
+        job_config = bigquery.LoadJobConfig(
+            source_format=bigquery.SourceFormat.CSV,
+            skip_leading_rows=1,
+            autodetect=True,
+            write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
+            compression_type=bigquery.Compression.GZIP,
+        )
+
+        uri = f"gs://{bucket_name}/{compressed_csv}"
+        load_job = bq_client.load_table_from_uri(uri, table_ref, job_config=job_config)
+        load_job.result()
+
+        os.remove(local_csv)
+        os.remove(compressed_csv)
+
+        return latest_snapshot
+
+    except Exception as e:
+        return None
+
+# ------------------ DATA LOAD ------------------
+with st.spinner("⏳ Updating dataset from Kaggle to BigQuery..."):
+    latest_snapshot = update_bigquery_from_kaggle()
+
+if latest_snapshot is None:
+    latest_date_query = f"SELECT MAX(snapshot_date) AS latest_date FROM `{table_ref}`"
+    latest_date_df = pandas_gbq.read_gbq(latest_date_query, project_id=project_id, credentials=credentials)
+    latest_snapshot = latest_date_df['latest_date'][0]
+
+st.info(f"📅 Latest data in BigQuery: {latest_snapshot}")
+
 query = f"""
 SELECT DISTINCT artists, country, name, is_explicit, speechiness, danceability, acousticness, liveness
-FROM `{project_id}.{spotify_data}.{table}`
+FROM `{table_ref}`
 WHERE country IN ('IT','US','FR','ES','MX')
-"""
+AND snapshot_date = DATE('{latest_snapshot}')
+"""
 spotify_data = pandas_gbq.read_gbq(query, project_id=project_id, credentials=credentials)
 
+# ------------------ CLEANING & VISUALS ------------------
+
 # cleaning the data
 spotify_data["artists"] = spotify_data["artists"].astype(str).str.split(", ")
 spotify_data2 = spotify_data.explode("artists")
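
One caveat on the load job above: as far as I can tell, google-cloud-bigquery's LoadJobConfig has no compression_type property (the bigquery.Compression enum belongs to extract jobs), so recent client versions will likely raise AttributeError on that call, and the broad except clause would silently turn it into the None fallback. BigQuery detects gzip-compressed CSVs loaded from Cloud Storage on its own, so the argument can simply be dropped. A minimal sketch under those assumptions, reusing credentials, project_id, table_ref, and uri from the script:

    from google.cloud import bigquery

    # Load a gzip'd CSV from GCS into BigQuery; gzip needs no explicit flag.
    bq_client = bigquery.Client(credentials=credentials, project=project_id)
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,
        autodetect=True,
        write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
    )
    load_job = bq_client.load_table_from_uri(uri, table_ref, job_config=job_config)
    load_job.result()  # blocks until the job finishes; raises on load errors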

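Two further caveats on the query path. WRITE_APPEND will re-append rows if the same snapshot is ever loaded twice, for instance after the hourly cache expires on the same day. And interpolating latest_snapshot into the SQL only works while its string form is a bare date; a pandas Timestamp renders as 'YYYY-MM-DD HH:MM:SS', which DATE() may reject. A parameterized existence check before the load would address both; a sketch, not part of the commit, assuming bq_client, table_ref, and latest_snapshot as defined above:

    from google.cloud import bigquery

    # Skip the upload/load when this snapshot is already in the table.
    check_sql = f"SELECT COUNT(*) AS n FROM `{table_ref}` WHERE snapshot_date = @snap"
    check_config = bigquery.QueryJobConfig(
        query_parameters=[
            # Table names cannot be parameterized; values such as dates can.
            bigquery.ScalarQueryParameter("snap", "DATE", latest_snapshot.date())
        ]
    )
    rows = list(bq_client.query(check_sql, job_config=check_config).result())
    already_loaded = rows[0].n > 0  # if True, skip the gzip upload and load job
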
requirements.txt

Lines changed: 4 additions & 1 deletion
@@ -14,4 +14,7 @@ pandas-gbq
 google-cloud-bigquery
 openpyxl
 folium
-streamlit_folium
+streamlit_folium
+google-cloud-storage
+google-auth
+google-crc32c
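
The new entries cover the Google Cloud side of the pipeline. The Kaggle side needs credentials as well: KaggleApi.authenticate() looks for a kaggle.json file or the KAGGLE_USERNAME/KAGGLE_KEY environment variables, and a deployed Streamlit app typically has no kaggle.json. One way to bridge that from st.secrets, assuming a hypothetical [kaggle] section in the app's secrets:

    import os
    import streamlit as st

    # Export the secrets before anything from the kaggle package is imported:
    # in the package versions I have seen, authentication runs as an
    # import-time side effect and raises if no credentials are found.
    os.environ["KAGGLE_USERNAME"] = st.secrets["kaggle"]["username"]  # hypothetical secret names
    os.environ["KAGGLE_KEY"] = st.secrets["kaggle"]["key"]

    from kaggle.api.kaggle_api_extended import KaggleApi

    api = KaggleApi()
    api.authenticate()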

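A last note on the caching strategy: st.cache_data(ttl=3600) memoizes the function's return value for an hour, so the Kaggle download and BigQuery load run at most once per hour per app process. That also means the None returned on failure is cached, leaving the app on the BigQuery fallback until the entry expires. Streamlit does not cache a run that raises, so one alternative, sketched here rather than taken from the commit, is to drop the try/except inside the function and catch at the call site:

    import streamlit as st

    @st.cache_data(ttl=3600)
    def update_bigquery_from_kaggle():
        # ...same body as in the commit, but without the try/except wrapper,
        # so a failed refresh is retried on the next rerun instead of cached.
        ...

    try:
        with st.spinner("⏳ Updating dataset from Kaggle to BigQuery..."):
            latest_snapshot = update_bigquery_from_kaggle()
    except Exception:
        latest_snapshot = None  # fall back to the newest snapshot already in BigQuery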