+import os
+import zipfile
+import gzip
+import shutil
+import pandas as pd
 import streamlit as st
 import pandas_gbq
 from google.oauth2 import service_account
+from kaggle.api.kaggle_api_extended import KaggleApi
+from google.cloud import storage
+from google.cloud import bigquery
 import folium
 from folium.plugins import MarkerCluster
 from streamlit_folium import st_folium
 from streamlit_extras.let_it_rain import rain

-# bigquery
+# ------------------ SETUP ------------------
+bucket_name = "run-sources-sipa-adv-c-alexa-giulio-us-central1"
+kaggle_dataset = "asaniczka/top-spotify-songs-in-73-countries-daily-updated"
+local_zip = "top-spotify-songs-in-73-countries-daily-updated.zip"
+local_csv = "universal_top_spotify_songs.csv"
+
 credentials = service_account.Credentials.from_service_account_info(
     st.secrets["gcp_service_account"]
 )
 project_id = st.secrets["gcp_service_account"]["project_id"]
-spotify_data = "spotify"
-table = "universal_top_spotify_songs"
+dataset_id = "spotify"
+table_id = "universal_top_spotify_songs"
+table_ref = f"{project_id}.{dataset_id}.{table_id}"
+
+# ------------------ CACHED KAGGLE → BQ UPDATE ------------------
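+# st.cache_data(ttl=3600) caches the returned snapshot date for up to an hour,
+# so the Kaggle download and BigQuery load run at most once per hour per process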
+@st.cache_data(ttl=3600)
+def update_bigquery_from_kaggle():
+    try:
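+        # authenticate() reads ~/.kaggle/kaggle.json or the KAGGLE_USERNAME /
+        # KAGGLE_KEY environment variables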
+        api = KaggleApi()
+        api.authenticate()
+
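+        # download the full dataset archive, then extract the CSV locally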
+        api.dataset_download_files(kaggle_dataset, path=".", unzip=False)
+        with zipfile.ZipFile(local_zip, 'r') as zip_ref:
+            zip_ref.extractall(".")
+
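+        # parse snapshot_date as datetime so .max() returns the newest daily snapshot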
+        df = pd.read_csv(local_csv, parse_dates=['snapshot_date'])
+        latest_snapshot = df['snapshot_date'].max()
+
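+        # keep only the newest snapshot for the five countries the app displays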
+        df_latest = df[
+            (df['snapshot_date'] == latest_snapshot) &
+            (df['country'].isin(['US', 'FR', 'IT', 'ES', 'MX']))
+        ]
+
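+        # gzip the slice so the upload to GCS and the BigQuery load move less data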
+        compressed_csv = "latest_snapshot.csv.gz"
+        df_latest.to_csv(compressed_csv, index=False, compression='gzip')
+
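+        # stage the file in GCS; BigQuery load jobs read from gs:// URIs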
+        storage_client = storage.Client(credentials=credentials, project=project_id)
+        bucket = storage_client.bucket(bucket_name)
+        blob = bucket.blob(compressed_csv)
+        blob.upload_from_filename(compressed_csv)
+
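+        # note: WRITE_APPEND adds rows on every refresh, so loading the same
+        # snapshot twice duplicates it (the DISTINCT in the query below hides
+        # this; a MERGE or WRITE_TRUNCATE would avoid it)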
+        bq_client = bigquery.Client(credentials=credentials, project=project_id)
+        job_config = bigquery.LoadJobConfig(
+            source_format=bigquery.SourceFormat.CSV,
+            skip_leading_rows=1,
+            autodetect=True,
+            write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
+            # no compression option here: LoadJobConfig has no compression_type
+            # parameter, and BigQuery detects gzip from the .gz extension
+        )
+
+        uri = f"gs://{bucket_name}/{compressed_csv}"
+        load_job = bq_client.load_table_from_uri(uri, table_ref, job_config=job_config)
+        load_job.result()
+
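+        # clean up local temp files (downloaded zip, extracted CSV, gzipped slice)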
+        os.remove(local_zip)
+        os.remove(local_csv)
+        os.remove(compressed_csv)
+
+        return latest_snapshot
+
+    except Exception as e:
+        # report the failure and let the caller fall back to existing BigQuery data
+        st.warning(f"⚠️ Kaggle → BigQuery update failed: {e}")
+        return None
+
+# ------------------ DATA LOAD ------------------
+with st.spinner("⏳ Updating dataset from Kaggle to BigQuery..."):
+    latest_snapshot = update_bigquery_from_kaggle()
+
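+# if the refresh failed (returned None), fall back to the newest snapshot
+# already in BigQuery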
+if latest_snapshot is None:
+    latest_date_query = f"SELECT MAX(snapshot_date) AS latest_date FROM `{table_ref}`"
+    latest_date_df = pandas_gbq.read_gbq(latest_date_query, project_id=project_id, credentials=credentials)
+    latest_snapshot = latest_date_df['latest_date'].iloc[0]
+
+st.info(f"📅 Latest data in BigQuery: {latest_snapshot}")
+
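+# pull one DISTINCT row per artist string, country, and track for the latest
+# snapshot; the audio features feed the visuals below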
+# DATE() needs a plain YYYY-MM-DD string; str(latest_snapshot) can carry a
+# time component, which DATE() rejects
+snapshot_str = pd.Timestamp(latest_snapshot).strftime('%Y-%m-%d')
 query = f"""
 SELECT DISTINCT artists, country, name, is_explicit, speechiness, danceability, acousticness, liveness
-FROM `{project_id}.{spotify_data}.{table}`
+FROM `{table_ref}`
 WHERE country IN ('IT','US','FR','ES','MX')
-"""
+  AND snapshot_date = DATE('{snapshot_str}')
+"""
 spotify_data = pandas_gbq.read_gbq(query, project_id=project_id, credentials=credentials)

+# ------------------ CLEANING & VISUALS ------------------
+
 # cleaning the data
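+# "artists" holds comma-separated names; splitting then exploding yields one
+# row per artist/track pair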
 spotify_data["artists"] = spotify_data["artists"].astype(str).str.split(", ")
 spotify_data2 = spotify_data.explode("artists")