Commit e0a1eb6

added data loading file
1 parent 4d15dd2 commit e0a1eb6

File tree

2 files changed: +85, -82 lines changed


Spotify_Dashboard.py

Lines changed: 1 addition & 82 deletions
@@ -1,92 +1,11 @@
-import os
-import zipfile
 import gzip
 import shutil
-import pandas as pd
 import streamlit as st
 import pandas_gbq
-from google.oauth2 import service_account
-from kaggle.api.kaggle_api_extended import KaggleApi
-from google.cloud import storage
-from google.cloud import bigquery
-import folium
 from folium.plugins import MarkerCluster
 from streamlit_folium import st_folium
 from streamlit_extras.let_it_rain import rain
-
-# setup
-bucket_name = "run-sources-sipa-adv-c-alexa-giulio-us-central1"
-kaggle_dataset = "asaniczka/top-spotify-songs-in-73-countries-daily-updated"
-local_zip = "top-spotify-songs-in-73-countries-daily-updated.zip"
-local_csv = "universal_top_spotify_songs.csv"
-
-credentials = service_account.Credentials.from_service_account_info(
-    st.secrets["gcp_service_account"]
-)
-project_id = st.secrets["gcp_service_account"]["project_id"]
-dataset_id = "spotify"
-table_id = "universal_top_spotify_songs"
-table_ref = f"{project_id}.{dataset_id}.{table_id}"
-
-# cache and big query
-@st.cache_data(ttl=3600)
-def update_bigquery_from_kaggle():
-    try:
-        api = KaggleApi()
-        api.authenticate()
-
-        api.dataset_download_files(kaggle_dataset, path=".", unzip=False)
-        with zipfile.ZipFile(local_zip, 'r') as zip_ref:
-            zip_ref.extractall(".")
-
-        df = pd.read_csv(local_csv, parse_dates=['snapshot_date'])
-        latest_snapshot = df['snapshot_date'].max()
-
-        df_latest = df[
-            (df['snapshot_date'] == latest_snapshot) &
-            (df['country'].isin(['US', 'FR', 'IT', 'ES', 'MX']))
-        ]
-
-        compressed_csv = "latest_snapshot.csv.gz"
-        df_latest.to_csv(compressed_csv, index=False, compression='gzip')
-
-        storage_client = storage.Client(credentials=credentials, project=project_id)
-        bucket = storage_client.bucket(bucket_name)
-        blob = bucket.blob(compressed_csv)
-        blob.upload_from_filename(compressed_csv)
-
-        bq_client = bigquery.Client(credentials=credentials, project=project_id)
-        job_config = bigquery.LoadJobConfig(
-            source_format=bigquery.SourceFormat.CSV,
-            skip_leading_rows=1,
-            autodetect=True,
-            write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
-            compression_type=bigquery.Compression.GZIP,
-        )
-
-        uri = f"gs://{bucket_name}/{compressed_csv}"
-        load_job = bq_client.load_table_from_uri(uri, table_ref, job_config=job_config)
-        load_job.result()
-
-        os.remove(local_csv)
-        os.remove(compressed_csv)
-
-        return latest_snapshot
-
-    except Exception as e:
-        return None
-
-# loading data
-#with st.spinner("⏳ Updating dataset from Kaggle to BigQuery..."):
-    #latest_snapshot = update_bigquery_from_kaggle()
-latest_snapshot = update_bigquery_from_kaggle()
-
-if latest_snapshot is None:
-    latest_date_query = f"SELECT MAX(snapshot_date) AS latest_date FROM `{table_ref}`"
-    latest_date_df = pandas_gbq.read_gbq(latest_date_query, project_id=project_id, credentials=credentials)
-    latest_snapshot = latest_date_df['latest_date'][0]
-
-#st.info(f"📅 Latest data in BigQuery: {latest_snapshot}")
+from Spotify_Data_Load import table_ref, latest_snapshot, project_id, credentials, folium
 
 query = f"""
 SELECT DISTINCT artists, country, name, is_explicit, speechiness, danceability, acousticness, liveness

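With this change, Spotify_Dashboard.py keeps only the query and visualization code and pulls its connection objects from the new module. A minimal sketch of how the truncated query above could be completed and run through pandas_gbq; everything beyond the column list shown in the diff is an assumption for illustration, not part of this commit:

import pandas_gbq
from Spotify_Data_Load import table_ref, project_id, credentials

# Hypothetical completion of the dashboard query (column list taken from the diff above).
query = f"""
SELECT DISTINCT artists, country, name, is_explicit, speechiness, danceability, acousticness, liveness
FROM `{table_ref}`
"""
df = pandas_gbq.read_gbq(query, project_id=project_id, credentials=credentials)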
Spotify_Data_Load.py

Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
+from google.oauth2 import service_account
+from kaggle.api.kaggle_api_extended import KaggleApi
+from google.cloud import storage
+from google.cloud import bigquery
+import folium
+import streamlit as st
+import pandas as pd
+import os
+import zipfile
+import pandas_gbq
+
+# setup
+bucket_name = "run-sources-sipa-adv-c-alexa-giulio-us-central1"
+kaggle_dataset = "asaniczka/top-spotify-songs-in-73-countries-daily-updated"
+local_zip = "top-spotify-songs-in-73-countries-daily-updated.zip"
+local_csv = "universal_top_spotify_songs.csv"
+
+credentials = service_account.Credentials.from_service_account_info(
+    st.secrets["gcp_service_account"]
+)
+project_id = st.secrets["gcp_service_account"]["project_id"]
+dataset_id = "spotify"
+table_id = "universal_top_spotify_songs"
+table_ref = f"{project_id}.{dataset_id}.{table_id}"
+
+# cache and big query
+@st.cache_data(ttl=3600)
+def update_bigquery_from_kaggle():
+    try:
+        api = KaggleApi()
+        api.authenticate()
+
+        api.dataset_download_files(kaggle_dataset, path=".", unzip=False)
+        with zipfile.ZipFile(local_zip, 'r') as zip_ref:
+            zip_ref.extractall(".")
+
+        df = pd.read_csv(local_csv, parse_dates=['snapshot_date'])
+        latest_snapshot = df['snapshot_date'].max()
+
+        df_latest = df[
+            (df['snapshot_date'] == latest_snapshot) &
+            (df['country'].isin(['US', 'FR', 'IT', 'ES', 'MX']))
+        ]
+
+        compressed_csv = "latest_snapshot.csv.gz"
+        df_latest.to_csv(compressed_csv, index=False, compression='gzip')
+
+        storage_client = storage.Client(credentials=credentials, project=project_id)
+        bucket = storage_client.bucket(bucket_name)
+        blob = bucket.blob(compressed_csv)
+        blob.upload_from_filename(compressed_csv)
+
+        bq_client = bigquery.Client(credentials=credentials, project=project_id)
+        job_config = bigquery.LoadJobConfig(
+            source_format=bigquery.SourceFormat.CSV,
+            skip_leading_rows=1,
+            autodetect=True,
+            write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
+            compression_type=bigquery.Compression.GZIP,
+        )
+
+        uri = f"gs://{bucket_name}/{compressed_csv}"
+        load_job = bq_client.load_table_from_uri(uri, table_ref, job_config=job_config)
+        load_job.result()
+
+        os.remove(local_csv)
+        os.remove(compressed_csv)
+
+        return latest_snapshot
+
+    except Exception as e:
+        return None
+
+# loading data
+#with st.spinner("⏳ Updating dataset from Kaggle to BigQuery..."):
+    #latest_snapshot = update_bigquery_from_kaggle()
+latest_snapshot = update_bigquery_from_kaggle()
+
+if latest_snapshot is None:
+    latest_date_query = f"SELECT MAX(snapshot_date) AS latest_date FROM `{table_ref}`"
+    latest_date_df = pandas_gbq.read_gbq(latest_date_query, project_id=project_id, credentials=credentials)
+    latest_snapshot = latest_date_df['latest_date'][0]
+
+#st.info(f"📅 Latest data in BigQuery: {latest_snapshot}")

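Because the download-and-load logic runs at module level, any Streamlit page that imports Spotify_Data_Load triggers the refresh, and @st.cache_data(ttl=3600) limits it to at most one Kaggle download per hour. A small sketch of how a page could force an earlier refresh by clearing that cache; the sidebar button is an assumption for illustration, not part of this commit:

import streamlit as st
from Spotify_Data_Load import update_bigquery_from_kaggle

# Hypothetical manual-refresh control; functions decorated with st.cache_data expose .clear().
if st.sidebar.button("Force refresh from Kaggle"):
    update_bigquery_from_kaggle.clear()              # drop the cached snapshot date
    latest_snapshot = update_bigquery_from_kaggle()  # re-run the Kaggle -> GCS -> BigQuery load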
Comments (0)