-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathspotify_data_project.py
More file actions
141 lines (89 loc) · 4.3 KB
/
spotify_data_project.py
File metadata and controls
141 lines (89 loc) · 4.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# -*- coding: utf-8 -*-
"""Spotify Data project
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/14lFzkB-evBObeLmcIAPiwXoQVT4PZOnE
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the track-level dataset; every later step of this script works on it.
df_tracks = pd.read_csv('tracks.csv')
df_tracks.head()

# Count the missing values in each column.
# NOTE(review): the original author noted "just one for each column" here;
# that is an observation about the CSV, not something the code checks.
df_tracks.isnull().sum()

# Overview of the columns (dtypes and non-null counts).
df_tracks.info()

# The 10 least popular tracks: ascending sort on `popularity`, first 10 rows.
# No genre filter is applied at this point.
sorted_df = df_tracks.sort_values('popularity', ascending=True).head(10)
sorted_df

# Summary statistics, transposed so that each column becomes a row.
df_tracks.describe().T

# Tracks whose popularity is above 90, most popular first.
most_popular = df_tracks[df_tracks['popularity'] > 90].sort_values(
    'popularity', ascending=False
)
most_popular
# Index the tracks by release date and convert that index to datetimes.
# This does not sort the rows; it only changes how they are labelled.
# NOTE(review): if `release_date` mixes year-only and full-date strings,
# pandas >= 2.0 may need an explicit `format=` here - confirm against the CSV.
df_tracks.set_index('release_date', inplace=True)
df_tracks.index = pd.to_datetime(df_tracks.index)
# `head` has to be called; the bare attribute only references the bound
# method and never produces the preview.
df_tracks.head()

# Positional lookup: the `artists` value of the row at position 18.
df_tracks[['artists']].iloc[18]
# Replace the millisecond duration column with whole seconds.
# Vectorised arithmetic is used instead of a per-row Python lambda.
# Series.round() rounds halves to even, exactly like the built-in round()
# the lambda called, and the cast keeps the integer dtype it produced.
df_tracks['duration'] = (df_tracks['duration_ms'] / 1000).round().astype('int64')
df_tracks.drop(columns='duration_ms', inplace=True)
df_tracks.duration.head()
df_tracks
# Correlation heatmap of the numeric track features.
# `key`, `mode` and `explicit` are left out. The remaining columns are
# restricted to numeric dtypes explicitly: since pandas 2.0 DataFrame.corr()
# no longer drops text columns silently and raises on them instead.
corr_df = (
    df_tracks.drop(['key', 'mode', 'explicit'], axis=1)
    .select_dtypes(include='number')
    .corr(method='pearson')
)
plt.figure(figsize=(14, 6))
heatmap = sns.heatmap(
    corr_df,
    annot=True,
    fmt='.1g',
    vmin=-1,
    vmax=1,
    center=0,
    cmap='inferno',
    linewidths=1,
    linecolor='Black',
)
heatmap.set_title("Correlation HeatMap Between Variable")
heatmap.set_xticklabels(heatmap.get_xticklabels(), rotation=90)
# Reading the map: +1 is a perfect positive linear correlation, -1 a perfect
# negative one, and values near 0 mean no linear relationship.
# NOTE(review): the original author reported a positive relationship between
# energy and loudness and a negative one between energy and acousticness;
# both are observations about the data, so confirm them on the rendered plot.
# Work on a random 0.4% sample of the tracks so the scatter plots stay readable.
sample_df = df_tracks.sample(n=int(len(df_tracks) * 0.004))
print(sample_df.shape[0])

# Regression plots for three feature pairs, drawn one figure at a time.
# Each entry is (x column, y column, point colour, title); the trailing
# comment is the relationship the original author expected to see.
for _x_col, _y_col, _colour, _title in (
    ('energy', 'loudness', 'g', 'Loudness vs Energy - Correlation'),  # positive
    ('acousticness', 'energy', 'r', 'Acousticness vs Energy - Correlation'),  # negative
    ('liveness', 'duration', 'y', 'Duration vs Liveness - Correlation'),  # none
):
    plt.figure(figsize=(10, 6))
    sns.regplot(
        data=sample_df,
        x=_x_col,
        y=_y_col,
        color=_colour,
        line_kws={"color": "Black"},
    ).set(title=_title)
# Histogram of the number of tracks per release year.
# The release dates live in the index, so they are copied into a `dates`
# column as datetimes and the `.dt` accessor extracts the year.
df_tracks['dates'] = pd.to_datetime(df_tracks.index.get_level_values('release_date'))
years = df_tracks['dates'].dt.year
sns.displot(
    years,
    kind='hist',
    discrete=True,
    height=5,
    aspect=2,
    color="c",
).set(title='Number of songs per year')
# NOTE(review): the original author read this chart as covering songs from
# 1920 onwards with the highest count around 2019 - confirm on the plot.
# Bar plot: track duration (seconds) per release year, aggregated by
# seaborn's default estimator.
total_dr = df_tracks.duration
fig_dims = (18, 7)
fig, ax = plt.subplots(figsize=fig_dims)
# The return value of `.set()` is deliberately not assigned: binding it to
# `fig` would replace the Figure handle with a list.
# errwidth is a line width, so the numeric 0 is passed rather than a bool.
# NOTE(review): seaborn 0.13 deprecates `errwidth` in favour of `err_kws`.
sns.barplot(x=years, y=total_dr, ax=ax, errwidth=0).set(title="Year vs Duration")
plt.xticks(rotation=90)

# Line plot of the same relationship on a white-grid style.
sns.set_style(style='whitegrid')
fig_dims = (10, 5)
fig, ax = plt.subplots(figsize=fig_dims)
sns.lineplot(x=years, y=total_dr, ax=ax).set(title="Year vs Duration")
plt.xticks(rotation=60)
# Additional analysis by genre, using a second dataset.
df_genre = pd.read_csv('SpotifyFeatures.csv')
df_genre.head()

# Open a fresh figure first. Without it, running this file as a script puts
# the title and bars on the axes of whichever figure was current before.
plt.figure()
plt.title("Duration of the songs in Different Genres")
# The bars use seaborn's default colours. The earlier bare
# sns.color_palette('rocket', as_cmap=True) call only returned a colormap
# and never applied it, so it has been dropped.
sns.barplot(y='genre', x='duration_ms', data=df_genre)
plt.xlabel("Duration in milli seconds")
plt.ylabel('Genres')
# Top 5 genres ranked by mean track popularity.
# The earlier version took the 10 most popular *tracks*, which did not match
# the "Top 5 Genres" title. Here popularity is averaged per genre first and
# the five highest-scoring genres are kept.
sns.set_style(style='darkgrid')
plt.figure(figsize=(10, 5))
famous = (
    df_genre.groupby('genre', as_index=False)['popularity']
    .mean()
    .sort_values('popularity', ascending=False)
    .head(5)
)
sns.barplot(y='genre', x='popularity', data=famous).set(title='Top 5 Genres by Popularity')