Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Data Analysis/olympics_Analysis/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<h2>olympics-data-analysis-web-app</h2>

A Streamlit web application for analyzing the Olympics dataset.

Dataset Link: https://www.kaggle.com/heesoo37/120-years-of-olympic-history-athletes-and-results

Live Link: https://olympicdatanalysis.streamlit.app/
189 changes: 189 additions & 0 deletions Data Analysis/olympics_Analysis/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
import streamlit as st
import pandas as pd
import preprocessor,helper
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff

##def main():

@st.cache_data
def load_data():
    """Read both CSV files and return the preprocessed athlete DataFrame.

    Streamlit re-executes this script from the top on every widget
    interaction; caching keeps the CSV parsing and preprocessing from being
    repeated on each rerun.
    """
    events = pd.read_csv('athlete_events.csv')
    regions = pd.read_csv('noc_regions.csv')
    return preprocessor.preprocess(events, regions)


df = load_data()

# Top-level navigation: the chosen entry decides which section below renders.
user_menu = st.sidebar.radio(
    'Select an Option',
    ('Medal Tally', 'Overall Analysis', 'Country-wise Analysis', 'Athlete wise Analysis')
)

st.sidebar.title("Olympics Analysis")
st.sidebar.image('https://e7.pngegg.com/pngimages/1020/402/png-clipart-2024-summer-olympics-brand-circle-area-olympic-rings-olympics-logo-text-sport.png')
# "Medal Tally" page: filter the tally by year and/or country from the sidebar.
if user_menu == 'Medal Tally':
    st.sidebar.header("Medal Tally")
    years, country = helper.country_year_list(df)

    selected_year = st.sidebar.selectbox("Select Year", years)
    selected_country = st.sidebar.selectbox("Select Country", country)

    medal_tally = helper.fetch_medal_tally(df, selected_year, selected_country)

    # Exactly one title is shown. The sentinel is the capitalised 'Overall'
    # that helper.country_year_list puts at the head of both lists.
    all_years = selected_year == 'Overall'
    all_countries = selected_country == 'Overall'
    if all_years and all_countries:
        st.title("Overall Tally")
    elif all_countries:
        st.title("Medal Tally in " + str(selected_year) + " Olympics ")
    elif all_years:
        st.title(selected_country + " Overall Performance ")
    else:
        st.title(selected_country + " Performance in " + str(selected_year) + " Olympics ")

    st.table(medal_tally)

# "Overall Analysis" page: headline counts, trends over time, an events
# heatmap and the most decorated athletes.
if user_menu == 'Overall Analysis':
    # NOTE(review): the "- 1" presumably discounts one edition that should not
    # be counted as a regular Games — confirm against the dataset.
    editions = df['Year'].unique().shape[0] - 1
    cities = df['City'].unique().shape[0]
    sports = df['Sport'].unique().shape[0]
    events = df['Event'].unique().shape[0]
    athletes = df['Name'].unique().shape[0]
    nations = df['region'].unique().shape[0]

    st.title(" Top Statistics")
    # Two rows of three metrics; each cell is a header followed by its value.
    stat_rows = (
        ((" Edition ", editions), (" Hosts ", cities), (" Sports ", sports)),
        ((" Events ", events), (" Nations ", nations), (" Athletes ", athletes)),
    )
    for stat_row in stat_rows:
        for column, (label, value) in zip(st.columns(3), stat_row):
            with column:
                st.header(label)
                st.title(value)

    # One line chart per column: distinct values of that column per edition.
    trends = (
        ('region', " Participating Nations over the years "),
        ('Event', " Events over the years "),
        ('Name', " Athletes over the years "),
    )
    for column_name, heading in trends:
        over_time = helper.data_over_time(df, column_name)
        fig = px.line(over_time, x='Edition', y=column_name)
        st.title(heading)
        st.plotly_chart(fig)

    st.title(" No. of Events over time(Every Sport)")
    fig, ax = plt.subplots(figsize=(20, 20))
    unique_events = df.drop_duplicates(['Year', 'Sport', 'Event'])
    events_per_sport = unique_events.pivot_table(
        index='Sport', columns='Year', values='Event', aggfunc='count'
    ).fillna(0).astype('int')
    # Draw on the axes created above rather than on pyplot's implicit current axes.
    sns.heatmap(events_per_sport, annot=True, ax=ax)
    st.pyplot(fig)
    # pyplot keeps every figure alive until it is closed; without this the
    # figures pile up across script reruns.
    plt.close(fig)

    st.title("Most Successful Athletes")
    sport_list = ['Overall'] + sorted(df['Sport'].unique().tolist())

    selected_sport = st.selectbox("Select a Sport ", sport_list)
    x = helper.most_successful(df, selected_sport)
    st.table(x)

# "Country-wise Analysis" page: medal trend, sport/year heatmap and top
# athletes for one selected country.
if user_menu == 'Country-wise Analysis':

    st.sidebar.title('Country-wise Analysis')

    country_list = sorted(df['region'].dropna().unique().tolist())

    selected_country = st.sidebar.selectbox('Select a Country', country_list)

    country_df = helper.yearwise_medal_tally(df, selected_country)
    fig = px.line(country_df, x='Year', y='Medal')
    st.title(selected_country + " Medal Tally over the years ")
    st.plotly_chart(fig)

    st.title(selected_country + " excels in the following sports")
    pt = helper.country_event_heatmap(df, selected_country)
    if pt.empty:
        # A country without any medal gives an empty pivot table, which
        # seaborn cannot draw (it raises on a zero-size array).
        st.info("No medals recorded for " + selected_country + ".")
    else:
        fig, ax = plt.subplots(figsize=(20, 20))
        # Draw on the axes created above rather than on the implicit current axes.
        sns.heatmap(pt, annot=True, ax=ax)
        st.pyplot(fig)
        # Release the figure so figures do not accumulate across reruns.
        plt.close(fig)

    st.title(" Top 10 athletes " + selected_country)
    top10_df = helper.most_successful_countrywise(df, selected_country)
    st.table(top10_df)

# "Athlete wise Analysis" page: age distributions, height/weight scatter and
# male/female participation over time.
if user_menu == 'Athlete wise Analysis':
    # Keep one row per (Name, region) pair so each athlete is counted once.
    athlete_df = df.drop_duplicates(subset=['Name', 'region'])

    x1 = athlete_df['Age'].dropna()
    x2 = athlete_df[athlete_df['Medal'] == 'Gold']['Age'].dropna()
    x3 = athlete_df[athlete_df['Medal'] == 'Silver']['Age'].dropna()
    x4 = athlete_df[athlete_df['Medal'] == 'Bronze']['Age'].dropna()

    fig = ff.create_distplot(
        [x1, x2, x3, x4],
        ['Overall Age', 'Gold Medalist', 'Silver Medalist', 'Bronze Medalist'],
        show_hist=False,
        show_rug=False,
    )

    fig.update_layout(autosize=False, width=1000, height=600)
    st.plotly_chart(fig)

    x = []
    name = []
    famous_sports = ['Basketball', 'Judo', 'Football', 'Tug-Of-War', 'Athletics',
                     'Swimming', 'Badminton', 'Sailing', 'Gymnastics',
                     'Art Competitions', 'Handball', 'Weightlifting', 'Wrestling',
                     'Water Polo', 'Hockey', 'Rowing', 'Fencing',
                     'Shooting', 'Boxing', 'Taekwondo', 'Cycling', 'Diving', 'Canoeing',
                     'Tennis', 'Golf', 'Softball', 'Archery',
                     'Volleyball', 'Synchronized Swimming', 'Table Tennis', 'Baseball',
                     'Rhythmic Gymnastics', 'Rugby Sevens',
                     'Beach Volleyball', 'Triathlon', 'Rugby', 'Polo', 'Ice Hockey']
    for sport in famous_sports:
        temp_df = athlete_df[athlete_df['Sport'] == sport]
        gold_ages = temp_df[temp_df['Medal'] == 'Gold']['Age'].dropna()
        # A density curve needs at least two distinct values; skip sports
        # that cannot provide them instead of failing the whole chart.
        if gold_ages.nunique() > 1:
            x.append(gold_ages)
            name.append(sport)

    fig = ff.create_distplot(x, name, show_hist=False, show_rug=False)
    fig.update_layout(autosize=False, width=1000, height=600)
    st.title("Distribution of Age wrt Sports(Gold Medalist)")
    st.plotly_chart(fig)

    sport_list = ['Overall'] + sorted(df['Sport'].unique().tolist())

    st.title("Height Vs Weight ")
    selected_sport = st.selectbox(' Select a Sport ', sport_list)
    temp_df = helper.weight_v_height(df, selected_sport)
    fig, ax = plt.subplots()
    # Draw on the axes created above rather than on the implicit current axes.
    sns.scatterplot(x='Weight', y='Height', data=temp_df,
                    hue=temp_df['Medal'], style=temp_df['Sex'], s=60, ax=ax)

    st.pyplot(fig)
    # Release the figure so figures do not accumulate across reruns.
    plt.close(fig)

    st.title(' Men VS Women Participation over the years')
    final = helper.men_vs_women(df)
    fig = px.line(final, x='Year', y=["Male", "Female"])
    fig.update_layout(autosize=False, width=1000, height=600)
    st.plotly_chart(fig)





124 changes: 124 additions & 0 deletions Data Analysis/olympics_Analysis/helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import numpy as np


def fetch_medal_tally(df, year, country):
    """Return the Gold/Silver/Bronze/total tally for a year/country filter.

    Parameters
    ----------
    df : DataFrame with at least the columns Team, NOC, Games, Year, City,
        Sport, Event, Medal, region, Gold, Silver and Bronze.
    year : the string 'Overall' for every edition, otherwise a value
        convertible with ``int``.
    country : the string 'Overall' for every region, otherwise a region name.

    Returns
    -------
    DataFrame with one row per Year when a single country is viewed across
    all years, otherwise one row per region sorted by Gold descending.
    Columns: the grouping key, Gold, Silver, Bronze, total (all int).
    """
    medal_cols = ['Gold', 'Silver', 'Bronze']

    # Rows identical on these columns collapse to one, so a medal shared by
    # several rows of the same team/event is counted once.
    medal_df = df.drop_duplicates(
        subset=['Team', 'NOC', 'Games', 'Year', 'City', 'Sport', 'Event', 'Medal']
    )

    all_years = year == 'Overall'
    all_countries = country == 'Overall'

    temp_df = medal_df
    if not all_years:
        temp_df = temp_df[temp_df['Year'] == int(year)]
    if not all_countries:
        temp_df = temp_df[temp_df['region'] == country]

    # Select the medal columns *before* summing so the string columns are
    # never aggregated.
    if all_years and not all_countries:
        # groupby sorts its keys, so rows come out in ascending Year order.
        x = temp_df.groupby('Year')[medal_cols].sum().reset_index()
    else:
        x = (
            temp_df.groupby('region')[medal_cols]
            .sum()
            .sort_values('Gold', ascending=False)
            .reset_index()
        )

    x['total'] = x['Gold'] + x['Silver'] + x['Bronze']
    for col in medal_cols + ['total']:
        x[col] = x[col].astype('int')

    return x

def medal_tally(df):
    """Return the all-time medal tally per region, sorted by Gold descending.

    ``df`` needs the columns Team, NOC, Games, Year, City, Sport, Event,
    Medal, region, Gold, Silver and Bronze. The result has the columns
    region, Gold, Silver, Bronze and total, all medal counts as int.
    """
    medal_cols = ['Gold', 'Silver', 'Bronze']

    # Rows identical on these columns collapse to one, so a medal shared by
    # several rows of the same team/event is counted once.
    deduped = df.drop_duplicates(
        subset=['Team', 'NOC', 'Games', 'Year', 'City', 'Sport', 'Event', 'Medal']
    )

    # Pick the medal columns before summing so string columns are not aggregated.
    tally = (
        deduped.groupby('region')[medal_cols]
        .sum()
        .sort_values('Gold', ascending=False)
        .reset_index()
    )

    tally['total'] = tally['Gold'] + tally['Silver'] + tally['Bronze']
    for col in medal_cols + ['total']:
        tally[col] = tally[col].astype('int')

    return tally

def country_year_list(df):
    """Build the option lists for the year and country select boxes.

    Returns a ``(years, country)`` tuple: the sorted distinct values of the
    ``Year`` column and the sorted distinct non-null values of the ``region``
    column, each preceded by the string 'Overall'.
    """
    distinct_years = sorted(df['Year'].unique().tolist())
    distinct_regions = sorted(df['region'].dropna().unique().tolist())

    years = ['Overall'] + distinct_years
    country = ['Overall'] + distinct_regions
    return years, country

def data_over_time(df, col):
    """Count the distinct values of ``col`` seen in each ``Year``.

    Returns a DataFrame with two columns, ``Edition`` (the year, ascending)
    and ``col`` (the number of distinct ``col`` values in that year).
    """
    # groupby().size() is used instead of value_counts().reset_index()
    # because the column names produced by the latter differ between pandas
    # major versions.
    per_edition = (
        df.drop_duplicates(['Year', col])
        .groupby('Year')
        .size()
        .reset_index(name=col)
        .rename(columns={'Year': 'Edition'})
    )
    return per_edition

def most_successful(df, sport):
    """Return the 15 names with the most medal rows, optionally for one sport.

    Parameters
    ----------
    df : DataFrame with at least Name, Sport, region and Medal columns.
    sport : 'Overall' for every sport, otherwise a value of the Sport column.

    Returns
    -------
    DataFrame with columns Name, Medals, Sport, region, ordered by Medals
    descending. Sport and region come from the athlete's first medal row
    within the selected sport.
    """
    medalists = df.dropna(subset=['Medal'])
    if sport != 'Overall':
        medalists = medalists[medalists['Sport'] == sport]

    # Name the columns explicitly: the default column names produced by
    # value_counts().reset_index() differ between pandas major versions.
    top = (
        medalists['Name']
        .value_counts()
        .head(15)
        .rename_axis('Name')
        .reset_index(name='Medals')
    )

    # One descriptive row per name, taken from the filtered medal rows so the
    # Sport shown always matches the sport being viewed.
    details = medalists.drop_duplicates('Name')[['Name', 'Sport', 'region']]

    return top.merge(details, on='Name', how='left')[['Name', 'Medals', 'Sport', 'region']]

def yearwise_medal_tally(df, country):
    """Return the number of medals won by ``country`` in each year.

    Rows without a medal are discarded and rows identical on
    Team/NOC/Games/Year/City/Sport/Event/Medal are collapsed to one before
    counting. The result has the columns ``Year`` and ``Medal``.
    """
    dedupe_keys = ['Team', 'NOC', 'Games', 'Year', 'City', 'Sport', 'Event', 'Medal']
    medals = df.dropna(subset=['Medal']).drop_duplicates(subset=dedupe_keys)

    country_medals = medals[medals['region'] == country]
    return country_medals.groupby('Year')['Medal'].count().reset_index()


def country_event_heatmap(df, country):
    """Return a Sport x Year table of medal counts for ``country``.

    Rows without a medal are discarded and rows identical on
    Team/NOC/Games/Year/City/Sport/Event/Medal are collapsed to one before
    counting. Sport/year combinations without medals are filled with 0.
    """
    dedupe_keys = ['Team', 'NOC', 'Games', 'Year', 'City', 'Sport', 'Event', 'Medal']
    medals = df.dropna(subset=['Medal']).drop_duplicates(subset=dedupe_keys)

    country_medals = medals[medals['region'] == country]
    counts = country_medals.pivot_table(
        index='Sport', columns='Year', values='Medal', aggfunc='count'
    )
    return counts.fillna(0)


def most_successful_countrywise(df, country):
    """Return the 10 names with the most medal rows for ``country``.

    Parameters
    ----------
    df : DataFrame with at least Name, Sport, region and Medal columns.
    country : a value of the region column.

    Returns
    -------
    DataFrame with columns Name, Medals, Sport, ordered by Medals
    descending. Sport comes from the athlete's first medal row for that
    country.
    """
    medalists = df.dropna(subset=['Medal'])
    medalists = medalists[medalists['region'] == country]

    # Name the columns explicitly: the default column names produced by
    # value_counts().reset_index() differ between pandas major versions.
    top = (
        medalists['Name']
        .value_counts()
        .head(10)
        .rename_axis('Name')
        .reset_index(name='Medals')
    )

    # One descriptive row per name, taken from this country's medal rows.
    details = medalists.drop_duplicates('Name')[['Name', 'Sport']]

    return top.merge(details, on='Name', how='left')[['Name', 'Medals', 'Sport']]

def weight_v_height(df, sport):
    """Return one row per athlete for the height/weight scatter plot.

    Parameters
    ----------
    df : DataFrame with at least Name, region, Sport and Medal columns.
    sport : 'Overall' for every sport (surrounding whitespace is ignored),
        otherwise a value of the Sport column.

    Returns
    -------
    DataFrame de-duplicated on (Name, region) with missing ``Medal`` values
    replaced by 'No Medal'. ``df`` itself is not modified.
    """
    # Work on an explicit copy so the assignment below never touches ``df``.
    athlete_df = df.drop_duplicates(subset=['Name', 'region']).copy()
    athlete_df['Medal'] = athlete_df['Medal'].fillna('No Medal')

    # The select box offers the bare word 'Overall'; strip so the previously
    # accepted padded spelling keeps working as well.
    if str(sport).strip() == 'Overall':
        return athlete_df

    return athlete_df[athlete_df['Sport'] == sport]


def men_vs_women(df):
    """Count male and female athletes per year.

    Athletes are de-duplicated on (Name, region) first. The result has the
    columns ``Year``, ``Male`` and ``Female``; it contains the years that
    have at least one male row, and a year without female rows gets 0.
    """
    unique_athletes = df.drop_duplicates(subset=['Name', 'region'])

    def _names_per_year(sex_code, label):
        # Non-null Name count per Year for one sex, stored under ``label``.
        subset = unique_athletes[unique_athletes['Sex'] == sex_code]
        per_year = subset.groupby('Year')['Name'].count().reset_index()
        return per_year.rename(columns={'Name': label})

    male_counts = _names_per_year('M', 'Male')
    female_counts = _names_per_year('F', 'Female')

    # Left join keeps every year of the male table.
    combined = male_counts.merge(female_counts, on='Year', how='left')
    return combined.fillna(0)
17 changes: 17 additions & 0 deletions Data Analysis/olympics_Analysis/preprocessor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import pandas as pd
#df=pd.read_csv('athlete_events.csv')
#region_df=pd.read_csv('noc_regions.csv')

def preprocess(df, region_df):
    """Prepare the raw athlete data for the analysis pages.

    Keeps only rows whose ``Season`` is 'Summer', left-joins ``region_df`` on
    ``NOC``, removes fully duplicated rows and appends one indicator column
    per distinct ``Medal`` value.
    """
    summer_only = df[df['Season'] == 'Summer']

    # Left join so athletes whose NOC is missing from region_df are kept.
    with_regions = summer_only.merge(region_df, on='NOC', how='left')
    deduplicated = with_regions.drop_duplicates()

    # One-hot encode the medal column and attach it alongside the original data.
    medal_indicators = pd.get_dummies(deduplicated['Medal'])
    return pd.concat([deduplicated, medal_indicators], axis=1)
4 changes: 4 additions & 0 deletions Data Analysis/olympics_Analysis/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
streamlit
plotly
seaborn
matplotlib
9 changes: 9 additions & 0 deletions Data Analysis/olympics_Analysis/setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Write the Streamlit server configuration used at deploy time.
# $PORT is expanded by the shell when this script runs.
mkdir -p ~/.streamlit/

# A here-document is used instead of echo with "\n" escapes because not every
# shell's echo interprets backslash escapes (bash's builtin does not by default).
cat > ~/.streamlit/config.toml <<EOF
[server]
port = $PORT
enableCORS = false
headless = true

EOF
Loading