From 0db46b468b8e87caa8611d6e4a99ac3984932454 Mon Sep 17 00:00:00 2001
From: Raj Shekhar
Date: Sat, 12 Oct 2024 15:44:55 +0530
Subject: [PATCH] Successfully Added Olympics_analysis_web_App

---
 Data Analysis/olympics_Analysis/README.md     |   7 +
 Data Analysis/olympics_Analysis/app.py        | 202 ++++++++++++++++++
 Data Analysis/olympics_Analysis/helper.py     | 140 ++++++++++++
 .../olympics_Analysis/preprocessor.py         |  23 ++
 .../olympics_Analysis/requirements.txt        |   7 +
 Data Analysis/olympics_Analysis/setup.sh      |  10 +
 6 files changed, 389 insertions(+)
 create mode 100644 Data Analysis/olympics_Analysis/README.md
 create mode 100644 Data Analysis/olympics_Analysis/app.py
 create mode 100644 Data Analysis/olympics_Analysis/helper.py
 create mode 100644 Data Analysis/olympics_Analysis/preprocessor.py
 create mode 100644 Data Analysis/olympics_Analysis/requirements.txt
 create mode 100644 Data Analysis/olympics_Analysis/setup.sh

diff --git a/Data Analysis/olympics_Analysis/README.md b/Data Analysis/olympics_Analysis/README.md
new file mode 100644
index 000000000..09c7b14b2
--- /dev/null
+++ b/Data Analysis/olympics_Analysis/README.md
@@ -0,0 +1,7 @@
+# olympics-data-analysis-web-app
+
+A Streamlit web application for analysing the Olympics dataset.
+
+Dataset Link: https://www.kaggle.com/heesoo37/120-years-of-olympic-history-athletes-and-results
+
+Live Link: https://olympicdatanalysis.streamlit.app/
diff --git a/Data Analysis/olympics_Analysis/app.py b/Data Analysis/olympics_Analysis/app.py
new file mode 100644
index 000000000..6b19a3a31
--- /dev/null
+++ b/Data Analysis/olympics_Analysis/app.py
@@ -0,0 +1,202 @@
+"""Streamlit front end for exploring the Summer Olympics athlete dataset.
+
+Reads ``athlete_events.csv`` and ``noc_regions.csv`` from the working
+directory and renders the view selected in the sidebar.
+"""
+
+import matplotlib.pyplot as plt
+import pandas as pd
+import plotly.express as px
+import plotly.figure_factory as ff
+import seaborn as sns
+import streamlit as st
+
+import helper
+import preprocessor
+
+df = pd.read_csv('athlete_events.csv')
+region_df = pd.read_csv('noc_regions.csv')
+
+df = preprocessor.preprocess(df, region_df)
+
+user_menu = st.sidebar.radio(
+    'Select an Option',
+    ('Medal Tally', 'Overall Analysis', 'Country-wise Analysis', 'Athlete wise Analysis')
+)
+
+st.sidebar.title("Olympics Analysis")
+st.sidebar.image('https://e7.pngegg.com/pngimages/1020/402/png-clipart-2024-summer-olympics-brand-circle-area-olympic-rings-olympics-logo-text-sport.png')
+
+if user_menu == 'Medal Tally':
+    st.sidebar.header("Medal Tally")
+    years, country = helper.country_year_list(df)
+
+    selected_year = st.sidebar.selectbox("Select Year", years)
+    selected_country = st.sidebar.selectbox("Select Country", country)
+
+    medal_tally = helper.fetch_medal_tally(df, selected_year, selected_country)
+
+    # The four filter combinations are mutually exclusive, so a single
+    # if/elif chain renders exactly one title. Both select boxes use the
+    # capitalised 'Overall' sentinel.
+    if selected_year == 'Overall' and selected_country == 'Overall':
+        st.title("Overall Tally")
+    elif selected_country == 'Overall':
+        st.title("Medal Tally in " + str(selected_year) + " Olympics ")
+    elif selected_year == 'Overall':
+        st.title(selected_country + " Overall Performance ")
+    else:
+        st.title(selected_country + " Performance in " + str(selected_year) + " Olympics ")
+
+    st.table(medal_tally)
+
+if user_menu == 'Overall Analysis':
+    # NOTE(review): the "- 1" is carried over unchanged; presumably it
+    # discounts one edition that should not be counted -- confirm.
+    editions = df['Year'].unique().shape[0] - 1
+    cities = df['City'].unique().shape[0]
+    sports = df['Sport'].unique().shape[0]
+    events = df['Event'].unique().shape[0]
+    athletes = df['Name'].unique().shape[0]
+    nations = df['region'].unique().shape[0]
+
+    st.title(" Top Statistics")
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        st.header(" Edition ")
+        st.title(editions)
+
+    with col2:
+        st.header(" Hosts ")
+        st.title(cities)
+
+    with col3:
+        st.header(" Sports ")
+        st.title(sports)
+
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        st.header(" Events ")
+        st.title(events)
+
+    with col2:
+        st.header(" Nations ")
+        st.title(nations)
+
+    with col3:
+        st.header(" Athletes ")
+        st.title(athletes)
+
+    nations_over_time = helper.data_over_time(df, 'region')
+    fig = px.line(nations_over_time, x='Edition', y='region')
+    st.title(" Participating Nations over the years ")
+    st.plotly_chart(fig)
+
+    events_over_time = helper.data_over_time(df, 'Event')
+    fig = px.line(events_over_time, x='Edition', y='Event')
+    st.title(" Events over the years ")
+    st.plotly_chart(fig)
+
+    athlete_over_time = helper.data_over_time(df, 'Name')
+    fig = px.line(athlete_over_time, x='Edition', y='Name')
+    st.title(" Athletes over the years ")
+    st.plotly_chart(fig)
+
+    st.title(" No. of Events over time(Every Sport)")
+    fig, ax = plt.subplots(figsize=(20, 20))
+    x = df.drop_duplicates(['Year', 'Sport', 'Event'])
+    events_per_sport = x.pivot_table(
+        index='Sport', columns='Year', values='Event', aggfunc='count'
+    ).fillna(0).astype('int')
+    sns.heatmap(events_per_sport, annot=True, ax=ax)
+    st.pyplot(fig)
+
+    st.title("Most Successful Athletes")
+    sport_list = df['Sport'].unique().tolist()
+    sport_list.sort()
+    sport_list.insert(0, 'Overall')
+
+    selected_sport = st.selectbox("Select a Sport ", sport_list)
+    x = helper.most_successful(df, selected_sport)
+    st.table(x)
+
+if user_menu == 'Country-wise Analysis':
+    st.sidebar.title('Country-wise Analysis')
+
+    country_list = df['region'].dropna().unique().tolist()
+    country_list.sort()
+
+    selected_country = st.sidebar.selectbox('Select a Country', country_list)
+
+    country_df = helper.yearwise_medal_tally(df, selected_country)
+    fig = px.line(country_df, x='Year', y='Medal')
+    st.title(selected_country + " Medal Tally over the years ")
+    st.plotly_chart(fig)
+
+    st.title(selected_country + " excels in the following sports")
+    pt = helper.country_event_heatmap(df, selected_country)
+    if pt.empty:
+        # sns.heatmap cannot draw an empty table (country without medals).
+        st.write("No medals recorded for " + selected_country)
+    else:
+        fig, ax = plt.subplots(figsize=(20, 20))
+        sns.heatmap(pt, annot=True, ax=ax)
+        st.pyplot(fig)
+
+    st.title(" Top 10 athletes " + selected_country)
+    top10_df = helper.most_successful_countrywise(df, selected_country)
+    st.table(top10_df)
+
+if user_menu == 'Athlete wise Analysis':
+    athlete_df = df.drop_duplicates(subset=['Name', 'region'])
+
+    x1 = athlete_df['Age'].dropna()
+    x2 = athlete_df[athlete_df['Medal'] == 'Gold']['Age'].dropna()
+    x3 = athlete_df[athlete_df['Medal'] == 'Silver']['Age'].dropna()
+    x4 = athlete_df[athlete_df['Medal'] == 'Bronze']['Age'].dropna()
+
+    fig = ff.create_distplot(
+        [x1, x2, x3, x4],
+        ['Overall Age', 'Gold Medalist', 'Silver Medalist', 'Bronze Medalist'],
+        show_hist=False, show_rug=False
+    )
+    fig.update_layout(autosize=False, width=1000, height=600)
+    st.plotly_chart(fig)
+
+    x = []
+    name = []
+    famous_sports = ['Basketball', 'Judo', 'Football', 'Tug-Of-War', 'Athletics',
+                     'Swimming', 'Badminton', 'Sailing', 'Gymnastics',
+                     'Art Competitions', 'Handball', 'Weightlifting', 'Wrestling',
+                     'Water Polo', 'Hockey', 'Rowing', 'Fencing',
+                     'Shooting', 'Boxing', 'Taekwondo', 'Cycling', 'Diving', 'Canoeing',
+                     'Tennis', 'Golf', 'Softball', 'Archery',
+                     'Volleyball', 'Synchronized Swimming', 'Table Tennis', 'Baseball',
+                     'Rhythmic Gymnastics', 'Rugby Sevens',
+                     'Beach Volleyball', 'Triathlon', 'Rugby', 'Polo', 'Ice Hockey']
+    for sport in famous_sports:
+        temp_df = athlete_df[athlete_df['Sport'] == sport]
+        x.append(temp_df[temp_df['Medal'] == 'Gold']['Age'].dropna())
+        name.append(sport)
+
+    fig = ff.create_distplot(x, name, show_hist=False, show_rug=False)
+    fig.update_layout(autosize=False, width=1000, height=600)
+    st.title("Distribution of Age wrt Sports(Gold Medalist)")
+    st.plotly_chart(fig)
+
+    sport_list = df['Sport'].unique().tolist()
+    sport_list.sort()
+    sport_list.insert(0, 'Overall')
+
+    st.title("Height Vs Weight ")
+    selected_sport = st.selectbox(' Select a Sport ', sport_list)
+    temp_df = helper.weight_v_height(df, selected_sport)
+    fig, ax = plt.subplots()
+    sns.scatterplot(x='Weight', y='Height', data=temp_df, hue=temp_df['Medal'], style=temp_df['Sex'], s=60, ax=ax)
+    st.pyplot(fig)
+
+    st.title(' Men VS Women Participation over the years')
+    final = helper.men_vs_women(df)
+    fig = px.line(final, x='Year', y=["Male", "Female"])
+    fig.update_layout(autosize=False, width=1000, height=600)
+    st.plotly_chart(fig)
diff --git a/Data Analysis/olympics_Analysis/helper.py b/Data Analysis/olympics_Analysis/helper.py
new file mode 100644
index 000000000..920cca15c
--- /dev/null
+++ b/Data Analysis/olympics_Analysis/helper.py
@@ -0,0 +1,140 @@
+"""Aggregation helpers for the Olympics analysis Streamlit app."""
+
+import numpy as np
+
+# Rows are de-duplicated on these columns before medals are counted
+# (presumably so a team medal counts once, not once per team member).
+MEDAL_KEY = ['Team', 'NOC', 'Games', 'Year', 'City', 'Sport', 'Event', 'Medal']
+MEDAL_COLUMNS = ['Gold', 'Silver', 'Bronze']
+
+
+def _with_total(tally):
+    """Add a ``total`` column and cast all medal counts to ``int``."""
+    tally['total'] = tally['Gold'] + tally['Silver'] + tally['Bronze']
+    for column in MEDAL_COLUMNS + ['total']:
+        tally[column] = tally[column].astype('int')
+    return tally
+
+
+def _top_medalists(df, medal_rows, limit, columns):
+    """Return the ``limit`` most frequent names in ``medal_rows``.
+
+    ``columns`` are looked up in ``df`` using each athlete's first row there.
+    """
+    # Name the count column explicitly; the labels produced by
+    # ``value_counts().reset_index()`` differ between pandas versions.
+    top = medal_rows['Name'].value_counts().head(limit)
+    top = top.rename_axis('Name').reset_index(name='Medals')
+    merged = top.merge(df, on='Name', how='left')
+    return merged[['Name', 'Medals'] + columns].drop_duplicates('Name')
+
+
+def fetch_medal_tally(df, year, country):
+    """Return the medal table for the selected year/country filter.
+
+    ``year`` and ``country`` may each be ``'Overall'`` (no filter). When only
+    a country is selected the table has one row per year; otherwise it has
+    one row per region, sorted by gold medals in descending order.
+    """
+    temp_df = df.drop_duplicates(subset=MEDAL_KEY)
+    if year != 'Overall':
+        temp_df = temp_df[temp_df['Year'] == int(year)]
+    if country != 'Overall':
+        temp_df = temp_df[temp_df['region'] == country]
+
+    # Select the medal columns before summing so that non-numeric columns
+    # are never aggregated.
+    if year == 'Overall' and country != 'Overall':
+        x = temp_df.groupby('Year')[MEDAL_COLUMNS].sum().reset_index()
+    else:
+        x = temp_df.groupby('region')[MEDAL_COLUMNS].sum()
+        x = x.sort_values('Gold', ascending=False).reset_index()
+
+    return _with_total(x)
+
+
+def medal_tally(df):
+    """Return the all-time medal table per region, sorted by gold medals."""
+    return fetch_medal_tally(df, 'Overall', 'Overall')
+
+
+def country_year_list(df):
+    """Return ``(years, countries)`` option lists, each led by ``'Overall'``."""
+    years = sorted(df['Year'].unique().tolist())
+    years.insert(0, 'Overall')
+
+    country = sorted(np.unique(df['region'].dropna().values).tolist())
+    country.insert(0, 'Overall')
+
+    return years, country
+
+
+def data_over_time(df, col):
+    """Count the distinct ``col`` values recorded in each year.
+
+    Returns a frame with the columns ``Edition`` (the year) and ``col`` (the
+    count), ordered by year.
+    """
+    counts = df.drop_duplicates(['Year', col]).groupby('Year').size()
+    return counts.rename_axis('Edition').reset_index(name=col)
+
+
+def most_successful(df, sport):
+    """Return the 15 athletes with the most medals, optionally for one sport."""
+    temp_df = df.dropna(subset=['Medal'])
+    if sport != 'Overall':
+        temp_df = temp_df[temp_df['Sport'] == sport]
+    return _top_medalists(df, temp_df, 15, ['Sport', 'region'])
+
+
+def yearwise_medal_tally(df, country):
+    """Return the medal count of ``country`` per year (``Year``, ``Medal``)."""
+    temp_df = df.dropna(subset=['Medal']).drop_duplicates(subset=MEDAL_KEY)
+    new_df = temp_df[temp_df['region'] == country]
+    return new_df.groupby('Year').count()['Medal'].reset_index()
+
+
+def country_event_heatmap(df, country):
+    """Return a Sport x Year table of medal counts for ``country``."""
+    temp_df = df.dropna(subset=['Medal']).drop_duplicates(subset=MEDAL_KEY)
+    new_df = temp_df[temp_df['region'] == country]
+    pt = new_df.pivot_table(index='Sport', columns='Year', values='Medal', aggfunc='count')
+    return pt.fillna(0)
+
+
+def most_successful_countrywise(df, country):
+    """Return the 10 athletes of ``country`` with the most medals."""
+    temp_df = df.dropna(subset=['Medal'])
+    temp_df = temp_df[temp_df['region'] == country]
+    return _top_medalists(df, temp_df, 10, ['Sport'])
+
+
+def weight_v_height(df, sport):
+    """Return one row per athlete for the height/weight scatter plot.
+
+    Missing medals are labelled ``'No Medal'``. ``'Overall'`` keeps every
+    sport; any other value keeps only the rows of that sport.
+    """
+    athlete_df = df.drop_duplicates(subset=['Name', 'region']).copy()
+    athlete_df['Medal'] = athlete_df['Medal'].fillna('No Medal')
+    # Surrounding whitespace is ignored so the old ' Overall ' spelling works.
+    if str(sport).strip() != 'Overall':
+        return athlete_df[athlete_df['Sport'] == sport]
+    return athlete_df
+
+
+def men_vs_women(df):
+    """Return athlete counts per year in ``Male`` and ``Female`` columns.
+
+    Years without any female athletes get a ``Female`` count of 0.
+    """
+    athlete_df = df.drop_duplicates(subset=['Name', 'region'])
+
+    men = athlete_df[athlete_df['Sex'] == 'M'].groupby('Year').count()['Name'].reset_index()
+    women = athlete_df[athlete_df['Sex'] == 'F'].groupby('Year').count()['Name'].reset_index()
+
+    final = men.merge(women, on='Year', how='left')
+    final.rename(columns={'Name_x': 'Male', 'Name_y': 'Female'}, inplace=True)
+    final.fillna(0, inplace=True)
+
+    return final
diff --git a/Data Analysis/olympics_Analysis/preprocessor.py b/Data Analysis/olympics_Analysis/preprocessor.py
new file mode 100644
index 000000000..c662ab45c
--- /dev/null
+++ b/Data Analysis/olympics_Analysis/preprocessor.py
@@ -0,0 +1,23 @@
+"""Data preparation for the Olympics analysis app."""
+
+import pandas as pd
+
+
+def preprocess(df, region_df):
+    """Return the Summer-games rows of ``df`` joined with ``region_df``.
+
+    ``region_df`` is left-joined on ``NOC``, exact duplicate rows are
+    dropped, and one 0/1 indicator column per medal type (``Bronze``,
+    ``Gold``, ``Silver``) is appended.
+    """
+    # filtering for summer olympics
+    df = df[df['Season'] == 'Summer']
+    # merge with region_df
+    df = df.merge(region_df, on='NOC', how='left')
+    # dropping duplicates
+    df = df.drop_duplicates()
+    # One-hot encode the medals; reindexing guarantees that all three
+    # columns exist even if a medal type never occurs in the data.
+    medals = pd.get_dummies(df['Medal'], dtype=int)
+    medals = medals.reindex(columns=['Bronze', 'Gold', 'Silver'], fill_value=0)
+    return pd.concat([df, medals], axis=1)
diff --git a/Data Analysis/olympics_Analysis/requirements.txt b/Data Analysis/olympics_Analysis/requirements.txt
new file mode 100644
index 000000000..51afaed97
--- /dev/null
+++ b/Data Analysis/olympics_Analysis/requirements.txt
@@ -0,0 +1,7 @@
+streamlit
+pandas
+numpy
+plotly
+scipy
+seaborn
+matplotlib
diff --git a/Data Analysis/olympics_Analysis/setup.sh b/Data Analysis/olympics_Analysis/setup.sh
new file mode 100644
index 000000000..d39033d9e
--- /dev/null
+++ b/Data Analysis/olympics_Analysis/setup.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+# Write the Streamlit server config; $PORT must be set by the environment.
+mkdir -p ~/.streamlit/
+
+cat > ~/.streamlit/config.toml <<EOF
+[server]
+port = $PORT
+enableCORS = false
+headless = true
+EOF