# -*- coding: utf-8 -*-
"""Video_Game_Sales_prediction.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1tc9I7bxLJWCAEnqyPVi1Y2nIRLz3hNxR
"""

from google.colab import files
uploaded = files.upload()
for fn in uploaded.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(uploaded[fn])))

"""# File Imports"""

# Import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings(action='ignore')

# Read the csv file
data = pd.read_csv("Train.csv")  # `data` avoids shadowing Python's built-in input()

# Print the first rows to get a feel for the dataset
data.head()

"""# Data cleaning"""

data.isnull().sum()
| 37 | +"""There are no null values in the dataset. So we can move to the next step of removing unnecessary columns. |
| 38 | +
|
| 39 | +From dataset, we can observe that except `id` column, all the other columns play a significant role in final sales of videogames. So it can be dropped. |
| 40 | +""" |

data = data.drop(columns=['ID'])
train, test = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)

"""# Descriptive Statistics"""

train.shape, test.shape

train.nunique()

# If you are seeing the output below for the first time, visit this link to
# understand what the values in each of these rows (mean, std, min, max) actually
# mean: https://www.w3resource.com/pandas/dataframe/dataframe-describe.php
train.describe()
| 56 | +"""From above table, my first insight is I can create bar charts of **console, year**, **category** and **ratings** columns easily. For other columns I might have to go for some other visual representation since the the number of unique values is high. |
| 57 | +
|
| 58 | +* From **SalesInMillions** column we can see that average |
| 59 | +sales have been around 2 million and max sales have reached a mark of about 84 million🤩 and min sales were around just 1500😔. |
| 60 | +* From **year** column we can see that data covers sales from the year 1997 to 2019 |
| 61 | +* **Critic Points** range from 0.5 to 23.25 while **user points** range from 0.0003 to 2.32. We might need to noramlise this values on same scale else critic points will have higher impact than user points on final prediction although in reality both of them should have equal importance. |
| 62 | +
|
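
"""As a minimal sketch of that normalisation (assuming the score columns are named `CRITICS_POINTS` and `USER_POINTS`, hypothetical spellings not confirmed above), min-max scaling with scikit-learn could look like this:"""

# Sketch only: bring both score columns onto a common 0-1 scale
from sklearn.preprocessing import MinMaxScaler

score_cols = ['CRITICS_POINTS', 'USER_POINTS']  # hypothetical column names
scaled = train.copy()  # work on a copy so the raw frame stays intact
scaler = MinMaxScaler()
scaled[score_cols] = scaler.fit_transform(scaled[score_cols])  # fit on train only
# The same fitted scaler would then transform the test split to avoid leakage:
# scaled_test = test.copy()
# scaled_test[score_cols] = scaler.transform(scaled_test[score_cols])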

"""# EDA

I am first opting for auto-EDA packages like pandas-profiling to generate visualisations and their corresponding reports.
"""

!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

from pandas_profiling import ProfileReport
report = ProfileReport(train, title="Report", html={'style': {'full_width': True}}, explorative=True, missing_diagrams={'bar': True})

report.to_notebook_iframe()

# Save the report to a file
report.to_file("pandas_profiling_report.html")
| 78 | +"""From the above reports we can gain following insights:- |
| 79 | +* Console column graph: |
| 80 | +<img src="https://res.cloudinary.com/dk22rcdch/image/upload/v1595439244/VideoGameDatasetAnalysisImages/Screenshot_2020-07-22_at_11.02.44_PM_nxz5cm.png" width=400> |
| 81 | +The sales of **PS2** were the highest in the data set |
| 82 | +
|
| 83 | +* Years Column graph: |
| 84 | +<img src="https://res.cloudinary.com/dk22rcdch/image/upload/v1595439371/VideoGameDatasetAnalysisImages/Screenshot_2020-07-22_at_11.05.51_PM_ycn3nl.png" width=400> |
| 85 | +The sales were highest between the period **2005-2010**. |
| 86 | +
|
| 87 | +* Game category column graph: |
| 88 | +<img src="https://res.cloudinary.com/dk22rcdch/image/upload/v1595439531/VideoGameDatasetAnalysisImages/Screenshot_2020-07-22_at_11.08.40_PM_ugwpdi.png" width=400> |
| 89 | + **Action** category games are most popular |
| 90 | +
|
| 91 | +Now let's compare individual columns with target(SalesInMillions) column to gain a few more insights into the data. |
| 92 | +""" |

# Total sales corresponding to each console
df = pd.DataFrame(train.groupby(['CONSOLE']).agg({'SalesInMillions': 'sum'}))

df.plot.bar(figsize=(12, 6))

"""**💡Insight**: From the above graph we can see that total sales were highest on the PS3 platform, followed by the Xbox 360."""

df = pd.DataFrame(train.groupby(['YEAR']).agg({'SalesInMillions': 'sum'}))

df.plot.bar(figsize=(12, 6))

"""**💡Insight**: From the above graph we can see that sales were highest in the year 2010."""

df = pd.DataFrame(train.groupby(['CATEGORY']).agg({'SalesInMillions': 'sum'}))

df.plot.bar(figsize=(12, 6))

"""**💡Insight**: From the above graph we can see that sales were highest for the action genre.

# Model training
"""

!pip install catboost

import catboost as cat

# Categorical columns that CatBoost will encode natively
cat_feat = ['CONSOLE', 'CATEGORY', 'PUBLISHER', 'RATING']
# Every column except the target is a feature (note: set() makes the order arbitrary)
features = list(set(train.columns) - set(['SalesInMillions']))
target = 'SalesInMillions'
model = cat.CatBoostRegressor(random_state=100, cat_features=cat_feat, verbose=0)
model.fit(train[features], train[target])
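
"""As a quick follow-up (not in the original run), CatBoost can report how much each feature contributed to the fitted model:"""

# Rank features by CatBoost's default importance measure
for name, score in sorted(zip(features, model.get_feature_importance()),
                          key=lambda pair: -pair[1]):
    print(f'{name}: {score:.2f}')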

"""# Model Accuracy"""

y_true = pd.DataFrame(data=test[target], columns=['SalesInMillions'])
test_temp = test.drop(columns=[target])

y_pred = model.predict(test_temp[features])

from sklearn.metrics import mean_squared_error
from math import sqrt

rmse = sqrt(mean_squared_error(y_true, y_pred))
print(rmse)
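
"""To put that RMSE in context, here is a naive baseline sketch (not part of the original notebook) that always predicts the training-set mean:"""

# Baseline: predict the mean training-set sales figure for every test row
baseline_pred = np.full(len(y_true), train[target].mean())
baseline_rmse = sqrt(mean_squared_error(y_true, baseline_pred))
print(baseline_rmse)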

import pickle

# Persist the trained model to disk
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))

# Reload the model and sanity-check a single prediction
loaded_model = pickle.load(open(filename, 'rb'))

test_temp[features].head(1)

loaded_model.predict(test_temp[features].head(1))