stores.csv :
This file contains anonymized information about the 45 stores, indicating the type and size of store.
train.csv :
This is the historical training data, which covers 2010-02-05 to 2012-11-01. Within this file you will find the following fields:
Store - the store number
Dept - the department number
Date - the week
Weekly_Sales - sales for the given department in the given store
IsHoliday - whether the week is a special holiday week
test.csv :
This file is identical to train.csv, except we have withheld the weekly sales. You must predict the sales for each triplet of store, department, and date in this file.
features.csv :
This file contains additional data related to the store, department, and regional activity for the given dates. It contains the following fields:
Store - the store number.
Date - the week.
Temperature - average temperature in the region.
Fuel_Price - cost of fuel in the region.
MarkDown1-5 - anonymized data related to promotional markdowns that Walmart is running. MarkDown data is only available after Nov 2011, and is not available for all stores all the time. Any missing value is marked with an NA.
CPI - the consumer price index.
Unemployment - the unemployment rate.
IsHoliday - whether the week is a special holiday week
For convenience, the four holidays fall within the following weeks in the dataset (not all holidays are in the data):
Super Bowl: 12-Feb-10, 11-Feb-11, 10-Feb-12, 8-Feb-13
Labor Day: 10-Sep-10, 9-Sep-11, 7-Sep-12, 6-Sep-13
Thanksgiving: 26-Nov-10, 25-Nov-11, 23-Nov-12, 29-Nov-13
Christmas: 31-Dec-10, 30-Dec-11, 28-Dec-12, 27-Dec-13
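These dates can double as a lookup for feature engineering. A minimal sketch, built only from the dates listed above (holiday_weeks and date_to_holiday are illustrative helpers, not competition files):

import pandas as pd

# Holiday weeks listed above, keyed by holiday name (illustrative helper)
holiday_weeks = {
    'Super Bowl':   ['2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08'],
    'Labor Day':    ['2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06'],
    'Thanksgiving': ['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29'],
    'Christmas':    ['2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27'],
}
date_to_holiday = {pd.Timestamp(d): name
                   for name, dates in holiday_weeks.items() for d in dates}
# e.g. date_to_holiday.get(pd.Timestamp('2010-11-26')) -> 'Thanksgiving'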
Time-series forecasting and Regression
Holiday markdown events are included in the dataset. These markdowns are known to affect sales, but it is challenging to predict which departments are affected and the extent of the impact.
Submissions are evaluated on the weighted mean absolute error (WMAE):

$$ \mathrm{WMAE} = \frac{1}{\sum_{i} w_i} \sum_{i=1}^{n} w_i \left| y_i - \hat{y}_i \right| $$

where

$n$ is the number of rows
$\hat{y}_i$ is the predicted sales
$y_i$ is the actual sales
$w_i$ are weights: $w_i = 5$ if the week is a holiday week, 1 otherwise
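As a quick reference, a minimal NumPy sketch of this metric (wmae_np is an illustrative helper; the version actually used in the pipeline appears later):

import numpy as np

def wmae_np(y_true, y_pred, is_holiday):
    # Holiday weeks get weight 5, all other weeks weight 1
    w = np.where(is_holiday, 5, 1)
    return np.sum(w * np.abs(y_true - y_pred)) / np.sum(w)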
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from prettytable import PrettyTable
from xgboost import XGBRegressor
from tqdm import tqdm_notebook as tqdm
import warnings
warnings.filterwarnings('ignore')
features = pd.read_csv('features.csv')
stores = pd.read_csv('stores.csv')
df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
stores.head()
df.head()
features.head()
df_stores = pd.merge(df,stores,on='Store',how='left')
test_stores = pd.merge(test,stores,on='Store',how='left')
df_final = df_stores.merge(features,how='left')
test_final = test_stores.merge(features,how='left')
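Note: merge without an explicit key joins on all shared columns. Here features shares Store, Date and IsHoliday with the merged frames, so the calls above are equivalent to this explicit form:

# Equivalent explicit merges (same keys, same result)
df_final = df_stores.merge(features, on=['Store','Date','IsHoliday'], how='left')
test_final = test_stores.merge(features, on=['Store','Date','IsHoliday'], how='left')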
df_final.Date = pd.to_datetime(df_final.Date)
test_final.Date = pd.to_datetime(test_final.Date)
df_final.head()
test_final.head()
print('Basic description of the numerical features in the Train data :\n')
df_final.describe()
print('Basic description of the numerical features in the Test data :\n')
test_final.describe()
df_final.info()
print('Train data has {} observations and {} variables'.format(df_final.shape[0],df_final.shape[1]))
print('Test data has {} observations and {} variables'.format(test_final.shape[0],test_final.shape[1]))
print('Train data covers {} to {}'.format(df_final.Date.min(),df_final.Date.max()))
print('Test data covers {} to {}'.format(test_final.Date.min(),test_final.Date.max()))
df_final.Date.min(),df_final.Date.max()
test.Date.min(),test.Date.max(),test.shape
## Checking for duplicates
df_final.duplicated().any()
print('_'*40)
print('Number of missing values per variable :')
print('_'*40)
df_final.isnull().sum()
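The same check expressed as percentages makes the MarkDown gaps easier to compare (a one-line alternative):

# Percentage of missing values per column, largest first
(df_final.isnull().mean()*100).sort_values(ascending=False)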
print('Number of sales less than zero :',(df_final.Weekly_Sales<0).sum())
def cmpr(x,size=(12,4),spc=0.2):
    # Count plot of a categorical column next to a violin plot of
    # Weekly_Sales per category, ordered by total sales
    fig,ax = plt.subplots(1,2,figsize=size)
    plt.subplots_adjust(wspace=spc)
    sns.countplot(y=x,data=df_final,order=df_final[x].value_counts().index,ax=ax[0])
    i = df_final.groupby(x)['Weekly_Sales'].sum().sort_values()[::-1]
    sns.violinplot(x=x,y='Weekly_Sales',ax=ax[1],data=df_final,order=i.index);
    plt.grid()
def corr(col):
    ax = sns.heatmap(df_final[[col,'Weekly_Sales']].corr(),annot=True);
    plt.title('Correlation between %s and Weekly_Sales'%(col));
    plt.setp(ax.get_yticklabels(),rotation=0);
plt.figure(figsize=(6,6))
sns.kdeplot(df_final.Size,);
sns.boxplot(df_final.Size,);
sns.boxplot(x='Type',y='Size',data=df_final);
sns.jointplot(x='Size', y="Weekly_Sales", data=df_final ,kind='hex',height=7, ratio=3);
sns.jointplot(x='Size', y="Weekly_Sales", data=df_final[df_final.Weekly_Sales<60000] ,kind='hex',height=7, ratio=3);
plt.figure(figsize=(18,6));
sns.boxenplot(y='Weekly_Sales',x='Size',data=df_final,hue='Type');
corr('Size')
Observations :
df_final.Type.value_counts()
plt.figure(figsize=(6,6))
sns.countplot(y='Type',data=df_final,order=df_final['Type'].value_counts().index);
plt.figure(figsize=(6,6))
sns.countplot(y='Type',data=df_final,order=df_final['Type'].value_counts().index,hue='IsHoliday',);
#sns.countplot(y='Dept',data=df_final,order=df_final['Dept'].value_counts().index);
fig,ax = plt.subplots(2,1,figsize=(18,10))
#plt.figure(figsize=(20,5))
sns.lineplot(x='Date',y='Weekly_Sales',data=df_final,hue='Type',ax=ax[0]);
#plt.figure(figsize=(20,5))
ax[1].scatter(y=df_final[df_final.Type=='A'].Weekly_Sales,x=df_final[df_final.Type=='A'].Date,label='Type A');
ax[1].scatter(y=df_final[df_final.Type=='B'].Weekly_Sales,x=df_final[df_final.Type=='B'].Date,label='Type B');
ax[1].scatter(y=df_final[df_final.Type=='C'].Weekly_Sales,x=df_final[df_final.Type=='C'].Date,label='Type C');
plt.legend();
sns.boxenplot(x='Type',y='Weekly_Sales',data=df_final);
plt.figure(figsize=(15,5))
sns.kdeplot(df_final[df_final.Type=='A'].Weekly_Sales,label='Weekly_Sales A')
sns.kdeplot(df_final[df_final.Type=='B'].Weekly_Sales,label='Weekly_Sales B')
sns.kdeplot(df_final[df_final.Type=='C'].Weekly_Sales,label='Weekly_Sales C',color='r');
i=df_final.groupby('Type')['Weekly_Sales'].sum().sort_values()[::-1]
tmp=pd.DataFrame({'Type':i.index,'Total Sales':i.values})
tmp
Observations :
df_final.Store.nunique()
plt.figure(figsize=(15,12))
sns.countplot(y='Store',data=df_final,order=df_final['Store'].value_counts().index);
plt.figure(figsize=(15,12))
sns.countplot(y='Store',data=df_final,order=df_final['Store'].value_counts().index,hue='Type');
plt.figure(figsize=(18,6));
sns.boxenplot(y='Weekly_Sales',x='Store',data=df_final,hue='Type');
plt.figure(figsize=(18,12))
sns.lineplot(x='Date',y='Weekly_Sales',data=df_final,hue='Store');
i=df_final.groupby('Store')['Weekly_Sales'].sum().sort_values()[::-1]
tmp=pd.DataFrame({'Store':i.index,'Total Sales':i.values})
tmp['Type']= tmp.Store.map(dict(zip(df_final.Store,df_final.Type)))
tmp
Observations :
df_final.Dept.nunique()
plt.figure(figsize=(15,22))
sns.countplot(y='Dept',data=df_final,order=df_final['Dept'].value_counts().index);
plt.figure(figsize=(15,22))
sns.countplot(y='Dept',data=df_final,order=df_final['Dept'].value_counts().index,hue='Type');
#sns.countplot(y='Dept',data=df_final,order=df_final['Dept'].value_counts().index);
plt.figure(figsize=(18,6));
sns.boxenplot(y='Weekly_Sales',x='Dept',data=df_final,hue='Type');
plt.figure(figsize=(18,12))
sns.lineplot(x='Date',y='Weekly_Sales',data=df_final,hue='Dept');
i=df_final.groupby('Dept')['Weekly_Sales'].sum().sort_values()[::-1]
tmp=pd.DataFrame({'Dept':i.index,'Total Sales':i.values})
#tmp['Counts'] = tmp.Dept.map(dict(zip(df_final.Dept.value_counts().index,df_final.Dept.value_counts().values)))
tmp
Observations :
df_final.IsHoliday.value_counts()
plt.figure(figsize=(6,6))
sns.countplot(y='IsHoliday',data=df_final,order=df_final['IsHoliday'].value_counts().index);
plt.figure(figsize=(6,6))
sns.countplot(y='IsHoliday',data=df_final,order=df_final['IsHoliday'].value_counts().index,hue='Type');
#sns.countplot(y='Dept',data=df_final,order=df_final['Dept'].value_counts().index);
sns.lineplot(x='Date',y='Weekly_Sales',data=df_final,hue='IsHoliday');
sns.boxenplot(x='IsHoliday',y='Weekly_Sales',data=df_final);
df_final[df_final.IsHoliday==True].Date.unique()
tmp=pd.DataFrame({'y':df_final[df_final.IsHoliday==True].Weekly_Sales,'x':df_final[df_final.IsHoliday==True].Date})
plt.figure(figsize=(20,5))
plt.scatter(y=df_final[df_final.IsHoliday==False].Weekly_Sales,x=df_final[df_final.IsHoliday==False].Date);
plt.scatter(y=df_final[df_final.IsHoliday==True].Weekly_Sales,x=df_final[df_final.IsHoliday==True].Date);
plt.figure(figsize=(20,5))
plt.scatter(y=df_final[(df_final.IsHoliday==False)& (df_final.Type=='A')].Weekly_Sales,x=df_final[(df_final.IsHoliday==False) & (df_final.Type=='A')].Date,label='Not Holiday & A Store');
plt.scatter(y=df_final[(df_final.IsHoliday==False)& (df_final.Type=='B')].Weekly_Sales,x=df_final[(df_final.IsHoliday==False) & (df_final.Type=='B')].Date,label='Not Holiday & B Store');
plt.scatter(y=df_final[(df_final.IsHoliday==False)& (df_final.Type=='C')].Weekly_Sales,x=df_final[(df_final.IsHoliday==False) & (df_final.Type=='C')].Date,label='Not Holiday & C Store');
plt.scatter(y=df_final[(df_final.IsHoliday==True)& (df_final.Type=='A')].Weekly_Sales,x=df_final[(df_final.IsHoliday==True) & (df_final.Type=='A')].Date,label='Is Holiday & A Store');
plt.scatter(y=df_final[(df_final.IsHoliday==True)& (df_final.Type=='B')].Weekly_Sales,x=df_final[(df_final.IsHoliday==True) & (df_final.Type=='B')].Date,label='Is Holiday & B Store');
plt.scatter(y=df_final[(df_final.IsHoliday==True)& (df_final.Type=='C')].Weekly_Sales,x=df_final[(df_final.IsHoliday==True) & (df_final.Type=='C')].Date,label='Is Holiday & C Store');
plt.legend();
#plt.scatter(y=df_final[df_final.IsHoliday==True].Weekly_Sales,x=df_final[df_final.IsHoliday==True].Date);
i=df_final.groupby('IsHoliday')['Weekly_Sales'].sum().sort_values()[::-1]
tmp=pd.DataFrame({'IsHoliday':i.index,'Total Sales':i.values})
tmp
Observations :
plt.figure(figsize=(6,6))
sns.kdeplot(df_final.Temperature,shade=True);
sns.boxplot(df_final.Temperature,);
sns.boxplot(x='Type',y='Temperature',data=df_final);
sns.lineplot(x='Date', y="Temperature",data=df_final);
sns.jointplot(x='Temperature', y="Weekly_Sales", data=df_final ,kind='hex',height=7, ratio=3);
sns.jointplot(x='Temperature', y="Weekly_Sales", data=df_final[df_final.Weekly_Sales<60000] ,kind='hex',height=7, ratio=3);
sns.heatmap(df_final[['Temperature','Weekly_Sales']].corr(),annot=True);
Observations :
plt.figure(figsize=(6,6))
sns.kdeplot(df_final.Fuel_Price,shade=True);
#sns.kdeplot(df_final.Temperature,shade=True,cumulative=True,);
sns.boxplot(df_final.Fuel_Price,);
sns.jointplot(x='Fuel_Price', y="Weekly_Sales", data=df_final ,kind='hex',height=7, ratio=3);
sns.jointplot(x='Fuel_Price', y="Weekly_Sales", data=df_final[df_final.Weekly_Sales<60000] ,kind='hex',height=7, ratio=3);
sns.lineplot(x='Date', y="Fuel_Price",data=df_final);
sns.scatterplot(x='Fuel_Price', y="Weekly_Sales",data=df_final);
corr('Fuel_Price')
Observations :
plt.figure(figsize=(6,6))
sns.kdeplot(df_final.CPI,shade=True);
#sns.kdeplot(df_final.Temperature,shade=True,cumulative=True,);
sns.boxplot(df_final.CPI,);
sns.jointplot(x='CPI', y="Weekly_Sales", data=df_final ,kind='hex',height=7, ratio=3);
sns.jointplot(x='CPI', y="Weekly_Sales", data=df_final[df_final.Weekly_Sales<60000] ,kind='hex',height=7, ratio=3);
corr('CPI')
Observations :
plt.figure(figsize=(6,6))
sns.kdeplot(df_final.Unemployment,shade=True);
#sns.kdeplot(df_final.Temperature,shade=True,cumulative=True,);
sns.boxplot(df_final.Unemployment,);
sns.jointplot(x='Unemployment', y="Weekly_Sales", data=df_final ,kind='hex',height=7, ratio=3);
sns.jointplot(x='Unemployment', y="Weekly_Sales", data=df_final[df_final.Weekly_Sales<60000] ,kind='hex',height=7, ratio=3);
corr('Unemployment')
Observations :
df_final.MarkDown1.describe()
print('Percentage of missing values in MarkDown1 Variable is : %f '%(df_final.MarkDown1.isnull().sum()/len(df_final)*100))
plt.figure(figsize=(6,6))
sns.kdeplot(df_final.MarkDown1,shade=True);
sns.boxplot(df_final.MarkDown1,);
sns.jointplot(x='MarkDown1', y="Weekly_Sales", data=df_final ,kind='hex',height=7, ratio=3);
sns.jointplot(x='MarkDown1', y="Weekly_Sales", data=df_final[df_final.Weekly_Sales<60000] ,kind='hex',height=7, ratio=3);
corr('MarkDown1')
tmp = df_final[df_final.MarkDown1.notnull()]
tmp['Weekly_Sales'] = tmp.Weekly_Sales.map(lambda x: x if x>0 else 1)
tmp = pd.DataFrame({'x':np.log(tmp.MarkDown1),'y':np.log(tmp.Weekly_Sales)})
sns.heatmap(annot=True,data=tmp.corr());
Observations :
df_final.MarkDown2.describe()
print('Percentage of missing values in MarkDown2 Variable is : %f '%(df_final.MarkDown2.isnull().sum()/len(df_final)*100))
plt.figure(figsize=(6,6))
sns.kdeplot(df_final[df_final.MarkDown2.notnull()].MarkDown2,shade=True);
sns.boxplot(df_final.MarkDown2,);
sns.jointplot(x='MarkDown2', y="Weekly_Sales", data=df_final ,kind='hex',height=7, ratio=3);
sns.jointplot(x='MarkDown2', y="Weekly_Sales", data=df_final[df_final.Weekly_Sales<60000] ,kind='hex',height=7, ratio=3);
corr('MarkDown2')
tmp = df_final[df_final.MarkDown2.notnull()]
tmp['Weekly_Sales'] = tmp.Weekly_Sales.map(lambda x: x if x>0 else 1)
tmp = pd.DataFrame({'x':np.log(tmp.MarkDown2),'y':np.log(tmp.Weekly_Sales)})
sns.heatmap(annot=True,data=tmp.corr());
Observations :
df_final.MarkDown3.describe()
print('Percentage of missing values in MarkDown3 Variable is : %f '%(df_final.MarkDown3.isnull().sum()/len(df_final)*100))
plt.figure(figsize=(6,6))
sns.kdeplot(df_final[df_final.MarkDown3.notnull()].MarkDown3,shade=True);
sns.boxplot(df_final.MarkDown3,);
sns.jointplot(x='MarkDown3', y="Weekly_Sales", data=df_final ,kind='hex',height=7, ratio=3);
sns.jointplot(x='MarkDown3', y="Weekly_Sales", data=df_final[(df_final.Weekly_Sales<50000) & (df_final.MarkDown3<50)] ,kind='hex',height=7, ratio=3);
corr('MarkDown3')
tmp = df_final[df_final.MarkDown3.notnull()]
tmp['Weekly_Sales'] = tmp.Weekly_Sales.map(lambda x: x if x>0 else 1)
tmp = pd.DataFrame({'x':np.log(tmp.MarkDown3),'y':np.log(tmp.Weekly_Sales)})
sns.heatmap(annot=True,data=tmp.corr());
Observations :
df_final.MarkDown4.describe()
print('Percentage of missing values in MarkDown4 Variable is : %f '%(df_final.MarkDown4.isnull().sum()/len(df_final)*100))
plt.figure(figsize=(6,6))
sns.kdeplot(df_final[df_final.MarkDown4.notnull()].MarkDown4,shade=True);
sns.boxplot(df_final.MarkDown4,);
sns.jointplot(x='MarkDown4', y="Weekly_Sales", data=df_final ,kind='hex',height=7, ratio=3);
corr('MarkDown4')
tmp = df_final[df_final.MarkDown4.notnull()]
tmp['Weekly_Sales'] = tmp.Weekly_Sales.map(lambda x: x if x>0 else 1)
tmp = pd.DataFrame({'x':np.log(tmp.MarkDown4),'y':np.log(tmp.Weekly_Sales)})
sns.heatmap(annot=True,data=tmp.corr());
Observations :
df_final.MarkDown5.describe()
print('Percentage of missing values in MarkDown5 Variable is : %f '%(df_final.MarkDown5.isnull().sum()/len(df_final)*100))
plt.figure(figsize=(6,6))
sns.kdeplot(df_final[df_final.MarkDown5.notnull()].MarkDown5,shade=True);
sns.boxplot(df_final.MarkDown5,);
sns.jointplot(x='MarkDown5', y="Weekly_Sales", data=df_final ,kind='hex',height=7, ratio=3);
corr('MarkDown5')
tmp = df_final[df_final.MarkDown5.notnull()]
tmp['Weekly_Sales'] = tmp.Weekly_Sales.map(lambda x: x if x>0 else 1)
tmp = pd.DataFrame({'x':np.log(tmp.MarkDown5),'y':np.log(tmp.Weekly_Sales)})
sns.heatmap(annot=True,data=tmp.corr());
Observations :
fig,ax = plt.subplots(5,1,figsize=(18,40))
plt.subplots_adjust(wspace=0.3)
mrk = ['MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5']
for i,axis in enumerate(ax.ravel()):
    if i==5:
        fig.delaxes(axis)
        break
    sns.boxenplot(y=mrk[i],x='Dept',data=df_final,ax=axis,hue='IsHoliday');
For each department:
Markdowns vary even from one department to another.
fig,ax = plt.subplots(5,1,figsize=(15,40))
plt.subplots_adjust(wspace=0.3)
mrk = ['MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5']
for i,axis in enumerate(ax.ravel()):
    if i==5:
        fig.delaxes(axis)
        break
    sns.boxenplot(y=mrk[i],x='Store',data=df_final,ax=axis,hue='IsHoliday');
For each Store:
fig,ax = plt.subplots(2,3,figsize=(15,10))
plt.subplots_adjust(wspace=0.3)
mrk = ['MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5']
for i,axis in enumerate(ax.ravel()):
    if i==5:
        fig.delaxes(axis)   # 2x3 grid has one spare axis
        break
    sns.boxenplot(y=mrk[i],x='Type',data=df_final,ax=axis,hue='IsHoliday');
fig,ax = plt.subplots(1,3,figsize=(18,6),sharey=True,sharex=True)
sns.scatterplot(x=df_final.Date.dt.month,y='Weekly_Sales',data=df_final,hue='IsHoliday',ax=ax[0]);
ax[0].set_xlabel('Month');
sns.scatterplot(x=df_final[df_final.IsHoliday==False].Date.dt.month,y='Weekly_Sales',data=df_final,ax=ax[1]);
ax[1].set_xlabel('Month');
ax[1].set_title('Non Holiday Sales');
sns.scatterplot(x=df_final[df_final.IsHoliday==False].Date.dt.month,y='Weekly_Sales',data=df_final,hue='Type',ax=ax[2]);
ax[2].set_xlabel('Month');
ax[2].set_title('Non Holiday Sales');
fig,ax = plt.subplots(1,3,figsize=(18,6),sharey=True,sharex=True)
sns.scatterplot(x=df_final.Date.dt.week,y='Weekly_Sales',data=df_final,hue='IsHoliday',ax=ax[0]);
ax[0].set_xlabel('Week');
sns.scatterplot(x=df_final[df_final.IsHoliday==False].Date.dt.week,y='Weekly_Sales',data=df_final,ax=ax[1]);
ax[1].set_xlabel('Week');
ax[1].set_title('Non Holiday Sales');
sns.scatterplot(x=df_final[df_final.IsHoliday==False].Date.dt.week,y='Weekly_Sales',data=df_final,hue='Type',ax=ax[2]);
ax[2].set_xlabel('Week');
ax[2].set_title('Non Holiday Sales');
fig,ax = plt.subplots(1,3,figsize=(18,6),sharey=True,sharex=True)
sns.scatterplot(x=df_final.Date.dt.day,y='Weekly_Sales',data=df_final,hue='IsHoliday',ax=ax[0]);
ax[0].set_xlabel('Date of a month');
sns.scatterplot(x=df_final[df_final.IsHoliday==False].Date.dt.day,y='Weekly_Sales',data=df_final,ax=ax[1]);
ax[1].set_xlabel('Date of a month');
ax[1].set_title('Non Holiday Sales');
sns.scatterplot(x=df_final[df_final.IsHoliday==False].Date.dt.day,y='Weekly_Sales',data=df_final,hue='Type',ax=ax[2]);
ax[2].set_xlabel('Date of a month');
ax[2].set_title('Non Holiday Sales');
plt.figure(figsize=(12,8))
sns.heatmap(df_final.corr(),annot=True,);
temp = df_final.copy()
nn_hldy_sls_mean = temp[temp.IsHoliday==False].Weekly_Sales.mean()
## effect of holiday on store
hldy_store_sls = df_final[df_final.IsHoliday==True].groupby('Store').Weekly_Sales.mean()
nn_hldy_store_sls = df_final[df_final.IsHoliday==False].groupby('Store').Weekly_Sales.mean()
#hldy_effect_stores = (hldy_store_sls-nn_hldy_store_sls)/nn_hldy_store_sls
hldy_effect_stores = hldy_store_sls/nn_hldy_sls_mean #nn_hldy_store_sls
## effect of holiday on Dept
hldy_dept_sls = df_final[df_final.IsHoliday==True].groupby('Dept').Weekly_Sales.mean()
nn_hldy_dept_sls = df_final[df_final.IsHoliday==False].groupby('Dept').Weekly_Sales.mean()
#hldy_effect_dept = (hldy_dept_sls-nn_hldy_dept_sls)/nn_hldy_dept_sls
hldy_effect_dept = hldy_dept_sls/nn_hldy_sls_mean #_hldy_dept_sls
temp['sales'] = temp.Weekly_Sales/nn_hldy_sls_mean
temp['hldy_effect_stores'] = temp.Store.map(dict(zip(hldy_effect_stores.index,hldy_effect_stores.values)))
temp['hldy_effect_dept'] = temp.Dept.map(dict(zip(hldy_effect_dept.index,hldy_effect_dept.values)))
## effect of holiday and size on Dept
hldy_size_A_sales = df_final[(df_final.Type=='A') & (df_final.IsHoliday==True)].groupby('Dept').Weekly_Sales.mean()
hldy_size_B_sales = df_final[(df_final.Type=='B') & (df_final.IsHoliday==True)].groupby('Dept').Weekly_Sales.mean()
hldy_size_C_sales = df_final[(df_final.Type=='C') & (df_final.IsHoliday==True)].groupby('Dept').Weekly_Sales.mean()
nn_hldy_size_A_sales = df_final[(df_final.Type=='A') & (df_final.IsHoliday==False)].groupby('Dept').Weekly_Sales.mean()
nn_hldy_size_B_sales = df_final[(df_final.Type=='B') & (df_final.IsHoliday==False)].groupby('Dept').Weekly_Sales.mean()
nn_hldy_size_C_sales = df_final[(df_final.Type=='C') & (df_final.IsHoliday==False)].groupby('Dept').Weekly_Sales.mean()
#hldy_nd_sze_A_effect_dept = (hldy_size_A_sales-nn_hldy_size_A_sales)/nn_hldy_size_A_sales
hldy_nd_sze_A_effect_dept = hldy_size_A_sales/nn_hldy_sls_mean #nn_hldy_size_A_sales
hldy_nd_sze_A_effect_dept = dict(zip(hldy_nd_sze_A_effect_dept.index,hldy_nd_sze_A_effect_dept.values))
#hldy_nd_sze_B_effect_dept = (hldy_size_B_sales-nn_hldy_size_B_sales)/nn_hldy_size_B_sales
hldy_nd_sze_B_effect_dept = hldy_size_B_sales/nn_hldy_sls_mean #nn_hldy_size_B_sales
hldy_nd_sze_B_effect_dept = dict(zip(hldy_nd_sze_B_effect_dept.index,hldy_nd_sze_B_effect_dept.values))
#hldy_nd_sze_C_effect_dept = (hldy_size_C_sales-nn_hldy_size_C_sales)/nn_hldy_size_C_sales
hldy_nd_sze_C_effect_dept = hldy_size_C_sales/nn_hldy_sls_mean #nn_hldy_size_C_sales
hldy_nd_sze_C_effect_dept = dict(zip(hldy_nd_sze_C_effect_dept.index,hldy_nd_sze_C_effect_dept.values))
lst=[]
for i,j in temp[['Type','Dept']].values:
    try:
        if i=='A':
            lst.append(hldy_nd_sze_A_effect_dept[j])
        if i=='B':
            lst.append(hldy_nd_sze_B_effect_dept[j])
        if i=='C':
            lst.append(hldy_nd_sze_C_effect_dept[j])
    except:
        lst.append(0)
temp['hldy_nd_sze_effect_dept'] = lst
plt.figure(figsize=(6,6))
sns.heatmap(temp[['sales','hldy_effect_stores', 'hldy_effect_dept', 'hldy_nd_sze_effect_dept']].corr(),annot=True);
fig,ax = plt.subplots(3,3,figsize=(18,15))
ax = ax.ravel()
for axis in ax:
    store = np.random.choice(df_final.Store.unique())
    dept = np.random.choice(df_final.Dept.unique())
    sns.lineplot(x='Date',y='Weekly_Sales',data=df_final[(df_final.Store==store) & (df_final.Dept==dept)],
                 hue='IsHoliday',ax=axis);
    axis.set_title('Store :%d, Dept :%d'%(store,dept))
train_unsplt=df_final[df_final.Date >= df_final.Date.min()+pd.DateOffset(months=12)]
train,test = train_test_split(train_unsplt,test_size=0.2,random_state=5)
train,cv = train_test_split(train,test_size=0.2,random_state=5)
train.shape,cv.shape,test.shape
# Lookup table: 'store_dept_year_week' -> Weekly_Sales,
# used to build a last-year-same-week feature
dct={}
for store,dept,year,week,sales in zip(df_final.Store,df_final.Dept,df_final.Date.dt.year,df_final.Date.dt.week,df_final.Weekly_Sales):
    dct['_'.join(list(map(str,[store,dept,year,week])))] = sales
lstyr=[]
for store,dept,year,week in zip(test_final.Store,test_final.Dept,test_final.Date.dt.year,test_final.Date.dt.week):
    try:
        lstyr.append(dct['_'.join(list(map(str,[store,dept,year-1,week])))])
    except:
        lstyr.append(0)
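The lookup key joins store, department, year and week with underscores, so last year's sales for the same week is a direct dictionary hit. For example (hypothetical values):

# e.g. store 1, dept 1, week 5 of 2011 looks up key '1_1_2010_5'
key = '_'.join(map(str, [1, 1, 2011-1, 5]))
dct.get(key, 0)  # falls back to 0 when that store/dept/week is absent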
## effect of holiday on store
nn_hldy_sls_mean = train[train.IsHoliday==False].Weekly_Sales.mean()
hldy_store_sls = train[train.IsHoliday==True].groupby('Store').Weekly_Sales.mean()
nn_hldy_store_sls = train[train.IsHoliday==False].groupby('Store').Weekly_Sales.mean()
#hldy_effect_stores = (hldy_store_sls-nn_hldy_store_sls)/nn_hldy_store_sls
hldy_effect_stores = hldy_store_sls/nn_hldy_sls_mean #nn_hldy_store_sls
## effect of holiday on Dept
hldy_dept_sls = train[train.IsHoliday==True].groupby('Dept').Weekly_Sales.mean()
nn_hldy_dept_sls = train[train.IsHoliday==False].groupby('Dept').Weekly_Sales.mean()
#hldy_effect_dept = (hldy_dept_sls-nn_hldy_dept_sls)/nn_hldy_dept_sls
hldy_effect_dept = hldy_dept_sls/nn_hldy_sls_mean #_hldy_dept_sls
## effect of holiday and size on Dept
hldy_size_A_sales = train[(train.Type=='A') & (train.IsHoliday==True)].groupby('Dept').Weekly_Sales.mean()
hldy_size_B_sales = train[(train.Type=='B') & (train.IsHoliday==True)].groupby('Dept').Weekly_Sales.mean()
hldy_size_C_sales = train[(train.Type=='C') & (train.IsHoliday==True)].groupby('Dept').Weekly_Sales.mean()
nn_hldy_size_A_sales = train[(train.Type=='A') & (train.IsHoliday==False)].groupby('Dept').Weekly_Sales.mean()
nn_hldy_size_B_sales = train[(train.Type=='B') & (train.IsHoliday==False)].groupby('Dept').Weekly_Sales.mean()
nn_hldy_size_C_sales = train[(train.Type=='C') & (train.IsHoliday==False)].groupby('Dept').Weekly_Sales.mean()
#hldy_nd_sze_A_effect_dept = (hldy_size_A_sales-nn_hldy_size_A_sales)/nn_hldy_size_A_sales
hldy_nd_sze_A_effect_dept = hldy_size_A_sales/nn_hldy_sls_mean #nn_hldy_size_A_sales
hldy_nd_sze_A_effect_dept = dict(zip(hldy_nd_sze_A_effect_dept.index,hldy_nd_sze_A_effect_dept.values))
#hldy_nd_sze_B_effect_dept = (hldy_size_B_sales-nn_hldy_size_B_sales)/nn_hldy_size_B_sales
hldy_nd_sze_B_effect_dept = hldy_size_B_sales/nn_hldy_sls_mean #nn_hldy_size_B_sales
hldy_nd_sze_B_effect_dept = dict(zip(hldy_nd_sze_B_effect_dept.index,hldy_nd_sze_B_effect_dept.values))
#hldy_nd_sze_C_effect_dept = (hldy_size_C_sales-nn_hldy_size_C_sales)/nn_hldy_size_C_sales
hldy_nd_sze_C_effect_dept = hldy_size_C_sales/nn_hldy_sls_mean #nn_hldy_size_C_sales
hldy_nd_sze_C_effect_dept = dict(zip(hldy_nd_sze_C_effect_dept.index,hldy_nd_sze_C_effect_dept.values))
def hldy_sze_effct(temp):
    lst=[]
    for i,j in temp[['Type','Dept']].values:
        try:
            if i=='A':
                lst.append(hldy_nd_sze_A_effect_dept[j])
            if i=='B':
                lst.append(hldy_nd_sze_B_effect_dept[j])
            if i=='C':
                lst.append(hldy_nd_sze_C_effect_dept[j])
        except:
            lst.append(1)
    return lst
mrk1 = train.groupby(['Type','IsHoliday']).MarkDown1.mean().unstack()
mrk2 = train.groupby(['Type','IsHoliday']).MarkDown2.mean().unstack()
mrk3 = train.groupby(['Type','IsHoliday']).MarkDown3.mean().unstack()
mrk4 = train.groupby(['Type','IsHoliday']).MarkDown4.mean().unstack()
mrk5 = train.groupby(['Type','IsHoliday']).MarkDown5.mean().unstack()
def fillna(data,col,mrk):
    # Fill missing markdown values with the (Type, IsHoliday) group mean
    lst=[]
    for tpe,hldy,mark in data[['Type','IsHoliday',col]].values:
        if mark != mark:  # NaN is the only value not equal to itself
            lst.append(mrk.loc[tpe,hldy])
        else:
            lst.append(mark)
    return lst
def last_year_sales(data):
    lstyr=[]
    for store,dept,year,week in zip(data.Store,data.Dept,data.Date.dt.year,data.Date.dt.week):
        try:
            lstyr.append(dct['_'.join(list(map(str,[store,dept,year-1,week])))])
        except:
            lstyr.append(0)
    return np.array(lstyr)/nn_hldy_sls_mean
def preprocess(data,tst=False):
    data = data.copy()
    # Stores smaller than 50,000 sq ft behave like type C regardless of label
    data.loc[(data.Type!='C') & (data.Size<50000), ['Type']] = 'C'
    if not tst: data['sales'] = data.Weekly_Sales/nn_hldy_sls_mean
    if tst:
        data['CPI'].fillna(data.CPI.mean(),inplace=True)
        data['Unemployment'].fillna(data.Unemployment.mean(),inplace=True)
    data['week'] = data.Date.dt.week
    data['month'] = data.Date.dt.month
    data['hldy_effect_stores'] = data.Store.map(dict(zip(hldy_effect_stores.index,hldy_effect_stores.values)))
    data['hldy_effect_dept'] = data.Dept.map(dict(zip(hldy_effect_dept.index,hldy_effect_dept.values)))  # keyed by Dept
    data['hldy_nd_sze_effect_dept'] = hldy_sze_effct(data)
    data['last_year_sales'] = last_year_sales(data)
    data.fillna(1,inplace=True)
    data = pd.concat([data,pd.get_dummies(data.Store,prefix='Store'),
                      pd.get_dummies(data.Dept,prefix='Dept'),
                      pd.get_dummies(data.IsHoliday,prefix='IsHoliday'),
                      pd.get_dummies(data.Type,prefix='Type'),
                      pd.get_dummies(data.week,prefix='week'),
                      pd.get_dummies(data.month,prefix='month')],axis=1)
    #data['MarkDown1'] = fillna(data,'MarkDown1',mrk1)
    #data['MarkDown2'] = fillna(data,'MarkDown2',mrk2)
    #data['MarkDown3'] = fillna(data,'MarkDown3',mrk3)
    data['MarkDown4'] = fillna(data,'MarkDown4',mrk4)
    #data['MarkDown5'] = fillna(data,'MarkDown5',mrk5)
    data.drop(['Date','Store','Dept','IsHoliday','Type','week','month','MarkDown1','MarkDown2','MarkDown3','MarkDown5'],axis=1,inplace=True)
    if not tst: data = data.sample(frac=1).reset_index(drop=True)
    return data
train_unsplt_prcssd = preprocess(train_unsplt)
train_prcssd = preprocess(train)
cv_prcssd = preprocess(cv).align(train_prcssd,axis=1,join='right')[0].fillna(0)
test_prcssd = preprocess(test).align(train_prcssd,axis=1,join='right')[0].fillna(0)
final_test_procssd = preprocess(test_final,tst=True).align(train_prcssd,axis=1,join='right')[0].fillna(0)
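The align(..., axis=1, join='right') calls reindex each frame to exactly the training columns, so dummy columns unseen in a split are added and zero-filled while extras are dropped. A minimal illustration of the idiom on toy frames:

# Toy illustration of column alignment (not project data)
a = pd.DataFrame({'x':[1],'Dept_1':[1]})      # reference (train) columns
b = pd.DataFrame({'x':[2],'Dept_2':[1]})      # has a column train lacks
b.align(a, axis=1, join='right')[0].fillna(0) # columns become: x, Dept_1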
list(train_prcssd.columns)
train_prcssd.shape,cv_prcssd.shape,test_prcssd.shape,final_test_procssd.shape
x_train_ttl = train_unsplt_prcssd.drop(['Weekly_Sales','sales'],axis=1)
x_train = train_prcssd.drop(['Weekly_Sales','sales'],axis=1)
x_cv = cv_prcssd.drop(['Weekly_Sales','sales'],axis=1)
x_test = test_prcssd.drop(['Weekly_Sales','sales'],axis=1)
x_final_test = final_test_procssd.drop(['Weekly_Sales','sales'],axis=1)
y_train_ttl = train_unsplt_prcssd.sales.values
y_train = train_prcssd.sales.values
y_cv = cv_prcssd.sales.values
y_test = test_prcssd.sales.values
x_train.shape,x_cv.shape,x_test.shape,x_final_test.shape,y_train.shape,y_cv.shape,y_test.shape
train_unsplt_w8s = train_unsplt_prcssd.IsHoliday_True.map({0:1,1:5})
train_w8s = x_train.IsHoliday_True.map({0:1,1:5})
cv_w8s = x_cv.IsHoliday_True.map({0:1,1:5})
test_w8s = x_test.IsHoliday_True.map({0:1,1:5})
test_final_w8s = x_final_test.IsHoliday_True.map({0:1,1:5})
def performance(lr,model='',nrm=0,dl=0):
    print('Performance of %s Model\n'%(model))
    print('*'*60,'\n')
    if nrm:
        # Scale with a single scaler fit on the training features only
        scaler = MinMaxScaler().fit(x_train)
        trn,cv,tst = scaler.transform(x_train),scaler.transform(x_cv),scaler.transform(x_test)
    else:
        trn,cv,tst = x_train,x_cv,x_test
    # Rescale targets and predictions back to dollars before scoring
    ytrn,ycv,ytst = y_train*nn_hldy_sls_mean, y_cv*nn_hldy_sls_mean, y_test*nn_hldy_sls_mean
    if not dl: yptr,ypcv,yp = lr.predict(trn)*nn_hldy_sls_mean, lr.predict(cv)*nn_hldy_sls_mean, lr.predict(tst)*nn_hldy_sls_mean
    if dl: yptr,ypcv,yp = lr.predict(trn).ravel()*nn_hldy_sls_mean, lr.predict(cv).ravel()*nn_hldy_sls_mean, lr.predict(tst).ravel()*nn_hldy_sls_mean
    r2_trn,r2_cv,r2_tst = r2_score(ytrn,yptr),r2_score(ycv,ypcv),r2_score(ytst,yp)
    rmse_trn,rmse_cv,rmse_tst = wmae(ytrn,yptr,train_w8s,mltply=0),wmae(ycv,ypcv,cv_w8s,mltply=0),wmae(ytst,yp,test_w8s,mltply=0)
    table = PrettyTable()
    table.field_names = ['Data','WMAE','R^2']
    for i,j,k in zip(['Train','CV','Test'],[rmse_trn,rmse_cv,rmse_tst],[r2_trn,r2_cv,r2_tst]):
        table.add_row([i,j,k])
    print(table)
    print('*'*90,'\n')
    fig,ax = plt.subplots(2,2,figsize=(15,10))
    i,j = 0,0
    sns.regplot(ytst,yp,ax=ax[i,j],color='m');
    ax[i,j].set_xlabel('True Values')
    ax[i,j].set_ylabel('Predicted Values')
    ax[i,j].set_title('Prediction Plot')
    i,j = 0,1
    sns.scatterplot(x_test.reset_index().index,yp[np.argsort(ytst)],label='Predicted values',ax=ax[i,j]);
    sns.scatterplot(x_test.reset_index().index,np.sort(ytst),label='True Values',ax=ax[i,j]);
    i,j = 1,0
    pc = [p*5 for p in range(21)]   # percentiles 0, 5, ..., 100
    abs_err = abs(ytst-yp)
    prcntl = np.percentile(abs_err,pc)
    sns.scatterplot(pc,prcntl,color='r',ax=ax[i,j])
    ax[i,j].plot(pc,prcntl);
    ax[i,j].set_xlabel('Percentile of Absolute errors')
    ax[i,j].set_ylabel('Absolute errors')
    ax[i,j].set_title('Percentiles of Absolute errors')
    plt.sca(ax[i,j])
    plt.xticks(pc)
    plt.setp(ax[i,j].get_xticklabels(),rotation=90);
    i,j = 1,1
    sns.kdeplot(ytst,ax=ax[i,j],label='True values')
    sns.kdeplot(yp,ax=ax[i,j],label='Predicted values')
    ax[i,j].set_title('Distribution of true and predicted sales')
    return r2_trn,r2_tst,rmse_trn,rmse_tst
def feature_imp(cl):
    # Plot the 30 most important features of a fitted tree ensemble
    importances = cl.feature_importances_
    indices = np.argsort(importances)[::-1]
    names = [x_train.columns[i] for i in indices]
    plt.figure(figsize=(10,7))
    plt.barh(range(30,0,-1),importances[indices][:30],height=0.8,tick_label=names[:30])
    plt.show()
def wmae(ytrue,ypred,w8s,mltply=1):
    if type(ypred) == list: ypred = np.array(ypred)
    if mltply: ytrue,ypred = ytrue*nn_hldy_sls_mean,ypred*nn_hldy_sls_mean
    return np.average(np.abs(ypred - ytrue),weights=w8s)
r2_trn,r2_tst,rmse_trn,rmse_tst = np.zeros((4,10))
mn = y_train.mean()
mn
ytrn_p = [mn for i in range(len(y_train))]
y_tst_p = [mn for i in range(len(y_test))]
i=0
r2_trn[i],r2_tst[i],rmse_trn[i],rmse_tst[i] = r2_score(y_train*nn_hldy_sls_mean,ytrn_p),r2_score(y_test*nn_hldy_sls_mean,y_tst_p),wmae(y_train,ytrn_p,train_w8s),wmae(y_test,y_tst_p,test_w8s)
print('WMAE for mean prediction on Train data :',wmae(y_train*nn_hldy_sls_mean,[mn*nn_hldy_sls_mean for i in range(len(y_train))],train_w8s,mltply=0))
print('WMAE for mean prediction on CV data :',wmae(y_cv*nn_hldy_sls_mean,[mn*nn_hldy_sls_mean for i in range(len(y_cv))],cv_w8s,mltply=0))
print('WMAE for mean prediction on Test data :',wmae(y_test*nn_hldy_sls_mean,[mn*nn_hldy_sls_mean for i in range(len(y_test))],test_w8s,mltply=0))
lr = LinearRegression(n_jobs=-1).fit(x_train,y_train,sample_weight=train_w8s)
i=1
r2_trn[i],r2_tst[i],rmse_trn[i],rmse_tst[i] = performance(lr,model='Linear Regression')
chk=np.inf
params={}
for n_estimators in [50,100]:
    for max_depth in [3,7,15,None]:
        for min_samples_split in [2,5,15]:
            for min_samples_leaf in [5,10,50,100]:
                clf = RandomForestRegressor(n_estimators=n_estimators,min_samples_leaf=min_samples_leaf,n_jobs=-1,
                                            min_samples_split=min_samples_split,max_depth=max_depth,random_state=5)
                clf.fit(x_train,y_train,sample_weight=train_w8s)
                err = wmae(y_cv,clf.predict(x_cv),cv_w8s,mltply=1)
                if err < chk:
                    params = {'n_estimators':n_estimators,'min_samples_leaf':min_samples_leaf,'min_samples_split':min_samples_split,'max_depth':max_depth}
                    print(params)
                    print('R^2 score on Train is %f and on Validation is %f'%(clf.score(x_train,y_train),clf.score(x_cv,y_cv)))
                    print('WMAE on Train is %f and on Validation is %f'%(wmae(y_train,clf.predict(x_train),train_w8s,mltply=1),wmae(y_cv,clf.predict(x_cv),cv_w8s,mltply=1)))
                    print('='*80)
                    chk=err
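The nested loops above are a manual grid search; sklearn's ParameterGrid expresses the same sweep more compactly. A sketch over the identical grid:

from sklearn.model_selection import ParameterGrid

# Same sweep as the nested loops above, via ParameterGrid
grid = {'n_estimators':[50,100], 'max_depth':[3,7,15,None],
        'min_samples_split':[2,5,15], 'min_samples_leaf':[5,10,50,100]}
chk, params = np.inf, {}
for p in ParameterGrid(grid):
    clf = RandomForestRegressor(n_jobs=-1, random_state=5, **p)
    clf.fit(x_train, y_train, sample_weight=train_w8s)
    err = wmae(y_cv, clf.predict(x_cv), cv_w8s, mltply=1)
    if err < chk:
        chk, params = err, p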
params
clf = RandomForestRegressor(n_estimators=100, min_samples_leaf=5 ,n_jobs=-1,
min_samples_split=2, max_depth=None, random_state=5)
clf.fit(x_train,y_train,sample_weight=train_w8s)
feature_imp(clf)
i=2
r2_trn[i],r2_tst[i],rmse_trn[i],rmse_tst[i] = performance(clf,model='Random Forest Regressor')
chk=np.inf
trn_err=[]
tst_err=[]
params={}
for n_estimators in [100,250,500]:
    for max_depth in [2,3,5,7,15]:
        for learning_rate in [0.08,0.1,0.12]:
            for reg_alpha in [0.0,0.005,0.001]:
                for min_child_weight in [1,3,5]:
                    clf = XGBRegressor(n_estimators=n_estimators,learning_rate=learning_rate,reg_alpha=reg_alpha,n_jobs=8,tree_method='gpu_hist',
                                       objective='reg:squarederror',max_depth=max_depth,random_state=5,min_child_weight=min_child_weight)
                    clf.fit(x_train,y_train,sample_weight=train_w8s)
                    err = wmae(y_cv,clf.predict(x_cv),cv_w8s,mltply=1)
                    err_trn = wmae(y_train,clf.predict(x_train),train_w8s,mltply=1)
                    trn_err.append(err_trn)
                    tst_err.append(err)
                    if err < chk:
                        params = {'n_estimators':n_estimators,'learning_rate':learning_rate,'reg_alpha':reg_alpha,'max_depth':max_depth,'min_child_weight':min_child_weight}
                        print(params)
                        print('R^2 score on Train is %f and on Validation is %f'%(clf.score(x_train,y_train),clf.score(x_cv,y_cv)))
                        print('WMAE on Train is %f and on Validation is %f'%(err_trn,err))
                        print('='*80)
                        chk=err
params
clf_xgb = XGBRegressor( n_estimators=500,learning_rate=0.12,reg_alpha=0.001,n_jobs=-1,nthread=-1,tree_method='gpu_hist',
objective='reg:squarederror',max_depth=15,random_state=5,min_child_weight=5)
clf_xgb.fit(x_train,y_train,sample_weight=train_w8s)
feature_imp(clf_xgb)
i=3
r2_trn[i],r2_tst[i],rmse_trn[i],rmse_tst[i] = performance(clf_xgb,model='XGBRegressor')
# Fit the scaler on the training features only, then apply it to CV/test
scaler = MinMaxScaler().fit(x_train)
x_train_nrm = scaler.transform(x_train)
x_cv_nrm = scaler.transform(x_cv)
x_test_nrm = scaler.transform(x_test)
def r2score(y_true, y_pred):
    # R^2 in Keras backend ops, mirroring sklearn's r2_score:
    # 1 - (residual sum of squares / total sum of squares)
    u = K.sum(K.square(y_true - y_pred))
    v = K.sum(K.square(y_true - K.mean(y_true)))
    return K.ones_like(v) - (u / v)
import tensorflow as tf
import keras.backend as K
from keras.layers import Dense,Input
from keras.models import Sequential
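The r2score metric above mirrors sklearn's r2_score in backend ops so it can be tracked during training. A quick sanity check, assuming TF2 eager execution (where backend tensors evaluate immediately):

# Sanity check of the custom metric against sklearn (TF2 eager assumed)
yt = np.array([1.0, 2.0, 3.0], dtype='float32')
yp = np.array([1.1, 1.9, 3.2], dtype='float32')
print(float(r2score(K.constant(yt), K.constant(yp))), r2_score(yt, yp))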
model = Sequential()
model.add(Dense(256,activation='relu',input_dim=x_train_nrm.shape[1]))
model.add(Dense(512,activation='relu'))
model.add(Dense(1024,activation='relu'))
model.add(Dense(1024,activation='relu'))
model.add(Dense(512,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dense(64,activation='relu'))
model.add(Dense(1,kernel_regularizer='l2'))
model.compile('adam',loss='mse',metrics=['mse',r2score])
model.fit(x_train_nrm,y_train,batch_size=256,epochs=20,validation_data=(x_cv_nrm,y_cv))
#model.save('model1.h5')
model = tf.keras.models.load_model('model1.h5',custom_objects={'r2score':r2score})
i=4
r2_trn[i],r2_tst[i],rmse_trn[i],rmse_tst[i] = performance(model,model='Neural Network',nrm=1,dl=1)
class ensmble:
    # Average the predictions of several models; the Keras model expects
    # min-max scaled inputs, while the tree/linear models take raw features.
    def __init__(self,mdls):
        self.models = mdls
    def predict(self,x):
        yp = np.zeros(len(x))
        for mdl in self.models:
            if isinstance(mdl,tf.python.keras.engine.sequential.Sequential):
                yp += mdl.predict(MinMaxScaler().fit_transform(x)).ravel()
            else:
                yp += mdl.predict(x)
        return yp/len(self.models)
## Re-fitting the individual models on the training split for the ensemble
lr = LinearRegression(n_jobs=-1).fit(x_train,y_train,sample_weight=train_w8s)
clf = RandomForestRegressor(n_estimators=100, min_samples_leaf=5 ,n_jobs=-1,
min_samples_split=2, max_depth=None, random_state=5).fit(x_train,y_train,sample_weight=train_w8s)
clf_xgb = XGBRegressor( n_estimators=500,learning_rate=0.12,reg_alpha=0.001,n_jobs=-1,nthread=-1,tree_method='gpu_hist',
objective='reg:squarederror',max_depth=15,random_state=5,min_child_weight=5).fit(x_train,y_train,sample_weight=train_w8s)
ensmbl = ensmble([clf,clf_xgb,model])
i=5
r2_trn[i],r2_tst[i],rmse_trn[i],rmse_tst[i] = performance(ensmbl,model='Ensemble Model')
'''##Training all models on Total training data
lr = LinearRegression(n_jobs=-1).fit(x_train_ttl,y_train_ttl,sample_weight=train_unsplt_w8s)
clf = RandomForestRegressor(n_estimators=100, min_samples_leaf=5 ,n_jobs=-1,
                            min_samples_split=2, max_depth=None, random_state=5).fit(x_train_ttl,y_train_ttl,sample_weight=train_unsplt_w8s)
clf_xgb = XGBRegressor( n_estimators=500,learning_rate=0.12,reg_alpha=0.001,n_jobs=-1,nthread=-1,tree_method='gpu_hist',
                        objective='reg:squarederror',max_depth=15,random_state=5,min_child_weight=5).fit(x_train_ttl,y_train_ttl,sample_weight=train_unsplt_w8s)
ypred_rf = clf.predict(x_final_test)
ypred_xgb = clf_xgb.predict(x_final_test)
y_lst_yr = np.array(lstyr)/nn_hldy_sls_mean
y_nn = model.predict(MinMaxScaler().fit_transform(x_final_test)).ravel()
##Averaging all model predictions
yf = (ypred_xgb+ypred_rf+y_lst_yr+y_nn)/4
sbmssn = pd.read_csv('sampleSubmission.csv')
sbmssn['Weekly_Sales'] = yf * nn_hldy_sls_mean
sbmssn.to_csv('final_avg_rf_xgb.csv',index=False)'''
total_err_nn = np.abs(model.predict(x_test_nrm).ravel()-y_test).sum()*nn_hldy_sls_mean
total_err_lr = np.abs(lr.predict(x_test)-y_test).sum()*nn_hldy_sls_mean
total_err_rf = np.abs(clf.predict(x_test)-y_test).sum()*nn_hldy_sls_mean
total_err_xgb = np.abs(clf_xgb.predict(x_test)-y_test).sum()*nn_hldy_sls_mean
total_err_ensmbl = np.abs(ensmbl.predict(x_test)-y_test).sum()*nn_hldy_sls_mean
table = PrettyTable()
table.field_names = ['Model','Train R^2 Score','Test R^2 Score','Train WMAE','Test WMAE']
for i,clssfr in enumerate(['Simple Mean Prediction','Linear Regression','Random Forest Regressor','XGBRegressor','Neural Network','Ensemble (Average)']):
table.add_row([clssfr,r2_trn[i],r2_tst[i],rmse_trn[i],rmse_tst[i]])
print(table)
print('Total sales of test data : ',np.sum(y_test)*nn_hldy_sls_mean/1000_000,'M$')
table = PrettyTable()
table.field_names = ['Model','Sales mis predicted (*1M $)']
for clssfr,err in zip(['Linear Regression','Random Forest Regressor','XGBRegressor','Neural Network','Ensemble (Average)'],
[total_err_lr/1000_000 ,total_err_rf/1000_000 ,total_err_xgb/1000_000 ,total_err_nn/1000_000,total_err_ensmbl/1000_000]):
table.add_row([clssfr,err])
print(table)