EDA with Walmart Sales Data

Business Objectives

Walmart, the retail giant that operates a chain of hypermarkets, wants to understand its weekly sales data, especially the impact of holidays and big events on weekly sales; specifically, the Super Bowl, Labor Day, Thanksgiving, and Christmas. In addition, Walmart wants to consider the effect of different macroeconomic/external factors.

Learning Objectives

At the end of this session, you will know how to

  1. Manipulate data of different types using pandas
  2. Visualize data with matplotlib and seaborn to extract insights
  3. Build a pipeline to preprocess data and fit a simple model using sklearn

Note: if you see code that's unfamiliar to you, look up the documentation and try to understand what it does.

Data Overview

Task I: Load Data

Built on top of numpy, pandas is one of the essential tools in machine learning. Its rich feature set is used for exploring, cleaning, visualizing, and transforming data. We need to import the library to access these features.

Use pd.read_csv to read train_comb.csv that contains weekly sales, metadata, and macroeconomic features from three stores into a pd.DataFrame.
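A minimal sketch of this step, assuming train_comb.csv sits in the working directory and the DataFrame is named data, as in the checks below:

```
import pandas as pd

# Load weekly sales, metadata, and macroeconomic features for three stores
data = pd.read_csv("train_comb.csv")
```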

Verify that the data is loaded correctly by running data.head(3) to see the first few rows ( AVOID printing out the entire DataFrame, i.e., data or print(data); it may seem harmless for a small dataset, but it can crash your kernel when the dataset is big and slow down the initial data exploration ).

Look at the output to get an idea of what each column is and then write a few sentences describing what you notice about the data. You can also use data.sample(3) to draw random samples from the data (hints: number of rows and columns, any missing values? data types of the elements? date ranges of the data collected? etc.).

  1. MarkDown1 to MarkDown5 all contain null values; possibly most of these values are null.
  2. IsHoliday is a binary value.
  3. CPI values have only subtle differences; they may need normalization.

Acceptable responses include the number of rows and columns in the dataset, the data types of the elements, how many NaNs there are (and perhaps which columns and/or rows tend to have them), the range of values in each column or other descriptive statistics, some commentary on what this data represents, any initial concerns about how you think we should model this data, or any other commentary you would like to add.

Use .shape to inspect the size of the data: sample size and number of features.

Expected Output (30990, 16)

For the following task, we focus on Store 1 only.

Retrieve the data from department 9 ( a random choice ) at store 1:
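One way to do the filtering, assuming the combined DataFrame is called data and using a hypothetical name store1_dept9 for the subset:

```
# Boolean masks select Store 1 and Department 9 only
store1_dept9 = data[(data["Store"] == 1) & (data["Dept"] == 9)]

store1_dept9.head()
store1_dept9.shape   # expected: (143, 16)
```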

Verify the result using .head(), .shape.

Expected Output (143, 16)

Visualize one full year of sales. The data comes with dates sorted, but we can make sure of that and then visualize the first 52 data points.
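A sketch of the sort-and-plot step, continuing with the hypothetical store1_dept9 subset from above:

```
import matplotlib.pyplot as plt

# Ensure chronological order, then take the first 52 weeks (one year of weekly data)
store1_dept9 = store1_dept9.sort_values("Date")
first_year = store1_dept9.head(52)

plt.figure(figsize=(10, 4))
plt.plot(first_year["Weekly_Sales"].values)
plt.xlabel("Week index")
plt.ylabel("Weekly_Sales")
plt.show()
```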

Do you have any hypotheses about the holidays' impact on the sales?

Sales spiked around holidays, e.g., Halloween, Christmas, and New Year.

For the rest of the notebook, we focus on the sales data from Store 1, which is stored in the DataFrame df and saved in train_store1.csv. Let's read in the data.

Extract week, month, and year information from the raw Date column to better manipulate the weekly data later. Pandas comes with powerful features to make this step easy. Reference: tutorial.

First, use .dtypes to check the datatype of the Date column. What's the difference between df[['Date']] and df['Date']?

Expected Output
```
Date    object
dtype: object
```
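A sketch of the conversion and extraction, assuming the Store 1 data is in df as described above (whether week follows the ISO calendar depends on the original notebook):

```
# Convert the Date strings to datetime, then derive week, month, and year
df["Date"] = pd.to_datetime(df["Date"])

df["week"] = df["Date"].dt.isocalendar().week.astype(int)
df["month"] = df["Date"].dt.month
df["year"] = df["Date"].dt.year
```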

Verify that the Date column's datatype has changed as expected:

Verify that now there are 19 columns in df:

The last step before we look deeper into the features is to split the dataset into training and testing sets. Discuss: why do we want to perform EDA only on the training data, not the entire dataset? Shouldn't more data be better?

  1. To prevent data leakage. If we inspect the test data as well, the decisions we make while modeling will be influenced by it, so the test data effectively becomes part of the training data and no longer gives an unbiased estimate of performance.

The answer should mention data leakage and/or overfitting.

Split the data into training (80%) and test dataset (20%). Use function train_test_split from scikit-learn ( a popular library for machine learning in Python ), and set random_state to be 42 for reproducibility ( this is not the best way to do train-test-split due to the temporal nature of the data, however, we will ignore it for now ).
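A minimal sketch of the split, using the df_train / df_test names referenced later:

```
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed for reproducibility
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

print("Original set --->", df.shape)
print("Training set --->", df_train.shape)
print("Testing set  --->", df_test.shape)
```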

Expected Output
```
Original set ---> (10244, 19)
Training set ---> (8195, 19)
Testing set  ---> (2049, 19)
```

Task II: Target, Features, and Distributions

We inspected the datatype of the column Date; now find the datatypes of all columns in df_train using .dtypes:

Expected Output
```
Store                    int64
Dept                     int64
Date            datetime64[ns]
Weekly_Sales           float64
IsHoliday                 bool
Temperature            float64
Fuel_Price             float64
MarkDown1              float64
MarkDown2              float64
MarkDown3              float64
MarkDown4              float64
MarkDown5              float64
CPI                    float64
Unemployment           float64
Type                    object
Size                     int64
week                     int64
month                    int64
year                     int64
dtype: object
```

Summary statistics provide you with a general understanding of the data. Use the method .describe(). By default it reports the count, mean, standard deviation, min, quartiles, and max for numerical features; for categorical features (pass include='object' or include='all') it reports the count, number of unique values, the mode, and its frequency.

Expected Output
```
        Store      Dept  Weekly_Sales  Temperature  Fuel_Price  MarkDown1  MarkDown2  MarkDown3  MarkDown4  MarkDown5       CPI  Unemployment        Size      week     month      year
count  8,195.00  8,195.00      8,195.00     8,195.00    8,195.00   2,931.00   2,424.00   2,878.00   2,931.00   2,931.00  8,195.00      8,195.00    8,195.00  8,195.00  8,195.00  8,195.00
mean       1.00     44.65     21,865.28        68.19        3.22   8,045.43   2,961.55   1,236.83   3,683.59   5,023.69    216.00          7.61  151,315.00     25.89      6.47  2,010.96
std        0.00     29.95     27,970.00        14.16        0.43   6,484.49   8,032.30   7,830.99   5,849.69   3,303.07      4.33          0.38        0.00     14.19      3.25      0.80
min        1.00      1.00       -863.00        35.40        2.51     410.31       0.50       0.25       8.00     554.92    210.34          6.57  151,315.00      1.00      1.00  2,010.00
25%        1.00     20.00      3,502.09        57.79        2.76   4,039.39      40.48       6.00     577.14   3,127.88    211.57          7.35  151,315.00     14.00      4.00  2,010.00
50%        1.00     38.00     10,357.32        69.64        3.29   6,154.14     137.86      30.23   1,822.55   4,325.19    215.46          7.79  151,315.00     26.00      6.00  2,011.00
75%        1.00     72.00     31,647.36        80.48        3.59  10,121.97   1,569.00     101.64   3,639.42   6,222.25    220.64          7.84  151,315.00     38.00      9.00  2,012.00
max        1.00     99.00    203,670.47        91.65        3.91  34,577.06  46,011.38  55,805.51  32,403.87  20,475.32    223.44          8.11  151,315.00     52.00     12.00  2,012.0
```

Inspect the output, what are some of your observations?

  1. About two thirds of the values in MarkDown1 to MarkDown5 are missing.
  2. Weekly sales fluctuate widely.
  3. Weekly sales contain negative values.
  4. The Date column is absent from the output (by default .describe() only summarizes numeric columns).
  5. The data range from 2010 to 2012.

Are there any missing values? Use .isna() and .sum() to show the number of missing values from each column.
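One way to count them:

```
# Number of missing values in each column of the training data
df_train.isna().sum()
```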

Expected Output
```
Store              0
Dept               0
Date               0
Weekly_Sales       0
IsHoliday          0
Temperature        0
Fuel_Price         0
MarkDown1       5264
MarkDown2       5771
MarkDown3       5317
MarkDown4       5264
MarkDown5       5264
CPI                0
Unemployment       0
Type               0
Size               0
week               0
month              0
year               0
dtype: int64
```

What do you think the target variable is in this problem? Assign the column name to target for later use.

Visualize the distribution of the target variable using distplot() from the seaborn library ( Why seaborn? Check out a comparison between Matplotlib and Seaborn here ). Is there anything you observe here that the output from .describe does not make obvious? Does it follow a normal distribution?
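A sketch of the plot; note that distplot() is deprecated in recent seaborn releases, so histplot() with a KDE overlay is shown here as a stand-in:

```
import seaborn as sns
import matplotlib.pyplot as plt

target = "Weekly_Sales"   # the assumed target column

sns.histplot(df_train[target], kde=True)
plt.show()
```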

Notice that there exist nonpositive weekly sales. How many rows have weekly sales that are negative or zero?

What percentage of the data do these negative and zero sales make up?
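One way to answer both questions, assuming target is "Weekly_Sales":

```
# Rows with zero or negative weekly sales, as a count and as a share of the training data
nonpositive = df_train[df_train[target] <= 0]
print(len(nonpositive), "rows")
print(f"{len(nonpositive) / len(df_train):.2%} of the training data")
```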

After you communicate your findings, the stakeholders confirm that you can remove these data entries for now, and they launch an investigation by analysts and data engineers.

Now remove them from the training dataset.
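A minimal way to drop them:

```
# Keep only strictly positive weekly sales
df_train = df_train[df_train[target] > 0].copy()
```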

Let's move on to features.

Though almost all the features come as numerical, should they all be treated as numerical features? Let's inspect the number of unique values:
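For example:

```
# Number of distinct values per column; few unique values suggests a categorical feature
df_train.nunique()
```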

Temperature, CPI, Unemployment, and Fuel_Price are continuous. These tie to the second business objective, so let us put these four into a list and store it in external_factors. From earlier, we noticed that the MarkDown1-5 columns contain some missing values; we will treat them in a later task.

Visualize Temperature in a box plot. What do you think is the advantage of a box plot over a histogram? You can use pd.DataFrame.boxplot(), set the figure size to (6, 4), and turn off the grid.
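A sketch of the box plot:

```
# Box plot of Temperature: median, quartiles, and potential outliers at a glance
df_train.boxplot(column="Temperature", figsize=(6, 4), grid=False)
plt.show()
```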

Let's visualize all four numerical features in both density plots and box plots; note any observations.
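A possible loop over the four external factors:

```
# Density plot and box plot side by side for each external factor
for col in external_factors:
    fig, axes = plt.subplots(1, 2, figsize=(10, 3))
    sns.kdeplot(df_train[col], ax=axes[0])
    df_train.boxplot(column=col, grid=False, ax=axes[1])
    fig.suptitle(col)
    plt.show()
```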

We will investigate the impacts from external factors later. Now let's scan through the other features.

Store, Type, and Size each have only one unique value and offer no information, so we can safely ignore them.

We extracted year, month, and week from Date, so Date is redundant; but it is easy to find the date range of the training dataset using Date:
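For example:

```
# Earliest and latest dates in the training data
df_train["Date"].min(), df_train["Date"].max()
```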

Our training data ranges from 5th of February 2010 to 26th of October 2012.

It makes more sense to treat year, month, and week as categorical (more precisely, ordinal); the boolean feature IsHoliday can also be considered categorical, as can Dept. Let's put these column names into a list categoricalFeatures.

For the categorical features, we are more interested in the frequency of each value. Use pd.Series.value_counts to see how many rows have IsHoliday equal to true and false, respectively ( data imbalance is the norm ).
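For example:

```
# How many holiday vs. non-holiday weeks are in the training data
df_train["IsHoliday"].value_counts()
```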

Expected Output
```
False    7586
True      596
Name: IsHoliday, dtype: int64
```

Visualize the distribution of month; use sns.countplot().
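A sketch of the count plot:

```
# Number of training rows per month
sns.countplot(x="month", data=df_train)
plt.show()
```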

Discuss with your teammate: there is less data in 2012 than in the previous two years. Did sales drop from previous years? Does it affect what we see in the plots for month and week? Does the plot below clarify it to some degree?

Task III: Impact from Holidays

The first business objective is to understand the impact of holidays on weekly sales.

There is a flag provided for us: IsHoliday. Let's calculate the average weekly sales for holiday weeks and non-holiday weeks, respectively. Use .groupby and .mean(). Are holiday sales higher?
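For example:

```
# Average weekly sales for non-holiday (False) and holiday (True) weeks
df_train.groupby("IsHoliday")["Weekly_Sales"].mean()
```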

Expected Output
```
IsHoliday
False    21,756.05
True     23,737.05
Name: Weekly_Sales, dtype: float64
```

But we would like to understand it at a more granular level (remember Simpson's paradox?). To save some time, the holiday date mappings have been identified for the training data.

We create one flag for each holiday to help you analyze weekly sales by holiday type, as sketched below.
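A hedged sketch of how such flags might be built; the dates below are placeholders (following the Kaggle holiday-week definition) and should be replaced by the date mapping provided in the notebook:

```
# Placeholder holiday-week dates; substitute the mapping provided for the training data
holiday_weeks = {
    "superbowl":    ["2010-02-12", "2011-02-11", "2012-02-10"],
    "laborday":     ["2010-09-10", "2011-09-09", "2012-09-07"],
    "thanksgiving": ["2010-11-26", "2011-11-25"],
    "christmas":    ["2010-12-31", "2011-12-30"],
}

# One boolean flag column per holiday
for name, dates in holiday_weeks.items():
    df_train[name] = df_train["Date"].isin(pd.to_datetime(dates))
```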

Run the next cell to see 1) how many weekly sales fell on Christmas (does it make sense? what did we not account for?) 2) what is the average weekly sales stratified by whether it is Christmas week or not?

Perform the same for the other three holidays:
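A sketch that repeats the Christmas summary for all four flags:

```
# Row count and average weekly sales stratified by each holiday flag
for flag in ["superbowl", "laborday", "thanksgiving", "christmas"]:
    summary = df_train.groupby(flag)["Weekly_Sales"].agg(
        count="count", avg_weekly_sales="mean")
    print(summary, "\n")
```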

Expected Output
```
              count  avg_weekly_sales
superbowl
False          8001         21,845.80
True            181         24,311.98

              count  avg_weekly_sales
laborday
False          8007         21,884.35
True            175         22,632.78

              count  avg_weekly_sales
thanksgiving
False          8067         21,813.97
True            115         27,959.84

              count  avg_weekly_sales
christmas
False          8057         21,921.06
True            125         20,565.56
```

Without hypothesis testing and by eyeballing alone, it seems like the Super Bowl and Thanksgiving have a positive impact on the weekly sales for Store 1 in this training dataset. Discuss with your teammate: are you surprised that sales at Walmart do not go up during Christmas? The holiday effect, if causal, is strongest during Thanksgiving weeks; is this something you expected?

We have been ignoring Dept; let's take a look at the plot below showing the weekly sales by department in 2011.

Dept 72 has unusually high weekly sales during the holiday week, but we will need more data to understand whether this is a data issue, an outlier, or a special event.

Task IV: Visualize Relationship between Macroeconomic & External Factors and Sales

By eyeballing, do you find strong evidence that these external factors are correlated with Walmart's weekly sales? Do you think a line plot is an appropriate choice for this?
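One alternative worth trying is a scatter plot of each external factor against the target, sketched here:

```
# Scatter plots often show feature-target relationships more directly than line plots
for col in external_factors:
    sns.scatterplot(x=col, y=target, data=df_train)
    plt.show()
```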

Lastly, we calculate the Spearman correlations among the target and the external factors and verify that there is no strong monotonic correlation between the target variable and these features.
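A sketch of the correlation matrix and heatmap:

```
# Spearman rank correlation matrix of the target and external factors
corr = df_train[[target] + external_factors].corr(method="spearman")

sns.heatmap(corr, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
plt.show()
```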

The heatmap provides insight into the correlations among these features. During feature selection, we can eliminate highly correlated features: in this case, the correlation coefficient between CPI and Fuel_Price is 0.7, and between Unemployment and CPI it is -0.6, so we can keep only one feature from each pair for training. Including highly correlated features is redundant.

Task V: Feature Engineering

"Feature Engineering encapsulates various data engineering techniques such as selecting relevant features, handling missing data, encoding the data, and normalizing it. It is one of the most crucial tasks and plays a major role in determining the outcome of a model." Ref.

One part of feature engineering is to create new features from the given data, as the thanksgiving column earlier was derived from Date. Common techniques for tabular data include adding summary statistics of the numerical features such as the mean and standard deviation, creating new features from interactions of multiple features, etc. In this task, however, we will work on handling missing data, normalizing numerical features, and encoding categorical features.

First, missing data. Missing value treatment is crucial, yet not trivial. Read Tackling Missing Value in Dataset for a detailed explanation. Features with nulls or wrong values (e.g., negative fuel price) need to be imputed or removed.

From earlier steps, we observed that only the markdown columns contain missing values, yet we do not have more information on what those values represent.

For each column, find out what percentage of the data is missing.
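One way to compute this:

```
# Share of missing values in each markdown column
markdown_cols = ["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"]
for col in markdown_cols:
    print(f"{col}: {df_train[col].isna().mean():.0%} is missing")
```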

Expected Output
```
MarkDown1: 64% is missing
MarkDown2: 70% is missing
MarkDown3: 65% is missing
MarkDown4: 64% is missing
MarkDown5: 64% is missing
```

The majority of the markdown fields are missing. This is where, again, we need to communicate with the stakeholders to understand what the data measures and how it was collected, and then determine our strategy from there. Here, for simplicity, we impute all missing values with the median of each column. Use .fillna() to impute the missing values.
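A minimal sketch of the imputation:

```
# Impute missing markdown values with each column's median (computed on the training data)
df_train[markdown_cols] = df_train[markdown_cols].fillna(df_train[markdown_cols].median())
```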

Visualize the distributions of those markdown fields after imputation. Are they normal? They are not normal distributions.

What distribution do these data follow? We can explore this with the fitter library.
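A sketch using the fitter package (assuming it is installed; API as documented by the project), shown here for MarkDown1:

```
from fitter import Fitter

# Compare a few candidate distributions for MarkDown1
f = Fitter(df_train["MarkDown1"].values,
           distributions=["norm", "lognorm", "expon", "gamma"])
f.fit()
f.summary()          # ranks the candidates by goodness of fit
print(f.get_best())  # parameters of the best-fitting distribution
```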

Note that missing values are different from outliers. Outliers, on the other hand, are feature values that are rare in nature. They can unnecessarily skew the data and cause problems for modeling. Outlier treatment involves removing or imputing such values. One popular approach to identifying outliers is the IQR rule; that is, data points that lie more than 1.5 times the IQR above Q3 (third quartile) or below Q1 (first quartile) are flagged as outliers. Read Detecting and Treating Outliers. We will leave it as an optional exercise for you to identify outliers using the IQR rule and replace them with the median; a sketch follows below.
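A sketch of the IQR rule applied to the target:

```
# Flag Weekly_Sales values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
q1 = df_train["Weekly_Sales"].quantile(0.25)
q3 = df_train["Weekly_Sales"].quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr

is_outlier = (df_train["Weekly_Sales"] < lower) | (df_train["Weekly_Sales"] > upper)
filtered = df_train[~is_outlier]   # data without the flagged outliers
outliers = df_train[is_outlier]    # flagged rows to investigate further
print(is_outlier.sum(), "outliers flagged")
```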

We can check the distribution of the filtered data, which excludes the outliers.

Then we can check the distributions of the outliers; we need to investigate them against business goals or domain knowledge to understand them better.

Conclusion 1: outliers and departments. The IQR-based outlier analysis above tells us which departments have these weekly sales outliers; the data from departments 95, 92, 90, and 38 contribute to them.

Conclusion 2: outlier seasonality. There is no obvious seasonality in these outliers, and most of them come from non-holiday weeks. The counts show little variation from year to year.

Now let's see how we normalize the data. For numerical features, this means scaling the features to a similar range. This step is crucial for machine learning algorithms that calculate distances between data points (e.g., read The Importance of Feature Scaling).

For this task, of the external features, let's keep Temperature since it is the most linearly correlated with the target variable, though the correlation is very weak and negative ( feature selection ). In addition, we include one markdown field. Since neither seems to follow a normal distribution, it is safer to use MinMaxScaler from sklearn.preprocessing to transform the features by scaling each to a given range ( see the discussion on Normalization vs Standardization ).

Instantiate a MinMaxScaler and fit using df_train_num:

Now transform training data df_train_num and store the resulting nparray in train_norm:

Verify that both columns now have minimum 0 and maximum 1.
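A combined sketch of the fit, transform, and sanity check, using the df_train_num and train_norm names from the text:

```
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the numerical training features only
scaler = MinMaxScaler()
scaler.fit(df_train_num)

# Transform: each column is rescaled to the [0, 1] range
train_norm = scaler.transform(df_train_num)
print(train_norm.min(axis=0), train_norm.max(axis=0))   # expect all 0s and all 1s
```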

Let's turn to categorical features. Most, if not all, Python packages for modeling do not accept strings as input; thus encoding categorical values as numerical values is a necessary step. Here, let's apply one-hot encoding on Dept and IsHoliday:

Transform the categorical features using the one-hot encoder ohe.
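A sketch of the encoding step; train_cat is a hypothetical name for the result:

```
from sklearn.preprocessing import OneHotEncoder

# One-hot encode Dept and IsHoliday; ignore categories unseen during fit
ohe = OneHotEncoder(handle_unknown="ignore")
ohe.fit(df_train[["Dept", "IsHoliday"]])

# The encoder returns a sparse matrix; densify it for the hstack step below
train_cat = ohe.transform(df_train[["Dept", "IsHoliday"]]).toarray()
print(train_cat.shape)
```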

The number of columns explodes from 2 to 79.

Lastly, we merge the processed numerical features with the processed categorical features using numpy's hstack:
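A minimal sketch; train_processed is a hypothetical name for the merged array:

```
import numpy as np

# Place the scaled numerical features next to the one-hot encoded categorical features
train_processed = np.hstack([train_norm, train_cat])
print(train_processed.shape)
```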

What about the test data? Yes, you need to apply the same treatments. We will spare ourselves some copy-paste-edit and see how this can be done when we introduce pipelines next.

Task VI: Pipeline

Even with fewer than 20 features in our dataset, there are many possible ways to preprocess the data. There is no one-size-fits-all approach; often you will find yourself experimenting with many combinations to achieve better modeling performance: Should I apply normalization or standardization? Do I remove the outliers or should I impute them? Do I impute the missing values with the median, the mean, or 0? The answer to many of these questions is "It depends." (Have you heard of Graduate Student Descent?) That means trial and error, and it is not efficient to produce a new notebook every time you need to try something slightly different; you will get lost quickly. Pipeline is one useful tool here.

Not only does Pipeline help streamline the process and keep the code modular, it also reduces the possibility of introducing errors/bugs. In this task, we build the pipeline following the strategies used in the last task, run a simple linear regression model, and print out the model's performance. Note that there is minimal code for you to implement; the key is to understand each step.

To avoid confusion, let's read the data again directly from train_store1.csv.

Separating the target y from the features X:
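For example, assuming the re-read DataFrame is called df and the target is Weekly_Sales:

```
# y holds the target, X everything else
y = df["Weekly_Sales"]
X = df.drop(columns=["Weekly_Sales"])
```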

Import Pipeline from submodule sklearn.pipeline

Now we build a transformer for numerical features following two steps: impute the missing values with the feature median (use SimpleImputer), followed by normalization (use MinMaxScaler)
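A sketch, mirroring the transformer that appears later in the notebook:

```
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Median imputation followed by min-max scaling for the numerical columns
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("normalization", MinMaxScaler()),
])
```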

For categorical features, we apply one hot encoding OneHotEncoder ( there are many other options; see Scikit-learn documentation ):
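A minimal sketch:

```
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns; ignore unseen categories at transform time
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
```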

Piece together the numeric_transformer and categorical_transformer using ColumnTransformer:
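A sketch, where numeric_features and categorical_features are assumed to list the chosen column names:

```
from sklearn.compose import ColumnTransformer

# Route numerical and categorical columns to their respective transformers
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])
```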

Lastly, let's append the regression model to preprocessing pipeline to complete a full prediction pipeline.
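A sketch of the full pipeline with a simple linear regression model:

```
from sklearn.linear_model import LinearRegression

# Preprocessing and the regression model chained into a single estimator
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LinearRegression()),
])
```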

The pipeline has been built! The rest is to fit it on the training data and evaluate it on the test data.

Let's run the prediction.
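A sketch, assuming X and y were split into X_train/X_test and y_train/y_test as in the final cell:

```
# Fit on the training split and report R^2 on the held-out test split
model.fit(X_train, y_train)
print("model score: %.3f" % model.score(X_test, y_test))
```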

Optional: Discuss what type of Feature Selection strategy you would use to select the features.

Can we predict the weekly sales with the given data?

Weekly sales do not correlate strongly with the numerical features, so we may try a tree-based method, here a random forest, to predict the sales.

```
from sklearn.ensemble import RandomForestRegressor

numeric_features_2 = ['Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2',
                      'MarkDown3', 'MarkDown4', 'MarkDown5', 'Unemployment']

numeric_transformer_2 = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("normalization", MinMaxScaler()),
])

preprocessor_2 = ColumnTransformer(transformers=[
    ("num", numeric_transformer_2, numeric_features_2),
    ("cat", categorical_transformer, categorical_features),
])

# Weekly_Sales is continuous, so we need a regressor (not a classifier);
# deprecated/invalid arguments such as criterion='mse' are dropped here
model_2 = Pipeline(steps=[
    ("preprocessor", preprocessor_2),
    ("model", RandomForestRegressor(n_estimators=80, n_jobs=1, random_state=None)),
])

model_2.fit(X_train, y_train)
print("model score: %.3f" % model_2.score(X_test, y_test))
```


References

  1. Original dataset from Kaggle: Walmart sales forecast datasets
  2. Notebook: cracking the Walmart sales forecasting challenge