import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
id = '1EW37JE8wS8xDzUh6ow7YG3FpF2LVl_33'
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('online_shoppers_intention.csv.gz')
shopper = pd.read_csv('online_shoppers_intention.csv.gz')
shopper.head()
Administrative | Administrative_Duration | Informational | Informational_Duration | ProductRelated | ProductRelated_Duration | BounceRates | ExitRates | PageValues | SpecialDay | Month | OperatingSystems | Browser | Region | TrafficType | VisitorType | Weekend | Revenue | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.0 | 0 | 0.0 | 1 | 0.000000 | 0.20 | 0.20 | 0.0 | 0.0 | Feb | 1 | 1 | 1 | 1 | Returning_Visitor | False | False |
1 | 0 | 0.0 | 0 | 0.0 | 2 | 64.000000 | 0.00 | 0.10 | 0.0 | 0.0 | Feb | 2 | 2 | 1 | 2 | Returning_Visitor | False | False |
2 | 0 | 0.0 | 0 | 0.0 | 1 | 0.000000 | 0.20 | 0.20 | 0.0 | 0.0 | Feb | 4 | 1 | 9 | 3 | Returning_Visitor | False | False |
3 | 0 | 0.0 | 0 | 0.0 | 2 | 2.666667 | 0.05 | 0.14 | 0.0 | 0.0 | Feb | 3 | 2 | 2 | 4 | Returning_Visitor | False | False |
4 | 0 | 0.0 | 0 | 0.0 | 10 | 627.500000 | 0.02 | 0.05 | 0.0 | 0.0 | Feb | 3 | 3 | 1 | 4 | Returning_Visitor | True | False |
shopper.dtypes
Administrative int64 Administrative_Duration float64 Informational int64 Informational_Duration float64 ProductRelated int64 ProductRelated_Duration float64 BounceRates float64 ExitRates float64 PageValues float64 SpecialDay float64 Month object OperatingSystems int64 Browser int64 Region int64 TrafficType int64 VisitorType object Weekend bool Revenue bool dtype: object
shopper.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Administrative | 12330.0 | 2.315166 | 3.321784 | 0.0 | 0.000000 | 1.000000 | 4.000000 | 27.000000 |
Administrative_Duration | 12330.0 | 80.818611 | 176.779107 | 0.0 | 0.000000 | 7.500000 | 93.256250 | 3398.750000 |
Informational | 12330.0 | 0.503569 | 1.270156 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 24.000000 |
Informational_Duration | 12330.0 | 34.472398 | 140.749294 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 2549.375000 |
ProductRelated | 12330.0 | 31.731468 | 44.475503 | 0.0 | 7.000000 | 18.000000 | 38.000000 | 705.000000 |
ProductRelated_Duration | 12330.0 | 1194.746220 | 1913.669288 | 0.0 | 184.137500 | 598.936905 | 1464.157214 | 63973.522230 |
BounceRates | 12330.0 | 0.022191 | 0.048488 | 0.0 | 0.000000 | 0.003112 | 0.016813 | 0.200000 |
ExitRates | 12330.0 | 0.043073 | 0.048597 | 0.0 | 0.014286 | 0.025156 | 0.050000 | 0.200000 |
PageValues | 12330.0 | 5.889258 | 18.568437 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 361.763742 |
SpecialDay | 12330.0 | 0.061427 | 0.198917 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
OperatingSystems | 12330.0 | 2.124006 | 0.911325 | 1.0 | 2.000000 | 2.000000 | 3.000000 | 8.000000 |
Browser | 12330.0 | 2.357097 | 1.717277 | 1.0 | 2.000000 | 2.000000 | 2.000000 | 13.000000 |
Region | 12330.0 | 3.147364 | 2.401591 | 1.0 | 1.000000 | 3.000000 | 4.000000 | 9.000000 |
TrafficType | 12330.0 | 4.069586 | 4.025169 | 1.0 | 2.000000 | 2.000000 | 4.000000 | 20.000000 |
shopper.columns
Index(['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend', 'Revenue'], dtype='object')
shopper.isnull().sum(axis=0).sort_values(ascending=False)
Administrative 0 Administrative_Duration 0 Weekend 0 VisitorType 0 TrafficType 0 Region 0 Browser 0 OperatingSystems 0 Month 0 SpecialDay 0 PageValues 0 ExitRates 0 BounceRates 0 ProductRelated_Duration 0 ProductRelated 0 Informational_Duration 0 Informational 0 Revenue 0 dtype: int64
shopper.nunique()
Administrative 27 Administrative_Duration 3335 Informational 17 Informational_Duration 1258 ProductRelated 311 ProductRelated_Duration 9551 BounceRates 1872 ExitRates 4777 PageValues 2704 SpecialDay 6 Month 10 OperatingSystems 8 Browser 13 Region 9 TrafficType 20 VisitorType 3 Weekend 2 Revenue 2 dtype: int64
Something interesting
Training data is June to December, Test data is Feb.-March
shopper['Month'].unique()
array(['Feb', 'Mar', 'May', 'Oct', 'June', 'Jul', 'Aug', 'Nov', 'Sep', 'Dec'], dtype=object)
df_train = shopper[shopper['Month'].isin(['Feb','Mar'])]
df_test = shopper[shopper['Month'].isin(['June', 'Jul', 'Aug', 'Nov', 'Sep',
'Dec'])]
target = ['Revenue']
cat_features = ['Administrative', 'Administrative_Duration', 'Informational',
'Informational_Duration']
num_features = ['BounceRates', 'ExitRates', 'PageValues']
other_factors =['SpecialDay', 'Month',
'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType',
'Weekend']
y = df_train[target]
df_cat = df_train[cat_features]
df_num = df_train[num_features]
df_factor = df_train[other_factors]
print('\033[1mNumeric Features Distribution'.center(100))
figsize = (12, 4)
n=len(num_features)
colors = ['g', 'b', 'r', 'y', 'k']
# histogram
plt.figure(figsize=figsize)
for i in range(len(num_features)):
plt.subplot(1,n,i+1)
sns.distplot(shopper[num_features[i]],
bins=100,
color = colors[i])
plt.tight_layout();
plt.figure(figsize=figsize)
for i in range(len(num_features)):
plt.subplot(1,n,i+1)
df_train.boxplot(num_features[i], grid=False)
plt.tight_layout();
Numeric Features Distribution
/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
non_num = ['Month', 'VisitorType','Weekend', 'Revenue']
for c in non_num:
print(df_train[c].unique())
['Feb' 'Mar'] ['Returning_Visitor' 'New_Visitor'] [False True] [False True]
month_mask = {s:i+1 for i, s in enumerate(['Jan','Feb', 'Mar','Apr', 'May', 'Oct', 'June', 'Jul', 'Aug', 'Nov', 'Sep', 'Dec'])}
vistor_mask = {s:i+1 for i, s in enumerate(['Returning_Visitor', 'New_Visitor', 'Other'])}
binary_mask = {s:i for i, s in enumerate(['False', 'True'])}
print(month_mask)
{'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Oct': 6, 'June': 7, 'Jul': 8, 'Aug': 9, 'Nov': 10, 'Sep': 11, 'Dec': 12}
df = df_train.copy()
# change non-numerical value to numeric with the above mask
mask = [month_mask,vistor_mask,binary_mask,binary_mask]
for c in range(len(non_num)):
df[non_num[c]] = df[non_num[c]].replace(mask[c])
df['Weekend'] = df['Weekend'].astype(int) # Use 1 and 0 to represent Ture and False
df['Revenue'] = df['Revenue'].astype(int)
df.head(2)
Administrative | Administrative_Duration | Informational | Informational_Duration | ProductRelated | ProductRelated_Duration | BounceRates | ExitRates | PageValues | SpecialDay | Month | OperatingSystems | Browser | Region | TrafficType | VisitorType | Weekend | Revenue | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.0 | 0 | 0.0 | 1 | 0.0 | 0.2 | 0.2 | 0.0 | 0.0 | 2 | 1 | 1 | 1 | 1 | 1 | 0 | 0 |
1 | 0 | 0.0 | 0 | 0.0 | 2 | 64.0 | 0.0 | 0.1 | 0.0 | 0.0 | 2 | 2 | 2 | 1 | 2 | 1 | 0 | 0 |
sns.pairplot(df_num)
<seaborn.axisgrid.PairGrid at 0x7fe69fc18ad0>
Missing Jan, and April data
for c in range(len(non_num)):
df_test[non_num[c]] = df_test[non_num[c]].replace(mask[c])
df_test['Weekend'] = df_test['Weekend'].astype(int) # Use 1 and 0 to represent Ture and False
df_test['Revenue'] = df_test['Revenue'].astype(int)
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy This is separate from the ipykernel package so we can avoid doing imports until /usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy """ /usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
import warnings
warnings.filterwarnings("ignore")
print('\033[1mCat Features Distribution'.center(100))
figsize = (12, 4)
n=len(cat_features)
colors = ['g', 'b', 'r', 'y', 'k']
# histogram
plt.figure(figsize=figsize)
for i in range(len(cat_features)):
plt.subplot(1,n,i+1)
sns.distplot(shopper[cat_features[i]],
bins=100,
color = colors[i])
plt.tight_layout();
plt.figure(figsize=figsize)
for i in range(len(cat_features)):
plt.subplot(1,n,i+1)
shopper.boxplot(cat_features[i], grid=False)
plt.tight_layout();
Cat Features Distribution
print('\033[1mOther Factors Distribution'.center(100))
figsize = (12, 4)
n=len(other_factors)
colors = ['g', 'b', 'r', 'y', 'k','g','b','r']
# histogram
plt.figure(figsize=figsize)
for i in range(len(other_factors)):
plt.subplot(1,n,i+1)
sns.distplot(df[other_factors[i]],
bins=100,
color = colors[i])
plt.tight_layout();
plt.figure(figsize=figsize)
for i in range(len(other_factors)):
plt.subplot(1,n,i+1)
df.boxplot(other_factors[i], grid=False)
plt.tight_layout();
Other Factors Distribution
y.describe()
Revenue | |
---|---|
count | 2091 |
unique | 2 |
top | False |
freq | 1896 |
1.4 Split data before feature engineering to avoid data leak issue
from sklearn.model_selection import train_test_split
small_df = df.copy().sample(n=int(1000), random_state=12)
plt.figure(figsize=(12,10))
cor = small_df.corr()
sns.heatmap(cor, vmin = -1, vmax = 1)
<matplotlib.axes._subplots.AxesSubplot at 0x7fe69b16aa10>
keep_columns = np.full(cor.shape[0], True)
for i in range(cor.shape[0] - 1):
for j in range(i + 1, cor.shape[0] - 1):
if (np.abs(cor.iloc[i, j]) >= 0.8): # 0.8 is the correlation threshold
keep_columns[j] = False
selected_columns = df.columns[keep_columns]
df_reduced = df[selected_columns]
X = df_reduced.drop(columns=['Revenue'])
y = df_reduced['Revenue']
df_test_reduced = df_test[selected_columns]
X_test = df_test_reduced.drop(columns=['Revenue'])
y_test = df_test_reduced['Revenue']
print("selected features: ", len(X.columns),X.columns)
selected features: 15 Index(['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'BounceRates', 'PageValues', 'SpecialDay', 'Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend'], dtype='object')
df_reduced.head(3)
Administrative | Administrative_Duration | Informational | Informational_Duration | ProductRelated | BounceRates | PageValues | SpecialDay | Month | OperatingSystems | Browser | Region | TrafficType | VisitorType | Weekend | Revenue | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.0 | 0 | 0.0 | 1 | 0.2 | 0.0 | 0.0 | 2 | 1 | 1 | 1 | 1 | 1 | 0 | 0 |
1 | 0 | 0.0 | 0 | 0.0 | 2 | 0.0 | 0.0 | 0.0 | 2 | 2 | 2 | 1 | 2 | 1 | 0 | 0 |
2 | 0 | 0.0 | 0 | 0.0 | 1 | 0.2 | 0.0 | 0.0 | 2 | 4 | 1 | 9 | 3 | 1 | 0 | 0 |
j = 0
fig = plt.figure(figsize=(16, 12))
for i in range(len(selected_columns)-1):
plt.subplot(4,4,j+1 )
j += 1
d = df_reduced[selected_columns[i]]
sns.histplot(d[df_reduced['Revenue'] == 0], color='g', label='no' )
sns.histplot(d[df_reduced['Revenue'] == 1], color='r', label='yes' )
plt.legend(loc='best')
fig.suptitle("Selected Features vs Revenue Analysis")
fig.subplots_adjust(top=1.05);
fig.tight_layout()
print("False class percentage is ",100* df_reduced[df_reduced['Revenue']==0].shape[0]/df_reduced.shape[0], "%")
False class percentage is 90.67431850789096 %
Informational duration show very few yes data for revenue. Other featurs show more or less the same pattern for yes and no. Imbalanced data should be taken into consideration before training models.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
norm_x = df_reduced[['Administrative_Duration','Informational_Duration','BounceRates','PageValues']]
df_reduced[['Administrative_Duration_norm','Informational_Duration_norm','BounceRates_norm','PageValues_norm']] = scaler.fit_transform(norm_x)
df_reduced.drop(columns=['Administrative_Duration','Informational_Duration','BounceRates','PageValues'],axis = 1, inplace=True)
df_reduced.describe()
Administrative | Informational | ProductRelated | SpecialDay | Month | OperatingSystems | Browser | Region | TrafficType | VisitorType | Weekend | Revenue | Administrative_Duration_norm | Informational_Duration_norm | BounceRates_norm | PageValues_norm | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2091.000000 | 2091.000000 | 2091.000000 | 2091.000000 | 2091.000000 | 2091.000000 | 2091.000000 | 2091.000000 | 2091.000000 | 2091.000000 | 2091.000000 | 2091.000000 | 2.091000e+03 | 2.091000e+03 | 2.091000e+03 | 2.091000e+03 |
mean | 1.769488 | 0.391200 | 19.049737 | 0.020564 | 2.912004 | 2.064562 | 2.281683 | 3.000956 | 3.142994 | 1.111430 | 0.243424 | 0.093257 | -1.699050e-17 | -1.953908e-17 | -6.796200e-18 | -4.077720e-17 |
std | 2.780218 | 1.179976 | 22.883550 | 0.113820 | 0.283357 | 0.762232 | 1.402849 | 2.331806 | 2.972300 | 0.314739 | 0.429252 | 0.290862 | 1.000239e+00 | 1.000239e+00 | 1.000239e+00 | 1.000239e+00 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | -3.972724e-01 | -2.438402e-01 | -4.365093e-01 | -2.678064e-01 |
25% | 0.000000 | 0.000000 | 5.000000 | 0.000000 | 3.000000 | 2.000000 | 2.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | -3.972724e-01 | -2.438402e-01 | -4.365093e-01 | -2.678064e-01 |
50% | 0.000000 | 0.000000 | 12.000000 | 0.000000 | 3.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 1.000000 | 0.000000 | 0.000000 | -3.972724e-01 | -2.438402e-01 | -4.365093e-01 | -2.678064e-01 |
75% | 3.000000 | 0.000000 | 25.000000 | 0.000000 | 3.000000 | 2.000000 | 2.000000 | 4.000000 | 3.000000 | 1.000000 | 0.000000 | 0.000000 | -1.463651e-02 | -2.438402e-01 | -1.761783e-01 | -2.678064e-01 |
max | 24.000000 | 16.000000 | 328.000000 | 1.000000 | 3.000000 | 7.000000 | 10.000000 | 9.000000 | 15.000000 | 2.000000 | 1.000000 | 1.000000 | 1.184250e+01 | 1.244800e+01 | 3.208125e+00 | 1.871234e+01 |
norm_x_test = df_test_reduced[['Administrative_Duration','Informational_Duration','BounceRates','PageValues']]
df_test_reduced[['Administrative_Duration_norm','Informational_Duration_norm','BounceRates_norm','PageValues_norm']] = scaler.fit_transform(norm_x_test)
df_test_reduced.drop(columns=['Administrative_Duration','Informational_Duration','BounceRates','PageValues'],axis = 1, inplace=True)
X_test_new = df_test_reduced.drop(columns=['Revenue'])
y_test_new = df_test_reduced['Revenue']
print("True class of revenue is ",df_reduced[df_reduced['Revenue']==1].shape[0])
True class of revenue is 195
true_re = df_reduced[df_reduced['Revenue']==1]
false_re = df_reduced[df_reduced['Revenue']==0][:195]
norm_dist_df = pd.concat([true_re, false_re])
sns.distplot(norm_dist_df['Revenue'])
<matplotlib.axes._subplots.AxesSubplot at 0x7fe699f7ad50>
X_new = norm_dist_df.drop(columns=['Revenue'])
y_new = norm_dist_df[['Revenue']]
print(X_new.shape, y_new.shape)
(390, 15) (390, 1)
X_norm = df_reduced.drop(columns=['Revenue'])
y_norm = df_reduced[['Revenue']]
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
%%time
LR = LogisticRegression()
LR.fit(X_new, y_new)
LR_pred = LR.predict(X_test_new)
CPU times: user 54.3 ms, sys: 5.99 ms, total: 60.3 ms Wall time: 110 ms
X_new.columns
Index(['Administrative', 'Informational', 'ProductRelated', 'SpecialDay', 'Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend', 'Administrative_Duration_norm', 'Informational_Duration_norm', 'BounceRates_norm', 'PageValues_norm'], dtype='object')
print(classification_report(y_test_new, LR_pred))
precision recall f1-score support 0 0.00 0.00 0.00 5093 1 0.19 1.00 0.33 1233 accuracy 0.19 6326 macro avg 0.10 0.50 0.16 6326 weighted avg 0.04 0.19 0.06 6326
%%time
LR_N = LogisticRegression()
LR_N.fit(X_norm, y_norm)
LR_pred_orig = LR_N.predict(X_test_new)
print("Report before subsampling")
print(classification_report(y_test_new, LR_pred_orig))
Report before subsampling precision recall f1-score support 0 0.86 0.96 0.91 5093 1 0.69 0.34 0.45 1233 accuracy 0.84 6326 macro avg 0.78 0.65 0.68 6326 weighted avg 0.83 0.84 0.82 6326 CPU times: user 131 ms, sys: 135 ms, total: 267 ms Wall time: 236 ms
from sklearn.svm import SVC
%%time
svm = SVC(kernel = 'linear',random_state=0)
svm.fit(X_new, y_new)
svm_pred = svm.predict(X_test_new)
print(classification_report(y_test_new, svm_pred))
precision recall f1-score support 0 0.00 0.00 0.00 5093 1 0.19 1.00 0.33 1233 accuracy 0.19 6326 macro avg 0.10 0.50 0.16 6326 weighted avg 0.04 0.19 0.06 6326 CPU times: user 103 ms, sys: 29.1 ms, total: 132 ms Wall time: 174 ms
%%time
svm_n = SVC(kernel = 'linear',random_state=0)
svm_n.fit(X_norm,y_norm)
svm_pred_norm = svm_n.predict(X_test_new)
print(classification_report(y_test_new, svm_pred_norm))
precision recall f1-score support 0 0.00 0.00 0.00 5093 1 0.19 1.00 0.33 1233 accuracy 0.19 6326 macro avg 0.10 0.50 0.16 6326 weighted avg 0.04 0.19 0.06 6326 CPU times: user 2.41 s, sys: 15.9 ms, total: 2.43 s Wall time: 3.05 s
test with subsampling
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from subprocess import call
RF = RandomForestClassifier(max_depth =5)
RF.fit(X_new, y_new)
rf_pred = RF.predict(X_test_new)
print(classification_report(y_test_new , rf_pred))
precision recall f1-score support 0 0.92 0.88 0.90 5093 1 0.59 0.69 0.64 1233 accuracy 0.85 6326 macro avg 0.75 0.79 0.77 6326 weighted avg 0.86 0.85 0.85 6326
export_graphviz(RF.estimators_[0], max_depth=5, out_file='tree.dot',
feature_names = selected_columns[:-1],
rounded = True, proportion = False,
precision = 2, filled = True)
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
0
# Use Random Forest to get feature ranks/importances for each feature
importances = RF.feature_importances_
std = np.std([tree.feature_importances_ for tree in RF.estimators_],
axis=0)
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in range(X_new.shape[1]):
print("%d. %s (feature %d) (%f)" %
(f + 1, X_new.columns[indices[f]], indices[f], importances[indices[f]]))
# Plot the impurity-based feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_new.shape[1]), importances[indices],
color="r", yerr=std[indices], align="center")
plt.xticks(range(X_new.shape[1]), indices)
plt.xlim([-1, X_new.shape[1]]);
Feature ranking: 1. PageValues_norm (feature 14) (0.375309) 2. Month (feature 4) (0.255033) 3. Administrative_Duration_norm (feature 11) (0.091557) 4. ProductRelated (feature 2) (0.071777) 5. Administrative (feature 0) (0.071460) 6. SpecialDay (feature 3) (0.044815) 7. Informational_Duration_norm (feature 12) (0.022610) 8. BounceRates_norm (feature 13) (0.020488) 9. TrafficType (feature 8) (0.015348) 10. VisitorType (feature 9) (0.009463) 11. Informational (feature 1) (0.008952) 12. Browser (feature 6) (0.004435) 13. Region (feature 7) (0.004160) 14. OperatingSystems (feature 5) (0.003225) 15. Weekend (feature 10) (0.001368)
1.With subsampling, logistic regression shows higher recall for class 'True', it can predict 100% for class 'True' which is important. However, for class 'False' it is 0%, means it is really bad to predict 'false' class.
from sklearn.metrics import plot_roc_curve
classifiers = [LR,svm, RF]
plt.figure(figsize=(10,8))
ax = plt.gca()
for c in range(len(classifiers)):
plot_roc_curve(classifiers[c],X_test_new,y_test_new,ax=ax)
plt.title('Three Classifiers ROC comparison')
Text(0.5, 1.0, 'Three Classifiers ROC comparison')
classifiers = [LR_N,svm_n, RF]
plt.figure(figsize=(10,8))
ax = plt.gca()
for c in range(len(classifiers)):
plot_roc_curve(classifiers[c],X_test_new,y_test_new,ax=ax)
plt.title('Three Classifiers ROC comparison with Imbalanced class data')
Text(0.5, 1.0, 'Three Classifiers ROC comparison with Imbalanced class data')
The imbalanced class data shows a better result than the subsampling
from sklearn.metrics import roc_curve, auc, balanced_accuracy_score, f1_score
n = [10, 50, 100, 500,1000 ]
for i in n:
RF = RandomForestClassifier(i,max_depth =10)
RF.fit(X_new, y_new)
rf_pred = RF.predict(X_test_new)
print("**************n_estimators is ", i,"************************")
print(classification_report(y_test_new , rf_pred))
**************n_estimators is 10 ************************ precision recall f1-score support 0 0.92 0.89 0.90 5093 1 0.59 0.68 0.63 1233 accuracy 0.85 6326 macro avg 0.76 0.78 0.77 6326 weighted avg 0.86 0.85 0.85 6326 **************n_estimators is 50 ************************ precision recall f1-score support 0 0.92 0.88 0.90 5093 1 0.59 0.69 0.63 1233 accuracy 0.85 6326 macro avg 0.76 0.79 0.77 6326 weighted avg 0.86 0.85 0.85 6326 **************n_estimators is 100 ************************ precision recall f1-score support 0 0.92 0.88 0.90 5093 1 0.59 0.69 0.63 1233 accuracy 0.84 6326 macro avg 0.75 0.79 0.77 6326 weighted avg 0.86 0.84 0.85 6326 **************n_estimators is 500 ************************ precision recall f1-score support 0 0.92 0.88 0.90 5093 1 0.59 0.69 0.63 1233 accuracy 0.85 6326 macro avg 0.75 0.79 0.77 6326 weighted avg 0.86 0.85 0.85 6326 **************n_estimators is 1000 ************************ precision recall f1-score support 0 0.92 0.88 0.90 5093 1 0.59 0.69 0.64 1233 accuracy 0.85 6326 macro avg 0.76 0.79 0.77 6326 weighted avg 0.86 0.85 0.85 6326
from sklearn.cluster import KMeans
shopper.head(1)
Administrative | Administrative_Duration | Informational | Informational_Duration | ProductRelated | ProductRelated_Duration | BounceRates | ExitRates | PageValues | SpecialDay | Month | OperatingSystems | Browser | Region | TrafficType | VisitorType | Weekend | Revenue | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.0 | 0 | 0.0 | 1 | 0.0 | 0.2 | 0.2 | 0.0 | 0.0 | Feb | 1 | 1 | 1 | 1 | Returning_Visitor | False | False |
df_cluster = shopper.copy()
for c in range(len(non_num)):
df_cluster[non_num[c]] = df_cluster[non_num[c]].replace(mask[c])
df_cluster['Weekend'] = df_cluster['Weekend'].astype(int) # Use 1 and 0 to represent Ture and False
df_cluster['Revenue'] = df_cluster['Revenue'].astype(int)
clusters = []
for n in range(1,11):
km = KMeans(n_clusters=n).fit(df_cluster)
clusters.append(km.inertia_)
fig, ax = plt.subplots(figsize=(12,8))
sns.lineplot(x=list(range(1,11)), y = clusters, ax = ax)
ax.set_title("searching for elbow")
ax.set_xlabel("Clusters")
ax.set_ylabel("Inertia")
Text(0, 0.5, 'Inertia')
km = KMeans(n_clusters=4).fit(df_cluster)
df_cluster['Labels'] = km.labels_
d_false = df_cluster[df_cluster['Revenue']==0]
d_true= df_cluster[df_cluster['Revenue']==1]
fig = plt.figure(figsize=(40,30))
ax = fig.add_subplot(431)
sns.scatterplot(df_cluster['ProductRelated'],df_cluster['BounceRates'],hue=df_cluster['Labels'])
plt.title("Labels for BounceRates")
ax = fig.add_subplot(432)
sns.scatterplot(df_cluster['ProductRelated'],df_cluster['ProductRelated_Duration'],hue=df_cluster['Labels'])
plt.title("Labels for BounceRates")
ax = fig.add_subplot(433)
sns.scatterplot(df_cluster['ProductRelated'],df_cluster['ExitRates'],hue=df_cluster['Labels'])
plt.title("Labels for ExitRates")
ax = fig.add_subplot(434)
sns.scatterplot(df_cluster['ProductRelated'],df_cluster['PageValues'],hue=df_cluster['Labels'])
plt.title("Labels for PageValues")
ax = fig.add_subplot(435)
sns.scatterplot(df_cluster['ProductRelated'],df_cluster['Administrative_Duration'],hue=df_cluster['Labels'])
plt.title("Labels for Administrative_Duration")
ax = fig.add_subplot(436)
sns.scatterplot(df_cluster['ProductRelated'],df_cluster['Informational_Duration'],hue=df_cluster['Labels'])
plt.title("Labels for Informational_Duration")
ax = fig.add_subplot(437)
sns.scatterplot(df_cluster['ProductRelated'],df_cluster['VisitorType'],hue=df_cluster['Labels'])
plt.title("Labels for Informational_Duration")
ax = fig.add_subplot(438)
sns.scatterplot(df_cluster['ProductRelated'],df_cluster['ProductRelated_Duration'],hue=df_cluster['VisitorType'])
plt.title("Labels for Informational_Duration")
Text(0.5, 1.0, 'Labels for Informational_Duration')
km = KMeans(n_clusters=4).fit(df_cluster)
df_cluster['Labels'] = km.labels_
df1 = df_cluster[df_cluster['VisitorType']==1]
df2 = df_cluster[df_cluster['VisitorType']==2]
df3 = df_cluster[df_cluster['VisitorType']==3]
fig = plt.figure(figsize=(20,15))
ax = fig.add_subplot(231)
sns.scatterplot(df1['ProductRelated'],df1['Informational_Duration'],hue=df_cluster['Labels'])
plt.title("Labels for VisitorType 1")
ax = fig.add_subplot(232)
sns.scatterplot(df2['ProductRelated'],df2['Informational_Duration'],hue=df_cluster['Labels'])
plt.title("Labels for VisitorType 2")
ax = fig.add_subplot(233)
sns.scatterplot(df3['ProductRelated'],df3['Informational_Duration'],hue=df_cluster['Labels'])
plt.title("Labels for VisitorType 3")
Text(0.5, 1.0, 'Labels for VisitorType 3')
df1.head(1)
Administrative | Administrative_Duration | Informational | Informational_Duration | ProductRelated | BounceRates | PageValues | SpecialDay | Month | OperatingSystems | Browser | Region | TrafficType | VisitorType | Weekend | Revenue | Labels | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.0 | 0 | 0.0 | 1 | 0.2 | 0.0 | 0.0 | 2 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 |
km = KMeans(n_clusters=3).fit(df_cluster)
df_cluster['Labels'] = km.labels_
d_false = df_cluster[df_cluster['Revenue']==0]
d_true= df_cluster[df_cluster['Revenue']==1]
fig = plt.figure(figsize=(20,15))
ax = fig.add_subplot(431)
sns.scatterplot(df_cluster['ProductRelated'],df_cluster['BounceRates'],hue=df_cluster['Labels'])
plt.title("Labels for BounceRates")
ax = fig.add_subplot(432)
sns.scatterplot(df_cluster['ProductRelated'],df_cluster['ProductRelated_Duration'],hue=df_cluster['Labels'])
plt.title("Labels for BounceRates")
ax = fig.add_subplot(433)
sns.scatterplot(df_cluster['ProductRelated'],df_cluster['ExitRates'],hue=df_cluster['Labels'])
plt.title("Labels for ExitRates")
ax = fig.add_subplot(434)
sns.scatterplot(df_cluster['ProductRelated'],df_cluster['PageValues'],hue=df_cluster['Labels'])
plt.title("Labels for PageValues")
ax = fig.add_subplot(435)
sns.scatterplot(df_cluster['ProductRelated'],df_cluster['Administrative_Duration'],hue=df_cluster['Labels'])
plt.title("Labels for Administrative_Duration")
ax = fig.add_subplot(436)
sns.scatterplot(df_cluster['ProductRelated'],df_cluster['Informational_Duration'],hue=df_cluster['Labels'])
plt.title("Labels for Informational_Duration")
ax = fig.add_subplot(437)
sns.scatterplot(df_cluster['ProductRelated'],df_cluster['VisitorType'],hue=df_cluster['Labels'])
plt.title("Labels for Informational_Duration")
ax = fig.add_subplot(438)
sns.scatterplot(df_cluster['ProductRelated'],df_cluster['ProductRelated_Duration'],hue=df_cluster['VisitorType'])
plt.title("Labels for Informational_Duration")
Text(0.5, 1.0, 'Labels for Informational_Duration')
from scipy.cluster import hierarchy
from scipy.spatial import distance_matrix
dist = distance_matrix(df_cluster, df_cluster)
Z = hierarchy.linkage(dist, 'complete')
plt.figure(figsize=(18, 50))
dendro = hierarchy.dendrogram(Z)
Use June-September data to predict October-December revenue.
use normalized values for numeric features to avoid outlier 'drawing effect'
df_cluster.columns
Index(['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend', 'Revenue', 'Labels'], dtype='object')
df_cluster.head(1)
Administrative | Administrative_Duration | Informational | Informational_Duration | ProductRelated | ProductRelated_Duration | BounceRates | ExitRates | PageValues | SpecialDay | Month | OperatingSystems | Browser | Region | TrafficType | VisitorType | Weekend | Revenue | Labels | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.0 | 0 | 0.0 | 1 | 0.0 | 0.2 | 0.2 | 0.0 | 0.0 | 2 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 |
df_cluster = df_cluster[selected_columns]
print(df_cluster.shape)
(12330, 16)
df_semi_train = df_cluster[df_cluster['Month'].isin([6,7,8,9])]
df_semi_test = df_cluster[df_cluster['Month'].isin([10,11,12])]
df_semi_valid = df_cluster[df_cluster['Month'].isin([2,3])]
X_semi_train = df_semi_train.drop(columns=['Revenue'])
y_semi_train = df_semi_train[['Revenue']]
X_semi_test = df_semi_test.drop(columns=['Revenue'])
y_semi_test = df_semi_test[['Revenue']]
X_semi_valid = df_semi_valid.drop(columns=['Revenue'])
y_semi_valid = df_semi_valid[['Revenue']]
svm_label = SVC()
svm_label.fit(X_semi_train,y_semi_train)
SVC()
y_semi_test_pred = pd.DataFrame(svm_label.predict(X_semi_test),columns=['Revenue'])
X_semi = pd.concat([X_semi_train,X_semi_test])
y_semi_self = pd.concat([y_semi_train,y_semi_test_pred])
y_semi_orig = pd.concat([y_semi_train,y_semi_test])
print(X_semi.shape, y_semi_self.shape, y_semi_orig.shape)
(6875, 15) (6875, 1) (6875, 1)
RF.fit(X_semi, y_semi_self)
semi_pred_self = RF.predict(X_semi_valid)
print(classification_report(y_semi_valid,semi_pred_self))
precision recall f1-score support 0 0.95 0.99 0.97 1896 1 0.82 0.45 0.58 195 accuracy 0.94 2091 macro avg 0.88 0.72 0.77 2091 weighted avg 0.93 0.94 0.93 2091
pd.Series(RF.feature_importances_, index=X_semi.columns).nlargest(15).plot(kind='barh')
<matplotlib.axes._subplots.AxesSubplot at 0x7fe68c4b6f90>
RF.fit(X_semi, y_semi_orig)
semi_pred_orig = RF.predict(X_semi_valid)
print(classification_report(y_semi_valid,semi_pred_orig))
precision recall f1-score support 0 0.96 0.98 0.97 1896 1 0.78 0.62 0.69 195 accuracy 0.95 2091 macro avg 0.87 0.80 0.83 2091 weighted avg 0.94 0.95 0.95 2091
pd.Series(RF.feature_importances_, index=X_semi.columns).nlargest(15).plot(kind='barh')
<matplotlib.axes._subplots.AxesSubplot at 0x7fe68c2be890>
RF.fit(X_semi_train, y_semi_train)
semi_pred_J = RF.predict(X_semi_valid)
print(classification_report(y_semi_valid,semi_pred_J))
precision recall f1-score support 0 0.95 0.98 0.97 1896 1 0.78 0.54 0.64 195 accuracy 0.94 2091 macro avg 0.87 0.76 0.80 2091 weighted avg 0.94 0.94 0.94 2091
pd.Series(RF.feature_importances_, index=X_semi.columns).nlargest(15).plot(kind='barh')
<matplotlib.axes._subplots.AxesSubplot at 0x7fe698a4b990>