Subscription Prediction with PySpark and MLlib

Learning Objectives

By the end of this session, you will be able to load and explore data with PySpark, prepare features with MLlib transformers, and train and evaluate classification models to predict term-deposit subscriptions.

Part 1: Data Loader

We are using a dataset from the UCI Machine Learning Repository.

  1. Use wget to download the dataset. Then use ls to verify that the bank.zip file was downloaded.
  1. Unzip the file and use ls to see the files.
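
If you're working in a Jupyter or Colab notebook, a cell like the following is one way to do this (the `!` prefix runs shell commands, and the URL is an assumption about where the UCI bank marketing data lives; check the course materials for the exact link):

```python
# Download the dataset (URL is an assumption; adjust if your course provides a different link).
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip
!ls            # verify bank.zip was downloaded

# Unzip and list the extracted files.
!unzip -o bank.zip
!ls
```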

Part 2: Exploring The Data

We will use the direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict whether the client will subscribe (Yes/No) to a term deposit.

  1. Load in the data and look at the columns.
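
A minimal sketch of this step, assuming a SparkSession named `spark` is available and the unzipped file is `bank.csv` (the header row and `;` delimiter are assumptions about how the file is formatted):

```python
# Read the CSV into a Spark DataFrame, letting Spark infer the column types.
df = spark.read.csv('bank.csv', header=True, inferSchema=True, sep=';')

df.printSchema()   # column names and inferred types
print(df.columns)  # just the column names
```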

Here are the columns you should see:

  1. Take a peek at the first five observations. Use the .show() method.
  1. To get a prettier result, it can be nice to use pandas to display the smaller DataFrame. Use the Spark .take() method to get the first 5 rows, then convert them to a pandas DataFrame. Don't forget to pass along the column names. You should see the same result as above, but in a more aesthetically appealing format.
  1. How many datapoints are there in the dataset? Use the .count() method.
  1. Use the .describe() method to see summary statistics on the features.

    Note that the result of .describe() is a Spark DataFrame, so the contents won't be displayed. It only has 5 rows, so you can just convert the whole thing to a pandas DataFrame with .toPandas().

  1. The above result includes the categorical columns, which don't have useful summary statistics. Let's limit the output to just the numeric features.

    numeric_features is defined below to contain the column names of the numeric features.

    Use the .select() method to select only the numeric features from the DataFrame and then get the summary statistics on the resulting DataFrame as we did above.

  1. Run the following code to look at the correlations between the numeric features. What do you see? (A combined sketch of these exploration steps appears just below.)
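
Here is a combined sketch of the exploration steps in this list, including the correlation check. It assumes `df` was loaded as above and that `numeric_features` matches the notebook's own definition:

```python
import pandas as pd

# Peek at the first five rows.
df.show(5)

# Prettier display: take 5 rows and hand them to pandas along with the column names.
pd.DataFrame(df.take(5), columns=df.columns)

# How many datapoints are there?
print(df.count())

# numeric_features is defined in the notebook; one common definition is:
# numeric_features = [t[0] for t in df.dtypes if t[1] == 'int']
df.select(numeric_features).describe().toPandas()

# Pairwise correlations between the numeric features.
numeric_data = df.select(numeric_features).toPandas()
print(numeric_data.corr())
```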

There aren't any highly correlated numeric variables, so we will keep them all for the model. However, the day and month columns are not really useful, so we will remove these two columns.

  1. Use the .drop() method to drop the month and day columns.

    Note that this method returns a new DataFrame, so save that result as df.

    Use the .printSchema() method to verify that df now has the correct columns.
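
A short sketch of this step:

```python
# .drop() returns a new DataFrame, so reassign the result to df.
df = df.drop('month', 'day')
df.printSchema()   # month and day should no longer appear
```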

Part 3: Preparing Data for Machine Learning

What follows is something analogous to a data-loading pipeline in TensorFlow: we're going to chain together some transformations that will convert our categorical variables into a one-hot format more amenable to training a machine learning model. The next code cell only sets this up; it doesn't yet run these transformations on our data.

The process includes category indexing, one-hot encoding, and VectorAssembler, a feature transformer that merges multiple columns into a single vector column.

The code is adapted from Databricks' documentation. It indexes each categorical column using StringIndexer, then converts the indexed categories into one-hot encoded variables; the resulting binary vectors are appended to the end of each row. We use StringIndexer again to encode our labels as label indices. Finally, we use VectorAssembler to combine all the feature columns into a single vector column.

  1. Complete the code by filling in the assignment of assembler. Use VectorAssembler, passing assemblerInputs as inputCols and naming the outputCol "features".
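
For reference, here is a sketch of how these stages are typically wired together. The list of categorical columns and the label column name 'deposit' are assumptions; use the names defined in the notebook's own cell:

```python
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# Assumed categorical input columns; substitute the notebook's own list.
categoricalColumns = ['job', 'marital', 'education', 'default',
                      'housing', 'loan', 'contact', 'poutcome']

stages = []
for categoricalCol in categoricalColumns:
    # Index each categorical column, then one-hot encode the index.
    stringIndexer = StringIndexer(inputCol=categoricalCol,
                                  outputCol=categoricalCol + 'Index')
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()],
                            outputCols=[categoricalCol + 'classVec'])
    stages += [stringIndexer, encoder]

# Encode the label column (name assumed) as a numeric label index.
label_stringIdx = StringIndexer(inputCol='deposit', outputCol='label')
stages += [label_stringIdx]

# Merge the one-hot vectors and the numeric columns into a single feature vector.
assemblerInputs = [c + 'classVec' for c in categoricalColumns] + numeric_features
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol='features')
stages += [assembler]
```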

Part 4: Pipeline

We use Pipeline to chain multiple Transformers and Estimators together to specify our machine learning workflow. A Pipeline’s stages are specified as an ordered array.

  1. Fit a pipeline on df.
  1. Use pipelineModel to transform df and assign the result to the variable transformed_df.
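
A sketch of these two steps, assuming `stages` is the list of transformers built in Part 3:

```python
from pyspark.ml import Pipeline

# Chain the transformation stages, fit them on df, then apply them to df.
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df)
transformed_df = pipelineModel.transform(df)
```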

From the transformation, we'd like to take the label and features columns as well as the original columns from df.

  1. Use the .select() method to pull these columns from transformed_df and reassign the resulting DataFrame to df.
  1. View the first five rows of the df DataFrame, using either of the methods we used in Part 2:
    • .show() method
    • .take() method and convert result to a Pandas DataFrame
  1. Randomly split the dataset into training and test sets, with 70% of the data in the training set and the remaining 30% in the test set. (A sketch of these steps appears after this list.)

    Hint: Call the .randomSplit() method.

  1. What are the sizes of the training and test sets?
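
A sketch of these steps (the `original_cols` helper name and the seed value are illustrative choices, not requirements):

```python
# Keep the new label and features columns plus the original columns.
original_cols = df.columns
df = transformed_df.select(['label', 'features'] + original_cols)
df.show(5)

# 70/30 split; a fixed seed makes the split reproducible.
train, test = df.randomSplit([0.7, 0.3], seed=2018)
print('Training set size:', train.count())
print('Test set size:', test.count())
```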

Part 5: Logistic Regression Model

  1. Fit a LogisticRegression with featuresCol as "features", labelCol as "label" and a maxIter of 10.
  1. We can obtain the coefficients by using LogisticRegressionModel’s attributes. Look at the following plot of the beta coefficients.
  1. Use the .transform() method to make predictions and save them as predictions.
  1. View the first 10 rows of the predictions DataFrame.
  1. What is the area under the ROC curve?

    You can find it with the evaluator's .evaluate() method.
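
A sketch of the modelling steps above, assuming `train` and `test` come from the split in Part 4:

```python
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Fit logistic regression on the training set.
lr = LogisticRegression(featuresCol='features', labelCol='label', maxIter=10)
lrModel = lr.fit(train)

# The fitted beta coefficients are available as a model attribute.
print(lrModel.coefficients)

# Predict on the test set and look at the first 10 rows.
predictions = lrModel.transform(test)
predictions.select('label', 'prediction', 'probability').show(10)

# Area under the ROC curve (the evaluator's default metric).
evaluator = BinaryClassificationEvaluator()
print('Test AUC:', evaluator.evaluate(predictions))
```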

OPTIONAL: Hyperparameter Tuning a Gradient-Boosted Tree Classifier

  1. Fit and make predictions using GBTClassifier. The syntax will match what we did above with LogisticRegression.
  1. Run some cross validation to compare different parameters.

    Note that it can take a while because it's training many gradient-boosted trees. Give it at least 10 minutes to complete.
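
A sketch of what this could look like, reusing `train`, `test`, and `evaluator` from the sketches above; the parameter grid is a small, hypothetical example:

```python
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Fit a gradient-boosted tree classifier with the same column conventions.
gbt = GBTClassifier(featuresCol='features', labelCol='label', maxIter=10)
gbtModel = gbt.fit(train)
predictions = gbtModel.transform(test)
print('GBT test AUC:', evaluator.evaluate(predictions))

# Cross-validate over a small (hypothetical) parameter grid.
paramGrid = (ParamGridBuilder()
             .addGrid(gbt.maxDepth, [2, 4, 6])
             .addGrid(gbt.maxBins, [20, 60])
             .addGrid(gbt.maxIter, [10, 20])
             .build())
cv = CrossValidator(estimator=gbt, estimatorParamMaps=paramGrid,
                    evaluator=evaluator, numFolds=5)
cvModel = cv.fit(train)      # this is the slow part
cvPredictions = cvModel.transform(test)
print('Cross-validated GBT test AUC:', evaluator.evaluate(cvPredictions))
```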

Acknowledgements

This notebook is adapted from Machine Learning with PySpark and MLlib