!pip install -q seaborn ## Use seaborn for pairplot
!pip install -q tpot  # Use TPOT for automl

     |████████████████████████████████| 87 kB 3.1 MB/s 
     |████████████████████████████████| 192.9 MB 68 kB/s 
     |████████████████████████████████| 160 kB 20.5 MB/s 
  Building wheel for stopit (setup.py) ... done


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Make NumPy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)

2.8.2


# Location of the raw Auto MPG data file that is downloaded and parsed below.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
# Passed as `names=` to read_csv, so these become the column labels in order.
column_names = [
  'MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
  'Acceleration', 'Model Year', 'Origin'
  ]

# Parsing options:
#   na_values='?'   -> '?' entries are read as NaN (rows with NaN are removed by dropna() below).
#   comment='\t'    -> everything from the first tab to the end of each line is ignored.
#   sep=' ' with skipinitialspace=True -> space-delimited fields; spaces right after a delimiter are skipped.
dataset = pd.read_csv(url, names=column_names, na_values='?', 
                      comment='\t', sep=' ', skipinitialspace=True)


dataset.tail()


dataset = dataset.dropna()


dataset['Origin'] = dataset['Origin'].replace({1: 'USA', 2: 'Europe', 3: 'Japan'})


# One-hot encode 'Origin' into one indicator column per country name
# (empty prefix/prefix_sep so the columns are named 'Europe', 'Japan', 'USA').
# dtype=float is explicit because pandas >= 2.0 defaults the indicator columns
# to bool, which would leave the feature frame with mixed dtypes that the Keras
# layers below may fail to convert to a float tensor.
dataset = pd.get_dummies(dataset, columns=['Origin'], prefix='', prefix_sep='',
                         dtype=float)
dataset.tail()


# Randomly sample 80% of the rows for training; random_state=0 makes the split reproducible.
train_dataset = dataset.sample(frac=0.8, random_state=0)
# Every row whose index was not sampled for training becomes the held-out test set.
test_dataset = dataset.drop(train_dataset.index)


sns.pairplot(train_dataset[['MPG', 'Cylinders', 'Displacement', 'Weight']], diag_kind='kde');


train_dataset.describe().transpose()


# Separate the regression target ('MPG') from the model inputs for each split.
train_labels = train_dataset['MPG']
train_features = train_dataset.drop(columns=['MPG'])

test_labels = test_dataset['MPG']
test_features = test_dataset.drop(columns=['MPG'])


print(train_features.shape, train_labels.shape)

(314, 9) (314,)


# Preprocessing layer that standardizes its inputs using a stored mean and variance.
normalizer =  tf.keras.layers.Normalization()


# Compute the per-feature mean and variance from the training features only.
normalizer.adapt(train_features)


# Show the statistics the layer learned during adapt().
print(f'feature mean: {normalizer.mean.numpy().squeeze()}\n')
print(f'feature variance: {normalizer.variance.numpy().squeeze()}')

feature mean: [   5.478  195.318  104.869 2990.252   15.559   75.898    0.178    0.197
    0.624]

feature variance: [     2.88   10850.413   1446.699 709896.9        7.755     13.467
      0.147      0.158      0.235]


# First training row as a 2-D array (one row, all feature columns).
first = np.array(train_features.iloc[:1])

# Temporarily print with two decimals to compare the raw and normalized row.
with np.printoptions(precision=2, suppress=True):
  # end='\n\n' leaves one blank line between the two printouts.
  print('First example:', first, end='\n\n')
  print('Normalized:', normalizer(first).numpy())

First example: [[   4.    90.    75.  2125.    14.5   74.     0.     0.     1. ]]

Normalized: [[-0.87 -1.01 -0.79 -1.03 -0.38 -0.52 -0.47 -0.5   0.78]]


# Linear regression model: the adapted normalization layer followed by a
# single-unit Dense layer (no activation argument is passed).
linear_model = tf.keras.Sequential()
linear_model.add(normalizer)
linear_model.add(tf.keras.layers.Dense(1))


linear_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 normalization (Normalizatio  (None, 9)                19        
 n)                                                              
                                                                 
 dense (Dense)               (None, 1)                 10        
                                                                 
=================================================================
Total params: 29
Trainable params: 10
Non-trainable params: 19
_________________________________________________________________


linear_model.predict(train_features[0:10])

array([[ 0.07 ],
       [-1.188],
       [ 1.359],
       [-1.132],
       [-1.939],
       [-0.365],
       [-1.955],
       [-1.506],
       [ 0.282],
       [-0.978]], dtype=float32)


linear_model.layers[1].kernel

<tf.Variable 'dense/kernel:0' shape=(9, 1) dtype=float32, numpy=
array([[ 0.529],
       [ 0.282],
       [ 0.235],
       [-0.565],
       [-0.241],
       [-0.528],
       [ 0.158],
       [-0.381],
       [-0.076]], dtype=float32)>


# Configure training: mean-absolute-error loss, Adam optimizer with learning rate 0.1.
linear_model.compile(
    loss='MAE',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),
)


%%time
# Train the linear model for 100 epochs; 20% of the training data is held out
# for validation and verbose=0 suppresses the per-epoch progress output.
history = linear_model.fit(
    train_features,
    train_labels,
    epochs=100,
    validation_split=0.2,
    verbose=0,
)

CPU times: user 5.86 s, sys: 211 ms, total: 6.07 s
Wall time: 9.79 s


# Tabulate the recorded per-epoch metrics and append the epoch number as a column.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()


def plot_loss(history):
  """Plot the training and validation loss curves of a training run.

  Args:
    history: Object whose `.history` dict holds 'loss' and 'val_loss'
      sequences, such as the value returned by `model.fit(...)` above.
  """
  curves = history.history
  # Each curve is labelled with its own key in the history dict.
  for series_name in ('loss', 'val_loss'):
    plt.plot(curves[series_name], label=series_name)
  plt.xlabel('Epoch')
  plt.ylabel('Error [MPG]')
  # Clamp the y-axis to the range [0, 10].
  plt.ylim([0, 10])
  plt.grid(True)
  plt.legend()


plot_loss(history)


# Collects each model's test-set loss, keyed by model name; later cells add
# more entries to this dict.
test_results = {
    'linear_model': linear_model.evaluate(test_features, test_labels),
}

3/3 [==============================] - 0s 5ms/step - loss: 2.4857


test_results

{'linear_model': 2.4857208728790283}


def build_and_compile_model(norm):
  """Build and compile a small fully connected regression network.

  Architecture: `norm` -> Dense(64, relu) -> Dense(64, relu) -> Dense(1).
  The model is compiled with mean-absolute-error loss and the Adam optimizer
  at a learning rate of 0.1.

  Args:
    norm: Layer placed first in the model (the notebook passes its adapted
      Normalization layer).

  Returns:
    The compiled `tf.keras.Sequential` model.
  """
  model = tf.keras.Sequential([
      norm,
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dense(1),
  ])
  # Use `learning_rate`: the `lr` alias is deprecated and is not accepted by
  # newer Keras optimizers. This also matches how the linear model is compiled.
  model.compile(loss='mean_absolute_error',
                optimizer=tf.keras.optimizers.Adam(learning_rate=0.1))
  return model


dnn_model =  build_and_compile_model(normalizer)


dnn_model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 normalization (Normalizatio  (None, 9)                19        
 n)                                                              
                                                                 
 dense_9 (Dense)             (None, 64)                640       
                                                                 
 dense_10 (Dense)            (None, 64)                4160      
                                                                 
 dense_11 (Dense)            (None, 1)                 65        
                                                                 
=================================================================
Total params: 4,884
Trainable params: 4,865
Non-trainable params: 19
_________________________________________________________________


%%time
# Train the DNN with the same settings as the linear model (100 epochs, 20%
# validation split, no progress output). This rebinds `history`, replacing the
# linear model's training record.
history = dnn_model.fit(
    train_features, train_labels,
    epochs=100,
    validation_split=0.2,
    verbose=0,
)

CPU times: user 4.64 s, sys: 270 ms, total: 4.91 s
Wall time: 4.47 s


plot_loss(history)


test_results['dnn_model'] = dnn_model.evaluate(test_features, test_labels, verbose=0)


pd.DataFrame(test_results, index=['Mean absolute error [MPG]']).T


# Predict MPG for the held-out test rows; flatten() turns the model output into a 1-D array.
test_predictions = dnn_model.predict(test_features).flatten()
print(test_predictions)
print(test_labels)
# Equal aspect ratio so the identity line drawn below appears at 45 degrees.
a = plt.axes(aspect='equal')
plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values [MPG]')
plt.ylabel('Predictions [MPG]')
# Same limits on both axes.
lims = [0, 50]
plt.xlim(lims)
plt.ylim(lims)
# Reference line y = x: points on it are predicted exactly.
_ = plt.plot(lims, lims)

[14.062 10.166 12.083 24.798 18.921 11.674 11.953 11.208 17.724 29.451
 21.948 24.643 14.053 23.433 12.268 13.853 13.769 12.425 16.579 12.374
 12.732 23.449 18.077 19.605 27.003 21.876 15.815 22.234 16.57  18.016
 25.576 21.014 17.73  18.78  24.131 15.192 18.166 26.368 27.383 17.463
 28.38  26.547 14.722 30.063 33.661 33.094 20.019 21.413 19.287 23.963
 29.144 17.514 29.953 17.589 16.912 17.173 30.451 33.077 21.776 23.944
 32.738 31.274 25.189 26.715 30.264 37.887 35.247 32.941 30.962 24.802
 21.94  21.684 27.274 27.723 33.604 34.721 37.073 27.774]
9      15.0
25     10.0
28      9.0
31     25.0
33     19.0
       ... 
369    34.0
375    36.0
382    34.0
384    32.0
396    28.0
Name: MPG, Length: 78, dtype: float64


# Distribution of per-example prediction errors on the held-out test set
# (predicted MPG minus true MPG). The per-epoch validation loss is a training
# diagnostic, not a prediction error, so it must not be plotted under this label.
error = test_predictions - test_labels
plt.hist(error, bins=25)
plt.xlabel('Prediction Error [MPG]')
_ = plt.ylabel('Count')


dnn_model.save('dnn_model')

INFO:tensorflow:Assets written to: dnn_model/assets


# Load the model back from the 'dnn_model' directory written by dnn_model.save() above.
reloaded = tf.keras.models.load_model('dnn_model')

# Evaluate the reloaded model on the same test data so its loss can be compared
# with the in-memory model's entry in test_results.
test_results['reloaded'] = reloaded.evaluate(
    test_features, test_labels, verbose=0)


pd.DataFrame(test_results, index=['Mean absolute error [MPG]']).T


# Synthetic inputs for a single-feature sweep: one row per training example,
# with every column set to that column's training-set median.
fake = np.outer(np.ones(train_features.shape[0]), train_features.median())
fake = pd.DataFrame(fake, columns = train_features.columns)
# Overwrite Displacement with evenly spaced values from 0 to 500 so that it is
# the only feature that varies across rows.
fake.Displacement = np.linspace(0, 500, train_features.shape[0])


def plot_displacement(x, y):
  """Overlay a prediction curve on the training scatter of MPG vs. Displacement.

  Reads the notebook-level `train_features` and `train_labels` for the scatter.

  Args:
    x: Displacement values for the prediction curve.
    y: Predicted MPG values corresponding to `x`.
  """
  observed_displacement = train_features['Displacement']
  plt.scatter(observed_displacement, train_labels, label='Data')
  plt.plot(x, y, label='Predictions', color='k')
  plt.ylabel('MPG')
  plt.xlabel('Displacement')
  plt.legend()


plot_displacement(fake.Displacement, linear_model(fake))


plot_displacement(fake.Displacement, dnn_model.predict(fake))


%%time
from tpot import TPOTRegressor

# AutoML pipeline search with TPOT: 10 generations with a population of 40,
# scored by negative mean absolute error (higher, i.e. closer to 0, is better).
tpot = TPOTRegressor(generations=10,
                     population_size=40,
                     scoring='neg_mean_absolute_error',
                     verbosity=2,
                     random_state=42)
tpot.fit(train_features, train_labels)
# score() applies the configured scorer, so the printed value is the *negative*
# MAE on the test set; negate it to compare with the Keras models' MAE.
print(f"TPOT score on test data: {tpot.score(test_features, test_labels):.2f}")
# Write the best pipeline found as a standalone Python script.
tpot.export('tpot_mpg_pipeline.py')

Generation 1 - Current best internal CV score: -2.0391493673159626

Generation 2 - Current best internal CV score: -2.0391493673159626

Generation 3 - Current best internal CV score: -2.0391493673159626

Generation 4 - Current best internal CV score: -2.0391493673159626

Generation 5 - Current best internal CV score: -1.9760831555452802

Generation 6 - Current best internal CV score: -1.9399572887864829

Generation 7 - Current best internal CV score: -1.9399572887864829

Generation 8 - Current best internal CV score: -1.9399572887864829

Generation 9 - Current best internal CV score: -1.9399572887864829

Generation 10 - Current best internal CV score: -1.9132874807987714

Best pipeline: ExtraTreesRegressor(LassoLarsCV(input_matrix, normalize=False), bootstrap=False, max_features=0.7000000000000001, min_samples_leaf=1, min_samples_split=6, n_estimators=100)
Tpop score on test data: -1.74
CPU times: user 9min 30s, sys: 1min 14s, total: 10min 44s
Wall time: 9min 33s

/usr/local/lib/python3.7/dist-packages/sklearn/base.py:451: UserWarning: X does not have valid feature names, but LassoLarsCV was fitted with feature names
  "X does not have valid feature names, but"


cat tpot_mpg_pipeline.py

import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=42)

# Average CV score on the training set was: -1.9132874807987714
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=False)),
    ExtraTreesRegressor(bootstrap=False, max_features=0.7000000000000001, min_samples_leaf=1, min_samples_split=6, n_estimators=100)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

	MPG	Cylinders	Displacement	Horsepower	Weight	Acceleration	Model Year	Origin
393	27.0	4	140.0	86.0	2790.0	15.6	82	1
394	44.0	4	97.0	52.0	2130.0	24.6	82	2
395	32.0	4	135.0	84.0	2295.0	11.6	82	1
396	28.0	4	120.0	79.0	2625.0	18.6	82	1
397	31.0	4	119.0	82.0	2720.0	19.4	82	1

	MPG	Cylinders	Displacement	Horsepower	Weight	Acceleration	Model Year	Europe	USA
393	27.0	4	140.0	86.0	2790.0	15.6	82	0	1
394	44.0	4	97.0	52.0	2130.0	24.6	82	1	0
395	32.0	4	135.0	84.0	2295.0	11.6	82	0	1
396	28.0	4	120.0	79.0	2625.0	18.6	82	0	1
397	31.0	4	119.0	82.0	2720.0	19.4	82	0	1

	count	mean	std	min	25%	50%	75%	max
MPG	314.0	23.310510	7.728652	10.0	17.00	22.0	28.95	46.6
Cylinders	314.0	5.477707	1.699788	3.0	4.00	4.0	8.00	8.0
Displacement	314.0	195.318471	104.331589	68.0	105.50	151.0	265.75	455.0
Horsepower	314.0	104.869427	38.096214	46.0	76.25	94.5	128.00	225.0
Weight	314.0	2990.251592	843.898596	1649.0	2256.50	2822.5	3608.00	5140.0
Acceleration	314.0	15.559236	2.789230	8.0	13.80	15.5	17.20	24.8
Model Year	314.0	75.898089	3.675642	70.0	73.00	76.0	79.00	82.0
Europe	314.0	0.178344	0.383413	0.0	0.00	0.0	0.00	1.0
Japan	314.0	0.197452	0.398712	0.0	0.00	0.0	0.00	1.0
USA	314.0	0.624204	0.485101	0.0	0.00	1.0	1.00	1.0

Fuel efficiency prediction

Learning Objectives

Task 1. Data: Auto MPG dataset

Task 2. Normalization Layer

Task 3. Linear regression

Task 4. Regression with a deep neural network (DNN)

Task 5. Make predictions

Task 6. Nonlinearity

Task 7. AutoML - TPOT

Additional Resources

Acknowledgement and Copyright

Acknowledgement

Copyright 2018 The TensorFlow Authors.

	loss	val_loss	epoch
95	2.511822	2.480037	95
96	2.466233	2.531178	96
97	2.475829	2.464532	97
98	2.468823	2.492199	98
99	2.481020	2.459210	99

	Mean absolute error [MPG]
linear_model	2.485721
dnn_model	2.252128

	Mean absolute error [MPG]
linear_model	2.485721
dnn_model	2.252128
reloaded	2.252128