import pandas as pd
import numpy as np
from util import *
import warnings
warnings.filterwarnings("ignore")


import os

datafolder = 'GPE7'
file_path = 'Dat/'+datafolder
files = os.listdir(file_path)
#files.remove('readme')


files

['.DS_Store', 'C1538.xls', 'C1532.xls', 'C1526.xls', 'C1535.xls', 'C1536.xls']


#get data from dataframe for interested columns
def get_df(path='ProcessedData/'+datafolder+'/',fileName = 'OCV-charge-final.csv', interestCol='Voltage(V)'):
    
    
    df = read_file(path+fileName)
    if type(interestCol) == list:
        new = df[interestCol]
        
    else:
        new = df[[interestCol]]
    return new

#convert data from columns to rows, reset index with cellID
def convertData(df, index_col = 'CellID'):
    ndf = df.unstack().to_frame().T
    ndf.columns = ndf.columns.map('{0[0]}_{0[1]}'.format)
    ndf['CellID'] = index_col
    ndf.set_index(['CellID'],inplace=True)
    
    return ndf


path='ProcessedData/'+datafolder+'/'
# original columns name
col =['Cycle ID', 'Step Name', 'Voltage(V)',
       'Current(mA)', 'Capacity(mAh)', 'Time(h:min:s.ms)',
       'Realtime', 'Time', 'RTime']
col2 =['Cycle ID', 'Step Name', 'Voltage(V)-d',
       'Current(mA)', 'Capacity(mAh)', 'Time(h:min:s.ms)',
       'Realtime', 'Time', 'RTime']
cap_col=['Cycle ID', 'Cap_Chg(mAh)',
       'Cap_DChg(mAh)', 'Chg/DChg Efficiency(%)']

#set the number correctly depending the ending condition
cyc_dis = 74 # define how many cycles in the datasheet, length of data
cyc_charging = 75 #define voltage data
#rename columns' name for all data frame
all_list = ['OCV_charge', 'OCV_discharge', 'Cap_new','Vol_new']

OCV_charge = ['OCV_charge'+str(c) for c in range(1,cyc_charging+1)]

OCV_discharge = ['OCV_discharge'+str(c) for c in range(1,cyc_dis+1)]

drop_cycid = ['Cycle ID'+str(c) for c in range(1,cyc_charging+1) ]
Cap_new = []
Vol_new = []
voltage_max = []
voltage_min = []
cyc = []

for i, j in zip(col, cap_col):
    for l in range(1,cyc_charging+1):

        Vol_new.append(i+str(l))
        Cap_new.append(j+str(l))
        
for l in range(1,cyc_charging+1):
   
    voltage_max.append('Voltage_max'+str(l))
    
    voltage_min.append('Voltage_min'+str(l))
    cyc.append('Cycle ID'+str(l))


print(len(cyc), len(Vol_new))

75 300


#c = df_final.columns.to_list()
#c[0].startswith('Cyc')
Col_cap = [c+"C1526" for c in cap_col]
cell_cap = get_df(path, 'Cap_final.csv', interestCol=Col_cap)
cell_cap.head()


cell_cap.tail()


cell_cap['Cap_retention'] = cell_cap['Cap_DChg(mAh)C1526']/cell_cap.iloc[0]['Cap_Chg(mAh)C1526']


cell_cap.tail()


p = convertData(cell_cap,index_col= "C1526")


p.head()


p.shape

(1, 375)


import seaborn as sns
sns.pairplot(cell_cap[cell_cap.columns], diag_kind='kde')

<seaborn.axisgrid.PairGrid at 0x7f8bc95d9d30>


#!pip install tensorflow


#!pip install keras


from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
#from keras.wrappers.scikit_learn import KerasClassifier
from keras.wrappers.scikit_learn import KerasRegressor
def create_model(learning_rate=0.01):
    linear_model = tf.keras.Sequential([normalizer, tf.keras.layers.Dense(1)])
    linear_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss='MAE',metrics=['accuracy'])
    
    return linear_model

clf = KerasRegressor(build_fn=create_model, epochs=100, verbose=0, validation_split = 0.2)

def tf_train(X, y, clf,standardize = True) -> None:
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=0)
    print(X_train.shape, y_train.shape)

    if standardize:
        #pipeline = Pipeline([('clf',clf)])
        pipe = Pipeline([('scaler',StandardScaler()), ('classifier',clf)])
        print("Using Normalized data")
    else:
        pipe = Pipeline(['classifier',clf])
    

    history = pipe.fit(X_train, y_train)
    
    y_pred = pipe.predict(X_test)
    
    print("Mean squared error ",mean_squared_error(y_test, y_pred))
    print("r2 score ",r2_score(y_test, y_pred))
    return y_pred


# opening the file in read mode
my_file = open(path+datafolder+"cells.txt", "r")
  
# reading the file
data = my_file.read()
  
# replacing end of line('/n') with ' ' and
# splitting the text it further when '.' is seen.
cells_into_list = data.split("\n")
cells_into_list.pop()
ocv_1 = pd.DataFrame()
ocv_2 = pd.DataFrame()
Cap = pd.DataFrame()
df_max = pd.DataFrame()
df_min = pd.DataFrame()
for cell in cells_into_list:
    print("processing cells: ", cell)
    Col = 'Voltage(V)' + cell
    Col_cap = [c+cell for c in cap_col]
    
    df = get_df(path, 'OCV-charge-final.csv', interestCol=Col)
    ndf = convertData(df,index_col=cell)
    #change column name according to cycle numbers
    print("Coverting charging OCV...")
    ndf.columns=OCV_charge
    
    ocv_1 = pd.concat([ocv_1,ndf], axis=0)
    
    df2 = get_df(path, 'OCV-discharge-final.csv', interestCol=Col)
    print("Coverting discharging OCV ...")
    ndf2 = convertData(df2,index_col=cell)
    ndf2.columns=OCV_discharge
    ocv_2 = pd.concat([ocv_2,ndf2], axis=0)
    
    cell_cap = get_df(path, 'Cap_final.csv', interestCol=Col_cap)
    cell_cap['Cap_retention'] = cell_cap['Cap_DChg(mAh)'+cell]/cell_cap.iloc[0]['Cap_Chg(mAh)'+cell]
    p = convertData(cell_cap,index_col= cell)
    p.columns = Cap_new
    
    Cap = pd.concat([Cap,p], axis=0)
    
    #find out the max and minimum potential during charging discharging
    vol = pd.read_csv(path+ 'Vol_final.csv')
    max_v = pd.DataFrame(vol.groupby('Cycle ID'+cell)[Col].max())
    min_v = pd.DataFrame(vol.groupby('Cycle ID'+cell)[Col].min())
    print("max_v, min_v shape",max_v.shape, min_v.shape)
    #convert data to one row
    print("Coverting max charging OCV...")
    df_max_v = convertData(max_v,index_col=cell)
    df_max_v.columns=voltage_max[:df_max_v.shape[1]] #different length cycling
    
    print("Coverting min discharging OCV...")
    df_min_v = convertData(min_v,index_col=cell)
    df_min_v.columns=voltage_min[:df_min_v.shape[1]]
    
    df_max = pd.concat([df_max,df_max_v],axis=0)
    df_min = pd.concat([df_min,df_min_v],axis=0)

#concat all dataframe into one    
df_final = pd.concat([ocv_1,
ocv_2,
Cap,
df_max,
df_min], axis = 1)    
print("All data processing is done!")


# This cell converts data with individual cell name
re = [] # retention
cy = [] # cycle ID
vo = [] # charge voltage
vo_d = [] # discharge voltage
# opening the file in read mode
my_file = open(path+datafolder+"cells.txt", "r")
  
# reading the file
data = my_file.read()
  
# replacing end of line('/n') with ' ' and
# splitting the text it further when '.' is seen.
cells_into_list = data.split("\n")
cells_into_list.pop()
ocv_1 = pd.DataFrame()
ocv_2 = pd.DataFrame()
Cap = pd.DataFrame()
df_max = pd.DataFrame()
df_min = pd.DataFrame()
for cell in cells_into_list:
    cy.append('Cycle ID'+cell)
    vo.append('Voltage(V)'+cell)
    vo_d.append('Voltage(V)-d'+cell)
    print("processing cells: ", cell)
    Col = 'Voltage(V)' + cell
    Col2 = 'Voltage(V)-d' + cell
    Col_cap = [c+cell for c in cap_col]
    
    df = get_df(path, 'OCV-charge-final.csv', interestCol=Col)
    print("Coverting charging OCV...")
    
    ocv_1 = pd.concat([ocv_1,df], axis=1)
    ocv_1 = ocv_1.loc[~ocv_1.index.duplicated(keep='first')]
    
    df2 = get_df(path, 'OCV-discharge-final.csv', interestCol=Col2)
    print("Coverting discharging OCV ...")
    

    ocv_2 = pd.concat([ocv_2,df2], axis=1)
    ocv_2 = ocv_2.loc[~ocv_2.index.duplicated(keep='first')]
    
    cell_cap = get_df(path, 'Cap_final.csv', interestCol=Col_cap)
    cell_cap['Cap_retention'+cell] = cell_cap['Cap_DChg(mAh)'+cell]/cell_cap.iloc[0]['Cap_Chg(mAh)'+cell]
    re.append('Cap_retention'+cell)
    Cap = pd.concat([Cap,cell_cap], axis=1)
    Cap = Cap.loc[~Cap.index.duplicated(keep='first')]


    #find out the max and minimum potential during charging discharging
    vol = pd.read_csv(path+ 'Vol_final.csv')
    max_v = pd.DataFrame(vol.groupby('Cycle ID'+cell)[Col].max())
    min_v = pd.DataFrame(vol.groupby('Cycle ID'+cell)[Col].min())
    print("max_v, min_v shape",max_v.shape, min_v.shape)
    #convert data to one row
    print("Coverting max charging OCV...")
    df_max = pd.concat([df_max,max_v],axis=1)
    df_max = df_max.loc[~df_max.index.duplicated(keep='first')]


    print("Coverting min discharging OCV...")
    
    df_min = pd.concat([df_min,min_v],axis=1)
    df_min = df_min.loc[~df_min.index.duplicated(keep='first')]
    


#concat all dataframe into one    
df_final = pd.concat([ocv_1,
ocv_2,
Cap,
df_max,
df_min], axis = 1)    
print("All data processing is done!")

processing cells:  C1538
Coverting charging OCV...
Coverting discharging OCV ...
max_v, min_v shape (75, 1) (75, 1)
Coverting max charging OCV...
Coverting min discharging OCV...
processing cells:  C1532
Coverting charging OCV...
Coverting discharging OCV ...
max_v, min_v shape (75, 1) (75, 1)
Coverting max charging OCV...
Coverting min discharging OCV...
processing cells:  C1526
Coverting charging OCV...
Coverting discharging OCV ...
max_v, min_v shape (74, 1) (74, 1)
Coverting max charging OCV...
Coverting min discharging OCV...
processing cells:  C1535
Coverting charging OCV...
Coverting discharging OCV ...
max_v, min_v shape (75, 1) (75, 1)
Coverting max charging OCV...
Coverting min discharging OCV...
processing cells:  C1536
Coverting charging OCV...
Coverting discharging OCV ...
max_v, min_v shape (74, 1) (74, 1)
Coverting max charging OCV...
Coverting min discharging OCV...
All data processing is done!


# this cell convert all cells data in one .
re = [] # retention
cy = [] # cycle ID
vo = [] # charge voltage
vo_d = [] # discharge voltage
# opening the file in read mode
my_file = open(path+datafolder+"cells.txt", "r")
  
# reading the file
data = my_file.read()
  
# replacing end of line('/n') with ' ' and
# splitting the text it further when '.' is seen.
cells_into_list = data.split("\n")
cells_into_list.pop()
ocv_1 = pd.DataFrame()
ocv_2 = pd.DataFrame()
Cap = pd.DataFrame()
df_max = pd.DataFrame()
df_min = pd.DataFrame()
for cell in cells_into_list:
    cy.append('Cycle ID'+cell)
    vo.append('Voltage(V)'+cell)
    vo_d.append('Voltage(V)-d'+cell)
    print("processing cells: ", cell)
    Col = 'Voltage(V)' + cell
    Col2 = 'Voltage(V)-d' + cell
    Col_cap = [c+cell for c in cap_col]
    
    df = get_df(path, 'OCV-charge-final.csv', interestCol=Col)
    print("Coverting charging OCV...")
    
    df.columns = ['Voltage(V)']
    ocv_1 = pd.concat([ocv_1,df], axis=0,ignore_index=True)
    ocv_1 = ocv_1.loc[~ocv_1.index.duplicated(keep='first')]
    
    df2 = get_df(path, 'OCV-discharge-final.csv', interestCol=Col2)
    print("Coverting discharging OCV ...")
    
    df2.columns = ['Voltage(V)-d']
    ocv_2 = pd.concat([ocv_2,df2], axis=0,ignore_index=True)
    ocv_2 = ocv_2.loc[~ocv_2.index.duplicated(keep='first')]
    
    cell_cap = get_df(path, 'Cap_final.csv', interestCol=Col_cap)
    cell_cap['Cap_retention'+cell] = cell_cap['Cap_DChg(mAh)'+cell]/cell_cap.iloc[0]['Cap_Chg(mAh)'+cell]
    re.append('Cap_retention'+cell)
    cell_cap.columns = ['Cycle ID', 'Cap_Chg(mAh)', 'Cap_DChg(mAh)',
       'Chg/DChg Efficiency(%)', 'Cap_retention']
    Cap = pd.concat([Cap,cell_cap], axis=0,ignore_index=True)
    Cap = Cap.loc[~Cap.index.duplicated(keep='first')]


    #find out the max and minimum potential during charging discharging
    vol = pd.read_csv(path+ 'Vol_final.csv')
    max_v = pd.DataFrame(vol.groupby('Cycle ID'+cell)[Col].max())
    min_v = pd.DataFrame(vol.groupby('Cycle ID'+cell)[Col].min())
    print("max_v, min_v shape",max_v.shape, min_v.shape)
    #convert data to one row
    print("Coverting max charging OCV...")
    max_v.columns = ['Voltage(V)-max']
    df_max = pd.concat([df_max,max_v],axis=0,ignore_index=True)
    df_max = df_max.loc[~df_max.index.duplicated(keep='first')]


    print("Coverting min discharging OCV...")
    min_v.columns = ['Voltage(V)-min']
    df_min = pd.concat([df_min,min_v],axis=0,ignore_index=True)
    df_min = df_min.loc[~df_min.index.duplicated(keep='first')]
    


#concat all dataframe into one    
df_final_one = pd.concat([ocv_1,
ocv_2,
Cap,
df_max,
df_min], axis = 1)    
print("All data processing is done!")

processing cells:  C1538
Coverting charging OCV...
Coverting discharging OCV ...
max_v, min_v shape (75, 1) (75, 1)
Coverting max charging OCV...
Coverting min discharging OCV...
processing cells:  C1532
Coverting charging OCV...
Coverting discharging OCV ...
max_v, min_v shape (75, 1) (75, 1)
Coverting max charging OCV...
Coverting min discharging OCV...
processing cells:  C1526
Coverting charging OCV...
Coverting discharging OCV ...
max_v, min_v shape (74, 1) (74, 1)
Coverting max charging OCV...
Coverting min discharging OCV...
processing cells:  C1535
Coverting charging OCV...
Coverting discharging OCV ...
max_v, min_v shape (75, 1) (75, 1)
Coverting max charging OCV...
Coverting min discharging OCV...
processing cells:  C1536
Coverting charging OCV...
Coverting discharging OCV ...
max_v, min_v shape (74, 1) (74, 1)
Coverting max charging OCV...
Coverting min discharging OCV...
All data processing is done!


df_final_one.columns

Index(['Voltage(V)', 'Voltage(V)-d', 'Cycle ID', 'Cap_Chg(mAh)',
       'Cap_DChg(mAh)', 'Chg/DChg Efficiency(%)', 'Cap_retention',
       'Voltage(V)-max', 'Voltage(V)-min'],
      dtype='object')


df_min_v.head()


sns.pairplot(df_final_one,hue='Cycle ID')

<seaborn.axisgrid.PairGrid at 0x7f8ba1a23f10>


df_final_one.tail()


df_final_one.shape

(76, 9)


df_final_one.describe().T


from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score


def train(X, y, standardize = True) -> None:
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=0)


    if standardize:
        
        pipe = make_pipeline(StandardScaler(), linear_model.LinearRegression())
        print("Using Normalized data")
    else:
        pipe = make_pipeline(linear_model.LinearRegression())
    
    pipeline = pipe.fit(X_train,y_train)
    y_pred = pipeline.predict(X_test)
    
    print("Mean squared error ",mean_squared_error(y_test, y_pred))
    print("r2 score ",r2_score(y_test, y_pred))
    return y_pred


#This training session has efficiency, voltage as X, retention is y. 
# it should get a linear fitting well data since retention is linearly correlated to charge and dischrge capacity
df_final.dropna(inplace=True)
df_final.drop(cy, axis=1, inplace=True)
X = df_final.drop(re, axis=1)
y = df_final[re]
print(X.shape, y.shape)
train(X, y, standardize = True)

(72, 35) (72, 5)
Using Normalized data
Mean squared error  1.7420678323630677e-32
r2 score  1.0

array([[0.93323239, 0.92948773, 0.9423445 , 0.93894286, 0.94440159],
       [0.93506085, 0.93577326, 0.9483729 , 0.9434    , 0.94780149],
       [0.92591852, 0.91531671, 0.93185909, 0.9324    , 0.93734465],
       [0.93297526, 0.92503071, 0.94048741, 0.9358    , 0.94097312],
       [0.93140392, 0.92477358, 0.94128739, 0.9332    , 0.94125882],
       [0.93826067, 0.93420188, 0.92717351, 0.934     , 0.93840176],
       [0.93297526, 0.93837319, 0.93420188, 0.93711429, 0.94048741],
       [0.92694703, 0.9145453 , 0.91877375, 0.929     , 0.93211623],
       [0.932461  , 0.92740207, 0.94625868, 0.93868571, 0.94154453],
       [0.91491915, 0.90514557, 0.91851661, 0.91642857, 0.92820205],
       [0.92277584, 0.91900231, 0.92374504, 0.92717143, 0.930002  ],
       [0.95137421, 0.95831548, 0.96068684, 0.95542857, 0.96065827],
       [0.93534655, 0.93394474, 0.95228708, 0.94185714, 0.94940145],
       [0.93037541, 0.92897346, 0.9480872 , 0.94002857, 0.94337305],
       [0.93166105, 0.92528785, 0.94468729, 0.93502857, 0.9420588 ]])


X.columns

Index(['Voltage(V)C1538', 'Voltage(V)C1532', 'Voltage(V)C1526',
       'Voltage(V)C1535', 'Voltage(V)C1536', 'Voltage(V)-dC1538',
       'Voltage(V)-dC1532', 'Voltage(V)-dC1526', 'Voltage(V)-dC1535',
       'Voltage(V)-dC1536', 'Cap_Chg(mAh)C1538', 'Cap_DChg(mAh)C1538',
       'Chg/DChg Efficiency(%)C1538', 'Cap_Chg(mAh)C1532',
       'Cap_DChg(mAh)C1532', 'Chg/DChg Efficiency(%)C1532',
       'Cap_Chg(mAh)C1526', 'Cap_DChg(mAh)C1526',
       'Chg/DChg Efficiency(%)C1526', 'Cap_Chg(mAh)C1535',
       'Cap_DChg(mAh)C1535', 'Chg/DChg Efficiency(%)C1535',
       'Cap_Chg(mAh)C1536', 'Cap_DChg(mAh)C1536',
       'Chg/DChg Efficiency(%)C1536', 'Voltage(V)C1538', 'Voltage(V)C1532',
       'Voltage(V)C1526', 'Voltage(V)C1535', 'Voltage(V)C1536',
       'Voltage(V)C1538', 'Voltage(V)C1532', 'Voltage(V)C1526',
       'Voltage(V)C1535', 'Voltage(V)C1536'],
      dtype='object')


df_final.dropna(inplace=True)
X_new = df_final[vo+vo_d]
y_new = df_final[re]
print(X_new.shape, y_new.shape)
train(X_new, y_new, standardize = True)


X_new.columns

Index(['Voltage(V)C1538', 'Voltage(V)C1538', 'Voltage(V)C1538',
       'Voltage(V)C1532', 'Voltage(V)C1532', 'Voltage(V)C1532',
       'Voltage(V)C1526', 'Voltage(V)C1526', 'Voltage(V)C1526',
       'Voltage(V)C1535', 'Voltage(V)C1535', 'Voltage(V)C1535',
       'Voltage(V)C1536', 'Voltage(V)C1536', 'Voltage(V)C1536',
       'Voltage(V)-dC1538', 'Voltage(V)-dC1532', 'Voltage(V)-dC1526',
       'Voltage(V)-dC1535', 'Voltage(V)-dC1536'],
      dtype='object')


# scikit learn training
train(X_new, y_new, standardize = True)

Using Normalized data
Mean squared error  3.8300820045558005e-06
r2 score  0.9551582157842831

array([[0.93125659, 0.92755233, 0.94175398, 0.93804557, 0.94341722],
       [0.93404823, 0.93393317, 0.94771303, 0.94295516, 0.94690992],
       [0.92461709, 0.91337739, 0.93138537, 0.93256632, 0.93760171],
       [0.93086458, 0.92330521, 0.93844584, 0.93377238, 0.93910455],
       [0.92858767, 0.92096183, 0.93992557, 0.93078092, 0.93957201],
       [0.93742076, 0.93306938, 0.92689531, 0.93518941, 0.93941512],
       [0.93210758, 0.93768708, 0.93314921, 0.9371115 , 0.94051248],
       [0.92612363, 0.91478939, 0.91692426, 0.92823036, 0.93084432],
       [0.93148588, 0.92601264, 0.94511104, 0.93721136, 0.94018626],
       [0.92095913, 0.91128887, 0.9249266 , 0.92136828, 0.93298572],
       [0.92357166, 0.91912178, 0.92409218, 0.92838652, 0.93061378],
       [0.94963455, 0.95589937, 0.95943277, 0.95436735, 0.95955672],
       [0.93331531, 0.93183405, 0.9514257 , 0.94126375, 0.94880044],
       [0.93035938, 0.92847614, 0.94831513, 0.93989741, 0.94314697],
       [0.93190155, 0.92419011, 0.94556611, 0.93616341, 0.94277647]])


df_final_one.dropna(inplace=True)
X_new_one = df_final_one[['Voltage(V)', 'Voltage(V)-d','Voltage(V)-max', 'Voltage(V)-min']]
y_new_one = df_final_one[['Cap_retention']]
print(X_new_one.shape, y_new_one.shape)
train(X_new_one, y_new_one, standardize = True)


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


dataset = cell_cap.dropna()
dataset.drop(['Cycle IDC1526'], axis=1, inplace=True)
train_data = dataset.sample(frac = 0.8,random_state=0)
test_data = dataset.drop(train_data.index)

train_data.describe()


dataset.head()


# train_features = train_data.drop(['Cap_retention'], axis=1)
# test_features = test_data.drop(['Cap_retention'], axis=1)
# train_labels = train_data['Cap_retention']
# test_labels = test_data['Cap_retention']
train_features, test_features, train_labels, test_labels = train_test_split(X_new_one,y_new_one,test_size=0.2, random_state=0)
print(train_features.shape, train_labels.shape)

(292, 4) (292, 1)


normalizer = tf.keras.layers.Normalization()
normalizer.adapt(train_features)
print(f'feature mean:{normalizer.mean.numpy().squeeze()}\n')
print(f'feature variance:{normalizer.variance.numpy().squeeze()}\n')

feature mean:[4.2846317 3.5689516 4.2978477 2.9996626]

feature variance:[2.1634660e-05 3.4198158e-05 1.4185647e-03 1.2927492e-03]


linear_model = tf.keras.Sequential([normalizer, tf.keras.layers.Dense(1)])


linear_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), loss='MAE')


%%time
history = linear_model.fit(train_features, train_labels, epochs=100, verbose=0, validation_split = 0.2)

CPU times: user 2.6 s, sys: 308 ms, total: 2.9 s
Wall time: 2.52 s


hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch

def plot_loss(history):
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.xlabel('Epoch')
    plt.ylabel('Cap. retension')
    plt.legend()
    plt.grid(True)
    plt.title('Loss of training and validation datasets')
plot_loss(history)


test_results = {}

test_results['linear_model'] = linear_model.evaluate(test_features, test_labels)

3/3 [==============================] - 0s 2ms/step - loss: 0.0204


test_results

{'linear_model': 0.020403238013386726}


test_labels[0:1]


linear_model.predict(test_features[0:1])

1/1 [==============================] - 0s 16ms/step

array([[0.93518853]], dtype=float32)


test_predictions = linear_model.predict(test_features).flatten()
print(test_predictions)
print(test_labels)
def plot_err(test_labels,test_predictions):
    a = plt.axes(aspect='equal')
    plt.scatter(test_labels, test_predictions)
    plt.xlabel('True Values [Cap. retention]')
    plt.ylabel('Predictions [Cap. retention]')
    lims = [0.9, 0.98]
    plt.xlim(lims)
    plt.ylim(lims)
    _ = plt.plot(lims, lims)
plot_err(test_labels,test_predictions)

3/3 [==============================] - 0s 1ms/step
[0.93518853 0.93345743 0.9353479  0.93571234 0.9351162  0.93468064
 0.93693304 0.93536174 0.934299   0.93730175 0.93426895 0.93166244
 0.9334172  0.93384963 0.9368563  0.9341507  0.9335197  0.9359204
 0.65846604 0.9320125  0.9327207  0.9351929  0.93552536 0.9333121
 0.9356105  0.935041   0.93401253 0.9361842  0.9336246  0.93282896
 0.9334561  0.9363537  0.9371927  0.9362437  0.93622863 0.93701404
 0.9342227  0.9337846  0.9334619  0.9350292  0.93433595 0.935823
 0.932614   0.9347445  0.93655306 0.93665797 0.9319494  0.93328834
 0.9344626  0.9349158  0.9328428  0.9346716  0.9344144  0.93562526
 0.93625736 0.9357346  0.9353207  0.9342359  0.9356215  0.9375454
 0.93559766 0.9330607  0.9377341  0.93532354 0.9356164  0.9343173
 0.9359117  0.9336048  0.9326871  0.93572    0.6722496  0.9344815
 0.93457407]
     Cap_retention
106       0.924774
262       0.935543
45        0.928261
26        0.943460
78        0.947573
..             ...
220       0.945087
160       0.946002
74        0.267013
171       0.952287
114       0.926345

[73 rows x 1 columns]


error = history.history['val_loss']
plt.hist(error, bins=25)
plt.xlabel('Prediction Error [Cap. retention]')
_ = plt.ylabel('Count')


tf_train(X_new_one, y_new_one, clf,standardize = True)

(292, 4) (292, 1)
Using Normalized data
Mean squared error  21.666438846199082
r2 score  -1600.7242456233232

array([  1.5893304,   1.4809682,   1.3511813,   1.2703431,   1.332761 ,
         1.458071 ,   1.6359656,   1.3496668,   1.4166682,   1.3061583,
         1.6300495,   1.6927972,   1.5648146,   1.6502798,   1.308687 ,
         1.6440763,   1.5860374,   1.4708145, -28.263922 ,   1.6831822,
         1.6044693,   1.5850809,   1.4712341,   1.509927 ,   1.2379196,
         1.6798527,   1.3174121,   1.5509632,   1.44641  ,   1.4641373,
         1.7189019,   1.6157234,   1.3663833,   1.39131  ,   1.5147235,
         1.2598279,   1.3107064,   1.6347802,   1.4998543,   1.4125507,
         1.638058 ,   1.2320821,   1.5471752,   1.6333497,   1.337158 ,
         1.4043167,   1.558923 ,   1.6211083,   1.4569037,   1.4251707,
         1.4741914,   1.5238521,   1.3678386,   1.6179335,   1.5795295,
         1.4740584,   1.3920453,   1.598145 ,   1.4523752,   1.613189 ,
         1.4861934,   1.5203724,   1.3390105,   1.4786355,   1.485678 ,
         1.4077947,   1.4757016,   1.692817 ,   1.5813243,   1.5771105,
       -27.098623 ,   1.5492551,   1.5964956], dtype=float32)


%%time
from tpot import TPOTRegressor
tpot = TPOTRegressor(generations=4, # set to 4 to run faster for testing
                     population_size=40,
                     
                     verbosity=2,
                     random_state=42)
tpot.fit(train_features, train_labels)

Generation 1 - Current best internal CV score: -5.2456329057359486e-05

Generation 2 - Current best internal CV score: -5.2456329057359486e-05

Generation 3 - Current best internal CV score: -5.2029186352093085e-05

Generation 4 - Current best internal CV score: -5.089168529401108e-05

Best pipeline: RandomForestRegressor(LassoLarsCV(input_matrix, normalize=False), bootstrap=True, max_features=0.7500000000000001, min_samples_leaf=11, min_samples_split=9, n_estimators=100)
CPU times: user 2min 39s, sys: 6.54 s, total: 2min 46s
Wall time: 1min 47s

TPOTRegressor(generations=4, population_size=40, random_state=42, verbosity=2)


from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoLarsCV
import joblib
from pathlib import Path
#BASE_DIR = Path(__file__).resolve(strict=True).parent
BASE_DIR = '/Users/michael/Documents/Projects/NewareDataProcessing'
# Instantiate and fit the model
rfc = RandomForestRegressor(bootstrap=True, max_features=0.7500000000000001, min_samples_leaf=11, min_samples_split=9, n_estimators=100)
model = rfc.fit(train_features, train_labels)

# Use the model to predict diabetes type
rfc_predictions = model.predict(test_features)

joblib.dump(model, Path(BASE_DIR).joinpath(f"{'RFR'}.joblib"))

['/Users/michael/Documents/Projects/NewareDataProcessing/RFR.joblib']


# quick test with other cells data
six = pd.read_csv("ProcessedData/OneHundred/Final.csv")
six.dropna(inplace=True)
X_new_ot = six[['Voltage(V)', 'Voltage(V)-d','Voltage(V)-max', 'Voltage(V)-min']]
y_new_ot= six[['Cap_retention']]
y_new_ot.replace(to_replace = 0, value = 1, inplace=True)

y_pred = model.predict(X_new_ot)

mse = mean_squared_error(y_new_ot, y_pred)
score = model.score(X_new_ot, y_new_ot)
print("R-squared:", score)
print("MSE: ", mse)
print("RMSE: ", mse*(1/2.0)) 
x_ax = range(len(y_new_ot))
plt.plot(x_ax, y_new_ot, linewidth=1, label="original")
plt.plot(x_ax, y_pred, linewidth=1.1, label="predicted")
plt.title("y-test and y-predicted data")
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.legend(loc='best',fancybox=True, shadow=True)
plt.grid(True)
plt.show()

R-squared: -2.1794558040090903
MSE:  0.37744250079647074
RMSE:  0.18872125039823537


six.head(6)


from sklearn.metrics import mean_squared_error

mse = mean_squared_error(test_labels, rfc_predictions)
score = model.score(train_features, train_labels)
print("R-squared:", score)
print("MSE: ", mse)
print("RMSE: ", mse*(1/2.0))

R-squared: 0.655229440029654
MSE:  0.013747138922283401
RMSE:  0.006873569461141701


x_ax = range(len(test_labels))
plt.plot(x_ax, test_labels, linewidth=1, label="original")
plt.plot(x_ax, rfc_predictions, linewidth=1.1, label="predicted")
plt.title("y-test and y-predicted data")
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.legend(loc='best',fancybox=True, shadow=True)
plt.grid(True)
plt.show()


import shap

explainer = shap.TreeExplainer(model)
shap_val = explainer.shap_values(train_features)
# #shap.summary_plot(shap_val, XN.values,plot_type='bar')
#shap.plots.beeswarm(shap_val, max_display=20)
shap.summary_plot(shap_val, train_features)


from sklearn.tree import export_graphviz
estimator = model.estimators_[5]
# Export as dot file
export_graphviz(estimator, out_file='tree.dot', 
                feature_names = train_features.columns,
                class_names = train_labels.columns,
                rounded = True, proportion = False, 
                precision = 2, filled = True)


# Convert to png using system command (requires Graphviz)
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree.png')


c = df_final.columns.to_list()
print(c)


df_final[['Cycle ID1', 'Cap_Chg(mAh)1', 'Cap_DChg(mAh)1', 'Chg/DChg Efficiency(%)1']]


#df_final.reset_index(inplace=True)
#df_final.rename(columns={'CellID':'CellID'})


#x = df_final.pivot(columns='CellID')


#x.shape


df_train = df_final.copy()
df_train.drop(columns=drop_cycid,inplace=True)
df_train.fillna(0, inplace=True)


df_train.head()


cor = df_train.corr()
keep_columns = np.full(cor.shape[0], True)
for i in range(cor.shape[0] - 1):
    for j in range(i + 1, cor.shape[0] - 1):
        if (np.abs(cor.iloc[i, j]) >= 0.9): # 0.8 is the correlation threshold
            keep_columns[j] = False
selected_columns = df_train.columns[keep_columns]
X_train_reduced = df_train[selected_columns]


X_train_reduced.shape


X_train_reduced.columns


X_train_reduced.tail()


#clustering the data
from sklearn.preprocessing import StandardScaler

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

import os
import warnings

warnings.filterwarnings('ignore')
from matplotlib import style
style.use('dark_background')


from sklearn.cluster import KMeans
X = X_train_reduced
clusters = []

for i in range(1, 5):
    km =  KMeans(n_clusters=i).fit(X) 
    clusters.append(km.inertia_)
    
fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(x=list(range(1, 5)), y=clusters, ax=ax)
ax.set_title('Searching for Elbow')
ax.set_xlabel('Clusters')
ax.set_ylabel('Inertia')


km3 = KMeans(n_clusters = 3).fit(X) 

X['Labels'] = km3.labels_
plt.figure(figsize=(12, 8))
sns.scatterplot(X['OCV_charge1'], X['Cap_Chg(mAh)3'], hue=X['Labels'], 
                palette=sns.color_palette('hls', 3))
plt.title('KMeans with 3 Clusters')
plt.show()


X.head()


t = X.copy()
t.reset_index(inplace=True)

t.rename(columns={'CellID':'CellID'})


plt.figure(figsize=(12, 8))
sns.scatterplot( t['OCV_charge1'],t['Cap_Chg(mAh)3'], hue=t['Labels'], 
                palette=sns.color_palette('hls', 3))
plt.title('KMeans with 3 Clusters')
plt.show()


t['Voltage_min1']


t[['CellID','Labels']].sort_values(by='Labels')


from sklearn.decomposition import PCA
n=df_train.shape[0]

#Finding principal components for the data
pca =PCA(n_components=n, random_state=1)# Applying the PCA algorithm with random state = 1
data_pca1 = pd.DataFrame(pca.fit_transform(df_train)) #Fitting and transforming the pca function on scaled data

#The percentage of variance explained by each principal component
exp_var = pca.explained_variance_ratio_

plt.figure(figsize = (10,10))
plt.plot(range(1,24), exp_var.cumsum(), marker = 'o', linestyle = '--')
plt.title("Explained Variances by Components")
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")


# finding the least number of components that can explain more than 90% variance
sum = 0
for ix, i in enumerate(exp_var):
  sum = sum + i
  if(sum>0.90):
    print("Number of PCs that explain at least 90% variance: ", ix+1)
    break


pc_comps = ['PC1','PC2','PC3']
data_pca = pd.DataFrame(np.round(pca.components_[:3,:],2),index=pc_comps,columns=df_train.columns)
data_pca.T


df_concat = pd.concat([data_pca1, X], axis=1)

plt.figure(figsize = (7,7))
#Create a scatter plot with x=0 and y=1 using df_concat dataframe
sns.scatterplot(x = 0, y = 1, data=df_concat, hue = 'OCV_charge1')

plt.xlabel("PC1")
plt.ylabel("PC2")


df_concat.head()

	Cycle IDC1526	Cap_Chg(mAh)C1526	Cap_DChg(mAh)C1526	Chg/DChg Efficiency(%)C1526
0	1.0	3.5001	3.1974	91.352
1	2.0	3.2762	3.2799	100.113
2	3.0	3.2909	3.3607	102.120
3	4.0	3.3658	3.3276	98.864
4	5.0	3.3224	3.2781	98.667

	Cycle IDC1526	Cap_Chg(mAh)C1526	Cap_DChg(mAh)C1526	Chg/DChg Efficiency(%)C1526
70	71.0	3.2330	3.3079	102.315
71	72.0	3.3123	3.2176	97.141
72	73.0	3.2164	3.2149	99.951
73	74.0	3.2158	0.6326	19.671
74	NaN	NaN	NaN	NaN

	Cycle IDC1526	Cap_Chg(mAh)C1526	Cap_DChg(mAh)C1526	Chg/DChg Efficiency(%)C1526	Cap_retention
70	71.0	3.2330	3.3079	102.315	0.945087
71	72.0	3.3123	3.2176	97.141	0.919288
72	73.0	3.2164	3.2149	99.951	0.918517
73	74.0	3.2158	0.6326	19.671	0.180738
74	NaN	NaN	NaN	NaN	NaN

	Cycle IDC1526_0	Cycle IDC1526_1	Cycle IDC1526_2	Cycle IDC1526_3	Cycle IDC1526_4	Cycle IDC1526_5	Cycle IDC1526_6	Cycle IDC1526_7	Cycle IDC1526_8	Cycle IDC1526_9	...	Cap_retention_65	Cap_retention_66	Cap_retention_67	Cap_retention_68	Cap_retention_69	Cap_retention_70	Cap_retention_71	Cap_retention_72	Cap_retention_73	Cap_retention_74
CellID
C1526	1.0	2.0	3.0	4.0	5.0	6.0	7.0	8.0	9.0	10.0	...	0.917974	0.928973	0.915117	0.916917	0.923231	0.945087	0.919288	0.918517	0.180738	NaN

	Voltage(V)	Voltage(V)-d	Cycle ID	Cap_Chg(mAh)	Cap_DChg(mAh)	Chg/DChg Efficiency(%)	Cap_retention	Voltage(V)-max	Voltage(V)-min
71.0	4.2872	3.5543	72.0	3.2268	3.2752	101.501	0.935718	4.2999	2.9975
72.0	4.2863	3.5630	73.0	3.2749	3.2024	97.787	0.914919	4.3002	2.9991
73.0	4.2857	3.5636	74.0	3.2029	3.1951	99.754	0.912834	4.2999	2.9988
74.0	4.2860	NaN	75.0	3.1985	0.9346	29.220	0.267013	4.2999	2.9966
75.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	4.2999	3.5912

Copy right@Michael Guo¶

In this notebook, I am cocatenate all meaningful data in one dataframe.¶

These data include: voltage, capacity, efficiency, charging OCV, discharging OCV, charging IR, discharging IR.¶

Later, each cells dQ/dV peak position.¶

opening the file in read mode¶

reading the file¶

replacing end of line('/n') with ' ' and¶

splitting the text it further when '.' is seen.¶

Define functions to read files with selected columns, convert data from a column data to row data.¶

Define selected columns name, change the name according to the cell name¶

Experiment regression on single cell data to predict capacity retention.¶

Convert data for all cells, export data to one dataframe for clustering¶

Convert data in one dataframe for one cell, selected features ocv, capacity for regression,¶

Since all cells have the similar properties, we will concat all data together with the same column name.¶

use scikit learn for training¶

Only use OCV for retention prediction¶

use one cell data for the same training df_final_one¶

Explore tpot to search which regressor is the best¶

Use Randomforest regressor for the training¶

Explore correlation of data, remove high correlated data¶

Dimension reduction with correlation: The dimension is reduced from 207 features to 8 features¶

Explore clustering for the data¶

	count	mean	std	min	25%	50%	75%	max
Voltage(V)	365.0	4.284702	0.004368	4.250000	4.282900	4.2854	4.286900	4.291600
Voltage(V)-d	365.0	3.569271	0.006049	3.554300	3.564600	3.5689	3.573500	3.589400
Cycle ID	365.0	37.035616	21.160593	1.000000	19.000000	37.0000	55.000000	75.000000
Cap_Chg(mAh)	365.0	3.278646	0.044866	3.148100	3.254800	3.2801	3.299900	3.500200
Cap_DChg(mAh)	365.0	3.258836	0.188175	0.632600	3.251500	3.2753	3.297300	3.369800
Chg/DChg Efficiency(%)	365.0	99.398082	5.731816	19.671000	99.333000	100.0170	100.384000	102.714000
Cap_retention	365.0	0.931069	0.053763	0.180738	0.928973	0.9358	0.942086	0.962772
Voltage(V)-max	365.0	4.298298	0.033746	3.655700	4.299900	4.3002	4.300500	4.300900
Voltage(V)-min	365.0	3.002579	0.055184	2.992600	2.996300	2.9975	2.999100	3.619100

	Cap_Chg(mAh)C1526	Cap_DChg(mAh)C1526	Chg/DChg Efficiency(%)C1526	Cap_retention
count	59.000000	59.000000	59.000000	59.000000
mean	3.292112	3.238314	98.348017	0.925206
std	0.048912	0.347416	10.518651	0.099259
min	3.202500	0.632600	19.671000	0.180738
25%	3.267400	3.251500	99.120500	0.928973
50%	3.299400	3.294600	99.958000	0.941287
75%	3.319050	3.308800	100.313500	0.945344
max	3.500100	3.362500	102.120000	0.960687

	Unnamed: 0	Voltage(V)	Voltage(V)-d	Cycle ID	Cap_Chg(mAh)	Cap_DChg(mAh)	Chg/DChg Efficiency(%)	Cap_retention	Voltage(V)-max	Voltage(V)-min
0	0	3.4139	3.4933	1	3.5000	2.9844	85.267	0.852686	4.0516	3.0000
1	1	4.0448	3.5091	2	3.5001	3.4149	97.565	0.975686	4.1979	2.9997
2	2	4.1861	3.5063	3	3.5001	3.5149	100.424	1.004257	4.2184	2.9997
3	3	4.2069	3.5292	4	3.5000	3.3471	95.631	0.956314	4.2196	3.0000
4	4	4.2007	3.5224	5	3.5001	3.5461	101.315	1.013171	4.2748	2.9991
5	5	4.2513	3.4936	6	3.5001	0.0000	0.000	0.000000	4.2494	3.5931