import pandas as pd
import numpy as np
from util import *
import warnings
warnings.filterwarnings("ignore")
import os
datafolder = 'GPE7'
file_path = 'Dat/'+datafolder
files = os.listdir(file_path)
#files.remove('readme')
files
['.DS_Store', 'C1538.xls', 'C1532.xls', 'C1526.xls', 'C1535.xls', 'C1536.xls']
# Load the processed charging-OCV table and inspect it. In the original
# export several of these statements had been fused onto one line.
exp_path = 'ProcessedData/'
ocv_charging = read_file(exp_path + 'OCV-charge-final.csv')
ocv_charging.head(5)
c = ocv_charging.columns
c
# original column names
col = ['Cycle ID', 'Step Name', 'Voltage(V)', 'Current(mA)', 'Capacity(mAh)',
       'Time(h:min:s.ms)', 'Realtime', 'Time', 'RTime']
# Read the list of cell IDs (one per line). The context manager closes the
# file handle as soon as the text has been read.
with open("cells.txt", "r") as my_file:
    data = my_file.read()
cells_into_list = data.split("\n")
#get data from dataframe for interested columns
def get_df(path='ProcessedData/'+datafolder+'/',fileName = 'OCV-charge-final.csv', interestCol='Voltage(V)'):
    """Load a processed data file and keep only the requested column(s).

    Args:
        path: Folder prefix of the file; it is concatenated with ``fileName``
            as-is, so it must end with a path separator.
        fileName: Name of the file to load through ``read_file``.
        interestCol: A single column label, or a list of column labels.

    Returns:
        A DataFrame holding only the requested column(s); a single label
        still yields a one-column DataFrame rather than a Series.
        (Assumes ``read_file`` returns a pandas DataFrame - TODO confirm.)
    """
    df = read_file(path + fileName)
    # isinstance() instead of an exact type comparison, so list subclasses
    # are treated as lists as well.
    columns = interestCol if isinstance(interestCol, list) else [interestCol]
    # Return an independent copy: callers add columns to the result, and
    # writing into a selection of the loaded frame is a chained assignment.
    return df[columns].copy()
#convert data from columns to rows, reset index with cellID
def convertData(df, index_col = 'CellID'):
    """Flatten a DataFrame into a single row indexed by a cell ID.

    Every (column, row-label) pair of ``df`` becomes one column named
    ``"<column>_<row-label>"``, ordered column by column.

    Args:
        df: Frame to flatten.
        index_col: Value stored as the row's index entry (the cell ID); the
            index itself is always named ``'CellID'``.

    Returns:
        A one-row DataFrame whose index is named ``'CellID'``.
    """
    stacked = df.unstack()
    wide = stacked.to_frame().T
    # Join the two levels of each column key into a flat "<column>_<row>" name.
    wide.columns = wide.columns.map(lambda key: '{0[0]}_{0[1]}'.format(key))
    wide['CellID'] = index_col
    return wide.set_index(['CellID'])
path='ProcessedData/'+datafolder+'/'
# original columns name
col = [
    'Cycle ID', 'Step Name', 'Voltage(V)',
    'Current(mA)', 'Capacity(mAh)', 'Time(h:min:s.ms)',
    'Realtime', 'Time', 'RTime',
]
# same layout, but with the discharge-voltage column name
col2 = [
    'Cycle ID', 'Step Name', 'Voltage(V)-d',
    'Current(mA)', 'Capacity(mAh)', 'Time(h:min:s.ms)',
    'Realtime', 'Time', 'RTime',
]
cap_col = [
    'Cycle ID', 'Cap_Chg(mAh)',
    'Cap_DChg(mAh)', 'Chg/DChg Efficiency(%)',
]
#set the number correctly depending the ending condition
cyc_dis = 74  # define how many cycles in the datasheet, length of data
cyc_charging = 75  #define voltage data
#rename columns' name for all data frame
all_list = ['OCV_charge', 'OCV_discharge', 'Cap_new', 'Vol_new']
_charge_cycles = range(1, cyc_charging + 1)
OCV_charge = ['OCV_charge' + str(n) for n in _charge_cycles]
OCV_discharge = ['OCV_discharge' + str(n) for n in range(1, cyc_dis + 1)]
drop_cycid = ['Cycle ID' + str(n) for n in _charge_cycles]
# zip() stops at the shorter list, so only the first len(cap_col) entries of
# col are paired with cap_col; each pair is expanded once per charging cycle.
_paired_names = list(zip(col, cap_col))
Vol_new = [vol_name + str(n) for vol_name, _ in _paired_names for n in _charge_cycles]
Cap_new = [cap_name + str(n) for _, cap_name in _paired_names for n in _charge_cycles]
# one name per charging cycle for the voltage extrema and the cycle ID
voltage_max = ['Voltage_max' + str(n) for n in _charge_cycles]
voltage_min = ['Voltage_min' + str(n) for n in _charge_cycles]
cyc = ['Cycle ID' + str(n) for n in _charge_cycles]
print(len(cyc), len(Vol_new))
75 300
#c = df_final.columns.to_list()
#c[0].startswith('Cyc')
Col_cap = [c+"C1526" for c in cap_col]
cell_cap = get_df(path, 'Cap_final.csv', interestCol=Col_cap)
cell_cap.head()
Cycle IDC1526 | Cap_Chg(mAh)C1526 | Cap_DChg(mAh)C1526 | Chg/DChg Efficiency(%)C1526 | |
---|---|---|---|---|
0 | 1.0 | 3.5001 | 3.1974 | 91.352 |
1 | 2.0 | 3.2762 | 3.2799 | 100.113 |
2 | 3.0 | 3.2909 | 3.3607 | 102.120 |
3 | 4.0 | 3.3658 | 3.3276 | 98.864 |
4 | 5.0 | 3.3224 | 3.2781 | 98.667 |
cell_cap.tail()
Cycle IDC1526 | Cap_Chg(mAh)C1526 | Cap_DChg(mAh)C1526 | Chg/DChg Efficiency(%)C1526 | |
---|---|---|---|---|
70 | 71.0 | 3.2330 | 3.3079 | 102.315 |
71 | 72.0 | 3.3123 | 3.2176 | 97.141 |
72 | 73.0 | 3.2164 | 3.2149 | 99.951 |
73 | 74.0 | 3.2158 | 0.6326 | 19.671 |
74 | NaN | NaN | NaN | NaN |
cell_cap['Cap_retention'] = cell_cap['Cap_DChg(mAh)C1526']/cell_cap.iloc[0]['Cap_Chg(mAh)C1526']
cell_cap.tail()
Cycle IDC1526 | Cap_Chg(mAh)C1526 | Cap_DChg(mAh)C1526 | Chg/DChg Efficiency(%)C1526 | Cap_retention | |
---|---|---|---|---|---|
70 | 71.0 | 3.2330 | 3.3079 | 102.315 | 0.945087 |
71 | 72.0 | 3.3123 | 3.2176 | 97.141 | 0.919288 |
72 | 73.0 | 3.2164 | 3.2149 | 99.951 | 0.918517 |
73 | 74.0 | 3.2158 | 0.6326 | 19.671 | 0.180738 |
74 | NaN | NaN | NaN | NaN | NaN |
p = convertData(cell_cap,index_col= "C1526")
p.head()
Cycle IDC1526_0 | Cycle IDC1526_1 | Cycle IDC1526_2 | Cycle IDC1526_3 | Cycle IDC1526_4 | Cycle IDC1526_5 | Cycle IDC1526_6 | Cycle IDC1526_7 | Cycle IDC1526_8 | Cycle IDC1526_9 | ... | Cap_retention_65 | Cap_retention_66 | Cap_retention_67 | Cap_retention_68 | Cap_retention_69 | Cap_retention_70 | Cap_retention_71 | Cap_retention_72 | Cap_retention_73 | Cap_retention_74 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
CellID | |||||||||||||||||||||
C1526 | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 | 6.0 | 7.0 | 8.0 | 9.0 | 10.0 | ... | 0.917974 | 0.928973 | 0.915117 | 0.916917 | 0.923231 | 0.945087 | 0.919288 | 0.918517 | 0.180738 | NaN |
1 rows × 375 columns
p.shape
(1, 375)
import seaborn as sns
sns.pairplot(cell_cap[cell_cap.columns], diag_kind='kde')
<seaborn.axisgrid.PairGrid at 0x7f8bc95d9d30>
Use linear regression first
#!pip install tensorflow
#!pip install keras
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
#from keras.wrappers.scikit_learn import KerasClassifier
from keras.wrappers.scikit_learn import KerasRegressor
def create_model(learning_rate=0.01):
    """Build and compile a one-layer (linear) Keras regression model.

    The network is the module-level ``normalizer`` layer followed by a
    single ``Dense(1)`` unit, trained with Adam on a mean-absolute-error loss.

    Args:
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    # NOTE(review): `normalizer` and `tf` are module-level names that must be
    # defined before this builder is called. `normalizer` is adapted on raw
    # features, so wrapping this model in a pipeline that already standardizes
    # its inputs (tf_train with standardize=True) scales the data twice -
    # confirm which of the two scalers is intended.
    linear_model = tf.keras.Sequential([normalizer, tf.keras.layers.Dense(1)])
    linear_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='MAE',
        # 'accuracy' is a classification metric and carries no meaning for a
        # continuous target; track the mean squared error instead.
        metrics=['mse'],
    )
    return linear_model
# scikit-learn compatible wrapper so the Keras model can be used in a Pipeline
clf = KerasRegressor(build_fn=create_model, epochs=100, verbose=0, validation_split = 0.2)
def tf_train(X, y, clf, standardize = True) -> np.ndarray:
    """Fit ``clf`` on an 80/20 train/test split and report test-set metrics.

    Prints the training shapes, the mean squared error and the R^2 score of
    the predictions on the held-out 20 %.

    Args:
        X: Feature table.
        y: Target values.
        clf: scikit-learn compatible estimator (e.g. the ``KerasRegressor``
            wrapper) used as the final pipeline step.
        standardize: When True, a ``StandardScaler`` is placed in front of the
            estimator.

    Returns:
        The predictions for the held-out test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    print(X_train.shape, y_train.shape)
    if standardize:
        #pipeline = Pipeline([('clf',clf)])
        pipe = Pipeline([('scaler', StandardScaler()), ('classifier', clf)])
        print("Using Normalized data")
    else:
        # Pipeline steps must be (name, estimator) tuples; the bare
        # ['classifier', clf] list used before is rejected by scikit-learn.
        pipe = Pipeline([('classifier', clf)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print("Mean squared error ", mean_squared_error(y_test, y_pred))
    print("r2 score ", r2_score(y_test, y_pred))
    return y_pred
The 'cells.txt' file (one cell ID per line) must be created before running the cells below.
# Build df_final in "wide" form: one row per cell, one column per
# (quantity, cycle), indexed by CellID.
# Read the cell IDs (one per line); the context manager closes the file.
with open(path+datafolder+"cells.txt", "r") as my_file:
    data = my_file.read()
# Split on newlines and skip blank entries, so the list is right whether or
# not the file ends with a trailing newline (the previous unconditional pop()
# dropped a real cell when it did not).
cells_into_list = [line for line in data.split("\n") if line.strip()]
# Vol_final.csv does not depend on the cell, so load it once instead of
# re-reading it on every iteration.
vol = pd.read_csv(path+ 'Vol_final.csv')
# cell_cap gains a 'Cap_retention' column below, so the flattened capacity row
# has one more group of per-cycle columns than Cap_new provides names for.
cap_names = Cap_new + ['Cap_retention'+str(n) for n in range(1, cyc_charging+1)]
ocv_1 = pd.DataFrame()
ocv_2 = pd.DataFrame()
Cap = pd.DataFrame()
df_max = pd.DataFrame()
df_min = pd.DataFrame()
for cell in cells_into_list:
    print("processing cells: ", cell)
    Col = 'Voltage(V)' + cell
    # the discharge file names its voltage column 'Voltage(V)-d<cell>', as the
    # other two processing loops in this notebook also assume
    Col2 = 'Voltage(V)-d' + cell
    Col_cap = [c+cell for c in cap_col]
    df = get_df(path, 'OCV-charge-final.csv', interestCol=Col)
    ndf = convertData(df,index_col=cell)
    #change column name according to cycle numbers
    print("Coverting charging OCV...")
    ndf.columns=OCV_charge
    ocv_1 = pd.concat([ocv_1,ndf], axis=0)
    df2 = get_df(path, 'OCV-discharge-final.csv', interestCol=Col2)
    print("Coverting discharging OCV ...")
    ndf2 = convertData(df2,index_col=cell)
    # NOTE(review): this requires the discharge file to hold exactly cyc_dis
    # rows - confirm against the data.
    ndf2.columns=OCV_discharge
    ocv_2 = pd.concat([ocv_2,ndf2], axis=0)
    cell_cap = get_df(path, 'Cap_final.csv', interestCol=Col_cap)
    # retention = discharge capacity relative to the first-cycle charge capacity
    cell_cap['Cap_retention'] = cell_cap['Cap_DChg(mAh)'+cell]/cell_cap.iloc[0]['Cap_Chg(mAh)'+cell]
    p = convertData(cell_cap,index_col= cell)
    p.columns = cap_names
    Cap = pd.concat([Cap,p], axis=0)
    #find out the max and minimum potential during charging discharging
    cycle_groups = vol.groupby('Cycle ID'+cell)[Col]
    max_v = pd.DataFrame(cycle_groups.max())
    min_v = pd.DataFrame(cycle_groups.min())
    print("max_v, min_v shape",max_v.shape, min_v.shape)
    #convert data to one row
    print("Coverting max charging OCV...")
    df_max_v = convertData(max_v,index_col=cell)
    df_max_v.columns=voltage_max[:df_max_v.shape[1]] #different length cycling
    print("Coverting min discharging OCV...")
    df_min_v = convertData(min_v,index_col=cell)
    df_min_v.columns=voltage_min[:df_min_v.shape[1]]
    df_max = pd.concat([df_max,df_max_v],axis=0)
    df_min = pd.concat([df_min,df_min_v],axis=0)
#concat all dataframe into one
df_final = pd.concat([ocv_1,
                      ocv_2,
                      Cap,
                      df_max,
                      df_min], axis = 1)
print("All data processing is done!")
# This cell converts data with individual cell name
# (df_final gets one row per cycle and one group of columns per cell)
re = [] # retention
cy = [] # cycle ID
vo = [] # charge voltage
vo_d = [] # discharge voltage
# Read the cell IDs (one per line); the context manager closes the file.
with open(path+datafolder+"cells.txt", "r") as my_file:
    data = my_file.read()
# Split on newlines and skip blank entries, so the list is right whether or
# not the file ends with a trailing newline.
cells_into_list = [line for line in data.split("\n") if line.strip()]
# Vol_final.csv does not depend on the cell, so load it once.
vol = pd.read_csv(path+ 'Vol_final.csv')
ocv_1 = pd.DataFrame()
ocv_2 = pd.DataFrame()
Cap = pd.DataFrame()
df_max = pd.DataFrame()
df_min = pd.DataFrame()
for cell in cells_into_list:
    cy.append('Cycle ID'+cell)
    vo.append('Voltage(V)'+cell)
    vo_d.append('Voltage(V)-d'+cell)
    print("processing cells: ", cell)
    Col = 'Voltage(V)' + cell
    Col2 = 'Voltage(V)-d' + cell
    Col_cap = [c+cell for c in cap_col]
    df = get_df(path, 'OCV-charge-final.csv', interestCol=Col)
    print("Coverting charging OCV...")
    ocv_1 = pd.concat([ocv_1,df], axis=1)
    ocv_1 = ocv_1.loc[~ocv_1.index.duplicated(keep='first')]
    df2 = get_df(path, 'OCV-discharge-final.csv', interestCol=Col2)
    print("Coverting discharging OCV ...")
    ocv_2 = pd.concat([ocv_2,df2], axis=1)
    ocv_2 = ocv_2.loc[~ocv_2.index.duplicated(keep='first')]
    cell_cap = get_df(path, 'Cap_final.csv', interestCol=Col_cap)
    # retention = discharge capacity relative to the first-cycle charge capacity
    cell_cap['Cap_retention'+cell] = cell_cap['Cap_DChg(mAh)'+cell]/cell_cap.iloc[0]['Cap_Chg(mAh)'+cell]
    re.append('Cap_retention'+cell)
    Cap = pd.concat([Cap,cell_cap], axis=1)
    Cap = Cap.loc[~Cap.index.duplicated(keep='first')]
    #find out the max and minimum potential during charging discharging
    cycle_groups = vol.groupby('Cycle ID'+cell)[Col]
    max_v = pd.DataFrame(cycle_groups.max())
    min_v = pd.DataFrame(cycle_groups.min())
    print("max_v, min_v shape",max_v.shape, min_v.shape)
    # groupby labels its rows by cycle number (1, 2, ...), while the frames
    # above are labelled by row position (0, 1, ...) with cycle k stored in
    # row k-1. Shift the labels so the axis=1 concat puts the extrema of
    # cycle k on the same row as the rest of cycle k instead of one row late.
    # (assumes the loaded files keep a default 0..n-1 index - TODO confirm)
    max_v.index = max_v.index.astype(int) - 1
    min_v.index = min_v.index.astype(int) - 1
    #convert data to one row
    print("Coverting max charging OCV...")
    df_max = pd.concat([df_max,max_v],axis=1)
    df_max = df_max.loc[~df_max.index.duplicated(keep='first')]
    print("Coverting min discharging OCV...")
    df_min = pd.concat([df_min,min_v],axis=1)
    df_min = df_min.loc[~df_min.index.duplicated(keep='first')]
#concat all dataframe into one
# NOTE(review): df_max and df_min columns carry the same 'Voltage(V)<cell>'
# label as the charge OCV columns, so df_final has duplicate column labels and
# df_final[vo] selects all three - rename if that is not intended.
df_final = pd.concat([ocv_1,
                      ocv_2,
                      Cap,
                      df_max,
                      df_min], axis = 1)
print("All data processing is done!")
processing cells: C1538 Coverting charging OCV... Coverting discharging OCV ... max_v, min_v shape (75, 1) (75, 1) Coverting max charging OCV... Coverting min discharging OCV... processing cells: C1532 Coverting charging OCV... Coverting discharging OCV ... max_v, min_v shape (75, 1) (75, 1) Coverting max charging OCV... Coverting min discharging OCV... processing cells: C1526 Coverting charging OCV... Coverting discharging OCV ... max_v, min_v shape (74, 1) (74, 1) Coverting max charging OCV... Coverting min discharging OCV... processing cells: C1535 Coverting charging OCV... Coverting discharging OCV ... max_v, min_v shape (75, 1) (75, 1) Coverting max charging OCV... Coverting min discharging OCV... processing cells: C1536 Coverting charging OCV... Coverting discharging OCV ... max_v, min_v shape (74, 1) (74, 1) Coverting max charging OCV... Coverting min discharging OCV... All data processing is done!
# this cell convert all cells data in one .
# (df_final_one stacks every cell's cycles under shared column names)
re = [] # retention
cy = [] # cycle ID
vo = [] # charge voltage
vo_d = [] # discharge voltage
# Read the cell IDs (one per line); the context manager closes the file.
with open(path+datafolder+"cells.txt", "r") as my_file:
    data = my_file.read()
# Split on newlines and skip blank entries, so the list is right whether or
# not the file ends with a trailing newline.
cells_into_list = [line for line in data.split("\n") if line.strip()]
# Vol_final.csv does not depend on the cell, so load it once.
vol = pd.read_csv(path+ 'Vol_final.csv')
ocv_1 = pd.DataFrame()
ocv_2 = pd.DataFrame()
Cap = pd.DataFrame()
df_max = pd.DataFrame()
df_min = pd.DataFrame()
for cell in cells_into_list:
    cy.append('Cycle ID'+cell)
    vo.append('Voltage(V)'+cell)
    vo_d.append('Voltage(V)-d'+cell)
    print("processing cells: ", cell)
    Col = 'Voltage(V)' + cell
    Col2 = 'Voltage(V)-d' + cell
    Col_cap = [c+cell for c in cap_col]
    df = get_df(path, 'OCV-charge-final.csv', interestCol=Col)
    print("Coverting charging OCV...")
    df.columns = ['Voltage(V)']
    # ignore_index=True already yields a unique 0..n-1 index, so the former
    # duplicate-index filter after each concat was a no-op and is dropped.
    ocv_1 = pd.concat([ocv_1,df], axis=0,ignore_index=True)
    df2 = get_df(path, 'OCV-discharge-final.csv', interestCol=Col2)
    print("Coverting discharging OCV ...")
    df2.columns = ['Voltage(V)-d']
    ocv_2 = pd.concat([ocv_2,df2], axis=0,ignore_index=True)
    cell_cap = get_df(path, 'Cap_final.csv', interestCol=Col_cap)
    # retention = discharge capacity relative to the first-cycle charge capacity
    cell_cap['Cap_retention'+cell] = cell_cap['Cap_DChg(mAh)'+cell]/cell_cap.iloc[0]['Cap_Chg(mAh)'+cell]
    re.append('Cap_retention'+cell)
    cell_cap.columns = ['Cycle ID', 'Cap_Chg(mAh)', 'Cap_DChg(mAh)',
                        'Chg/DChg Efficiency(%)', 'Cap_retention']
    Cap = pd.concat([Cap,cell_cap], axis=0,ignore_index=True)
    #find out the max and minimum potential during charging discharging
    cycle_groups = vol.groupby('Cycle ID'+cell)[Col]
    max_v = pd.DataFrame(cycle_groups.max())
    min_v = pd.DataFrame(cycle_groups.min())
    print("max_v, min_v shape",max_v.shape, min_v.shape)
    # The number of cycles found by groupby differs between cells, while the
    # frames above have the same row count for every cell. Move cycle k to
    # row k-1 and pad to this cell's row count, so all five accumulators grow
    # in lockstep and the rows of later cells stay aligned.
    # (assumes the loaded files keep a default 0..n-1 index - TODO confirm)
    max_v.index = max_v.index.astype(int) - 1
    min_v.index = min_v.index.astype(int) - 1
    max_v = max_v.reindex(df.index)
    min_v = min_v.reindex(df.index)
    #convert data to one row
    print("Coverting max charging OCV...")
    max_v.columns = ['Voltage(V)-max']
    df_max = pd.concat([df_max,max_v],axis=0,ignore_index=True)
    print("Coverting min discharging OCV...")
    min_v.columns = ['Voltage(V)-min']
    df_min = pd.concat([df_min,min_v],axis=0,ignore_index=True)
#concat all dataframe into one
df_final_one = pd.concat([ocv_1,
                          ocv_2,
                          Cap,
                          df_max,
                          df_min], axis = 1)
print("All data processing is done!")
processing cells: C1538 Coverting charging OCV... Coverting discharging OCV ... max_v, min_v shape (75, 1) (75, 1) Coverting max charging OCV... Coverting min discharging OCV... processing cells: C1532 Coverting charging OCV... Coverting discharging OCV ... max_v, min_v shape (75, 1) (75, 1) Coverting max charging OCV... Coverting min discharging OCV... processing cells: C1526 Coverting charging OCV... Coverting discharging OCV ... max_v, min_v shape (74, 1) (74, 1) Coverting max charging OCV... Coverting min discharging OCV... processing cells: C1535 Coverting charging OCV... Coverting discharging OCV ... max_v, min_v shape (75, 1) (75, 1) Coverting max charging OCV... Coverting min discharging OCV... processing cells: C1536 Coverting charging OCV... Coverting discharging OCV ... max_v, min_v shape (74, 1) (74, 1) Coverting max charging OCV... Coverting min discharging OCV... All data processing is done!
df_final_one.columns
Index(['Voltage(V)', 'Voltage(V)-d', 'Cycle ID', 'Cap_Chg(mAh)', 'Cap_DChg(mAh)', 'Chg/DChg Efficiency(%)', 'Cap_retention', 'Voltage(V)-max', 'Voltage(V)-min'], dtype='object')
df_min_v.head()
sns.pairplot(df_final_one,hue='Cycle ID')
<seaborn.axisgrid.PairGrid at 0x7f8ba1a23f10>
df_final_one.tail()
Voltage(V) | Voltage(V)-d | Cycle ID | Cap_Chg(mAh) | Cap_DChg(mAh) | Chg/DChg Efficiency(%) | Cap_retention | Voltage(V)-max | Voltage(V)-min | |
---|---|---|---|---|---|---|---|---|---|
71.0 | 4.2872 | 3.5543 | 72.0 | 3.2268 | 3.2752 | 101.501 | 0.935718 | 4.2999 | 2.9975 |
72.0 | 4.2863 | 3.5630 | 73.0 | 3.2749 | 3.2024 | 97.787 | 0.914919 | 4.3002 | 2.9991 |
73.0 | 4.2857 | 3.5636 | 74.0 | 3.2029 | 3.1951 | 99.754 | 0.912834 | 4.2999 | 2.9988 |
74.0 | 4.2860 | NaN | 75.0 | 3.1985 | 0.9346 | 29.220 | 0.267013 | 4.2999 | 2.9966 |
75.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 4.2999 | 3.5912 |
df_final_one.shape
(76, 9)
df_final_one.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Voltage(V) | 365.0 | 4.284702 | 0.004368 | 4.250000 | 4.282900 | 4.2854 | 4.286900 | 4.291600 |
Voltage(V)-d | 365.0 | 3.569271 | 0.006049 | 3.554300 | 3.564600 | 3.5689 | 3.573500 | 3.589400 |
Cycle ID | 365.0 | 37.035616 | 21.160593 | 1.000000 | 19.000000 | 37.0000 | 55.000000 | 75.000000 |
Cap_Chg(mAh) | 365.0 | 3.278646 | 0.044866 | 3.148100 | 3.254800 | 3.2801 | 3.299900 | 3.500200 |
Cap_DChg(mAh) | 365.0 | 3.258836 | 0.188175 | 0.632600 | 3.251500 | 3.2753 | 3.297300 | 3.369800 |
Chg/DChg Efficiency(%) | 365.0 | 99.398082 | 5.731816 | 19.671000 | 99.333000 | 100.0170 | 100.384000 | 102.714000 |
Cap_retention | 365.0 | 0.931069 | 0.053763 | 0.180738 | 0.928973 | 0.9358 | 0.942086 | 0.962772 |
Voltage(V)-max | 365.0 | 4.298298 | 0.033746 | 3.655700 | 4.299900 | 4.3002 | 4.300500 | 4.300900 |
Voltage(V)-min | 365.0 | 3.002579 | 0.055184 | 2.992600 | 2.996300 | 2.9975 | 2.999100 | 3.619100 |
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
def train(X, y, standardize = True) -> np.ndarray:
    """Fit a linear regression on an 80/20 split and report test-set metrics.

    Prints the mean squared error and the R^2 score of the predictions on
    the held-out 20 %.

    Args:
        X: Feature table.
        y: Target values (one or several columns).
        standardize: When True, features pass through a ``StandardScaler``
            before the regression.

    Returns:
        The predictions for the held-out test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    if standardize:
        pipe = make_pipeline(StandardScaler(), linear_model.LinearRegression())
        print("Using Normalized data")
    else:
        pipe = make_pipeline(linear_model.LinearRegression())
    # fit() returns the pipeline itself, so no separate alias is needed
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print("Mean squared error ", mean_squared_error(y_test, y_pred))
    print("r2 score ", r2_score(y_test, y_pred))
    return y_pred
# Train on every remaining column as X (voltages, capacities, efficiency),
# with the per-cell capacity-retention columns as y.
# A near-perfect linear fit is expected here: each retention column is the
# cell's discharge capacity divided by a constant (its first-cycle charge
# capacity), and that discharge capacity is still among the features.
# NOTE: both statements below modify df_final in place; re-running this cell
# fails because the cycle-ID columns in `cy` are already gone.
df_final.dropna(inplace=True)
df_final.drop(cy, axis=1, inplace=True)
# `re` holds the 'Cap_retention<cell>' column names collected while df_final was built
X = df_final.drop(re, axis=1)
y = df_final[re]
print(X.shape, y.shape)
train(X, y, standardize = True)
(72, 35) (72, 5) Using Normalized data Mean squared error 1.7420678323630677e-32 r2 score 1.0
array([[0.93323239, 0.92948773, 0.9423445 , 0.93894286, 0.94440159], [0.93506085, 0.93577326, 0.9483729 , 0.9434 , 0.94780149], [0.92591852, 0.91531671, 0.93185909, 0.9324 , 0.93734465], [0.93297526, 0.92503071, 0.94048741, 0.9358 , 0.94097312], [0.93140392, 0.92477358, 0.94128739, 0.9332 , 0.94125882], [0.93826067, 0.93420188, 0.92717351, 0.934 , 0.93840176], [0.93297526, 0.93837319, 0.93420188, 0.93711429, 0.94048741], [0.92694703, 0.9145453 , 0.91877375, 0.929 , 0.93211623], [0.932461 , 0.92740207, 0.94625868, 0.93868571, 0.94154453], [0.91491915, 0.90514557, 0.91851661, 0.91642857, 0.92820205], [0.92277584, 0.91900231, 0.92374504, 0.92717143, 0.930002 ], [0.95137421, 0.95831548, 0.96068684, 0.95542857, 0.96065827], [0.93534655, 0.93394474, 0.95228708, 0.94185714, 0.94940145], [0.93037541, 0.92897346, 0.9480872 , 0.94002857, 0.94337305], [0.93166105, 0.92528785, 0.94468729, 0.93502857, 0.9420588 ]])
X.columns
Index(['Voltage(V)C1538', 'Voltage(V)C1532', 'Voltage(V)C1526', 'Voltage(V)C1535', 'Voltage(V)C1536', 'Voltage(V)-dC1538', 'Voltage(V)-dC1532', 'Voltage(V)-dC1526', 'Voltage(V)-dC1535', 'Voltage(V)-dC1536', 'Cap_Chg(mAh)C1538', 'Cap_DChg(mAh)C1538', 'Chg/DChg Efficiency(%)C1538', 'Cap_Chg(mAh)C1532', 'Cap_DChg(mAh)C1532', 'Chg/DChg Efficiency(%)C1532', 'Cap_Chg(mAh)C1526', 'Cap_DChg(mAh)C1526', 'Chg/DChg Efficiency(%)C1526', 'Cap_Chg(mAh)C1535', 'Cap_DChg(mAh)C1535', 'Chg/DChg Efficiency(%)C1535', 'Cap_Chg(mAh)C1536', 'Cap_DChg(mAh)C1536', 'Chg/DChg Efficiency(%)C1536', 'Voltage(V)C1538', 'Voltage(V)C1532', 'Voltage(V)C1526', 'Voltage(V)C1535', 'Voltage(V)C1536', 'Voltage(V)C1538', 'Voltage(V)C1532', 'Voltage(V)C1526', 'Voltage(V)C1535', 'Voltage(V)C1536'], dtype='object')
df_final.dropna(inplace=True)
X_new = df_final[vo+vo_d]
y_new = df_final[re]
print(X_new.shape, y_new.shape)
train(X_new, y_new, standardize = True)
X_new.columns
Index(['Voltage(V)C1538', 'Voltage(V)C1538', 'Voltage(V)C1538', 'Voltage(V)C1532', 'Voltage(V)C1532', 'Voltage(V)C1532', 'Voltage(V)C1526', 'Voltage(V)C1526', 'Voltage(V)C1526', 'Voltage(V)C1535', 'Voltage(V)C1535', 'Voltage(V)C1535', 'Voltage(V)C1536', 'Voltage(V)C1536', 'Voltage(V)C1536', 'Voltage(V)-dC1538', 'Voltage(V)-dC1532', 'Voltage(V)-dC1526', 'Voltage(V)-dC1535', 'Voltage(V)-dC1536'], dtype='object')
# scikit learn training
train(X_new, y_new, standardize = True)
Using Normalized data Mean squared error 3.8300820045558005e-06 r2 score 0.9551582157842831
array([[0.93125659, 0.92755233, 0.94175398, 0.93804557, 0.94341722], [0.93404823, 0.93393317, 0.94771303, 0.94295516, 0.94690992], [0.92461709, 0.91337739, 0.93138537, 0.93256632, 0.93760171], [0.93086458, 0.92330521, 0.93844584, 0.93377238, 0.93910455], [0.92858767, 0.92096183, 0.93992557, 0.93078092, 0.93957201], [0.93742076, 0.93306938, 0.92689531, 0.93518941, 0.93941512], [0.93210758, 0.93768708, 0.93314921, 0.9371115 , 0.94051248], [0.92612363, 0.91478939, 0.91692426, 0.92823036, 0.93084432], [0.93148588, 0.92601264, 0.94511104, 0.93721136, 0.94018626], [0.92095913, 0.91128887, 0.9249266 , 0.92136828, 0.93298572], [0.92357166, 0.91912178, 0.92409218, 0.92838652, 0.93061378], [0.94963455, 0.95589937, 0.95943277, 0.95436735, 0.95955672], [0.93331531, 0.93183405, 0.9514257 , 0.94126375, 0.94880044], [0.93035938, 0.92847614, 0.94831513, 0.93989741, 0.94314697], [0.93190155, 0.92419011, 0.94556611, 0.93616341, 0.94277647]])
df_final_one.dropna(inplace=True)
X_new_one = df_final_one[['Voltage(V)', 'Voltage(V)-d','Voltage(V)-max', 'Voltage(V)-min']]
y_new_one = df_final_one[['Cap_retention']]
print(X_new_one.shape, y_new_one.shape)
train(X_new_one, y_new_one, standardize = True)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
dataset = cell_cap.dropna()
dataset.drop(['Cycle IDC1526'], axis=1, inplace=True)
train_data = dataset.sample(frac = 0.8,random_state=0)
test_data = dataset.drop(train_data.index)
train_data.describe()
Cap_Chg(mAh)C1526 | Cap_DChg(mAh)C1526 | Chg/DChg Efficiency(%)C1526 | Cap_retention | |
---|---|---|---|---|
count | 59.000000 | 59.000000 | 59.000000 | 59.000000 |
mean | 3.292112 | 3.238314 | 98.348017 | 0.925206 |
std | 0.048912 | 0.347416 | 10.518651 | 0.099259 |
min | 3.202500 | 0.632600 | 19.671000 | 0.180738 |
25% | 3.267400 | 3.251500 | 99.120500 | 0.928973 |
50% | 3.299400 | 3.294600 | 99.958000 | 0.941287 |
75% | 3.319050 | 3.308800 | 100.313500 | 0.945344 |
max | 3.500100 | 3.362500 | 102.120000 | 0.960687 |
dataset.head()
Cap_Chg(mAh)C1526 | Cap_DChg(mAh)C1526 | Chg/DChg Efficiency(%)C1526 | Cap_retention | |
---|---|---|---|---|
0 | 3.5001 | 3.1974 | 91.352 | 0.913517 |
1 | 3.2762 | 3.2799 | 100.113 | 0.937088 |
2 | 3.2909 | 3.3607 | 102.120 | 0.960173 |
3 | 3.3658 | 3.3276 | 98.864 | 0.950716 |
4 | 3.3224 | 3.2781 | 98.667 | 0.936573 |
# train_features = train_data.drop(['Cap_retention'], axis=1)
# test_features = test_data.drop(['Cap_retention'], axis=1)
# train_labels = train_data['Cap_retention']
# test_labels = test_data['Cap_retention']
train_features, test_features, train_labels, test_labels = train_test_split(X_new_one,y_new_one,test_size=0.2, random_state=0)
print(train_features.shape, train_labels.shape)
(292, 4) (292, 1)
normalizer = tf.keras.layers.Normalization()
normalizer.adapt(train_features)
print(f'feature mean:{normalizer.mean.numpy().squeeze()}\n')
print(f'feature variance:{normalizer.variance.numpy().squeeze()}\n')
feature mean:[4.2846317 3.5689516 4.2978477 2.9996626] feature variance:[2.1634660e-05 3.4198158e-05 1.4185647e-03 1.2927492e-03]
linear_model = tf.keras.Sequential([normalizer, tf.keras.layers.Dense(1)])
linear_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), loss='MAE')
%%time
history = linear_model.fit(train_features, train_labels, epochs=100, verbose=0, validation_split = 0.2)
CPU times: user 2.6 s, sys: 308 ms, total: 2.9 s Wall time: 2.52 s
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
def plot_loss(history):
    """Plot training and validation loss per epoch.

    Args:
        history: Keras ``History`` object returned by ``model.fit``; it must
            contain both ``'loss'`` and ``'val_loss'`` (i.e. the fit used
            validation data).
    """
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.xlabel('Epoch')
    # The curves are loss values, not capacity retention, so label the axis
    # accordingly (it previously read 'Cap. retension').
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    plt.title('Loss of training and validation datasets')
plot_loss(history)
test_results = {}
test_results['linear_model'] = linear_model.evaluate(test_features, test_labels)
3/3 [==============================] - 0s 2ms/step - loss: 0.0204
test_results
{'linear_model': 0.020403238013386726}
test_labels[0:1]
Cap_retention | |
---|---|
106 | 0.924774 |
linear_model.predict(test_features[0:1])
1/1 [==============================] - 0s 16ms/step
array([[0.93518853]], dtype=float32)
test_predictions = linear_model.predict(test_features).flatten()
print(test_predictions)
print(test_labels)
def plot_err(test_labels, test_predictions, lims=(0.9, 0.98)):
    """Scatter predicted against true capacity retention with a y = x line.

    Args:
        test_labels: True capacity-retention values.
        test_predictions: Model predictions for the same samples.
        lims: ``(low, high)`` range used for both axes and for the reference
            diagonal. Points outside this range are not visible, so widen it
            when the data contains outliers.
    """
    # equal aspect ratio, so the y = x reference line sits at 45 degrees
    plt.axes(aspect='equal')
    plt.scatter(test_labels, test_predictions)
    plt.xlabel('True Values [Cap. retention]')
    plt.ylabel('Predictions [Cap. retention]')
    plt.xlim(lims)
    plt.ylim(lims)
    _ = plt.plot(lims, lims)
plot_err(test_labels,test_predictions)
3/3 [==============================] - 0s 1ms/step [0.93518853 0.93345743 0.9353479 0.93571234 0.9351162 0.93468064 0.93693304 0.93536174 0.934299 0.93730175 0.93426895 0.93166244 0.9334172 0.93384963 0.9368563 0.9341507 0.9335197 0.9359204 0.65846604 0.9320125 0.9327207 0.9351929 0.93552536 0.9333121 0.9356105 0.935041 0.93401253 0.9361842 0.9336246 0.93282896 0.9334561 0.9363537 0.9371927 0.9362437 0.93622863 0.93701404 0.9342227 0.9337846 0.9334619 0.9350292 0.93433595 0.935823 0.932614 0.9347445 0.93655306 0.93665797 0.9319494 0.93328834 0.9344626 0.9349158 0.9328428 0.9346716 0.9344144 0.93562526 0.93625736 0.9357346 0.9353207 0.9342359 0.9356215 0.9375454 0.93559766 0.9330607 0.9377341 0.93532354 0.9356164 0.9343173 0.9359117 0.9336048 0.9326871 0.93572 0.6722496 0.9344815 0.93457407] Cap_retention 106 0.924774 262 0.935543 45 0.928261 26 0.943460 78 0.947573 .. ... 220 0.945087 160 0.946002 74 0.267013 171 0.952287 114 0.926345 [73 rows x 1 columns]
# Histogram of the prediction error on the held-out samples. The per-epoch
# validation loss that was plotted here before is not a prediction error, so
# it did not match the axis label.
error = test_predictions - test_labels.to_numpy().ravel()
plt.hist(error, bins=25)
plt.xlabel('Prediction Error [Cap. retention]')
_ = plt.ylabel('Count')
tf_train(X_new_one, y_new_one, clf,standardize = True)
(292, 4) (292, 1) Using Normalized data Mean squared error 21.666438846199082 r2 score -1600.7242456233232
array([ 1.5893304, 1.4809682, 1.3511813, 1.2703431, 1.332761 , 1.458071 , 1.6359656, 1.3496668, 1.4166682, 1.3061583, 1.6300495, 1.6927972, 1.5648146, 1.6502798, 1.308687 , 1.6440763, 1.5860374, 1.4708145, -28.263922 , 1.6831822, 1.6044693, 1.5850809, 1.4712341, 1.509927 , 1.2379196, 1.6798527, 1.3174121, 1.5509632, 1.44641 , 1.4641373, 1.7189019, 1.6157234, 1.3663833, 1.39131 , 1.5147235, 1.2598279, 1.3107064, 1.6347802, 1.4998543, 1.4125507, 1.638058 , 1.2320821, 1.5471752, 1.6333497, 1.337158 , 1.4043167, 1.558923 , 1.6211083, 1.4569037, 1.4251707, 1.4741914, 1.5238521, 1.3678386, 1.6179335, 1.5795295, 1.4740584, 1.3920453, 1.598145 , 1.4523752, 1.613189 , 1.4861934, 1.5203724, 1.3390105, 1.4786355, 1.485678 , 1.4077947, 1.4757016, 1.692817 , 1.5813243, 1.5771105, -27.098623 , 1.5492551, 1.5964956], dtype=float32)
%%time
from tpot import TPOTRegressor
tpot = TPOTRegressor(generations=4, # set to 4 to run faster for testing
population_size=40,
verbosity=2,
random_state=42)
tpot.fit(train_features, train_labels)
Generation 1 - Current best internal CV score: -5.2456329057359486e-05 Generation 2 - Current best internal CV score: -5.2456329057359486e-05 Generation 3 - Current best internal CV score: -5.2029186352093085e-05 Generation 4 - Current best internal CV score: -5.089168529401108e-05 Best pipeline: RandomForestRegressor(LassoLarsCV(input_matrix, normalize=False), bootstrap=True, max_features=0.7500000000000001, min_samples_leaf=11, min_samples_split=9, n_estimators=100) CPU times: user 2min 39s, sys: 6.54 s, total: 2min 46s Wall time: 1min 47s
TPOTRegressor(generations=4, population_size=40, random_state=42, verbosity=2)
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoLarsCV
import joblib
from pathlib import Path
#BASE_DIR = Path(__file__).resolve(strict=True).parent
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
BASE_DIR = '/Users/michael/Documents/Projects/NewareDataProcessing'
# Instantiate and fit the model
# (hyper-parameters copied from the random-forest step of the TPOT result)
rfc = RandomForestRegressor(bootstrap=True, max_features=0.7500000000000001, min_samples_leaf=11, min_samples_split=9, n_estimators=100)
# scikit-learn expects a 1-D target for single-output regression; flatten the
# one-column label frame instead of passing a column vector.
model = rfc.fit(train_features, train_labels.to_numpy().ravel())
# Use the model to predict capacity retention on the held-out split
rfc_predictions = model.predict(test_features)
# Persist the fitted model next to the project files
joblib.dump(model, Path(BASE_DIR).joinpath("RFR.joblib"))
['/Users/michael/Documents/Projects/NewareDataProcessing/RFR.joblib']
# quick test with other cells data
six = pd.read_csv("ProcessedData/OneHundred/Final.csv")
six.dropna(inplace=True)
X_new_ot = six[['Voltage(V)', 'Voltage(V)-d','Voltage(V)-max', 'Voltage(V)-min']]
# copy, so the in-place replace below acts on an independent frame
y_new_ot = six[['Cap_retention']].copy()
# NOTE(review): a retention of exactly 0 is replaced by 1 before scoring -
# confirm that this is the intended handling of those rows.
y_new_ot.replace(to_replace = 0, value = 1, inplace=True)
y_pred = model.predict(X_new_ot)
mse = mean_squared_error(y_new_ot, y_pred)
score = model.score(X_new_ot, y_new_ot)
print("R-squared:", score)
print("MSE: ", mse)
# RMSE is the square root of the MSE (it was previously printed as MSE / 2)
print("RMSE: ", mse**(1/2.0))
x_ax = range(len(y_new_ot))
plt.plot(x_ax, y_new_ot, linewidth=1, label="original")
plt.plot(x_ax, y_pred, linewidth=1.1, label="predicted")
plt.title("y-test and y-predicted data")
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.legend(loc='best',fancybox=True, shadow=True)
plt.grid(True)
plt.show()
R-squared: -2.1794558040090903 MSE: 0.37744250079647074 RMSE: 0.18872125039823537
six.head(6)
Unnamed: 0 | Voltage(V) | Voltage(V)-d | Cycle ID | Cap_Chg(mAh) | Cap_DChg(mAh) | Chg/DChg Efficiency(%) | Cap_retention | Voltage(V)-max | Voltage(V)-min | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3.4139 | 3.4933 | 1 | 3.5000 | 2.9844 | 85.267 | 0.852686 | 4.0516 | 3.0000 |
1 | 1 | 4.0448 | 3.5091 | 2 | 3.5001 | 3.4149 | 97.565 | 0.975686 | 4.1979 | 2.9997 |
2 | 2 | 4.1861 | 3.5063 | 3 | 3.5001 | 3.5149 | 100.424 | 1.004257 | 4.2184 | 2.9997 |
3 | 3 | 4.2069 | 3.5292 | 4 | 3.5000 | 3.3471 | 95.631 | 0.956314 | 4.2196 | 3.0000 |
4 | 4 | 4.2007 | 3.5224 | 5 | 3.5001 | 3.5461 | 101.315 | 1.013171 | 4.2748 | 2.9991 |
5 | 5 | 4.2513 | 3.4936 | 6 | 3.5001 | 0.0000 | 0.000 | 0.000000 | 4.2494 | 3.5931 |
from sklearn.metrics import mean_squared_error
# Score the random forest on the held-out split.
mse = mean_squared_error(test_labels, rfc_predictions)
# R^2 is computed on the same held-out data as the MSE; it was previously
# computed on the training split, which made the two numbers incomparable.
score = model.score(test_features, test_labels)
print("R-squared:", score)
print("MSE: ", mse)
# RMSE is the square root of the MSE (it was previously printed as MSE / 2)
print("RMSE: ", mse**(1/2.0))
R-squared: 0.655229440029654 MSE: 0.013747138922283401 RMSE: 0.006873569461141701
x_ax = range(len(test_labels))
plt.plot(x_ax, test_labels, linewidth=1, label="original")
plt.plot(x_ax, rfc_predictions, linewidth=1.1, label="predicted")
plt.title("y-test and y-predicted data")
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.legend(loc='best',fancybox=True, shadow=True)
plt.grid(True)
plt.show()
import shap
explainer = shap.TreeExplainer(model)
shap_val = explainer.shap_values(train_features)
# #shap.summary_plot(shap_val, XN.values,plot_type='bar')
#shap.plots.beeswarm(shap_val, max_display=20)
shap.summary_plot(shap_val, train_features)
from sklearn.tree import export_graphviz
estimator = model.estimators_[5]
# Export as dot file
export_graphviz(estimator, out_file='tree.dot',
feature_names = train_features.columns,
class_names = train_labels.columns,
rounded = True, proportion = False,
precision = 2, filled = True)
# Convert to png using system command (requires Graphviz)
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree.png')
c = df_final.columns.to_list()
print(c)
df_final[['Cycle ID1', 'Cap_Chg(mAh)1', 'Cap_DChg(mAh)1', 'Chg/DChg Efficiency(%)1']]
#df_final.reset_index(inplace=True)
#df_final.rename(columns={'CellID':'CellID'})
#x = df_final.pivot(columns='CellID')
#x.shape
df_train = df_final.copy()
df_train.drop(columns=drop_cycid,inplace=True)
df_train.fillna(0, inplace=True)
df_train.head()
# Drop features that are highly correlated with an earlier column.
cor = df_train.corr()
corr_threshold = 0.9  # absolute correlation at or above which the later column is dropped
abs_cor = cor.abs().to_numpy()
n_cols = abs_cor.shape[0]
keep_columns = np.full(n_cols, True)
for i in range(n_cols):
    # compare column i with every later column; the inner range previously
    # stopped one short, so the last column was never checked
    for j in range(i + 1, n_cols):
        if abs_cor[i, j] >= corr_threshold:
            keep_columns[j] = False
selected_columns = df_train.columns[keep_columns]
X_train_reduced = df_train[selected_columns]
X_train_reduced.shape
X_train_reduced.columns
X_train_reduced.tail()
#clustering the data
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os
import warnings
warnings.filterwarnings('ignore')
from matplotlib import style
style.use('dark_background')
from sklearn.cluster import KMeans
# Work on a copy, so adding the 'Labels' column below does not modify
# X_train_reduced (and is not a write into a selection of df_train).
X = X_train_reduced.copy()
# Elbow search: inertia of the k-means fit for 1 to 4 clusters.
clusters = [KMeans(n_clusters=k).fit(X).inertia_ for k in range(1, 5)]
fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(x=list(range(1, 5)), y=clusters, ax=ax)
ax.set_title('Searching for Elbow')
ax.set_xlabel('Clusters')
ax.set_ylabel('Inertia')
# Final clustering with 3 clusters; the labels are stored next to the features.
km3 = KMeans(n_clusters = 3).fit(X)
X['Labels'] = km3.labels_
plt.figure(figsize=(12, 8))
# x and y are passed by keyword, matching the lineplot call above; recent
# seaborn releases no longer accept them positionally.
sns.scatterplot(x=X['OCV_charge1'], y=X['Cap_Chg(mAh)3'], hue=X['Labels'],
                palette=sns.color_palette('hls', 3))
plt.title('KMeans with 3 Clusters')
plt.show()
X.head()
# Copy of the clustered features with the cell ID as a regular column.
t = X.copy()
# reset_index() turns the 'CellID' index into a column of that name; the
# identity rename that followed was a no-op whose result was discarded.
t.reset_index(inplace=True)
# 2x2 grid of feature pairs, coloured by cluster label
plt.figure(figsize=(12, 8))
panels = [
    ('Voltage_min1', 'OCV_charge1'),
    ('Voltage_max1', 'OCV_charge1'),
    ('Voltage_max1', 'Voltage_min1'),
    ('CellID', 'Voltage_min1'),
]
for position, (x_col, y_col) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    # x and y by keyword: recent seaborn releases reject positional data vectors
    sns.scatterplot(x=t[x_col], y=t[y_col], hue=t['Labels'],
                    palette=sns.color_palette('hls', 3))
    plt.title('KMeans with 3 Clusters')
plt.show()
plt.figure(figsize=(12, 8))
sns.scatterplot(x=t['OCV_charge1'], y=t['Cap_Chg(mAh)3'], hue=t['Labels'],
                palette=sns.color_palette('hls', 3))
plt.title('KMeans with 3 Clusters')
plt.show()
t['Voltage_min1']
t[['CellID','Labels']].sort_values(by='Labels')
from sklearn.decomposition import PCA
n=df_train.shape[0]
#Finding principal components for the data
pca =PCA(n_components=n, random_state=1)# Applying the PCA algorithm with random state = 1
# Fit and transform df_train (NOTE(review): df_train is not standardized at
# this point). The scores keep df_train's index, so they line up row by row
# with X in the concat below; a fresh 0..n-1 index would not match the cell IDs.
data_pca1 = pd.DataFrame(pca.fit_transform(df_train), index=df_train.index)
#The percentage of variance explained by each principal component
exp_var = pca.explained_variance_ratio_
plt.figure(figsize = (10,10))
# the x axis follows the actual number of components instead of a fixed 1..23
plt.plot(range(1, len(exp_var) + 1), exp_var.cumsum(), marker = 'o', linestyle = '--')
plt.title("Explained Variances by Components")
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
# finding the least number of components that can explain more than 90% variance
cumulative_var = 0  # named so that the builtin sum() is not shadowed
for ix, ratio in enumerate(exp_var):
    cumulative_var = cumulative_var + ratio
    if(cumulative_var>0.90):
        print("Number of PCs that explain at least 90% variance: ", ix+1)
        break
# loadings of the first three principal components on the original features
pc_comps = ['PC1','PC2','PC3']
data_pca = pd.DataFrame(np.round(pca.components_[:3,:],2),index=pc_comps,columns=df_train.columns)
data_pca.T
df_concat = pd.concat([data_pca1, X], axis=1)
plt.figure(figsize = (7,7))
#Create a scatter plot with x=0 and y=1 using df_concat dataframe
sns.scatterplot(x = 0, y = 1, data=df_concat, hue = 'OCV_charge1')
plt.xlabel("PC1")
plt.ylabel("PC2")
df_concat.head()