This notebook loads data and formats it for ingestion by several initial models. Baseline performance is established for prediction of future Glycemic Risk Index (GRI). Specifically, users are classified based on whether they will have a GRI above 40 in a future 2-week period, given the previous 2 weeks of daily-aggregated CGM data. Note that the GRI has been shown to correlate with clinicians' ratings of glycemia in the paper that introduced the index.
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams.update({'axes.labelsize':16, 'axes.titlesize':20})
import seaborn as sns
sns.set_style('whitegrid')
from sklearn.preprocessing import OneHotEncoder
users_info_df = pd.read_csv('../../data/users_info.csv')
users_ds_df = pd.read_csv('../../data/users_daily_stats.csv')
users_info_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4130 entries, 0 to 4129
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           4130 non-null   int64  
 1   ID                   4130 non-null   int64  
 2   DIABETES_TYPE        4130 non-null   object 
 3   GENDER               4130 non-null   object 
 4   DATE_OF_BIRTH        4130 non-null   object 
 5   CURRENT_AGE          4130 non-null   int64  
 6   AGE_WHEN_REGISTERED  4130 non-null   int64  
 7   APPLICATION_NAME     4128 non-null   object 
 8   OS                   4128 non-null   object 
 9   HAS_CGM              4130 non-null   bool   
 10  HAS_PUMP             4130 non-null   bool   
 11  HAS_METER            4130 non-null   bool   
 12  SYNC_COUNT           4130 non-null   int64  
 13  EVENT_COUNT          879 non-null    float64
 14  FOOD_COUNT           879 non-null    float64
 15  EXERCISE_COUNT       879 non-null    float64
 16  MEDICATION_COUNT     879 non-null    float64
dtypes: bool(3), float64(4), int64(5), object(5)
memory usage: 463.9+ KB
Create a variable for the length of Glooko app use, a HAS_OTHER_COUNT boolean, and use one-hot encoding on APPLICATION_NAME and OS (to separate those categorical variables into individual booleans). Note that APPLICATION_NAME is the application used to upload glucose data, while OS is the operating system used for the upload.
users_info_df['APP_LENGTH'] = users_info_df['CURRENT_AGE'] - \
users_info_df['AGE_WHEN_REGISTERED']
users_info_df['HAS_OTHER_COUNT'] = users_info_df['EVENT_COUNT'].notnull()
# One hot encode the APPLICATION_NAME variable
users_info_df = pd.concat(
[users_info_df,
users_info_df['APPLICATION_NAME'].str.get_dummies(sep=",")
], axis=1)
# One hot encode the OS variable
users_info_df = pd.concat(
[users_info_df,
users_info_df['OS'].str.get_dummies(sep=",")
], axis=1)
users_info_df.columns
Index(['Unnamed: 0', 'ID', 'DIABETES_TYPE', 'GENDER', 'DATE_OF_BIRTH', 'CURRENT_AGE', 'AGE_WHEN_REGISTERED', 'APPLICATION_NAME', 'OS', 'HAS_CGM', 'HAS_PUMP', 'HAS_METER', 'SYNC_COUNT', 'EVENT_COUNT', 'FOOD_COUNT', 'EXERCISE_COUNT', 'MEDICATION_COUNT', 'APP_LENGTH', 'HAS_OTHER_COUNT', 'c4c', 'kiosk', 'logbook', 'patient_uploader', 'server', 'android', 'browser', 'ios'], dtype='object')
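Note that str.get_dummies(sep=",") is used above (rather than the OneHotEncoder imported at the top, which goes unused) because a single APPLICATION_NAME cell can list several applications separated by commas. A toy sketch of the difference:
# Toy example: str.get_dummies splits multi-valued cells on the separator,
# whereas OneHotEncoder would treat "android,ios" as one combined category
demo = pd.Series(["android,ios", "ios", "browser"])
print(demo.str.get_dummies(sep=","))
#    android  browser  ios
# 0        1        0    1
# 1        0        0    1
# 2        0        1    0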
users_info_var_select = ['GENDER','CURRENT_AGE','HAS_PUMP','HAS_METER',
'SYNC_COUNT','APP_LENGTH','HAS_OTHER_COUNT',
'c4c','kiosk','logbook','patient_uploader',
'server','android','browser','ios']
users_ds_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375830 entries, 0 to 375829
Data columns (total 22 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Unnamed: 0               375830 non-null  int64  
 1   USER_ID                  375830 non-null  int64  
 2   TIME_CGM_ACTIVE          375830 non-null  float64
 3   AVERAGE_VALUE            375830 non-null  float64
 4   READING_COUNT            375830 non-null  int64  
 5   ABOVE_180                375830 non-null  int64  
 6   ABOVE_250                375830 non-null  int64  
 7   ABOVE_400                375830 non-null  int64  
 8   BELOW_50                 375830 non-null  int64  
 9   BELOW_54                 375830 non-null  int64  
 10  BELOW_60                 375830 non-null  int64  
 11  BELOW_70                 375830 non-null  int64  
 12  LOWEST_VALUE             375830 non-null  float64
 13  HIGHEST_VALUE            375830 non-null  float64
 14  DATE                     375830 non-null  object 
 15  TOTAL_INSULIN            156858 non-null  float64
 16  TOTAL_BASAL              156858 non-null  float64
 17  TOTAL_BOLUS              156858 non-null  float64
 18  HAS_REMOTE_SMBG_DATA     375649 non-null  object 
 19  HAS_REMOTE_CGM_DATA      375649 non-null  object 
 20  HAS_REMOTE_INSULIN_DATA  375649 non-null  object 
 21  HAS_IN_CLINIC_SYNC       375649 non-null  object 
dtypes: float64(7), int64(10), object(5)
memory usage: 63.1+ MB
Note that some users don't have 100% active time (and some users have over 100% active time). For this analysis we will assume the data we do have is representative of the entire day. Under this assumption, if there are 5 readings below 70 out of 100 readings for the day, the user spent 5% of that day's readings below range, and we will assume the user spent 5% of the entire day below range. In this section we will change the units of the ABOVE_* and BELOW_* variables from counts to % of time. Note that a typical continuous glucose monitor records a reading every five minutes, so an active time of 1.0 (100%) equals 288 readings (24 hr × 60 min/hr ÷ 5 min/reading = 288 readings).
users_ds_df['TIME_CGM_ACTIVE'].describe()
count    375830.000000
mean          0.999197
std           0.161408
min           0.003472
25%           1.000000
50%           1.000000
75%           1.000000
max           3.274306
Name: TIME_CGM_ACTIVE, dtype: float64
sns.histplot(users_ds_df['TIME_CGM_ACTIVE'], bins=30);
This is what the above/below count variables look like.
temp_vars = ['TIME_CGM_ACTIVE','ABOVE_180','ABOVE_250','ABOVE_400',
'BELOW_50','BELOW_54','BELOW_60','BELOW_70']
temp_vars.append('READING_COUNT')
users_ds_df[temp_vars].head(12)
TIME_CGM_ACTIVE | ABOVE_180 | ABOVE_250 | ABOVE_400 | BELOW_50 | BELOW_54 | BELOW_60 | BELOW_70 | READING_COUNT | |
---|---|---|---|---|---|---|---|---|---|
0 | 1.000000 | 87 | 22 | 0 | 0 | 0 | 2 | 5 | 288 |
1 | 1.000000 | 137 | 39 | 0 | 0 | 0 | 0 | 3 | 288 |
2 | 1.000000 | 13 | 0 | 0 | 0 | 0 | 0 | 6 | 288 |
3 | 0.986111 | 17 | 0 | 0 | 0 | 0 | 0 | 10 | 284 |
4 | 1.000000 | 27 | 0 | 0 | 0 | 4 | 12 | 27 | 288 |
5 | 1.003472 | 175 | 102 | 0 | 0 | 0 | 0 | 0 | 289 |
6 | 1.000000 | 0 | 0 | 0 | 0 | 0 | 8 | 27 | 288 |
7 | 1.000000 | 222 | 93 | 0 | 0 | 0 | 0 | 0 | 288 |
8 | 1.000000 | 52 | 0 | 0 | 0 | 0 | 0 | 3 | 288 |
9 | 1.000000 | 36 | 16 | 0 | 0 | 0 | 0 | 1 | 288 |
10 | 1.000000 | 131 | 37 | 0 | 0 | 0 | 0 | 0 | 288 |
11 | 0.979167 | 51 | 5 | 0 | 0 | 0 | 0 | 2 | 282 |
# Change the units of these reading-count variables from counts to % of time
read_cnt_vars = ['ABOVE_180','ABOVE_250','ABOVE_400',
'BELOW_50','BELOW_54','BELOW_60','BELOW_70']
for i,var in enumerate(read_cnt_vars):
#users_ds_df[var] = users_ds_df[var] / users_ds_df['TIME_CGM_ACTIVE']
users_ds_df[var] = round(100 * users_ds_df[var] /
users_ds_df['READING_COUNT'],2)
users_ds_df[temp_vars].head(12)
TIME_CGM_ACTIVE | ABOVE_180 | ABOVE_250 | ABOVE_400 | BELOW_50 | BELOW_54 | BELOW_60 | BELOW_70 | READING_COUNT | |
---|---|---|---|---|---|---|---|---|---|
0 | 1.000000 | 30.21 | 7.64 | 0.0 | 0.0 | 0.00 | 0.69 | 1.74 | 288 |
1 | 1.000000 | 47.57 | 13.54 | 0.0 | 0.0 | 0.00 | 0.00 | 1.04 | 288 |
2 | 1.000000 | 4.51 | 0.00 | 0.0 | 0.0 | 0.00 | 0.00 | 2.08 | 288 |
3 | 0.986111 | 5.99 | 0.00 | 0.0 | 0.0 | 0.00 | 0.00 | 3.52 | 284 |
4 | 1.000000 | 9.38 | 0.00 | 0.0 | 0.0 | 1.39 | 4.17 | 9.38 | 288 |
5 | 1.003472 | 60.55 | 35.29 | 0.0 | 0.0 | 0.00 | 0.00 | 0.00 | 289 |
6 | 1.000000 | 0.00 | 0.00 | 0.0 | 0.0 | 0.00 | 2.78 | 9.38 | 288 |
7 | 1.000000 | 77.08 | 32.29 | 0.0 | 0.0 | 0.00 | 0.00 | 0.00 | 288 |
8 | 1.000000 | 18.06 | 0.00 | 0.0 | 0.0 | 0.00 | 0.00 | 1.04 | 288 |
9 | 1.000000 | 12.50 | 5.56 | 0.0 | 0.0 | 0.00 | 0.00 | 0.35 | 288 |
10 | 1.000000 | 45.49 | 12.85 | 0.0 | 0.0 | 0.00 | 0.00 | 0.00 | 288 |
11 | 0.979167 | 18.09 | 1.77 | 0.0 | 0.0 | 0.00 | 0.00 | 0.71 | 282 |
users_ds_df["ABOVE_180"].describe()
count    375830.000000
mean         30.836868
std          24.832058
min           0.000000
25%          10.270000
50%          26.040000
75%          46.880000
max         100.000000
Name: ABOVE_180, dtype: float64
users_ds_df["BELOW_70"].describe()
count    375830.000000
mean          2.824835
std           5.234385
min           0.000000
25%           0.000000
50%           0.350000
75%           3.470000
max         100.000000
Name: BELOW_70, dtype: float64
The output above shows the ABOVE_* and BELOW_* variables after the units have been changed.
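As a quick spot check of the conversion, row 0 originally had 87 readings above 180 and 5 readings below 70 out of 288, which matches the 30.21% and 1.74% shown above:
# Spot-check the count -> % conversion using row 0 of the tables above
print(round(100 * 87 / 288, 2))  # 30.21 (ABOVE_180)
print(round(100 * 5 / 288, 2))   # 1.74 (BELOW_70)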
Now we will create the Glycemic Risk Index (GRI) variable. The GRI is defined by the formula below. Note that a larger value indicates higher glycemic risk, which is associated with negative health outcomes.
# Create variable for Glycemic Risk Index (GRI)
# GRI = (3.0 × VLow) + (2.4 × Low) + (1.6 × VHigh) + (0.8 × High)
# or alternatively: GRI = (3.0 × HypoComp) + (1.6 × HyperComp)
# where Hypoglycemia Component = VLow + (0.8 × Low)
# and Hyperglycemia Component = VHigh + (0.5 × High)
# and where VLow = % of time < 54, Low = % of time from 54 to < 70,
# VHigh = % of time > 250, and High = % of time from > 180 to 250
GRI_temp = pd.DataFrame()
GRI_temp['VLow'] = users_ds_df['BELOW_54']
GRI_temp['Low'] = users_ds_df['BELOW_70'] - users_ds_df['BELOW_54']
GRI_temp['VHigh'] = users_ds_df['ABOVE_250']
GRI_temp['High'] = users_ds_df['ABOVE_180'] - users_ds_df['ABOVE_250']
# Create HypoComp, HyperComp, and GRI for each user and day in the daily stats dataframe
users_ds_df['GRI_HYPO'] = GRI_temp['VLow'] + 0.8 * GRI_temp['Low']
users_ds_df['GRI_HYPER'] = GRI_temp['VHigh'] + 0.5 * GRI_temp['High']
users_ds_df['GRI'] = 3 * users_ds_df['GRI_HYPO'] \
+ 1.6 * users_ds_df['GRI_HYPER']
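As a sanity check of the formula, the GRI can be recomputed by hand for a single day; the values below are copied from row 0 of the converted table above:
# Hand-check the GRI formula on row 0 of the converted table above
vlow, low = 0.00, 1.74 - 0.00       # BELOW_54, BELOW_70 - BELOW_54
vhigh, high = 7.64, 30.21 - 7.64    # ABOVE_250, ABOVE_180 - ABOVE_250
gri_hypo = vlow + 0.8 * low         # 1.392
gri_hyper = vhigh + 0.5 * high      # 18.925
print(3 * gri_hypo + 1.6 * gri_hyper)  # 34.456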
# sort values by USER_ID and DATE
users_ds_df.sort_values(by=['USER_ID','DATE'], inplace=True)
users_ds_df.head(10).T
3828 | 4419 | 11018 | 15663 | 18285 | 20938 | 27770 | 31927 | 35205 | 39760 | |
---|---|---|---|---|---|---|---|---|---|---|
Unnamed: 0 | 3828 | 4419 | 11018 | 15663 | 18285 | 20938 | 27770 | 31927 | 35205 | 39760 |
USER_ID | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
TIME_CGM_ACTIVE | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.899306 | 0.996528 | 1.0 | 1.0 |
AVERAGE_VALUE | 218.4375 | 241.354167 | 257.315972 | 199.736111 | 192.989583 | 242.006944 | 304.602317 | 231.735192 | 238.920139 | 211.03125 |
READING_COUNT | 288 | 288 | 288 | 288 | 288 | 288 | 259 | 287 | 288 | 288 |
ABOVE_180 | 82.99 | 88.54 | 83.68 | 55.56 | 59.38 | 70.83 | 100.0 | 74.22 | 87.5 | 67.36 |
ABOVE_250 | 23.96 | 37.5 | 64.58 | 24.65 | 0.0 | 55.56 | 78.76 | 56.79 | 48.61 | 25.69 |
ABOVE_400 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.39 | 0.0 | 0.0 | 0.0 |
BELOW_50 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
BELOW_54 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
BELOW_60 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
BELOW_70 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.7 | 0.0 | 0.0 |
LOWEST_VALUE | 143.0 | 114.0 | 73.0 | 110.0 | 127.0 | 158.0 | 225.0 | 68.0 | 80.0 | 114.0 |
HIGHEST_VALUE | 296.0 | 383.0 | 372.0 | 308.0 | 250.0 | 325.0 | 401.0 | 324.0 | 323.0 | 322.0 |
DATE | 2021-04-01 | 2021-04-02 | 2021-04-03 | 2021-04-04 | 2021-04-05 | 2021-04-06 | 2021-04-07 | 2021-04-08 | 2021-04-09 | 2021-04-10 |
TOTAL_INSULIN | 22.15 | 22.45 | 22.5 | 20.2 | 20.2 | 20.2 | 29.0 | 21.3 | 24.75 | 21.25 |
TOTAL_BASAL | 18.0 | 15.7 | 18.0 | 18.0 | 18.0 | 16.6 | 19.75 | 18.0 | 20.9 | 18.0 |
TOTAL_BOLUS | 4.15 | 6.75 | 4.5 | 2.2 | 2.2 | 3.6 | 9.25 | 3.3 | 3.85 | 3.25 |
HAS_REMOTE_SMBG_DATA | False | False | False | False | False | False | False | False | False | False |
HAS_REMOTE_CGM_DATA | True | True | True | True | True | True | True | True | True | True |
HAS_REMOTE_INSULIN_DATA | True | True | True | True | True | True | True | True | True | True |
HAS_IN_CLINIC_SYNC | False | False | False | False | False | False | False | False | False | False |
GRI_HYPO | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.56 | 0.0 | 0.0 |
GRI_HYPER | 53.475 | 63.02 | 74.13 | 40.105 | 29.69 | 63.195 | 89.38 | 65.505 | 68.055 | 46.525 |
GRI | 85.56 | 100.832 | 118.608 | 64.168 | 47.504 | 101.112 | 143.008 | 106.488 | 108.888 | 74.44 |
users_ds_df[["LOWEST_VALUE","HIGHEST_VALUE"]].describe()
LOWEST_VALUE | HIGHEST_VALUE | |
---|---|---|
count | 375830.000000 | 375830.00000 |
mean | 74.545346 | 266.64763 |
std | 27.175515 | 66.25717 |
min | 39.000000 | 39.00000 |
25% | 57.000000 | 217.00000 |
50% | 69.000000 | 260.00000 |
75% | 86.000000 | 311.00000 |
max | 401.000000 | 401.00000 |
We also need to create the target (prediction) variables. We will use the average GRI over the last 14 days and the average GRI over the next (future) 14 days. Note that the GRI varies notably from day to day, which is why we use average values to get a sense of 'improvement' or 'worsening' of glycemic risk on a longer time scale.
users_ds_df[['DATE','GRI']].head(20)
DATE | GRI | |
---|---|---|
3828 | 2021-04-01 | 85.560 |
4419 | 2021-04-02 | 100.832 |
11018 | 2021-04-03 | 118.608 |
15663 | 2021-04-04 | 64.168 |
18285 | 2021-04-05 | 47.504 |
20938 | 2021-04-06 | 101.112 |
27770 | 2021-04-07 | 143.008 |
31927 | 2021-04-08 | 106.488 |
35205 | 2021-04-09 | 108.888 |
39760 | 2021-04-10 | 74.440 |
45270 | 2021-04-11 | 114.728 |
45880 | 2021-04-12 | 95.280 |
49751 | 2021-04-13 | 109.448 |
54032 | 2021-04-14 | 105.832 |
59229 | 2021-04-15 | 102.776 |
64875 | 2021-04-16 | 75.560 |
67948 | 2021-04-17 | 20.688 |
72719 | 2021-04-18 | 39.712 |
76365 | 2021-04-19 | 33.328 |
79754 | 2021-04-20 | 36.392 |
# Note: these rolling windows run over the full dataframe (sorted by USER_ID, DATE),
# so windows near a user's first/last days can include a neighboring user's rows
users_ds_df['GRI_PAST_2WK'] = users_ds_df['GRI'].rolling(14).mean()
indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=7)
users_ds_df['GRI_FUTURE_1WK'] = users_ds_df['GRI'].rolling(
window=indexer, min_periods=1).mean()
indexer2 = pd.api.indexers.FixedForwardWindowIndexer(window_size=14)
users_ds_df['GRI_FUTURE_2WK'] = users_ds_df['GRI'].rolling(
window=indexer2, min_periods=1).mean()
users_ds_df[['DATE','GRI','GRI_PAST_2WK','GRI_FUTURE_1WK','GRI_FUTURE_2WK']].head(20)
DATE | GRI | GRI_PAST_2WK | GRI_FUTURE_1WK | GRI_FUTURE_2WK | |
---|---|---|---|---|---|
3828 | 2021-04-01 | 85.560 | NaN | 94.398857 | 98.278286 |
4419 | 2021-04-02 | 100.832 | NaN | 97.388571 | 99.508000 |
11018 | 2021-04-03 | 118.608 | NaN | 98.539429 | 97.702857 |
15663 | 2021-04-04 | 64.168 | NaN | 92.229714 | 90.708571 |
18285 | 2021-04-05 | 47.504 | NaN | 99.452571 | 88.961714 |
20938 | 2021-04-06 | 101.112 | NaN | 106.277714 | 87.949143 |
27770 | 2021-04-07 | 143.008 | NaN | 107.468571 | 83.326286 |
31927 | 2021-04-08 | 106.488 | NaN | 102.157714 | 74.301714 |
35205 | 2021-04-09 | 108.888 | NaN | 101.627429 | 67.211429 |
39760 | 2021-04-10 | 74.440 | NaN | 96.866286 | 62.509143 |
45270 | 2021-04-11 | 114.728 | NaN | 89.187429 | 63.005714 |
45880 | 2021-04-12 | 95.280 | NaN | 78.470857 | 62.767429 |
49751 | 2021-04-13 | 109.448 | NaN | 69.620571 | 60.445714 |
54032 | 2021-04-14 | 105.832 | 98.278286 | 59.184000 | 54.684000 |
59229 | 2021-04-15 | 102.776 | 99.508000 | 46.445714 | 50.438286 |
64875 | 2021-04-16 | 75.560 | 97.702857 | 32.795429 | 46.548571 |
67948 | 2021-04-17 | 20.688 | 90.708571 | 28.152000 | 43.533143 |
72719 | 2021-04-18 | 39.712 | 88.961714 | 36.824000 | 45.249714 |
76365 | 2021-04-19 | 33.328 | 87.949143 | 47.064000 | 47.829714 |
79754 | 2021-04-20 | 36.392 | 83.326286 | 51.270857 | 46.699429 |
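A caveat on the rolling windows above: because they run over the concatenated rows of all users, windows near a user's first and last days can pick up a neighboring user's rows. (This does not affect the mid-series date analyzed below, since every user has data on every day.) A minimal per-user sketch using groupby, with illustrative column names, in case boundary-safe windows are ever needed:
# Sketch: per-user rolling averages that cannot cross user boundaries
grp = users_ds_df.groupby('USER_ID')['GRI']
users_ds_df['GRI_PAST_2WK_BY_USER'] = grp.transform(
    lambda s: s.rolling(14).mean())
fwd = pd.api.indexers.FixedForwardWindowIndexer(window_size=14)
users_ds_df['GRI_FUTURE_2WK_BY_USER'] = grp.transform(
    lambda s: s.rolling(window=fwd, min_periods=1).mean())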
users_ds_df[['TIME_CGM_ACTIVE','AVERAGE_VALUE','DATE',
'GRI_HYPO','GRI_HYPER','GRI']
][users_ds_df['USER_ID'] == 4036
].head(10)
TIME_CGM_ACTIVE | AVERAGE_VALUE | DATE | GRI_HYPO | GRI_HYPER | GRI | |
---|---|---|---|---|---|---|
7 | 1.000000 | 226.468750 | 2021-04-01 | 0.000 | 54.685 | 87.496 |
8185 | 0.906250 | 205.015326 | 2021-04-02 | 0.000 | 38.890 | 62.224 |
10716 | 1.000000 | 217.815972 | 2021-04-03 | 0.000 | 48.785 | 78.056 |
12841 | 1.000000 | 216.138889 | 2021-04-04 | 0.000 | 52.255 | 83.608 |
20569 | 1.000000 | 254.333333 | 2021-04-05 | 0.000 | 68.055 | 108.888 |
24205 | 0.972222 | 269.664286 | 2021-04-06 | 0.000 | 83.035 | 132.856 |
25602 | 0.993056 | 181.356643 | 2021-04-07 | 0.280 | 34.265 | 55.664 |
30774 | 1.000000 | 173.694444 | 2021-04-08 | 1.112 | 28.125 | 48.336 |
37067 | 0.972222 | 184.275000 | 2021-04-09 | 0.000 | 35.535 | 56.856 |
39937 | 0.972222 | 179.328571 | 2021-04-10 | 0.000 | 31.425 | 50.280 |
user_num = 900
single_user = users_ds_df[users_ds_df['USER_ID'] == user_num].sort_values(by="DATE")
single_user = single_user.set_index(single_user["DATE"])
sns.lineplot(data=single_user[[
"GRI","GRI_PAST_2WK","GRI_FUTURE_2WK"]].iloc[14:45]);
plt.xticks(rotation=45);
plt.ylabel("GRI");
The figure above shows how the GRI variables (daily, and 2-week past and future averages) change over a month-long period.
The figure below shows that on each of the days listed, every user in the dataset (4130 users in total) has data.
plt.hist(users_ds_df['DATE'][users_ds_df['DATE'] <= '2021-04-14'], bins=14)
plt.xticks(rotation=45);
plt.hist(users_ds_df['GRI_FUTURE_2WK'][users_ds_df['DATE'] == '2021-04-14']);
The figure above shows the distribution of the future 2-week average GRI for every user on a specific date. As shown below, the mean GRI is 39 and the median is 36.
users_ds_df['GRI_FUTURE_2WK'][users_ds_df['DATE'] == '2021-04-14'].describe()
count    4130.000000
mean       39.118552
std        22.869756
min         0.099429
25%        22.390893
50%        35.553286
75%        51.138250
max       152.405143
Name: GRI_FUTURE_2WK, dtype: float64
(users_ds_df['GRI_FUTURE_2WK'][users_ds_df['DATE'] == '2021-04-14'] > 40).sum()
1726
1726/4130
0.4179176755447942
(users_ds_df['GRI_PAST_2WK'][users_ds_df['DATE'] == '2021-04-14'] > 40).sum()
1707
((users_ds_df['GRI_PAST_2WK'][users_ds_df['DATE'] == '2021-04-14'] > 40)
& (users_ds_df['GRI_FUTURE_2WK'][users_ds_df['DATE'] == '2021-04-14'] > 40)).sum()
1455
((users_ds_df['GRI_PAST_2WK'][users_ds_df['DATE'] == '2021-04-14'] < 40)
& (users_ds_df['GRI_FUTURE_2WK'][users_ds_df['DATE'] == '2021-04-14'] > 40)).sum()
271
users_go_over_thd = ((users_ds_df['GRI_PAST_2WK'][users_ds_df['DATE'] == '2021-04-14'] < 40)
& (users_ds_df['GRI_FUTURE_2WK'][users_ds_df['DATE'] == '2021-04-14'] > 40)).reset_index()
((users_ds_df['GRI_PAST_2WK'][users_ds_df['DATE'] == '2021-04-14'] > 40)
& (users_ds_df['GRI_FUTURE_2WK'][users_ds_df['DATE'] == '2021-04-14'] < 40)).sum()
252
users_go_under_thd = ((users_ds_df['GRI_PAST_2WK'][users_ds_df['DATE'] == '2021-04-14'] > 40)
& (users_ds_df['GRI_FUTURE_2WK'][users_ds_df['DATE'] == '2021-04-14'] < 40)).reset_index()
users_stay_over_thd = ((users_ds_df['GRI_PAST_2WK'][users_ds_df['DATE'] == '2021-04-14'] > 40)
& (users_ds_df['GRI_FUTURE_2WK'][users_ds_df['DATE'] == '2021-04-14'] > 40)).reset_index()
users_stay_under_thd = ((users_ds_df['GRI_PAST_2WK'][users_ds_df['DATE'] == '2021-04-14'] < 40)
& (users_ds_df['GRI_FUTURE_2WK'][users_ds_df['DATE'] == '2021-04-14'] < 40)).reset_index()
users_stay_under_thd[0].head()
0    False
1    False
2    False
3     True
4    False
Name: 0, dtype: bool
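(The reset_index calls above turn each unnamed boolean Series into a dataframe whose boolean column is literally named 0, which is why it is accessed as users_stay_under_thd[0].)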
((users_ds_df['GRI_PAST_2WK'][users_ds_df['DATE'] == '2021-04-14'] < 40)
& (users_ds_df['GRI_FUTURE_2WK'][users_ds_df['DATE'] == '2021-04-14'] < 40)).sum()
2152
271/4130
0.06561743341404358
users_info_df["CURRENT_AGE"][users_go_over_thd[0]].describe()
count    271.000000
mean      42.579336
std       14.946577
min       18.000000
25%       31.000000
50%       41.000000
75%       54.000000
max       83.000000
Name: CURRENT_AGE, dtype: float64
users_ds_df.columns
Index(['Unnamed: 0', 'USER_ID', 'TIME_CGM_ACTIVE', 'AVERAGE_VALUE', 'READING_COUNT', 'ABOVE_180', 'ABOVE_250', 'ABOVE_400', 'BELOW_50', 'BELOW_54', 'BELOW_60', 'BELOW_70', 'LOWEST_VALUE', 'HIGHEST_VALUE', 'DATE', 'TOTAL_INSULIN', 'TOTAL_BASAL', 'TOTAL_BOLUS', 'HAS_REMOTE_SMBG_DATA', 'HAS_REMOTE_CGM_DATA', 'HAS_REMOTE_INSULIN_DATA', 'HAS_IN_CLINIC_SYNC', 'GRI_HYPO', 'GRI_HYPER', 'GRI', 'GRI_PAST_2WK', 'GRI_FUTURE_1WK', 'GRI_FUTURE_2WK'], dtype='object')
dateChoice = '2021-04-14'
# We select variables for a certain day, and then reset the index to the user_id
# Note: the index is the user_id because the data was ordered by user_id (so we don't need to set it explicitly)
X = users_ds_df[['TOTAL_INSULIN', 'TOTAL_BASAL', 'TOTAL_BOLUS',
'HAS_REMOTE_SMBG_DATA', 'HAS_REMOTE_CGM_DATA',
'HAS_REMOTE_INSULIN_DATA', 'HAS_IN_CLINIC_SYNC',
'GRI_FUTURE_2WK'
]][users_ds_df['DATE'] == dateChoice] \
.reset_index().drop(columns='index')
X.head()
TOTAL_INSULIN | TOTAL_BASAL | TOTAL_BOLUS | HAS_REMOTE_SMBG_DATA | HAS_REMOTE_CGM_DATA | HAS_REMOTE_INSULIN_DATA | HAS_IN_CLINIC_SYNC | GRI_FUTURE_2WK | |
---|---|---|---|---|---|---|---|---|
0 | 49.15 | 42.00 | 7.15 | False | True | True | False | 54.684000 |
1 | 16.80 | 16.80 | 0.00 | False | True | False | False | 92.694286 |
2 | 51.60 | 29.30 | 22.30 | False | True | False | False | 52.608857 |
3 | NaN | NaN | NaN | False | True | False | False | 17.202571 |
4 | 58.15 | 22.05 | 36.10 | False | True | False | False | 52.899429 |
X.shape
(4130, 8)
X.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4130 entries, 0 to 4129
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   TOTAL_INSULIN            1710 non-null   float64
 1   TOTAL_BASAL              1710 non-null   float64
 2   TOTAL_BOLUS              1710 non-null   float64
 3   HAS_REMOTE_SMBG_DATA     4128 non-null   object 
 4   HAS_REMOTE_CGM_DATA      4128 non-null   object 
 5   HAS_REMOTE_INSULIN_DATA  4128 non-null   object 
 6   HAS_IN_CLINIC_SYNC       4128 non-null   object 
 7   GRI_FUTURE_2WK           4130 non-null   float64
dtypes: float64(4), object(4)
memory usage: 258.2+ KB
The index of X is the user id. We will join the data together on ID.
# Remind ourselves of the variable names of the user data dataframe
users_info_df.columns
Index(['Unnamed: 0', 'ID', 'DIABETES_TYPE', 'GENDER', 'DATE_OF_BIRTH', 'CURRENT_AGE', 'AGE_WHEN_REGISTERED', 'APPLICATION_NAME', 'OS', 'HAS_CGM', 'HAS_PUMP', 'HAS_METER', 'SYNC_COUNT', 'EVENT_COUNT', 'FOOD_COUNT', 'EXERCISE_COUNT', 'MEDICATION_COUNT', 'APP_LENGTH', 'HAS_OTHER_COUNT', 'c4c', 'kiosk', 'logbook', 'patient_uploader', 'server', 'android', 'browser', 'ios'], dtype='object')
# First, create a variable of user data with only the columns we want
users_info_temp = users_info_df.drop(columns=[
"DIABETES_TYPE","DATE_OF_BIRTH","AGE_WHEN_REGISTERED","APPLICATION_NAME","OS",
"EVENT_COUNT","FOOD_COUNT","EXERCISE_COUNT","MEDICATION_COUNT","Unnamed: 0"
]).set_index("ID")
users_info_temp.head(10)
GENDER | CURRENT_AGE | HAS_CGM | HAS_PUMP | HAS_METER | SYNC_COUNT | APP_LENGTH | HAS_OTHER_COUNT | c4c | kiosk | logbook | patient_uploader | server | android | browser | ios | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ID | ||||||||||||||||
1332 | female | 19 | True | True | False | 1244 | 5 | False | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 |
1333 | male | 54 | True | True | False | 1739 | 3 | False | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 |
1334 | male | 48 | True | True | False | 1296 | 1 | False | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
1335 | female | 49 | True | True | True | 930 | 6 | False | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 |
1336 | male | 33 | True | True | False | 1221 | 4 | False | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
1337 | female | 28 | True | True | False | 1270 | 8 | False | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
1338 | female | 31 | True | True | False | 1260 | 6 | False | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 |
1339 | male | 46 | True | True | False | 2008 | 6 | False | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 1 |
1340 | male | 40 | True | True | True | 990 | 4 | False | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 |
1341 | female | 67 | True | True | False | 2317 | 2 | False | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 |
# Now, join the two dataframes
X = X.join(users_info_temp)
# Quick check to see that the data is the same - it is
X[X.columns[-16:]].iloc[1332:1337]
GENDER | CURRENT_AGE | HAS_CGM | HAS_PUMP | HAS_METER | SYNC_COUNT | APP_LENGTH | HAS_OTHER_COUNT | c4c | kiosk | logbook | patient_uploader | server | android | browser | ios | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1332 | female | 19 | True | True | False | 1244 | 5 | False | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 |
1333 | male | 54 | True | True | False | 1739 | 3 | False | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 |
1334 | male | 48 | True | True | False | 1296 | 1 | False | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
1335 | female | 49 | True | True | True | 930 | 6 | False | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 |
1336 | male | 33 | True | True | False | 1221 | 4 | False | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
In this section, we take past data from the last 2 weeks and place it into new variables all in the same row.
# For each of these variables, we will create one column per day of the past 2 weeks:
var_proc_2wk = ['TIME_CGM_ACTIVE','AVERAGE_VALUE','ABOVE_250','BELOW_54',
'LOWEST_VALUE','HIGHEST_VALUE','GRI']
temp = users_ds_df['DATE']
temp.head()
3828     2021-04-01
4419     2021-04-02
11018    2021-04-03
15663    2021-04-04
18285    2021-04-05
Name: DATE, dtype: object
# datetime.strptime parses a single string, not a Series; pd.to_datetime is the vectorized equivalent
temp2 = pd.to_datetime(temp, format='%Y-%m-%d')
# We make 14 dataframes, one per day for the last 14 days, each containing all users
dates = ['2021-04-14','2021-04-13','2021-04-12','2021-04-11','2021-04-10',
'2021-04-09','2021-04-08','2021-04-07','2021-04-06','2021-04-05',
'2021-04-04','2021-04-03','2021-04-02','2021-04-01']
for j in range(len(dates)):
globals()[f"data_{j}"] = users_ds_df[var_proc_2wk] \
[users_ds_df['DATE'] == dates[j]].reset_index()
# We will create 14 values for each variable, one each for each day in the past 2 weeks
for var in var_proc_2wk:
for j in range(len(dates)):
X[f"{var}_{j}"] = globals()[f"data_{j}"][var]
PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  X[f"{var}_{j}"] = globals()[f"data_{j}"][var]
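The PerformanceWarning above is pandas suggesting that the new columns be built in one shot instead of inserted one at a time. A sketch of an equivalent construction with pivot and a single concat, under the same assumption the loop makes (one row per user per date, users in ascending USER_ID order):
# Sketch: build all 14 daily columns at once with pivot + concat
wide = users_ds_df[users_ds_df['DATE'].isin(dates)].pivot(
    index='USER_ID', columns='DATE', values=var_proc_2wk)
# Rename e.g. ('GRI', '2021-04-14') -> 'GRI_0' to match the loop's naming
wide.columns = [f"{var}_{dates.index(d)}" for var, d in wide.columns]
X = pd.concat([X, wide.reset_index(drop=True)], axis=1)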
X.head()
TOTAL_INSULIN | TOTAL_BASAL | TOTAL_BOLUS | HAS_REMOTE_SMBG_DATA | HAS_REMOTE_CGM_DATA | HAS_REMOTE_INSULIN_DATA | HAS_IN_CLINIC_SYNC | GRI_FUTURE_2WK | GENDER | CURRENT_AGE | ... | GRI_4 | GRI_5 | GRI_6 | GRI_7 | GRI_8 | GRI_9 | GRI_10 | GRI_11 | GRI_12 | GRI_13 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 49.15 | 42.00 | 7.15 | False | True | True | False | 54.684000 | male | 57 | ... | 74.440 | 108.888 | 106.488 | 143.008 | 101.112 | 47.504 | 64.168 | 118.608 | 100.832 | 85.560 |
1 | 16.80 | 16.80 | 0.00 | False | True | False | False | 92.694286 | female | 47 | ... | 160.000 | 132.144 | 7.608 | 58.352 | 75.544 | 92.136 | 64.840 | 16.784 | 45.080 | 44.568 |
2 | 51.60 | 29.30 | 22.30 | False | True | False | False | 52.608857 | female | 18 | ... | 38.344 | 35.838 | 52.648 | 66.054 | 77.094 | 66.172 | 42.000 | 105.008 | 86.934 | 25.840 |
3 | NaN | NaN | NaN | False | True | False | False | 17.202571 | female | 28 | ... | 8.328 | 6.672 | 10.280 | 5.832 | 22.512 | 6.944 | 37.776 | 18.120 | 25.748 | 20.072 |
4 | 58.15 | 22.05 | 36.10 | False | True | False | False | 52.899429 | female | 31 | ... | 63.614 | 0.000 | 98.386 | 24.084 | 55.978 | 51.456 | 55.562 | 77.234 | 87.350 | 65.066 |
5 rows × 122 columns
# Now we create an additional value for each variable
# which is the average over the last 2 week period
for var in var_proc_2wk:
# create a list of column names:
names = []
for j in range(len(dates)):
names.append(f"{var}_{j}")
# Now take the average of these values and save it as a new variable
X[f"{var}_avg2wk"] = X[names].mean(axis=1)
X.head()
TOTAL_INSULIN | TOTAL_BASAL | TOTAL_BOLUS | HAS_REMOTE_SMBG_DATA | HAS_REMOTE_CGM_DATA | HAS_REMOTE_INSULIN_DATA | HAS_IN_CLINIC_SYNC | GRI_FUTURE_2WK | GENDER | CURRENT_AGE | ... | GRI_11 | GRI_12 | GRI_13 | TIME_CGM_ACTIVE_avg2wk | AVERAGE_VALUE_avg2wk | ABOVE_250_avg2wk | BELOW_54_avg2wk | LOWEST_VALUE_avg2wk | HIGHEST_VALUE_avg2wk | GRI_avg2wk | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 49.15 | 42.00 | 7.15 | False | True | True | False | 54.684000 | male | 57 | ... | 118.608 | 100.832 | 85.560 | 0.992560 | 235.341072 | 41.626429 | 0.000000 | 129.928571 | 328.000000 | 98.278286 |
1 | 16.80 | 16.80 | 0.00 | False | True | False | False | 92.694286 | female | 47 | ... | 16.784 | 45.080 | 44.568 | 0.724454 | 229.794895 | 34.435714 | 0.000000 | 144.357143 | 314.285714 | 76.854857 |
2 | 51.60 | 29.30 | 22.30 | False | True | False | False | 52.608857 | female | 18 | ... | 105.008 | 86.934 | 25.840 | 0.969246 | 161.175960 | 8.502857 | 1.960714 | 51.142857 | 299.857143 | 58.726714 |
3 | NaN | NaN | NaN | False | True | False | False | 17.202571 | female | 28 | ... | 18.120 | 25.748 | 20.072 | 0.982143 | 119.962507 | 0.797143 | 0.371429 | 60.000000 | 220.000000 | 16.206857 |
4 | 58.15 | 22.05 | 36.10 | False | True | False | False | 52.899429 | female | 31 | ... | 77.234 | 87.350 | 65.066 | 0.909722 | 158.483504 | 10.853571 | 2.951429 | 52.500000 | 287.071429 | 59.656000 |
5 rows × 129 columns
The insulin variables have NaNs. For now we will drop them and replace them with a variable indicating whether insulin data was recorded. In the future, we would like to use this data because it contains interesting information.
# 1 if insulin data was recorded for the user, 0 if missing
has_insulin = (X['TOTAL_INSULIN'].notna()).astype(int)
X.drop(columns=['TOTAL_INSULIN','TOTAL_BASAL','TOTAL_BOLUS'],
inplace=True)
X['HAS_INSULIN_DATA'] = has_insulin
# We will predict whether the average GRI over the next 2 weeks will be > 40
# Values above 40 fall into GRI Zone C or higher
y = (X.GRI_FUTURE_2WK > 40).astype(int)
y.value_counts()
# Approximately 42% of users are above this threshold
0    2404
1    1726
Name: GRI_FUTURE_2WK, dtype: int64
# We save the future data in a separate variable for later use
future_data = X['GRI_FUTURE_2WK']
# We remove the future data from X
X = X.drop(columns='GRI_FUTURE_2WK')
# There are a couple NaNs
X['HAS_REMOTE_CGM_DATA'].isna().sum()
2
# We will fill the NaNs with False values
X.fillna(False, inplace=True);
# And change the booleans to 0's and 1's
var_convert = ['HAS_REMOTE_SMBG_DATA', 'HAS_REMOTE_CGM_DATA',
'HAS_REMOTE_INSULIN_DATA', 'HAS_IN_CLINIC_SYNC',
'HAS_CGM','HAS_PUMP','HAS_METER','HAS_OTHER_COUNT']
for var in var_convert:
X[var] = (X[var]).astype(int)
# And change Gender so that Male = 1 and Female = 0
X["GENDER"].replace(["male","female"], [1,0], inplace=True)
X.shape
(4130, 126)
X.head()
HAS_REMOTE_SMBG_DATA | HAS_REMOTE_CGM_DATA | HAS_REMOTE_INSULIN_DATA | HAS_IN_CLINIC_SYNC | GENDER | CURRENT_AGE | HAS_CGM | HAS_PUMP | HAS_METER | SYNC_COUNT | ... | GRI_12 | GRI_13 | TIME_CGM_ACTIVE_avg2wk | AVERAGE_VALUE_avg2wk | ABOVE_250_avg2wk | BELOW_54_avg2wk | LOWEST_VALUE_avg2wk | HIGHEST_VALUE_avg2wk | GRI_avg2wk | HAS_INSULIN_DATA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 1 | 0 | 1 | 57 | 1 | 1 | 0 | 1209 | ... | 100.832 | 85.560 | 0.992560 | 235.341072 | 41.626429 | 0.000000 | 129.928571 | 328.000000 | 98.278286 | 1 |
1 | 0 | 1 | 0 | 0 | 0 | 47 | 1 | 1 | 0 | 895 | ... | 45.080 | 44.568 | 0.724454 | 229.794895 | 34.435714 | 0.000000 | 144.357143 | 314.285714 | 76.854857 | 1 |
2 | 0 | 1 | 0 | 0 | 0 | 18 | 1 | 1 | 0 | 1694 | ... | 86.934 | 25.840 | 0.969246 | 161.175960 | 8.502857 | 1.960714 | 51.142857 | 299.857143 | 58.726714 | 1 |
3 | 0 | 1 | 0 | 0 | 0 | 28 | 1 | 1 | 1 | 1794 | ... | 25.748 | 20.072 | 0.982143 | 119.962507 | 0.797143 | 0.371429 | 60.000000 | 220.000000 | 16.206857 | 0 |
4 | 0 | 1 | 0 | 0 | 0 | 31 | 1 | 1 | 0 | 1072 | ... | 87.350 | 65.066 | 0.909722 | 158.483504 | 10.853571 | 2.951429 | 52.500000 | 287.071429 | 59.656000 | 1 |
5 rows × 126 columns
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X,y,random_state=42, test_size=0.2)
Next, we will remove highly correlated features.
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(12, 10))
cor = X_train.corr()
sns.heatmap(cor, vmin=-1, vmax=1, cmap="PiYG");
import numpy as np
keep_columns = np.full(cor.shape[0], True)
for i in range(cor.shape[0] - 1):
    for j in range(i + 1, cor.shape[0]):  # scan all remaining columns, including the last one
        if np.abs(cor.iloc[i, j]) >= 0.8:  # 0.8 is the correlation threshold
            keep_columns[j] = False
selected_columns = X_train.columns[keep_columns]
X_train_reduced = X_train[selected_columns]
X_train_reduced.shape
(3304, 79)
# We also want to keep the 2-week average GRI variable if it is dropped
ind = [i for i, x in enumerate(X_train.columns == 'GRI_avg2wk') if x]
keep_columns[ind] = True
selected_columns = X_train.columns[keep_columns]
X_train_reduced = X_train[selected_columns]
X_test_reduced = X_test[selected_columns]
print("The feature set has been reduced to " +
f"{X_train_reduced.shape[1]} features " +
f"from {X.shape[1]} features.")
The feature set has been reduced to 80 features from 126 features.
X_train_reduced.columns
Index(['HAS_REMOTE_SMBG_DATA', 'HAS_REMOTE_CGM_DATA', 'HAS_REMOTE_INSULIN_DATA', 'HAS_IN_CLINIC_SYNC', 'GENDER', 'CURRENT_AGE', 'HAS_CGM', 'HAS_PUMP', 'HAS_METER', 'SYNC_COUNT', 'APP_LENGTH', 'HAS_OTHER_COUNT', 'c4c', 'kiosk', 'logbook', 'patient_uploader', 'server', 'android', 'browser', 'ios', 'TIME_CGM_ACTIVE_0', 'TIME_CGM_ACTIVE_1', 'TIME_CGM_ACTIVE_2', 'TIME_CGM_ACTIVE_3', 'TIME_CGM_ACTIVE_4', 'TIME_CGM_ACTIVE_5', 'TIME_CGM_ACTIVE_6', 'TIME_CGM_ACTIVE_7', 'TIME_CGM_ACTIVE_8', 'TIME_CGM_ACTIVE_9', 'TIME_CGM_ACTIVE_10', 'TIME_CGM_ACTIVE_11', 'TIME_CGM_ACTIVE_12', 'TIME_CGM_ACTIVE_13', 'AVERAGE_VALUE_0', 'AVERAGE_VALUE_1', 'AVERAGE_VALUE_2', 'AVERAGE_VALUE_3', 'AVERAGE_VALUE_4', 'AVERAGE_VALUE_5', 'AVERAGE_VALUE_6', 'AVERAGE_VALUE_7', 'AVERAGE_VALUE_8', 'AVERAGE_VALUE_9', 'AVERAGE_VALUE_10', 'AVERAGE_VALUE_11', 'AVERAGE_VALUE_12', 'AVERAGE_VALUE_13', 'BELOW_54_0', 'BELOW_54_1', 'BELOW_54_2', 'BELOW_54_3', 'BELOW_54_4', 'BELOW_54_5', 'BELOW_54_6', 'BELOW_54_7', 'BELOW_54_8', 'BELOW_54_9', 'BELOW_54_10', 'BELOW_54_11', 'BELOW_54_12', 'BELOW_54_13', 'LOWEST_VALUE_0', 'LOWEST_VALUE_1', 'LOWEST_VALUE_2', 'LOWEST_VALUE_3', 'LOWEST_VALUE_4', 'LOWEST_VALUE_5', 'LOWEST_VALUE_6', 'LOWEST_VALUE_7', 'LOWEST_VALUE_8', 'LOWEST_VALUE_9', 'LOWEST_VALUE_10', 'LOWEST_VALUE_11', 'LOWEST_VALUE_12', 'LOWEST_VALUE_13', 'BELOW_54_avg2wk', 'LOWEST_VALUE_avg2wk', 'GRI_avg2wk', 'HAS_INSULIN_DATA'], dtype='object')
X_train.columns[:70]
Index(['HAS_REMOTE_SMBG_DATA', 'HAS_REMOTE_CGM_DATA', 'HAS_REMOTE_INSULIN_DATA', 'HAS_IN_CLINIC_SYNC', 'GENDER', 'CURRENT_AGE', 'HAS_CGM', 'HAS_PUMP', 'HAS_METER', 'SYNC_COUNT', 'APP_LENGTH', 'HAS_OTHER_COUNT', 'c4c', 'kiosk', 'logbook', 'patient_uploader', 'server', 'android', 'browser', 'ios', 'TIME_CGM_ACTIVE_0', 'TIME_CGM_ACTIVE_1', 'TIME_CGM_ACTIVE_2', 'TIME_CGM_ACTIVE_3', 'TIME_CGM_ACTIVE_4', 'TIME_CGM_ACTIVE_5', 'TIME_CGM_ACTIVE_6', 'TIME_CGM_ACTIVE_7', 'TIME_CGM_ACTIVE_8', 'TIME_CGM_ACTIVE_9', 'TIME_CGM_ACTIVE_10', 'TIME_CGM_ACTIVE_11', 'TIME_CGM_ACTIVE_12', 'TIME_CGM_ACTIVE_13', 'AVERAGE_VALUE_0', 'AVERAGE_VALUE_1', 'AVERAGE_VALUE_2', 'AVERAGE_VALUE_3', 'AVERAGE_VALUE_4', 'AVERAGE_VALUE_5', 'AVERAGE_VALUE_6', 'AVERAGE_VALUE_7', 'AVERAGE_VALUE_8', 'AVERAGE_VALUE_9', 'AVERAGE_VALUE_10', 'AVERAGE_VALUE_11', 'AVERAGE_VALUE_12', 'AVERAGE_VALUE_13', 'ABOVE_250_0', 'ABOVE_250_1', 'ABOVE_250_2', 'ABOVE_250_3', 'ABOVE_250_4', 'ABOVE_250_5', 'ABOVE_250_6', 'ABOVE_250_7', 'ABOVE_250_8', 'ABOVE_250_9', 'ABOVE_250_10', 'ABOVE_250_11', 'ABOVE_250_12', 'ABOVE_250_13', 'BELOW_54_0', 'BELOW_54_1', 'BELOW_54_2', 'BELOW_54_3', 'BELOW_54_4', 'BELOW_54_5', 'BELOW_54_6', 'BELOW_54_7'], dtype='object')
X_train.columns[70:]
Index(['BELOW_54_8', 'BELOW_54_9', 'BELOW_54_10', 'BELOW_54_11', 'BELOW_54_12', 'BELOW_54_13', 'LOWEST_VALUE_0', 'LOWEST_VALUE_1', 'LOWEST_VALUE_2', 'LOWEST_VALUE_3', 'LOWEST_VALUE_4', 'LOWEST_VALUE_5', 'LOWEST_VALUE_6', 'LOWEST_VALUE_7', 'LOWEST_VALUE_8', 'LOWEST_VALUE_9', 'LOWEST_VALUE_10', 'LOWEST_VALUE_11', 'LOWEST_VALUE_12', 'LOWEST_VALUE_13', 'HIGHEST_VALUE_0', 'HIGHEST_VALUE_1', 'HIGHEST_VALUE_2', 'HIGHEST_VALUE_3', 'HIGHEST_VALUE_4', 'HIGHEST_VALUE_5', 'HIGHEST_VALUE_6', 'HIGHEST_VALUE_7', 'HIGHEST_VALUE_8', 'HIGHEST_VALUE_9', 'HIGHEST_VALUE_10', 'HIGHEST_VALUE_11', 'HIGHEST_VALUE_12', 'HIGHEST_VALUE_13', 'GRI_0', 'GRI_1', 'GRI_2', 'GRI_3', 'GRI_4', 'GRI_5', 'GRI_6', 'GRI_7', 'GRI_8', 'GRI_9', 'GRI_10', 'GRI_11', 'GRI_12', 'GRI_13', 'TIME_CGM_ACTIVE_avg2wk', 'AVERAGE_VALUE_avg2wk', 'ABOVE_250_avg2wk', 'BELOW_54_avg2wk', 'LOWEST_VALUE_avg2wk', 'HIGHEST_VALUE_avg2wk', 'GRI_avg2wk', 'HAS_INSULIN_DATA'], dtype='object')
# Load the tools needed for model evaluation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Function to print the accuracy and classification report
def print_acc_and_CR(y_test, model_predictions):
# Calculate and display the model accuracy
print('Accuracy: {:.2f}%'.format(accuracy_score(y_test, model_predictions) * 100))
# Calculate and display the classification report for the model
print('Classification report: \n', classification_report(y_test, model_predictions))
# Function to display the confusion matrix
def display_confusion_matrix(y_test, model_predictions):
# Calculate and display the confusion matrix
model_confusionMatrix = confusion_matrix(y_test, model_predictions)
# Create variables that will be displayed as text on the plot
strings2 = np.asarray([['True Negatives \n', 'False Positives \n'], ['False Negatives \n', 'True Positives \n']])
labels2 = (np.asarray(["{0} {1:g}".format(string, value)
for string, value in zip(strings2.flatten(),model_confusionMatrix.flatten())])
).reshape(2, 2)
# Use a heat map plot to display the results
sns.heatmap(model_confusionMatrix, annot=labels2, fmt='', vmin=0, annot_kws={"fontsize":17})
plt.xlabel('Predicted value');
plt.ylabel('Actual value');
Baseline: we assume users with a past 2-week average GRI below 40 will stay below 40 and users with GRI above 40 will stay above 40 (i.e., no change in GRI in the future).
basic_predictions = X_test_reduced['GRI_avg2wk'] > 40
print_acc_and_CR(y_test, basic_predictions)
Accuracy: 86.32%
Classification report: 
              precision    recall  f1-score   support

           0       0.87      0.89      0.88       461
           1       0.85      0.84      0.84       365

    accuracy                           0.86       826
   macro avg       0.86      0.86      0.86       826
weighted avg       0.86      0.86      0.86       826
display_confusion_matrix(y_test, basic_predictions)
# 53 false positives + 60 false negatives from the confusion matrix above
fraction_users_crossing_GRI_40 = (53 + 60) / len(y_test)
Only about 14% of users cross the GRI = 40 threshold between the past and future 2-week averages. This is a small fraction of the total number of users, indicating that most users stay in a similar GRI range. It also indicates that we should expect to get most of our ML models' predictive power out of this single variable (GRI_avg2wk).
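Rather than hard-coding the two counts, the crossing fraction can be read off the baseline's confusion matrix, since the baseline's errors are exactly the users who cross the threshold:
# The baseline's errors are exactly the threshold-crossing users
cm = confusion_matrix(y_test, basic_predictions)
print((cm[0, 1] + cm[1, 0]) / cm.sum())  # ~0.137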
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
# Instantiate and fit the model
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train_reduced, y_train)
# Use the model to predict whether the future 2-week average GRI exceeds 40
rfc_predictions = rfc.predict(X_test_reduced)
print_acc_and_CR(y_test, rfc_predictions)
Accuracy: 86.92%
Classification report: 
              precision    recall  f1-score   support

           0       0.88      0.88      0.88       461
           1       0.85      0.85      0.85       365

    accuracy                           0.87       826
   macro avg       0.87      0.87      0.87       826
weighted avg       0.87      0.87      0.87       826
display_confusion_matrix(y_test, rfc_predictions)
# Set a number of features to consider when looking at rank/importance
n_features = 20
# Use Random Forest to get feature ranks/importances for each feature
importances = rfc.feature_importances_
std = np.std([tree.feature_importances_ for tree in rfc.estimators_],
axis=0)
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
# Look at importance for first 20 features
for f in range(n_features):#range(X_train_reduced.shape[1]):
print("%d. %s (feature %d) (%f)" %
(f + 1, X_train_reduced.columns[indices[f]], indices[f], importances[indices[f]]))
# Plot the impurity-based feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(n_features), importances[indices[:n_features]],
color="r", yerr=std[indices[:n_features]], align="center")
plt.xticks(range(n_features), indices[:n_features])
plt.xlim([-1, n_features]);
Feature ranking:
1. GRI_avg2wk (feature 78) (0.209625)
2. AVERAGE_VALUE_0 (feature 34) (0.070902)
3. AVERAGE_VALUE_1 (feature 35) (0.044240)
4. AVERAGE_VALUE_2 (feature 36) (0.041567)
5. AVERAGE_VALUE_12 (feature 46) (0.037425)
6. AVERAGE_VALUE_5 (feature 39) (0.037363)
7. AVERAGE_VALUE_3 (feature 37) (0.036208)
8. AVERAGE_VALUE_8 (feature 42) (0.035234)
9. AVERAGE_VALUE_9 (feature 43) (0.034251)
10. AVERAGE_VALUE_13 (feature 47) (0.030140)
11. AVERAGE_VALUE_6 (feature 40) (0.029346)
12. AVERAGE_VALUE_10 (feature 44) (0.025226)
13. AVERAGE_VALUE_4 (feature 38) (0.024881)
14. AVERAGE_VALUE_11 (feature 45) (0.024741)
15. AVERAGE_VALUE_7 (feature 41) (0.022315)
16. BELOW_54_avg2wk (feature 76) (0.015694)
17. LOWEST_VALUE_avg2wk (feature 77) (0.013761)
18. LOWEST_VALUE_8 (feature 70) (0.011641)
19. LOWEST_VALUE_6 (feature 68) (0.010170)
20. LOWEST_VALUE_0 (feature 62) (0.009773)
# Import the tool needed
from sklearn.linear_model import LogisticRegression
# Instantiate and train the model
logregmodel = LogisticRegression(max_iter=100000, class_weight="balanced")
logregmodel.fit(X_train_reduced,y_train)
# Make predictions
logregmodel_predictions = logregmodel.predict(X_test_reduced)
print_acc_and_CR(y_test, logregmodel_predictions)
Accuracy: 86.68%
Classification report: 
              precision    recall  f1-score   support

           0       0.89      0.87      0.88       461
           1       0.84      0.86      0.85       365

    accuracy                           0.87       826
   macro avg       0.86      0.87      0.87       826
weighted avg       0.87      0.87      0.87       826
display_confusion_matrix(y_test, logregmodel_predictions)
The default parameters for the random forest classifier and the logistic regression model give similar performance.
# We import the modules we need from sci-kit learn
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
import operator
# We will normalize our data first, so that our coefficients will conceptually relate to each variable's importance
X_train_std = StandardScaler().fit_transform(X_train_reduced)
# Stratified K-Fold is a version of K-Fold that keeps class ratios, we will separate the training data into 8 sets
K = 8
k_fold = StratifiedKFold(n_splits=K)
# Instantiate the logistic regression model
# Note that we need to increase the maximum number of iterations from the default to get the analysis to run
lr_model = LogisticRegression(max_iter=600)
# Run the recursive feature elimination with cross validation analysis and fit the model
rfecv = RFECV(estimator=lr_model, step=1, cv=k_fold, scoring='accuracy')
rfecv.fit(X_train_std, y_train)
RFECV(cv=StratifiedKFold(n_splits=8, random_state=None, shuffle=False), estimator=LogisticRegression(max_iter=600), scoring='accuracy')
# Create a plot to show the model accuracy as features are added one at a time
plt.figure()
plt.title('Recursive Feature Elimination with Logistic Regression')
plt.xlabel('Number of selected features')
plt.ylabel('{}-Fold Validation Accuracy'.format(K))
# cv_results_['mean_test_score'] holds the mean CV accuracy for each number of selected features
# (the older grid_scores_ attribute is deprecated)
plt.plot(range(1, len(rfecv.cv_results_['mean_test_score']) + 1),
         rfecv.cv_results_['mean_test_score']);
#plt.grid(True)
plt.xticks(np.arange(1, 65, step=4));
As expected, most of the predictive power is derived from the first feature -- the average GRI over the past 2 weeks. This isn't surprising, since very few users cross the GRI = 40 threshold in the test dataset, as mentioned above. Although we have many features that correspond to previous days' data, none of these features appears to give a significant amount of predictive power on its own. However, if a small amount of predictive power could be gained by combining several of these features, we might expect a gradient-boosted algorithm to perform a little better. If there is no significant predictive power in these other features, then a gradient-boosted classifier may not provide a better prediction.
from xgboost import XGBClassifier
# Instantiate and fit the model
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train_reduced, y_train)
# Make predictions
xgb_predictions = xgb.predict(X_test_reduced)
print_acc_and_CR(y_test, xgb_predictions)
Accuracy: 85.71%
Classification report: 
              precision    recall  f1-score   support

           0       0.86      0.89      0.87       461
           1       0.86      0.81      0.83       365

    accuracy                           0.86       826
   macro avg       0.86      0.85      0.85       826
weighted avg       0.86      0.86      0.86       826
display_confusion_matrix(y_test, xgb_predictions)
The gradient boosted classifier does not provide any gain over use of logistic regression or random forest classifier ML models. This suggests that the ability to predict future GRI above 40 may be a trivial problem, largely solved by looking at past GRI, or that we have not found features which contain the information needed to create such a prediction.
We'll look at the users whose future 2-week average GRI is above 40 (y = 1), and at users overall.
crossing_users_indices = [i for i, x in enumerate(y) if x == 1]
GRI_changes_crossing_users = future_data[crossing_users_indices] - \
X['GRI_avg2wk'][crossing_users_indices]
GRI_changes_crossing_users.describe()
count    1726.000000
mean        1.380259
std        12.091505
min       -59.122286
25%        -5.601107
50%         1.709714
75%         8.880214
max        44.537571
dtype: float64
sns.histplot(x=GRI_changes_crossing_users)
plt.xlabel('Change in GRI')
plt.title('Change in 2 Week Average GRI \n for users that crossed GRI = 40');
As shown above, for the 1726 users (out of 4130) whose future 2-week average GRI is above 40, the average change in GRI is small (1.4). However, some users do have large changes in GRI! It will be interesting to try to capture something about those users. Below, we plot a similar histogram of the change in GRI, but for all users.
GRI_changes_all_users = future_data - X['GRI_avg2wk']
GRI_changes_all_users.describe()
count    4130.000000
mean       -0.111433
std         9.864801
min       -69.073714
25%        -5.417964
50%         0.051143
75%         5.547179
max        44.537571
dtype: float64
sns.histplot(x=GRI_changes_all_users)
plt.xlabel('Change in GRI')
plt.title('Change in 2 Week Average GRI \n for all users');
# Let's calculate the fraction of users that have a GRI that increases by at least 10
# this may be something that is more interesting and clinically useful to predict
((future_data - X['GRI_avg2wk']) > 10).astype(int).sum() / len(X)
0.12033898305084746
from supervised.automl import AutoML
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
automl = AutoML(algorithms=["Extra Trees","Xgboost",'Random Forest'],
                train_ensemble=False, explain_level=2)
automl.fit(X_train_reduced, y_train)
auto_pred = automl.predict(X_test_reduced)
print("accuracy:", accuracy_score(y_test, auto_pred))
print("precision:", precision_score(y_test, auto_pred, average='macro'))
print("recall:", recall_score(y_test, auto_pred, average='macro'))
print("f1 score:", f1_score(y_test, auto_pred, average='macro'))
print("confusion matrix:")
print(confusion_matrix(y_test, auto_pred))
automl.get_leaderboard()
%%time
from tpot import TPOTClassifier
tpot = TPOTClassifier(generations=4, # set to 4 to run faster for testing
population_size=40,
scoring='neg_log_loss',
verbosity=2,
random_state=42)
tpot.fit(X_train_reduced, y_train)
Generation 1 - Current best internal CV score: -0.3240182649839661
Generation 2 - Current best internal CV score: nan
Generation 3 - Current best internal CV score: nan
Generation 4 - Current best internal CV score: nan

Best pipeline: GradientBoostingClassifier(input_matrix, learning_rate=0.1, max_depth=2, max_features=1.0, min_samples_leaf=8, min_samples_split=5, n_estimators=100, subsample=0.6500000000000001)
CPU times: user 17min 42s, sys: 5min 28s, total: 23min 11s
Wall time: 14min 23s
TPOTClassifier(generations=4, population_size=40, random_state=42, scoring='neg_log_loss', verbosity=2)
print(f"TPOT score on test data: {tpot.score(X_test_reduced, y_test):.2f}")
tpot.export('tpot_GRI_pipeline.py')
TPOT score on test data: -0.35