Commit 5f0b3ee7, authored by jeeheh: Initial Commit

cv.py
'''
Updated: February 1, 2018
Written by: Jeeheh Oh
Purpose: Performs stratified k-fold cross-validation on the training set to determine the optimal value of the C hyperparameter, which controls L2 regularization strength in logistic regression.
How to use:
-- a. Complete reference.py
-- b. Run this script once for each candidate hyperparameter value (eg from the terminal: python cv.py cindex=0, python cv.py cindex=1, ..., python cv.py cindex=14). The terminal is not required: the cindex variable can also be changed in the ## Set Variables ## section of this script. Results are aggregated in model.py. A sketch of a driver loop follows this docstring.
Saved Outputs:
-- auc_cv: Cross-validation AUC, shape = len(c_param) x nfolds. Only the row corresponding to c is populated.
-- auc_train_ef: Training AUC, scalar
-- auc_test_ef: Test AUC, scalar
-- c: C parameter used
-- c_param: List of C parameters in the grid search
'''
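# A sketch (not part of the original commit) of driving the full cindex sweep
# from one script instead of launching each run by hand; it assumes cv.py is
# in the current working directory:
#
#   import subprocess
#   for ci in range(15):  #one run per value in c_param
#       subprocess.run(['python','cv.py','cindex={}'.format(ci)],check=True)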
import numpy as np
import pandas as pd
import pickle
import sys
import reference as reference
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from scipy.sparse import csr_matrix
## Set Variables ##
pathout,randseed,c_param,nfolds=reference.returnVars()
np.random.seed(randseed)
# Which C parameter is tested in this run
cindex=0 #valid values: 0-14
# Allows variables to be overridden from the terminal, eg: python cv.py cindex=1
# Note: exec runs each argv string verbatim, so this is intended only for simple assignments like cindex=1
args=sys.argv[1:]
for arg in args:
    exec(arg)
############################################################################################
############################################################################################
# Import data
xtrain,ytrain,xtest,ytest,eid_train,eid_test,day_train,day_test=reference.loadData()
# Returns AUC score using the maximum estimate per patient.
def scorer(estimator,X,y,eid):
    proba=estimator.decision_function(X)
    df=pd.DataFrame({'eid':eid,'proba':proba,'label':y})
    df=df.groupby(df['eid'],as_index=False).max() #keep each admission's maximum score and label
    fpr, tpr, thresholds=metrics.roc_curve(df.label,df.proba,pos_label=1)
    return metrics.auc(fpr,tpr)
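# eg: with eid=[1,1,2], proba=[0.2,0.9,0.4] and label=[1,1,0], the groupby-max
# collapses the day-level rows to one row per admission, (eid=1, proba=0.9,
# label=1) and (eid=2, proba=0.4, label=0), and the ROC/AUC is computed on
# those per-admission maxima.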
# Stratified K Fold, Clustered on Patient
dfskf=pd.DataFrame({'eid':eid_train,'y':ytrain}) #create dataframe of eid and y
dfskf.drop_duplicates(subset='eid',inplace=True) #make it unique by eid
skf=StratifiedKFold(n_splits=nfolds)
index=1
dfskf['fold']=np.ones(dfskf.shape[0]) #'fold' indicates which fold each eid is in
for _,test_index in skf.split(dfskf.eid,dfskf.y):
    dfskf.iloc[test_index,dfskf.columns.get_loc("fold")]=index
    index=index+1
eidyear_train=pd.DataFrame({'eid':eid_train}) #now translate the 'fold' to row indices
eidyear_train['index']=eidyear_train.index
eidyear_train=eidyear_train.merge(dfskf,how='left',on='eid',indicator=True)
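# Optional sanity check (not in the original commit): folds are assigned at
# the eid level, so no admission should appear in more than one fold.
assert eidyear_train.groupby('eid')['fold'].nunique().max()==1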
# CV
c=c_param[cindex]
auc_cv=np.empty((len(c_param),nfolds))
auc_cv[:]=np.nan
clf=linear_model.LogisticRegression(penalty='l2',class_weight='balanced',C=c)
for foldindex, fold in enumerate(np.arange(nfolds)+1):
    #Row index lists for this fold's CV train and validation splits
    cvtrain_idx=eidyear_train.loc[eidyear_train.fold!=fold,:]
    cvtest_idx=eidyear_train.loc[eidyear_train.fold==fold,:]
    cvtrain_idx2=cvtrain_idx['index'].tolist()
    cvtest_idx2=cvtest_idx['index'].tolist()
    zx,zy=reference.subsample(eid_train[cvtrain_idx2],day_train[cvtrain_idx2],xtrain[cvtrain_idx2,:],ytrain[cvtrain_idx2])
    clf.fit(zx,zy)
    auc_cv[cindex,foldindex]=scorer(clf,xtrain[cvtest_idx2,:],ytrain[cvtest_idx2],eid_train[cvtest_idx2])
# Optional: Calculates the train and test AUC in addition to the CV AUC.
clf=linear_model.LogisticRegression(penalty='l2',class_weight='balanced',C=c)
zx,zy=reference.subsample(eid_train,day_train,xtrain,ytrain)
clf.fit(zx,zy)
auc_train_ef=scorer(clf,xtrain,ytrain,eid_train)
auc_test_ef=scorer(clf,xtest,ytest,eid_test)
with open(pathout+"cv_c"+str(cindex)+"_auc.pickle","wb") as f:
    pickle.dump([auc_cv,auc_train_ef,auc_test_ef,c,c_param],f)
model.py

'''
Updated: February 1, 2018
Written by: Jeeheh Oh
Purpose: Trains the final logistic regression model on the complete training set using the optimal C value.
How to use:
-- a. Complete reference.py
-- b. Run
Saved Outputs:
-- auc_cv: Cross-validation AUC, complete for all values of c, shape = len(c_param) x nfolds
-- c: optimal C hyperparameter value
-- clf: final classifier
-- df, dftr: Data frames for test and training containing columns for: eid, day, rawest (raw score from logistic regression), label, and rollingscore (the rolling average of the raw score). Data is at the day level of granularity.
-- df2, dftr2: Data frames for test and training containing eid, label and rollingscore. Data is at the admission level; the maximum rolling score is kept for each admission.
'''
import numpy as np
import pandas as pd
import reference as reference
import pickle
import sys
from sklearn import linear_model
from sklearn import metrics
from scipy.sparse import csr_matrix
pathout,randseed,c_param,nfolds=reference.returnVars()
np.random.seed(randseed)
# Import data
xtrain,ytrain,xtest,ytest,eid_train,eid_test,day_train,day_test=reference.loadData()
# Import data: CV
# Each cv.py run saved: auc_cv,auc_train_ef,auc_test_ef,c,c_param.
# Only row i of the auc_cv saved by run cindex=i is populated, so the rows are stitched together here.
auc_cv=np.empty((len(c_param),nfolds))
auc_cv[:]=np.nan
auc_train_ef=[]
auc_test_ef=[]
for i in range(len(c_param)):
    with open(pathout+"cv_c"+str(i)+"_auc.pickle","rb") as f:
        zauc_cv,zauc_train_ef,zauc_test_ef,_,_=pickle.load(f)
    auc_cv[i,:]=zauc_cv[i,:]
    auc_train_ef.append(zauc_train_ef)
    auc_test_ef.append(zauc_test_ef)
# Find best c
c=c_param[np.argmax(np.mean(auc_cv,axis=1))]
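# eg: if the fold-averaged CV AUC peaks at row index 9, the selected value is c_param[9]=5**(-5)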
# Learn Model
clf=linear_model.LogisticRegression(penalty='l2',class_weight='balanced',C=c)
zx,zy=reference.subsample(eid_train,day_train,xtrain,ytrain)
clf.fit(zx,zy)
# Create Output Dataframe
idx=np.where(clf.classes_==1)[0]
test_rawest=clf.predict_proba(xtest)[:,idx]
train_rawest=clf.predict_proba(xtrain)[:,idx]
df=pd.DataFrame({'encounterID':eid_test,'day':day_test,'rawest':test_rawest.reshape((len(test_rawest),)),'label':ytest.reshape((len(ytest),))})
dftr=pd.DataFrame({'encounterID':eid_train,'day':day_train,'rawest':train_rawest.reshape((len(train_rawest),)),'label':ytrain.reshape((len(ytrain),))})
#Smoothing: rollingscore on day d is the running mean of the raw scores from days 1 through d (assumes days are numbered 1,2,3,... with no gaps, per loadData)
df=df.sort_values(['encounterID','day'],ascending=[True,True])
df['csum']=df['rawest'].groupby(df['encounterID']).cumsum()
df['rollingscore']=df['csum']/df['day']
dftr=dftr.sort_values(['encounterID','day'],ascending=[True,True])
dftr['csum']=dftr['rawest'].groupby(dftr['encounterID']).cumsum()
dftr['rollingscore']=dftr['csum']/dftr['day']
df2=df.loc[df.day>=2,['rollingscore','label','encounterID']].groupby('encounterID',as_index=False).max()
dftr2=dftr.loc[dftr.day>=2,['rollingscore','label','encounterID']].groupby('encounterID',as_index=False).max()
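# eg: for one admission with day=[1,2,3] and rawest=[0.3,0.6,0.9], csum is
# [0.3,0.9,1.8] and rollingscore is [0.3,0.45,0.6]; df2 then keeps that
# admission's maximum rolling score over days >=2, here 0.6.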
# Save Data
with open(pathout+'learn_model.pickle','wb') as f:
    pickle.dump([auc_cv,c,clf,df,df2,dftr,dftr2],f)
reference.py
import numpy as np
import pandas as pd
# Fill out the desired variable settings
def returnVars():
    #Location where you want output to be saved to
    pathout='/../'
    #subsample random seed
    randseed= #eg: 11
    # C parameter Grid Search Values
    c_param=[5**(-14),5**(-13),5**(-12),5**(-11),5**(-10),5**(-9),5**(-8),5**(-7),5**(-6),5**(-5),5**(-4),5**(-3),5**(-2),5**(-1),5**(0)]
    # Number of folds for the Cross Validation
    nfolds=5
    return pathout,randseed,c_param,nfolds
# Complete this function so that it returns the appropriate data variables
def loadData():
    # m = dimensionality of features. eg: If our observations consist only of Heart Rate and Blood Pressure, m=2.
    # n_train, n_test = number of patient days we have data for in the training and test sets. eg: If our training dataset consists of (Patient A, LOS=3) and (Patient B, LOS=2), then n_train=5.
    # eid = Admission ID/Encounter ID. Each admission should have a unique ID, even for a returning patient.
    # day = Day of observation, relative to the admission day. eg: 1, 2, 3, 4, etc.
    xtrain= #Data: scipy sparse csr matrix of size n_train x m
    ytrain= #Labels: numpy array of size n_train
    xtest= #Data: scipy sparse csr matrix of size n_test x m
    ytest= #Labels: numpy array of size n_test
    eid_train= #eids corresponding to each row of xtrain: numpy array of size n_train
    eid_test= #eids corresponding to each row of xtest: numpy array of size n_test
    day_train= #days corresponding to each row of xtrain: numpy array of size n_train
    day_test= #days corresponding to each row of xtest: numpy array of size n_test
    return xtrain,ytrain,xtest,ytest,eid_train,eid_test,day_train,day_test
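# A minimal synthetic completion of loadData (an illustration, not the study
# data) that can be used to smoke-test cv.py and model.py end to end:
#
#   from scipy.sparse import csr_matrix, random as sparse_random
#   def loadData():
#       rng=np.random.RandomState(0)
#       m=20                                     #20 features
#       xtrain=csr_matrix(sparse_random(500,m,density=0.1,random_state=rng))
#       xtest=csr_matrix(sparse_random(200,m,density=0.1,random_state=rng))
#       ytrain=rng.randint(0,2,500)              #random labels, n_train=500
#       ytest=rng.randint(0,2,200)
#       eid_train=np.repeat(np.arange(100),5)    #100 admissions, LOS=5 each
#       eid_test=np.repeat(np.arange(100,140),5) #40 admissions, LOS=5 each
#       day_train=np.tile(np.arange(1,6),100)    #days numbered 1..5
#       day_test=np.tile(np.arange(1,6),40)
#       return xtrain,ytrain,xtest,ytest,eid_train,eid_test,day_train,day_test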
# Returns a subsampled version of the data where each patient contributes exactly `size` rows, sampled with replacement from that patient's days. eg: if size=3, each patient is represented by 3 (not necessarily distinct) days in the subsampled dataset.
# Set the desired subsample size below
def subsample(eid,day,x,y):
    df=pd.DataFrame({'eid':eid,'day':day})
    df['index']=df.index
    size= #eg: 3
    replace=True
    subspl=lambda obj: obj.loc[np.random.choice(obj.index,size,replace),:]
    df=df.groupby('eid',as_index=False).apply(subspl)
    x=x[df['index'].tolist(),:]
    y=y[df['index'].tolist()]
    return x,y
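# eg (assuming size=3): for eid=[1,1,1,1,2,2] the grouped sampler draws 3
# rows per admission, so subsample returns 6 rows in total, with repeated
# days possible because replace=True.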