Commit cbc4a8e3 authored by Shengpu Tang (tangsp)

code for model training

parent 7bd486dd
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/tangsp/mimic3_experiments/lib/data.py:14: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n",
" config = yaml.load(f)\n"
]
}
],
"source": [
"from lib.data import _Mimic3Reader\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"timestep = 1.0"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finish reading data \t 2.28 s\n",
"s (15873, 98)\n",
"X (15873, 4, 4045)\n",
"Finish reading data \t 11.16 s\n",
"s (14174, 96)\n",
"X (14174, 12, 4816)\n",
"Finish reading data \t 3.49 s\n",
"s (19342, 98)\n",
"X (19342, 4, 4522)\n",
"Finish reading data \t 10.31 s\n",
"s (17588, 97)\n",
"X (17588, 12, 5500)\n"
]
}
],
"source": [
"for task in ['ARF', 'Shock']:\n",
" for duration in [4, 12]:\n",
" reader = _Mimic3Reader(task, duration, timestep)\n",
" print('s', reader.s.shape)\n",
" print('X', reader.X.shape)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finish reading data \t 35.72 s\n",
"s (11695, 97)\n",
"X (11695, 48, 7411)\n"
]
}
],
"source": [
"for task in ['mortality']:\n",
" for duration in [48]:\n",
" reader = _Mimic3Reader(task, duration, timestep)\n",
" print('s', reader.s.shape)\n",
" print('X', reader.X.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Config file:
- `config.yaml`: change `data_path` to point to the directory where the features, labels, and population files are stored.
Library files:
- `data.py`
- `models.py`
- `trainer.py`
- `experiment.py`
- `eval_deep.py`
- `evaluate.py`
Executable files:
- `run_deep.py`
- `run_shallow.py`
Notebooks:
- `RunShallow.ipynb`
- `NewEval_Deep.ipynb`
- `Evaluation.ipynb`
- `PredictionGap.ipynb`
data_path: /data4/tangsp/mimic3_features/
model_names: {
    'CNN': 'CNN_V3',
    'RNN': 'RNN_V2',
    'LR': 'LR',
    'RF': 'RF',
}
train:
    budget: 50
    repeat: 1
    epochs: 15
feature_dimension:
    ARF:
        4.0 : 4143
        12.0: 4912
    Shock:
        4.0 : 4620
        12.0: 5597
    mortality:
        48.0: 7508
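For reference (not part of the repo files), a minimal sketch of reading the config above and looking up a task's input dimension; `yaml.safe_load` is used here to avoid the YAMLLoadWarning seen in the notebook output:

    import yaml

    with open('config.yaml') as f:
        config = yaml.safe_load(f)

    # Durations are float keys (hours); e.g. the ARF task with a 4-hour window:
    print(config['model_names']['CNN'])             # CNN_V3
    print(config['feature_dimension']['ARF'][4.0])  # 4143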
import sys, os, time, pickle, random
import pandas as pd
import numpy as np
import sparse
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
import yaml

with open('config.yaml') as f:
    config = yaml.load(f, Loader=yaml.SafeLoader)  # SafeLoader silences the YAMLLoadWarning

data_path = config['data_path']
def get_test(task, fuse=False, duration=4, timestep=0.5, normalize=True, batch_size=64):
    """
    Returns:
        pytorch DataLoader for test
    """
    print('Reading files')
    reader = _Mimic3Reader(task, duration, timestep)
    _, _, Xy_te = reader.get_splits(gap=0.0, random_state=0, verbose=False)
    te = EHRDataset(*Xy_te, fuse=fuse)

    num_workers = 1
    te_loader = DataLoader(te, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)

    print('Time series shape, Static shape, Label shape, Class balance:')
    print('\t', 'te', te_loader.dataset.X.shape, te_loader.dataset.s.shape, te_loader.dataset.y.shape, te_loader.dataset.y.mean())
    if fuse:
        print('Fused dimensions:', te_loader.dataset[0][0].shape)
    return te_loader
def get_train_val_test(task, fuse=False, duration=4, timestep=0.5, normalize=True, batch_size=64):
    """
    Returns:
        pytorch DataLoader for train, val, test
    """
    print('Reading files')
    reader = _Mimic3Reader(task, duration, timestep)
    Xy_tr, Xy_va, Xy_te = reader.get_splits(gap=0.0, random_state=0)

    te = EHRDataset(*Xy_te, fuse=fuse)
    va = EHRDataset(*Xy_va, fuse=fuse)
    tr = EHRDataset(*Xy_tr, fuse=fuse)

    num_workers = 1
    tr_loader = DataLoader(tr, batch_size=batch_size, shuffle=True , num_workers=num_workers, pin_memory=True)
    va_loader = DataLoader(va, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)
    te_loader = DataLoader(te, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)

    print(tr_loader.dataset.y.sum() + va_loader.dataset.y.sum() + te_loader.dataset.y.sum(), '/', reader.X.shape[0])
    print('')
    print('Time series shape, Static shape, Label shape, Class balance:')
    print('\t', 'tr', tr_loader.dataset.X.shape, tr_loader.dataset.s.shape, tr_loader.dataset.y.shape, tr_loader.dataset.y.mean())
    print('\t', 'va', va_loader.dataset.X.shape, va_loader.dataset.s.shape, va_loader.dataset.y.shape, va_loader.dataset.y.mean())
    print('\t', 'te', te_loader.dataset.X.shape, te_loader.dataset.s.shape, te_loader.dataset.y.shape, te_loader.dataset.y.mean())
    if fuse:
        print('Fused dimensions:', tr_loader.dataset[0][0].shape)
    return tr_loader, va_loader, te_loader
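# Usage sketch (illustrative, not part of this file): building loaders for one
# task/duration, matching the notebook above.
#
#     tr_loader, va_loader, te_loader = get_train_val_test(
#         'ARF', fuse=True, duration=4, timestep=1.0, batch_size=64)
#     te_loader_only = get_test('ARF', fuse=True, duration=4, timestep=1.0)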
def get_benchmark_splits(fuse=False, batch_size=64):
    task = 'mortality'
    duration = 48
    timestep = 1.0
    df_label = pd.read_csv(data_path + 'population/pop.mortality_benchmark.csv').rename(columns={'{}_LABEL'.format(task): 'LABEL'})
    X = sparse.load_npz(data_path + 'features/benchmark.outcome={}.T={}.dt={}/X.npz'.format(task, duration, timestep)).todense()
    s = sparse.load_npz(data_path + 'features/benchmark.outcome={}.T={}.dt={}/s.npz'.format(task, duration, timestep)).todense()

    tr_idx = df_label[df_label['partition'] == 'train'].index.values
    va_idx = df_label[df_label['partition'] == 'val'  ].index.values
    te_idx = df_label[df_label['partition'] == 'test' ].index.values

    def _select_examples(rows):
        return (
            X[rows],
            s[rows],
            df_label.iloc[rows][['LABEL']].values,
        )

    Xy_tr = _select_examples(tr_idx)
    Xy_va = _select_examples(va_idx)
    Xy_te = _select_examples(te_idx)
    print('ICU stay splits:', len(tr_idx), len(va_idx), len(te_idx))

    te = EHRDataset(*Xy_te, fuse=fuse)
    va = EHRDataset(*Xy_va, fuse=fuse)
    tr = EHRDataset(*Xy_tr, fuse=fuse)

    num_workers = 1
    tr_loader = DataLoader(tr, batch_size=batch_size, shuffle=True , num_workers=num_workers, pin_memory=True)
    va_loader = DataLoader(va, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)
    te_loader = DataLoader(te, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)

    print(tr_loader.dataset.y.sum() + va_loader.dataset.y.sum() + te_loader.dataset.y.sum(), '/', X.shape[0])
    print('')
    print('Time series shape, Static shape, Label shape, Class balance:')
    print('\t', 'tr', tr_loader.dataset.X.shape, tr_loader.dataset.s.shape, tr_loader.dataset.y.shape, tr_loader.dataset.y.mean())
    print('\t', 'va', va_loader.dataset.X.shape, va_loader.dataset.s.shape, va_loader.dataset.y.shape, va_loader.dataset.y.mean())
    print('\t', 'te', te_loader.dataset.X.shape, te_loader.dataset.s.shape, te_loader.dataset.y.shape, te_loader.dataset.y.mean())
    if fuse:
        print('Fused dimensions:', tr_loader.dataset[0][0].shape)
    return tr_loader, va_loader, te_loader
def get_benchmark_test(fuse=False, batch_size=64):
    task = 'mortality'
    duration = 48
    timestep = 1.0
    df_label_all = pd.read_csv(data_path + 'population/{}_{}h.csv'.format(task, duration)).rename(columns={'{}_LABEL'.format(task): 'LABEL'})
    df_label = pd.read_csv(data_path + 'population/pop.mortality_benchmark.csv').rename(columns={'{}_LABEL'.format(task): 'LABEL'})
    X = sparse.load_npz(data_path + 'features/outcome={}.T={}.dt={}/X.npz'.format(task, duration, timestep)).todense()
    s = sparse.load_npz(data_path + 'features/outcome={}.T={}.dt={}/s.npz'.format(task, duration, timestep)).todense()

    te_idx = [df_label_all[df_label_all['ICUSTAY_ID'] == ID].index.values[0] for ID in df_label[df_label['partition'] == 'test']['ID']]

    def _select_examples(rows):
        return (
            X[rows],
            s[rows],
            df_label_all.iloc[rows][['LABEL']].values,
        )

    Xy_te = _select_examples(te_idx)
    print('ICU stay splits:', len(te_idx))

    te = EHRDataset(*Xy_te, fuse=fuse)
    num_workers = 1
    te_loader = DataLoader(te, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)
    print(te_loader.dataset.y.sum())
    print('')
    print('Time series shape, Static shape, Label shape, Class balance:')
    print('\t', 'te', te_loader.dataset.X.shape, te_loader.dataset.s.shape, te_loader.dataset.y.shape, te_loader.dataset.y.mean())
    if fuse:
        print('Fused dimensions:', te_loader.dataset[0][0].shape)
    return te_loader
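# Usage sketch (illustrative): the benchmark helpers take no task arguments,
# since they are hard-coded to 48h mortality.
#
#     tr_loader, va_loader, te_loader = get_benchmark_splits(fuse=True)
#     te_loader = get_benchmark_test(fuse=True)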
class _Mimic3Reader(object):
    def __init__(self, task, duration, timestep):
        """
        """
        self.task = task
        self.duration = duration
        self.timestep = timestep

        start_time = time.time()
        self.df_label = pd.read_csv(data_path + 'population/{}_{}h.csv'.format(task, duration)).rename(columns={'{}_LABEL'.format(task): 'LABEL'})
        self.df_subjects = pd.read_csv(data_path + 'prep/icustays_MV.csv').merge(self.df_label, on='ICUSTAY_ID', how='right')

        # Keep one row per subject; sorting by LABEL and keeping the last row
        # assigns a subject the positive label if any of their stays is positive.
        self.df_subject_label = self.df_subjects[['SUBJECT_ID', 'ICUSTAY_ID']] \
            .merge(self.df_label, on='ICUSTAY_ID', how='right') \
            .sort_values(by=['SUBJECT_ID', 'LABEL']) \
            .drop_duplicates('SUBJECT_ID', keep='last').reset_index(drop=True)

        self.X = sparse.load_npz(data_path + 'features/outcome={}.T={}.dt={}/X.npz'.format(task, duration, timestep)).todense()
        self.s = sparse.load_npz(data_path + 'features/outcome={}.T={}.dt={}/s.npz'.format(task, duration, timestep)).todense()
        print('Finish reading data \t {:.2f} s'.format(time.time() - start_time))
    def _select_examples(self, rows):
        return (
            self.X[rows],
            self.s[rows],
            self.df_label.iloc[rows][['LABEL']].values,
        )

    def _select_examples_by_patients(self, subject_idx):
        # Map subject-level indices back to the row indices of all ICU stays
        # belonging to those subjects.
        subjects = self.df_subject_label.iloc[subject_idx]['SUBJECT_ID']
        stays = self.df_subjects[self.df_subjects['SUBJECT_ID'].isin(subjects)]['ICUSTAY_ID'].values
        rows = self.df_label.loc[self.df_label['ICUSTAY_ID'].isin(stays)].index.values
        return rows

    def _exclude_prediction_gap(self, idx, gap):
        # Keep stays whose onset hour is at least `duration + gap` (or NaN,
        # i.e. no onset), excluding examples that fall inside the prediction gap.
        df = self.df_label.iloc[idx]
        df = df[~(df['{}_ONSET_HOUR'.format(self.task)] < self.duration + gap)]
        return df.index.values
    def get_splits(self, gap=0.0, random_state=None, verbose=True):
        """
        fixed, random splits based on patient
        """
        print('Creating splits')
        tr_idx = self.df_subjects[self.df_subjects['partition'] == 'train'].index.values
        va_idx = self.df_subjects[self.df_subjects['partition'] == 'val'  ].index.values
        te_idx = self.df_subjects[self.df_subjects['partition'] == 'test' ].index.values

        try:
            import pathlib
            pathlib.Path('./output/outcome={}.T={}.dt={}/'.format(self.task, self.duration, self.timestep)).mkdir(parents=True, exist_ok=True)
            np.savez(open('./output/outcome={}.T={}.dt={}/idx.npz'.format(self.task, self.duration, self.timestep), 'wb'), tr_idx=tr_idx, va_idx=va_idx, te_idx=te_idx)
        except:
            print('indices not saved')
            raise

        Xy_tr = self._select_examples(tr_idx)
        Xy_va = self._select_examples(va_idx)
        Xy_te = self._select_examples(te_idx)
        print('ICU stay splits:', len(tr_idx), len(va_idx), len(te_idx))
        return Xy_tr, Xy_va, Xy_te
    def get_splits_stratified(self, gap=0.0, random_state=None, verbose=True):
        """
        stratified random, split based on subject
        into train (70%), val (15%), test (15%)
        """
        print('Creating splits')
        if random_state is None:
            raise UserWarning('Split results are non-deterministic unless `random_state` is set')

        # Create 70-15-15 stratified splits based on patient
        sss1 = StratifiedShuffleSplit(1, test_size=0.3, random_state=random_state)
        sss2 = StratifiedShuffleSplit(1, test_size=0.5, random_state=random_state)
        y = self.df_subject_label['LABEL'].values
        train_idx, val_test_idx = next(sss1.split(y, y))
        y_val_test = y[val_test_idx]
        val_idx, test_idx = next(sss2.split(y_val_test, y_val_test))
        val_idx = val_test_idx[val_idx]
        test_idx = val_test_idx[test_idx]
        if verbose:
            print('Patient splits:', len(train_idx), len(val_idx), len(test_idx))

        tr_idx = self._select_examples_by_patients(train_idx)
        va_idx = self._select_examples_by_patients(val_idx)
        te_idx = self._select_examples_by_patients(test_idx)
        if verbose:
            print('ICU stay splits:', len(tr_idx), len(va_idx), len(te_idx))

        tr_idx = self._exclude_prediction_gap(tr_idx, gap)
        va_idx = self._exclude_prediction_gap(va_idx, gap)
        if verbose:
            print('ICU stay splits (prediction gap):', len(tr_idx), len(va_idx), len(te_idx))

        try:
            import pathlib
            pathlib.Path('./output/outcome={}.T={}.dt={}/'.format(self.task, self.duration, self.timestep)).mkdir(parents=True, exist_ok=True)
            np.savez(open('./output/outcome={}.T={}.dt={}/idx.npz'.format(self.task, self.duration, self.timestep), 'wb'), tr_idx=tr_idx, va_idx=va_idx, te_idx=te_idx)
        except:
            print('indices not saved')
            raise

        Xy_tr = self._select_examples(tr_idx)
        Xy_va = self._select_examples(va_idx)
        Xy_te = self._select_examples(te_idx)
        return Xy_tr, Xy_va, Xy_te
    def get_splits_random(self, random_state=None):
        """
        70-15-15 stratified random split
        train, val, test
        """
        # Deliberately disabled: this split is by ICU stay, not by patient.
        raise UserWarning('Not splitting by patients')
        print('Creating splits')
        if random_state is None:
            raise UserWarning('Data split is non-deterministic unless `random_state` is set')

        sss1 = StratifiedShuffleSplit(1, test_size=0.3, random_state=random_state)
        sss2 = StratifiedShuffleSplit(1, test_size=0.5, random_state=random_state)
        y = self.df_label['LABEL'].values
        train_idx, val_test_idx = next(sss1.split(y, y))
        y_val_test = y[val_test_idx]
        val_idx, test_idx = next(sss2.split(y_val_test, y_val_test))
        val_idx = val_test_idx[val_idx]
        test_idx = val_test_idx[test_idx]
        print('ICU stay splits:', len(train_idx), len(val_idx), len(test_idx))

        dfX_train, dfy_train = self._select_examples(train_idx)
        dfX_val, dfy_val = self._select_examples(val_idx)
        dfX_test, dfy_test = self._select_examples(test_idx)
        return dfX_train, dfy_train, dfX_val, dfy_val, dfX_test, dfy_test
class EHRDataset(Dataset):
    def __init__(self, X, s, y, fuse=False):
        assert len(X) == len(s)
        assert len(X) == len(y)
        self.X = X
        self.s = s
        self.y = y
        self.fuse = fuse

    def __getitem__(self, index):
        if self.fuse:
            xi = self.X[index]  # L x D
            si = self.s[index]  # d
            L, D = xi.shape
            # Tile the static features along the time axis and concatenate,
            # giving a single L x (D + d) input.
            xi = np.hstack((xi, np.tile(si, (L, 1))))
            return (
                torch.from_numpy(xi).float(),
                torch.from_numpy(self.y[index]).float(),
            )
        else:
            return (
                torch.from_numpy(self.X[index]).float(),
                torch.from_numpy(self.s[index]).float(),
                torch.from_numpy(self.y[index]).float(),
            )

    def __len__(self):
        return len(self.y)
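# Fuse sketch (illustrative, toy arrays): with fuse=True, __getitem__ tiles the
# static vector across time and concatenates, so an example with X of shape
# (L, D) and s of shape (d,) yields a tensor of shape (L, D + d):
#
#     X = np.random.rand(5, 4, 3)                  # 5 stays, L=4 steps, D=3 features
#     s = np.random.rand(5, 2)                     # d=2 static features
#     y = np.random.randint(0, 2, (5, 1)).astype(float)
#     xi, yi = EHRDataset(X, s, y, fuse=True)[0]
#     xi.shape                                     # torch.Size([4, 5])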
import torch
import numpy as np

def get_best_model_info(df_search):
    # Sort runs by 'best_score' (descending) and return the best row,
    # minus the leading score column itself.
    df_search_sorted = df_search.sort_values('best_score', ascending=False).head()
    best_model_info = df_search_sorted.iloc[0, 1:]
    return best_model_info
def load_best_model(best_model_info, ModelClass, in_channels, L_in, training_params, load_filename=None):
    if load_filename is None:
        # The best checkpoint is saved next to the regular one, with a 'best_' prefix.
        savename = best_model_info['savename']
        split = savename.split('/')
        split[-1] = 'best_' + split[-1]
        load_filename = '/'.join(split)

    checkpoint = torch.load(load_filename)
    _iter = checkpoint['_iter']
    print("Loaded checkpoint (trained for {} iterations)".format(checkpoint['_iter']))
    # print(load_filename)

    best_HP = best_model_info.drop(['savename', 'best_iter', 'seed']).to_dict()
    model = ModelClass(
        in_channels, L_in, 1,
        **{k: best_HP[k] for k in best_HP.keys() if k not in training_params}
    )
    model.load_state_dict(checkpoint['state_dict'])
    model.cuda()
    print("Restored model {} with #params={}".format(ModelClass, sum(p.numel() for p in model.parameters())))
    return checkpoint, model
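# Checkpoint format sketch (illustrative): load_best_model expects a dict with
# at least '_iter' and 'state_dict', saved under the 'best_'-prefixed savename
# produced by Experiment._run_trial below:
#
#     torch.save({'_iter': _iter, 'state_dict': model.state_dict()},
#                'checkpoint/{name}/best_{params}_seed={seed}.pth.tar')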
def get_test_predictions(model, te_loader, task=None, model_name=None):
    # Expects a loader built with fuse=True, so that each batch is an (X, y) pair.
    model.eval()
    running_pred = []
    cuda = True
    for i, (X, y) in enumerate(te_loader):
        if cuda:
            X = X.contiguous().cuda()
            y = y.contiguous().cuda(non_blocking=True)
        with torch.set_grad_enabled(False):
            output = model(X)
        running_pred.append((output.data.detach().cpu(), y.data.detach().cpu()))

    y_score, y_true = zip(*running_pred)
    y_score = torch.cat(y_score).numpy()
    y_true = torch.cat(y_true).numpy()
    # Sanity check: loader order must match the dataset's label order.
    assert (np.stack(te_loader.dataset.y) == y_true).all()
    return y_true, y_score
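# Scoring sketch (illustrative; sklearn is assumed but not imported in this file):
#
#     from sklearn.metrics import roc_auc_score, average_precision_score
#     y_true, y_score = get_test_predictions(model, te_loader)
#     print(roc_auc_score(y_true, y_score), average_precision_score(y_true, y_score))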
def save_test_predictions(y_true, y_score, task, T, dt, model_name):
    import pathlib
    pathlib.Path('./output/outcome={}.T={}.dt={}/'.format(task, T, dt)).mkdir(parents=True, exist_ok=True)
    fname = './output/outcome={}.T={}.dt={}/{}.test.npz'.format(task, T, dt, model_name)
    np.savez(
        open(fname, 'wb'),
        y_score=y_score,
        y_true=y_true,
    )
    print('Test predictions saved to', fname)
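# Round-trip sketch (illustrative; model name per config.yaml): the saved
# arrays can be recovered with
#
#     npz = np.load('./output/outcome=mortality.T=48.dt=1.0/CNN_V3.test.npz')
#     y_true, y_score = npz['y_true'], npz['y_score']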
from .trainer import Trainer
import time
import random
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import ParameterSampler

class Experiment(object):
    def __init__(self, param_grid, budget=1, repeat=1, n_epochs=5, name='tmp'):
        self.name = name
        self.budget = budget
        self.repeat = repeat  # number of restarts with different random seeds
        self.n_epochs = n_epochs
        self.param_grid = param_grid
        # Random search: sample `budget` configurations from the grid.
        self.param_sampler = ParameterSampler(param_grid, n_iter=self.budget, random_state=0)

    def run(self):
        df_search = pd.DataFrame(columns=['best_score', 'best_iter', 'seed', 'savename'] + list(self.param_grid.keys()))
        start_time = time.time()
        for run, params in enumerate(self.param_sampler):
            print(self.name, '\t', 'Run:', run, '/', self.budget)
            print(params)
            for i in range(self.repeat):
                results = self._run_trial(i, params)
                df_search = df_search.append(results, ignore_index=True)
                df_search.to_csv('./log/df_search.current.{}.csv'.format(self.name), index=False)
        print('Took:', time.time() - start_time)
        return df_search

    def _run_trial(self, seed, params):
        savename = 'checkpoint/{}/{}_seed={}.pth.tar'.format(self.name, params, seed)
        random.seed(seed)
        np.random.seed(seed)