Commit f2324a5a authored by Shengpu Tang (tangsp)

mimic3 no discretize

parent f1bbf844
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from lib.data import _Mimic3Reader\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"timestep = 1.0"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finish reading data \t 7.25 s\n",
"s (15873, 90)\n",
"X (15873, 4, 2077)\n",
"Finish reading data \t 22.47 s\n",
"s (14174, 88)\n",
"X (14174, 12, 2441)\n",
"Finish reading data \t 11.49 s\n",
"s (19342, 90)\n",
"X (19342, 4, 2302)\n",
"Finish reading data \t 29.93 s\n",
"s (17588, 89)\n",
"X (17588, 12, 2753)\n"
]
}
],
"source": [
"for task in ['ARF', 'Shock']:\n",
" for duration in [4.0, 12.0]:\n",
" reader = _Mimic3Reader(task, duration, timestep)\n",
" print('s', reader.s.shape)\n",
" print('X', reader.X.shape)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finish reading data \t 65.18 s\n",
"s (8577, 88)\n",
"X (8577, 48, 3491)\n"
]
}
],
"source": [
"for task in ['mortality']:\n",
" for duration in [48.0]:\n",
" reader = _Mimic3Reader(task, duration, timestep)\n",
" print('s', reader.s.shape)\n",
" print('X', reader.X.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Config file:
- `config.yaml`. Change `data_path` to point to the directory where the features/labels/population files are stored.

Library files:
- `data.py`
- `models.py`
- `trainer.py`
- `experiment.py`
- `eval_deep.py`
- `evaluate.py`

Executable files:
- `run_deep.py`
- `run_shallow.py`

Notebooks:
- `RunShallow.ipynb`
- `NewEval_Deep.ipynb`
- `Evaluation.ipynb`
- `PredictionGap.ipynb`
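A minimal sketch of how the pieces listed above fit together, assuming `config.yaml` already points at the processed data; the task/duration/timestep values are taken from the notebook above, and this is illustrative rather than a prescribed entry point:

```python
from lib.data import get_train_val_test

# DataLoaders for the ARF task: a 4-hour window at a 1-hour timestep.
tr_loader, va_loader, te_loader = get_train_val_test(
    'ARF', fuse=True, duration=4.0, timestep=1.0, batch_size=64)

for X, y in tr_loader:
    # With fuse=True, each batch is (time series fused with tiled
    # static features, labels).
    print(X.shape, y.shape)
    break
```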
data_path: ../data/processed/

model_names: {
    'CNN': 'CNN_V3',
    'RNN': 'RNN_V2',
    'LR': 'LR',
    'RF': 'RF',
}

train:
    budget: 50
    repeat: 1
    epochs: 15

feature_dimension:
    ARF:
        4.0 : 2167
        12.0: 2529
    Shock:
        4.0 : 2392
        12.0: 2842
    mortality:
        48.0: 4029
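A short sketch (not one of the repo files) of how this config might be consumed; note that PyYAML parses the unquoted duration keys such as `4.0` as floats:

import yaml

with open('config.yaml') as f:
    config = yaml.safe_load(f)

# Nested lookup: input feature dimension for the ARF task, 4-hour window.
dim = config['feature_dimension']['ARF'][4.0]    # 2167
print(config['data_path'], config['model_names']['CNN'], dim)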
import sys, os, time, pickle, random
import pandas as pd
import numpy as np
import sparse  # pydata/sparse: n-dimensional sparse arrays
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
import yaml

with open('config.yaml') as f:
    config = yaml.safe_load(f)

data_path = config['data_path']
def get_test(task, duration, timestep, fuse=False, batch_size=64):
    """
    Returns:
        pytorch DataLoader for test
    """
    print('Reading files')
    reader = _Mimic3Reader(task, duration, timestep)
    _, _, Xy_te = reader.get_splits(gap=0.0, random_state=0, verbose=False)
    te = EHRDataset(*Xy_te, fuse=fuse)

    num_workers = 1
    te_loader = DataLoader(te, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)

    print('Time series shape, Static shape, Label shape, Class balance:')
    print('\t', 'te', te_loader.dataset.X.shape, te_loader.dataset.s.shape, te_loader.dataset.y.shape, te_loader.dataset.y.mean())
    if fuse:
        print('Fused dimensions:', te_loader.dataset[0][0].shape)
    return te_loader
def get_train_val_test(task, fuse=False, duration=4, timestep=0.5, batch_size=64):
    """
    Returns:
        pytorch DataLoader for train, val, test
    """
    print('Reading files')
    reader = _Mimic3Reader(task, duration, timestep)
    Xy_tr, Xy_va, Xy_te = reader.get_splits(gap=0.0, random_state=0)

    te = EHRDataset(*Xy_te, fuse=fuse)
    va = EHRDataset(*Xy_va, fuse=fuse)
    tr = EHRDataset(*Xy_tr, fuse=fuse)

    num_workers = 1
    tr_loader = DataLoader(tr, batch_size=batch_size, shuffle=True , num_workers=num_workers, pin_memory=True)
    va_loader = DataLoader(va, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)
    te_loader = DataLoader(te, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)

    print(tr_loader.dataset.y.sum() + va_loader.dataset.y.sum() + te_loader.dataset.y.sum(), '/', reader.X.shape[0])
    print('')
    print('Time series shape, Static shape, Label shape, Class balance:')
    print('\t', 'tr', tr_loader.dataset.X.shape, tr_loader.dataset.s.shape, tr_loader.dataset.y.shape, tr_loader.dataset.y.mean())
    print('\t', 'va', va_loader.dataset.X.shape, va_loader.dataset.s.shape, va_loader.dataset.y.shape, va_loader.dataset.y.mean())
    print('\t', 'te', te_loader.dataset.X.shape, te_loader.dataset.s.shape, te_loader.dataset.y.shape, te_loader.dataset.y.mean())
    if fuse:
        print('Fused dimensions:', tr_loader.dataset[0][0].shape)
    return tr_loader, va_loader, te_loader
def get_benchmark_splits(fuse=False, batch_size=64):
    task = 'mortality'
    duration = 48.0
    timestep = 1.0
    df_label = pd.read_csv(data_path + 'population/pop.mortality_benchmark.csv').rename(columns={'{}_LABEL'.format(task): 'LABEL'})
    X = sparse.load_npz(data_path + 'features,discretize=no/benchmark,outcome={},T={},dt={}/X.npz'.format(task, duration, timestep)).todense()
    s = sparse.load_npz(data_path + 'features,discretize=no/benchmark,outcome={},T={},dt={}/s.npz'.format(task, duration, timestep)).todense()

    # Normalize data to range 0 to 1
    N, L, D = X.shape
    s = MinMaxScaler().fit_transform(s)
    X = MinMaxScaler().fit_transform(X.reshape((N, L*D))).reshape((N, L, D))

    tr_idx = df_label[df_label['partition'] == 'train'].index.values
    va_idx = df_label[df_label['partition'] == 'val'  ].index.values
    te_idx = df_label[df_label['partition'] == 'test' ].index.values

    def _select_examples(rows):
        return (
            X[rows],
            s[rows],
            df_label.iloc[rows][['LABEL']].values,
        )

    Xy_tr = _select_examples(tr_idx)
    Xy_va = _select_examples(va_idx)
    Xy_te = _select_examples(te_idx)
    print('ICU stay splits:', len(tr_idx), len(va_idx), len(te_idx))

    te = EHRDataset(*Xy_te, fuse=fuse)
    va = EHRDataset(*Xy_va, fuse=fuse)
    tr = EHRDataset(*Xy_tr, fuse=fuse)

    num_workers = 1
    tr_loader = DataLoader(tr, batch_size=batch_size, shuffle=True , num_workers=num_workers, pin_memory=True)
    va_loader = DataLoader(va, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)
    te_loader = DataLoader(te, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)

    print(tr_loader.dataset.y.sum() + va_loader.dataset.y.sum() + te_loader.dataset.y.sum(), '/', X.shape[0])
    print('')
    print('Time series shape, Static shape, Label shape, Class balance:')
    print('\t', 'tr', tr_loader.dataset.X.shape, tr_loader.dataset.s.shape, tr_loader.dataset.y.shape, tr_loader.dataset.y.mean())
    print('\t', 'va', va_loader.dataset.X.shape, va_loader.dataset.s.shape, va_loader.dataset.y.shape, va_loader.dataset.y.mean())
    print('\t', 'te', te_loader.dataset.X.shape, te_loader.dataset.s.shape, te_loader.dataset.y.shape, te_loader.dataset.y.mean())
    if fuse:
        print('Fused dimensions:', tr_loader.dataset[0][0].shape)
    return tr_loader, va_loader, te_loader
def get_benchmark_test(fuse=False, batch_size=64):
    task = 'mortality'
    duration = 48
    timestep = 1.0
    df_label_all = pd.read_csv(data_path + 'population/{}_{}h.csv'.format(task, duration)).rename(columns={'{}_LABEL'.format(task): 'LABEL'})
    df_label = pd.read_csv(data_path + 'population/pop.mortality_benchmark.csv').rename(columns={'{}_LABEL'.format(task): 'LABEL'})
    X = sparse.load_npz(data_path + 'features,discretize=no/outcome={},T={},dt={}/X.npz'.format(task, duration, timestep)).todense()
    s = sparse.load_npz(data_path + 'features,discretize=no/outcome={},T={},dt={}/s.npz'.format(task, duration, timestep)).todense()

    # Normalize data to range 0 to 1
    N, L, D = X.shape
    s = MinMaxScaler().fit_transform(s)
    X = MinMaxScaler().fit_transform(X.reshape((N, L*D))).reshape((N, L, D))

    # Map benchmark test-set ICU stays back to row indices in the full cohort
    te_idx = [
        df_label_all[df_label_all['ICUSTAY_ID'] == ID].index.values[0]
        for ID in df_label[df_label['partition'] == 'test']['ID']
    ]

    def _select_examples(rows):
        return (
            X[rows],
            s[rows],
            df_label_all.iloc[rows][['LABEL']].values,
        )

    Xy_te = _select_examples(te_idx)
    print('ICU stay splits:', len(te_idx))

    te = EHRDataset(*Xy_te, fuse=fuse)
    num_workers = 1
    te_loader = DataLoader(te, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)

    print(te_loader.dataset.y.sum())
    print('')
    print('Time series shape, Static shape, Label shape, Class balance:')
    print('\t', 'te', te_loader.dataset.X.shape, te_loader.dataset.s.shape, te_loader.dataset.y.shape, te_loader.dataset.y.mean())
    if fuse:
        print('Fused dimensions:', te_loader.dataset[0][0].shape)
    return te_loader
class _Mimic3Reader(object):
    def __init__(self, task, duration, timestep):
        """
        task: 'ARF', 'Shock', or 'mortality'
        duration: length of the observation window, in hours
        timestep: temporal resolution of the time-series features, in hours
        """
        self.task = task
        self.duration = duration
        self.timestep = timestep

        start_time = time.time()
        self.df_label = pd.read_csv(data_path + 'population/{}_{}h.csv'.format(task, duration)).rename(columns={'ID': 'ICUSTAY_ID', '{}_LABEL'.format(task): 'LABEL'})
        self.df_subjects = pd.read_csv(data_path + 'prep/icustays_MV.csv').merge(self.df_label, on='ICUSTAY_ID', how='right')
        self.df_subject_label = self.df_subjects[['SUBJECT_ID', 'ICUSTAY_ID']] \
            .merge(self.df_label, on='ICUSTAY_ID', how='right') \
            .sort_values(by=['SUBJECT_ID', 'LABEL']) \
            .drop_duplicates('SUBJECT_ID', keep='last').reset_index(drop=True)

        if task == 'mortality':
            self.X = sparse.load_npz(data_path + 'features,discretize=no/benchmark,outcome={},T={},dt={}/X.npz'.format(task, duration, timestep)).todense()
            self.s = sparse.load_npz(data_path + 'features,discretize=no/benchmark,outcome={},T={},dt={}/s.npz'.format(task, duration, timestep)).todense()
        else:
            self.X = sparse.load_npz(data_path + 'features,discretize=no/outcome={},T={},dt={}/X.npz'.format(task, duration, timestep)).todense()
            self.s = sparse.load_npz(data_path + 'features,discretize=no/outcome={},T={},dt={}/s.npz'.format(task, duration, timestep)).todense()

        # Normalize data to range 0 to 1
        N, L, D = self.X.shape
        self.s = MinMaxScaler().fit_transform(self.s)
        self.X = MinMaxScaler().fit_transform(self.X.reshape((N, L*D))).reshape((N, L, D))

        print('Finish reading data \t {:.2f} s'.format(time.time() - start_time))

    def _select_examples(self, rows):
        return (
            self.X[rows],
            self.s[rows],
            self.df_label.iloc[rows][['LABEL']].values,
        )

    def _select_examples_by_patients(self, subject_idx):
        subjects = self.df_subject_label.iloc[subject_idx]['SUBJECT_ID']
        stays = self.df_subjects[self.df_subjects['SUBJECT_ID'].isin(subjects)]['ICUSTAY_ID'].values
        rows = self.df_label.loc[self.df_label['ICUSTAY_ID'].isin(stays)].index.values
        return rows

    def _exclude_prediction_gap(self, idx, gap):
        df = self.df_label.iloc[idx]
        df = df[~(df['{}_ONSET_HOUR'.format(self.task)] < self.duration + gap)]
        return df.index.values

    def get_splits(self, gap=0.0, random_state=None, verbose=True):
        """
        fixed, random splits based on patient
        """
        print('Creating splits')
        tr_idx = self.df_subjects[self.df_subjects['partition'] == 'train'].index.values
        va_idx = self.df_subjects[self.df_subjects['partition'] == 'val'  ].index.values
        te_idx = self.df_subjects[self.df_subjects['partition'] == 'test' ].index.values
        try:
            import pathlib
            pathlib.Path('./output/outcome={},T={},dt={}/'.format(self.task, self.duration, self.timestep)).mkdir(parents=True, exist_ok=True)
            np.savez(open('./output/outcome={},T={},dt={}/idx.npz'.format(self.task, self.duration, self.timestep), 'wb'), tr_idx=tr_idx, va_idx=va_idx, te_idx=te_idx)
        except:
            print('indices not saved')
            raise

        Xy_tr = self._select_examples(tr_idx)
        Xy_va = self._select_examples(va_idx)
        Xy_te = self._select_examples(te_idx)
        print('ICU stay splits:', len(tr_idx), len(va_idx), len(te_idx))
        return Xy_tr, Xy_va, Xy_te
    def get_splits_stratified(self, gap=0.0, random_state=None, verbose=True):
        """
        stratified random, split based on subject
        into train (70%), val (15%), test (15%)
        """
        print('Creating splits')
        if random_state is None:
            raise UserWarning('Split results are non-deterministic unless `random_state` is set')

        # Create 70-15-15 stratified splits based on patient
        sss1 = StratifiedShuffleSplit(1, test_size=0.3, random_state=random_state)
        sss2 = StratifiedShuffleSplit(1, test_size=0.5, random_state=random_state)
        y = self.df_subject_label['LABEL'].values
        train_idx, val_test_idx = next(sss1.split(y, y))
        y_val_test = y[val_test_idx]
        val_idx, test_idx = next(sss2.split(y_val_test, y_val_test))
        val_idx = val_test_idx[val_idx]
        test_idx = val_test_idx[test_idx]
        if verbose:
            print('Patient splits:', len(train_idx), len(val_idx), len(test_idx))

        tr_idx = self._select_examples_by_patients(train_idx)
        va_idx = self._select_examples_by_patients(val_idx)
        te_idx = self._select_examples_by_patients(test_idx)
        if verbose:
            print('ICU stay splits:', len(tr_idx), len(va_idx), len(te_idx))

        tr_idx = self._exclude_prediction_gap(tr_idx, gap)
        va_idx = self._exclude_prediction_gap(va_idx, gap)
        if verbose:
            print('ICU stay splits (prediction gap):', len(tr_idx), len(va_idx), len(te_idx))

        try:
            import pathlib
            pathlib.Path('./output/outcome={},T={},dt={}/'.format(self.task, self.duration, self.timestep)).mkdir(parents=True, exist_ok=True)
            np.savez(open('./output/outcome={},T={},dt={}/idx.npz'.format(self.task, self.duration, self.timestep), 'wb'), tr_idx=tr_idx, va_idx=va_idx, te_idx=te_idx)
        except:
            print('indices not saved')
            raise

        Xy_tr = self._select_examples(tr_idx)
        Xy_va = self._select_examples(va_idx)
        Xy_te = self._select_examples(te_idx)
        return Xy_tr, Xy_va, Xy_te
    def get_splits_random(self, random_state=None):
        """
        70-15-15 stratified random split into train, val, test
        (splits by ICU stay, not by patient; currently disabled)
        """
        raise UserWarning('Not splitting by patients')
        print('Creating splits')
        if random_state is None:
            raise UserWarning('Data split is non-deterministic unless `random_state` is set')

        sss1 = StratifiedShuffleSplit(1, test_size=0.3, random_state=random_state)
        sss2 = StratifiedShuffleSplit(1, test_size=0.5, random_state=random_state)
        y = self.df_label['LABEL'].values
        train_idx, val_test_idx = next(sss1.split(y, y))
        y_val_test = y[val_test_idx]
        val_idx, test_idx = next(sss2.split(y_val_test, y_val_test))
        val_idx = val_test_idx[val_idx]
        test_idx = val_test_idx[test_idx]
        print('ICU stay splits:', len(train_idx), len(val_idx), len(test_idx))

        # _select_examples returns (X, s, y) triples
        Xy_tr = self._select_examples(train_idx)
        Xy_va = self._select_examples(val_idx)
        Xy_te = self._select_examples(test_idx)
        return Xy_tr, Xy_va, Xy_te
class EHRDataset(Dataset):
    def __init__(self, X, s, y, fuse=False):
        assert len(X) == len(s)
        assert len(X) == len(y)
        self.X = X
        self.s = s
        self.y = y
        self.fuse = fuse

    def __getitem__(self, index):
        if self.fuse:
            xi = self.X[index]  # L x D
            si = self.s[index]  # d
            L, D = xi.shape
            # Tile the static features along the time axis and concatenate
            xi = np.hstack((xi, np.tile(si, (L, 1))))
            return (
                torch.from_numpy(xi).float(),
                torch.from_numpy(self.y[index]).float(),
            )
        else:
            return (
                torch.from_numpy(self.X[index]).float(),
                torch.from_numpy(self.s[index]).float(),
                torch.from_numpy(self.y[index]).float(),
            )

    def __len__(self):
        return len(self.y)
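
# --- Hedged sketch (not part of the original module): demonstrates the
# `fuse` option of EHRDataset. Static features are tiled along the time
# axis and concatenated with the time series, so each item becomes an
# L x (D + d) tensor plus a label. The shapes below are made up.
if __name__ == '__main__':
    _X = np.random.rand(10, 4, 5)                        # N x L x D time series
    _s = np.random.rand(10, 3)                           # N x d static features
    _y = np.random.randint(0, 2, (10, 1)).astype(float)  # N x 1 labels
    _demo = EHRDataset(_X, _s, _y, fuse=True)
    xi, yi = _demo[0]
    assert xi.shape == (4, 5 + 3)                        # L x (D + d)
    print(xi.shape, yi.shape)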
import torch
import numpy as np

def get_best_model_info(df_search):
    # Highest validation score first; the best row's remaining columns
    # (everything after the first) describe the winning model
    df_search_sorted = df_search.sort_values('best_score', ascending=False).head()
    best_model_info = df_search_sorted.iloc[0, 1:]
    return best_model_info
def load_best_model(best_model_info, ModelClass, in_channels, L_in, training_params, load_filename=None):
    if load_filename is None:
        # The best checkpoint is saved alongside the regular one, with a
        # 'best_' prefix on the filename
        savename = best_model_info['savename']
        split = savename.split('/')
        split[-1] = 'best_' + split[-1]
        load_filename = '/'.join(split)

    checkpoint = torch.load(load_filename)
    _iter = checkpoint['_iter']
    print("Loaded checkpoint (trained for {} iterations)".format(checkpoint['_iter']))
    # print(load_filename)

    # Re-instantiate the model with the tuned hyperparameters (excluding
    # training-only parameters) and restore its weights
    best_HP = best_model_info.drop(['savename', 'best_iter', 'seed']).to_dict()
    model = ModelClass(
        in_channels, L_in, 1,
        **{k: best_HP[k] for k in best_HP.keys() if k not in training_params}
    )
    model.load_state_dict(checkpoint['state_dict'])
    model.cuda()
    print("Restored model {} with #params={}".format(ModelClass, sum(p.numel() for p in model.parameters())))
    return checkpoint, model
def get_test_predictions(model, te_loader, task=None, model_name=None):
    # Assumes a fused loader whose batches are (X, y) pairs
    model.eval()
    running_pred = []
    cuda = True
    for i, (X, y) in enumerate(te_loader):
        if cuda:
            X = X.contiguous().cuda()
            y = y.contiguous().cuda(non_blocking=True)
        with torch.set_grad_enabled(False):
            output = model(X)
        running_pred.append((output.data.detach().cpu(), y.data.detach().cpu()))

    y_score, y_true = zip(*running_pred)
    y_score = torch.cat(y_score).numpy()
    y_true = torch.cat(y_true).numpy()
    # Sanity check: loader order must match dataset order (shuffle=False)
    assert (np.stack(te_loader.dataset.y) == y_true).all()
    return y_true, y_score
def save_test_predictions(y_true, y_score, task, T, dt, model_name):
    import pathlib
    pathlib.Path('./output/outcome={}.T={}.dt={}/'.format(task, T, dt)).mkdir(parents=True, exist_ok=True)
    fname = './output/outcome={}.T={}.dt={}/{}.test.npz'.format(task, T, dt, model_name)
    np.savez(
        open(fname, 'wb'),
        y_score=y_score,
        y_true=y_true,
    )
    print('Test predictions saved to', fname)
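
# --- Hedged sketch (illustrative, not part of the original module): a
# typical evaluation flow with the helpers above. `df_search`, `CNN`,
# `in_channels`, `L_in`, and `te_loader` are hypothetical placeholders.
#
#     best_info = get_best_model_info(df_search)
#     checkpoint, model = load_best_model(best_info, CNN, in_channels, L_in,
#                                         training_params=['seed'])
#     y_true, y_score = get_test_predictions(model, te_loader)
#     save_test_predictions(y_true, y_score, 'ARF', 4.0, 1.0, 'CNN')
#
# The saved .npz can then be scored, e.g.:
#
#     from sklearn.metrics import roc_auc_score
#     f = np.load('./output/outcome=ARF.T=4.0.dt=1.0/CNN.test.npz')
#     print(roc_auc_score(f['y_true'], f['y_score']))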
from .trainer import Trainer
import time
import random
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import ParameterSampler

class Experiment(object):
    def __init__(self, param_grid, budget=1, repeat=1, n_epochs=5, name='tmp'):
        self.name = name
        self.budget = budget
        self.repeat = repeat  # number of restarts with different random seeds
        self.n_epochs = n_epochs
        self.param_grid = param_grid
        self.param_sampler = ParameterSampler(param_grid, n_iter=self.budget, random_state=0)
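
# Hedged sketch (hypothetical grid values): ParameterSampler draws `budget`
# random hyperparameter configurations from `param_grid`, e.g.
#
#     from sklearn.model_selection import ParameterSampler
#     grid = {'lr': [1e-2, 1e-3, 1e-4], 'dropout': [0.0, 0.25, 0.5]}
#     for params in ParameterSampler(grid, n_iter=3, random_state=0):
#         print(params)   # e.g. {'lr': 0.001, 'dropout': 0.5}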