Commit e3fc4342 authored by Shengpu Tang (tangsp)
Browse files

experiments: data extraction

parent f4a1c687
%% Cell type:code id: tags:
``` python
import numpy as np
import pandas as pd
import os, sys, time
from datetime import datetime, timedelta
import pickle
from collections import Counter
```
%% Cell type:code id: tags:
``` python
# Load the project settings from the YAML config file one directory up.
import pathlib
import yaml

# Read the config inside a context manager so the file handle is closed
# as soon as parsing finishes (a bare open() leaves it to the garbage collector).
with open('../config.yaml') as config_file:
    config = yaml.safe_load(config_file)

# Base locations used by the rest of the notebook. They are concatenated
# directly with file names below, so they are expected to end with a path
# separator -- TODO confirm against config.yaml.
data_path = config['data_path']
mimic3_path = config['mimic3_path']

# Create <data_path>/population (and any missing parents); no error if it exists.
pathlib.Path(data_path, 'population').mkdir(parents=True, exist_ok=True)
```
%% Cell type:code id: tags:
``` python
# Patient table: only the identifier and the birth/death dates are read.
patients = pd.read_csv(
    mimic3_path + 'PATIENTS.csv',
    usecols=['SUBJECT_ID', 'DOB', 'DOD'],
    parse_dates=['DOB', 'DOD'],
)

# Admission table: identifiers, time of death and the in-hospital death flag.
admissions = pd.read_csv(
    mimic3_path + 'ADMISSIONS.csv',
    usecols=['SUBJECT_ID', 'HADM_ID', 'DEATHTIME', 'HOSPITAL_EXPIRE_FLAG'],
    parse_dates=['DEATHTIME'],
)

# ICU stays (Metavision only), ordered by ICUSTAY_ID.
examples = pd.read_csv(data_path + 'prep/icustays_MV.csv', parse_dates=['INTIME', 'OUTTIME'])
examples = examples.sort_values(by='ICUSTAY_ID')

# Attach patient- and admission-level columns to every ICU stay.
examples = examples.merge(patients, on='SUBJECT_ID', how='left')
examples = examples.merge(admissions, on=['SUBJECT_ID', 'HADM_ID'], how='left')

# Age at ICU admission in years: seconds -> hours -> days -> years (365.25 days/year).
# NOTE(review): the difference is computed row by row; a vectorised
# INTIME - DOB may behave differently for very large date gaps -- confirm
# before replacing the apply().
examples['AGE'] = examples.apply(
    lambda row: (row['INTIME'] - row['DOB']).total_seconds(),
    axis=1,
) / 3600 / 24 / 365.25

# Convert LOS to hours (the source column is presumably in days -- TODO confirm).
examples['LOS'] = examples['LOS'].mul(24)
```
%% Cell type:code id: tags:
``` python
# Outcomes that have a label-definition file under <data_path>labels/.
tasks = ['ARF', 'Shock']

# Map each task name to its label-definition table, read from
# <data_path>labels/<task>.csv.
label_defs = dict(
    (name, pd.read_csv(data_path + 'labels/{}.csv'.format(name)))
    for name in tasks
)
```
%% Cell type:code id: tags:
``` python
# Starting point: number of distinct ICU stays before any exclusion is applied.
# N is also the length of the boolean population masks saved further below.
N = examples['ICUSTAY_ID'].unique().size
print('Source population', N)
```
%% Output
Source population 23620
%% Cell type:code id: tags:
``` python
# Sanity checks on the source table:
# every stay must end no earlier than it starts ...
assert examples['INTIME'].le(examples['OUTTIME']).all()
# ... and every row must come from the Metavision system.
assert examples['DBSOURCE'].eq('metavision').all()
```
%% Cell type:code id: tags:
``` python
# Keep adult stays only: AGE must lie in [min_age, max_age] (both ends inclusive).
min_age = 18
max_age = np.inf  # no upper bound on age
examples = examples[examples['AGE'].between(min_age, max_age)]
print('Exclude non-adults', examples['ICUSTAY_ID'].nunique())

# Keep a handle on the adult cohort; the per-cutoff loops below start from it.
examples_ = examples
```
%% Output
Exclude non-adults 23593
%% Cell type:code id: tags:
``` python
# Build the study population for every onset-based task at each prediction
# time T (in hours). For every (task, T) pair this writes:
#   <data_path>population/<task>_<T>h.csv      - ICUSTAY_ID, onset hour, label
#   <data_path>population/mask_<task>_<T>h.npy - boolean mask of length N
#   <data_path>population/mask_<task>_<T>h.txt - the same mask as 0/1 text
for T in [4, 12]:
    print('======')
    print('prediction time', T, 'hour')

    # Remove stays where the patient died before the cutoff hour
    # (stays without a recorded DEATHTIME are kept).
    examples = examples_[(examples_.DEATHTIME >= examples_.INTIME + timedelta(hours=T)) | (examples_.DEATHTIME.isnull())]
    print('Exclude deaths', examples['ICUSTAY_ID'].nunique())

    # Remove stays with LOS < cutoff hour
    examples = examples[examples['LOS'] >= T]
    print('Exclude discharges', examples['ICUSTAY_ID'].nunique())

    # NOTE: re-created for every T, so after the loop it only holds the
    # populations of the last prediction time.
    populations = {}

    # Remove stays whose event onset happened before the cutoff hour
    for task in tasks:
        print('---')
        print('Outcome', task)
        label_def = label_defs[task]
        onset_col = '{}_ONSET_HOUR'.format(task)

        # merge() does not keep the left frame's index, so carry it through
        # as a regular column and restore it afterwards.
        pop = examples[['ICUSTAY_ID']].reset_index() \
            .merge(label_def[['ICUSTAY_ID', onset_col]], on='ICUSTAY_ID', how='left') \
            .set_index('index')

        # Keep stays with onset at/after the cutoff, or with no onset at all.
        # Copy AFTER filtering so the column assignment below acts on an
        # independent frame instead of a filtered slice
        # (avoids pandas' SettingWithCopyWarning).
        pop = pop[(pop[onset_col] >= T) | pop[onset_col].isnull()].copy()

        # Label is 1 when an onset hour is recorded, 0 otherwise.
        pop['{}_LABEL'.format(task)] = pop[onset_col].notnull().astype(int)
        pop.to_csv(data_path + 'population/{}_{}h.csv'.format(task, T), index=False)

        # Construct boolean mask
        ## NOTE: uses pop.index here, assuming index is preserved. The index
        ## labels are used as positions into a length-N array, which assumes
        ## the source `examples` frame had a default 0..N-1 index -- TODO confirm.
        idx = pop.index
        ## Otherwise, there's a slower version
        # if False:
        #     idx = np.array([examples[examples.ICUSTAY_ID == i].index[0] for i in pop['ICUSTAY_ID']])
        mask_array = np.zeros(N, dtype=bool)
        mask_array[idx] = True

        # Save population boolean mask
        np.save(data_path + 'population/mask_{}_{}h.npy'.format(task, T), mask_array)
        np.savetxt(data_path + 'population/mask_{}_{}h.txt'.format(task, T), mask_array, fmt='%i')

        populations[task] = pop
        print('Exclude onset', len(pop))
```
%% Output
======
prediction time 4 hour
Exclude deaths 23499
Exclude discharges 23401
---
Outcome ARF
Exclude onset 15873
---
Outcome Shock
Exclude onset 19342
======
prediction time 12 hour
Exclude deaths 23319
Exclude discharges 23060
---
Outcome ARF
Exclude onset 14174
---
Outcome Shock
Exclude onset 17588
%% Cell type:code id: tags:
``` python
# Build the in-hospital mortality population at prediction time T (in hours)
# and write it to <data_path>population/mortality_<T>h.csv.
for T in [48]:
    print('======')
    print('prediction time', T, 'hour')

    # Remove stays where the patient died before the cutoff hour
    # (stays without a recorded DEATHTIME are kept).
    examples = examples_[(examples_.DEATHTIME >= examples_.INTIME + timedelta(hours=T)) | (examples_.DEATHTIME.isnull())]
    print('Exclude deaths', examples['ICUSTAY_ID'].nunique())

    # Remove stays with LOS < cutoff hour. Take an explicit copy so the label
    # column added below is written to an independent frame rather than to a
    # filtered slice of `examples_` (avoids pandas' SettingWithCopyWarning).
    examples = examples[examples['LOS'] >= T].copy()
    print('Exclude discharges', examples['ICUSTAY_ID'].nunique())

    # No onset-based exclusion for this task: every remaining stay is kept and
    # labelled with the admission's HOSPITAL_EXPIRE_FLAG.
    for task in ['mortality']:
        print('---')
        print('Outcome', task)
        examples['{}_LABEL'.format(task)] = examples.HOSPITAL_EXPIRE_FLAG
        pop = examples[['ICUSTAY_ID', '{}_LABEL'.format(task)]]
        pop.to_csv(data_path + 'population/{}_{}h.csv'.format(task, T), index=False)
        # The message text matches the other tasks' output; the count equals
        # the number of rows left after the discharge filter above.
        print('Exclude onset', len(pop))
```
%% Output
======
prediction time 48 hour
Exclude deaths 22776
Exclude discharges 11695
---
Outcome mortality
Exclude onset 11695
%% Cell type:code id: tags:
``` python
```