Commit e3fc4342 authored by Shengpu Tang (tangsp)

experiments: data extraction

parent f4a1c687
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import os, sys, time\n",
"from datetime import datetime, timedelta\n",
"import pickle\n",
"\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import yaml\n",
"config = yaml.safe_load(open('../config.yaml'))\n",
"data_path = config['data_path']\n",
"mimic3_path = config['mimic3_path']\n",
"\n",
"import pathlib\n",
"pathlib.Path(data_path, 'population').mkdir(parents=True, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"patients = pd.read_csv(mimic3_path + 'PATIENTS.csv', parse_dates=['DOB', 'DOD'], usecols=['SUBJECT_ID', 'DOB', 'DOD'])\n",
"admissions = pd.read_csv(mimic3_path + 'ADMISSIONS.csv', parse_dates=['DEATHTIME'], usecols=['SUBJECT_ID', 'HADM_ID', 'DEATHTIME', 'HOSPITAL_EXPIRE_FLAG'])\n",
"examples = pd.read_csv(data_path + 'prep/icustays_MV.csv', parse_dates=['INTIME', 'OUTTIME']).sort_values(by='ICUSTAY_ID') # Only Metavision\n",
"\n",
"examples = pd.merge(examples, patients, on='SUBJECT_ID', how='left')\n",
"examples = pd.merge(examples, admissions, on=['SUBJECT_ID', 'HADM_ID'], how='left')\n",
"examples['AGE'] = examples.apply(lambda x: (x['INTIME'] - x['DOB']).total_seconds(), axis=1) / 3600 / 24 / 365.25\n",
"\n",
"examples['LOS'] = examples['LOS'] * 24 # Convert to hours"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"tasks = ['ARF', 'Shock']\n",
"label_defs = { task: pd.read_csv(data_path + 'labels/{}.csv'.format(task)) for task in tasks }"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Source population 23620\n"
]
}
],
"source": [
"# Start\n",
"N = len(examples['ICUSTAY_ID'].unique())\n",
"print('Source population', N)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"assert (examples['INTIME'] <= examples['OUTTIME']).all()\n",
"assert (examples['DBSOURCE'] == 'metavision').all()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Exclude non-adults 23593\n"
]
}
],
"source": [
"# Remove non-adults\n",
"min_age = 18\n",
"max_age = np.inf # no max age\n",
"examples = examples[(examples.AGE >= min_age) & (examples.AGE <= max_age)]\n",
"print('Exclude non-adults', examples['ICUSTAY_ID'].nunique())\n",
"examples_ = examples"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"======\n",
"prediction time 4 hour\n",
"Exclude deaths 23499\n",
"Exclude discharges 23401\n",
"---\n",
"Outcome ARF\n",
"Exclude onset 15873\n",
"---\n",
"Outcome Shock\n",
"Exclude onset 19342\n",
"======\n",
"prediction time 12 hour\n",
"Exclude deaths 23319\n",
"Exclude discharges 23060\n",
"---\n",
"Outcome ARF\n",
"Exclude onset 14174\n",
"---\n",
"Outcome Shock\n",
"Exclude onset 17588\n"
]
}
],
"source": [
"for T in [4, 12]:\n",
" print('======')\n",
" print('prediction time', T, 'hour')\n",
"\n",
" # Remove died before cutoff hour\n",
" examples = examples_[(examples_.DEATHTIME >= examples_.INTIME + timedelta(hours=T)) | (examples_.DEATHTIME.isnull())]\n",
" print('Exclude deaths', examples['ICUSTAY_ID'].nunique())\n",
"\n",
" # Remove LOS < cutoff hour\n",
" examples = examples[examples['LOS'] >= T]\n",
" print('Exclude discharges', examples['ICUSTAY_ID'].nunique())\n",
"\n",
" populations = {}\n",
" # Remove event onset before (cutoff)\n",
" for task in tasks:\n",
" print('---')\n",
" print('Outcome', task)\n",
" label_def = label_defs[task]\n",
"\n",
" # Needed to preserve index in DataFrame\n",
" pop = examples[['ICUSTAY_ID']].reset_index() \\\n",
" .merge(label_def[['ICUSTAY_ID', '{}_ONSET_HOUR'.format(task)]], on='ICUSTAY_ID', how='left') \\\n",
" .set_index('index').copy()\n",
" pop = pop[(pop['{}_ONSET_HOUR'.format(task)] >= T) | pop['{}_ONSET_HOUR'.format(task)].isnull()]\n",
" pop['{}_LABEL'.format(task)] = pop['{}_ONSET_HOUR'.format(task)].notnull().astype(int)\n",
" pop.to_csv(data_path + 'population/{}_{}h.csv'.format(task, T), index=False)\n",
"\n",
" # Construct boolean mask\n",
" ## NOTE: uses pop.index here, assuming index is preserved\n",
" idx = pop.index\n",
" ## Otherwise, there's a slower version\n",
" # if False:\n",
" # idx = np.array([examples[examples.ICUSTAY_ID == i].index[0] for i in pop['ICUSTAY_ID']])\n",
" mask_array = np.zeros(N, dtype=bool)\n",
" mask_array[idx] = True\n",
"\n",
" # Save population boolean mask\n",
" np.save(data_path + 'population/mask_{}_{}h.npy'.format(task, T), mask_array)\n",
" np.savetxt(data_path + 'population/mask_{}_{}h.txt'.format(task, T), mask_array, fmt='%i')\n",
"\n",
" populations[task] = pop\n",
" print('Exclude onset', len(pop))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"======\n",
"prediction time 48 hour\n",
"Exclude deaths 22776\n",
"Exclude discharges 11695\n",
"---\n",
"Outcome mortality\n",
"Exclude onset 11695\n"
]
}
],
"source": [
"for T in [48]:\n",
" print('======')\n",
" print('prediction time', T, 'hour')\n",
"\n",
" # Remove died before cutoff hour\n",
" examples = examples_[(examples_.DEATHTIME >= examples_.INTIME + timedelta(hours=T)) | (examples_.DEATHTIME.isnull())]\n",
" print('Exclude deaths', examples['ICUSTAY_ID'].nunique())\n",
"\n",
" # Remove LOS < cutoff hour\n",
" examples = examples[examples['LOS'] >= T]\n",
" print('Exclude discharges', examples['ICUSTAY_ID'].nunique())\n",
"\n",
" # Remove event onset before (cutoff)\n",
" for task in ['mortality']:\n",
" print('---')\n",
" print('Outcome', task)\n",
" examples['{}_LABEL'.format(task)] = examples.HOSPITAL_EXPIRE_FLAG\n",
" pop = examples[['ICUSTAY_ID', '{}_LABEL'.format(task)]]\n",
" pop.to_csv(data_path + 'population/{}_{}h.csv'.format(task, T), index=False)\n",
" print('Exclude onset', len(pop))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
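For reference, a minimal sketch of how the population files written above might be consumed downstream; the feature array X_all and its row-alignment with the source population are illustrative assumptions, not part of this commit.

# sketch: load one task's population table and boolean mask
import numpy as np
import pandas as pd
import yaml

data_path = yaml.safe_load(open('../config.yaml'))['data_path']
task, T = 'ARF', 4
pop = pd.read_csv(data_path + 'population/{}_{}h.csv'.format(task, T))
mask = np.load(data_path + 'population/mask_{}_{}h.npy'.format(task, T))

assert mask.sum() == len(pop)  # one True entry per retained ICU stay
# X_all: hypothetical feature array with one row per source-population ICU stay, in the same order
# X_task = X_all[mask]         # restrict features to this task's cohort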
import os, yaml
with open(os.path.join(os.path.dirname(__file__), '../config.yaml')) as f:
    config = yaml.full_load(f)
data_path = os.path.join(os.path.dirname(__file__), config['data_path'])
mimic3_path = os.path.join(os.path.dirname(__file__), config['mimic3_path'])
parallel = True
n_jobs = 72
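config.yaml itself is not shown in this commit; the sketch below only verifies the keys that config.py and prepare_input.py read, and everything beyond those key names is an assumption.

# sketch: verify that config.yaml provides the keys accessed by these scripts
import os, yaml

with open(os.path.join(os.path.dirname(__file__), '../config.yaml')) as f:
    _cfg = yaml.full_load(f)

for key in ['data_path', 'mimic3_path', 'column_names']:
    assert key in _cfg, 'config.yaml is missing "{}"'.format(key)
for col in ['ID', 't', 'var_name', 'var_value']:
    assert col in _cfg['column_names'], 'column_names is missing "{}"'.format(col)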
"""
generate_labels.py
Author: Shengpu Tang
Generate labels for two adverse outcomes: ARF and shock.
"""
import pandas as pd
import numpy as np
import scipy.stats
import itertools
from collections import OrderedDict, Counter
from joblib import Parallel, delayed
from tqdm import tqdm as tqdm
import yaml
data_path = yaml.full_load(open('../config.yaml'))['data_path']
import pathlib
pathlib.Path(data_path, 'labels').mkdir(parents=True, exist_ok=True)
examples = pd.read_csv(data_path + 'prep/icustays_MV.csv', parse_dates=['INTIME', 'OUTTIME']).sort_values(by='ICUSTAY_ID')
chartevents = pd.read_pickle(data_path + 'prep/chartevents.p')
procedures = pd.read_pickle(data_path + 'prep/procedureevents_mv.p')
inputevents = pd.read_pickle(data_path + 'prep/inputevents_mv.p')
ventilation = [
'225792', # Invasive Ventilation
'225794', # Non-invasive Ventilation
]
PEEP = [
'220339', # PEEP set
]
vasopressors = [
'221906', # Norepinephrine
'221289', # Epinephrine
'221662', # Dopamine
'222315', # Vasopressin
'221749', # Phenylephrine
]
## ARF: (PEEP) OR (mechanical ventilation)
df_PEEP = chartevents[chartevents.ITEMID.isin(PEEP)].copy()
df_vent = procedures[procedures.ITEMID.isin(ventilation)].rename(columns={'t_start': 't'}).copy()
df_ARF = pd.concat([df_PEEP[['ICUSTAY_ID', 't']], df_vent[['ICUSTAY_ID', 't']]], axis=0)
df_ARF['ICUSTAY_ID'] = df_ARF['ICUSTAY_ID'].astype(int)
df_ARF = df_ARF.sort_values(by=['ICUSTAY_ID', 't']).drop_duplicates(['ICUSTAY_ID'], keep='first').reset_index(drop=True)
df_ARF = df_ARF.rename(columns={'t': 'ARF_ONSET_HOUR'})
df_ARF = pd.merge(examples[['ICUSTAY_ID']], df_ARF, on='ICUSTAY_ID', how='left')
df_ARF['ARF_LABEL'] = df_ARF['ARF_ONSET_HOUR'].notnull().astype(int)
print('ARF: ', dict(Counter(df_ARF['ARF_LABEL'])), 'N = {}'.format(len(df_ARF)), sep='\t')
df_ARF.to_csv(data_path + 'labels/ARF.csv', index=False)
## Shock: (one of vasopressors)
df_vaso = inputevents[inputevents.ITEMID.isin(vasopressors)].rename(columns={'t_start': 't'}).copy()
df_shock = df_vaso.copy()
df_shock['ICUSTAY_ID'] = df_shock['ICUSTAY_ID'].astype(int)
df_shock = df_shock.sort_values(by=['ICUSTAY_ID', 't']).drop_duplicates(['ICUSTAY_ID'], keep='first').reset_index(drop=True)
df_shock = df_shock.rename(columns={'t': 'Shock_ONSET_HOUR'})
df_shock = pd.merge(examples[['ICUSTAY_ID']], df_shock, on='ICUSTAY_ID', how='left')
df_shock['Shock_LABEL'] = df_shock['Shock_ONSET_HOUR'].notnull().astype(int)
print('Shock: ', dict(Counter(df_shock['Shock_LABEL'])), 'N = {}'.format(len(df_shock)), sep='\t')
df_shock.to_csv(data_path + 'labels/Shock.csv', index=False)
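A quick sanity check that could be run on the label files written above (a sketch; assumes the script has just executed, so pd and data_path are still in scope):

# sketch: verify label/onset consistency and report prevalence for the generated label files
for task in ['ARF', 'Shock']:
    df = pd.read_csv(data_path + 'labels/{}.csv'.format(task))
    onset = df['{}_ONSET_HOUR'.format(task)]
    label = df['{}_LABEL'.format(task)]
    assert (label == onset.notnull().astype(int)).all()  # label is 1 exactly when an onset time exists
    print(task, 'N = {}'.format(len(df)), 'prevalence = {:.3f}'.format(label.mean()),
          'median onset hour = {:.1f}'.format(onset.median()), sep='\t')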
HR:
- 220045 # Heart Rate
SysBP:
- 224167 # Manual Blood Pressure Systolic Left
- 227243 # Manual Blood Pressure Systolic Right
- 220050 # Arterial Blood Pressure systolic
- 220179 # Non Invasive Blood Pressure systolic
- 225309 # ART BP Systolic
DiaBP:
- 224643 # Manual Blood Pressure Diastolic Left
- 227242 # Manual Blood Pressure Diastolic Right
- 220051 # Arterial Blood Pressure diastolic
- 220180 # Non Invasive Blood Pressure diastolic
- 225310 # ART BP Diastolic
RR:
- 220210 # Respiratory Rate
- 224690 # Respiratory Rate (Total)
Temperature:
- 223761 # Temperature Fahrenheit
- 223762 # Temperature Celsius
SpO2:
- 220277 # O2 saturation pulseoxymetry
Height:
- 226707 # Height
- 226730 # Height (cm)
Weight:
- 224639 # Daily Weight
- 226512 # Admission Weight (Kg)
- 226531 # Admission Weight (lbs.)
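A sketch of how a grouping like this might be used: invert the variable-to-ITEMID mapping and pull the corresponding rows from the prepped CHARTEVENTS table. The file name vitals_itemids.yaml is a placeholder for wherever this block lives, and the ITEMID dtype depends on the prep step.

# sketch: invert the variable -> ITEMID grouping above and extract those rows from CHARTEVENTS
import yaml
import pandas as pd

config = yaml.safe_load(open('../config.yaml'))
data_path = config['data_path']

groups = yaml.safe_load(open('vitals_itemids.yaml'))  # placeholder path for the grouping above
itemid_to_var = {str(i): var for var, ids in groups.items() for i in ids}

chartevents = pd.read_pickle(data_path + 'prep/chartevents.p')
vitals = chartevents[chartevents['ITEMID'].astype(str).isin(itemid_to_var)].copy()
vitals['VARIABLE'] = vitals['ITEMID'].astype(str).map(itemid_to_var)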
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Multitask benchmark: https://www.nature.com/articles/s41597-019-0103-9"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import yaml\n",
"with open('../config.yaml') as f:\n",
" config = yaml.full_load(f)\n",
"\n",
"data_path = config['data_path']"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from collections import defaultdict"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df_train = pd.read_csv('train_listfile.csv')\n",
"df_val = pd.read_csv('val_listfile.csv')\n",
"df_test = pd.read_csv('test_listfile.csv')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.13534500374633882"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_train['y_true'].mean()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Removing non-metavision ICU stays"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"icustays = pd.read_csv('all_stays.csv')\n",
"icustays = icustays.sort_values(by=['SUBJECT_ID', 'INTIME', 'OUTTIME']).reset_index(drop=True)\n",
"metavision = icustays[icustays['DBSOURCE'] == 'metavision']['ICUSTAY_ID']"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"stays_by_subjects = defaultdict(list)\n",
"for i, (j, k) in icustays[['SUBJECT_ID', 'ICUSTAY_ID']].iterrows():\n",
" stays_by_subjects[j].append(k)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"my_labels = pd.read_csv('../' + data_path + 'population/mortality_48h.csv').set_index('ICUSTAY_ID')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"df_out = []\n",
"for part, df_part in zip(['train', 'val', 'test'], [df_train, df_val, df_test]):\n",
" for i, (name, y) in df_part.iterrows():\n",
" try:\n",
" ID, ep, _ = name.split('_')\n",
" ID = int(ID)\n",
" ep = int(ep[7:]) - 1\n",
" stay_ID = stays_by_subjects[ID][ep]\n",
" if stay_ID in metavision.values and stay_ID in my_labels.index:\n",
" # Only keep patients that are recorded using metavision that have not died by 48 hour\n",
" df_out.append((stay_ID, name, part, y, y))\n",
" my_y = my_labels.loc[stay_ID, 'mortality_LABEL']\n",
" else:\n",
" continue\n",
" except:\n",
" print(name, ID, stay_ID, part, y, my_y)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"df_out = pd.DataFrame(df_out, columns=['ID', 'stay', 'partition', 'mortality_LABEL', 'y_true'])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"df_out = df_out.sort_values(by='ID')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"df_out.to_csv('../' + data_path + 'population/pop.mortality_benchmark.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.12020519995336365"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_out['mortality_LABEL'].mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
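A possible follow-up check on the benchmark-aligned cohort built above (a sketch; it assumes both CSVs have been written and follows the notebook's relative paths and config.yaml layout):

# sketch: compare the benchmark-aligned cohort against the full 48h mortality population
import pandas as pd
import yaml

with open('../config.yaml') as f:
    data_path = yaml.full_load(f)['data_path']

full_pop = pd.read_csv('../' + data_path + 'population/mortality_48h.csv')
bench_pop = pd.read_csv('../' + data_path + 'population/pop.mortality_benchmark.csv')

print('full 48h population:', len(full_pop), 'prevalence', round(full_pop['mortality_LABEL'].mean(), 4))
print('benchmark-aligned  :', len(bench_pop), 'prevalence', round(bench_pop['mortality_LABEL'].mean(), 4))
print(bench_pop.groupby('partition')['mortality_LABEL'].agg(['size', 'mean']))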
"""
python prepare_input.py
"""
import os, yaml
with open(os.path.join(os.path.dirname(__file__), '../config.yaml')) as f:
    config = yaml.full_load(f)
data_path = os.path.join(os.path.dirname(__file__), config['data_path'])
parallel = True
ID_col = config['column_names']['ID']
t_col = config['column_names']['t']
var_col = config['column_names']['var_name']
val_col = config['column_names']['var_value']
import argparse
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm