Commit e3fc4342 authored by Shengpu Tang (tangsp)'s avatar Shengpu Tang (tangsp)
Browse files

experiments: data extraction

parent f4a1c687
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import os, sys, time\n",
"from datetime import datetime, timedelta\n",
"import pickle\n",
"\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import yaml\n",
"\n",
"# Load the project configuration. The context manager closes the file handle\n",
"# as soon as parsing is done instead of leaving it open.\n",
"with open('../config.yaml') as f:\n",
"    config = yaml.safe_load(f)\n",
"data_path = config['data_path']\n",
"mimic3_path = config['mimic3_path']\n",
"\n",
"import pathlib\n",
"# Create <data_path>/population (and any missing parents) if it does not exist yet.\n",
"pathlib.Path(data_path, 'population').mkdir(parents=True, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# MIMIC-III tables, restricted to the columns used below.\n",
"patients = pd.read_csv(\n",
"    mimic3_path + 'PATIENTS.csv',\n",
"    usecols=['SUBJECT_ID', 'DOB', 'DOD'],\n",
"    parse_dates=['DOB', 'DOD'],\n",
")\n",
"admissions = pd.read_csv(\n",
"    mimic3_path + 'ADMISSIONS.csv',\n",
"    usecols=['SUBJECT_ID', 'HADM_ID', 'DEATHTIME', 'HOSPITAL_EXPIRE_FLAG'],\n",
"    parse_dates=['DEATHTIME'],\n",
")\n",
"# ICU stays (only Metavision), ordered by ICUSTAY_ID.\n",
"examples = (\n",
"    pd.read_csv(data_path + 'prep/icustays_MV.csv', parse_dates=['INTIME', 'OUTTIME'])\n",
"    .sort_values(by='ICUSTAY_ID')\n",
")\n",
"\n",
"# Left joins: every ICU stay row is kept and gains the patient/admission columns.\n",
"examples = examples.merge(patients, on='SUBJECT_ID', how='left')\n",
"examples = examples.merge(admissions, on=['SUBJECT_ID', 'HADM_ID'], how='left')\n",
"\n",
"# Age at ICU admission in years (seconds -> hours -> days -> years).\n",
"# NOTE(review): computed row by row; presumably because a vectorised datetime\n",
"# subtraction can overflow for very old (date-shifted) DOBs - confirm before vectorising.\n",
"examples['AGE'] = (\n",
"    examples.apply(lambda row: (row['INTIME'] - row['DOB']).total_seconds(), axis=1)\n",
"    / 3600 / 24 / 365.25\n",
")\n",
"\n",
"examples['LOS'] = examples['LOS'] * 24 # Convert to hours"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# One label table per outcome, read from labels/<task>.csv and keyed by task name.\n",
"tasks = ['ARF', 'Shock']\n",
"label_defs = dict(\n",
"    (name, pd.read_csv(data_path + 'labels/{}.csv'.format(name)))\n",
"    for name in tasks\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Source population 23620\n"
]
}
],
"source": [
"# Start: number of distinct ICU stays before any exclusion is applied.\n",
"# Counted with nunique(), like the exclusion steps that follow, so that all\n",
"# reported counts treat missing IDs the same way and are directly comparable.\n",
"N = examples['ICUSTAY_ID'].nunique()\n",
"print('Source population', N)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Sanity checks: no stay ends before it starts, and every stay is a Metavision record.\n",
"assert examples['INTIME'].le(examples['OUTTIME']).all()\n",
"assert examples['DBSOURCE'].eq('metavision').all()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Exclude non-adults 23593\n"
]
}
],
"source": [
"# Remove non-adults: keep stays with min_age <= AGE <= max_age (both ends inclusive).\n",
"min_age = 18\n",
"max_age = np.inf # no max age\n",
"examples = examples.loc[examples['AGE'].between(min_age, max_age)]\n",
"print('Exclude non-adults', examples['ICUSTAY_ID'].nunique())\n",
"# Second name for the adult cohort.\n",
"# NOTE(review): presumably the starting point for each prediction horizon below - confirm.\n",
"examples_ = examples"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"======\n",
"prediction time 4 hour\n",
"Exclude deaths 23499\n",
"Exclude discharges 23401\n",
"---\n",
"Outcome ARF\n",
"Exclude onset 15873\n",
"---\n",
"Outcome Shock\n",
"Exclude onset 19342\n",
"======\n",
"prediction time 12 hour\n",
"Exclude deaths 23319\n",
"Exclude discharges 23060\n",
"---\n",
"Outcome ARF\n",
"Exclude onset 14174\n",
"---\n",
"Outcome Shock\n",
"Exclude onset 17588\n"
]
}
],
"source": [
"for T in [4, 12]:\n",
" print('======')\n",
" print('prediction time', T, 'hour')\n",
"\n",
" # Remove died before cutoff hour\n",
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import yaml\n",
"\n",
"# Read data_path from the project configuration; the context manager closes\n",
"# the file handle as soon as parsing is done instead of leaving it open.\n",
"with open('../config.yaml') as f:\n",
"    data_path = yaml.full_load(f)['data_path']\n",
"\n",
"from matplotlib import pyplot as plt\n",
"import matplotlib"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Global matplotlib defaults for the figures created after this cell.\n",
"matplotlib.rcParams.update({\n",
"    'figure.figsize': [8, 8],\n",
"    'font.size': 15,\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def visualize_labels(df, task, max_hour=100):\n",
"    \"\"\"Plot, save and show a histogram of the onset hours of one outcome.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Must contain the column '<task>_ONSET_HOUR'.\n",
"    task : str\n",
"        Outcome name; used for the column name, the x-axis label and the\n",
"        output file name.\n",
"    max_hour : float, optional\n",
"        Right end of both the histogram bins and the x-axis (default 100).\n",
"\n",
"    Side effects\n",
"    ------------\n",
"    Writes 'Onset_<task>-histogram.png' (300 dpi) to the current working\n",
"    directory and displays the figure.\n",
"    \"\"\"\n",
"    bin_width = 0.5\n",
"    # The bin edges must reach the right end of the visible axis: values beyond\n",
"    # the last edge are left out of the histogram without any warning, which\n",
"    # would make the tail of the plot look empty even when onsets occur there.\n",
"    bins = np.arange(-5, max_hour + bin_width, bin_width)\n",
"    df['{}_ONSET_HOUR'.format(task)].plot.hist(bins=bins, alpha=0.9)\n",
"    plt.xlim(-4, max_hour)\n",
"    plt.xlabel('{} onset hour'.format(task))\n",
"    plt.savefig('Onset_{}-histogram.png'.format(task), dpi=300)\n",
"    plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Label tables for both outcomes, indexed by ICUSTAY_ID.\n",
"df_ARF, df_Shock = (\n",
"    pd.read_csv(data_path + 'labels/{}.csv'.format(name), index_col='ICUSTAY_ID')\n",
"    for name in ('ARF', 'Shock')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 576x576 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 576x576 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"visualize_labels(df_ARF, 'ARF')\n",
"visualize_labels(df_Shock, 'Shock')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from datetime import timedelta\n",
"cutoff_h = 4\n",
"# Read the same '../config.yaml' that the first cell of this notebook uses for\n",
"# data_path (a bare 'config.yaml' resolves to a different location), and close\n",
"# the file once it has been parsed.\n",
"with open('../config.yaml') as f:\n",
"    mimic3_path = yaml.full_load(f)['mimic3_path']"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# MIMIC-III tables, restricted to the columns used below.\n",
"patients = pd.read_csv(\n",
"    mimic3_path + 'PATIENTS.csv',\n",
"    usecols=['SUBJECT_ID', 'DOB', 'DOD'],\n",
"    parse_dates=['DOB', 'DOD'],\n",
")\n",
"admissions = pd.read_csv(\n",
"    mimic3_path + 'ADMISSIONS.csv',\n",
"    usecols=['SUBJECT_ID', 'HADM_ID', 'DEATHTIME'],\n",
"    parse_dates=['DEATHTIME'],\n",
")\n",
"# ICU stays (only Metavision), ordered by ICUSTAY_ID.\n",
"examples = (\n",
"    pd.read_csv(data_path + 'prep/icustays_MV.csv', parse_dates=['INTIME', 'OUTTIME'])\n",
"    .sort_values(by='ICUSTAY_ID')\n",
")\n",
"\n",
"# Left joins: every ICU stay row is kept and gains the patient/admission columns.\n",
"examples = examples.merge(patients, on='SUBJECT_ID', how='left')\n",
"examples = examples.merge(admissions, on=['SUBJECT_ID', 'HADM_ID'], how='left')\n",
"\n",
"# Age at ICU admission in years (seconds -> hours -> days -> years).\n",
"# NOTE(review): computed row by row; presumably because a vectorised datetime\n",
"# subtraction can overflow for very old (date-shifted) DOBs - confirm before vectorising.\n",
"examples['AGE'] = (\n",
"    examples.apply(lambda row: (row['INTIME'] - row['DOB']).total_seconds(), axis=1)\n",
"    / 3600 / 24 / 365.25\n",
")\n",
"\n",
"examples['LOS'] = examples['LOS'] * 24 # Convert to hours"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>LOS</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>23620.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>3.593499</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>4.971162</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.000400</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>1.151600</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>1.996050</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>3.835000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>101.739000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" LOS\n",
"count 23620.000000\n",
"mean 3.593499\n",
"std 4.971162\n",
"min 0.000400\n",
"25% 1.151600\n",
"50% 1.996050\n",
"75% 3.835000\n",
"max 101.739000"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(examples[['LOS']] / 24.).describe()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Remove non-adults: keep stays with min_age <= AGE <= max_age (both ends inclusive).\n",
"min_age = 18\n",
"max_age = np.inf # no max age\n",
"examples = examples.loc[examples['AGE'].between(min_age, max_age)]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"23593"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"examples['ICUSTAY_ID'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ICUSTAY_ID</th>\n",
" <th>DEATHTIME</th>\n",
" <th>INTIME</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>200038</td>\n",
" <td>NaT</td>\n",
" <td>2143-10-24 20:35:24</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>200040</td>\n",
" <td>NaT</td>\n",
" <td>2153-10-24 16:01:41</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>200049</td>\n",
" <td>NaT</td>\n",
" <td>2118-08-28 08:56:44</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import yaml\n",
"with open('../config.yaml') as f:\n",
" config = yaml.full_load(f)\n",
"\n",
"data_path = config['data_path']\n",
"mimic3_path = config['mimic3_path']\n",
"\n",
"import pandas as pd\n",
"import itertools\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Partition assignment of every ICU stay, indexed by ICUSTAY_ID.\n",
"icustays = pd.read_csv(data_path + 'prep/icustays_MV.csv')\n",
"partition = icustays.set_index('ICUSTAY_ID')[['partition']]\n",
"tasks = ['ARF', 'Shock']\n",
"Ts = [4, 12]\n",
"\n",
"# One single-column label frame per (task, T), read from population/<task>_<T>h.csv.\n",
"populations = {}\n",
"for task, T in itertools.product(tasks, Ts):\n",
"    pop = pd.read_csv(data_path + 'population/{}_{}h.csv'.format(task, T))\n",
"    populations[task, T] = pop.set_index('ICUSTAY_ID')[['{}_LABEL'.format(task)]]\n",
"\n",
"# The mortality population has a fixed file name and uses 'ID' as its key column,\n",
"# so nothing needs to be formatted into the path or the label column name.\n",
"populations['mortality', 48] = (\n",
"    pd.read_csv(data_path + 'population/pop.mortality_benchmark.csv')\n",
"    .set_index('ID')[['mortality_LABEL']]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# One summary row per (task, T): overall size and mean label, followed by the\n",
"# size and mean label of the train / val / test partitions.\n",
"df_out = []\n",
"for (task, T), labels in populations.items():\n",
"    label_col = '{}_LABEL'.format(task)\n",
"    df = labels.join(partition)\n",
"    c = Counter(df['partition'])\n",
"    frac = df.groupby('partition')[label_col].mean()\n",
"    row = [task, T, len(df), df[label_col].mean()]\n",
"    for split in ('train', 'val', 'test'):\n",
"        row += [c[split], frac[split]]\n",
"    df_out.append(row)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Turn the collected rows into a table; df_out is rebound from the list of rows\n",
"# to the resulting DataFrame.\n",
"df_out = pd.DataFrame(\n",
"    df_out,\n",
"    columns=[\n",
"        'task', 'T',\n",
"        'TOTAL_N', 'TOTAL_%',\n",
"        'train_N', 'train_%',\n",
"        'val_N', 'val_%',\n",
"        'test_N', 'test_%',\n",
"    ],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>task</th>\n",
" <th>T</th>\n",
" <th>TOTAL_N</th>\n",
" <th>TOTAL_%</th>\n",
" <th>train_N</th>\n",
" <th>train_%</th>\n",
" <th>val_N</th>\n",
" <th>val_%</th>\n",
" <th>test_N</th>\n",
" <th>test_%</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ARF</td>\n",
" <td>4</td>\n",
" <td>15873</td>\n",
" <td>0.182700</td>\n",
" <td>11147</td>\n",
" <td>0.182291</td>\n",
" <td>2368</td>\n",
" <td>0.180743</td>\n",
" <td>2358</td>\n",
" <td>0.186599</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ARF</td>\n",
" <td>12</td>\n",
" <td>14174</td>\n",
" <td>0.096515</td>\n",
" <td>9971</td>\n",
" <td>0.097282</td>\n",
" <td>2110</td>\n",
" <td>0.093365</td>\n",
" <td>2093</td>\n",
" <td>0.096034</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Shock</td>\n",
" <td>4</td>\n",
" <td>19342</td>\n",
" <td>0.149416</td>\n",
" <td>13613</td>\n",
" <td>0.148241</td>\n",
" <td>2862</td>\n",
" <td>0.157582</td>\n",
" <td>2867</td>\n",
" <td>0.146843</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Shock</td>\n",
" <td>12</td>\n",
" <td>17588</td>\n",
" <td>0.077553</td>\n",
" <td>12381</td>\n",
" <td>0.075923</td>\n",
" <td>2595</td>\n",
" <td>0.084008</td>\n",