Commit 5432c2f3 authored by Shengpu Tang (tangsp)'s avatar Shengpu Tang (tangsp)
Browse files

add eicu experiments

parent 65d67c21
This diff is collapsed.
# Locations of the raw eICU v2.0 CSV tables and of the extraction output.
eicu_path = '/scratch/wiensj_root/wiensj/shared_data/datasets/eicu-2.0/'
save_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/extracted/'

import pandas as pd
import numpy as np
from tqdm import tqdm

# Known row count per table; used only to size the tqdm progress bar.
config = {'n_rows': {'medication': 7_301_853}}
def _read_events(fname, t_cols, chunksize):
    """Yield successive DataFrame chunks of ``<fname>.csv`` with a progress bar.

    Arguments:
        - fname: table name without extension (e.g. 'medication')
        - t_cols: list of column names pandas should parse as datetimes
        - chunksize: number of rows per yielded chunk
    """
    total_chunks = config['n_rows'][fname] // chunksize + 1
    reader = pd.read_csv(eicu_path + '{}.csv'.format(fname),
                         parse_dates=t_cols, chunksize=chunksize)
    with tqdm(desc=fname, total=total_chunks) as pbar:
        for chunk in reader:
            pbar.update()
            yield chunk
# Extract the medication table into long format:
# one row per (ICU stay, time offset, drug, property).
fname = 'medication'
df_M = []
for i, df in enumerate(_read_events(fname, [], chunksize=100000)):
    # Drop rows where both the drug name and the HICL sequence number are
    # missing; without either identifier the record is unusable.
    df['drughiclseqno'] = df['drughiclseqno'].astype('Int64')
    df = df.dropna(subset=['drugname', 'drughiclseqno'], how='all')
    # Combine drug name and HICL id into a single "name|id" identifier.
    df.loc[:, 'drugnameid'] = df[['drugname', 'drughiclseqno']].apply(
        lambda x: '{}|{}'.format(x[0], x[1]), axis=1)
    df = df.rename(columns={'patientunitstayid': 'ID', 'drugstartoffset': 't'})
    # Pivot the three recorded properties into long format.
    df = df.set_index([
        'ID', 't', 'drugnameid'
    ])[['dosage', 'routeadmin', 'frequency']]
    df.columns.name = 'property'
    df = df.stack()
    df.name = 'variable_value'
    df = df.reset_index()
    df['variable_name'] = df[['drugnameid', 'property']].apply(lambda x: '|'.join(x), axis=1)
    # NOTE(review): errors='ignore' keeps non-numeric values unchanged; it is
    # deprecated in recent pandas versions — confirm before upgrading.
    df['variable_value'] = pd.to_numeric(df['variable_value'], errors='ignore')
    df = df[['ID', 't', 'variable_name', 'variable_value']]
    df = df.reset_index(drop=True)
    df_M.append(df)

df_out = pd.concat(df_M, ignore_index=True)
try:
    df_out.to_parquet(save_path + '{}.parquet'.format(fname), index=False)
except Exception:
    # BUGFIX: was a bare `except:`, which also swallows KeyboardInterrupt /
    # SystemExit. Parquet can fail on mixed-type object columns; fall back
    # to pickle in that case.
    df_out.to_pickle(save_path + '{}.pickle'.format(fname))
# Locations of the raw eICU v2.0 CSV tables and of the extraction output.
eicu_path = '/scratch/wiensj_root/wiensj/shared_data/datasets/eicu-2.0/'
save_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/extracted/'

import pandas as pd
import numpy as np
from tqdm import tqdm

# Known row count per table; used only to size the tqdm progress bar.
config = {'n_rows': {'nurseCharting': 151_604_232}}
def _read_events(fname, t_cols, chunksize):
    """Yield successive DataFrame chunks of ``<fname>.csv`` with a progress bar.

    Arguments:
        - fname: table name without extension (e.g. 'nurseCharting')
        - t_cols: list of column names pandas should parse as datetimes
        - chunksize: number of rows per yielded chunk
    """
    total_chunks = config['n_rows'][fname] // chunksize + 1
    reader = pd.read_csv(eicu_path + '{}.csv'.format(fname),
                         parse_dates=t_cols, chunksize=chunksize)
    with tqdm(desc=fname, total=total_chunks) as pbar:
        for chunk in reader:
            pbar.update()
            yield chunk
# Extract the (very large) nurseCharting table into long format, flushing to
# disk every 40 chunks to bound memory use.
fname = 'nurseCharting'
df_NC = []
for i, df in enumerate(_read_events(fname, [], chunksize=1000000)):
    df = df.drop(columns=[
        'nursingchartid',
        'nursingchartentryoffset',
    ])
    df = df.rename(columns={
        'patientunitstayid': 'ID',
        'nursingchartoffset': 't',
    })
    # Variable name is the concatenation "category|label|name".
    df['variable_name'] = df[[
        'nursingchartcelltypecat', 'nursingchartcelltypevallabel',
        'nursingchartcelltypevalname'
    ]].apply(lambda x: '|'.join(x), axis=1)
    # NOTE(review): errors='ignore' keeps non-numeric values unchanged; it is
    # deprecated in recent pandas versions — confirm before upgrading.
    df['variable_value'] = pd.to_numeric(df['nursingchartvalue'], errors='ignore')
    df = df[['ID', 't', 'variable_name', 'variable_value']]
    df = df.reset_index(drop=True)
    df_NC.append(df)
    # Periodic flush: write one output file per group of 40 chunks.
    if i % 40 == 39:
        df_out = pd.concat(df_NC, ignore_index=True)
        try:
            # BUGFIX: was `data_path`, which is never defined in this script
            # (only `eicu_path` and `save_path` exist) and raised a NameError
            # on the first periodic flush.
            df_out.to_parquet(save_path + '{}_{}.parquet'.format(fname, int(i//40)), index=False)
        except Exception:
            # BUGFIX: was a bare `except:`; fall back to pickle only on a
            # genuine serialization failure.
            df_out.to_pickle(save_path + '{}_{}.pickle'.format(fname, int(i//40)))
        df_NC = []

# Final flush for any trailing partial group. Guard against an empty list:
# if the chunk count is an exact multiple of 40, everything was already
# written inside the loop and `pd.concat([])` would raise.
if df_NC:
    df_out = pd.concat(df_NC, ignore_index=True)
    try:
        df_out.to_parquet(save_path + '{}_{}.parquet'.format(fname, int(i//40)), index=False)
    except Exception:
        df_out.to_pickle(save_path + '{}_{}.pickle'.format(fname, int(i//40)))
\ No newline at end of file
This diff is collapsed.
# Usage:
#   python extract_pivoted.py vitalPeriodic
#   python extract_pivoted.py vitalAperiodic
eicu_path = '/scratch/wiensj_root/wiensj/shared_data/datasets/eicu-2.0/'
save_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/extracted/'

import pandas as pd
import numpy as np
from tqdm import tqdm
import argparse

# The table to extract is the single positional command-line argument.
parser = argparse.ArgumentParser()
parser.add_argument('filename')
args = parser.parse_args()
fname = args.filename

# Known row counts per table; used only to size the tqdm progress bar.
config = {
    'n_rows': {
        'vitalPeriodic': 146_671_642,
        'vitalAperiodic': 25_075_074,
    }
}
def _read_events(fname, t_cols, chunksize):
    """Yield successive DataFrame chunks of ``<fname>.csv`` with a progress bar.

    Arguments:
        - fname: table name without extension (e.g. 'vitalPeriodic')
        - t_cols: list of column names pandas should parse as datetimes
        - chunksize: number of rows per yielded chunk
    """
    total_chunks = config['n_rows'][fname] // chunksize + 1
    reader = pd.read_csv(eicu_path + '{}.csv'.format(fname),
                         parse_dates=t_cols, chunksize=chunksize)
    with tqdm(desc=fname, total=total_chunks) as pbar:
        for chunk in reader:
            pbar.update()
            yield chunk
# Unpivot the wide vital-sign table into long format, then de-duplicate by
# taking the median value per (stay, time, variable).
df_V = []
for i, df in enumerate(_read_events(fname, [], chunksize=1000000)):
    # Skip the first column (row id) and key the rest by stay id and offset.
    chunk_long = df.iloc[:, 1:].set_index(['patientunitstayid', 'observationoffset'])
    chunk_long.columns.name = 'variable_name'
    chunk_long = chunk_long.stack()
    chunk_long.name = 'variable_value'
    df_V.append(chunk_long.reset_index())
    # Checkpoint: periodically rewrite everything accumulated so far.
    if i % 20 == 0:
        pd.concat(df_V, ignore_index=True).to_parquet(
            save_path + '{}.parquet'.format(fname), index=False)

df_out = pd.concat(df_V, ignore_index=True)
df_out.columns = ['ID', 't', 'variable_name', 'variable_value']
# Drop duplicates and keep the median value per (ID, t, variable_name).
df_out = df_out.groupby(['ID', 't', 'variable_name']).median().reset_index()
df_out.to_parquet(save_path + '{}.parquet'.format(fname), index=False)
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"eicu_path = '/scratch/wiensj_root/wiensj/shared_data/datasets/eicu-2.0/'\n",
"save_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/extracted/'\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from tqdm import tqdm\n",
"pd.options.display.max_columns = 100"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## RespiratoryCare"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/sw/arcts/centos7/python3.7-anaconda/2019.07/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3057: DtypeWarning: Columns (4,5,6,12,26,27,28,30,31,32,33) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" interactivity=interactivity, compiler=compiler, result=result)\n"
]
}
],
"source": [
"df_R = pd.read_csv(eicu_path + '{}.csv'.format('respiratoryCare'))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"for c in ['respcarestatusoffset', 'ventstartoffset', \n",
" 'ventendoffset', 'priorventstartoffset', \n",
" 'priorventendoffset']:\n",
" df_R[c] = df_R[c].replace({0: np.nan})"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['respcareid', 'patientunitstayid', 'respcarestatusoffset',\n",
" 'currenthistoryseqnum', 'airwaytype', 'airwaysize', 'airwayposition',\n",
" 'cuffpressure', 'ventstartoffset', 'ventendoffset',\n",
" 'priorventstartoffset', 'priorventendoffset', 'apneaparms',\n",
" 'lowexhmvlimit', 'hiexhmvlimit', 'lowexhtvlimit', 'hipeakpreslimit',\n",
" 'lowpeakpreslimit', 'hirespratelimit', 'lowrespratelimit',\n",
" 'sighpreslimit', 'lowironoxlimit', 'highironoxlimit',\n",
" 'meanairwaypreslimit', 'peeplimit', 'cpaplimit', 'setapneainterval',\n",
" 'setapneatv', 'setapneaippeephigh', 'setapnearr', 'setapneapeakflow',\n",
" 'setapneainsptime', 'setapneaie', 'setapneafio2'],\n",
" dtype='object')"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_R.columns"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"df_R1 = df_R.drop(columns=[\n",
" 'respcareid', 'currenthistoryseqnum', \n",
" 'ventendoffset', \n",
" 'priorventendoffset', \n",
"])\n",
"df_R1 = df_R1.rename(columns={\n",
" 'patientunitstayid': 'ID',\n",
" 'respcarestatusoffset': 't',\n",
"})\n",
"df_R1 = df_R1.set_index(['ID', 't'])\n",
"df_R1.columns.name = 'variable_name'\n",
"df_R1 = df_R1.stack()\n",
"df_R1.name = 'variable_value'\n",
"df_R1 = df_R1.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# need to make sure ventStartOffset is before the recording time\n",
"df_R1[df_R1['variable_name'] == 'ventstartoffset'][['t', 'variable_value']] \\\n",
" .apply(lambda x: x[0] >= x[1], axis=1).mean()\n",
"\n",
"# remove rows with ventStartOffset before the recording time\n",
"df_R2 = df_R1.loc[\n",
" df_R1.apply(lambda x: x[2] != 'ventstartoffset' or (x[2] == 'ventstartoffset' and x[1] >= x[3]), axis=1)\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"df_R2.to_pickle(save_path + '{}.pickle'.format('respiratoryCare'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## IntakeOutput"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"df_IO = pd.read_csv(eicu_path + '{}.csv'.format('intakeOutput'))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df_IO = df_IO.drop(columns=['intakeoutputid', 'intakeoutputentryoffset', 'celllabel']) \\\n",
" .rename(columns={\n",
" 'patientunitstayid': 'ID', 'intakeoutputoffset': 't', \n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df_IO1 = df_IO[['ID', 't', 'intaketotal', 'outputtotal', 'dialysistotal', 'nettotal']]\n",
"df_IO1 = df_IO1.set_index(['ID', 't'])\n",
"df_IO1.columns.name = 'variable_name'\n",
"df_IO1 = df_IO1.stack()\n",
"df_IO1.name = 'variable_value'\n",
"df_IO1 = df_IO1.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"df_IO2 = df_IO[['ID', 't', 'cellpath', 'cellvaluenumeric', 'cellvaluetext']] \\\n",
" .rename(columns={'cellpath': 'variable_name'})\n",
"df_IO2['variable_value'] = pd.to_numeric(df_IO2['cellvaluetext'], errors='ignore')\n",
"df_IO2 = df_IO2[['ID', 't', 'variable_name', 'variable_value']]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"del df_IO\n",
"df_IOo = pd.concat([df_IO1, df_IO2])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"df_IOo.to_parquet(save_path + 'intakeOutput.parquet')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
This diff is collapsed.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"eicu_path = '/scratch/wiensj_root/wiensj/shared_data/datasets/eicu-2.0/'\n",
"data_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/'\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"examples = pd.read_csv(data_path + 'extracted/icustays.csv')\n",
"labels = pd.read_csv(data_path + 'labels/ARF.csv')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"original population: 200859\n"
]
}
],
"source": [
"print(\"original population: \", examples['ICUStayID'].nunique() )"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Exclude non-adults 200234\n"
]
}
],
"source": [
"# Remove non-adults\n",
"min_age = 18\n",
"max_age = np.inf # no max age\n",
"examples = examples[(examples.age >= min_age) & (examples.age <= max_age)]\n",
"print('Exclude non-adults', examples['ICUStayID'].nunique())"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"hospitals = sorted(examples['hospitalid'].unique())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"208"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(hospitals)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"df_R = pd.read_pickle(data_path + 'extracted/respiratoryCare.pickle')\n",
"df_RC = pd.read_pickle(data_path + 'extracted/respiratoryCharting.pickle')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"IDs = sorted(set(\n",
" list(df_R['ID'].unique()) + \\\n",
" list(df_RC['ID'].unique())\n",
"))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"127203"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(IDs)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"hospitals_ARF = sorted(examples.loc[examples['ICUStayID'].isin(IDs), 'hospitalid'].unique())"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"191"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(hospitals_ARF) # hospitals to keep"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},