Commit 1f8e076f authored by Shengpu Tang (tangsp)'s avatar Shengpu Tang (tangsp)
Browse files

remove experiments code

parent 56f2f65b
# Location of the raw eICU v2.0 CSV tables (read-only input).
eicu_path = '/scratch/wiensj_root/wiensj/shared_data/datasets/eicu-2.0/'
# Destination directory for the extracted long-format tables.
save_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/extracted/'
import pandas as pd
import numpy as np
from tqdm import tqdm
# Known total row counts per table; used only to size the tqdm progress bar
# in _read_events (an over/under-estimate would not affect correctness).
config = {
'n_rows': {
'medication': 7_301_853,
}
}
def _read_events(fname, t_cols, chunksize):
    """Yield successive DataFrame chunks of ``<eicu_path>/<fname>.csv``.

    Arguments:
    - fname: table name without the .csv extension (e.g. 'medication')
    - t_cols: list of column names to parse as dates
    - chunksize: number of rows per yielded chunk
    """
    # Expected number of chunks, so tqdm can render a proper progress bar.
    total_chunks = config['n_rows'][fname] // chunksize + 1
    with tqdm(desc=fname, total=total_chunks) as progress:
        reader = pd.read_csv(eicu_path + '{}.csv'.format(fname),
                             parse_dates=t_cols, chunksize=chunksize)
        for chunk in reader:
            progress.update()
            yield chunk
# --- medication table: reshape to long (ID, t, variable_name, variable_value) ---
fname = 'medication'
df_M = []
for i, df in enumerate(_read_events(fname, [], chunksize=100000)):
    # Drop rows missing BOTH drug name and drug HICL seqno (how='all');
    # rows with at least one identifier are kept.
    df['drughiclseqno'] = df['drughiclseqno'].astype('Int64')
    df = df.dropna(subset=['drugname', 'drughiclseqno'], how='all')
    # Combine drug name and HICL seqno into a single identifier "name|seqno".
    # Access by column name (not position) — positional Series indexing inside
    # apply(axis=1) is deprecated in recent pandas.
    df.loc[:, 'drugnameid'] = df[['drugname', 'drughiclseqno']].apply(
        lambda x: '{}|{}'.format(x['drugname'], x['drughiclseqno']), axis=1)
    df = df.rename(columns={'patientunitstayid': 'ID', 'drugstartoffset': 't'})
    # Melt the three property columns into (ID, t, drugnameid, property) rows.
    df = df.set_index([
        'ID', 't', 'drugnameid'
    ])[['dosage', 'routeadmin', 'frequency']]
    df.columns.name = 'property'
    df = df.stack()
    df.name = 'variable_value'
    df = df.reset_index()
    # Final variable name is "drugnameid|property".
    df['variable_name'] = df[['drugnameid', 'property']].apply(lambda x: '|'.join(x), axis=1)
    df['variable_value'] = pd.to_numeric(df['variable_value'], errors='ignore')
    df = df[['ID', 't', 'variable_name', 'variable_value']]
    df = df.reset_index(drop=True)
    df_M.append(df)

df_out = pd.concat(df_M, ignore_index=True)
try:
    df_out.to_parquet(save_path + '{}.parquet'.format(fname), index=False)
except Exception:
    # Narrowed from a bare `except:`; parquet can fail on mixed-type object
    # columns or a missing engine — fall back to pickle rather than lose work.
    df_out.to_pickle(save_path + '{}.pickle'.format(fname))
# Location of the raw eICU v2.0 CSV tables (read-only input).
eicu_path = '/scratch/wiensj_root/wiensj/shared_data/datasets/eicu-2.0/'
# Destination directory for the extracted long-format tables.
save_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/extracted/'
import pandas as pd
import numpy as np
from tqdm import tqdm
# Known total row count for nurseCharting; used only for the tqdm bar size.
config = {
'n_rows': {
'nurseCharting': 151_604_232,
}
}
def _read_events(fname, t_cols, chunksize):
    """Yield successive DataFrame chunks of ``<eicu_path>/<fname>.csv``.

    Arguments:
    - fname: table name without the .csv extension (e.g. 'nurseCharting')
    - t_cols: list of column names to parse as dates
    - chunksize: number of rows per yielded chunk
    """
    # Expected chunk count, so tqdm shows a bounded progress bar.
    expected = config['n_rows'][fname] // chunksize + 1
    with tqdm(desc=fname, total=expected) as bar:
        csv_iter = pd.read_csv(eicu_path + '{}.csv'.format(fname),
                               parse_dates=t_cols, chunksize=chunksize)
        for piece in csv_iter:
            bar.update()
            yield piece
# --- nurseCharting table: reshape to long format, written in 40-chunk partitions ---
fname = 'nurseCharting'
df_NC = []
for i, df in enumerate(_read_events(fname, [], chunksize=1000000)):
    df = df.drop(columns=[
        'nursingchartid',
        'nursingchartentryoffset',
    ])
    df = df.rename(columns={
        'patientunitstayid': 'ID',
        'nursingchartoffset': 't',
    })
    # Variable name = "category|label|value-name" of the charted cell.
    df['variable_name'] = df[[
        'nursingchartcelltypecat', 'nursingchartcelltypevallabel',
        'nursingchartcelltypevalname'
    ]].apply(lambda x: '|'.join(x), axis=1)
    df['variable_value'] = pd.to_numeric(df['nursingchartvalue'], errors='ignore')
    df = df[['ID', 't', 'variable_name', 'variable_value']]
    df = df.reset_index(drop=True)
    df_NC.append(df)
    # Flush a partition file every 40 chunks to bound memory usage.
    if i % 40 == 39:
        df_out = pd.concat(df_NC, ignore_index=True)
        try:
            # BUG FIX: original wrote to undefined name `data_path` (NameError
            # on the first flush); every other write here uses `save_path`.
            df_out.to_parquet(save_path + '{}_{}.parquet'.format(fname, int(i//40)), index=False)
        except Exception:
            df_out.to_pickle(save_path + '{}_{}.pickle'.format(fname, int(i//40)))
        df_NC = []

# Flush the remaining chunks, if any. Guard: pd.concat([]) raises ValueError
# when the total chunk count is an exact multiple of 40.
if df_NC:
    df_out = pd.concat(df_NC, ignore_index=True)
    try:
        df_out.to_parquet(save_path + '{}_{}.parquet'.format(fname, int(i//40)), index=False)
    except Exception:
        df_out.to_pickle(save_path + '{}_{}.pickle'.format(fname, int(i//40)))
\ No newline at end of file
# Usage:
# python extract_pivoted.py vitalPeriodic
# python extract_pivoted.py vitalAperiodic
# Location of the raw eICU v2.0 CSV tables (read-only input).
eicu_path = '/scratch/wiensj_root/wiensj/shared_data/datasets/eicu-2.0/'
# Destination directory for the extracted long-format tables.
save_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/extracted/'
import pandas as pd
import numpy as np
from tqdm import tqdm
import argparse
# The table to extract is chosen on the command line (see usage above).
parser = argparse.ArgumentParser()
parser.add_argument('filename')
args = parser.parse_args()
fname = args.filename
# Known total row counts per table; used only for the tqdm bar size.
config = {
'n_rows': {
'vitalPeriodic': 146_671_642,
'vitalAperiodic': 25_075_074,
}
}
def _read_events(fname, t_cols, chunksize):
    """Yield successive DataFrame chunks of ``<eicu_path>/<fname>.csv``.

    Arguments:
    - fname: table name without the .csv extension (e.g. 'vitalPeriodic')
    - t_cols: list of column names to parse as dates
    - chunksize: number of rows per yielded chunk
    """
    # Number of chunks we expect, so the progress bar has a fixed total.
    n_chunks = config['n_rows'][fname] // chunksize + 1
    with tqdm(desc=fname, total=n_chunks) as pbar:
        for frame in pd.read_csv(eicu_path + '{}.csv'.format(fname),
                                 parse_dates=t_cols,
                                 chunksize=chunksize):
            pbar.update()
            yield frame
# Melt the wide pivoted vitals table into long format, one row per
# (stay, time, vital-sign) observation.
df_V = []
for i, df in enumerate(_read_events(fname, [], chunksize=1000000)):
    # Drop the first (id) column, index by stay id and observation offset,
    # then stack the remaining vital-sign columns into rows.
    df = df.iloc[:,1:].set_index(['patientunitstayid', 'observationoffset'])
    df.columns.name = 'variable_name'
    df = df.stack()
    df.name = 'variable_value'
    df = df.reset_index()
    df_V.append(df)
    # Periodic checkpoint: re-concatenates everything so far and overwrites
    # the SAME output file each time (no partition index in the name).
    # NOTE(review): fires on the FIRST chunk (i == 0) and every 20th after;
    # the sibling nurseCharting script flushes with `i % 40 == 39` — confirm
    # whether `i % 20 == 19` was intended here.
    if i % 20 == 0:
        df_out = pd.concat(df_V, ignore_index=True)
        df_out.to_parquet(save_path + '{}.parquet'.format(fname), index=False)
# Final write: rename to the canonical schema and collapse duplicate
# (ID, t, variable_name) readings to their median.
df_out = pd.concat(df_V, ignore_index=True)
df_out.columns = ['ID', 't', 'variable_name', 'variable_value']
df_out = df_out.groupby(['ID', 't', 'variable_name']).median().reset_index() # Drop duplicates and keep the median value
df_out.to_parquet(save_path + '{}.parquet'.format(fname), index=False)
%% Cell type:code id: tags:
``` python
# Location of the raw eICU v2.0 CSV tables (read-only input).
eicu_path = '/scratch/wiensj_root/wiensj/shared_data/datasets/eicu-2.0/'
# Destination directory for the extracted tables.
save_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/extracted/'
import pandas as pd
import numpy as np
from tqdm import tqdm
# Show up to 100 columns when displaying DataFrames in the notebook.
pd.options.display.max_columns = 100
```
%% Cell type:markdown id: tags:
## RespiratoryCare
%% Cell type:code id: tags:
``` python
# Load the full respiratoryCare table (emits a DtypeWarning for mixed-type columns).
df_R = pd.read_csv(eicu_path + '{}.csv'.format('respiratoryCare'))
```
%% Output
/sw/arcts/centos7/python3.7-anaconda/2019.07/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3057: DtypeWarning: Columns (4,5,6,12,26,27,28,30,31,32,33) have mixed types. Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
%% Cell type:code id: tags:
``` python
# Map offsets recorded as 0 to NaN in every offset column
# (0 appears to encode "not recorded" for these fields).
offset_columns = [
    'respcarestatusoffset', 'ventstartoffset',
    'ventendoffset', 'priorventstartoffset',
    'priorventendoffset',
]
for col in offset_columns:
    df_R[col] = df_R[col].replace({0: np.nan})
```
%% Cell type:code id: tags:
``` python
# Inspect the available respiratoryCare columns (output shown below).
df_R.columns
```
%% Output
Index(['respcareid', 'patientunitstayid', 'respcarestatusoffset',
'currenthistoryseqnum', 'airwaytype', 'airwaysize', 'airwayposition',
'cuffpressure', 'ventstartoffset', 'ventendoffset',
'priorventstartoffset', 'priorventendoffset', 'apneaparms',
'lowexhmvlimit', 'hiexhmvlimit', 'lowexhtvlimit', 'hipeakpreslimit',
'lowpeakpreslimit', 'hirespratelimit', 'lowrespratelimit',
'sighpreslimit', 'lowironoxlimit', 'highironoxlimit',
'meanairwaypreslimit', 'peeplimit', 'cpaplimit', 'setapneainterval',
'setapneatv', 'setapneaippeephigh', 'setapnearr', 'setapneapeakflow',
'setapneainsptime', 'setapneaie', 'setapneafio2'],
dtype='object')
%% Cell type:code id: tags:
``` python
# Drop ids/seqnums and the end-offset columns, rename keys, then melt the
# remaining columns into long (ID, t, variable_name, variable_value) rows.
df_R1 = (
    df_R
    .drop(columns=['respcareid', 'currenthistoryseqnum',
                   'ventendoffset', 'priorventendoffset'])
    .rename(columns={'patientunitstayid': 'ID',
                     'respcarestatusoffset': 't'})
    .set_index(['ID', 't'])
)
df_R1.columns.name = 'variable_name'
df_R1 = df_R1.stack()
df_R1.name = 'variable_value'
df_R1 = df_R1.reset_index()
```
%% Cell type:code id: tags:
``` python
# need to make sure ventStartOffset is before the recording time
# Fraction of 'ventstartoffset' rows where the recording time t is >= the
# stored value. NOTE(review): this expression's result is neither assigned
# nor the cell's last expression, so it is computed and discarded.
df_R1[df_R1['variable_name'] == 'ventstartoffset'][['t', 'variable_value']] \
    .apply(lambda x: x[0] >= x[1], axis=1).mean()
# remove rows with ventStartOffset before the recording time
# Positional row indexing: with columns (ID, t, variable_name, variable_value),
# x[1] is t, x[2] is variable_name, x[3] is variable_value. Keeps every row
# that is not a 'ventstartoffset' record, plus 'ventstartoffset' records
# whose t >= value.
df_R2 = df_R1.loc[
    df_R1.apply(lambda x: x[2] != 'ventstartoffset' or (x[2] == 'ventstartoffset' and x[1] >= x[3]), axis=1)
]
```
%% Cell type:code id: tags:
``` python
df_R2.to_pickle(save_path + '{}.pickle'.format('respiratoryCare'))
```
%% Cell type:markdown id: tags:
## IntakeOutput
%% Cell type:code id: tags:
``` python
# Load the full intakeOutput table.
df_IO = pd.read_csv(eicu_path + '{}.csv'.format('intakeOutput'))
```
%% Cell type:code id: tags:
``` python
# Remove id/entry-offset/label columns and rename the keys to the
# canonical (ID, t) schema used across the extraction scripts.
df_IO = (
    df_IO
    .drop(columns=['intakeoutputid', 'intakeoutputentryoffset', 'celllabel'])
    .rename(columns={'patientunitstayid': 'ID',
                     'intakeoutputoffset': 't'})
)
```
%% Cell type:code id: tags:
``` python
# Melt the four aggregate total columns into long
# (ID, t, variable_name, variable_value) rows.
total_cols = ['intaketotal', 'outputtotal', 'dialysistotal', 'nettotal']
df_IO1 = df_IO[['ID', 't'] + total_cols].set_index(['ID', 't'])
df_IO1.columns.name = 'variable_name'
stacked = df_IO1.stack()
stacked.name = 'variable_value'
df_IO1 = stacked.reset_index()
```
%% Cell type:code id: tags:
``` python
# Per-cell entries: the cellpath string becomes the variable name.
# NOTE(review): 'cellvaluenumeric' is selected here but dropped from the
# final column selection below — the value comes from 'cellvaluetext' only.
df_IO2 = df_IO[['ID', 't', 'cellpath', 'cellvaluenumeric', 'cellvaluetext']] \
    .rename(columns={'cellpath': 'variable_name'})
# Convert to numeric where possible; errors='ignore' leaves unparseable text
# unchanged. NOTE(review): errors='ignore' is deprecated in recent pandas.
df_IO2['variable_value'] = pd.to_numeric(df_IO2['cellvaluetext'], errors='ignore')
df_IO2 = df_IO2[['ID', 't', 'variable_name', 'variable_value']]
```
%% Cell type:code id: tags:
``` python
# Free the wide table before concatenating the two long-format pieces.
del df_IO
df_IOo = pd.concat([df_IO1, df_IO2])
```
%% Cell type:code id: tags:
``` python
# Persist the combined intakeOutput long-format table.
df_IOo.to_parquet(save_path + 'intakeOutput.parquet')
```
%% Cell type:code id: tags:
``` python
```