Commit 2a4ada9a authored by Shengpu Tang (tangsp)

v0.2.0

parent 42f93a09
data/*
**output**/
.ipynb_checkpoints
*.png
......
FROM python:3.8
WORKDIR /workdir
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY FIDDLE/ ./FIDDLE/
......@@ -5,10 +5,17 @@ column_names:
var_name: variable_name
var_value: variable_value
use_ordinal_encoding: no
parallel: yes
n_jobs: 72
batch_size: 100
hierarchical_sep: ":"
hierarchical_levels: [0, 1, 2]
discretize: yes
use_ordinal_encoding: no
discretization: ~
value_types:
# enter the feature type that you would like to override in the following format:
FIRST_WARDID: Categorical
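For reference, a custom config file only needs to list the keys being overridden; as shown in `load_config` below, top-level keys in the custom file replace the corresponding defaults (a shallow merge). A hypothetical `my_config.yaml` might look like this (file name and values are illustrative, not part of the repo):

```yaml
# Hypothetical override file: each top-level key listed here replaces the
# corresponding entry from FIDDLE/config-default.yaml.
parallel: yes
n_jobs: 8
batch_size: 100
use_ordinal_encoding: no
value_types:
  FIRST_WARDID: Categorical
```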
......
import os, yaml
with open(os.path.join(os.path.dirname(__file__), 'config.yaml')) as f:
config = yaml.full_load(f)
import copy
ID_col = config['column_names']['ID']
var_col = config['column_names']['var_name']
val_col = config['column_names']['var_value']
t_col = config['column_names']['t']
hierarchical_sep = config['hierarchical_sep']
hierarchical_levels = config['hierarchical_levels']
with open(os.path.join(os.path.dirname(__file__), 'config-default.yaml')) as f:
config_default = yaml.safe_load(f)
use_ordinal_encoding = config['use_ordinal_encoding']
value_type_override = config['value_types']
def load_config(fname):
config = copy.deepcopy(config_default)
if fname:
config_custom = yaml.safe_load(open(fname, 'r'))
for k, v in config_custom.items():
config[k] = v
return config
parallel = True
n_jobs = 72
ID_col = 'ID'
t_col = 't'
var_col = 'variable_name'
val_col = 'variable_value'
if 'column_names' in config_default:
ID_col = config_default['column_names'].get('ID', 'ID')
t_col = config_default['column_names'].get('t', 't')
var_col = config_default['column_names'].get('var_name', 'variable_name')
val_col = config_default['column_names'].get('var_value', 'variable_value')
else:
pass
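As a quick illustration of the merge behavior above (a sketch, not repo code; `my_config.yaml` is hypothetical and FIDDLE v0.2.0 is assumed to be installed):

```python
# Defaults come from FIDDLE/config-default.yaml; a custom file overrides
# top-level keys only, via load_config's shallow merge.
import FIDDLE.config as FIDDLE_config

cfg_default = FIDDLE_config.load_config(None)               # pure defaults
cfg_custom = FIDDLE_config.load_config('my_config.yaml')    # defaults + overrides
print(cfg_default['column_names'])
print(cfg_custom.get('n_jobs'))
```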
import argparse
def str2bool(v):
if isinstance(v, bool):
return v
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
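For context, `str2bool` is intended to be used as an argparse `type`, mirroring how FIDDLE's own parser registers boolean flags (illustrative snippet, not repo code):

```python
# Accepts yes/no, true/false, t/f, y/n, 1/0 (case-insensitive).
parser = argparse.ArgumentParser()
parser.add_argument('--binarize', type=str2bool, default=True, nargs='?', const=True)
print(parser.parse_args(['--binarize', 'no']).binarize)   # -> False
```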
from .config import *
import pandas as pd
import numpy as np
import scipy
import sparse
from collections import defaultdict
from joblib import Parallel, delayed, parallel_backend
from tqdm import tqdm
from sklearn.feature_selection import VarianceThreshold
import sklearn
from collections import defaultdict
try:
from .config import *
except:
from config import *
def print_header(*content, char='='):
print()
print(char * 80)
......@@ -95,11 +85,11 @@ def get_unique_variables(df):
return sorted(df[var_col].unique())
def get_frequent_numeric_variables(df_time_series, variables, threshold, args):
data_path = args.data_path
output_dir = args.output_dir
df_population = args.df_population
T, dt = args.T, args.dt
df_types = pd.read_csv(data_path + 'value_types.csv').set_index(var_col)['value_type']
df_types = pd.read_csv(output_dir + 'value_types.csv').set_index(var_col)['value_type']
numeric_vars = [col for col in variables if df_types[col] == 'Numeric']
df_num_counts = calculate_variable_counts(df_time_series, df_population)[numeric_vars] #gets the count of each variable for each patient.
variables_num_freq = df_num_counts.columns[df_num_counts.mean() >= threshold * np.floor(T/dt)]
......@@ -136,23 +126,41 @@ def select_dtype(df, dtype, dtypes=None):
assert False
return
def smart_qcut_dummify(x, q, use_ordinal_encoding=False):
def compute_bin_edges(x, q):
# ignore strings when performing qcut
z = x.copy()
z = z.apply(make_float)
m = z.apply(np.isreal)
bin_edges = None
if z.loc[m].dropna().nunique() > 1: # when more than one numeric values
if use_ordinal_encoding:
bin_edges = np.nanpercentile(z.loc[m].astype(float).to_numpy(), [0, 20, 40, 60, 80, 100])
bin_edges = np.unique(bin_edges)
col_names = ['{}>={}'.format(z.name, bin_edge) for bin_edge in bin_edges[:-1]]
out = pd.DataFrame(0, z.index, col_names)
for i, bin_edge in enumerate(bin_edges[:-1]):
out.loc[m, col_names[i]] = (z.loc[m] > bin_edge).astype(int)
out = pd.concat([out, pd.get_dummies(z.where(~m, np.nan), prefix=z.name)], axis=1)
if z.loc[m].dropna().nunique() == 2:
pass
else:
bin_edges = list(np.unique(np.nanpercentile(z.loc[m].astype(float).values, np.linspace(0, 100, q+1))))
return (x.name, bin_edges)
def smart_qcut_dummify_parallel(first_arg):
return smart_qcut_dummify(*first_arg)
def smart_qcut_dummify(x, bin_edges, use_ordinal_encoding=False):
# ignore strings when performing qcut
z = x.copy()
z = z.apply(make_float)
m = z.apply(np.isreal)
if z.loc[m].dropna().nunique() > 1: # when more than one unique numeric values
if z.loc[m].dropna().nunique() == 2: # when only two unique numeric values
out = pd.get_dummies(x, prefix=x.name)
else:
z.loc[m] = pd.qcut(z.loc[m].to_numpy(), q=q, duplicates='drop')
out = pd.get_dummies(z, prefix=z.name)
if use_ordinal_encoding:
col_names = ['{}>={}'.format(z.name, bin_edge) for bin_edge in bin_edges[:-1]]
out = pd.DataFrame(0, z.index, col_names)
for i, bin_edge in enumerate(bin_edges[:-1]):
out.loc[m, col_names[i]] = (z.loc[m] >= bin_edge).astype(int)
out = pd.concat([out, pd.get_dummies(z.where(~m, np.nan), prefix=z.name)], axis=1)
else:
z.loc[m] = pd.cut(z.loc[m].to_numpy(), bin_edges, duplicates='drop', include_lowest=True)
out = pd.get_dummies(z, prefix=z.name)
else:
out = pd.get_dummies(x, prefix=x.name)
return out
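To make the two-stage discretization concrete, here is a small sketch (not from the repo) that computes percentile-based bin edges for one numeric column and then dummifies it. It assumes FIDDLE v0.2.0 and its requirements are installed so that `FIDDLE.steps` is importable, and that the function signatures are as shown above:

```python
import pandas as pd
from FIDDLE.steps import compute_bin_edges, smart_qcut_dummify

# One numeric variable; string values would be ignored for binning and
# dummified separately.
heart_rate = pd.Series([61.0, 72.0, 88.0, 95.0, 103.0, 110.0], name='HR')

name, bin_edges = compute_bin_edges(heart_rate, q=5)   # percentile-based edges
features = smart_qcut_dummify(heart_rate, bin_edges)   # one binary column per bin
print(bin_edges)
print(features.columns.tolist())
```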
......@@ -202,13 +210,13 @@ def pivot_event_table(df):
# Handle cases where the same variable is recorded multiple times with the same timestamp
# Adjust the timestamps by epsilon so that all timestamps are unique
eps = 1e-6
m_dups = df.duplicated([ID_col, t_col, var_col], keep=False)
m_dups = df.duplicated([t_col, var_col], keep=False)
df_dups = df[m_dups].copy()
for v, df_v in df_dups.groupby(var_col):
df_dups.loc[df_v.index, t_col] += eps * np.arange(len(df_v))
df = pd.concat([df[~m_dups], df_dups])
assert not df.duplicated([ID_col, t_col, var_col], keep=False).any()
assert not df.duplicated([t_col, var_col], keep=False).any()
return pd.pivot_table(df, val_col, t_col, var_col, 'first')
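As an illustration of the timestamp de-duplication above (a sketch, not repo code; assumes FIDDLE v0.2.0 and its requirements are installed):

```python
import pandas as pd
from FIDDLE.steps import pivot_event_table

# Two heart-rate readings share t=0.0; the second is shifted by eps=1e-6
# so that the pivoted table keeps both values.
events = pd.DataFrame({
    't':              [0.0, 0.0, 1.0],
    'variable_name':  ['HR', 'HR', 'HR'],
    'variable_value': [72.0, 75.0, 80.0],
})
print(pivot_event_table(events))
```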
......
from .config import *
import pickle
import pandas as pd
import numpy as np
import pickle
import time
import os
import yaml
import json
import argparse
from .helpers import str2bool
parser = argparse.ArgumentParser(description='')
parser.add_argument('--T', type=float, required=True)
parser.add_argument('--dt', type=float, required=True)
parser.add_argument('--theta_1', type=float, default=0.001)
parser.add_argument('--theta_2', type=float, default=0.001)
parser.add_argument('--theta_freq', type=float, default=1.0)
parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean'])
parser.add_argument('--binarize', type=str2bool, default=True, nargs='?', const=True)
parser.add_argument('--data_path', type=str, required=True)
parser.add_argument('--input_fname', type=str, required=False)
parser.add_argument('--population', type=str, required=True)
parser.add_argument('--N', type=int, required=False)
parser.add_argument('--Ds', nargs='+', type=int)
parser.add_argument('--no_prefilter', dest='prefilter', action='store_false')
parser.add_argument('--no_postfilter', dest='postfilter', action='store_false')
parser.set_defaults(prefilter=True, postfilter=True)
args = parser.parse_args()
data_path = args.data_path
if not data_path.endswith('/'):
data_path += '/'
population = args.population
T = int(args.T)
dt = args.dt
theta_1 = args.theta_1
theta_2 = args.theta_2
theta_freq = args.theta_freq
stats_functions = args.stats_functions
binarize = args.binarize
df_population = pd.read_csv(population).set_index('ID')
N = args.N or len(df_population)
df_population = df_population.iloc[:args.N]
L = int(np.floor(T/dt))
args.df_population = df_population
args.N = N
args.L = L
args.parallel = parallel
if args.input_fname and os.path.isfile(args.input_fname):
input_fname = args.input_fname
if input_fname.endswith(('.p', '.pickle')):
df_data = pd.read_pickle(input_fname)
elif input_fname.endswith('.csv'):
df_data = pd.read_csv(input_fname)
import FIDDLE.config as FIDDLE_config
import FIDDLE.steps as FIDDLE_steps
def main():
######
# User arguments
######
parser = argparse.ArgumentParser(description='')
# Files
parser.add_argument('--data_fname', type=str, required=True)
parser.add_argument('--population_fname',type=str, required=True)
parser.add_argument('--output_dir', type=str, required=True)
parser.add_argument('--config_fname', type=str, required=False)
# Settings
parser.add_argument('--T', type=float, required=True)
parser.add_argument('--dt', type=float, required=True)
parser.add_argument('--theta_1', type=float, default=0.001)
parser.add_argument('--theta_2', type=float, default=0.001)
parser.add_argument('--theta_freq', type=float, default=1.0)
parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean'])
# Debug
parser.add_argument('--N', type=int, required=False)
parser.add_argument('--Ds', nargs='+', type=int)
parser.add_argument('--no_prefilter', dest='prefilter', action='store_false')
parser.add_argument('--no_postfilter', dest='postfilter', action='store_false')
parser.set_defaults(prefilter=True, postfilter=True)
args = parser.parse_args()
######
# Load files
######
data_fname = args.data_fname
if data_fname.endswith(('.p', '.pickle')):
df_data = pd.read_pickle(data_fname)
elif data_fname.endswith('.csv'):
df_data = pd.read_csv(data_fname)
else:
assert False
elif os.path.isfile(data_path + 'input_data.p'):
input_fname = data_path + 'input_data.p'
df_data = pd.read_pickle(input_fname)
elif os.path.isfile(data_path + 'input_data.pickle'):
input_fname = data_path + 'input_data.pickle'
df_data = pd.read_pickle(input_fname)
elif os.path.isfile(data_path + 'input_data.csv'):
input_fname = data_path + 'input_data.csv'
df_data = pd.read_csv(input_fname)
else:
raise NotImplementedError
from .steps import *
print('Input data file:', input_fname)
print()
print('Input arguments:')
print(' {:<6} = {}'.format('T', T))
print(' {:<6} = {}'.format('dt', dt))
print(' {:<6} = {}'.format('\u03B8\u2081', theta_1))
print(' {:<6} = {}'.format('\u03B8\u2082', theta_2))
print(' {:<6} = {}'.format('\u03B8_freq', theta_freq))
print(' {:<6} = {} {}'.format('k', len(stats_functions), stats_functions))
print('{} = {}'.format('binarize', {False: 'no', True: 'yes'}[binarize]))
print()
print('N = {}'.format(N))
print('L = {}'.format(L))
print('', flush=True)
######
# Main
######
if args.prefilter:
print_header('1) Pre-filter')
df_data = pre_filter(df_data, theta_1, df_population, args)
df_data.to_csv(data_path + 'pre-filtered.csv', index=False)
print_header('2) Transform; 3) Post-filter')
df_data, df_types = parse_variable_data_type(df_data, value_type_override, args)
df_time_invariant, df_time_series = split_by_timestamp_type(df_data)
# Process time-invariant data
if len(df_time_invariant) > 0:
s, s_feature_names, s_feature_aliases = process_time_invariant(df_time_invariant, args)
# Process time-dependent data
if len(df_time_series) > 0:
X, X_feature_names, X_feature_aliases = process_time_dependent(df_time_series, args)
raise NotImplementedError
df_population = args.df_population = pd.read_csv(args.population_fname).set_index('ID').sort_index()
config = FIDDLE_config.load_config(args.config_fname)
## Arguments settings
output_dir = args.output_dir
if not output_dir.endswith('/'):
output_dir += '/'
T = args.T
dt = args.dt
theta_1 = args.theta_1
theta_2 = args.theta_2
theta_freq = args.theta_freq
stats_functions = args.stats_functions
args.hierarchical_sep = config.get('hierarchical_sep', ':')
args.hierarchical_levels = config.get('hierarchical_levels', [])
args.value_type_override = config.get('value_types', {})
args.discretize = config.get('discretize', True)
args.use_ordinal_encoding = config.get('use_ordinal_encoding', False)
args.S_discretization_bins = None
args.X_discretization_bins = None
S_discretization_bins = config.get('S_discretization_bins')
X_discretization_bins = config.get('X_discretization_bins')
if S_discretization_bins:
args.S_discretization_bins = json.load(open(S_discretization_bins, 'r'))
if X_discretization_bins:
args.X_discretization_bins = json.load(open(X_discretization_bins, 'r'))
args.parallel = config.get('parallel', False)
args.n_jobs = config.get('n_jobs', 1)
args.batch_size = config.get('batch_size', 100)
N = args.N = args.N or len(df_population)
df_population = df_population.iloc[:args.N]
L = args.L = int(np.floor(T/dt))
print('Input:')
print(' Data :', args.data_fname)
print(' Population:', args.population_fname)
print(' Config :', args.config_fname)
print()
print('Output directory:', args.output_dir)
print()
print('Input arguments:')
print(' {:<6} = {}'.format('T', T))
print(' {:<6} = {}'.format('dt', dt))
print(' {:<6} = {}'.format('\u03B8\u2081', theta_1))
print(' {:<6} = {}'.format('\u03B8\u2082', theta_2))
print(' {:<6} = {}'.format('\u03B8_freq', theta_freq))
print(' {:<6} = {} {}'.format('k', len(stats_functions), stats_functions))
print()
print('{} = {}'.format('discretize', {False: 'no', True: 'yes'}[args.discretize]))
if args.discretize:
print(' S discretization bins:', S_discretization_bins or 'to be computed from data')
print(' X discretization bins:', X_discretization_bins or 'to be computed from data')
print()
print('N = {}'.format(N))
print('L = {}'.format(L))
print('', flush=True)
######
# Main
######
df_population[[]].to_csv(output_dir + 'IDs.csv')
if args.prefilter:
FIDDLE_steps.print_header('1) Pre-filter')
df_data = FIDDLE_steps.pre_filter(df_data, theta_1, df_population, args)
df_data.to_csv(output_dir + 'pre-filtered.csv', index=False)
FIDDLE_steps.print_header('2) Transform; 3) Post-filter')
df_data, df_types = FIDDLE_steps.parse_variable_data_type(df_data, args)
df_time_invariant, df_time_series = FIDDLE_steps.split_by_timestamp_type(df_data)
# Process time-invariant data
S, S_feature_names, S_feature_aliases = FIDDLE_steps.process_time_invariant(df_time_invariant, args)
# Process time-dependent data
X, X_feature_names, X_feature_aliases = FIDDLE_steps.process_time_dependent(df_time_series, args)
if __name__ == '__main__':
main()
# FIDDLE
FIDDLE – <b>F</b>lex<b>I</b>ble <b>D</b>ata-<b>D</b>riven pipe<b>L</b>in<b>E</b> – is a preprocessing pipeline that transforms structured EHR data into feature vectors that can be used with ML algorithms, relying on only a small number of user-defined arguments.
Requires python 3.6 or above. Required packages and versions are listed in `requirements.txt`. Older versions may still work but have not been tested.
Try a quick demo here: [tiny.cc/FIDDLE-demo](https://tiny.cc/FIDDLE-demo)
Note: This README contains latex equations and is best viewed on the [GitLab site](https://gitlab.eecs.umich.edu/mld3/FIDDLE).
## Publications & Resources
- Title: <b>Democratizing EHR analyses with FIDDLE: a flexible data-driven preprocessing pipeline for structured clinical data.</b>
- Authors: Shengpu Tang, Parmida Davarmanesh, Yanmeng Song, Danai Koutra, Michael W. Sjoding, and Jenna Wiens.
- Published in JAMIA (Journal of the American Medical Informatics Association), October 2020: [article link](https://doi.org/10.1093/jamia/ocaa139)
- Previously presented at MLHC 2019 (<i>[Machine Learning for Healthcare](https://www.mlforhc.org/)</i>) as a [clinical abstract](https://www.mlforhc.org/s/Sjoding-jete.pdf)
- News coverage on HealthcareITNews: [link](https://www.healthcareitnews.com/news/new-framework-helps-streamline-ehr-data-extraction)
......@@ -23,20 +23,39 @@ If you use FIDDLE in your research, please cite the following publication:
journal = {Journal of the American Medical Informatics Association},
year = {2020},
month = {10},
issn = {1527-974X},
doi = {10.1093/jamia/ocaa139},
}
```
## System Requirements
### Pip
Requires python 3.7 or above (older versions may still work but have not been tested). Required packages and versions are listed in `requirements.txt`. Run the following command to install the required packages.
```bash
pip install -r requirements.txt
```
### Docker
To build the docker image, run the following command:
```bash
docker build -t fiddle-v020 .
```
Refer to the notebook `tests/small_test/Run-docker.ipynb` for an example of running FIDDLE in Docker.
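For instance, a containerized run might look like the following (a hypothetical invocation; the mount point, file names, and argument values are illustrative and should be adapted to your data):

```bash
docker run --rm -v /path/to/my_data:/workdir/data fiddle-v020 \
    python -m FIDDLE.run \
        --data_fname='./data/input_data.csv' \
        --population_fname='./data/pop.csv' \
        --output_dir='./data/features/' \
        --config_fname='./data/my_config.yaml' \
        --T=48 --dt=1.0
```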
## Usage Notes
FIDDLE generates feature vectors based on data within the observation period $`t\in[0,T]`$. This feature representation can be used to make predictions of adverse outcomes at t=T. More specifically, FIDDLE outputs a set of binary feature vectors for each example $`i`$, $`\{(s_i,x_i)\ \text{for}\ i=1 \dots N\}`$ where $`s_i \in R^d`$ contains time-invariant features and $`x_i \in R^{L \times D}`$ contains time-dependent features.
Input:
- formatted EHR data: `.csv` or `.p`/`.pickle` file, a table with 4 columns \[`ID`, `t`, `variable_name`, `variable_value`\] (an example table is shown after this list)
- population file: a list of unique `ID`s you want processed
- the output feature matrix will correspond to IDs in lexicographically sorted order
- config file:
- specifies additional settings by providing a custom `config.yaml` file
- a default config file is located at `FIDDLE/config-default.yaml`
- arguments:
- T: The time of prediction; time-dependent features will be generated using data in $`t\in[0,T]`$.
- dt: the temporal granularity at which to "window" time-dependent data.
- theta_1: The threshold for Pre-filter.
- theta_2: The threshold for Post-filter.
- theta_freq: The threshold at which we deem a variable “frequent” (for which summary statistics will be calculated).
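As referenced above, a hypothetical input table in the 4-column format (values are made up for illustration; here the `AGE` rows leave `t` blank, on the assumption that rows without a timestamp are treated as time-invariant):

```
ID,t,variable_name,variable_value
1001,,AGE,65
1001,0.0,HR,72
1001,0.5,HR,75
1001,1.0,LACTATE,1.6
1002,,AGE,53
1002,0.2,HR,88
```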
......@@ -46,8 +65,10 @@ Output: The generated features and associated metadata are located in `{data_pat
- `s.npz`: a sparse array of shape (N, d)
- `X.npz`: a sparse tensor of shape (N, L, D)
- `s.feature_names.txt`: names of _d_ time-invariant features
- `X.feature_names.txt`: names of _D_ time-series features
- `s.feature_names.json`: names of _d_ time-invariant features
- `X.feature_names.json`: names of _D_ time-series features
- `s.feature_aliases.json`: aliases of duplicated time-invariant features
- `X.feature_aliases.json`: aliases of duplicated time-series features
To load the generated features:
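A minimal loading sketch (assumptions: the output files listed above are under an `output/` directory, and the `sparse` package from `requirements.txt` is installed):

```python
import json
import sparse

S = sparse.load_npz('output/s.npz')            # shape (N, d), time-invariant features
X = sparse.load_npz('output/X.npz')            # shape (N, L, D), time-dependent features
S_names = json.load(open('output/s.feature_names.json'))
X_names = json.load(open('output/X.feature_names.json'))
print(S.shape, X.shape, len(S_names), len(X_names))
```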
......@@ -70,11 +91,11 @@ python -m FIDDLE.run \
## Guidelines on argument settings
The user-defined arguments of FIDDLE include: T, dt, theta_1, theta_2, theta_freq, and the K summary statistics functions. The settings of these arguments affect the resulting features and how they can be used. We provide reasonable default values in the implementation, and here list some practical considerations: (i) prediction time and frequency, (ii) temporal density of data, and (iii) class balance.
(i) The prediction time and frequency determine the appropriate settings for T and dt. The risk stratification tasks we considered all involve a single prediction at the end of a fixed prediction window. It is thus most reasonable to set T to the length of the prediction window. Another possible formulation is to make multiple predictions where each prediction depends only on data from the past (not the future), using models like LSTMs or fully convolutional networks. In that case, for example, if a prediction needs to be made every 4 hours over a 48-hour period, then T should be 48 hours, whereas dt should be at most 4 hours.
(ii) The temporal density of the data, that is, how often the variables are usually measured, also affects the setting of dt. This can be assessed by plotting a histogram of recording frequency. In our case, we observed that the maximum hourly frequency is ~1.2 times, which suggests dt should not be smaller than 1 hour. While most variables are recorded on average <0.1 times per hour (most of the time not recorded), the 6 vital signs are recorded slightly >1 time per hour. Thus, given that in the ICU vital signs are usually collected once per hour, we set dt=1. This also implies a setting of θ_freq=1. Besides determining the value of dt from context (how granularly we want to encode the data), we can also sweep a range of values (given sufficient computational resources and time) informed by the prediction frequency and the temporal density of the data.
(iii) We recommend setting θ_1=θ_2=θ and being conservative, to avoid removing information that could be potentially useful. For binary classification, the rule of thumb we suggest is to set θ to about 1/100 of the minority class rate. For example, our cohorts consist of ~10% positive cases, so setting θ=0.001 is appropriate, whereas for a cohort with only 1% positive cases, θ=0.0001 is more appropriate. Given sufficient computational resources and time, the value of θ can also be swept and optimized.
Finally, for the summary statistics functions, we include by default the most basic: minimum, maximum, and mean. If, on average, we expect more than one value per time bin, then we can also include higher-order statistics such as the standard deviation and linear slope.
......@@ -82,4 +103,4 @@ Finally, for the summary statistics functions, we included by default the most b
## Experiments
In order to show the flexibility and utility of FIDDLE, we conducted several experiments using data from MIMIC-III and eICU. The code to reproduce the results is located at https://gitlab.eecs.umich.edu/MLD3/FIDDLE_experiments. The experiments were performed using FIDDLE v0.1.0 and reported in the JAMIA paper; bug fixes and new functionalities have since been implemented and may affect the numerical results.
numpy>=1.16
pandas>=1.0.1
sparse>=0.9.1
scikit-learn>=0.22.1
tqdm>=4.43.0
joblib>=0.13.2
icd9cms>=0.2.1
icd10-cm>=0.0.4
pyyaml>=5.3
numpy>=1.19
pandas>=1.1
sparse>=0.11
scikit-learn>=0.23
tqdm>=4.50
joblib>=0.16
icd9cms==0.2.1
icd10-cm==0.0.4
{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"df = pd.read_csv('./icd_data/input_data.csv')\n",
"df.loc[df['variable_value'] == '71970', 'variable_value'] = '7197'\n",
"df.to_csv('./icd_data/input_data.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Input data file: ./icd_data/input_data.csv\n",
"\n",
"Input arguments:\n",
" T = 4\n",
" dt = 1.0\n",
" θ₁ = 0.001\n",
" θ₂ = 0.001\n",
" θ_freq = 1.0\n",
" k = 3 ['min', 'max', 'mean']\n",
"binarize = yes\n",
"\n",
"N = 53122\n",
"L = 4\n",
"\n",
"\n",
"================================================================================\n",
"1) Pre-filter\n",
"================================================================================\n",
"Remove rows not in population\n",
"Remove rows with t outside of [0, 4]\n",
"Remove rare variables (<= 0.001)\n",
"Total variables : 1\n",
"Rare variables : 0\n",