diff --git a/.gitignore b/.gitignore
index e74dffadfd3127242f2f41b25cf53ccc309c3bc9..95c242e15f8be52ead23d3efcf8fdbb1726bf68e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
data/*
+**output**/
.ipynb_checkpoints
*.png
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..b8b7570639559a207e2b482e0184ffa960abc1da
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,7 @@
+FROM python:3.8
+WORKDIR /workdir
+
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+
+COPY FIDDLE/ ./FIDDLE/
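+
+# Illustrative usage (the image tag is an example):
+#   docker build -t fiddle-v020 .
+# The FIDDLE package is then importable from /workdir inside the container.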
diff --git a/FIDDLE/config.yaml b/FIDDLE/config-default.yaml
similarity index 84%
rename from FIDDLE/config.yaml
rename to FIDDLE/config-default.yaml
index d30932a3992530e5f050c8bfcf33579b8b28e83f..b21c5fc5dca809dedee28c4f8620bfa446b87e78 100644
--- a/FIDDLE/config.yaml
+++ b/FIDDLE/config-default.yaml
@@ -5,10 +5,17 @@ column_names:
var_name: variable_name
var_value: variable_value
-use_ordinal_encoding: no
+parallel: yes
+n_jobs: 72
+batch_size: 100
+
hierarchical_sep: ":"
hierarchical_levels: [0, 1, 2]
+discretize: yes
+use_ordinal_encoding: no
+discretization: ~
+
value_types:
# enter the feature type that you would like to override in the following format:
FIRST_WARDID: Categorical
diff --git a/FIDDLE/config.py b/FIDDLE/config.py
index 25fb05c2fbbe9e06ac540d3ef22b321f28bb421d..4b5f74e7d77ae559a1a797a61e297bfbb8ba7f2a 100644
--- a/FIDDLE/config.py
+++ b/FIDDLE/config.py
@@ -1,16 +1,27 @@
import os, yaml
-with open(os.path.join(os.path.dirname(__file__), 'config.yaml')) as f:
- config = yaml.full_load(f)
+import copy
-ID_col = config['column_names']['ID']
-var_col = config['column_names']['var_name']
-val_col = config['column_names']['var_value']
-t_col = config['column_names']['t']
-hierarchical_sep = config['hierarchical_sep']
-hierarchical_levels = config['hierarchical_levels']
+with open(os.path.join(os.path.dirname(__file__), 'config-default.yaml')) as f:
+ config_default = yaml.safe_load(f)
-use_ordinal_encoding = config['use_ordinal_encoding']
-value_type_override = config['value_types']
+def load_config(fname):
+ config = copy.deepcopy(config_default)
+ if fname:
+ config_custom = yaml.safe_load(open(fname, 'r'))
+ for k, v in config_custom.items():
+ config[k] = v
+ return config
-parallel = True
-n_jobs = 72
+
+ID_col = 'ID'
+t_col = 't'
+var_col = 'variable_name'
+val_col = 'variable_value'
+
+if 'column_names' in config_default:
+ ID_col = config_default['column_names'].get('ID', 'ID')
+ t_col = config_default['column_names'].get('t', 't')
+ var_col = config_default['column_names'].get('var_name', 'variable_name')
+ val_col = config_default['column_names'].get('var_value', 'variable_value')
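+
+# Illustrative usage ('my_config.yaml' is a placeholder file name):
+#   config = load_config('my_config.yaml')  # defaults, with top-level keys overridden
+#   config = load_config(None)              # defaults only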
diff --git a/FIDDLE/helpers.py b/FIDDLE/helpers.py
index 93b39916ddc100471b5fdd29f91fa4fe6afc04a2..18142e812c6eff70978e99c737f98de187bd0147 100644
--- a/FIDDLE/helpers.py
+++ b/FIDDLE/helpers.py
@@ -1,29 +1,19 @@
-import argparse
-def str2bool(v):
- if isinstance(v, bool):
- return v
- if v.lower() in ('yes', 'true', 't', 'y', '1'):
- return True
- elif v.lower() in ('no', 'false', 'f', 'n', '0'):
- return False
- else:
- raise argparse.ArgumentTypeError('Boolean value expected.')
-
-
-from .config import *
import pandas as pd
import numpy as np
import scipy
import sparse
from collections import defaultdict
-
-from joblib import Parallel, delayed, parallel_backend
from tqdm import tqdm
from sklearn.feature_selection import VarianceThreshold
import sklearn
from collections import defaultdict
+try:
+ from .config import *
+except ImportError:
+ from config import *
+
def print_header(*content, char='='):
print()
print(char * 80)
@@ -95,11 +85,11 @@ def get_unique_variables(df):
return sorted(df[var_col].unique())
def get_frequent_numeric_variables(df_time_series, variables, threshold, args):
- data_path = args.data_path
+ output_dir = args.output_dir
df_population = args.df_population
T, dt = args.T, args.dt
- df_types = pd.read_csv(data_path + 'value_types.csv').set_index(var_col)['value_type']
+ df_types = pd.read_csv(output_dir + 'value_types.csv').set_index(var_col)['value_type']
numeric_vars = [col for col in variables if df_types[col] == 'Numeric']
df_num_counts = calculate_variable_counts(df_time_series, df_population)[numeric_vars] #gets the count of each variable for each patient.
variables_num_freq = df_num_counts.columns[df_num_counts.mean() >= threshold * np.floor(T/dt)]
@@ -136,23 +126,41 @@ def select_dtype(df, dtype, dtypes=None):
assert False
return
-def smart_qcut_dummify(x, q, use_ordinal_encoding=False):
+
+def compute_bin_edges(x, q):
# ignore strings when performing qcut
z = x.copy()
z = z.apply(make_float)
m = z.apply(np.isreal)
+ bin_edges = None
if z.loc[m].dropna().nunique() > 1: # when more than one numeric values
- if use_ordinal_encoding:
- bin_edges = np.nanpercentile(z.loc[m].astype(float).to_numpy(), [0, 20, 40, 60, 80, 100])
- bin_edges = np.unique(bin_edges)
- col_names = ['{}>={}'.format(z.name, bin_edge) for bin_edge in bin_edges[:-1]]
- out = pd.DataFrame(0, z.index, col_names)
- for i, bin_edge in enumerate(bin_edges[:-1]):
- out.loc[m, col_names[i]] = (z.loc[m] > bin_edge).astype(int)
- out = pd.concat([out, pd.get_dummies(z.where(~m, np.nan), prefix=z.name)], axis=1)
+        if z.loc[m].dropna().nunique() > 2:  # exactly two unique numeric values are dummified directly; no edges needed
+            bin_edges = list(np.unique(np.nanpercentile(z.loc[m].astype(float).values, np.linspace(0, 100, q+1))))
+ return (x.name, bin_edges)
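+
+# e.g. (illustrative): compute_bin_edges(pd.Series([1, 2, 3, 4, 5], name='HR'), q=5)
+# returns ('HR', [1.0, 1.8, 2.6, 3.4, 4.2, 5.0]), i.e., quintile edges from min to max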
+
+def smart_qcut_dummify_parallel(first_arg):
+ return smart_qcut_dummify(*first_arg)
+
+def smart_qcut_dummify(x, bin_edges, use_ordinal_encoding=False):
+ # ignore strings when performing qcut
+ z = x.copy()
+ z = z.apply(make_float)
+ m = z.apply(np.isreal)
+    if z.loc[m].dropna().nunique() > 1: # when more than one unique numeric value
+ if z.loc[m].dropna().nunique() == 2: # when only two unique numeric values
+ out = pd.get_dummies(x, prefix=x.name)
else:
- z.loc[m] = pd.qcut(z.loc[m].to_numpy(), q=q, duplicates='drop')
- out = pd.get_dummies(z, prefix=z.name)
+ if use_ordinal_encoding:
+ col_names = ['{}>={}'.format(z.name, bin_edge) for bin_edge in bin_edges[:-1]]
+ out = pd.DataFrame(0, z.index, col_names)
+ for i, bin_edge in enumerate(bin_edges[:-1]):
+ out.loc[m, col_names[i]] = (z.loc[m] >= bin_edge).astype(int)
+ out = pd.concat([out, pd.get_dummies(z.where(~m, np.nan), prefix=z.name)], axis=1)
+ else:
+ z.loc[m] = pd.cut(z.loc[m].to_numpy(), bin_edges, duplicates='drop', include_lowest=True)
+ out = pd.get_dummies(z, prefix=z.name)
else:
out = pd.get_dummies(x, prefix=x.name)
return out
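+
+# e.g. (illustrative): with bin_edges [1.0, 1.8, 2.6, 3.4, 4.2, 5.0], a numeric series
+# named 'HR' is one-hot encoded into interval columns like 'HR_(1.8, 2.6]'; non-numeric
+# entries are dummified by value, and a series with exactly two unique numeric values
+# is dummified directly without binning.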
@@ -202,13 +210,13 @@ def pivot_event_table(df):
# Handle cases where the same variable is recorded multiple times with the same timestamp
# Adjust the timestamps by epsilon so that all timestamps are unique
eps = 1e-6
- m_dups = df.duplicated([ID_col, t_col, var_col], keep=False)
+ m_dups = df.duplicated([t_col, var_col], keep=False)
df_dups = df[m_dups].copy()
for v, df_v in df_dups.groupby(var_col):
df_dups.loc[df_v.index, t_col] += eps * np.arange(len(df_v))
df = pd.concat([df[~m_dups], df_dups])
- assert not df.duplicated([ID_col, t_col, var_col], keep=False).any()
+ assert not df.duplicated([t_col, var_col], keep=False).any()
return pd.pivot_table(df, val_col, t_col, var_col, 'first')
diff --git a/FIDDLE/run.py b/FIDDLE/run.py
old mode 100755
new mode 100644
index 7caf2f72f4d51a3495ddcd8238573d4749939a3c..c39802ac518aa432c29573d03fdb96f8176f100a
--- a/FIDDLE/run.py
+++ b/FIDDLE/run.py
@@ -1,112 +1,141 @@
-from .config import *
-import pickle
import pandas as pd
import numpy as np
+import pickle
import time
import os
-
+import yaml
+import json
import argparse
-from .helpers import str2bool
-
-parser = argparse.ArgumentParser(description='')
-parser.add_argument('--T', type=float, required=True)
-parser.add_argument('--dt', type=float, required=True)
-parser.add_argument('--theta_1', type=float, default=0.001)
-parser.add_argument('--theta_2', type=float, default=0.001)
-parser.add_argument('--theta_freq', type=float, default=1.0)
-parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean'])
-parser.add_argument('--binarize', type=str2bool, default=True, nargs='?', const=True)
-
-parser.add_argument('--data_path', type=str, required=True)
-parser.add_argument('--input_fname', type=str, required=False)
-parser.add_argument('--population', type=str, required=True)
-parser.add_argument('--N', type=int, required=False)
-parser.add_argument('--Ds', nargs='+', type=int)
-
-parser.add_argument('--no_prefilter', dest='prefilter', action='store_false')
-parser.add_argument('--no_postfilter', dest='postfilter', action='store_false')
-parser.set_defaults(prefilter=True, postfilter=True)
-
-args = parser.parse_args()
-
-data_path = args.data_path
-if not data_path.endswith('/'):
- data_path += '/'
-
-population = args.population
-T = int(args.T)
-dt = args.dt
-theta_1 = args.theta_1
-theta_2 = args.theta_2
-theta_freq = args.theta_freq
-stats_functions = args.stats_functions
-binarize = args.binarize
-
-df_population = pd.read_csv(population).set_index('ID')
-N = args.N or len(df_population)
-df_population = df_population.iloc[:args.N]
-L = int(np.floor(T/dt))
-
-args.df_population = df_population
-args.N = N
-args.L = L
-args.parallel = parallel
-
-if args.input_fname and os.path.isfile(args.input_fname):
- input_fname = args.input_fname
- if input_fname.endswith('.p' or '.pickle'):
- df_data = pd.read_pickle(input_fname)
- elif input_fname.endswith('.csv'):
- df_data = pd.read_csv(input_fname)
+
+import FIDDLE.config as FIDDLE_config
+import FIDDLE.steps as FIDDLE_steps
+
+def main():
+ ######
+ # User arguments
+ ######
+ parser = argparse.ArgumentParser(description='')
+
+ # Files
+ parser.add_argument('--data_fname', type=str, required=True)
+ parser.add_argument('--population_fname',type=str, required=True)
+ parser.add_argument('--output_dir', type=str, required=True)
+ parser.add_argument('--config_fname', type=str, required=False)
+
+ # Settings
+ parser.add_argument('--T', type=float, required=True)
+ parser.add_argument('--dt', type=float, required=True)
+ parser.add_argument('--theta_1', type=float, default=0.001)
+ parser.add_argument('--theta_2', type=float, default=0.001)
+ parser.add_argument('--theta_freq', type=float, default=1.0)
+ parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean'])
+
+ # Debug
+ parser.add_argument('--N', type=int, required=False)
+ parser.add_argument('--Ds', nargs='+', type=int)
+ parser.add_argument('--no_prefilter', dest='prefilter', action='store_false')
+ parser.add_argument('--no_postfilter', dest='postfilter', action='store_false')
+ parser.set_defaults(prefilter=True, postfilter=True)
+
+ args = parser.parse_args()
+
+
+ ######
+ # Load files
+ ######
+
+ data_fname = args.data_fname
+    if data_fname.endswith(('.p', '.pickle')):
+ df_data = pd.read_pickle(data_fname)
+ elif data_fname.endswith('.csv'):
+ df_data = pd.read_csv(data_fname)
else:
- assert False
-elif os.path.isfile(data_path + 'input_data.p'):
- input_fname = data_path + 'input_data.p'
- df_data = pd.read_pickle(input_fname)
-elif os.path.isfile(data_path + 'input_data.pickle'):
- input_fname = data_path + 'input_data.pickle'
- df_data = pd.read_pickle(input_fname)
-elif os.path.isfile(data_path + 'input_data.csv'):
- input_fname = data_path + 'input_data.csv'
- df_data = pd.read_csv(input_fname)
-else:
- raise NotImplementedError
-
-
-from .steps import *
-
-print('Input data file:', input_fname)
-print()
-print('Input arguments:')
-print(' {:<6} = {}'.format('T', T))
-print(' {:<6} = {}'.format('dt', dt))
-print(' {:<6} = {}'.format('\u03B8\u2081', theta_1))
-print(' {:<6} = {}'.format('\u03B8\u2082', theta_2))
-print(' {:<6} = {}'.format('\u03B8_freq', theta_freq))
-print(' {:<6} = {} {}'.format('k', len(stats_functions), stats_functions))
-print('{} = {}'.format('binarize', {False: 'no', True: 'yes'}[binarize]))
-print()
-print('N = {}'.format(N))
-print('L = {}'.format(L))
-print('', flush=True)
-
-
-######
-# Main
-######
-if args.prefilter:
- print_header('1) Pre-filter')
- df_data = pre_filter(df_data, theta_1, df_population, args)
- df_data.to_csv(data_path + 'pre-filtered.csv', index=False)
-
-print_header('2) Transform; 3) Post-filter')
-df_data, df_types = parse_variable_data_type(df_data, value_type_override, args)
-df_time_invariant, df_time_series = split_by_timestamp_type(df_data)
-
-# Process time-invariant data
-if len(df_time_invariant) > 0:
- s, s_feature_names, s_feature_aliases = process_time_invariant(df_time_invariant, args)
-
-# Process time-dependent data
-if len(df_time_series) > 0:
- X, X_feature_names, X_feature_aliases = process_time_dependent(df_time_series, args)
+ raise NotImplementedError
+
+ df_population = args.df_population = pd.read_csv(args.population_fname).set_index('ID').sort_index()
+ config = FIDDLE_config.load_config(args.config_fname)
+
+
+ ## Arguments settings
+ output_dir = args.output_dir
+ if not output_dir.endswith('/'):
+ output_dir += '/'
+
+ T = args.T
+ dt = args.dt
+ theta_1 = args.theta_1
+ theta_2 = args.theta_2
+ theta_freq = args.theta_freq
+ stats_functions = args.stats_functions
+
+ args.hierarchical_sep = config.get('hierarchical_sep', ':')
+ args.hierarchical_levels = config.get('hierarchical_levels', [])
+ args.value_type_override = config.get('value_types', {})
+
+ args.discretize = config.get('discretize', True)
+ args.use_ordinal_encoding = config.get('use_ordinal_encoding', False)
+
+ args.S_discretization_bins = None
+ args.X_discretization_bins = None
+ S_discretization_bins = config.get('S_discretization_bins')
+ X_discretization_bins = config.get('X_discretization_bins')
+ if S_discretization_bins:
+        args.S_discretization_bins = json.load(open(S_discretization_bins, 'r'))
+ if X_discretization_bins:
+ args.X_discretization_bins = json.load(open(X_discretization_bins, 'r'))
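+    # (Illustrative note: each bins file is a JSON dict mapping variable name to a
+    #  list of bin edges, the same format as the S_all.discretization.json and
+    #  X_all.discretization.json files that FIDDLE writes alongside its outputs.)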
+
+ args.parallel = config.get('parallel', False)
+ args.n_jobs = config.get('n_jobs', 1)
+ args.batch_size = config.get('batch_size', 100)
+
+ N = args.N = args.N or len(df_population)
+    df_population = args.df_population = df_population.iloc[:args.N]  # keep args.df_population in sync when --N limits the cohort
+ L = args.L = int(np.floor(T/dt))
+
+ print('Input:')
+ print(' Data :', args.data_fname)
+ print(' Population:', args.population_fname)
+ print(' Config :', args.config_fname)
+ print()
+ print('Output directory:', args.output_dir)
+ print()
+ print('Input arguments:')
+ print(' {:<6} = {}'.format('T', T))
+ print(' {:<6} = {}'.format('dt', dt))
+ print(' {:<6} = {}'.format('\u03B8\u2081', theta_1))
+ print(' {:<6} = {}'.format('\u03B8\u2082', theta_2))
+ print(' {:<6} = {}'.format('\u03B8_freq', theta_freq))
+ print(' {:<6} = {} {}'.format('k', len(stats_functions), stats_functions))
+ print()
+ print('{} = {}'.format('discretize', {False: 'no', True: 'yes'}[args.discretize]))
+ if args.discretize:
+ print(' S discretization bins:', S_discretization_bins or 'to be computed from data')
+ print(' X discretization bins:', X_discretization_bins or 'to be computed from data')
+ print()
+ print('N = {}'.format(N))
+ print('L = {}'.format(L))
+ print('', flush=True)
+
+
+ ######
+ # Main
+ ######
+ df_population[[]].to_csv(output_dir + 'IDs.csv')
+
+ if args.prefilter:
+ FIDDLE_steps.print_header('1) Pre-filter')
+ df_data = FIDDLE_steps.pre_filter(df_data, theta_1, df_population, args)
+ df_data.to_csv(output_dir + 'pre-filtered.csv', index=False)
+
+ FIDDLE_steps.print_header('2) Transform; 3) Post-filter')
+ df_data, df_types = FIDDLE_steps.parse_variable_data_type(df_data, args)
+ df_time_invariant, df_time_series = FIDDLE_steps.split_by_timestamp_type(df_data)
+
+ # Process time-invariant data
+ S, S_feature_names, S_feature_aliases = FIDDLE_steps.process_time_invariant(df_time_invariant, args)
+
+ # Process time-dependent data
+ X, X_feature_names, X_feature_aliases = FIDDLE_steps.process_time_dependent(df_time_series, args)
+
+if __name__ == '__main__':
+ main()
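+
+# Example invocation (illustrative paths):
+#   python -m FIDDLE.run \
+#       --data_fname=./data/input_data.csv \
+#       --population_fname=./data/pop.csv \
+#       --output_dir=./output/ \
+#       --T=48 --dt=1.0 --theta_1=0.001 --theta_2=0.001 --theta_freq=1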
diff --git a/FIDDLE/steps.py b/FIDDLE/steps.py
index 509e2bca60bd90238ef3bf3c519b92dae1b06c57..ceb461e2c9ffa8095ea6961af997b624edaac1f3 100644
--- a/FIDDLE/steps.py
+++ b/FIDDLE/steps.py
@@ -4,45 +4,50 @@ FIDDLE Preprocessing steps
2. Transform
3. Post-filter
"""
-from .helpers import *
+try:
+ from .helpers import *
+except ImportError:
+ from helpers import *
import time
import json
+import joblib
+import multiprocessing
def pre_filter(df, threshold, df_population, args):
T = int(args.T)
theta_1 = args.theta_1
df_population = args.df_population
-
+
# Remove rows not in population
print('Remove rows not in population')
df = df[df['ID'].isin(df_population.index)]
-
+
# Remove rows with t outside of [0, T)
    print('Remove rows with t outside of [0, {})'.format(T))
df = df[pd.isnull(df[t_col]) | ((0 <= df[t_col]) & (df[t_col] < T))]
-
+
# Data table should not contain duplicate rows with any numerical values
# Check for inconsistencies
- var_names = [v for v, ty in value_type_override.items() if 'hierarchical' in ty.lower() or 'categorical' in ty.lower()]
+ var_names = [v for v, ty in args.value_type_override.items() if 'hierarchical' in ty.lower() or 'categorical' in ty.lower()]
df_tmp = df[~df[var_col].isin(var_names)]
dups = df_tmp.duplicated(subset=[ID_col, t_col, var_col], keep=False)
df_dups = df_tmp[dups]
if any(dups) and any(is_numeric(v) for v in df_dups[val_col] if not pd.isnull(v)):
print(df_dups.head())
raise Exception('Inconsistent numerical values recorded')
-
+
# Remove variables that occur too rarely as defined by the threshold
print('Remove rare variables (<= {})'.format(threshold))
-
+
## Calculate overall occurrence rate of each variable based on IDs
df_count = calculate_variable_counts(df, df_population) # (N x |var|) table of counts
df_bool = df_count.astype(bool) # convert counts to boolean
-
+
## Keep variables that are recorded for more than threshold fraction of IDs
variables_keep = df_bool.columns[df_bool.mean(axis=0) > threshold]
df_out = df[df[var_col].isin(variables_keep)]
assert set(variables_keep) == set(df_out[var_col].unique())
-
+
variables = sorted(df_bool.columns)
variables_remove = sorted(set(variables) - set(variables_keep))
print('Total variables :', len(variables))
@@ -53,22 +58,22 @@ def pre_filter(df, threshold, df_population, args):
return df_out
-def parse_variable_data_type(df_data, value_type_override, args):
+def parse_variable_data_type(df_data, args):
# 1. parse hierarchical values (e.g. ICD codes) into strings
# 2. automatically detect value types, respecting user override, and set dtypes in DataFrames
# 3. pre-map duplicated non-numerical values into multiple categorical variables
- data_path = args.data_path
+ output_dir = args.output_dir
df = df_data
assert val_col in df.columns
print_header('*) Detecting and parsing value types', char='-')
-
+
## 1. Hierarchical values
- var_names = [v for v, ty in value_type_override.items() if 'hierarchical' in ty.lower()]
+ var_names = [v for v, ty in args.value_type_override.items() if 'hierarchical' in ty.lower()]
if len(var_names) == 0: # No hierarchical values
pass
-
+
for var_name in var_names:
- var_type = value_type_override[var_name]
+ var_type = args.value_type_override[var_name]
df_var = df.loc[df[var_col] == var_name, val_col]
if var_type.lower() == 'hierarchical_icd':
# need to figure out ICD version
@@ -79,24 +84,24 @@ def parse_variable_data_type(df_data, value_type_override, args):
df_var = df_var.apply(lambda s: map_icd_hierarchy(s, version=10))
else:
            df_var = df_var.apply(lambda s: s.split(args.hierarchical_sep))
-
+
# Assign mapped values back to original df
df.loc[df[var_col] == var_name, val_col] = df_var
-
+
# Only encode selected levels
df_nonhier = df[~df[var_col].isin(var_names)]
df_hier = df[df[var_col].isin(var_names)]
df_hier_levels = []
- for hier_level in hierarchical_levels:
+ for hier_level in args.hierarchical_levels:
# encode level if available
df_hier_level = df_hier.copy()
df_hier_level[val_col] = df_hier_level[val_col].apply(lambda h: h[min(hier_level, len(h))])
df_hier_levels.append(df_hier_level)
df_hier_levels = pd.concat(df_hier_levels).drop_duplicates()
-
+
# Combine hierarchical and non-hierarchical data
df = pd.concat([df_nonhier, df_hier_levels])
-
+
## 2. Detect value types
data_types = []
@@ -110,10 +115,10 @@ def parse_variable_data_type(df_data, value_type_override, args):
# Determine type of each variable
for variable, values in sorted(values_by_variable.items()):
# Manual override type in config
- if variable in value_type_override:
- data_types.append((variable, value_type_override[variable]))
+ if variable in args.value_type_override:
+ data_types.append((variable, args.value_type_override[variable]))
# Force categorical values to be a string
- if value_type_override[variable] == 'Categorical' and \
+ if args.value_type_override[variable] == 'Categorical' and \
any(is_numeric(v) for v in values if not pd.isnull(v)):
m_var = df[var_col] == variable
df.loc[m_var, val_col] = df.loc[m_var, val_col].apply(lambda s: '_' + str(s))
@@ -126,14 +131,14 @@ def parse_variable_data_type(df_data, value_type_override, args):
data_types.append((variable, 'Numeric + Categorical'))
else:
data_types.append((variable, 'Categorical'))
-
+
df_types = pd.DataFrame(data_types, columns=['variable_name', 'value_type'])
df_types[var_col] = df_types[var_col].astype(str)
df_types = df_types.set_index(var_col)
- fpath = data_path + 'value_types.csv'
+ fpath = output_dir + 'value_types.csv'
df_types.to_csv(fpath, quoting=1)
print('Saved as:', fpath)
-
+
## 3. Pre-map duplicated non-numerical values to separate variables
var_names = [v for v, ty in data_types if 'numeric' not in ty.lower() and 'none' not in ty.lower()]
df_non_num = df[df[var_col].isin(var_names)].copy()
@@ -144,17 +149,17 @@ def parse_variable_data_type(df_data, value_type_override, args):
df_non_num_dup[val_col] = 1
df_non_num[dup_] = df_non_num_dup
df[df[var_col].isin(var_names)] = df_non_num
-
+
return df, df_types['value_type']
def split_by_timestamp_type(df):
print_header('*) Separate time-invariant and time-dependent', char='-')
-
+
variables_inv = df[pd.isnull(df[t_col])][var_col].unique() # Invariant variables have t = NULL
df_time_invariant = df[df[var_col].isin(variables_inv)]
df_time_series = df[~df[var_col].isin(variables_inv)]
-
+
print('Variables (time-invariant):', len(variables_inv))
print('Variables (time-dependent):', df[var_col].nunique() - len(variables_inv))
print('# rows (time-invariant):', len(df_time_invariant))
@@ -163,86 +168,111 @@ def split_by_timestamp_type(df):
def process_time_invariant(df_data_time_invariant, args):
- data_path = args.data_path
+ if len(df_data_time_invariant) == 0:
+ return None, None, None
+
+ output_dir = args.output_dir
df_population = args.df_population
theta_2 = args.theta_2
-
+
+ ##############
print_header('2-A) Transform time-invariant data', char='-')
- dir_path = data_path + '/'
+ dir_path = output_dir + '/'
start_time = time.time()
## Create Nxd^ table
df_time_invariant = transform_time_invariant_table(df_data_time_invariant, df_population)
+ df_time_invariant[[]].to_csv(dir_path + 'S.ID.csv')
print('Time elapsed: %f seconds' % (time.time() - start_time))
## Discretize
- s_all, s_all_feature_names = map_time_invariant_features(df_time_invariant, args)
- sparse.save_npz(dir_path + 's_all.npz', s_all)
- with open(dir_path + 's_all.feature_names.json', 'w') as f:
- json.dump(list(s_all_feature_names), f, sort_keys=True)
- print('Time elapsed: %f seconds' % (time.time() - start_time))
+ S_all, S_all_feature_names, S_discretization_bins = map_time_invariant_features(df_time_invariant, args)
+ sparse.save_npz(dir_path + 'S_all.npz', S_all)
+ json.dump(list(S_all_feature_names), open(dir_path + 'S_all.feature_names.json', 'w'), sort_keys=True)
+ json.dump(S_discretization_bins, open(dir_path + 'S_all.discretization.json', 'w'))
- print_header('3-A) Post-filter time-invariant data', char='-')
-
- ## Filter
- s, s_feature_names, s_feature_aliases = post_filter(s_all, s_all_feature_names, theta_2)
print('Time elapsed: %f seconds' % (time.time() - start_time))
-
- ## Save output
- print()
- print('Output')
- print('s: shape={}, density={:.3f}'.format(s.shape, s.density))
- sparse.save_npz(dir_path + 's.npz', s)
-
- with open(dir_path + 's.feature_names.json', 'w') as f:
- json.dump(list(s_feature_names), f, sort_keys=True)
- with open(dir_path + 's.feature_aliases.json', 'w') as f:
- json.dump(s_feature_aliases, f, sort_keys=True)
-
- print('Total time: %f seconds' % (time.time() - start_time))
- print('', flush=True)
- return s, s_feature_names, s_feature_aliases
+
+ if args.postfilter:
+ ##############
+ print_header('3-A) Post-filter time-invariant data', char='-')
+
+ ## Filter
+ S, S_feature_names, S_feature_aliases = post_filter_time_invariant(S_all, S_all_feature_names, theta_2)
+ print('Time elapsed: %f seconds' % (time.time() - start_time))
+
+ ## Save output
+ print()
+ print('Output')
+ print('S: shape={}, density={:.3f}'.format(S.shape, S.density))
+ sparse.save_npz(dir_path + 'S.npz', S)
+
+ with open(dir_path + 'S.feature_names.json', 'w') as f:
+ json.dump(list(S_feature_names), f, sort_keys=True)
+ with open(dir_path + 'S.feature_aliases.json', 'w') as f:
+ json.dump(S_feature_aliases, f, sort_keys=True)
+
+ print('Total time: %f seconds' % (time.time() - start_time))
+ print('', flush=True)
+ return S, S_feature_names, S_feature_aliases
+ else:
+ return S_all, S_all_feature_names, None
def process_time_dependent(df_data_time_series, args):
- data_path = args.data_path
- theta_2 = args.theta_2
+ if len(df_data_time_series) == 0:
+ return None, None, None
+ output_dir = args.output_dir
+ theta_2 = args.theta_2
+
+ ##############
print_header('2-B) Transform time-dependent data', char='-')
- dir_path = data_path + '/'
+ dir_path = output_dir + '/'
start_time = time.time()
## Create NxLxD^ table
df_time_series, dtypes_time_series = transform_time_series_table(df_data_time_series, args)
print('Time elapsed: %f seconds' % (time.time() - start_time))
-
+
+ ## Save intermediate files
+ joblib.dump(df_time_series, output_dir + 'df_time_series.joblib')
+ joblib.dump(dtypes_time_series, output_dir + 'dtypes_time_series.joblib')
+ df_time_series[[]].to_csv(dir_path + 'X.ID,t_range.csv')
+
## Map variables to features
- X_all, X_all_feature_names = map_time_series_features(df_time_series, dtypes_time_series, args)
+ X_all, X_all_feature_names, X_discretization_bins = map_time_series_features(df_time_series, dtypes_time_series, args)
sparse.save_npz(dir_path + 'X_all.npz', X_all)
- with open(dir_path + 'X_all.feature_names.json', 'w') as f:
- json.dump(list(X_all_feature_names), f, sort_keys=True)
- print('Time elapsed: %f seconds' % (time.time() - start_time))
-
- ## Filter features
- print_header('3-B) Post-filter time-dependent data', char='-')
- print(X_all.shape, X_all.density)
- X, X_feature_names, X_feature_aliases = post_filter_time_series(X_all, X_all_feature_names, theta_2, args)
- print(X.shape, X.density)
+ json.dump(list(X_all_feature_names), open(dir_path + 'X_all.feature_names.json', 'w'), sort_keys=True)
+ json.dump(X_discretization_bins, open(dir_path + 'X_all.discretization.json', 'w'))
+
print('Time elapsed: %f seconds' % (time.time() - start_time))
- ## Save output
- print()
- print('Output')
- print('X: shape={}, density={:.3f}'.format(X.shape, X.density))
- sparse.save_npz(dir_path + 'X.npz', X)
- with open(dir_path + 'X.feature_names.json', 'w') as f:
- json.dump(list(X_feature_names), f, sort_keys=True)
- with open(dir_path + 'X.feature_aliases.json', 'w') as f:
- json.dump(X_feature_aliases, f, sort_keys=True)
-
- print('Total time: %f seconds' % (time.time() - start_time))
- print('', flush=True)
- return X, X_feature_names, X_feature_aliases
+ if args.postfilter:
+ ##############
+ print_header('3-B) Post-filter time-dependent data', char='-')
+ print(X_all.shape, X_all.density)
+
+ ## Filter features
+ X, X_feature_names, X_feature_aliases = post_filter_time_series(X_all, X_all_feature_names, theta_2, args)
+ print(X.shape, X.density)
+ print('Time elapsed: %f seconds' % (time.time() - start_time))
+
+ ## Save output
+ print()
+ print('Output')
+ print('X: shape={}, density={:.3f}'.format(X.shape, X.density))
+ sparse.save_npz(dir_path + 'X.npz', X)
+ with open(dir_path + 'X.feature_names.json', 'w') as f:
+ json.dump(list(X_feature_names), f, sort_keys=True)
+ with open(dir_path + 'X.feature_aliases.json', 'w') as f:
+ json.dump(X_feature_aliases, f, sort_keys=True)
+
+ print('Total time: %f seconds' % (time.time() - start_time))
+ print('', flush=True)
+ return X, X_feature_names, X_feature_aliases
+ else:
+ return X_all, X_all_feature_names, None
######
@@ -250,12 +280,12 @@ def process_time_dependent(df_data_time_series, args):
######
def transform_time_invariant_table(df_in, df_population):
df_in = df_in.copy()
-
+
# Recorded Value (np.nan if not recorded)
df_value = pd.pivot_table(df_in, val_col, ID_col, var_col, 'last', np.nan)
df_value = df_value.reindex(index=df_population.index, fill_value=np.nan)
df_value.columns = [str(col) + '_value' for col in df_value.columns]
-
+
print('(N \u00D7 ^d) table :\t', df_value.shape)
print('number of missing entries :\t', '{} out of {} total'.format(df_value.isna().sum().sum(), df_value.size))
return df_value
@@ -263,12 +293,18 @@ def transform_time_invariant_table(df_in, df_population):
def map_time_invariant_features(df, args):
# Categorical -> binary features
# Numeric -> binary/float-valued features
- if args.binarize:
- out = [smart_qcut_dummify(df[col], q=5, use_ordinal_encoding=use_ordinal_encoding) for col in df.columns]
+ discretization_bins = None
+ if args.discretize:
+ discretization_bins = args.S_discretization_bins
+ if discretization_bins is None:
+ discretization_bins = [compute_bin_edges(df[col], q=5) for col in df.columns]
+ discretization_bins = dict(discretization_bins)
+
+ out = [smart_qcut_dummify(df[col], discretization_bins[col], use_ordinal_encoding=args.use_ordinal_encoding) for col in df.columns]
time_invariant_features = pd.concat(out, axis=1)
feature_names_all = time_invariant_features.columns.values
sdf = time_invariant_features.astype(pd.SparseDtype(int, fill_value=0))
- s_ = sparse.COO(sdf.sparse.to_coo())
+ S_ = sparse.COO(sdf.sparse.to_coo())
else:
# Split a mixed column into numeric and string columns
for col in df.columns:
@@ -279,45 +315,45 @@ def map_time_invariant_features(df, args):
df[col+'_str'] = df[col].copy()
df.loc[~numeric_mask, col] = np.nan
df.loc[numeric_mask, col+'_str'] = np.nan
-
+
out = [smart_dummify_impute(df[col]) for col in df.columns]
time_invariant_features = pd.concat(out, axis=1)
feature_names_all = time_invariant_features.columns.values
sdf = time_invariant_features.astype(pd.SparseDtype(float, fill_value=0))
- s_ = sparse.COO(sdf.sparse.to_coo())
-
+ S_ = sparse.COO(sdf.sparse.to_coo())
+
print()
print('Output')
- print('s_all, binary features :\t', s_.shape)
- return s_, feature_names_all
+ print('S_all, binary features :\t', S_.shape)
+ return S_, feature_names_all, discretization_bins
-def post_filter(s_, s_feature_names_all, threshold):
+def post_filter_time_invariant(S_, S_feature_names_all, threshold):
# Filter features (optional)
- assert s_.shape[1] == len(s_feature_names_all)
- feature_names_0 = s_feature_names_all
- s0 = s_.to_scipy_sparse()
+ assert S_.shape[1] == len(S_feature_names_all)
+ feature_names_0 = S_feature_names_all
+ S0 = S_.to_scipy_sparse()
print('Original :', len(feature_names_0))
-
+
## Remove nearly-constant features (with low variance)
## a binary feature is removed if =0 (or =1) for >th fraction of examples
## i.e., variance <= (th * (1 - th))
sel_rare = VarianceThreshold(threshold=(threshold * (1 - threshold)))
- s1 = sel_rare.fit_transform(s0)
+ S1 = sel_rare.fit_transform(S0)
feature_names_1 = feature_names_0[sel_rare.get_support()]
print('Nearly-constant:', len(feature_names_0) - len(feature_names_1))
-
+
## Keep only first of pairwise perfectly correlated features
sel_corr = CorrelationSelector()
- s2 = sel_corr.fit_transform(s1)
+ S2 = sel_corr.fit_transform(S1)
feature_names_2 = feature_names_1[sel_corr.get_support()]
feature_aliases = sel_corr.get_feature_aliases(feature_names_1)
print('Correlated :', len(feature_names_1) - len(feature_names_2))
-
- s = sparse.COO(s2)
+
+ S = sparse.COO(S2)
feature_names = feature_names_2
- assert s.shape[1] == len(feature_names)
-
- return s, feature_names, feature_aliases
+ assert S.shape[1] == len(feature_names)
+
+ return S, feature_names, feature_aliases
######
@@ -325,14 +361,16 @@ def post_filter(s_, s_feature_names_all, threshold):
######
def func_encode_single_time_series(i, g, variables, variables_num_freq, T, dt, stats_functions, impute=True):
try:
- assert g['ID'].nunique() == 1
- assert g['ID'].unique()[0] == i
+ assert g.index.nunique() == 1
+ assert g.index.unique()[0] == i
# non-frequent
variables_non = sorted(set(variables) - set(variables_num_freq))
- df_j = pivot_event_table(g).reindex(columns=variables_non).sort_index()
- df_values_j = most_recent_values(df_j, variables, T, dt)
- df_out = df_values_j
-
+ if len(variables_non) > 0:
+ df_j = pivot_event_table(g).reindex(columns=variables_non).sort_index()
+ df_values_j = most_recent_values(df_j, variables, T, dt)
+ df_out = df_values_j
+
if len(variables_num_freq) > 0:
# frequent
# we're only producing mask, ffill, and statistics if the data is measured frequently enough
@@ -345,51 +383,84 @@ def func_encode_single_time_series(i, g, variables, variables_num_freq, T, dt, s
if impute:
check_imputed_output(df_values_i)
check_imputed_output(df_stats_i)
-
+
df_out = df_out.join([mask_i, delta_t_i, df_values_i, df_stats_i])
except:
print(i)
raise Exception(i)
return i, df_out
+def divide_chunks(l, n):
+    # yield successive chunks of n items from list l
+ for i in range(0, len(l), n):
+ yield l[i:i + n]
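+
+# e.g. list(divide_chunks([1, 2, 3, 4, 5], 2)) == [[1, 2], [3, 4], [5]]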
+
+def form_batches_of_examples(df_in, args, batch_size):
+ grouped = df_in.set_index(ID_col)
+ IDs = list(grouped.index.unique())
+ batches_IDs = list(divide_chunks(IDs, batch_size))
+ batches = [grouped.loc[chunk] for chunk in batches_IDs]
+ return batches, batches_IDs
+
+def process_batch_time_series(first_arg):
+ batch, batch_IDs, args = first_arg
+ variables, variables_num_freq = args.variables, args.variables_num_freq
+ out = dict(
+ func_encode_single_time_series(i, batch.loc[i:i], variables, variables_num_freq, args.T, args.dt, args.stats_functions)
+ for i in batch_IDs
+ )
+ return out
+
def transform_time_series_table(df_in, args):
- data_path = args.data_path
+ output_dir = args.output_dir
theta_freq = args.theta_freq
stats_functions = args.stats_functions
N, L = args.N, args.L
df_population = args.df_population
parallel = args.parallel
-
+
## TODO: asserts shape of df_in
# Determine all unique variable names
variables = get_unique_variables(df_in)
assert df_in[var_col].nunique() == len(variables)
print('Total variables :', len(variables))
-
+
# Determine frequent variables -> we'll calculate statistics, mask, and delta_time only on these
variables_num_freq = get_frequent_numeric_variables(df_in, variables, theta_freq, args)
print('Frequent variables :', list(variables_num_freq))
print('{} = {}'.format('M\u2081', len(variables_num_freq)))
print('{} = {}'.format('M\u2082', len(variables) - len(variables_num_freq)))
print('{} = {} {}'.format('k ', len(stats_functions), stats_functions))
-
+
print()
print('Transforming each example...')
+ args.variables = variables
+ args.variables_num_freq = variables_num_freq
+
# Encode time series table for each patient
- grouped = list(df_in.groupby(ID_col))
- if parallel:
- out = dict(Parallel(n_jobs=n_jobs, verbose=10)(
- delayed(func_encode_single_time_series)(i, g, variables, variables_num_freq, args.T, args.dt, args.stats_functions)
- for i, g in grouped[:N]
+ if args.parallel:
+ batches, batches_IDs = form_batches_of_examples(df_in, args, batch_size=args.batch_size)
+ print('Batches of size {}: '.format(args.batch_size), len(batches))
+ pool = multiprocessing.Pool(args.n_jobs)
+ out = list(tqdm(pool.imap_unordered(
+ process_batch_time_series,
+ zip(batches, batches_IDs, [args]*len(batches))), total=len(batches)
))
-
+ pool.close()
+ pool.join()
+
+ out = dict((key, d[key]) for d in out for key in d)
+ print()
+ print('Parallel processing done', flush=True)
+
else:
+ grouped = list(df_in.groupby(ID_col))
out = dict(
- func_encode_single_time_series(i, g, variables, variables_num_freq, args.T, args.dt, args.stats_functions)
+ func_encode_single_time_series(i, g.set_index(ID_col), variables, variables_num_freq, args.T, args.dt, args.stats_functions)
for i, g in tqdm(grouped[:N])
)
-
+
# Handle IDs not in the table
df_original = list(out.values())[0]
df_copy = pd.DataFrame().reindex_like(df_original)
@@ -421,68 +492,68 @@ def transform_time_series_table(df_in, args):
D_timeseries = out
D_ = len(list(D_timeseries.values())[0].columns)
-
+
# (N*L)xD^ table
## Create MultiIndex of (ID, time_bin)
- index = sum([
+ index = sum([
[(ID, t_) for t_ in list(df_.index)]
- for ID, df_ in sorted(D_timeseries.items())
+ for ID, df_ in sorted(D_timeseries.items())
], [])
- index = pd.Index(index)
+ index = pd.Index(index, names=['ID', 't_range'])
assert len(index) == N * L
-
+
## Assume all dataframes have the same columns, used after concatenation
columns = list(sorted(D_timeseries.items())[0][1].columns)
columns = np.array(columns)
dtypes = sorted(D_timeseries.items())[0][1].dtypes
-
+
## Convert each df to a numpy array
## Concatenate **sorted** numpy arrays (faster than calling pd.concat)
feature_values = [(ID, df_.to_numpy()) for ID, df_ in sorted(D_timeseries.items())]
time_series = np.concatenate([feat_val[1] for feat_val in feature_values])
assert time_series.shape == (len(index), len(columns))
-
+
df_time_series = pd.DataFrame(data=time_series, index=index, columns=columns)
-
+
# Print metadata
print('DONE: Transforming each example...')
## Freq: Count missing entries using mask
ts_mask = df_time_series[[col for col in df_time_series if col.endswith('_mask')]]
ts_mask.columns = [col.replace('_mask', '') for col in ts_mask.columns]
- print('(freq) number of missing entries :\t',
+ print('(freq) number of missing entries :\t',
'{} out of {}={} total'.format(
- (1-ts_mask).astype(int).sum().sum(),
+ (1-ts_mask).astype(int).sum().sum(),
'\u00D7'.join(str(i) for i in [N,L,ts_mask.shape[1]]), ts_mask.size))
-
+
## Freq: Count imputed entries using mask and dt
ts_delta_time = df_time_series[[col for col in df_time_series if col.endswith('_delta_time')]]
ts_delta_time.columns = [col.replace('_delta_time', '') for col in ts_delta_time.columns]
-
+
imputed = (1-ts_mask).astype(bool) & (ts_delta_time > 0)
- print('(freq) number of imputed entries :\t',
+ print('(freq) number of imputed entries :\t',
          '{} out of {} total'.format(imputed.sum().sum(), ts_delta_time.size))
- imputed.sum().rename('count').to_csv(data_path + '/' + 'freq_imputed.csv')
-
+ imputed.sum().rename('count').to_csv(output_dir + '/' + 'freq_imputed.csv')
+
not_imputed = (1-ts_mask).astype(bool) & (ts_delta_time == 0)
- print('(freq) number of not imputed entries :\t',
+ print('(freq) number of not imputed entries :\t',
          '{} out of {} total'.format(not_imputed.sum().sum(), ts_delta_time.size))
- not_imputed.sum().rename('count').to_csv(data_path + '/' + 'freq_not_imputed.csv')
-
+ not_imputed.sum().rename('count').to_csv(output_dir + '/' + 'freq_not_imputed.csv')
+
## Non-Freq: Count missing entries
non_freq_cols = sorted([c + '_value' for c in set(variables) - set(variables_num_freq)])
non_freqs = df_time_series[non_freq_cols]
print('(non-freq) number of missing entries :\t',
'{} out of {}={} total'.format(
- non_freqs.isna().sum().sum(),
+ non_freqs.isna().sum().sum(),
'\u00D7'.join(str(i) for i in [N,L,non_freqs.shape[1]]), non_freqs.size))
-
+
print()
print('(N \u00D7 L \u00D7 ^D) table :\t', (N, L, len(columns)))
return df_time_series, dtypes
def map_time_series_features(df_time_series, dtypes, args):
N, L = args.N, args.L
-
+
df_time_series = df_time_series.dropna(axis='columns', how='all').sort_index()
print('Discretizing features...')
@@ -491,24 +562,36 @@ def map_time_series_features(df_time_series, dtypes, args):
assert len(ts_mixed.columns) + len(ts_mask.columns) == len(df_time_series.columns)
ts_feature_mask = ts_mask.astype(int)
ts_mixed_cols = [ts_mixed[col] for col in ts_mixed.columns]
-
+
print()
- if args.binarize:
+ discretization_bins = None
+ if args.discretize:
dtype = int
print('Processing', len(ts_mixed_cols), 'non-boolean variable columns...')
- print(' Binning numeric variables by quintile...')
- print(' Converting variables to binary features')
- if parallel:
- out = Parallel(n_jobs=n_jobs, verbose=10)( # Need to share global variables
- delayed(smart_qcut_dummify)(col_data, q=5, use_ordinal_encoding=use_ordinal_encoding) for col_data in ts_mixed_cols
- )
+ discretization_bins = args.X_discretization_bins
+ if discretization_bins is None:
+ print(' Computing bin edges for numeric variables...')
+ discretization_bins = [compute_bin_edges(col_data, q=5) for col_data in tqdm(ts_mixed_cols)]
+ discretization_bins = dict(discretization_bins)
+ else:
+            print('    Using predetermined bin edges for numeric variables...')
+
+ print(' Discretizing variables to binary features')
+ if args.parallel:
+ pool = multiprocessing.Pool(args.n_jobs)
+ out = list(tqdm(pool.imap_unordered(
+ smart_qcut_dummify_parallel,
+ [(col_data, discretization_bins[col_data.name], args.use_ordinal_encoding) for col_data in ts_mixed_cols]), total=len(ts_mixed_cols)
+ ))
+ pool.close()
+ pool.join()
else:
- out = [smart_qcut_dummify(col_data, q=5, use_ordinal_encoding=use_ordinal_encoding) for col_data in tqdm(ts_mixed_cols)]
+ out = [smart_qcut_dummify(col_data, discretization_bins[col_data.name], use_ordinal_encoding=args.use_ordinal_encoding) for col_data in tqdm(ts_mixed_cols)]
else:
dtype = float
df = ts_mixed.copy()
-
+
# Split a mixed column into numeric and string columns
for col in df.columns:
col_data = df[col]
@@ -518,31 +601,34 @@ def map_time_series_features(df_time_series, dtypes, args):
df[col+'_str'] = df[col].copy()
df.loc[~numeric_mask, col] = np.nan
df.loc[numeric_mask, col+'_str'] = np.nan
-
+
ts_mixed_cols = [df[col] for col in df.columns]
-
+
print('Discretizing categorical features...')
- if parallel:
- out = Parallel(n_jobs=n_jobs, verbose=10)( # Need to share global variables?
- delayed(smart_dummify_impute)(col_data) for col_data in ts_mixed_cols
- )
+ if args.parallel:
+ pool = multiprocessing.Pool(args.n_jobs)
+ out = list(tqdm(pool.imap_unordered(
+ smart_dummify_impute, [(col_data) for col_data in ts_mixed_cols]), total=len(ts_mixed_cols)
+ ))
+ pool.close()
+ pool.join()
else:
out = [smart_dummify_impute(col_data) for col_data in tqdm(ts_mixed_cols)]
-
+
out = [ts_feature_mask, *out]
D_all = sum(len(df_i.columns) for df_i in out)
X_all_feature_names = np.asarray(sum([list(df_i.columns) for df_i in out], []))
X_dense = np.concatenate([df_i.values for df_i in out], axis=1).astype(dtype)
X_all = sparse.COO(X_dense)
-
+
print('Finished discretizing features')
assert X_all.shape[0] == N * L
X_all = X_all.reshape((N, L, D_all))
-
+
print()
print('Output')
print('X_all: shape={}, density={:.3f}'.format(X_all.shape, X_all.density))
- return X_all, X_all_feature_names
+ return X_all, X_all_feature_names, discretization_bins
def post_filter_time_series(X_all, feature_names_all, threshold, args):
N, L = args.N, args.L
@@ -550,11 +636,11 @@ def post_filter_time_series(X_all, feature_names_all, threshold, args):
assert X_all.shape[1] == L
# assert X_all.dtype == int
start_time = time.time()
-
+
X0 = X_all
feature_names_0 = feature_names_all
print('Original :', len(feature_names_0))
-
+
## Remove nearly-constant features (with low variance)
sel_const = FrequencyThreshold_temporal(threshold=threshold, L=L)
sel_const.fit(X0.reshape((N*L, -1)))
@@ -564,7 +650,7 @@ def post_filter_time_series(X_all, feature_names_all, threshold, args):
feature_names_1 = feature_names_0[m_ts_const]
print('Nearly-constant:', len(feature_names_0) - len(feature_names_1))
print('*** time: ', time.time() - start_time)
-
+
## Keep only first of pairwise perfectly correlated features
sel_ts_corr = CorrelationSelector()
sel_ts_corr.fit(X1.reshape((N*L, -1)))
@@ -575,14 +661,14 @@ def post_filter_time_series(X_all, feature_names_all, threshold, args):
feature_aliases = sel_ts_corr.get_feature_aliases(feature_names_1)
print('Correlated :', len(feature_names_1) - len(feature_names_2))
print('*** time: ', time.time() - start_time)
-
+
X = sparse.COO(X2)
feature_names = feature_names_2
assert X.shape == (N, L, len(feature_names))
-
+
## Save output
print()
print('Output')
print('X: shape={}, density={:.3f}'.format(X.shape, X.density))
-
+
return X, feature_names, feature_aliases
diff --git a/README.md b/README.md
index 5573cd19c27570ff123f0b9e019de57eb634182c..6d3a896abb9f420f1f3ab7588f1f8788efd4cd5d 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,14 @@
# FIDDLE
-FIDDLE – FlexIble Data-Driven pipeLinE – is a preprocessing pipeline that transforms structured EHR data into feature vectors that can be used with ML algorithms, relying on only a small number of user-defined arguments.
+FIDDLE – FlexIble Data-Driven pipeLinE – is a preprocessing pipeline that transforms structured EHR data into feature vectors that can be used with ML algorithms, relying on only a small number of user-defined arguments.
-Requires python 3.6 or above. Required packages and versions are listed in `requirements.txt`. Older versions may still work but have not been tested.
+Try a quick demo here: [tiny.cc/FIDDLE-demo](https://tiny.cc/FIDDLE-demo)
-Note: This README contains latex equations and is best viewed on the GitLab site (https://gitlab.eecs.umich.edu/mld3/FIDDLE).
+Note: This README contains latex equations and is best viewed on the [GitLab site](https://gitlab.eecs.umich.edu/mld3/FIDDLE).
## Publications & Resources
- Title: Democratizing EHR analyses with FIDDLE: a flexible data-driven preprocessing pipeline for structured clinical data.
-- Authors: Shengpu Tang, Parmida Davarmanesh, Yanmeng Song, Danai Koutra, Michael W. Sjoding, and Jenna Wiens.
+- Authors: Shengpu Tang, Parmida Davarmanesh, Yanmeng Song, Danai Koutra, Michael W. Sjoding, and Jenna Wiens.
- Published in JAMIA (Journal of the American Medical Informatics Association), October 2020: [article link](https://doi.org/10.1093/jamia/ocaa139)
- Previously presented at MLHC 2019 ([Machine Learning for Healthcare](https://www.mlforhc.org/)) as a [clinical abstract](https://www.mlforhc.org/s/Sjoding-jete.pdf)
- News coverage on HealthcareITNews: [link](https://www.healthcareitnews.com/news/new-framework-helps-streamline-ehr-data-extraction)
@@ -23,20 +23,39 @@ If you use FIDDLE in your research, please cite the following publication:
journal = {Journal of the American Medical Informatics Association},
year = {2020},
month = {10},
- issn = {1527-974X},
doi = {10.1093/jamia/ocaa139},
}
```
+## System Requirements
+
+### Pip
+Requires python 3.7 or above (older versions may still work but have not been tested). Required packages and versions are listed in `requirements.txt`. Run the following command to install the required packages.
+```bash
+pip install -r requirements.txt
+```
+
+### Docker
+To build the docker image, run the following command:
+```bash
+docker build -t fiddle-v020 .
+```
+Refer to the notebook `tests/small_test/Run-docker.ipynb` for an example of running FIDDLE in Docker.
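+
+A possible invocation (illustrative; the bind mount and file names are placeholders):
+```bash
+docker run --rm -v "$PWD:/workdir/data" fiddle-v020 \
+    python -m FIDDLE.run --data_fname=data/input_data.csv \
+    --population_fname=data/pop.csv --output_dir=data/output/ --T=48 --dt=1.0
+```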
+
+
## Usage Notes
FIDDLE generates feature vectors based on data within the observation period $`t\in[0,T]`$. This feature representation can be used to make predictions of adverse outcomes at t=T. More specifically, FIDDLE outputs a set of binary feature vectors for each example $`i`$, $`\{(s_i,x_i)\ \text{for}\ i=1 \dots N\}`$ where $`s_i \in R^d`$ contains time-invariant features and $`x_i \in R^{L \times D}`$ contains time-dependent features.
-Input:
+Input:
- formatted EHR data: `.csv` or `.p`/`.pickle` file, a table with 4 columns \[`ID`, `t`, `variable_name`, `variable_value`\] (see the illustrative rows after this list)
- population file: a list of unique `ID`s you want processed
+ - the output feature matrix will correspond to IDs in lexicographically sorted order
+- config file:
+ - specifies additional settings by providing a custom `config.yaml` file
+ - a default config file is located at `FIDDLE/config-default.yaml`
- arguments:
- - T: The time of prediction; time-dependent features will be generated using data in $`t\in[0,T]`$.
- - dt: the temporal granularity at which to "window" time-dependent data.
+ - T: The time of prediction; time-dependent features will be generated using data in $`t\in[0,T]`$.
+ - dt: the temporal granularity at which to "window" time-dependent data.
- theta_1: The threshold for Pre-filter.
- theta_2: The threshold for Post-filter.
- theta_freq: The threshold at which we deem a variable “frequent” (for which summary statistics will be calculated).
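+
+For illustration, a few hypothetical input rows (an empty `t` marks a time-invariant variable; the variable names are made up):
+
+| ID | t   | variable_name | variable_value |
+|----|-----|---------------|----------------|
+| 1  | 0.5 | HR            | 85             |
+| 1  |     | AGE           | 65             |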
@@ -46,8 +65,10 @@ Output: The generated features and associated metadata are located in `{data_pat
- `S.npz`: a sparse array of shape (N, d)
- `X.npz`: a sparse tensor of shape (N, L, D)
-- `s.feature_names.txt`: names of _d_ time-invariant features
-- `X.feature_names.txt`: names of _D_ time-series features
+- `S.feature_names.json`: names of _d_ time-invariant features
+- `X.feature_names.json`: names of _D_ time-series features
+- `S.feature_aliases.json`: aliases of duplicated time-invariant features
+- `X.feature_aliases.json`: aliases of duplicated time-series features
To load the generated features:
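+
+A minimal sketch (the output directory path is a placeholder):
+```python
+import json
+import sparse
+
+S = sparse.load_npz('output/S.npz')    # time-invariant features, shape (N, d)
+X = sparse.load_npz('output/X.npz')    # time-dependent features, shape (N, L, D)
+S_feature_names = json.load(open('output/S.feature_names.json'))
+X_feature_names = json.load(open('output/X.feature_names.json'))
+```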
@@ -70,11 +91,11 @@ python -m FIDDLE.run \
## Guidelines on argument settings
The user-defined arguments of FIDDLE include: T, dt, theta_1, theta_2, theta_freq, and K statistics functions. The settings of these arguments could affect the features and how they can be used. We provided reasonable default values in the implementation, and here list some practical considerations: (i) prediction time and frequency, (ii) temporal density of data, and (iii) class balance.
-(i) The prediction time and frequency determine the appropriate settings for T and dt. The risk stratification tasks we considered all involve a single prediction at the end of a fixed prediction window. It is thus most reasonable to set T to be the length of prediction window. Another possible formulation is to make multiple predictions where each prediction depends on only data from the past (not the future), using models like LSTM or fully convolutional networks. In that case, for example, if a prediction needs to be made every 4 hours over a 48-hour period, then T should be 48 hours, whereas dt should be at most 4 hours.
+(i) The prediction time and frequency determine the appropriate settings for T and dt. The risk stratification tasks we considered all involve a single prediction at the end of a fixed prediction window. It is thus most reasonable to set T to be the length of the prediction window. Another possible formulation is to make multiple predictions where each prediction depends only on data from the past (not the future), using models like LSTM or fully convolutional networks. In that case, for example, if a prediction needs to be made every 4 hours over a 48-hour period, then T should be 48 hours, whereas dt should be at most 4 hours.
-(ii) The temporal density of data, that is, how often the variables are usually measured, also affects the setting of dt. This can be achieved by plotting a histogram of recording frequency. In our case, we observed that the maximum hourly frequency is ~1.2 times, which suggests dt should not be smaller than 1 hour. While most variables are recorded on average <0.1 time per hour (most of the time not recorded), the 6 vital signs are recorded slightly >1 time per hour. Thus, given that in the ICU, vital signs are usually collected once per hour, we set dt=1. This also implies the setting of θ_freq to be 1. Besides determining the value for dt from context (how granular we want to encode the data), we can also sweep the range (if there are sufficient computational resources and time) given the prediction frequency and the temporal density of data.
+(ii) The temporal density of data, that is, how often the variables are usually measured, also affects the setting of dt. This can be assessed by plotting a histogram of recording frequency. In our case, we observed that the maximum hourly frequency is ~1.2 times, which suggests dt should not be smaller than 1 hour. While most variables are recorded on average <0.1 times per hour (most of the time not recorded), the 6 vital signs are recorded slightly >1 time per hour. Thus, given that in the ICU, vital signs are usually collected once per hour, we set dt=1. This also implies setting θ_freq to 1. Besides determining the value of dt from context (how granular we want to encode the data), we can also sweep the range (if there are sufficient computational resources and time) given the prediction frequency and the temporal density of data.
-(iii) We recommend setting θ_1=θ_2=θ and be conservative to avoid removing information that could be potentially useful. For binary classification, the rule-of-the-thumb we suggest is to set θ to be about 1/100 of the minority class. For example, our cohorts consist of ~10% positive cases, so setting θ=0.001 is appropriate, whereas for a cohort with only 1% positive cases, then θ=0.0001 is more appropriate. Given sufficient computational resources and time, the value of θ can also be swept and optimized.
+(iii) We recommend setting θ_1=θ_2=θ and being conservative to avoid removing information that could be potentially useful. For binary classification, the rule of thumb we suggest is to set θ to be about 1/100 of the minority class rate. For example, our cohorts consist of ~10% positive cases, so setting θ=0.001 is appropriate, whereas for a cohort with only 1% positive cases, θ=0.0001 is more appropriate. Given sufficient computational resources and time, the value of θ can also be swept and optimized.
Finally, for the summary statistics functions, we included by default the most basic: minimum, maximum, and mean. If, on average, we expect more than one value per time bin, then we can also include higher-order statistics such as standard deviation and linear slope.
@@ -82,4 +103,4 @@ Finally, for the summary statistics functions, we included by default the most b
## Experiments
-In order to show the flexibility and utility of FIDDLE, we conducted several experiments using data from MIMIC-III and eICU. The code to reproduce the results are located at https://gitlab.eecs.umich.edu/MLD3/FIDDLE_experiments. The experiments were performed using FIDDLE v0.1.0 and reported in the JAMIA paper; bug fixes and new functionalities have since been implemented and may affect the numerical results.
+In order to show the flexibility and utility of FIDDLE, we conducted several experiments using data from MIMIC-III and eICU. The code to reproduce the results is located at https://gitlab.eecs.umich.edu/MLD3/FIDDLE_experiments. The experiments were performed using FIDDLE v0.1.0 and reported in the JAMIA paper; bug fixes and new functionalities have since been implemented and may affect the numerical results.
diff --git a/requirements.txt b/requirements.txt
index dd38396e52b8ce24c22b6ec82393b25606da7819..09232cd87101feafe96b04770f3f2b7a4996b230 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,9 @@
-numpy>=1.16
-pandas>=1.0.1
-sparse>=0.9.1
-scikit-learn>=0.22.1
-tqdm>=4.43.0
-joblib>=0.13.2
-icd9cms>=0.2.1
-icd10-cm>=0.0.4
+pyyaml>=5.3
+numpy>=1.19
+pandas>=1.1
+sparse>=0.11
+scikit-learn>=0.23
+tqdm>=4.50
+joblib>=0.16
+icd9cms==0.2.1
+icd10-cm==0.0.4
diff --git a/test/Test_Case-ICD.ipynb b/test/Test_Case-ICD.ipynb
deleted file mode 100644
index f94992cb3dfa89061c2019531eaacc6c6395a0fa..0000000000000000000000000000000000000000
--- a/test/Test_Case-ICD.ipynb
+++ /dev/null
@@ -1,329 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "df = pd.read_csv('./icd_data/input_data.csv')\n",
- "df.loc[df['variable_value'] == '71970', 'variable_value'] = '7197'\n",
- "df.to_csv('./icd_data/input_data.csv', index=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Input data file: ./icd_data/input_data.csv\n",
- "\n",
- "Input arguments:\n",
- " T = 4\n",
- " dt = 1.0\n",
- " θ₁ = 0.001\n",
- " θ₂ = 0.001\n",
- " θ_freq = 1.0\n",
- " k = 3 ['min', 'max', 'mean']\n",
- "binarize = yes\n",
- "\n",
- "N = 53122\n",
- "L = 4\n",
- "\n",
- "\n",
- "================================================================================\n",
- "1) Pre-filter\n",
- "================================================================================\n",
- "Remove rows not in population\n",
- "Remove rows with t outside of [0, 4]\n",
- "Remove rare variables (<= 0.001)\n",
- "Total variables : 1\n",
- "Rare variables : 0\n",
- "Remaining variables : 1\n",
- "# rows (original) : 569007\n",
- "# rows (filtered) : 569007\n",
- "\n",
- "================================================================================\n",
- "2) Transform; 3) Post-filter\n",
- "================================================================================\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "*) Detecting and parsing value types\n",
- "--------------------------------------------------------------------------------\n",
- "Saved as: ./icd_data/value_types.csv\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "*) Separate time-invariant and time-dependent\n",
- "--------------------------------------------------------------------------------\n",
- "Variables (time-invariant): 1447\n",
- "Variables (time-dependent): 0\n",
- "# rows (time-invariant): 1265903\n",
- "# rows (time-dependent): 0\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "2-A) Transform time-invariant data\n",
- "--------------------------------------------------------------------------------\n",
- "(N × ^d) table :\t (53122, 1447)\n",
- "number of missing entries :\t 75601631 out of 76867534 total\n",
- "Time elapsed: 8.736094 seconds\n",
- "\n",
- "Output\n",
- "s_all, binary features :\t (53122, 1447)\n",
- "Time elapsed: 115.795696 seconds\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "3-A) Post-filter time-invariant data\n",
- "--------------------------------------------------------------------------------\n",
- "Original : 1447\n",
- "Nearly-constant: 753\n",
- "Correlated : 7\n",
- "Time elapsed: 116.175213 seconds\n",
- "\n",
- "Output\n",
- "s: shape=(53122, 687), density=0.034\n",
- "Total time: 116.547743 seconds\n",
- "\n"
- ]
- }
- ],
- "source": [
- "! PYTHONPATH=\"$PYTHONPATH:../\" \\\n",
- "python -m FIDDLE.run \\\n",
- " --data_path='./icd_data/' \\\n",
- " --population='./icd_data/pop.csv' \\\n",
- " --T=4 --dt=1.0 \\\n",
- " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n",
- " --stats_functions 'min' 'max' 'mean'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Input data file: ./icd_test/input_data.csv\n",
- "\n",
- "Input arguments:\n",
- " T = 4\n",
- " dt = 1.0\n",
- " θ₁ = 0.001\n",
- " θ₂ = 0.001\n",
- " θ_freq = 1.0\n",
- " k = 3 ['min', 'max', 'mean']\n",
- "binarize = yes\n",
- "\n",
- "N = 200\n",
- "L = 4\n",
- "\n",
- "\n",
- "================================================================================\n",
- "1) Pre-filter\n",
- "================================================================================\n",
- "Remove rows not in population\n",
- "Remove rows with t outside of [0, 4]\n",
- "Remove rare variables (<= 0.001)\n",
- "Total variables : 1\n",
- "Rare variables : 0\n",
- "Remaining variables : 1\n",
- "# rows (original) : 1861\n",
- "# rows (filtered) : 1861\n",
- "\n",
- "================================================================================\n",
- "2) Transform; 3) Post-filter\n",
- "================================================================================\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "*) Detecting and parsing value types\n",
- "--------------------------------------------------------------------------------\n",
- "Saved as: ./icd_test/value_types.csv\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "*) Separate time-invariant and time-dependent\n",
- "--------------------------------------------------------------------------------\n",
- "Variables (time-invariant): 455\n",
- "Variables (time-dependent): 0\n",
- "# rows (time-invariant): 4205\n",
- "# rows (time-dependent): 0\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "2-A) Transform time-invariant data\n",
- "--------------------------------------------------------------------------------\n",
- "(N × ^d) table :\t (200, 455)\n",
- "number of missing entries :\t 86795 out of 91000 total\n",
- "Time elapsed: 0.101392 seconds\n",
- "\n",
- "Output\n",
- "s_all, binary features :\t (200, 455)\n",
- "Time elapsed: 1.779821 seconds\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "3-A) Post-filter time-invariant data\n",
- "--------------------------------------------------------------------------------\n",
- "Original : 455\n",
- "Nearly-constant: 0\n",
- "Correlated : 87\n",
- "Time elapsed: 1.820592 seconds\n",
- "\n",
- "Output\n",
- "s: shape=(200, 368), density=0.055\n",
- "Total time: 1.827327 seconds\n",
- "\n"
- ]
- }
- ],
- "source": [
- "! PYTHONPATH=\"$PYTHONPATH:../\" \\\n",
- "python -m FIDDLE.run \\\n",
- " --data_path='./icd_test/' \\\n",
- " --population='./icd_test/pop.csv' \\\n",
- " --T=4 --dt=1.0 \\\n",
- " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n",
- " --stats_functions 'min' 'max' 'mean'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "import sparse\n",
- "s = sparse.load_npz('./icd_test/s.npz').todense()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(200, 368)"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "s.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[0, 0, 0, ..., 0, 0, 0],\n",
- " [0, 0, 0, ..., 0, 0, 0],\n",
- " [0, 0, 0, ..., 0, 0, 0],\n",
- " ...,\n",
- " [0, 1, 0, ..., 0, 0, 0],\n",
- " [0, 1, 0, ..., 0, 0, 0],\n",
- " [0, 1, 1, ..., 0, 0, 0]])"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "s"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "import json\n",
- "names = json.load(open('./icd_test/s.feature_names.json', 'rb'))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "368"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(names)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.6"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/test/Test_Case.ipynb b/test/Test_Case.ipynb
deleted file mode 100644
index b249db411e8f5f04d5d3946374baaff0061cdbe4..0000000000000000000000000000000000000000
--- a/test/Test_Case.ipynb
+++ /dev/null
@@ -1,348 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Input data file: ./small_test/input_data.csv\n",
- "\n",
- "Input arguments:\n",
- " T = 4\n",
- " dt = 1.0\n",
- " θ₁ = 0.001\n",
- " θ₂ = 0.001\n",
- " θ_freq = 1.0\n",
- " k = 3 ['min', 'max', 'mean']\n",
- "\n",
- "N = 4\n",
- "L = 4\n",
- "\n",
- "\n",
- "================================================================================\n",
- "1) Pre-filter\n",
- "================================================================================\n",
- "Remove rows not in population\n",
- "Remove rows with t outside of [0, 4]\n",
- "Remove rare variables (<= 0.001)\n",
- "Total variables : 7\n",
- "Rare variables : 0\n",
- "Remaining variables : 7\n",
- "# rows (original) : 31\n",
- "# rows (filtered) : 31\n",
- "\n",
- "================================================================================\n",
- "2) Transform; 3) Post-filter\n",
- "================================================================================\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "*) Detecting value types\n",
- "--------------------------------------------------------------------------------\n",
- "Saved as: ./small_test/value_types.csv\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "*) Separate time-invariant and time-dependent\n",
- "--------------------------------------------------------------------------------\n",
- "Variables (time-invariant): 3\n",
- "Variables (time-dependent): 4\n",
- "# rows (time-invariant): 8\n",
- "# rows (time-dependent): 23\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "2.1) Transform time-invariant data\n",
- "--------------------------------------------------------------------------------\n",
- "(N × ^d) table :\t (4, 3)\n",
- "number of missing entries :\t 4 out of 12 total\n",
- "Time elapsed: 0.017584 seconds\n",
- "\n",
- "Output\n",
- "s_all, binary features :\t (4, 7)\n",
- "Time elapsed: 0.072829 seconds\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "3.1) Post-filter time-invariant data\n",
- "--------------------------------------------------------------------------------\n",
- "Original : 7\n",
- "Nearly-constant: 0\n",
- "Correlated : 3\n",
- "Time elapsed: 0.076066 seconds\n",
- "\n",
- "Output\n",
- "s: shape=(4, 4), density=0.312\n",
- "Total time: 0.078834 seconds\n",
- "\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "2.2) Transform time-dependent data\n",
- "--------------------------------------------------------------------------------\n",
- "Total variables : 4\n",
- "Frequent variables : ['HR']\n",
- "M₁ = 1\n",
- "M₂ = 3\n",
- "k = 3 ['min', 'max', 'mean']\n",
- "\n",
- "Transforming each example...\n",
- "[Parallel(n_jobs=72)]: Using backend LokyBackend with 72 concurrent workers.\n",
- "[Parallel(n_jobs=72)]: Done 1 tasks | elapsed: 3.8s\n",
- "[Parallel(n_jobs=72)]: Done 2 out of 4 | elapsed: 3.8s remaining: 3.8s\n",
- "[Parallel(n_jobs=72)]: Done 4 out of 4 | elapsed: 3.9s remaining: 0.0s\n",
- "[Parallel(n_jobs=72)]: Done 4 out of 4 | elapsed: 3.9s finished\n",
- "(freq) number of missing entries :\t 5 out of 4×4×1=16 total\n",
- "(freq) number of imputed entries :\t 4\n",
- " HR 4\n",
- "(freq) number of not imputed entries :\t 1\n",
- " HR 1\n",
- "(non-freq) number of missing entries :\t 41 out of 4×4×3=48 total\n",
- "\n",
- "(N × L × ^D) table :\t (4, 4, 9)\n",
- "Time elapsed: 3.977742 seconds\n",
- "\n",
- "Discretizing features...\n",
- "Processing 8 non-boolean variable columns...\n",
- " Binning numeric variables by quintile...\n",
- " Converting variables to binary features\n",
- "[Parallel(n_jobs=72)]: Using backend LokyBackend with 72 concurrent workers.\n",
- "[Parallel(n_jobs=72)]: Done 2 out of 8 | elapsed: 1.0s remaining: 3.0s\n",
- "[Parallel(n_jobs=72)]: Done 3 out of 8 | elapsed: 1.0s remaining: 1.7s\n",
- "[Parallel(n_jobs=72)]: Done 4 out of 8 | elapsed: 1.0s remaining: 1.0s\n",
- "[Parallel(n_jobs=72)]: Done 5 out of 8 | elapsed: 1.0s remaining: 0.6s\n",
- "[Parallel(n_jobs=72)]: Done 6 out of 8 | elapsed: 1.0s remaining: 0.3s\n",
- "[Parallel(n_jobs=72)]: Done 8 out of 8 | elapsed: 1.1s remaining: 0.0s\n",
- "[Parallel(n_jobs=72)]: Done 8 out of 8 | elapsed: 1.1s finished\n",
- "Finished discretizing features\n",
- "\n",
- "Output\n",
- "X_all: shape=(4, 4, 29), density=0.203\n",
- "Time elapsed: 5.103915 seconds\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "3.2) Post-filter time-dependent data\n",
- "--------------------------------------------------------------------------------\n",
- "(4, 4, 29) 0.2025862068965517\n",
- "Original : 29\n",
- "Nearly-constant: 0\n",
- "*** time: 2.486790657043457\n",
- "Correlated : 15\n",
- "*** time: 4.358332395553589\n",
- "\n",
- "Output\n",
- "X: shape=(4, 4, 14), density=0.237\n",
- "(4, 4, 14) 0.23660714285714285\n",
- "Time elapsed: 9.462556 seconds\n",
- "\n",
- "Output\n",
- "X: shape=(4, 4, 14), density=0.237\n",
- "Total time: 9.466846 seconds\n",
- "\n"
- ]
- }
- ],
- "source": [
- "! PYTHONPATH=\"$PYTHONPATH:../\" \\\n",
- "python -m FIDDLE.run \\\n",
- " --data_path='./small_test/' \\\n",
- " --population='./small_test/pop.csv' \\\n",
- " --T=4 --dt=1.0 \\\n",
- " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n",
- " --stats_functions 'min' 'max' 'mean'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Input data file: ./large_test/input_data.csv\n",
- "\n",
- "Input arguments:\n",
- " T = 4\n",
- " dt = 1.0\n",
- " θ₁ = 0.001\n",
- " θ₂ = 0.001\n",
- " θ_freq = 1.0\n",
- " k = 3 ['min', 'max', 'mean']\n",
- "\n",
- "N = 200\n",
- "L = 4\n",
- "\n",
- "\n",
- "================================================================================\n",
- "1) Pre-filter\n",
- "================================================================================\n",
- "Remove rows not in population\n",
- "Remove rows with t outside of [0, 4]\n",
- "Remove rare variables (<= 0.001)\n",
- "Total variables : 1970\n",
- "Rare variables : 0\n",
- "Remaining variables : 1970\n",
- "# rows (original) : 64777\n",
- "# rows (filtered) : 64777\n",
- "\n",
- "================================================================================\n",
- "2) Transform; 3) Post-filter\n",
- "================================================================================\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "*) Detecting value types\n",
- "--------------------------------------------------------------------------------\n",
- "Saved as: ./large_test/value_types.csv\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "*) Separate time-invariant and time-dependent\n",
- "--------------------------------------------------------------------------------\n",
- "Variables (time-invariant): 12\n",
- "Variables (time-dependent): 1958\n",
- "# rows (time-invariant): 2400\n",
- "# rows (time-dependent): 62377\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "2.1) Transform time-invariant data\n",
- "--------------------------------------------------------------------------------\n",
- "(N × ^d) table :\t (200, 12)\n",
- "number of missing entries :\t 4 out of 2400 total\n",
- "Time elapsed: 0.021392 seconds\n",
- "\n",
- "Output\n",
- "s_all, binary features :\t (200, 84)\n",
- "Time elapsed: 0.216294 seconds\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "3.1) Post-filter time-invariant data\n",
- "--------------------------------------------------------------------------------\n",
- "Original : 84\n",
- "Nearly-constant: 0\n",
- "Correlated : 7\n",
- "Time elapsed: 0.221074 seconds\n",
- "\n",
- "Output\n",
- "s: shape=(200, 77), density=0.145\n",
- "Total time: 0.225575 seconds\n",
- "\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "2.2) Transform time-dependent data\n",
- "--------------------------------------------------------------------------------\n",
- "Total variables : 1958\n",
- "Frequent variables : ['DiaBP', 'HR', 'RR', 'SpO2', 'SysBP']\n",
- "M₁ = 5\n",
- "M₂ = 1953\n",
- "k = 3 ['min', 'max', 'mean']\n",
- "\n",
- "Transforming each example...\n",
- "[Parallel(n_jobs=72)]: Using backend LokyBackend with 72 concurrent workers.\n",
- "[Parallel(n_jobs=72)]: Done 1 tasks | elapsed: 4.8s\n",
- "[Parallel(n_jobs=72)]: Done 18 tasks | elapsed: 8.0s\n",
- "[Parallel(n_jobs=72)]: Done 37 tasks | elapsed: 9.3s\n",
- "[Parallel(n_jobs=72)]: Done 56 tasks | elapsed: 10.3s\n",
- "[Parallel(n_jobs=72)]: Done 78 out of 200 | elapsed: 11.5s remaining: 18.0s\n",
- "[Parallel(n_jobs=72)]: Done 99 out of 200 | elapsed: 12.7s remaining: 13.0s\n",
- "[Parallel(n_jobs=72)]: Done 120 out of 200 | elapsed: 13.8s remaining: 9.2s\n",
- "[Parallel(n_jobs=72)]: Done 141 out of 200 | elapsed: 14.8s remaining: 6.2s\n",
- "[Parallel(n_jobs=72)]: Done 162 out of 200 | elapsed: 15.4s remaining: 3.6s\n",
- "[Parallel(n_jobs=72)]: Done 183 out of 200 | elapsed: 16.1s remaining: 1.5s\n",
- "[Parallel(n_jobs=72)]: Done 200 out of 200 | elapsed: 16.8s finished\n",
- "(freq) number of missing entries :\t 996 out of 200×4×5=4000 total\n",
- "(freq) number of imputed entries :\t 58\n",
- " DiaBP 17\n",
- " HR 5\n",
- " RR 6\n",
- " SpO2 13\n",
- " SysBP 17\n",
- "(freq) number of not imputed entries :\t 938\n",
- " DiaBP 190\n",
- " HR 180\n",
- " RR 183\n",
- " SpO2 195\n",
- " SysBP 190\n",
- "(non-freq) number of missing entries :\t 1510389 out of 200×4×1953=1562400 total\n",
- "\n",
- "(N × L × ^D) table :\t (200, 4, 1983)\n",
- "Time elapsed: 19.099867 seconds\n",
- "\n",
- "Discretizing features...\n",
- "Processing 1978 non-boolean variable columns...\n",
- " Binning numeric variables by quintile...\n",
- " Converting variables to binary features\n",
- "[Parallel(n_jobs=72)]: Using backend LokyBackend with 72 concurrent workers.\n",
- "[Parallel(n_jobs=72)]: Done 1 tasks | elapsed: 0.0s\n",
- "[Parallel(n_jobs=72)]: Batch computation too fast (0.0419s.) Setting batch_size=8.\n",
- "[Parallel(n_jobs=72)]: Batch computation too fast (0.0419s.) Setting batch_size=76.\n",
- "[Parallel(n_jobs=72)]: Done 9 tasks | elapsed: 0.2s\n",
- "[Parallel(n_jobs=72)]: Done 20 tasks | elapsed: 0.7s\n",
- "[Parallel(n_jobs=72)]: Done 1978 out of 1978 | elapsed: 6.7s finished\n",
- "Finished discretizing features\n",
- "\n",
- "Output\n",
- "X_all: shape=(200, 4, 3406), density=0.026\n",
- "Time elapsed: 26.408678 seconds\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "3.2) Post-filter time-dependent data\n",
- "--------------------------------------------------------------------------------\n",
- "(200, 4, 3406) 0.026153479154433352\n",
- "Original : 3406\n",
- "Nearly-constant: 5\n",
- "*** time: 3.5170133113861084\n",
- "Correlated : 1102\n",
- "*** time: 7.688496828079224\n",
- "\n",
- "Output\n",
- "X: shape=(200, 4, 2299), density=0.034\n",
- "(200, 4, 2299) 0.034270334928229666\n",
- "Time elapsed: 34.102943 seconds\n",
- "\n",
- "Output\n",
- "X: shape=(200, 4, 2299), density=0.034\n",
- "Total time: 34.251790 seconds\n",
- "\n"
- ]
- }
- ],
- "source": [
- "! PYTHONPATH=\"$PYTHONPATH:../\" \\\n",
- "python -m FIDDLE.run \\\n",
- " --data_path='./large_test/' \\\n",
- " --population='./large_test/pop.csv' \\\n",
- " --T=4 --dt=1.0 \\\n",
- " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n",
- " --stats_functions 'min' 'max' 'mean'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/tests/icd_test/Run.ipynb b/tests/icd_test/Run.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..c83b13c17e3f8b4b19b69af4c159bd46c62c47fd
--- /dev/null
+++ b/tests/icd_test/Run.ipynb
@@ -0,0 +1,1924 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "zsh:1: no matches found: output-*/\n"
+ ]
+ }
+ ],
+ "source": [
+ "!rm -rf output-*/"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Test: levels = [0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!mkdir -p output-0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Input:\n",
+ " Data : ./input/data.csv\n",
+ " Population: ./input/pop.csv\n",
+ " Config : ./input/config-0.yaml\n",
+ "\n",
+ "Output directory: ./output-0/\n",
+ "\n",
+ "Input arguments:\n",
+ " T = 4.0\n",
+ " dt = 1.0\n",
+ " θ₁ = 0.001\n",
+ " θ₂ = 0.001\n",
+ " θ_freq = 1.0\n",
+ " k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "discretize = yes\n",
+ " S discretization bins: to be computed from data\n",
+ " X discretization bins: to be computed from data\n",
+ "\n",
+ "N = 200\n",
+ "L = 4\n",
+ "\n",
+ "\n",
+ "================================================================================\n",
+ "1) Pre-filter\n",
+ "================================================================================\n",
+ "Remove rows not in population\n",
+ "Remove rows with t outside of [0, 4]\n",
+ "Remove rare variables (<= 0.001)\n",
+ "Total variables : 1\n",
+ "Rare variables : 0\n",
+ "Remaining variables : 1\n",
+ "# rows (original) : 1861\n",
+ "# rows (filtered) : 1861\n",
+ "\n",
+ "================================================================================\n",
+ "2) Transform; 3) Post-filter\n",
+ "================================================================================\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Detecting and parsing value types\n",
+ "--------------------------------------------------------------------------------\n",
+ "Saved as: ./output-0/value_types.csv\n",
+ "/Users/shengputang/Developer/FIDDLE/FIDDLE/steps.py:148: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " df_non_num_dup[var_col] = df_non_num_dup[var_col].astype(str) + ':' + df_non_num_dup[val_col].astype(str)\n",
+ "/Users/shengputang/Developer/FIDDLE/FIDDLE/steps.py:149: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " df_non_num_dup[val_col] = 1\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Separate time-invariant and time-dependent\n",
+ "--------------------------------------------------------------------------------\n",
+ "Variables (time-invariant): 19\n",
+ "Variables (time-dependent): 0\n",
+ "# rows (time-invariant): 984\n",
+ "# rows (time-dependent): 0\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-A) Transform time-invariant data\n",
+ "--------------------------------------------------------------------------------\n",
+ "(N × ^d) table :\t (200, 19)\n",
+ "number of missing entries :\t 2816 out of 3800 total\n",
+ "Time elapsed: 0.025395 seconds\n",
+ "\n",
+ "Output\n",
+ "S_all, binary features :\t (200, 21)\n",
+ "Time elapsed: 0.171098 seconds\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "3-A) Post-filter time-invariant data\n",
+ "--------------------------------------------------------------------------------\n",
+ "Original : 21\n",
+ "Nearly-constant: 0\n",
+ "Correlated : 0\n",
+ "Time elapsed: 0.178303 seconds\n",
+ "\n",
+ "Output\n",
+ "S: shape=(200, 21), density=0.234\n",
+ "Total time: 0.180898 seconds\n",
+ "\n",
+ "\u001b[0m"
+ ]
+ }
+ ],
+ "source": [
+ "! PYTHONPATH=\"$PYTHONPATH:../../\" \\\n",
+ "python -m FIDDLE.run \\\n",
+ " --data_fname='./input/data.csv' \\\n",
+ " --population_fname='./input/pop.csv' \\\n",
+ " --config_fname='./input/config-0.yaml' \\\n",
+ " --output_dir='./output-0/' \\\n",
+ " --T=4 --dt=1.0 \\\n",
+ " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n",
+ " --stats_functions 'min' 'max' 'mean'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ICD9_CODE_value_140-239 | \n",
+ " ICD9_CODE_value_390-459 | \n",
+ " ICD9_CODE_value_V01-V91 | \n",
+ " ICD9_CODE:001-139_value_1 | \n",
+ " ICD9_CODE:140-239_value_1 | \n",
+ " ICD9_CODE:240-279_value_1 | \n",
+ " ICD9_CODE:280-289_value_1 | \n",
+ " ICD9_CODE:290-319_value_1 | \n",
+ " ICD9_CODE:320-389_value_1 | \n",
+ " ICD9_CODE:390-459_value_1 | \n",
+ " ... | \n",
+ " ICD9_CODE:520-579_value_1 | \n",
+ " ICD9_CODE:580-629_value_1 | \n",
+ " ICD9_CODE:680-709_value_1 | \n",
+ " ICD9_CODE:710-739_value_1 | \n",
+ " ICD9_CODE:740-759_value_1 | \n",
+ " ICD9_CODE:760-779_value_1 | \n",
+ " ICD9_CODE:780-799_value_1 | \n",
+ " ICD9_CODE:800-999_value_1 | \n",
+ " ICD9_CODE:E000-E999_value_1 | \n",
+ " ICD9_CODE:V01-V91_value_1 | \n",
+ "
\n",
+ " \n",
+ " ID | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 100536 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 101757 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 102631 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 103251 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 104130 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 197661 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 198214 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 198892 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 199634 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 199724 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
200 rows × 21 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ICD9_CODE_value_140-239 ICD9_CODE_value_390-459 \\\n",
+ "ID \n",
+ "100536 0 0 \n",
+ "101757 0 0 \n",
+ "102631 0 0 \n",
+ "103251 0 0 \n",
+ "104130 0 0 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 0 0 \n",
+ "199634 0 0 \n",
+ "199724 0 0 \n",
+ "\n",
+ " ICD9_CODE_value_V01-V91 ICD9_CODE:001-139_value_1 \\\n",
+ "ID \n",
+ "100536 0 1 \n",
+ "101757 0 1 \n",
+ "102631 0 0 \n",
+ "103251 1 0 \n",
+ "104130 0 1 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 0 0 \n",
+ "199634 0 1 \n",
+ "199724 0 1 \n",
+ "\n",
+ " ICD9_CODE:140-239_value_1 ICD9_CODE:240-279_value_1 \\\n",
+ "ID \n",
+ "100536 0 0 \n",
+ "101757 1 0 \n",
+ "102631 0 0 \n",
+ "103251 0 0 \n",
+ "104130 0 1 \n",
+ "... ... ... \n",
+ "197661 0 1 \n",
+ "198214 0 1 \n",
+ "198892 0 0 \n",
+ "199634 1 1 \n",
+ "199724 0 1 \n",
+ "\n",
+ " ICD9_CODE:280-289_value_1 ICD9_CODE:290-319_value_1 \\\n",
+ "ID \n",
+ "100536 1 1 \n",
+ "101757 1 1 \n",
+ "102631 0 0 \n",
+ "103251 0 0 \n",
+ "104130 1 1 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 1 0 \n",
+ "198892 0 0 \n",
+ "199634 0 0 \n",
+ "199724 0 1 \n",
+ "\n",
+ " ICD9_CODE:320-389_value_1 ICD9_CODE:390-459_value_1 ... \\\n",
+ "ID ... \n",
+ "100536 0 0 ... \n",
+ "101757 1 1 ... \n",
+ "102631 0 1 ... \n",
+ "103251 0 0 ... \n",
+ "104130 0 1 ... \n",
+ "... ... ... ... \n",
+ "197661 0 1 ... \n",
+ "198214 1 1 ... \n",
+ "198892 0 0 ... \n",
+ "199634 0 1 ... \n",
+ "199724 1 1 ... \n",
+ "\n",
+ " ICD9_CODE:520-579_value_1 ICD9_CODE:580-629_value_1 \\\n",
+ "ID \n",
+ "100536 1 0 \n",
+ "101757 0 0 \n",
+ "102631 0 0 \n",
+ "103251 0 0 \n",
+ "104130 0 1 \n",
+ "... ... ... \n",
+ "197661 0 1 \n",
+ "198214 1 0 \n",
+ "198892 0 0 \n",
+ "199634 1 0 \n",
+ "199724 0 0 \n",
+ "\n",
+ " ICD9_CODE:680-709_value_1 ICD9_CODE:710-739_value_1 \\\n",
+ "ID \n",
+ "100536 0 0 \n",
+ "101757 1 0 \n",
+ "102631 0 0 \n",
+ "103251 0 0 \n",
+ "104130 0 0 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 0 0 \n",
+ "199634 0 1 \n",
+ "199724 0 0 \n",
+ "\n",
+ " ICD9_CODE:740-759_value_1 ICD9_CODE:760-779_value_1 \\\n",
+ "ID \n",
+ "100536 0 0 \n",
+ "101757 0 0 \n",
+ "102631 0 0 \n",
+ "103251 0 0 \n",
+ "104130 0 0 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 0 1 \n",
+ "199634 0 0 \n",
+ "199724 0 0 \n",
+ "\n",
+ " ICD9_CODE:780-799_value_1 ICD9_CODE:800-999_value_1 \\\n",
+ "ID \n",
+ "100536 0 1 \n",
+ "101757 1 0 \n",
+ "102631 0 1 \n",
+ "103251 0 0 \n",
+ "104130 1 0 \n",
+ "... ... ... \n",
+ "197661 0 1 \n",
+ "198214 0 0 \n",
+ "198892 0 0 \n",
+ "199634 0 0 \n",
+ "199724 0 1 \n",
+ "\n",
+ " ICD9_CODE:E000-E999_value_1 ICD9_CODE:V01-V91_value_1 \n",
+ "ID \n",
+ "100536 1 0 \n",
+ "101757 0 1 \n",
+ "102631 0 0 \n",
+ "103251 0 0 \n",
+ "104130 0 1 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 0 1 \n",
+ "199634 0 1 \n",
+ "199724 0 0 \n",
+ "\n",
+ "[200 rows x 21 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import json\n",
+ "import sparse\n",
+ "\n",
+ "S = sparse.load_npz('output-0/S_all.npz')\n",
+ "S_names = json.load(open('output-0/S_all.feature_names.json', 'r'))\n",
+ "S_index = pd.read_csv('output-0/S.ID.csv').set_index(['ID'])\n",
+ "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n",
+ "\n",
+ "display(df_S)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Test: levels = [0,1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!mkdir -p output-1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Input:\n",
+ " Data : ./input/data.csv\n",
+ " Population: ./input/pop.csv\n",
+ " Config : ./input/config-1.yaml\n",
+ "\n",
+ "Output directory: ./output-1/\n",
+ "\n",
+ "Input arguments:\n",
+ " T = 4.0\n",
+ " dt = 1.0\n",
+ " θ₁ = 0.001\n",
+ " θ₂ = 0.001\n",
+ " θ_freq = 1.0\n",
+ " k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "discretize = yes\n",
+ " S discretization bins: to be computed from data\n",
+ " X discretization bins: to be computed from data\n",
+ "\n",
+ "N = 200\n",
+ "L = 4\n",
+ "\n",
+ "\n",
+ "================================================================================\n",
+ "1) Pre-filter\n",
+ "================================================================================\n",
+ "Remove rows not in population\n",
+ "Remove rows with t outside of [0, 4]\n",
+ "Remove rare variables (<= 0.001)\n",
+ "Total variables : 1\n",
+ "Rare variables : 0\n",
+ "Remaining variables : 1\n",
+ "# rows (original) : 1861\n",
+ "# rows (filtered) : 1861\n",
+ "\n",
+ "================================================================================\n",
+ "2) Transform; 3) Post-filter\n",
+ "================================================================================\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Detecting and parsing value types\n",
+ "--------------------------------------------------------------------------------\n",
+ "Saved as: ./output-1/value_types.csv\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Separate time-invariant and time-dependent\n",
+ "--------------------------------------------------------------------------------\n",
+ "Variables (time-invariant): 129\n",
+ "Variables (time-dependent): 0\n",
+ "# rows (time-invariant): 2463\n",
+ "# rows (time-dependent): 0\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-A) Transform time-invariant data\n",
+ "--------------------------------------------------------------------------------\n",
+ "(N × ^d) table :\t (200, 129)\n",
+ "number of missing entries :\t 23337 out of 25800 total\n",
+ "Time elapsed: 0.057711 seconds\n",
+ "\n",
+ "Output\n",
+ "S_all, binary features :\t (200, 129)\n",
+ "Time elapsed: 0.830818 seconds\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "3-A) Post-filter time-invariant data\n",
+ "--------------------------------------------------------------------------------\n",
+ "Original : 129\n",
+ "Nearly-constant: 0\n",
+ "Correlated : 2\n",
+ "Time elapsed: 0.840801 seconds\n",
+ "\n",
+ "Output\n",
+ "S: shape=(200, 127), density=0.097\n",
+ "Total time: 0.844234 seconds\n",
+ "\n",
+ "\u001b[0m"
+ ]
+ }
+ ],
+ "source": [
+ "! PYTHONPATH=\"$PYTHONPATH:../../\" \\\n",
+ "python -m FIDDLE.run \\\n",
+ " --data_fname='./input/data.csv' \\\n",
+ " --population_fname='./input/pop.csv' \\\n",
+ " --config_fname='./input/config-1.yaml' \\\n",
+ " --output_dir='./output-1/' \\\n",
+ " --T=4 --dt=1.0 \\\n",
+ " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n",
+ " --stats_functions 'min' 'max' 'mean'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ICD9_CODE:001-009_value_1.0 | \n",
+ " ICD9_CODE:001-139_value_1.0 | \n",
+ " ICD9_CODE:030-041_value_1.0 | \n",
+ " ICD9_CODE:042-042_value_1.0 | \n",
+ " ICD9_CODE:070-079_value_1.0 | \n",
+ " ICD9_CODE:080-088_value_1.0 | \n",
+ " ICD9_CODE:110-118_value_1.0 | \n",
+ " ICD9_CODE:130-136_value_1.0 | \n",
+ " ICD9_CODE:140-239_value_1.0 | \n",
+ " ICD9_CODE:150-159_value_1.0 | \n",
+ " ... | \n",
+ " ICD9_CODE:E960-E969_value_1.0 | \n",
+ " ICD9_CODE:V01-V09_value_1.0 | \n",
+ " ICD9_CODE:V01-V91_value_1.0 | \n",
+ " ICD9_CODE:V10-V19_value_1.0 | \n",
+ " ICD9_CODE:V20-V29_value_1.0 | \n",
+ " ICD9_CODE:V30-V39_value_1.0 | \n",
+ " ICD9_CODE:V40-V49_value_1.0 | \n",
+ " ICD9_CODE:V50-V59_value_1.0 | \n",
+ " ICD9_CODE:V60-V69_value_1.0 | \n",
+ " ICD9_CODE:V70-V82_value_1.0 | \n",
+ "
\n",
+ " \n",
+ " ID | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 100536 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 101757 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 102631 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 103251 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 104130 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 197661 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 198214 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 198892 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 199634 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 199724 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
200 rows × 129 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ICD9_CODE:001-009_value_1.0 ICD9_CODE:001-139_value_1.0 \\\n",
+ "ID \n",
+ "100536 0 1 \n",
+ "101757 0 1 \n",
+ "102631 0 0 \n",
+ "103251 0 0 \n",
+ "104130 0 1 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 0 0 \n",
+ "199634 0 1 \n",
+ "199724 0 1 \n",
+ "\n",
+ " ICD9_CODE:030-041_value_1.0 ICD9_CODE:042-042_value_1.0 \\\n",
+ "ID \n",
+ "100536 0 1 \n",
+ "101757 0 0 \n",
+ "102631 0 0 \n",
+ "103251 0 0 \n",
+ "104130 1 0 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 0 0 \n",
+ "199634 0 0 \n",
+ "199724 1 0 \n",
+ "\n",
+ " ICD9_CODE:070-079_value_1.0 ICD9_CODE:080-088_value_1.0 \\\n",
+ "ID \n",
+ "100536 1 0 \n",
+ "101757 0 0 \n",
+ "102631 0 0 \n",
+ "103251 0 0 \n",
+ "104130 0 0 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 0 0 \n",
+ "199634 0 0 \n",
+ "199724 0 0 \n",
+ "\n",
+ " ICD9_CODE:110-118_value_1.0 ICD9_CODE:130-136_value_1.0 \\\n",
+ "ID \n",
+ "100536 0 0 \n",
+ "101757 1 0 \n",
+ "102631 0 0 \n",
+ "103251 0 0 \n",
+ "104130 0 0 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 0 0 \n",
+ "199634 0 1 \n",
+ "199724 0 0 \n",
+ "\n",
+ " ICD9_CODE:140-239_value_1.0 ICD9_CODE:150-159_value_1.0 ... \\\n",
+ "ID ... \n",
+ "100536 0 0 ... \n",
+ "101757 1 0 ... \n",
+ "102631 0 0 ... \n",
+ "103251 0 0 ... \n",
+ "104130 0 0 ... \n",
+ "... ... ... ... \n",
+ "197661 0 0 ... \n",
+ "198214 0 0 ... \n",
+ "198892 0 0 ... \n",
+ "199634 1 0 ... \n",
+ "199724 0 0 ... \n",
+ "\n",
+ " ICD9_CODE:E960-E969_value_1.0 ICD9_CODE:V01-V09_value_1.0 \\\n",
+ "ID \n",
+ "100536 0 0 \n",
+ "101757 0 0 \n",
+ "102631 0 0 \n",
+ "103251 0 1 \n",
+ "104130 0 0 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 0 0 \n",
+ "199634 0 0 \n",
+ "199724 0 0 \n",
+ "\n",
+ " ICD9_CODE:V01-V91_value_1.0 ICD9_CODE:V10-V19_value_1.0 \\\n",
+ "ID \n",
+ "100536 0 0 \n",
+ "101757 1 1 \n",
+ "102631 0 0 \n",
+ "103251 1 0 \n",
+ "104130 1 0 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 1 0 \n",
+ "199634 1 0 \n",
+ "199724 0 0 \n",
+ "\n",
+ " ICD9_CODE:V20-V29_value_1.0 ICD9_CODE:V30-V39_value_1.0 \\\n",
+ "ID \n",
+ "100536 0 0 \n",
+ "101757 0 0 \n",
+ "102631 0 0 \n",
+ "103251 1 1 \n",
+ "104130 0 0 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 0 1 \n",
+ "199634 0 0 \n",
+ "199724 0 0 \n",
+ "\n",
+ " ICD9_CODE:V40-V49_value_1.0 ICD9_CODE:V50-V59_value_1.0 \\\n",
+ "ID \n",
+ "100536 0 0 \n",
+ "101757 0 0 \n",
+ "102631 0 0 \n",
+ "103251 0 1 \n",
+ "104130 1 0 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 0 0 \n",
+ "199634 0 1 \n",
+ "199724 0 0 \n",
+ "\n",
+ " ICD9_CODE:V60-V69_value_1.0 ICD9_CODE:V70-V82_value_1.0 \n",
+ "ID \n",
+ "100536 0 0 \n",
+ "101757 0 0 \n",
+ "102631 0 0 \n",
+ "103251 0 1 \n",
+ "104130 0 0 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 0 0 \n",
+ "199634 0 0 \n",
+ "199724 0 0 \n",
+ "\n",
+ "[200 rows x 129 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import json\n",
+ "import sparse\n",
+ "\n",
+ "S = sparse.load_npz('output-1/S_all.npz')\n",
+ "S_names = json.load(open('output-1/S_all.feature_names.json', 'r'))\n",
+ "S_index = pd.read_csv('output-1/S.ID.csv').set_index(['ID'])\n",
+ "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n",
+ "\n",
+ "display(df_S)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Test: levels = [0,1,2]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!mkdir -p output-2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Input:\n",
+ " Data : ./input/data.csv\n",
+ " Population: ./input/pop.csv\n",
+ " Config : ./input/config-2.yaml\n",
+ "\n",
+ "Output directory: ./output-2/\n",
+ "\n",
+ "Input arguments:\n",
+ " T = 4.0\n",
+ " dt = 1.0\n",
+ " θ₁ = 0.001\n",
+ " θ₂ = 0.001\n",
+ " θ_freq = 1.0\n",
+ " k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "discretize = yes\n",
+ " S discretization bins: to be computed from data\n",
+ " X discretization bins: to be computed from data\n",
+ "\n",
+ "N = 200\n",
+ "L = 4\n",
+ "\n",
+ "\n",
+ "================================================================================\n",
+ "1) Pre-filter\n",
+ "================================================================================\n",
+ "Remove rows not in population\n",
+ "Remove rows with t outside of [0, 4]\n",
+ "Remove rare variables (<= 0.001)\n",
+ "Total variables : 1\n",
+ "Rare variables : 0\n",
+ "Remaining variables : 1\n",
+ "# rows (original) : 1861\n",
+ "# rows (filtered) : 1861\n",
+ "\n",
+ "================================================================================\n",
+ "2) Transform; 3) Post-filter\n",
+ "================================================================================\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Detecting and parsing value types\n",
+ "--------------------------------------------------------------------------------\n",
+ "Saved as: ./output-2/value_types.csv\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Separate time-invariant and time-dependent\n",
+ "--------------------------------------------------------------------------------\n",
+ "Variables (time-invariant): 455\n",
+ "Variables (time-dependent): 0\n",
+ "# rows (time-invariant): 4205\n",
+ "# rows (time-dependent): 0\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-A) Transform time-invariant data\n",
+ "--------------------------------------------------------------------------------\n",
+ "(N × ^d) table :\t (200, 455)\n",
+ "number of missing entries :\t 86795 out of 91000 total\n",
+ "Time elapsed: 0.112510 seconds\n",
+ "\n",
+ "Output\n",
+ "S_all, binary features :\t (200, 455)\n",
+ "Time elapsed: 2.377939 seconds\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "3-A) Post-filter time-invariant data\n",
+ "--------------------------------------------------------------------------------\n",
+ "Original : 455\n",
+ "Nearly-constant: 0\n",
+ "Correlated : 87\n",
+ "Time elapsed: 2.428499 seconds\n",
+ "\n",
+ "Output\n",
+ "S: shape=(200, 368), density=0.055\n",
+ "Total time: 2.435949 seconds\n",
+ "\n",
+ "\u001b[0m"
+ ]
+ }
+ ],
+ "source": [
+ "! PYTHONPATH=\"$PYTHONPATH:../../\" \\\n",
+ "python -m FIDDLE.run \\\n",
+ " --data_fname='./input/data.csv' \\\n",
+ " --population_fname='./input/pop.csv' \\\n",
+ " --config_fname='./input/config-2.yaml' \\\n",
+ " --output_dir='./output-2/' \\\n",
+ " --T=4 --dt=1.0 \\\n",
+ " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n",
+ " --stats_functions 'min' 'max' 'mean'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ICD9_CODE:001-009_value_1.0 | \n",
+ " ICD9_CODE:001-139_value_1.0 | \n",
+ " ICD9_CODE:008_value_1.0 | \n",
+ " ICD9_CODE:030-041_value_1.0 | \n",
+ " ICD9_CODE:038_value_1.0 | \n",
+ " ICD9_CODE:041_value_1.0 | \n",
+ " ICD9_CODE:042_value_1.0 | \n",
+ " ICD9_CODE:042-042_value_1.0 | \n",
+ " ICD9_CODE:070_value_1.0 | \n",
+ " ICD9_CODE:070-079_value_1.0 | \n",
+ " ... | \n",
+ " ICD9_CODE:V49_value_1.0 | \n",
+ " ICD9_CODE:V50_value_1.0 | \n",
+ " ICD9_CODE:V50-V59_value_1.0 | \n",
+ " ICD9_CODE:V54_value_1.0 | \n",
+ " ICD9_CODE:V58_value_1.0 | \n",
+ " ICD9_CODE:V60-V69_value_1.0 | \n",
+ " ICD9_CODE:V64_value_1.0 | \n",
+ " ICD9_CODE:V66_value_1.0 | \n",
+ " ICD9_CODE:V70-V82_value_1.0 | \n",
+ " ICD9_CODE:V72_value_1.0 | \n",
+ "
\n",
+ " \n",
+ " ID | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 100536 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 101757 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 102631 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 103251 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 104130 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 197661 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 198214 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 198892 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 199634 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 199724 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
200 rows × 455 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ICD9_CODE:001-009_value_1.0 ICD9_CODE:001-139_value_1.0 \\\n",
+ "ID \n",
+ "100536 0 1 \n",
+ "101757 0 1 \n",
+ "102631 0 0 \n",
+ "103251 0 0 \n",
+ "104130 0 1 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 0 0 \n",
+ "199634 0 1 \n",
+ "199724 0 1 \n",
+ "\n",
+ " ICD9_CODE:008_value_1.0 ICD9_CODE:030-041_value_1.0 \\\n",
+ "ID \n",
+ "100536 0 0 \n",
+ "101757 0 0 \n",
+ "102631 0 0 \n",
+ "103251 0 0 \n",
+ "104130 0 1 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 0 0 \n",
+ "199634 0 0 \n",
+ "199724 0 1 \n",
+ "\n",
+ " ICD9_CODE:038_value_1.0 ICD9_CODE:041_value_1.0 \\\n",
+ "ID \n",
+ "100536 0 0 \n",
+ "101757 0 0 \n",
+ "102631 0 0 \n",
+ "103251 0 0 \n",
+ "104130 0 1 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 0 0 \n",
+ "199634 0 0 \n",
+ "199724 1 0 \n",
+ "\n",
+ " ICD9_CODE:042_value_1.0 ICD9_CODE:042-042_value_1.0 \\\n",
+ "ID \n",
+ "100536 1 1 \n",
+ "101757 0 0 \n",
+ "102631 0 0 \n",
+ "103251 0 0 \n",
+ "104130 0 0 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 0 0 \n",
+ "199634 0 0 \n",
+ "199724 0 0 \n",
+ "\n",
+ " ICD9_CODE:070_value_1.0 ICD9_CODE:070-079_value_1.0 ... \\\n",
+ "ID ... \n",
+ "100536 1 1 ... \n",
+ "101757 0 0 ... \n",
+ "102631 0 0 ... \n",
+ "103251 0 0 ... \n",
+ "104130 0 0 ... \n",
+ "... ... ... ... \n",
+ "197661 0 0 ... \n",
+ "198214 0 0 ... \n",
+ "198892 0 0 ... \n",
+ "199634 0 0 ... \n",
+ "199724 0 0 ... \n",
+ "\n",
+ " ICD9_CODE:V49_value_1.0 ICD9_CODE:V50_value_1.0 \\\n",
+ "ID \n",
+ "100536 0 0 \n",
+ "101757 0 0 \n",
+ "102631 0 0 \n",
+ "103251 0 1 \n",
+ "104130 0 0 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 0 0 \n",
+ "199634 0 0 \n",
+ "199724 0 0 \n",
+ "\n",
+ " ICD9_CODE:V50-V59_value_1.0 ICD9_CODE:V54_value_1.0 \\\n",
+ "ID \n",
+ "100536 0 0 \n",
+ "101757 0 0 \n",
+ "102631 0 0 \n",
+ "103251 1 0 \n",
+ "104130 0 0 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 0 0 \n",
+ "199634 1 0 \n",
+ "199724 0 0 \n",
+ "\n",
+ " ICD9_CODE:V58_value_1.0 ICD9_CODE:V60-V69_value_1.0 \\\n",
+ "ID \n",
+ "100536 0 0 \n",
+ "101757 0 0 \n",
+ "102631 0 0 \n",
+ "103251 0 0 \n",
+ "104130 0 0 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 0 0 \n",
+ "199634 1 0 \n",
+ "199724 0 0 \n",
+ "\n",
+ " ICD9_CODE:V64_value_1.0 ICD9_CODE:V66_value_1.0 \\\n",
+ "ID \n",
+ "100536 0 0 \n",
+ "101757 0 0 \n",
+ "102631 0 0 \n",
+ "103251 0 0 \n",
+ "104130 0 0 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 0 0 \n",
+ "199634 0 0 \n",
+ "199724 0 0 \n",
+ "\n",
+ " ICD9_CODE:V70-V82_value_1.0 ICD9_CODE:V72_value_1.0 \n",
+ "ID \n",
+ "100536 0 0 \n",
+ "101757 0 0 \n",
+ "102631 0 0 \n",
+ "103251 1 1 \n",
+ "104130 0 0 \n",
+ "... ... ... \n",
+ "197661 0 0 \n",
+ "198214 0 0 \n",
+ "198892 0 0 \n",
+ "199634 0 0 \n",
+ "199724 0 0 \n",
+ "\n",
+ "[200 rows x 455 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import json\n",
+ "import sparse\n",
+ "\n",
+ "S = sparse.load_npz('output-2/S_all.npz')\n",
+ "S_names = json.load(open('output-2/S_all.feature_names.json', 'r'))\n",
+ "S_index = pd.read_csv('output-2/S.ID.csv').set_index(['ID'])\n",
+ "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n",
+ "\n",
+ "display(df_S)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
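
The last populated cell of the notebook above reconstructs a labeled DataFrame from FIDDLE's three time-invariant artifacts: the sparse COO tensor (S_all.npz), the feature-name list (S_all.feature_names.json), and the row index (S.ID.csv). A minimal consistency check over the same files — a sketch, assuming the test has been run so that output-2/ exists:

    import json
    import sparse

    # One feature name per tensor column, one row per patient ID.
    S = sparse.load_npz('output-2/S_all.npz')
    S_names = json.load(open('output-2/S_all.feature_names.json'))
    assert S.shape == (200, len(S_names))  # 200 patients x 455 binary features

    # Hierarchical features share a ':'-separated base name, e.g.
    # 'ICD9_CODE:001-139_value_1.0' -> 'ICD9_CODE'.
    print({name.split(':')[0] for name in S_names})
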
diff --git a/tests/icd_test/input/config-0.yaml b/tests/icd_test/input/config-0.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..913d9292b81922c79edb8e62a0e5f6c87efdda27
--- /dev/null
+++ b/tests/icd_test/input/config-0.yaml
@@ -0,0 +1,2 @@
+hierarchical_sep: ':'
+hierarchical_levels: [0]
diff --git a/tests/icd_test/input/config-1.yaml b/tests/icd_test/input/config-1.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2c6ecce9bfea06c237f27be76622bf9699e9898d
--- /dev/null
+++ b/tests/icd_test/input/config-1.yaml
@@ -0,0 +1,2 @@
+hierarchical_sep: ':'
+hierarchical_levels: [0,1]
diff --git a/tests/icd_test/input/config-2.yaml b/tests/icd_test/input/config-2.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9c6d4af6b14efe802a5f423eac094966cdc95d7c
--- /dev/null
+++ b/tests/icd_test/input/config-2.yaml
@@ -0,0 +1,2 @@
+hierarchical_sep: ':'
+hierarchical_levels: [0,1,2]
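
These three configs differ only in which levels of the ':'-separated variable-name hierarchy are kept. The feature names in the test output above (ICD9_CODE:001-139_value_1.0, ICD9_CODE:001-009_value_1.0, ICD9_CODE:008_value_1.0) suggest that level i contributes the base name plus the hierarchy component at depth i. A hypothetical helper illustrating that reading (a sketch, not FIDDLE's internal code):

    # Assumption: the raw variable name carries its full ancestry,
    # e.g. 'ICD9_CODE:001-139:001-009:008' (chapter : group : code).
    def expand_hierarchy(var_name, sep=':', levels=(0, 1, 2)):
        base, *hier = var_name.split(sep)
        # level i keeps only the component at depth i, prefixed by the base
        return [f'{base}{sep}{hier[i]}' for i in levels if i < len(hier)]

    print(expand_hierarchy('ICD9_CODE:001-139:001-009:008'))
    # ['ICD9_CODE:001-139', 'ICD9_CODE:001-009', 'ICD9_CODE:008']

Under config-0.yaml only the top level (e.g. ICD9_CODE:001-139) would be generated; config-2.yaml keeps all three.
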
diff --git a/test/icd_test/input_data.csv b/tests/icd_test/input/data.csv
similarity index 100%
rename from test/icd_test/input_data.csv
rename to tests/icd_test/input/data.csv
diff --git a/test/icd_test/icd_test_data.csv b/tests/icd_test/input/icd_test_data.csv
similarity index 100%
rename from test/icd_test/icd_test_data.csv
rename to tests/icd_test/input/icd_test_data.csv
diff --git a/test/icd_test/pop.csv b/tests/icd_test/input/pop.csv
similarity index 100%
rename from test/icd_test/pop.csv
rename to tests/icd_test/input/pop.csv
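
The new tests/large_test/Run.ipynb below exercises the pipeline end-to-end on a 200-patient extract under three configurations: discretization into bin-indicator features, discretization with ordinal (threshold) encoding, and no discretization. Each test cell shells out to python -m FIDDLE.run; the equivalent call from Python, using only the flags visible in those cells (a sketch, not an additional test):

    import os
    import subprocess

    # Mirror the notebook's '!' cell: put the repo root on PYTHONPATH, then run FIDDLE.
    env = dict(os.environ)
    env['PYTHONPATH'] = env.get('PYTHONPATH', '') + ':../../'
    subprocess.run([
        'python', '-m', 'FIDDLE.run',
        '--data_fname=./input/data.csv',
        '--population_fname=./input/pop.csv',
        '--config_fname=./input/config-1-parallel.yaml',
        '--output_dir=./output-1-parallel/',
        '--T=4', '--dt=1.0',
        '--theta_1=0.001', '--theta_2=0.001', '--theta_freq=1',
        '--stats_functions', 'min', 'max', 'mean',
    ], env=env, check=True)
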
diff --git a/tests/large_test/Run.ipynb b/tests/large_test/Run.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..3033773a2fa628af2ea9d243f358ffcc6b159af6
--- /dev/null
+++ b/tests/large_test/Run.ipynb
@@ -0,0 +1,3664 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!rm -rf output-*/"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Test 1: discretize = True, use_ordinal_encoding = False"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!mkdir -p output-1-parallel"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Input:\n",
+ " Data : ./input/data.csv\n",
+ " Population: ./input/pop.csv\n",
+ " Config : ./input/config-1-parallel.yaml\n",
+ "\n",
+ "Output directory: ./output-1-parallel/\n",
+ "\n",
+ "Input arguments:\n",
+ " T = 4.0\n",
+ " dt = 1.0\n",
+ " θ₁ = 0.001\n",
+ " θ₂ = 0.001\n",
+ " θ_freq = 1.0\n",
+ " k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "discretize = yes\n",
+ " S discretization bins: to be computed from data\n",
+ " X discretization bins: to be computed from data\n",
+ "\n",
+ "N = 200\n",
+ "L = 4\n",
+ "\n",
+ "\n",
+ "================================================================================\n",
+ "1) Pre-filter\n",
+ "================================================================================\n",
+ "Remove rows not in population\n",
+ "Remove rows with t outside of [0, 4]\n",
+ "Remove rare variables (<= 0.001)\n",
+ "Total variables : 1970\n",
+ "Rare variables : 0\n",
+ "Remaining variables : 1970\n",
+ "# rows (original) : 64777\n",
+ "# rows (filtered) : 64777\n",
+ "\n",
+ "================================================================================\n",
+ "2) Transform; 3) Post-filter\n",
+ "================================================================================\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Detecting and parsing value types\n",
+ "--------------------------------------------------------------------------------\n",
+ "Saved as: ./output-1-parallel/value_types.csv\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Separate time-invariant and time-dependent\n",
+ "--------------------------------------------------------------------------------\n",
+ "Variables (time-invariant): 12\n",
+ "Variables (time-dependent): 1958\n",
+ "# rows (time-invariant): 2400\n",
+ "# rows (time-dependent): 62377\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-A) Transform time-invariant data\n",
+ "--------------------------------------------------------------------------------\n",
+ "(N × ^d) table :\t (200, 12)\n",
+ "number of missing entries :\t 4 out of 2400 total\n",
+ "Time elapsed: 0.030966 seconds\n",
+ "\n",
+ "Output\n",
+ "S_all, binary features :\t (200, 84)\n",
+ "Time elapsed: 0.226954 seconds\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "3-A) Post-filter time-invariant data\n",
+ "--------------------------------------------------------------------------------\n",
+ "Original : 84\n",
+ "Nearly-constant: 0\n",
+ "Correlated : 7\n",
+ "Time elapsed: 0.232384 seconds\n",
+ "\n",
+ "Output\n",
+ "S: shape=(200, 77), density=0.145\n",
+ "Total time: 0.235823 seconds\n",
+ "\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-B) Transform time-dependent data\n",
+ "--------------------------------------------------------------------------------\n",
+ "Total variables : 1958\n",
+ "Frequent variables : ['DiaBP', 'HR', 'RR', 'SpO2', 'SysBP']\n",
+ "M₁ = 5\n",
+ "M₂ = 1953\n",
+ "k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "Transforming each example...\n",
+ "Batches of size 100: 2\n",
+ "100%|█████████████████████████████████████████████| 2/2 [00:38<00:00, 19.13s/it]\n",
+ "\u001b[0m\u001b[0m\u001b[0m\u001b[0m\n",
+ "Parallel processing done\n",
+ "DONE: Transforming each example...\n",
+ "(freq) number of missing entries :\t 996 out of 200×4×5=4000 total\n",
+ "(freq) number of imputed entries :\t 58\n",
+ "(freq) number of not imputed entries :\t 938\n",
+ "(non-freq) number of missing entries :\t 1510389 out of 200×4×1953=1562400 total\n",
+ "\n",
+ "(N × L × ^D) table :\t (200, 4, 1983)\n",
+ "Time elapsed: 39.815167 seconds\n",
+ "Discretizing features...\n",
+ "\n",
+ "Processing 1978 non-boolean variable columns...\n",
+ " Computing bin edges for numeric variables...\n",
+ "100%|██████████████████████████████████████| 1978/1978 [00:06<00:00, 328.38it/s]\n",
+ " Discretizing variables to binary features\n",
+ "100%|██████████████████████████████████████| 1978/1978 [00:09<00:00, 201.94it/s]\n",
+ "\u001b[0m\u001b[0m\u001b[0m\u001b[0mFinished discretizing features\n",
+ "\n",
+ "Output\n",
+ "X_all: shape=(200, 4, 3557), density=0.025\n",
+ "Time elapsed: 57.075922 seconds\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "3-B) Post-filter time-dependent data\n",
+ "--------------------------------------------------------------------------------\n",
+ "(200, 4, 3557) 0.02504322462749508\n",
+ "Original : 3557\n",
+ "Nearly-constant: 77\n",
+ "*** time: 8.839366912841797\n",
+ "Correlated : 1137\n",
+ "*** time: 16.099601984024048\n",
+ "\n",
+ "Output\n",
+ "X: shape=(200, 4, 2343), density=0.034\n",
+ "(200, 4, 2343) 0.03446382842509603\n",
+ "Time elapsed: 73.185729 seconds\n",
+ "\n",
+ "Output\n",
+ "X: shape=(200, 4, 2343), density=0.034\n",
+ "Total time: 73.237736 seconds\n",
+ "\n",
+ "\u001b[0m"
+ ]
+ }
+ ],
+ "source": [
+ "! PYTHONPATH=\"$PYTHONPATH:../../\" \\\n",
+ "python -m FIDDLE.run \\\n",
+ " --data_fname='./input/data.csv' \\\n",
+ " --population_fname='./input/pop.csv' \\\n",
+ " --config_fname='./input/config-1-parallel.yaml' \\\n",
+ " --output_dir='./output-1-parallel/' \\\n",
+ " --T=4 --dt=1.0 \\\n",
+ " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n",
+ " --stats_functions 'min' 'max' 'mean'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+       "[HTML table rendering elided: df_S, 200 rows × 84 columns; see the text/plain repr below]"
+ ],
+ "text/plain": [
+ " ADMISSION_LOCATION_value_CLINIC REFERRAL/PREMATURE \\\n",
+ "ID \n",
+ "200001 1 \n",
+ "200010 1 \n",
+ "200016 0 \n",
+ "200033 1 \n",
+ "200034 0 \n",
+ "... ... \n",
+ "201110 1 \n",
+ "201113 0 \n",
+ "201124 0 \n",
+ "201125 1 \n",
+ "201128 0 \n",
+ "\n",
+ " ADMISSION_LOCATION_value_EMERGENCY ROOM ADMIT \\\n",
+ "ID \n",
+ "200001 0 \n",
+ "200010 0 \n",
+ "200016 0 \n",
+ "200033 0 \n",
+ "200034 0 \n",
+ "... ... \n",
+ "201110 0 \n",
+ "201113 1 \n",
+ "201124 0 \n",
+ "201125 0 \n",
+ "201128 0 \n",
+ "\n",
+ " ADMISSION_LOCATION_value_PHYS REFERRAL/NORMAL DELI \\\n",
+ "ID \n",
+ "200001 0 \n",
+ "200010 0 \n",
+ "200016 1 \n",
+ "200033 0 \n",
+ "200034 1 \n",
+ "... ... \n",
+ "201110 0 \n",
+ "201113 0 \n",
+ "201124 1 \n",
+ "201125 0 \n",
+ "201128 0 \n",
+ "\n",
+ " ADMISSION_LOCATION_value_TRANSFER FROM HOSP/EXTRAM \\\n",
+ "ID \n",
+ "200001 0 \n",
+ "200010 0 \n",
+ "200016 0 \n",
+ "200033 0 \n",
+ "200034 0 \n",
+ "... ... \n",
+ "201110 0 \n",
+ "201113 0 \n",
+ "201124 0 \n",
+ "201125 0 \n",
+ "201128 1 \n",
+ "\n",
+ " ADMISSION_LOCATION_value_TRANSFER FROM OTHER HEALT \\\n",
+ "ID \n",
+ "200001 0 \n",
+ "200010 0 \n",
+ "200016 0 \n",
+ "200033 0 \n",
+ "200034 0 \n",
+ "... ... \n",
+ "201110 0 \n",
+ "201113 0 \n",
+ "201124 0 \n",
+ "201125 0 \n",
+ "201128 0 \n",
+ "\n",
+ " ADMISSION_LOCATION_value_TRANSFER FROM SKILLED NUR \\\n",
+ "ID \n",
+ "200001 0 \n",
+ "200010 0 \n",
+ "200016 0 \n",
+ "200033 0 \n",
+ "200034 0 \n",
+ "... ... \n",
+ "201110 0 \n",
+ "201113 0 \n",
+ "201124 0 \n",
+ "201125 0 \n",
+ "201128 0 \n",
+ "\n",
+ " ADMISSION_TYPE_value_ELECTIVE ADMISSION_TYPE_value_EMERGENCY \\\n",
+ "ID \n",
+ "200001 0 1 \n",
+ "200010 0 1 \n",
+ "200016 1 0 \n",
+ "200033 0 1 \n",
+ "200034 1 0 \n",
+ "... ... ... \n",
+ "201110 0 1 \n",
+ "201113 0 1 \n",
+ "201124 1 0 \n",
+ "201125 0 1 \n",
+ "201128 0 1 \n",
+ "\n",
+ " ADMISSION_TYPE_value_URGENT AGE_value_(19.737, 50.02] ... \\\n",
+ "ID ... \n",
+ "200001 0 0 ... \n",
+ "200010 0 1 ... \n",
+ "200016 0 0 ... \n",
+ "200033 0 0 ... \n",
+ "200034 0 0 ... \n",
+ "... ... ... ... \n",
+ "201110 0 0 ... \n",
+ "201113 0 0 ... \n",
+ "201124 0 0 ... \n",
+ "201125 0 0 ... \n",
+ "201128 0 0 ... \n",
+ "\n",
+ " RELIGION_value_EPISCOPALIAN RELIGION_value_GREEK ORTHODOX \\\n",
+ "ID \n",
+ "200001 0 0 \n",
+ "200010 0 0 \n",
+ "200016 0 0 \n",
+ "200033 0 0 \n",
+ "200034 0 0 \n",
+ "... ... ... \n",
+ "201110 0 0 \n",
+ "201113 0 0 \n",
+ "201124 0 0 \n",
+ "201125 0 0 \n",
+ "201128 0 0 \n",
+ "\n",
+ " RELIGION_value_JEHOVAH'S WITNESS RELIGION_value_JEWISH \\\n",
+ "ID \n",
+ "200001 0 0 \n",
+ "200010 0 0 \n",
+ "200016 0 0 \n",
+ "200033 0 0 \n",
+ "200034 0 0 \n",
+ "... ... ... \n",
+ "201110 0 0 \n",
+ "201113 0 0 \n",
+ "201124 0 0 \n",
+ "201125 0 1 \n",
+ "201128 0 0 \n",
+ "\n",
+ " RELIGION_value_MUSLIM RELIGION_value_NOT SPECIFIED \\\n",
+ "ID \n",
+ "200001 1 0 \n",
+ "200010 0 1 \n",
+ "200016 0 0 \n",
+ "200033 0 0 \n",
+ "200034 0 0 \n",
+ "... ... ... \n",
+ "201110 0 1 \n",
+ "201113 0 0 \n",
+ "201124 0 1 \n",
+ "201125 0 0 \n",
+ "201128 0 0 \n",
+ "\n",
+ " RELIGION_value_OTHER RELIGION_value_PROTESTANT QUAKER \\\n",
+ "ID \n",
+ "200001 0 0 \n",
+ "200010 0 0 \n",
+ "200016 0 0 \n",
+ "200033 0 0 \n",
+ "200034 0 0 \n",
+ "... ... ... \n",
+ "201110 0 0 \n",
+ "201113 0 0 \n",
+ "201124 0 0 \n",
+ "201125 0 0 \n",
+ "201128 0 0 \n",
+ "\n",
+ " RELIGION_value_UNITARIAN-UNIVERSALIST RELIGION_value_UNOBTAINABLE \n",
+ "ID \n",
+ "200001 0 0 \n",
+ "200010 0 0 \n",
+ "200016 0 0 \n",
+ "200033 0 1 \n",
+ "200034 0 1 \n",
+ "... ... ... \n",
+ "201110 0 0 \n",
+ "201113 0 1 \n",
+ "201124 0 0 \n",
+ "201125 0 0 \n",
+ "201128 0 0 \n",
+ "\n",
+ "[200 rows x 84 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+       "[HTML table rendering elided: df_X, 800 rows × 3557 columns; see the text/plain repr below]"
+ ],
+ "text/plain": [
+ " DiaBP_mask HR_mask RR_mask SpO2_mask SysBP_mask \\\n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 1 1 1 1 1 \n",
+ " [1.0, 2.0) 1 1 1 1 1 \n",
+ " [2.0, 3.0) 1 1 1 1 1 \n",
+ " [3.0, 4.0) 1 1 1 1 1 \n",
+ "200010 [0.0, 1.0) 0 0 0 0 0 \n",
+ "... ... ... ... ... ... \n",
+ "201125 [3.0, 4.0) 1 1 1 1 1 \n",
+ "201128 [0.0, 1.0) 0 0 0 0 0 \n",
+ " [1.0, 2.0) 1 1 1 0 1 \n",
+ " [2.0, 3.0) 1 1 1 0 1 \n",
+ " [3.0, 4.0) 1 1 1 0 1 \n",
+ "\n",
+ " 220046_value_(9.999, 120.0] 220046_value_(120.0, 200.0] \\\n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 1 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "200010 [0.0, 1.0) 1 0 \n",
+ "... ... ... \n",
+ "201125 [3.0, 4.0) 0 1 \n",
+ "201128 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 1 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "\n",
+ " 220047_value_(28.999, 50.0] 220047_value_(50.0, 55.0] \\\n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "200010 [0.0, 1.0) 1 0 \n",
+ "... ... ... \n",
+ "201125 [3.0, 4.0) 0 0 \n",
+ "201128 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 1 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "\n",
+ " 220047_value_(55.0, 60.0] ... SysBP_max_(65.999, 105.0] \\\n",
+ "ID t_range ... \n",
+ "200001 [0.0, 1.0) 1 ... 0 \n",
+ " [1.0, 2.0) 0 ... 0 \n",
+ " [2.0, 3.0) 0 ... 0 \n",
+ " [3.0, 4.0) 0 ... 1 \n",
+ "200010 [0.0, 1.0) 0 ... 0 \n",
+ "... ... ... ... \n",
+ "201125 [3.0, 4.0) 1 ... 0 \n",
+ "201128 [0.0, 1.0) 0 ... 0 \n",
+ " [1.0, 2.0) 0 ... 0 \n",
+ " [2.0, 3.0) 0 ... 0 \n",
+ " [3.0, 4.0) 0 ... 0 \n",
+ "\n",
+ " SysBP_max_(105.0, 116.0] SysBP_max_(116.0, 127.0] \\\n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 1 0 \n",
+ " [1.0, 2.0) 1 0 \n",
+ " [2.0, 3.0) 1 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "200010 [0.0, 1.0) 0 0 \n",
+ "... ... ... \n",
+ "201125 [3.0, 4.0) 0 1 \n",
+ "201128 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 0 1 \n",
+ " [3.0, 4.0) 0 1 \n",
+ "\n",
+ " SysBP_max_(127.0, 141.0] SysBP_max_(141.0, 214.0] \\\n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "200010 [0.0, 1.0) 0 0 \n",
+ "... ... ... \n",
+ "201125 [3.0, 4.0) 0 0 \n",
+ "201128 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 1 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "\n",
+ " SysBP_mean_(65.999, 103.4] SysBP_mean_(103.4, 114.433] \\\n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 0 1 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 1 0 \n",
+ "200010 [0.0, 1.0) 0 0 \n",
+ "... ... ... \n",
+ "201125 [3.0, 4.0) 0 0 \n",
+ "201128 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "\n",
+ " SysBP_mean_(114.433, 125.0] SysBP_mean_(125.0, 138.0] \\\n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 1 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "200010 [0.0, 1.0) 0 0 \n",
+ "... ... ... \n",
+ "201125 [3.0, 4.0) 1 0 \n",
+ "201128 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 1 0 \n",
+ " [3.0, 4.0) 1 0 \n",
+ "\n",
+ " SysBP_mean_(138.0, 206.0] \n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 0 \n",
+ " [1.0, 2.0) 0 \n",
+ " [2.0, 3.0) 0 \n",
+ " [3.0, 4.0) 0 \n",
+ "200010 [0.0, 1.0) 0 \n",
+ "... ... \n",
+ "201125 [3.0, 4.0) 0 \n",
+ "201128 [0.0, 1.0) 0 \n",
+ " [1.0, 2.0) 0 \n",
+ " [2.0, 3.0) 0 \n",
+ " [3.0, 4.0) 0 \n",
+ "\n",
+ "[800 rows x 3557 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import json\n",
+ "import sparse\n",
+ "\n",
+ "S = sparse.load_npz('output-1-parallel/S_all.npz')\n",
+ "S_names = json.load(open('output-1-parallel/S_all.feature_names.json', 'r'))\n",
+ "S_index = pd.read_csv('output-1-parallel/S.ID.csv').set_index(['ID'])\n",
+ "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n",
+ "\n",
+ "X = sparse.load_npz('output-1-parallel/X_all.npz')\n",
+ "X_names = json.load(open('output-1-parallel/X_all.feature_names.json', 'r'))\n",
+ "X_index = pd.read_csv('output-1-parallel/X.ID,t_range.csv').set_index(['ID', 't_range'])\n",
+ "df_X = pd.DataFrame(X.todense().reshape(-1, X.shape[-1]), columns=X_names, index=X_index.index)\n",
+ "\n",
+ "display(df_S)\n",
+ "display(df_X)"
+ ]
+ },
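+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hypothetical sanity check (a sketch, not part of the original run): the\n",
+    "# reshape X.todense().reshape(-1, X.shape[-1]) above flattens the (N, L, D)\n",
+    "# tensor in C order, so row i*L + j corresponds to patient i, time bin j --\n",
+    "# the same row order as the (ID, t_range) index read from 'X.ID,t_range.csv'.\n",
+    "N, L, D = X.shape                # (200, 4, 3557) for this run\n",
+    "assert df_X.shape == (N * L, D)  # 800 rows x 3557 columns, as displayed"
+   ]
+  },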
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Test 2: discretize = True, use_ordinal_encoding = True"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!mkdir -p output-2-parallel"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Input:\n",
+ " Data : ./input/data.csv\n",
+ " Population: ./input/pop.csv\n",
+ " Config : ./input/config-2-parallel.yaml\n",
+ "\n",
+ "Output directory: ./output-2-parallel/\n",
+ "\n",
+ "Input arguments:\n",
+ " T = 4.0\n",
+ " dt = 1.0\n",
+ " θ₁ = 0.001\n",
+ " θ₂ = 0.001\n",
+ " θ_freq = 1.0\n",
+ " k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "discretize = yes\n",
+ " S discretization bins: to be computed from data\n",
+ " X discretization bins: to be computed from data\n",
+ "\n",
+ "N = 200\n",
+ "L = 4\n",
+ "\n",
+ "\n",
+ "================================================================================\n",
+ "1) Pre-filter\n",
+ "================================================================================\n",
+ "Remove rows not in population\n",
+ "Remove rows with t outside of [0, 4]\n",
+ "Remove rare variables (<= 0.001)\n",
+ "Total variables : 1970\n",
+ "Rare variables : 0\n",
+ "Remaining variables : 1970\n",
+ "# rows (original) : 64777\n",
+ "# rows (filtered) : 64777\n",
+ "\n",
+ "================================================================================\n",
+ "2) Transform; 3) Post-filter\n",
+ "================================================================================\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Detecting and parsing value types\n",
+ "--------------------------------------------------------------------------------\n",
+ "Saved as: ./output-2-parallel/value_types.csv\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Separate time-invariant and time-dependent\n",
+ "--------------------------------------------------------------------------------\n",
+ "Variables (time-invariant): 12\n",
+ "Variables (time-dependent): 1958\n",
+ "# rows (time-invariant): 2400\n",
+ "# rows (time-dependent): 62377\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-A) Transform time-invariant data\n",
+ "--------------------------------------------------------------------------------\n",
+ "(N × ^d) table :\t (200, 12)\n",
+ "number of missing entries :\t 4 out of 2400 total\n",
+ "Time elapsed: 0.018090 seconds\n",
+ "\n",
+ "Output\n",
+ "S_all, binary features :\t (200, 84)\n",
+ "Time elapsed: 0.180124 seconds\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "3-A) Post-filter time-invariant data\n",
+ "--------------------------------------------------------------------------------\n",
+ "Original : 84\n",
+ "Nearly-constant: 2\n",
+ "Correlated : 7\n",
+ "Time elapsed: 0.184865 seconds\n",
+ "\n",
+ "Output\n",
+ "S: shape=(200, 75), density=0.176\n",
+ "Total time: 0.188878 seconds\n",
+ "\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-B) Transform time-dependent data\n",
+ "--------------------------------------------------------------------------------\n",
+ "Total variables : 1958\n",
+ "Frequent variables : ['DiaBP', 'HR', 'RR', 'SpO2', 'SysBP']\n",
+ "M₁ = 5\n",
+ "M₂ = 1953\n",
+ "k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "Transforming each example...\n",
+ "Batches of size 100: 2\n",
+ "100%|█████████████████████████████████████████████| 2/2 [00:35<00:00, 17.88s/it]\n",
+ "\u001b[0m\u001b[0m\u001b[0m\u001b[0m\n",
+ "Parallel processing done\n",
+ "DONE: Transforming each example...\n",
+ "(freq) number of missing entries :\t 996 out of 200×4×5=4000 total\n",
+ "(freq) number of imputed entries :\t 58\n",
+ "(freq) number of not imputed entries :\t 938\n",
+ "(non-freq) number of missing entries :\t 1510389 out of 200×4×1953=1562400 total\n",
+ "\n",
+ "(N × L × ^D) table :\t (200, 4, 1983)\n",
+ "Time elapsed: 37.294821 seconds\n",
+ "Discretizing features...\n",
+ "\n",
+ "Processing 1978 non-boolean variable columns...\n",
+ " Computing bin edges for numeric variables...\n",
+ "100%|██████████████████████████████████████| 1978/1978 [00:05<00:00, 377.85it/s]\n",
+ " Discretizing variables to binary features\n",
+ "100%|██████████████████████████████████████| 1978/1978 [00:14<00:00, 139.24it/s]\n",
+ "\u001b[0m\u001b[0m\u001b[0m\u001b[0mFinished discretizing features\n",
+ "\n",
+ "Output\n",
+ "X_all: shape=(200, 4, 3587), density=0.039\n",
+ "Time elapsed: 58.029910 seconds\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "3-B) Post-filter time-dependent data\n",
+ "--------------------------------------------------------------------------------\n",
+ "(200, 4, 3587) 0.03878101477557848\n",
+ "Original : 3587\n",
+ "Nearly-constant: 3\n",
+ "*** time: 7.768070220947266\n",
+ "Correlated : 1194\n",
+ "*** time: 14.502072095870972\n",
+ "\n",
+ "Output\n",
+ "X: shape=(200, 4, 2390), density=0.048\n",
+ "(200, 4, 2390) 0.04819874476987448\n",
+ "Time elapsed: 72.538985 seconds\n",
+ "\n",
+ "Output\n",
+ "X: shape=(200, 4, 2390), density=0.048\n",
+ "Total time: 72.603644 seconds\n",
+ "\n",
+ "\u001b[0m"
+ ]
+ }
+ ],
+ "source": [
+ "! PYTHONPATH=\"$PYTHONPATH:../../\" \\\n",
+ "python -m FIDDLE.run \\\n",
+ " --data_fname='./input/data.csv' \\\n",
+ " --population_fname='./input/pop.csv' \\\n",
+ " --config_fname='./input/config-2-parallel.yaml' \\\n",
+ " --output_dir='./output-2-parallel/' \\\n",
+ " --T=4 --dt=1.0 \\\n",
+ " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n",
+ " --stats_functions 'min' 'max' 'mean'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+       "[HTML table rendering elided: df_S, 200 rows × 84 columns; see the text/plain repr below]"
+ ],
+ "text/plain": [
+ " ADMISSION_LOCATION_value_CLINIC REFERRAL/PREMATURE \\\n",
+ "ID \n",
+ "200001 1 \n",
+ "200010 1 \n",
+ "200016 0 \n",
+ "200033 1 \n",
+ "200034 0 \n",
+ "... ... \n",
+ "201110 1 \n",
+ "201113 0 \n",
+ "201124 0 \n",
+ "201125 1 \n",
+ "201128 0 \n",
+ "\n",
+ " ADMISSION_LOCATION_value_EMERGENCY ROOM ADMIT \\\n",
+ "ID \n",
+ "200001 0 \n",
+ "200010 0 \n",
+ "200016 0 \n",
+ "200033 0 \n",
+ "200034 0 \n",
+ "... ... \n",
+ "201110 0 \n",
+ "201113 1 \n",
+ "201124 0 \n",
+ "201125 0 \n",
+ "201128 0 \n",
+ "\n",
+ " ADMISSION_LOCATION_value_PHYS REFERRAL/NORMAL DELI \\\n",
+ "ID \n",
+ "200001 0 \n",
+ "200010 0 \n",
+ "200016 1 \n",
+ "200033 0 \n",
+ "200034 1 \n",
+ "... ... \n",
+ "201110 0 \n",
+ "201113 0 \n",
+ "201124 1 \n",
+ "201125 0 \n",
+ "201128 0 \n",
+ "\n",
+ " ADMISSION_LOCATION_value_TRANSFER FROM HOSP/EXTRAM \\\n",
+ "ID \n",
+ "200001 0 \n",
+ "200010 0 \n",
+ "200016 0 \n",
+ "200033 0 \n",
+ "200034 0 \n",
+ "... ... \n",
+ "201110 0 \n",
+ "201113 0 \n",
+ "201124 0 \n",
+ "201125 0 \n",
+ "201128 1 \n",
+ "\n",
+ " ADMISSION_LOCATION_value_TRANSFER FROM OTHER HEALT \\\n",
+ "ID \n",
+ "200001 0 \n",
+ "200010 0 \n",
+ "200016 0 \n",
+ "200033 0 \n",
+ "200034 0 \n",
+ "... ... \n",
+ "201110 0 \n",
+ "201113 0 \n",
+ "201124 0 \n",
+ "201125 0 \n",
+ "201128 0 \n",
+ "\n",
+ " ADMISSION_LOCATION_value_TRANSFER FROM SKILLED NUR \\\n",
+ "ID \n",
+ "200001 0 \n",
+ "200010 0 \n",
+ "200016 0 \n",
+ "200033 0 \n",
+ "200034 0 \n",
+ "... ... \n",
+ "201110 0 \n",
+ "201113 0 \n",
+ "201124 0 \n",
+ "201125 0 \n",
+ "201128 0 \n",
+ "\n",
+ " ADMISSION_TYPE_value_ELECTIVE ADMISSION_TYPE_value_EMERGENCY \\\n",
+ "ID \n",
+ "200001 0 1 \n",
+ "200010 0 1 \n",
+ "200016 1 0 \n",
+ "200033 0 1 \n",
+ "200034 1 0 \n",
+ "... ... ... \n",
+ "201110 0 1 \n",
+ "201113 0 1 \n",
+ "201124 1 0 \n",
+ "201125 0 1 \n",
+ "201128 0 1 \n",
+ "\n",
+ " ADMISSION_TYPE_value_URGENT AGE_value>=19.737885622780315 ... \\\n",
+ "ID ... \n",
+ "200001 0 1 ... \n",
+ "200010 0 1 ... \n",
+ "200016 0 1 ... \n",
+ "200033 0 1 ... \n",
+ "200034 0 1 ... \n",
+ "... ... ... ... \n",
+ "201110 0 1 ... \n",
+ "201113 0 1 ... \n",
+ "201124 0 1 ... \n",
+ "201125 0 1 ... \n",
+ "201128 0 1 ... \n",
+ "\n",
+ " RELIGION_value_EPISCOPALIAN RELIGION_value_GREEK ORTHODOX \\\n",
+ "ID \n",
+ "200001 0 0 \n",
+ "200010 0 0 \n",
+ "200016 0 0 \n",
+ "200033 0 0 \n",
+ "200034 0 0 \n",
+ "... ... ... \n",
+ "201110 0 0 \n",
+ "201113 0 0 \n",
+ "201124 0 0 \n",
+ "201125 0 0 \n",
+ "201128 0 0 \n",
+ "\n",
+ " RELIGION_value_JEHOVAH'S WITNESS RELIGION_value_JEWISH \\\n",
+ "ID \n",
+ "200001 0 0 \n",
+ "200010 0 0 \n",
+ "200016 0 0 \n",
+ "200033 0 0 \n",
+ "200034 0 0 \n",
+ "... ... ... \n",
+ "201110 0 0 \n",
+ "201113 0 0 \n",
+ "201124 0 0 \n",
+ "201125 0 1 \n",
+ "201128 0 0 \n",
+ "\n",
+ " RELIGION_value_MUSLIM RELIGION_value_NOT SPECIFIED \\\n",
+ "ID \n",
+ "200001 1 0 \n",
+ "200010 0 1 \n",
+ "200016 0 0 \n",
+ "200033 0 0 \n",
+ "200034 0 0 \n",
+ "... ... ... \n",
+ "201110 0 1 \n",
+ "201113 0 0 \n",
+ "201124 0 1 \n",
+ "201125 0 0 \n",
+ "201128 0 0 \n",
+ "\n",
+ " RELIGION_value_OTHER RELIGION_value_PROTESTANT QUAKER \\\n",
+ "ID \n",
+ "200001 0 0 \n",
+ "200010 0 0 \n",
+ "200016 0 0 \n",
+ "200033 0 0 \n",
+ "200034 0 0 \n",
+ "... ... ... \n",
+ "201110 0 0 \n",
+ "201113 0 0 \n",
+ "201124 0 0 \n",
+ "201125 0 0 \n",
+ "201128 0 0 \n",
+ "\n",
+ " RELIGION_value_UNITARIAN-UNIVERSALIST RELIGION_value_UNOBTAINABLE \n",
+ "ID \n",
+ "200001 0 0 \n",
+ "200010 0 0 \n",
+ "200016 0 0 \n",
+ "200033 0 1 \n",
+ "200034 0 1 \n",
+ "... ... ... \n",
+ "201110 0 0 \n",
+ "201113 0 1 \n",
+ "201124 0 0 \n",
+ "201125 0 0 \n",
+ "201128 0 0 \n",
+ "\n",
+ "[200 rows x 84 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+       "[HTML table rendering elided: df_X, 800 rows × 3587 columns; see the text/plain repr below]"
+ ],
+ "text/plain": [
+ " DiaBP_mask HR_mask RR_mask SpO2_mask SysBP_mask \\\n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 1 1 1 1 1 \n",
+ " [1.0, 2.0) 1 1 1 1 1 \n",
+ " [2.0, 3.0) 1 1 1 1 1 \n",
+ " [3.0, 4.0) 1 1 1 1 1 \n",
+ "200010 [0.0, 1.0) 0 0 0 0 0 \n",
+ "... ... ... ... ... ... \n",
+ "201125 [3.0, 4.0) 1 1 1 1 1 \n",
+ "201128 [0.0, 1.0) 0 0 0 0 0 \n",
+ " [1.0, 2.0) 1 1 1 0 1 \n",
+ " [2.0, 3.0) 1 1 1 0 1 \n",
+ " [3.0, 4.0) 1 1 1 0 1 \n",
+ "\n",
+ " 220048_value_1 \\\n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 1 \n",
+ " [1.0, 2.0) 1 \n",
+ " [2.0, 3.0) 1 \n",
+ " [3.0, 4.0) 1 \n",
+ "200010 [0.0, 1.0) 0 \n",
+ "... ... \n",
+ "201125 [3.0, 4.0) 1 \n",
+ "201128 [0.0, 1.0) 0 \n",
+ " [1.0, 2.0) 1 \n",
+ " [2.0, 3.0) 1 \n",
+ " [3.0, 4.0) 1 \n",
+ "\n",
+ " 220048: 1st AV (First degree AV Block) _value_1 \\\n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 0 \n",
+ " [1.0, 2.0) 0 \n",
+ " [2.0, 3.0) 0 \n",
+ " [3.0, 4.0) 0 \n",
+ "200010 [0.0, 1.0) 0 \n",
+ "... ... \n",
+ "201125 [3.0, 4.0) 0 \n",
+ "201128 [0.0, 1.0) 0 \n",
+ " [1.0, 2.0) 0 \n",
+ " [2.0, 3.0) 0 \n",
+ " [3.0, 4.0) 0 \n",
+ "\n",
+ " 220048: 3rd AV (Complete Heart Block) _value_1 \\\n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 0 \n",
+ " [1.0, 2.0) 0 \n",
+ " [2.0, 3.0) 0 \n",
+ " [3.0, 4.0) 0 \n",
+ "200010 [0.0, 1.0) 0 \n",
+ "... ... \n",
+ "201125 [3.0, 4.0) 0 \n",
+ "201128 [0.0, 1.0) 0 \n",
+ " [1.0, 2.0) 0 \n",
+ " [2.0, 3.0) 0 \n",
+ " [3.0, 4.0) 0 \n",
+ "\n",
+ " 220048: A Flut (Atrial Flutter) _value_1 \\\n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 0 \n",
+ " [1.0, 2.0) 0 \n",
+ " [2.0, 3.0) 0 \n",
+ " [3.0, 4.0) 0 \n",
+ "200010 [0.0, 1.0) 0 \n",
+ "... ... \n",
+ "201125 [3.0, 4.0) 0 \n",
+ "201128 [0.0, 1.0) 0 \n",
+ " [1.0, 2.0) 0 \n",
+ " [2.0, 3.0) 0 \n",
+ " [3.0, 4.0) 0 \n",
+ "\n",
+ " 220048: AF (Atrial Fibrillation)_value_1 ... \\\n",
+ "ID t_range ... \n",
+ "200001 [0.0, 1.0) 0 ... \n",
+ " [1.0, 2.0) 0 ... \n",
+ " [2.0, 3.0) 0 ... \n",
+ " [3.0, 4.0) 0 ... \n",
+ "200010 [0.0, 1.0) 0 ... \n",
+ "... ... ... \n",
+ "201125 [3.0, 4.0) 0 ... \n",
+ "201128 [0.0, 1.0) 0 ... \n",
+ " [1.0, 2.0) 0 ... \n",
+ " [2.0, 3.0) 0 ... \n",
+ " [3.0, 4.0) 0 ... \n",
+ "\n",
+ " SysBP_mean>=66.0 SysBP_mean>=103.4 \\\n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 1 1 \n",
+ " [1.0, 2.0) 1 1 \n",
+ " [2.0, 3.0) 1 1 \n",
+ " [3.0, 4.0) 1 0 \n",
+ "200010 [0.0, 1.0) 0 0 \n",
+ "... ... ... \n",
+ "201125 [3.0, 4.0) 1 1 \n",
+ "201128 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 1 1 \n",
+ " [2.0, 3.0) 1 1 \n",
+ " [3.0, 4.0) 1 1 \n",
+ "\n",
+ " SysBP_mean>=114.43333333333334 SysBP_mean>=125.0 \\\n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 1 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "200010 [0.0, 1.0) 0 0 \n",
+ "... ... ... \n",
+ "201125 [3.0, 4.0) 1 0 \n",
+ "201128 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 1 1 \n",
+ " [2.0, 3.0) 1 0 \n",
+ " [3.0, 4.0) 1 0 \n",
+ "\n",
+ " SysBP_mean>=138.0 SysBP_max>=66.0 SysBP_max>=105.0 \\\n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 0 1 1 \n",
+ " [1.0, 2.0) 0 1 1 \n",
+ " [2.0, 3.0) 0 1 1 \n",
+ " [3.0, 4.0) 0 1 0 \n",
+ "200010 [0.0, 1.0) 0 0 0 \n",
+ "... ... ... ... \n",
+ "201125 [3.0, 4.0) 0 1 1 \n",
+ "201128 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 1 1 \n",
+ " [2.0, 3.0) 0 1 1 \n",
+ " [3.0, 4.0) 0 1 1 \n",
+ "\n",
+ " SysBP_max>=116.0 SysBP_max>=127.0 SysBP_max>=141.0 \n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 1 0 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "200010 [0.0, 1.0) 0 0 0 \n",
+ "... ... ... ... \n",
+ "201125 [3.0, 4.0) 1 0 0 \n",
+ "201128 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 1 1 0 \n",
+ " [2.0, 3.0) 1 0 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "\n",
+ "[800 rows x 3587 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import json\n",
+ "import sparse\n",
+ "\n",
+ "S = sparse.load_npz('output-2-parallel/S_all.npz')\n",
+ "S_names = json.load(open('output-2-parallel/S_all.feature_names.json', 'r'))\n",
+ "S_index = pd.read_csv('output-2-parallel/S.ID.csv').set_index(['ID'])\n",
+ "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n",
+ "\n",
+ "X = sparse.load_npz('output-2-parallel/X_all.npz')\n",
+ "X_names = json.load(open('output-2-parallel/X_all.feature_names.json', 'r'))\n",
+ "X_index = pd.read_csv('output-2-parallel/X.ID,t_range.csv').set_index(['ID', 't_range'])\n",
+ "df_X = pd.DataFrame(X.todense().reshape(-1, X.shape[-1]), columns=X_names, index=X_index.index)\n",
+ "\n",
+ "display(df_S)\n",
+ "display(df_X)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Test 3: discretize = False"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!mkdir -p output-3-parallel"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Input:\n",
+ " Data : ./input/data.csv\n",
+ " Population: ./input/pop.csv\n",
+ " Config : ./input/config-3-parallel.yaml\n",
+ "\n",
+ "Output directory: ./output-3-parallel/\n",
+ "\n",
+ "Input arguments:\n",
+ " T = 4.0\n",
+ " dt = 1.0\n",
+ " θ₁ = 0.001\n",
+ " θ₂ = 0.001\n",
+ " θ_freq = 1.0\n",
+ " k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "discretize = no\n",
+ "\n",
+ "N = 200\n",
+ "L = 4\n",
+ "\n",
+ "\n",
+ "================================================================================\n",
+ "1) Pre-filter\n",
+ "================================================================================\n",
+ "Remove rows not in population\n",
+ "Remove rows with t outside of [0, 4]\n",
+ "Remove rare variables (<= 0.001)\n",
+ "Total variables : 1970\n",
+ "Rare variables : 0\n",
+ "Remaining variables : 1970\n",
+ "# rows (original) : 64777\n",
+ "# rows (filtered) : 64777\n",
+ "\n",
+ "================================================================================\n",
+ "2) Transform; 3) Post-filter\n",
+ "================================================================================\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Detecting and parsing value types\n",
+ "--------------------------------------------------------------------------------\n",
+ "Saved as: ./output-3-parallel/value_types.csv\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Separate time-invariant and time-dependent\n",
+ "--------------------------------------------------------------------------------\n",
+ "Variables (time-invariant): 12\n",
+ "Variables (time-dependent): 1958\n",
+ "# rows (time-invariant): 2400\n",
+ "# rows (time-dependent): 62377\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-A) Transform time-invariant data\n",
+ "--------------------------------------------------------------------------------\n",
+ "(N × ^d) table :\t (200, 12)\n",
+ "number of missing entries :\t 4 out of 2400 total\n",
+ "Time elapsed: 0.018502 seconds\n",
+ "\n",
+ "Output\n",
+ "S_all, binary features :\t (200, 76)\n",
+ "Time elapsed: 0.116800 seconds\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "3-A) Post-filter time-invariant data\n",
+ "--------------------------------------------------------------------------------\n",
+ "Original : 76\n",
+ "Nearly-constant: 0\n",
+ "Correlated : 7\n",
+ "Time elapsed: 0.121063 seconds\n",
+ "\n",
+ "Output\n",
+ "S: shape=(200, 69), density=0.162\n",
+ "Total time: 0.125685 seconds\n",
+ "\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-B) Transform time-dependent data\n",
+ "--------------------------------------------------------------------------------\n",
+ "Total variables : 1958\n",
+ "Frequent variables : ['DiaBP', 'HR', 'RR', 'SpO2', 'SysBP']\n",
+ "M₁ = 5\n",
+ "M₂ = 1953\n",
+ "k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "Transforming each example...\n",
+ "Batches of size 100: 2\n",
+ "100%|█████████████████████████████████████████████| 2/2 [00:30<00:00, 15.46s/it]\n",
+ "\u001b[0m\u001b[0m\u001b[0m\u001b[0m\n",
+ "Parallel processing done\n",
+ "DONE: Transforming each example...\n",
+ "(freq) number of missing entries :\t 996 out of 200×4×5=4000 total\n",
+ "(freq) number of imputed entries :\t 58\n",
+ "(freq) number of not imputed entries :\t 938\n",
+ "(non-freq) number of missing entries :\t 1510389 out of 200×4×1953=1562400 total\n",
+ "\n",
+ "(N × L × ^D) table :\t (200, 4, 1983)\n",
+ "Time elapsed: 32.386383 seconds\n",
+ "Discretizing features...\n",
+ "\n",
+ "Discretizing categorical features...\n",
+ "100%|██████████████████████████████████████| 1990/1990 [00:10<00:00, 190.17it/s]\n",
+ "\u001b[0m\u001b[0m\u001b[0m\u001b[0mFinished discretizing features\n",
+ "\n",
+ "Output\n",
+ "X_all: shape=(200, 4, 2588), density=0.582\n",
+ "Time elapsed: 46.796057 seconds\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "3-B) Post-filter time-dependent data\n",
+ "--------------------------------------------------------------------------------\n",
+ "(200, 4, 2588) 0.5818387751159196\n",
+ "Original : 2588\n",
+ "Nearly-constant: 1064\n",
+ "*** time: 10.0768461227417\n",
+ "/Users/shengputang/Developer/FIDDLE/FIDDLE/helpers.py:426: RuntimeWarning: invalid value encountered in sqrt\n",
+ " coeffs = C / np.sqrt(np.outer(d, d))\n",
+ "/Users/shengputang/Developer/FIDDLE/FIDDLE/helpers.py:426: RuntimeWarning: divide by zero encountered in true_divide\n",
+ " coeffs = C / np.sqrt(np.outer(d, d))\n",
+ "/Users/shengputang/Developer/FIDDLE/FIDDLE/helpers.py:426: RuntimeWarning: invalid value encountered in true_divide\n",
+ " coeffs = C / np.sqrt(np.outer(d, d))\n",
+ "/Users/shengputang/Developer/FIDDLE/FIDDLE/helpers.py:376: RuntimeWarning: invalid value encountered in multiply\n",
+ " self.corr_matrix *= np.tri(*self.corr_matrix.shape)\n",
+ "Correlated : 310\n",
+ "*** time: 16.394930124282837\n",
+ "\n",
+ "Output\n",
+ "X: shape=(200, 4, 1214), density=0.366\n",
+ "(200, 4, 1214) 0.366085255354201\n",
+ "Time elapsed: 63.195710 seconds\n",
+ "\n",
+ "Output\n",
+ "X: shape=(200, 4, 1214), density=0.366\n",
+ "Total time: 63.452329 seconds\n",
+ "\n",
+ "\u001b[0m"
+ ]
+ }
+ ],
+ "source": [
+ "! PYTHONPATH=\"$PYTHONPATH:../../\" \\\n",
+ "python -m FIDDLE.run \\\n",
+ " --data_fname='./input/data.csv' \\\n",
+ " --population_fname='./input/pop.csv' \\\n",
+ " --config_fname='./input/config-3-parallel.yaml' \\\n",
+ " --output_dir='./output-3-parallel/' \\\n",
+ " --T=4 --dt=1.0 \\\n",
+ " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n",
+ " --stats_functions 'min' 'max' 'mean'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "[HTML table omitted; see the text/plain output below: 200 rows × 76 columns]"
+ ],
+ "text/plain": [
+ " ADMISSION_LOCATION_value:CLINIC REFERRAL/PREMATURE \\\n",
+ "ID \n",
+ "200001 1.0 \n",
+ "200010 1.0 \n",
+ "200016 0.0 \n",
+ "200033 1.0 \n",
+ "200034 0.0 \n",
+ "... ... \n",
+ "201110 1.0 \n",
+ "201113 0.0 \n",
+ "201124 0.0 \n",
+ "201125 1.0 \n",
+ "201128 0.0 \n",
+ "\n",
+ " ADMISSION_LOCATION_value:EMERGENCY ROOM ADMIT \\\n",
+ "ID \n",
+ "200001 0.0 \n",
+ "200010 0.0 \n",
+ "200016 0.0 \n",
+ "200033 0.0 \n",
+ "200034 0.0 \n",
+ "... ... \n",
+ "201110 0.0 \n",
+ "201113 1.0 \n",
+ "201124 0.0 \n",
+ "201125 0.0 \n",
+ "201128 0.0 \n",
+ "\n",
+ " ADMISSION_LOCATION_value:PHYS REFERRAL/NORMAL DELI \\\n",
+ "ID \n",
+ "200001 0.0 \n",
+ "200010 0.0 \n",
+ "200016 1.0 \n",
+ "200033 0.0 \n",
+ "200034 1.0 \n",
+ "... ... \n",
+ "201110 0.0 \n",
+ "201113 0.0 \n",
+ "201124 1.0 \n",
+ "201125 0.0 \n",
+ "201128 0.0 \n",
+ "\n",
+ " ADMISSION_LOCATION_value:TRANSFER FROM HOSP/EXTRAM \\\n",
+ "ID \n",
+ "200001 0.0 \n",
+ "200010 0.0 \n",
+ "200016 0.0 \n",
+ "200033 0.0 \n",
+ "200034 0.0 \n",
+ "... ... \n",
+ "201110 0.0 \n",
+ "201113 0.0 \n",
+ "201124 0.0 \n",
+ "201125 0.0 \n",
+ "201128 1.0 \n",
+ "\n",
+ " ADMISSION_LOCATION_value:TRANSFER FROM OTHER HEALT \\\n",
+ "ID \n",
+ "200001 0.0 \n",
+ "200010 0.0 \n",
+ "200016 0.0 \n",
+ "200033 0.0 \n",
+ "200034 0.0 \n",
+ "... ... \n",
+ "201110 0.0 \n",
+ "201113 0.0 \n",
+ "201124 0.0 \n",
+ "201125 0.0 \n",
+ "201128 0.0 \n",
+ "\n",
+ " ADMISSION_LOCATION_value:TRANSFER FROM SKILLED NUR \\\n",
+ "ID \n",
+ "200001 0.0 \n",
+ "200010 0.0 \n",
+ "200016 0.0 \n",
+ "200033 0.0 \n",
+ "200034 0.0 \n",
+ "... ... \n",
+ "201110 0.0 \n",
+ "201113 0.0 \n",
+ "201124 0.0 \n",
+ "201125 0.0 \n",
+ "201128 0.0 \n",
+ "\n",
+ " ADMISSION_TYPE_value:ELECTIVE ADMISSION_TYPE_value:EMERGENCY \\\n",
+ "ID \n",
+ "200001 0.0 1.0 \n",
+ "200010 0.0 1.0 \n",
+ "200016 1.0 0.0 \n",
+ "200033 0.0 1.0 \n",
+ "200034 1.0 0.0 \n",
+ "... ... ... \n",
+ "201110 0.0 1.0 \n",
+ "201113 0.0 1.0 \n",
+ "201124 1.0 0.0 \n",
+ "201125 0.0 1.0 \n",
+ "201128 0.0 1.0 \n",
+ "\n",
+ " ADMISSION_TYPE_value:URGENT AGE_value ... \\\n",
+ "ID ... \n",
+ "200001 0.0 61.111770 ... \n",
+ "200010 0.0 27.271125 ... \n",
+ "200016 0.0 67.281277 ... \n",
+ "200033 0.0 67.191089 ... \n",
+ "200034 0.0 54.077903 ... \n",
+ "... ... ... ... \n",
+ "201110 0.0 54.746702 ... \n",
+ "201113 0.0 50.456861 ... \n",
+ "201124 0.0 70.488207 ... \n",
+ "201125 0.0 79.555041 ... \n",
+ "201128 0.0 74.814954 ... \n",
+ "\n",
+ " RELIGION_value:EPISCOPALIAN RELIGION_value:GREEK ORTHODOX \\\n",
+ "ID \n",
+ "200001 0.0 0.0 \n",
+ "200010 0.0 0.0 \n",
+ "200016 0.0 0.0 \n",
+ "200033 0.0 0.0 \n",
+ "200034 0.0 0.0 \n",
+ "... ... ... \n",
+ "201110 0.0 0.0 \n",
+ "201113 0.0 0.0 \n",
+ "201124 0.0 0.0 \n",
+ "201125 0.0 0.0 \n",
+ "201128 0.0 0.0 \n",
+ "\n",
+ " RELIGION_value:JEHOVAH'S WITNESS RELIGION_value:JEWISH \\\n",
+ "ID \n",
+ "200001 0.0 0.0 \n",
+ "200010 0.0 0.0 \n",
+ "200016 0.0 0.0 \n",
+ "200033 0.0 0.0 \n",
+ "200034 0.0 0.0 \n",
+ "... ... ... \n",
+ "201110 0.0 0.0 \n",
+ "201113 0.0 0.0 \n",
+ "201124 0.0 0.0 \n",
+ "201125 0.0 1.0 \n",
+ "201128 0.0 0.0 \n",
+ "\n",
+ " RELIGION_value:MUSLIM RELIGION_value:NOT SPECIFIED \\\n",
+ "ID \n",
+ "200001 1.0 0.0 \n",
+ "200010 0.0 1.0 \n",
+ "200016 0.0 0.0 \n",
+ "200033 0.0 0.0 \n",
+ "200034 0.0 0.0 \n",
+ "... ... ... \n",
+ "201110 0.0 1.0 \n",
+ "201113 0.0 0.0 \n",
+ "201124 0.0 1.0 \n",
+ "201125 0.0 0.0 \n",
+ "201128 0.0 0.0 \n",
+ "\n",
+ " RELIGION_value:OTHER RELIGION_value:PROTESTANT QUAKER \\\n",
+ "ID \n",
+ "200001 0.0 0.0 \n",
+ "200010 0.0 0.0 \n",
+ "200016 0.0 0.0 \n",
+ "200033 0.0 0.0 \n",
+ "200034 0.0 0.0 \n",
+ "... ... ... \n",
+ "201110 0.0 0.0 \n",
+ "201113 0.0 0.0 \n",
+ "201124 0.0 0.0 \n",
+ "201125 0.0 0.0 \n",
+ "201128 0.0 0.0 \n",
+ "\n",
+ " RELIGION_value:UNITARIAN-UNIVERSALIST RELIGION_value:UNOBTAINABLE \n",
+ "ID \n",
+ "200001 0.0 0.0 \n",
+ "200010 0.0 0.0 \n",
+ "200016 0.0 0.0 \n",
+ "200033 0.0 1.0 \n",
+ "200034 0.0 1.0 \n",
+ "... ... ... \n",
+ "201110 0.0 0.0 \n",
+ "201113 0.0 1.0 \n",
+ "201124 0.0 0.0 \n",
+ "201125 0.0 0.0 \n",
+ "201128 0.0 0.0 \n",
+ "\n",
+ "[200 rows x 76 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "[HTML table omitted; see the text/plain output below: 800 rows × 2588 columns]"
+ ],
+ "text/plain": [
+ " DiaBP_mask HR_mask RR_mask SpO2_mask SysBP_mask \\\n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 1.0 1.0 1.0 1.0 1.0 \n",
+ " [1.0, 2.0) 1.0 1.0 1.0 1.0 1.0 \n",
+ " [2.0, 3.0) 1.0 1.0 1.0 1.0 1.0 \n",
+ " [3.0, 4.0) 1.0 1.0 1.0 1.0 1.0 \n",
+ "200010 [0.0, 1.0) 0.0 0.0 0.0 0.0 0.0 \n",
+ "... ... ... ... ... ... \n",
+ "201125 [3.0, 4.0) 1.0 1.0 1.0 1.0 1.0 \n",
+ "201128 [0.0, 1.0) 0.0 0.0 0.0 0.0 0.0 \n",
+ " [1.0, 2.0) 1.0 1.0 1.0 0.0 1.0 \n",
+ " [2.0, 3.0) 1.0 1.0 1.0 0.0 1.0 \n",
+ " [3.0, 4.0) 1.0 1.0 1.0 0.0 1.0 \n",
+ "\n",
+ " 220046_value 220047_value 220048_value \\\n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 120.000000 60.000000 1.0 \n",
+ " [1.0, 2.0) 120.940594 53.935961 1.0 \n",
+ " [2.0, 3.0) 120.940594 53.935961 1.0 \n",
+ " [3.0, 4.0) 120.940594 53.935961 1.0 \n",
+ "200010 [0.0, 1.0) 120.000000 50.000000 1.0 \n",
+ "... ... ... ... \n",
+ "201125 [3.0, 4.0) 130.000000 60.000000 1.0 \n",
+ "201128 [0.0, 1.0) 120.940594 53.935961 1.0 \n",
+ " [1.0, 2.0) 120.000000 50.000000 1.0 \n",
+ " [2.0, 3.0) 120.940594 53.935961 1.0 \n",
+ " [3.0, 4.0) 120.940594 53.935961 1.0 \n",
+ "\n",
+ " 220048: 1st AV (First degree AV Block) _value \\\n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 1.0 \n",
+ " [1.0, 2.0) 1.0 \n",
+ " [2.0, 3.0) 1.0 \n",
+ " [3.0, 4.0) 1.0 \n",
+ "200010 [0.0, 1.0) 1.0 \n",
+ "... ... \n",
+ "201125 [3.0, 4.0) 1.0 \n",
+ "201128 [0.0, 1.0) 1.0 \n",
+ " [1.0, 2.0) 1.0 \n",
+ " [2.0, 3.0) 1.0 \n",
+ " [3.0, 4.0) 1.0 \n",
+ "\n",
+ " 220048: 3rd AV (Complete Heart Block) _value ... \\\n",
+ "ID t_range ... \n",
+ "200001 [0.0, 1.0) 1.0 ... \n",
+ " [1.0, 2.0) 1.0 ... \n",
+ " [2.0, 3.0) 1.0 ... \n",
+ " [3.0, 4.0) 1.0 ... \n",
+ "200010 [0.0, 1.0) 1.0 ... \n",
+ "... ... ... \n",
+ "201125 [3.0, 4.0) 1.0 ... \n",
+ "201128 [0.0, 1.0) 1.0 ... \n",
+ " [1.0, 2.0) 1.0 ... \n",
+ " [2.0, 3.0) 1.0 ... \n",
+ " [3.0, 4.0) 1.0 ... \n",
+ "\n",
+ " 51492_value_str:NEG 51492_value_str:TR \\\n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 0.0 0.0 \n",
+ " [1.0, 2.0) 0.0 0.0 \n",
+ " [2.0, 3.0) 0.0 0.0 \n",
+ " [3.0, 4.0) 0.0 0.0 \n",
+ "200010 [0.0, 1.0) 0.0 0.0 \n",
+ "... ... ... \n",
+ "201125 [3.0, 4.0) 0.0 0.0 \n",
+ "201128 [0.0, 1.0) 0.0 0.0 \n",
+ " [1.0, 2.0) 0.0 0.0 \n",
+ " [2.0, 3.0) 0.0 0.0 \n",
+ " [3.0, 4.0) 0.0 0.0 \n",
+ "\n",
+ " 51493_value_str:0-2 51493_value_str:>50 \\\n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 0.0 0.0 \n",
+ " [1.0, 2.0) 0.0 0.0 \n",
+ " [2.0, 3.0) 0.0 0.0 \n",
+ " [3.0, 4.0) 0.0 0.0 \n",
+ "200010 [0.0, 1.0) 0.0 0.0 \n",
+ "... ... ... \n",
+ "201125 [3.0, 4.0) 1.0 0.0 \n",
+ "201128 [0.0, 1.0) 0.0 0.0 \n",
+ " [1.0, 2.0) 0.0 0.0 \n",
+ " [2.0, 3.0) 0.0 0.0 \n",
+ " [3.0, 4.0) 0.0 0.0 \n",
+ "\n",
+ " 51498_value_str:>=1.035 51514_value_str:NEG \\\n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 0.0 0.0 \n",
+ " [1.0, 2.0) 0.0 0.0 \n",
+ " [2.0, 3.0) 0.0 0.0 \n",
+ " [3.0, 4.0) 0.0 0.0 \n",
+ "200010 [0.0, 1.0) 0.0 0.0 \n",
+ "... ... ... \n",
+ "201125 [3.0, 4.0) 0.0 1.0 \n",
+ "201128 [0.0, 1.0) 0.0 0.0 \n",
+ " [1.0, 2.0) 0.0 0.0 \n",
+ " [2.0, 3.0) 0.0 0.0 \n",
+ " [3.0, 4.0) 0.0 0.0 \n",
+ "\n",
+ " 51516_value_str:0-2 51516_value_str:3-5 \\\n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 0.0 0.0 \n",
+ " [1.0, 2.0) 0.0 0.0 \n",
+ " [2.0, 3.0) 0.0 0.0 \n",
+ " [3.0, 4.0) 0.0 0.0 \n",
+ "200010 [0.0, 1.0) 0.0 0.0 \n",
+ "... ... ... \n",
+ "201125 [3.0, 4.0) 0.0 1.0 \n",
+ "201128 [0.0, 1.0) 0.0 0.0 \n",
+ " [1.0, 2.0) 0.0 0.0 \n",
+ " [2.0, 3.0) 0.0 0.0 \n",
+ " [3.0, 4.0) 0.0 0.0 \n",
+ "\n",
+ " 51516_value_str:6-10 51516_value_str:>50 \n",
+ "ID t_range \n",
+ "200001 [0.0, 1.0) 0.0 0.0 \n",
+ " [1.0, 2.0) 0.0 0.0 \n",
+ " [2.0, 3.0) 0.0 0.0 \n",
+ " [3.0, 4.0) 0.0 0.0 \n",
+ "200010 [0.0, 1.0) 0.0 0.0 \n",
+ "... ... ... \n",
+ "201125 [3.0, 4.0) 0.0 0.0 \n",
+ "201128 [0.0, 1.0) 0.0 0.0 \n",
+ " [1.0, 2.0) 0.0 0.0 \n",
+ " [2.0, 3.0) 0.0 0.0 \n",
+ " [3.0, 4.0) 0.0 0.0 \n",
+ "\n",
+ "[800 rows x 2588 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import json\n",
+ "import sparse\n",
+ "\n",
+ "S = sparse.load_npz('output-3-parallel/S_all.npz')\n",
+ "S_names = json.load(open('output-3-parallel/S_all.feature_names.json', 'r'))\n",
+ "S_index = pd.read_csv('output-3-parallel/S.ID.csv').set_index(['ID'])\n",
+ "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n",
+ "\n",
+ "X = sparse.load_npz('output-3-parallel/X_all.npz')\n",
+ "X_names = json.load(open('output-3-parallel/X_all.feature_names.json', 'r'))\n",
+ "X_index = pd.read_csv('output-3-parallel/X.ID,t_range.csv').set_index(['ID', 't_range'])\n",
+ "df_X = pd.DataFrame(X.todense().reshape(-1, X.shape[-1]), columns=X_names, index=X_index.index)\n",
+ "\n",
+ "display(df_S)\n",
+ "display(df_X)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
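
The loading cell above materializes the sparse artifacts as dense DataFrames for inspection. For downstream modeling it is common to flatten the time-dependent tensor and concatenate the time-invariant features; a minimal sketch, assuming the file layout shown in that cell (the flatten-and-stack pattern is an illustrative choice, not anything FIDDLE prescribes):

```python
# Minimal sketch: combine the time-dependent tensor X (N, L, D) with the
# time-invariant matrix S (N, d) into one flat design matrix.
# Assumptions: file names as saved by the run above; flattening strategy
# is illustrative, not FIDDLE's prescribed API.
import numpy as np
import sparse

X = sparse.load_npz('output-3-parallel/X_all.npz')  # (200, 4, 2588) per the log above
S = sparse.load_npz('output-3-parallel/S_all.npz')  # (200, 76) per the log above

N, L, D = X.shape
X_flat = X.todense().reshape(N, L * D)               # one row of L*D features per ID
design = np.hstack([X_flat, S.todense()])            # append time-invariant features
print(design.shape)                                  # (200, 4*2588 + 76)
```
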
diff --git a/tests/large_test/input/config-1-parallel.yaml b/tests/large_test/input/config-1-parallel.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..04de805b0f77f01e6c290e4459f329cbe7b9a10a
--- /dev/null
+++ b/tests/large_test/input/config-1-parallel.yaml
@@ -0,0 +1,5 @@
+discretize: yes
+use_ordinal_encoding: no
+
+parallel: yes
+n_jobs: 4
diff --git a/tests/large_test/input/config-2-parallel.yaml b/tests/large_test/input/config-2-parallel.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..26bff1b866b98520a2d6afac91df41330b681fd4
--- /dev/null
+++ b/tests/large_test/input/config-2-parallel.yaml
@@ -0,0 +1,5 @@
+discretize: yes
+use_ordinal_encoding: yes
+
+parallel: yes
+n_jobs: 4
diff --git a/tests/large_test/input/config-3-parallel.yaml b/tests/large_test/input/config-3-parallel.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2a5b04c2b87d9937f7a94c686bced1793c12fba1
--- /dev/null
+++ b/tests/large_test/input/config-3-parallel.yaml
@@ -0,0 +1,4 @@
+discretize: no
+
+parallel: yes
+n_jobs: 4
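
The three new configs differ only in the encoding flags: config-1 discretizes with one-hot bins, config-2 discretizes with ordinal encoding, and config-3 skips discretization entirely; all three enable parallelism with four workers. A quick way to confirm the parsed settings, assuming PyYAML is available:

```python
# Print the parsed settings of the three new test configs (assumes PyYAML).
# PyYAML follows YAML 1.1, so the yes/no values parse as booleans.
import yaml

for name in ('config-1-parallel', 'config-2-parallel', 'config-3-parallel'):
    with open(f'tests/large_test/input/{name}.yaml') as f:
        print(name, yaml.safe_load(f))
# e.g. config-3-parallel -> {'discretize': False, 'parallel': True, 'n_jobs': 4}
```
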
diff --git a/test/large_test/input_data.csv b/tests/large_test/input/data.csv
similarity index 100%
rename from test/large_test/input_data.csv
rename to tests/large_test/input/data.csv
diff --git a/test/large_test/pop.csv b/tests/large_test/input/pop.csv
similarity index 100%
rename from test/large_test/pop.csv
rename to tests/large_test/input/pop.csv
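
The Run-docker notebook below drives the same pipeline from inside a container, mounting the test directory at /datadir and invoking FIDDLE.run in an image tagged fiddle-v020. A hypothetical Python wrapper mirroring those cells (the helper name is illustrative; the flags are copied from the notebook):

```python
# Hypothetical wrapper mirroring the `docker run` cells below; assumes an
# image tagged `fiddle-v020` built from the Dockerfile added in this change.
import os
import subprocess

def run_small_test(test_id: int) -> None:
    subprocess.run([
        'docker', 'run', '--rm',
        '--mount', f'type=bind,src={os.getcwd()},target=/datadir',
        'fiddle-v020',
        'python', '-m', 'FIDDLE.run',
        '--data_fname=/datadir/input/data.csv',
        '--population_fname=/datadir/input/pop.csv',
        f'--config_fname=/datadir/input/config-{test_id}.yaml',
        f'--output_dir=/datadir/output-{test_id}/',
        '--T=4', '--dt=1.0',
        '--theta_1=0.001', '--theta_2=0.001', '--theta_freq=1',
        '--stats_functions', 'min', 'max', 'mean',
        '--no_prefilter', '--no_postfilter',
    ], check=True)

run_small_test(1)  # equivalent to the notebook's first docker cell
```
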
diff --git a/tests/small_test/Run-docker.ipynb b/tests/small_test/Run-docker.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..a18976b9795b6d3485faed12fd96ba7d795120d9
--- /dev/null
+++ b/tests/small_test/Run-docker.ipynb
@@ -0,0 +1,2473 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!rm -rf output-*/"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Test 1: discretize = False"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!mkdir -p output-1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Input:\n",
+ " Data : /datadir/input/data.csv\n",
+ " Population: /datadir/input/pop.csv\n",
+ " Config : /datadir/input/config-1.yaml\n",
+ "\n",
+ "Output directory: /datadir/output-1/\n",
+ "\n",
+ "Input arguments:\n",
+ " T = 4.0\n",
+ " dt = 1.0\n",
+ " θ₁ = 0.001\n",
+ " θ₂ = 0.001\n",
+ " θ_freq = 1.0\n",
+ " k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "discretize = no\n",
+ "\n",
+ "N = 4\n",
+ "L = 4\n",
+ "\n",
+ "\n",
+ "================================================================================\n",
+ "2) Transform; 3) Post-filter\n",
+ "================================================================================\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Detecting and parsing value types\n",
+ "--------------------------------------------------------------------------------\n",
+ "Saved as: /datadir/output-1/value_types.csv\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Separate time-invariant and time-dependent\n",
+ "--------------------------------------------------------------------------------\n",
+ "Variables (time-invariant): 3\n",
+ "Variables (time-dependent): 4\n",
+ "# rows (time-invariant): 9\n",
+ "# rows (time-dependent): 23\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-A) Transform time-invariant data\n",
+ "--------------------------------------------------------------------------------\n",
+ "(N × ^d) table :\t (4, 3)\n",
+ "number of missing entries :\t 3 out of 12 total\n",
+ "Time elapsed: 0.044580 seconds\n",
+ "\n",
+ "Output\n",
+ "S_all, binary features :\t (4, 6)\n",
+ "Time elapsed: 0.274032 seconds\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-B) Transform time-dependent data\n",
+ "--------------------------------------------------------------------------------\n",
+ "Total variables : 4\n",
+ "Frequent variables : ['HR']\n",
+ "M₁ = 1\n",
+ "M₂ = 3\n",
+ "k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "Transforming each example...\n",
+ "100%|█████████████████████████████████████████████| 4/4 [00:01<00:00, 3.60it/s]\n",
+ "DONE: Transforming each example...\n",
+ "(freq) number of missing entries :\t 5 out of 4×4×1=16 total\n",
+ "(freq) number of imputed entries :\t 4\n",
+ "(freq) number of not imputed entries :\t 1\n",
+ "(non-freq) number of missing entries :\t 41 out of 4×4×3=48 total\n",
+ "\n",
+ "(N × L × ^D) table :\t (4, 4, 9)\n",
+ "Time elapsed: 1.221479 seconds\n",
+ "Discretizing features...\n",
+ "\n",
+ "Discretizing categorical features...\n",
+ "100%|████████████████████████████████████████████| 9/9 [00:00<00:00, 132.10it/s]\n",
+ "Finished discretizing features\n",
+ "\n",
+ "Output\n",
+ "X_all: shape=(4, 4, 12), density=0.599\n",
+ "Time elapsed: 1.397339 seconds\n"
+ ]
+ }
+ ],
+ "source": [
+ "!docker run -it \\\n",
+ " --mount type='bind',src=\"$(pwd)\",target='/datadir' \\\n",
+ " fiddle-v020 \\\n",
+ "python -m FIDDLE.run \\\n",
+ " --data_fname='/datadir/input/data.csv' \\\n",
+ " --population_fname='/datadir/input/pop.csv' \\\n",
+ " --config_fname='/datadir/input/config-1.yaml' \\\n",
+ " --output_dir='/datadir/output-1/' \\\n",
+ " --T=4 --dt=1.0 \\\n",
+ " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n",
+ " --stats_functions 'min' 'max' 'mean' \\\n",
+ " --no_prefilter --no_postfilter"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "[HTML table omitted; see the text/plain output below: 4 rows × 6 columns]"
+ ],
+ "text/plain": [
+ " AGE_value ROOM_value:_101 ROOM_value:_102 ROOM_value:_103 SEX_value:F \\\n",
+ "ID \n",
+ "1 50.0 1.0 0.0 0.0 0.0 \n",
+ "2 33.0 0.0 1.0 0.0 0.0 \n",
+ "3 40.0 0.0 0.0 1.0 1.0 \n",
+ "4 41.0 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " SEX_value:M \n",
+ "ID \n",
+ "1 1.0 \n",
+ "2 1.0 \n",
+ "3 0.0 \n",
+ "4 0.0 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "[HTML table omitted; see the text/plain output below: 16 rows × 12 columns]"
+ ],
+ "text/plain": [
+ " HR_mask DRUG_A_RATE_value DRUG_A_ROUTE_value:Bolus \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 1.0 48.0 0.0 \n",
+ " [1.0, 2.0) 1.0 48.0 0.0 \n",
+ " [2.0, 3.0) 1.0 48.0 0.0 \n",
+ " [3.0, 4.0) 1.0 48.0 0.0 \n",
+ "2 [0.0, 1.0) 1.0 48.0 0.0 \n",
+ " [1.0, 2.0) 0.0 48.0 0.0 \n",
+ " [2.0, 3.0) 1.0 48.0 0.0 \n",
+ " [3.0, 4.0) 1.0 48.0 0.0 \n",
+ "3 [0.0, 1.0) 0.0 48.0 0.0 \n",
+ " [1.0, 2.0) 1.0 48.0 1.0 \n",
+ " [2.0, 3.0) 0.0 48.0 0.0 \n",
+ " [3.0, 4.0) 0.0 48.0 0.0 \n",
+ "4 [0.0, 1.0) 1.0 48.0 0.0 \n",
+ " [1.0, 2.0) 0.0 48.0 0.0 \n",
+ " [2.0, 3.0) 1.0 48.0 0.0 \n",
+ " [3.0, 4.0) 1.0 48.0 0.0 \n",
+ "\n",
+ " DRUG_A_ROUTE_value:IV DRUG_A_ROUTE_value:Oral LAB_X_value \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0.0 0.0 5.0 \n",
+ " [1.0, 2.0) 0.0 0.0 5.0 \n",
+ " [2.0, 3.0) 0.0 1.0 5.0 \n",
+ " [3.0, 4.0) 0.0 0.0 5.0 \n",
+ "2 [0.0, 1.0) 0.0 0.0 5.0 \n",
+ " [1.0, 2.0) 0.0 0.0 5.0 \n",
+ " [2.0, 3.0) 0.0 0.0 5.0 \n",
+ " [3.0, 4.0) 1.0 0.0 5.0 \n",
+ "3 [0.0, 1.0) 0.0 0.0 5.0 \n",
+ " [1.0, 2.0) 0.0 0.0 5.0 \n",
+ " [2.0, 3.0) 0.0 0.0 5.0 \n",
+ " [3.0, 4.0) 0.0 0.0 5.0 \n",
+ "4 [0.0, 1.0) 0.0 0.0 5.0 \n",
+ " [1.0, 2.0) 0.0 0.0 5.0 \n",
+ " [2.0, 3.0) 0.0 0.0 5.0 \n",
+ " [3.0, 4.0) 0.0 0.0 5.0 \n",
+ "\n",
+ " HR_delta_time HR_value HR_min HR_max HR_mean \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0.0 71.000000 70.0 71.000000 70.500000 \n",
+ " [1.0, 2.0) 0.0 73.000000 72.0 73.000000 72.500000 \n",
+ " [2.0, 3.0) 0.0 74.000000 74.0 74.000000 74.000000 \n",
+ " [3.0, 4.0) 0.0 75.000000 75.0 75.000000 75.000000 \n",
+ "2 [0.0, 1.0) 0.0 61.000000 60.0 61.000000 60.333333 \n",
+ " [1.0, 2.0) 1.0 61.000000 61.0 61.000000 61.000000 \n",
+ " [2.0, 3.0) 0.0 78.000000 73.0 78.000000 75.500000 \n",
+ " [3.0, 4.0) 0.0 75.000000 75.0 75.000000 75.000000 \n",
+ "3 [0.0, 1.0) 0.0 75.533333 75.0 75.533333 75.255556 \n",
+ " [1.0, 2.0) 0.0 90.000000 90.0 90.000000 90.000000 \n",
+ " [2.0, 3.0) 1.0 90.000000 90.0 90.000000 90.000000 \n",
+ " [3.0, 4.0) 2.0 90.000000 90.0 90.000000 90.000000 \n",
+ "4 [0.0, 1.0) 0.0 80.000000 80.0 80.000000 80.000000 \n",
+ " [1.0, 2.0) 1.0 80.000000 80.0 80.000000 80.000000 \n",
+ " [2.0, 3.0) 0.0 62.000000 62.0 62.000000 62.000000 \n",
+ " [3.0, 4.0) 0.0 73.000000 73.0 73.000000 73.000000 \n",
+ "\n",
+ " LAB_X_value_str:<1 \n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0.0 \n",
+ " [1.0, 2.0) 0.0 \n",
+ " [2.0, 3.0) 1.0 \n",
+ " [3.0, 4.0) 0.0 \n",
+ "2 [0.0, 1.0) 0.0 \n",
+ " [1.0, 2.0) 0.0 \n",
+ " [2.0, 3.0) 0.0 \n",
+ " [3.0, 4.0) 0.0 \n",
+ "3 [0.0, 1.0) 0.0 \n",
+ " [1.0, 2.0) 0.0 \n",
+ " [2.0, 3.0) 0.0 \n",
+ " [3.0, 4.0) 0.0 \n",
+ "4 [0.0, 1.0) 0.0 \n",
+ " [1.0, 2.0) 0.0 \n",
+ " [2.0, 3.0) 0.0 \n",
+ " [3.0, 4.0) 0.0 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import json\n",
+ "import sparse\n",
+ "\n",
+ "S = sparse.load_npz('output-1/S_all.npz')\n",
+ "S_names = json.load(open('output-1/S_all.feature_names.json', 'r'))\n",
+ "S_index = pd.read_csv('output-1/S.ID.csv').set_index(['ID'])\n",
+ "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n",
+ "\n",
+ "X = sparse.load_npz('output-1/X_all.npz')\n",
+ "X_names = json.load(open('output-1/X_all.feature_names.json', 'r'))\n",
+ "X_index = pd.read_csv('output-1/X.ID,t_range.csv').set_index(['ID', 't_range'])\n",
+ "df_X = pd.DataFrame(X.todense().reshape(-1, X.shape[-1]), columns=X_names, index=X_index.index)\n",
+ "\n",
+ "display(df_S)\n",
+ "display(df_X)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Test 2: discretize = True, use_ordinal_encoding = False"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!mkdir -p output-2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Input:\n",
+ " Data : /datadir/input/data.csv\n",
+ " Population: /datadir/input/pop.csv\n",
+ " Config : /datadir/input/config-2.yaml\n",
+ "\n",
+ "Output directory: /datadir/output-2/\n",
+ "\n",
+ "Input arguments:\n",
+ " T = 4.0\n",
+ " dt = 1.0\n",
+ " θ₁ = 0.001\n",
+ " θ₂ = 0.001\n",
+ " θ_freq = 1.0\n",
+ " k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "discretize = yes\n",
+ " S discretization bins: to be computed from data\n",
+ " X discretization bins: to be computed from data\n",
+ "\n",
+ "N = 4\n",
+ "L = 4\n",
+ "\n",
+ "\n",
+ "================================================================================\n",
+ "2) Transform; 3) Post-filter\n",
+ "================================================================================\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Detecting and parsing value types\n",
+ "--------------------------------------------------------------------------------\n",
+ "Saved as: /datadir/output-2/value_types.csv\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Separate time-invariant and time-dependent\n",
+ "--------------------------------------------------------------------------------\n",
+ "Variables (time-invariant): 3\n",
+ "Variables (time-dependent): 4\n",
+ "# rows (time-invariant): 9\n",
+ "# rows (time-dependent): 23\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-A) Transform time-invariant data\n",
+ "--------------------------------------------------------------------------------\n",
+ "(N × ^d) table :\t (4, 3)\n",
+ "number of missing entries :\t 3 out of 12 total\n",
+ "Time elapsed: 0.057177 seconds\n",
+ "\n",
+ "Output\n",
+ "S_all, binary features :\t (4, 10)\n",
+ "Time elapsed: 0.212313 seconds\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-B) Transform time-dependent data\n",
+ "--------------------------------------------------------------------------------\n",
+ "Total variables : 4\n",
+ "Frequent variables : ['HR']\n",
+ "M₁ = 1\n",
+ "M₂ = 3\n",
+ "k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "Transforming each example...\n",
+ "100%|█████████████████████████████████████████████| 4/4 [00:01<00:00, 2.82it/s]\n",
+ "DONE: Transforming each example...\n",
+ "(freq) number of missing entries :\t 5 out of 4×4×1=16 total\n",
+ "(freq) number of imputed entries :\t 4\n",
+ "(freq) number of not imputed entries :\t 1\n",
+ "(non-freq) number of missing entries :\t 41 out of 4×4×3=48 total\n",
+ "\n",
+ "(N × L × ^D) table :\t (4, 4, 9)\n",
+ "Time elapsed: 1.567708 seconds\n",
+ "Discretizing features...\n",
+ "\n",
+ "Processing 8 non-boolean variable columns...\n",
+ " Computing bin edges for numeric variables...\n",
+ "100%|█████████████████████████████████████████████| 8/8 [00:00<00:00, 72.63it/s]\n",
+ " Discretizing variables to binary features\n",
+ "100%|█████████████████████████████████████████████| 8/8 [00:00<00:00, 28.40it/s]\n",
+ "Finished discretizing features\n",
+ "\n",
+ "Output\n",
+ "X_all: shape=(4, 4, 29), density=0.203\n",
+ "Time elapsed: 2.102018 seconds\n"
+ ]
+ }
+ ],
+ "source": [
+ "!docker run -it \\\n",
+ " --mount type='bind',src=\"$(pwd)\",target='/datadir' \\\n",
+ " fiddle-v020 \\\n",
+ "python -m FIDDLE.run \\\n",
+ " --data_fname='/datadir/input/data.csv' \\\n",
+ " --population_fname='/datadir/input/pop.csv' \\\n",
+ " --config_fname='/datadir/input/config-2.yaml' \\\n",
+ " --output_dir='/datadir/output-2/' \\\n",
+ " --T=4 --dt=1.0 \\\n",
+ " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n",
+ " --stats_functions 'min' 'max' 'mean' \\\n",
+ " --no_prefilter --no_postfilter"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "[HTML table omitted; see the text/plain output below: 4 rows × 10 columns]"
+ ],
+ "text/plain": [
+ " AGE_value_(32.999, 35.8] AGE_value_(35.8, 38.6] AGE_value_(38.6, 42.0] \\\n",
+ "ID \n",
+ "1 0 0 0 \n",
+ "2 1 0 0 \n",
+ "3 0 0 1 \n",
+ "4 0 0 0 \n",
+ "\n",
+ " AGE_value_(42.0, 46.0] AGE_value_(46.0, 50.0] ROOM_value__101 \\\n",
+ "ID \n",
+ "1 0 1 1 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "\n",
+ " ROOM_value__102 ROOM_value__103 SEX_value_F SEX_value_M \n",
+ "ID \n",
+ "1 0 0 0 1 \n",
+ "2 1 0 0 1 \n",
+ "3 0 1 1 0 \n",
+ "4 0 0 0 0 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "[HTML table omitted; see the text/plain output below: 16 rows × 29 columns]"
+ ],
+ "text/plain": [
+ " HR_mask DRUG_A_RATE_value_48 DRUG_A_ROUTE_value_Bolus \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 1 0 0 \n",
+ " [1.0, 2.0) 1 0 0 \n",
+ " [2.0, 3.0) 1 1 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "2 [0.0, 1.0) 1 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 1 0 0 \n",
+ " [3.0, 4.0) 1 1 0 \n",
+ "3 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 1 0 1 \n",
+ " [2.0, 3.0) 0 0 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "4 [0.0, 1.0) 1 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 1 0 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "\n",
+ " DRUG_A_ROUTE_value_IV DRUG_A_ROUTE_value_Oral LAB_X_value_5 \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 1 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "2 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 0 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "3 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 0 1 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "4 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 0 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "\n",
+ " LAB_X_value_<1 HR_delta_time_(-0.001, 1.0] \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 1 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 1 1 \n",
+ " [3.0, 4.0) 0 1 \n",
+ "2 [0.0, 1.0) 0 1 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 0 1 \n",
+ " [3.0, 4.0) 0 1 \n",
+ "3 [0.0, 1.0) 0 1 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 0 1 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "4 [0.0, 1.0) 0 1 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 0 1 \n",
+ " [3.0, 4.0) 0 1 \n",
+ "\n",
+ " HR_delta_time_(1.0, 2.0] HR_value_(60.999, 69.2] ... \\\n",
+ "ID t_range ... \n",
+ "1 [0.0, 1.0) 0 0 ... \n",
+ " [1.0, 2.0) 0 0 ... \n",
+ " [2.0, 3.0) 0 0 ... \n",
+ " [3.0, 4.0) 0 0 ... \n",
+ "2 [0.0, 1.0) 0 1 ... \n",
+ " [1.0, 2.0) 0 1 ... \n",
+ " [2.0, 3.0) 0 0 ... \n",
+ " [3.0, 4.0) 0 0 ... \n",
+ "3 [0.0, 1.0) 0 0 ... \n",
+ " [1.0, 2.0) 0 0 ... \n",
+ " [2.0, 3.0) 0 0 ... \n",
+ " [3.0, 4.0) 1 0 ... \n",
+ "4 [0.0, 1.0) 0 0 ... \n",
+ " [1.0, 2.0) 0 0 ... \n",
+ " [2.0, 3.0) 0 1 ... \n",
+ " [3.0, 4.0) 0 0 ... \n",
+ "\n",
+ " HR_max_(60.999, 69.2] HR_max_(69.2, 73.6] \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 1 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "2 [0.0, 1.0) 1 0 \n",
+ " [1.0, 2.0) 1 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "3 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "4 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 1 0 \n",
+ " [3.0, 4.0) 0 1 \n",
+ "\n",
+ " HR_max_(73.6, 76.2] HR_max_(76.2, 82.0] HR_max_(82.0, 90.0] \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 1 0 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "2 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 1 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "3 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 1 \n",
+ " [2.0, 3.0) 0 0 1 \n",
+ " [3.0, 4.0) 0 0 1 \n",
+ "4 [0.0, 1.0) 0 1 0 \n",
+ " [1.0, 2.0) 0 1 0 \n",
+ " [2.0, 3.0) 0 0 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "\n",
+ " HR_mean_(60.332, 68.8] HR_mean_(68.8, 73.6] \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 1 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "2 [0.0, 1.0) 1 0 \n",
+ " [1.0, 2.0) 1 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "3 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "4 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 1 0 \n",
+ " [3.0, 4.0) 0 1 \n",
+ "\n",
+ " HR_mean_(73.6, 75.2] HR_mean_(75.2, 82.0] \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 1 0 \n",
+ " [3.0, 4.0) 1 0 \n",
+ "2 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 0 1 \n",
+ " [3.0, 4.0) 1 0 \n",
+ "3 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "4 [0.0, 1.0) 0 1 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "\n",
+ " HR_mean_(82.0, 90.0] \n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 \n",
+ " [1.0, 2.0) 0 \n",
+ " [2.0, 3.0) 0 \n",
+ " [3.0, 4.0) 0 \n",
+ "2 [0.0, 1.0) 0 \n",
+ " [1.0, 2.0) 0 \n",
+ " [2.0, 3.0) 0 \n",
+ " [3.0, 4.0) 0 \n",
+ "3 [0.0, 1.0) 0 \n",
+ " [1.0, 2.0) 1 \n",
+ " [2.0, 3.0) 1 \n",
+ " [3.0, 4.0) 1 \n",
+ "4 [0.0, 1.0) 0 \n",
+ " [1.0, 2.0) 0 \n",
+ " [2.0, 3.0) 0 \n",
+ " [3.0, 4.0) 0 \n",
+ "\n",
+ "[16 rows x 29 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import json\n",
+ "import sparse\n",
+ "\n",
+ "S = sparse.load_npz('output-2/S_all.npz')\n",
+ "S_names = json.load(open('output-2/S_all.feature_names.json', 'r'))\n",
+ "S_index = pd.read_csv('output-2/S.ID.csv').set_index(['ID'])\n",
+ "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n",
+ "\n",
+ "X = sparse.load_npz('output-2/X_all.npz')\n",
+ "X_names = json.load(open('output-2/X_all.feature_names.json', 'r'))\n",
+ "X_index = pd.read_csv('output-2/X.ID,t_range.csv').set_index(['ID', 't_range'])\n",
+ "df_X = pd.DataFrame(X.todense().reshape(-1, X.shape[-1]), columns=X_names, index=X_index.index)\n",
+ "\n",
+ "display(df_S)\n",
+ "display(df_X)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Test 3: discretize = True, use_ordinal_encoding = True"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!mkdir -p output-3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Input:\n",
+ " Data : /datadir/input/data.csv\n",
+ " Population: /datadir/input/pop.csv\n",
+ " Config : /datadir/input/config-3.yaml\n",
+ "\n",
+ "Output directory: /datadir/output-3/\n",
+ "\n",
+ "Input arguments:\n",
+ " T = 4.0\n",
+ " dt = 1.0\n",
+ " θ₁ = 0.001\n",
+ " θ₂ = 0.001\n",
+ " θ_freq = 1.0\n",
+ " k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "discretize = yes\n",
+ " S discretization bins: to be computed from data\n",
+ " X discretization bins: to be computed from data\n",
+ "\n",
+ "N = 4\n",
+ "L = 4\n",
+ "\n",
+ "\n",
+ "================================================================================\n",
+ "2) Transform; 3) Post-filter\n",
+ "================================================================================\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Detecting and parsing value types\n",
+ "--------------------------------------------------------------------------------\n",
+ "Saved as: /datadir/output-3/value_types.csv\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Separate time-invariant and time-dependent\n",
+ "--------------------------------------------------------------------------------\n",
+ "Variables (time-invariant): 3\n",
+ "Variables (time-dependent): 4\n",
+ "# rows (time-invariant): 9\n",
+ "# rows (time-dependent): 23\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-A) Transform time-invariant data\n",
+ "--------------------------------------------------------------------------------\n",
+ "(N × ^d) table :\t (4, 3)\n",
+ "number of missing entries :\t 3 out of 12 total\n",
+ "Time elapsed: 0.047925 seconds\n",
+ "\n",
+ "Output\n",
+ "S_all, binary features :\t (4, 10)\n",
+ "Time elapsed: 0.147781 seconds\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-B) Transform time-dependent data\n",
+ "--------------------------------------------------------------------------------\n",
+ "Total variables : 4\n",
+ "Frequent variables : ['HR']\n",
+ "M₁ = 1\n",
+ "M₂ = 3\n",
+ "k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "Transforming each example...\n",
+ "100%|█████████████████████████████████████████████| 4/4 [00:01<00:00, 3.53it/s]\n",
+ "DONE: Transforming each example...\n",
+ "(freq) number of missing entries :\t 5 out of 4×4×1=16 total\n",
+ "(freq) number of imputed entries :\t 4\n",
+ "(freq) number of not imputed entries :\t 1\n",
+ "(non-freq) number of missing entries :\t 41 out of 4×4×3=48 total\n",
+ "\n",
+ "(N × L × ^D) table :\t (4, 4, 9)\n",
+ "Time elapsed: 1.239074 seconds\n",
+ "Discretizing features...\n",
+ "\n",
+ "Processing 8 non-boolean variable columns...\n",
+ " Computing bin edges for numeric variables...\n",
+ "100%|████████████████████████████████████████████| 8/8 [00:00<00:00, 165.76it/s]\n",
+ " Discretizing variables to binary features\n",
+ "100%|█████████████████████████████████████████████| 8/8 [00:00<00:00, 24.97it/s]\n",
+ "Finished discretizing features\n",
+ "\n",
+ "Output\n",
+ "X_all: shape=(4, 4, 29), density=0.474\n",
+ "Time elapsed: 1.718451 seconds\n"
+ ]
+ }
+ ],
+ "source": [
+ "!docker run -it \\\n",
+ " --mount type='bind',src=\"$(pwd)\",target='/datadir' \\\n",
+ " fiddle-v020 \\\n",
+ "python -m FIDDLE.run \\\n",
+ " --data_fname='/datadir/input/data.csv' \\\n",
+ " --population_fname='/datadir/input/pop.csv' \\\n",
+ " --config_fname='/datadir/input/config-3.yaml' \\\n",
+ " --output_dir='/datadir/output-3/' \\\n",
+ " --T=4 --dt=1.0 \\\n",
+ " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n",
+ " --stats_functions 'min' 'max' 'mean' \\\n",
+ " --no_prefilter --no_postfilter"
+ ]
+ },
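+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Quick sanity check (added for illustration; not part of the original test): the bind mount maps the current directory to `/datadir`, so the container's `/datadir/output-3/` should now be visible locally as `output-3/`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# List the files FIDDLE wrote into the mounted output directory\n",
+    "import os\n",
+    "sorted(os.listdir('output-3'))"
+   ]
+  },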
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " AGE_value>=33.0 | \n",
+ " AGE_value>=35.8 | \n",
+ " AGE_value>=38.6 | \n",
+ " AGE_value>=42.0 | \n",
+ " AGE_value>=46.0 | \n",
+ " ROOM_value__101 | \n",
+ " ROOM_value__102 | \n",
+ " ROOM_value__103 | \n",
+ " SEX_value_F | \n",
+ " SEX_value_M | \n",
+ "
\n",
+ " \n",
+ " ID | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " AGE_value>=33.0 AGE_value>=35.8 AGE_value>=38.6 AGE_value>=42.0 \\\n",
+ "ID \n",
+ "1 1 1 1 1 \n",
+ "2 1 0 0 0 \n",
+ "3 1 1 1 0 \n",
+ "4 0 0 0 0 \n",
+ "\n",
+ " AGE_value>=46.0 ROOM_value__101 ROOM_value__102 ROOM_value__103 \\\n",
+ "ID \n",
+ "1 1 1 0 0 \n",
+ "2 0 0 1 0 \n",
+ "3 0 0 0 1 \n",
+ "4 0 0 0 0 \n",
+ "\n",
+ " SEX_value_F SEX_value_M \n",
+ "ID \n",
+ "1 0 1 \n",
+ "2 0 1 \n",
+ "3 1 0 \n",
+ "4 0 0 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " HR_mask | \n",
+ " DRUG_A_RATE_value_48 | \n",
+ " DRUG_A_ROUTE_value_Bolus | \n",
+ " DRUG_A_ROUTE_value_IV | \n",
+ " DRUG_A_ROUTE_value_Oral | \n",
+ " LAB_X_value_5 | \n",
+ " LAB_X_value_<1 | \n",
+ " HR_delta_time>=0.0 | \n",
+ " HR_delta_time>=1.0 | \n",
+ " HR_value>=61.0 | \n",
+ " ... | \n",
+ " HR_max>=61.0 | \n",
+ " HR_max>=69.2 | \n",
+ " HR_max>=73.6 | \n",
+ " HR_max>=76.2 | \n",
+ " HR_max>=82.00000000000001 | \n",
+ " HR_mean>=60.333333333333336 | \n",
+ " HR_mean>=68.80000000000001 | \n",
+ " HR_mean>=73.6 | \n",
+ " HR_mean>=75.2 | \n",
+ " HR_mean>=82.00000000000001 | \n",
+ "
\n",
+ " \n",
+ " ID | \n",
+ " t_range | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " [0.0, 1.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [1.0, 2.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [2.0, 3.0) | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [3.0, 4.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " [0.0, 1.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [1.0, 2.0) | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [2.0, 3.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [3.0, 4.0) | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " [0.0, 1.0) | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [1.0, 2.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " [2.0, 3.0) | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " [3.0, 4.0) | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " [0.0, 1.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [1.0, 2.0) | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [2.0, 3.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [3.0, 4.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
16 rows × 29 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " HR_mask DRUG_A_RATE_value_48 DRUG_A_ROUTE_value_Bolus \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 1 0 0 \n",
+ " [1.0, 2.0) 1 0 0 \n",
+ " [2.0, 3.0) 1 1 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "2 [0.0, 1.0) 1 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 1 0 0 \n",
+ " [3.0, 4.0) 1 1 0 \n",
+ "3 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 1 0 1 \n",
+ " [2.0, 3.0) 0 0 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "4 [0.0, 1.0) 1 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 1 0 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "\n",
+ " DRUG_A_ROUTE_value_IV DRUG_A_ROUTE_value_Oral LAB_X_value_5 \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 1 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "2 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 0 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "3 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 0 1 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "4 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 0 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "\n",
+ " LAB_X_value_<1 HR_delta_time>=0.0 HR_delta_time>=1.0 \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 1 0 \n",
+ " [1.0, 2.0) 0 1 0 \n",
+ " [2.0, 3.0) 1 1 0 \n",
+ " [3.0, 4.0) 0 1 0 \n",
+ "2 [0.0, 1.0) 0 1 0 \n",
+ " [1.0, 2.0) 0 1 1 \n",
+ " [2.0, 3.0) 0 1 0 \n",
+ " [3.0, 4.0) 0 1 0 \n",
+ "3 [0.0, 1.0) 0 1 0 \n",
+ " [1.0, 2.0) 0 1 0 \n",
+ " [2.0, 3.0) 0 1 1 \n",
+ " [3.0, 4.0) 0 1 1 \n",
+ "4 [0.0, 1.0) 0 1 0 \n",
+ " [1.0, 2.0) 0 1 1 \n",
+ " [2.0, 3.0) 0 1 0 \n",
+ " [3.0, 4.0) 0 1 0 \n",
+ "\n",
+ " HR_value>=61.0 ... HR_max>=61.0 HR_max>=69.2 HR_max>=73.6 \\\n",
+ "ID t_range ... \n",
+ "1 [0.0, 1.0) 1 ... 1 1 0 \n",
+ " [1.0, 2.0) 1 ... 1 1 0 \n",
+ " [2.0, 3.0) 1 ... 1 1 1 \n",
+ " [3.0, 4.0) 1 ... 1 1 1 \n",
+ "2 [0.0, 1.0) 1 ... 1 0 0 \n",
+ " [1.0, 2.0) 1 ... 1 0 0 \n",
+ " [2.0, 3.0) 1 ... 1 1 1 \n",
+ " [3.0, 4.0) 1 ... 1 1 1 \n",
+ "3 [0.0, 1.0) 0 ... 0 0 0 \n",
+ " [1.0, 2.0) 1 ... 1 1 1 \n",
+ " [2.0, 3.0) 1 ... 1 1 1 \n",
+ " [3.0, 4.0) 1 ... 1 1 1 \n",
+ "4 [0.0, 1.0) 1 ... 1 1 1 \n",
+ " [1.0, 2.0) 1 ... 1 1 1 \n",
+ " [2.0, 3.0) 1 ... 1 0 0 \n",
+ " [3.0, 4.0) 1 ... 1 1 0 \n",
+ "\n",
+ " HR_max>=76.2 HR_max>=82.00000000000001 \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "2 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 1 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "3 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 1 1 \n",
+ " [2.0, 3.0) 1 1 \n",
+ " [3.0, 4.0) 1 1 \n",
+ "4 [0.0, 1.0) 1 0 \n",
+ " [1.0, 2.0) 1 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "\n",
+ " HR_mean>=60.333333333333336 HR_mean>=68.80000000000001 \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 1 1 \n",
+ " [1.0, 2.0) 1 1 \n",
+ " [2.0, 3.0) 1 1 \n",
+ " [3.0, 4.0) 1 1 \n",
+ "2 [0.0, 1.0) 1 0 \n",
+ " [1.0, 2.0) 1 0 \n",
+ " [2.0, 3.0) 1 1 \n",
+ " [3.0, 4.0) 1 1 \n",
+ "3 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 1 1 \n",
+ " [2.0, 3.0) 1 1 \n",
+ " [3.0, 4.0) 1 1 \n",
+ "4 [0.0, 1.0) 1 1 \n",
+ " [1.0, 2.0) 1 1 \n",
+ " [2.0, 3.0) 1 0 \n",
+ " [3.0, 4.0) 1 1 \n",
+ "\n",
+ " HR_mean>=73.6 HR_mean>=75.2 HR_mean>=82.00000000000001 \n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 1 0 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "2 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 1 1 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "3 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 1 1 1 \n",
+ " [2.0, 3.0) 1 1 1 \n",
+ " [3.0, 4.0) 1 1 1 \n",
+ "4 [0.0, 1.0) 1 1 0 \n",
+ " [1.0, 2.0) 1 1 0 \n",
+ " [2.0, 3.0) 0 0 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "\n",
+ "[16 rows x 29 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import json\n",
+ "import sparse\n",
+ "\n",
+ "S = sparse.load_npz('output-3/S_all.npz')\n",
+ "S_names = json.load(open('output-3/S_all.feature_names.json', 'r'))\n",
+ "S_index = pd.read_csv('output-3/S.ID.csv').set_index(['ID'])\n",
+ "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n",
+ "\n",
+ "X = sparse.load_npz('output-3/X_all.npz')\n",
+ "X_names = json.load(open('output-3/X_all.feature_names.json', 'r'))\n",
+ "X_index = pd.read_csv('output-3/X.ID,t_range.csv').set_index(['ID', 't_range'])\n",
+ "df_X = pd.DataFrame(X.todense().reshape(-1, X.shape[-1]), columns=X_names, index=X_index.index)\n",
+ "\n",
+ "display(df_S)\n",
+ "display(df_X)"
+ ]
+ },
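+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A minimal consistency check on the loaded tensors (illustrative; it assumes only what the tables above show): every feature is binary, and one-hot groups such as `SEX_value_F`/`SEX_value_M` never co-fire (all-zero rows are possible when the value is missing)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# All entries should be 0/1 after discretization\n",
+    "assert df_S.isin([0, 1]).all().all()\n",
+    "assert df_X.isin([0, 1]).all().all()\n",
+    "\n",
+    "# Mutually exclusive one-hot columns: at most one fires per row\n",
+    "assert (df_S['SEX_value_F'] + df_S['SEX_value_M'] <= 1).all()"
+   ]
+  },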
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/tests/small_test/Run.ipynb b/tests/small_test/Run.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..b9a073f0aefaf94cee655782941896fd07baf23e
--- /dev/null
+++ b/tests/small_test/Run.ipynb
@@ -0,0 +1,3390 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!rm -rf output-*/"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Test 1: discretize = False"
+ ]
+ },
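+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The contents of `./input/config-1.yaml` are not reproduced in this notebook; judging from the run log below (`discretize = no`), a minimal config consistent with this test would be:\n",
+    "\n",
+    "```yaml\n",
+    "discretize: no\n",
+    "```"
+   ]
+  },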
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!mkdir -p output-1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Input:\n",
+ " Data : ./input/data.csv\n",
+ " Population: ./input/pop.csv\n",
+ " Config : ./input/config-1.yaml\n",
+ "\n",
+ "Output directory: ./output-1/\n",
+ "\n",
+ "Input arguments:\n",
+ " T = 4.0\n",
+ " dt = 1.0\n",
+ " θ₁ = 0.001\n",
+ " θ₂ = 0.001\n",
+ " θ_freq = 1.0\n",
+ " k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "discretize = no\n",
+ "\n",
+ "N = 4\n",
+ "L = 4\n",
+ "\n",
+ "\n",
+ "================================================================================\n",
+ "2) Transform; 3) Post-filter\n",
+ "================================================================================\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Detecting and parsing value types\n",
+ "--------------------------------------------------------------------------------\n",
+ "Saved as: ./output-1/value_types.csv\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Separate time-invariant and time-dependent\n",
+ "--------------------------------------------------------------------------------\n",
+ "Variables (time-invariant): 3\n",
+ "Variables (time-dependent): 4\n",
+ "# rows (time-invariant): 9\n",
+ "# rows (time-dependent): 23\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-A) Transform time-invariant data\n",
+ "--------------------------------------------------------------------------------\n",
+ "(N × ^d) table :\t (4, 3)\n",
+ "number of missing entries :\t 3 out of 12 total\n",
+ "Time elapsed: 0.027288 seconds\n",
+ "\n",
+ "Output\n",
+ "s_all, binary features :\t (4, 6)\n",
+ "Time elapsed: 0.058650 seconds\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-B) Transform time-dependent data\n",
+ "--------------------------------------------------------------------------------\n",
+ "Total variables : 4\n",
+ "Frequent variables : ['HR']\n",
+ "M₁ = 1\n",
+ "M₂ = 3\n",
+ "k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "Transforming each example...\n",
+ "100%|█████████████████████████████████████████████| 4/4 [00:00<00:00, 4.82it/s]\n",
+ "DONE: Transforming each example...\n",
+ "(freq) number of missing entries :\t 5 out of 4×4×1=16 total\n",
+ "(freq) number of imputed entries :\t 4\n",
+ "(freq) number of not imputed entries :\t 1\n",
+ "(non-freq) number of missing entries :\t 41 out of 4×4×3=48 total\n",
+ "\n",
+ "(N × L × ^D) table :\t (4, 4, 9)\n",
+ "Time elapsed: 0.917519 seconds\n",
+ "Discretizing features...\n",
+ "\n",
+ "Discretizing categorical features...\n",
+ "100%|████████████████████████████████████████████| 9/9 [00:00<00:00, 202.11it/s]\n",
+ "Finished discretizing features\n",
+ "\n",
+ "Output\n",
+ "X_all: shape=(4, 4, 12), density=0.599\n",
+ "Time elapsed: 1.008456 seconds\n",
+ "\u001b[0m"
+ ]
+ }
+ ],
+ "source": [
+ "! PYTHONPATH=\"$PYTHONPATH:../../\" \\\n",
+ "python -m FIDDLE.run \\\n",
+ " --data_fname='./input/data.csv' \\\n",
+ " --population_fname='./input/pop.csv' \\\n",
+ " --config_fname='./input/config-1.yaml' \\\n",
+ " --output_dir='./output-1/' \\\n",
+ " --T=4 --dt=1.0 \\\n",
+ " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n",
+ " --stats_functions 'min' 'max' 'mean' \\\n",
+ " --no_prefilter --no_postfilter"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " AGE_value | \n",
+ " ROOM_value:_101 | \n",
+ " ROOM_value:_102 | \n",
+ " ROOM_value:_103 | \n",
+ " SEX_value:F | \n",
+ " SEX_value:M | \n",
+ "
\n",
+ " \n",
+ " ID | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 50.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 33.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 40.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 41.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " AGE_value ROOM_value:_101 ROOM_value:_102 ROOM_value:_103 SEX_value:F \\\n",
+ "ID \n",
+ "1 50.0 1.0 0.0 0.0 0.0 \n",
+ "2 33.0 0.0 1.0 0.0 0.0 \n",
+ "3 40.0 0.0 0.0 1.0 1.0 \n",
+ "4 41.0 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " SEX_value:M \n",
+ "ID \n",
+ "1 1.0 \n",
+ "2 1.0 \n",
+ "3 0.0 \n",
+ "4 0.0 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " HR_mask | \n",
+ " DRUG_A_RATE_value | \n",
+ " DRUG_A_ROUTE_value:Bolus | \n",
+ " DRUG_A_ROUTE_value:IV | \n",
+ " DRUG_A_ROUTE_value:Oral | \n",
+ " LAB_X_value | \n",
+ " HR_delta_time | \n",
+ " HR_value | \n",
+ " HR_min | \n",
+ " HR_max | \n",
+ " HR_mean | \n",
+ " LAB_X_value_str:<1 | \n",
+ "
\n",
+ " \n",
+ " ID | \n",
+ " t_range | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " [0.0, 1.0) | \n",
+ " 1.0 | \n",
+ " 48.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 0.0 | \n",
+ " 71.000000 | \n",
+ " 70.0 | \n",
+ " 71.000000 | \n",
+ " 70.500000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " [1.0, 2.0) | \n",
+ " 1.0 | \n",
+ " 48.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 0.0 | \n",
+ " 73.000000 | \n",
+ " 72.0 | \n",
+ " 73.000000 | \n",
+ " 72.500000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " [2.0, 3.0) | \n",
+ " 1.0 | \n",
+ " 48.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 5.0 | \n",
+ " 0.0 | \n",
+ " 74.000000 | \n",
+ " 74.0 | \n",
+ " 74.000000 | \n",
+ " 74.000000 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " [3.0, 4.0) | \n",
+ " 1.0 | \n",
+ " 48.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 0.0 | \n",
+ " 75.000000 | \n",
+ " 75.0 | \n",
+ " 75.000000 | \n",
+ " 75.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " [0.0, 1.0) | \n",
+ " 1.0 | \n",
+ " 48.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 0.0 | \n",
+ " 61.000000 | \n",
+ " 60.0 | \n",
+ " 61.000000 | \n",
+ " 60.333333 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " [1.0, 2.0) | \n",
+ " 0.0 | \n",
+ " 48.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 1.0 | \n",
+ " 61.000000 | \n",
+ " 61.0 | \n",
+ " 61.000000 | \n",
+ " 61.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " [2.0, 3.0) | \n",
+ " 1.0 | \n",
+ " 48.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 0.0 | \n",
+ " 78.000000 | \n",
+ " 73.0 | \n",
+ " 78.000000 | \n",
+ " 75.500000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " [3.0, 4.0) | \n",
+ " 1.0 | \n",
+ " 48.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 0.0 | \n",
+ " 75.000000 | \n",
+ " 75.0 | \n",
+ " 75.000000 | \n",
+ " 75.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " [0.0, 1.0) | \n",
+ " 0.0 | \n",
+ " 48.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 0.0 | \n",
+ " 75.533333 | \n",
+ " 75.0 | \n",
+ " 75.533333 | \n",
+ " 75.255556 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " [1.0, 2.0) | \n",
+ " 1.0 | \n",
+ " 48.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 0.0 | \n",
+ " 90.000000 | \n",
+ " 90.0 | \n",
+ " 90.000000 | \n",
+ " 90.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " [2.0, 3.0) | \n",
+ " 0.0 | \n",
+ " 48.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 1.0 | \n",
+ " 90.000000 | \n",
+ " 90.0 | \n",
+ " 90.000000 | \n",
+ " 90.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " [3.0, 4.0) | \n",
+ " 0.0 | \n",
+ " 48.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 2.0 | \n",
+ " 90.000000 | \n",
+ " 90.0 | \n",
+ " 90.000000 | \n",
+ " 90.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " [0.0, 1.0) | \n",
+ " 1.0 | \n",
+ " 48.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 0.0 | \n",
+ " 80.000000 | \n",
+ " 80.0 | \n",
+ " 80.000000 | \n",
+ " 80.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " [1.0, 2.0) | \n",
+ " 0.0 | \n",
+ " 48.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 1.0 | \n",
+ " 80.000000 | \n",
+ " 80.0 | \n",
+ " 80.000000 | \n",
+ " 80.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " [2.0, 3.0) | \n",
+ " 1.0 | \n",
+ " 48.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 0.0 | \n",
+ " 62.000000 | \n",
+ " 62.0 | \n",
+ " 62.000000 | \n",
+ " 62.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " [3.0, 4.0) | \n",
+ " 1.0 | \n",
+ " 48.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 0.0 | \n",
+ " 73.000000 | \n",
+ " 73.0 | \n",
+ " 73.000000 | \n",
+ " 73.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " HR_mask DRUG_A_RATE_value DRUG_A_ROUTE_value:Bolus \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 1.0 48.0 0.0 \n",
+ " [1.0, 2.0) 1.0 48.0 0.0 \n",
+ " [2.0, 3.0) 1.0 48.0 0.0 \n",
+ " [3.0, 4.0) 1.0 48.0 0.0 \n",
+ "2 [0.0, 1.0) 1.0 48.0 0.0 \n",
+ " [1.0, 2.0) 0.0 48.0 0.0 \n",
+ " [2.0, 3.0) 1.0 48.0 0.0 \n",
+ " [3.0, 4.0) 1.0 48.0 0.0 \n",
+ "3 [0.0, 1.0) 0.0 48.0 0.0 \n",
+ " [1.0, 2.0) 1.0 48.0 1.0 \n",
+ " [2.0, 3.0) 0.0 48.0 0.0 \n",
+ " [3.0, 4.0) 0.0 48.0 0.0 \n",
+ "4 [0.0, 1.0) 1.0 48.0 0.0 \n",
+ " [1.0, 2.0) 0.0 48.0 0.0 \n",
+ " [2.0, 3.0) 1.0 48.0 0.0 \n",
+ " [3.0, 4.0) 1.0 48.0 0.0 \n",
+ "\n",
+ " DRUG_A_ROUTE_value:IV DRUG_A_ROUTE_value:Oral LAB_X_value \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0.0 0.0 5.0 \n",
+ " [1.0, 2.0) 0.0 0.0 5.0 \n",
+ " [2.0, 3.0) 0.0 1.0 5.0 \n",
+ " [3.0, 4.0) 0.0 0.0 5.0 \n",
+ "2 [0.0, 1.0) 0.0 0.0 5.0 \n",
+ " [1.0, 2.0) 0.0 0.0 5.0 \n",
+ " [2.0, 3.0) 0.0 0.0 5.0 \n",
+ " [3.0, 4.0) 1.0 0.0 5.0 \n",
+ "3 [0.0, 1.0) 0.0 0.0 5.0 \n",
+ " [1.0, 2.0) 0.0 0.0 5.0 \n",
+ " [2.0, 3.0) 0.0 0.0 5.0 \n",
+ " [3.0, 4.0) 0.0 0.0 5.0 \n",
+ "4 [0.0, 1.0) 0.0 0.0 5.0 \n",
+ " [1.0, 2.0) 0.0 0.0 5.0 \n",
+ " [2.0, 3.0) 0.0 0.0 5.0 \n",
+ " [3.0, 4.0) 0.0 0.0 5.0 \n",
+ "\n",
+ " HR_delta_time HR_value HR_min HR_max HR_mean \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0.0 71.000000 70.0 71.000000 70.500000 \n",
+ " [1.0, 2.0) 0.0 73.000000 72.0 73.000000 72.500000 \n",
+ " [2.0, 3.0) 0.0 74.000000 74.0 74.000000 74.000000 \n",
+ " [3.0, 4.0) 0.0 75.000000 75.0 75.000000 75.000000 \n",
+ "2 [0.0, 1.0) 0.0 61.000000 60.0 61.000000 60.333333 \n",
+ " [1.0, 2.0) 1.0 61.000000 61.0 61.000000 61.000000 \n",
+ " [2.0, 3.0) 0.0 78.000000 73.0 78.000000 75.500000 \n",
+ " [3.0, 4.0) 0.0 75.000000 75.0 75.000000 75.000000 \n",
+ "3 [0.0, 1.0) 0.0 75.533333 75.0 75.533333 75.255556 \n",
+ " [1.0, 2.0) 0.0 90.000000 90.0 90.000000 90.000000 \n",
+ " [2.0, 3.0) 1.0 90.000000 90.0 90.000000 90.000000 \n",
+ " [3.0, 4.0) 2.0 90.000000 90.0 90.000000 90.000000 \n",
+ "4 [0.0, 1.0) 0.0 80.000000 80.0 80.000000 80.000000 \n",
+ " [1.0, 2.0) 1.0 80.000000 80.0 80.000000 80.000000 \n",
+ " [2.0, 3.0) 0.0 62.000000 62.0 62.000000 62.000000 \n",
+ " [3.0, 4.0) 0.0 73.000000 73.0 73.000000 73.000000 \n",
+ "\n",
+ " LAB_X_value_str:<1 \n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0.0 \n",
+ " [1.0, 2.0) 0.0 \n",
+ " [2.0, 3.0) 1.0 \n",
+ " [3.0, 4.0) 0.0 \n",
+ "2 [0.0, 1.0) 0.0 \n",
+ " [1.0, 2.0) 0.0 \n",
+ " [2.0, 3.0) 0.0 \n",
+ " [3.0, 4.0) 0.0 \n",
+ "3 [0.0, 1.0) 0.0 \n",
+ " [1.0, 2.0) 0.0 \n",
+ " [2.0, 3.0) 0.0 \n",
+ " [3.0, 4.0) 0.0 \n",
+ "4 [0.0, 1.0) 0.0 \n",
+ " [1.0, 2.0) 0.0 \n",
+ " [2.0, 3.0) 0.0 \n",
+ " [3.0, 4.0) 0.0 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import json\n",
+ "import sparse\n",
+ "\n",
+ "S = sparse.load_npz('output-1/S_all.npz')\n",
+ "S_names = json.load(open('output-1/S_all.feature_names.json', 'r'))\n",
+ "S_index = pd.read_csv('output-1/S.ID.csv').set_index(['ID'])\n",
+ "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n",
+ "\n",
+ "X = sparse.load_npz('output-1/X_all.npz')\n",
+ "X_names = json.load(open('output-1/X_all.feature_names.json', 'r'))\n",
+ "X_index = pd.read_csv('output-1/X.ID,t_range.csv').set_index(['ID', 't_range'])\n",
+ "df_X = pd.DataFrame(X.todense().reshape(-1, X.shape[-1]), columns=X_names, index=X_index.index)\n",
+ "\n",
+ "display(df_S)\n",
+ "display(df_X)"
+ ]
+ },
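+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "With `discretize: no`, numeric columns such as `HR_value` and `HR_mean` keep their raw floating-point values, and only categorical variables like `ROOM` and `SEX` are one-hot encoded. A quick (illustrative) check:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Non-discretized output retains raw numeric values, not binary indicators\n",
+    "assert not df_X['HR_value'].isin([0, 1]).all()"
+   ]
+  },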
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Test 2: discretize = True, use_ordinal_encoding = False"
+ ]
+ },
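+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As above, `./input/config-2.yaml` is not shown here; a minimal config consistent with the run log below would be:\n",
+    "\n",
+    "```yaml\n",
+    "discretize: yes\n",
+    "use_ordinal_encoding: no\n",
+    "```"
+   ]
+  },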
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!mkdir -p output-2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Input:\n",
+ " Data : ./input/data.csv\n",
+ " Population: ./input/pop.csv\n",
+ " Config : ./input/config-2.yaml\n",
+ "\n",
+ "Output directory: ./output-2/\n",
+ "\n",
+ "Input arguments:\n",
+ " T = 4.0\n",
+ " dt = 1.0\n",
+ " θ₁ = 0.001\n",
+ " θ₂ = 0.001\n",
+ " θ_freq = 1.0\n",
+ " k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "discretize = yes\n",
+ " S discretization bins: to be computed from data\n",
+ " X discretization bins: to be computed from data\n",
+ "\n",
+ "N = 4\n",
+ "L = 4\n",
+ "\n",
+ "\n",
+ "================================================================================\n",
+ "2) Transform; 3) Post-filter\n",
+ "================================================================================\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Detecting and parsing value types\n",
+ "--------------------------------------------------------------------------------\n",
+ "Saved as: ./output-2/value_types.csv\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Separate time-invariant and time-dependent\n",
+ "--------------------------------------------------------------------------------\n",
+ "Variables (time-invariant): 3\n",
+ "Variables (time-dependent): 4\n",
+ "# rows (time-invariant): 9\n",
+ "# rows (time-dependent): 23\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-A) Transform time-invariant data\n",
+ "--------------------------------------------------------------------------------\n",
+ "(N × ^d) table :\t (4, 3)\n",
+ "number of missing entries :\t 3 out of 12 total\n",
+ "Time elapsed: 0.020094 seconds\n",
+ "\n",
+ "Output\n",
+ "s_all, binary features :\t (4, 10)\n",
+ "Time elapsed: 0.065039 seconds\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-B) Transform time-dependent data\n",
+ "--------------------------------------------------------------------------------\n",
+ "Total variables : 4\n",
+ "Frequent variables : ['HR']\n",
+ "M₁ = 1\n",
+ "M₂ = 3\n",
+ "k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "Transforming each example...\n",
+ "100%|█████████████████████████████████████████████| 4/4 [00:00<00:00, 6.65it/s]\n",
+ "DONE: Transforming each example...\n",
+ "(freq) number of missing entries :\t 5 out of 4×4×1=16 total\n",
+ "(freq) number of imputed entries :\t 4\n",
+ "(freq) number of not imputed entries :\t 1\n",
+ "(non-freq) number of missing entries :\t 41 out of 4×4×3=48 total\n",
+ "\n",
+ "(N × L × ^D) table :\t (4, 4, 9)\n",
+ "Time elapsed: 0.653901 seconds\n",
+ "Discretizing features...\n",
+ "\n",
+ "Processing 8 non-boolean variable columns...\n",
+ " Computing bin edges for numeric variables...\n",
+ "100%|████████████████████████████████████████████| 8/8 [00:00<00:00, 228.60it/s]\n",
+ " Discretizing variables to binary features\n",
+ "100%|█████████████████████████████████████████████| 8/8 [00:00<00:00, 87.10it/s]\n",
+ "Finished discretizing features\n",
+ "\n",
+ "Output\n",
+ "X_all: shape=(4, 4, 29), density=0.203\n",
+ "Time elapsed: 0.800083 seconds\n",
+ "\u001b[0m"
+ ]
+ }
+ ],
+ "source": [
+ "! PYTHONPATH=\"$PYTHONPATH:../../\" \\\n",
+ "python -m FIDDLE.run \\\n",
+ " --data_fname='./input/data.csv' \\\n",
+ " --population_fname='./input/pop.csv' \\\n",
+ " --config_fname='./input/config-2.yaml' \\\n",
+ " --output_dir='./output-2/' \\\n",
+ " --T=4 --dt=1.0 \\\n",
+ " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n",
+ " --stats_functions 'min' 'max' 'mean' \\\n",
+ " --no_prefilter --no_postfilter"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " AGE_value_(32.999, 35.8] | \n",
+ " AGE_value_(35.8, 38.6] | \n",
+ " AGE_value_(38.6, 42.0] | \n",
+ " AGE_value_(42.0, 46.0] | \n",
+ " AGE_value_(46.0, 50.0] | \n",
+ " ROOM_value__101 | \n",
+ " ROOM_value__102 | \n",
+ " ROOM_value__103 | \n",
+ " SEX_value_F | \n",
+ " SEX_value_M | \n",
+ "
\n",
+ " \n",
+ " ID | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " AGE_value_(32.999, 35.8] AGE_value_(35.8, 38.6] AGE_value_(38.6, 42.0] \\\n",
+ "ID \n",
+ "1 0 0 0 \n",
+ "2 1 0 0 \n",
+ "3 0 0 1 \n",
+ "4 0 0 0 \n",
+ "\n",
+ " AGE_value_(42.0, 46.0] AGE_value_(46.0, 50.0] ROOM_value__101 \\\n",
+ "ID \n",
+ "1 0 1 1 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "\n",
+ " ROOM_value__102 ROOM_value__103 SEX_value_F SEX_value_M \n",
+ "ID \n",
+ "1 0 0 0 1 \n",
+ "2 1 0 0 1 \n",
+ "3 0 1 1 0 \n",
+ "4 0 0 0 0 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " HR_mask | \n",
+ " DRUG_A_RATE_value_48 | \n",
+ " DRUG_A_ROUTE_value_Bolus | \n",
+ " DRUG_A_ROUTE_value_IV | \n",
+ " DRUG_A_ROUTE_value_Oral | \n",
+ " LAB_X_value_5 | \n",
+ " LAB_X_value_<1 | \n",
+ " HR_delta_time_(-0.001, 1.0] | \n",
+ " HR_delta_time_(1.0, 2.0] | \n",
+ " HR_value_(60.999, 69.2] | \n",
+ " ... | \n",
+ " HR_max_(60.999, 69.2] | \n",
+ " HR_max_(69.2, 73.6] | \n",
+ " HR_max_(73.6, 76.2] | \n",
+ " HR_max_(76.2, 82.0] | \n",
+ " HR_max_(82.0, 90.0] | \n",
+ " HR_mean_(60.332, 68.8] | \n",
+ " HR_mean_(68.8, 73.6] | \n",
+ " HR_mean_(73.6, 75.2] | \n",
+ " HR_mean_(75.2, 82.0] | \n",
+ " HR_mean_(82.0, 90.0] | \n",
+ "
\n",
+ " \n",
+ " ID | \n",
+ " t_range | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " [0.0, 1.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [1.0, 2.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [2.0, 3.0) | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [3.0, 4.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " [0.0, 1.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [1.0, 2.0) | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [2.0, 3.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [3.0, 4.0) | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " [0.0, 1.0) | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [1.0, 2.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " [2.0, 3.0) | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " [3.0, 4.0) | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " [0.0, 1.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [1.0, 2.0) | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [2.0, 3.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [3.0, 4.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
16 rows × 29 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " HR_mask DRUG_A_RATE_value_48 DRUG_A_ROUTE_value_Bolus \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 1 0 0 \n",
+ " [1.0, 2.0) 1 0 0 \n",
+ " [2.0, 3.0) 1 1 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "2 [0.0, 1.0) 1 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 1 0 0 \n",
+ " [3.0, 4.0) 1 1 0 \n",
+ "3 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 1 0 1 \n",
+ " [2.0, 3.0) 0 0 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "4 [0.0, 1.0) 1 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 1 0 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "\n",
+ " DRUG_A_ROUTE_value_IV DRUG_A_ROUTE_value_Oral LAB_X_value_5 \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 1 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "2 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 0 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "3 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 0 1 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "4 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 0 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "\n",
+ " LAB_X_value_<1 HR_delta_time_(-0.001, 1.0] \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 1 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 1 1 \n",
+ " [3.0, 4.0) 0 1 \n",
+ "2 [0.0, 1.0) 0 1 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 0 1 \n",
+ " [3.0, 4.0) 0 1 \n",
+ "3 [0.0, 1.0) 0 1 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 0 1 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "4 [0.0, 1.0) 0 1 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 0 1 \n",
+ " [3.0, 4.0) 0 1 \n",
+ "\n",
+ " HR_delta_time_(1.0, 2.0] HR_value_(60.999, 69.2] ... \\\n",
+ "ID t_range ... \n",
+ "1 [0.0, 1.0) 0 0 ... \n",
+ " [1.0, 2.0) 0 0 ... \n",
+ " [2.0, 3.0) 0 0 ... \n",
+ " [3.0, 4.0) 0 0 ... \n",
+ "2 [0.0, 1.0) 0 1 ... \n",
+ " [1.0, 2.0) 0 1 ... \n",
+ " [2.0, 3.0) 0 0 ... \n",
+ " [3.0, 4.0) 0 0 ... \n",
+ "3 [0.0, 1.0) 0 0 ... \n",
+ " [1.0, 2.0) 0 0 ... \n",
+ " [2.0, 3.0) 0 0 ... \n",
+ " [3.0, 4.0) 1 0 ... \n",
+ "4 [0.0, 1.0) 0 0 ... \n",
+ " [1.0, 2.0) 0 0 ... \n",
+ " [2.0, 3.0) 0 1 ... \n",
+ " [3.0, 4.0) 0 0 ... \n",
+ "\n",
+ " HR_max_(60.999, 69.2] HR_max_(69.2, 73.6] \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 1 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "2 [0.0, 1.0) 1 0 \n",
+ " [1.0, 2.0) 1 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "3 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "4 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 1 0 \n",
+ " [3.0, 4.0) 0 1 \n",
+ "\n",
+ " HR_max_(73.6, 76.2] HR_max_(76.2, 82.0] HR_max_(82.0, 90.0] \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 1 0 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "2 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 1 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "3 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 1 \n",
+ " [2.0, 3.0) 0 0 1 \n",
+ " [3.0, 4.0) 0 0 1 \n",
+ "4 [0.0, 1.0) 0 1 0 \n",
+ " [1.0, 2.0) 0 1 0 \n",
+ " [2.0, 3.0) 0 0 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "\n",
+ " HR_mean_(60.332, 68.8] HR_mean_(68.8, 73.6] \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 1 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "2 [0.0, 1.0) 1 0 \n",
+ " [1.0, 2.0) 1 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "3 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "4 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 1 0 \n",
+ " [3.0, 4.0) 0 1 \n",
+ "\n",
+ " HR_mean_(73.6, 75.2] HR_mean_(75.2, 82.0] \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 1 0 \n",
+ " [3.0, 4.0) 1 0 \n",
+ "2 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 0 1 \n",
+ " [3.0, 4.0) 1 0 \n",
+ "3 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "4 [0.0, 1.0) 0 1 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "\n",
+ " HR_mean_(82.0, 90.0] \n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 \n",
+ " [1.0, 2.0) 0 \n",
+ " [2.0, 3.0) 0 \n",
+ " [3.0, 4.0) 0 \n",
+ "2 [0.0, 1.0) 0 \n",
+ " [1.0, 2.0) 0 \n",
+ " [2.0, 3.0) 0 \n",
+ " [3.0, 4.0) 0 \n",
+ "3 [0.0, 1.0) 0 \n",
+ " [1.0, 2.0) 1 \n",
+ " [2.0, 3.0) 1 \n",
+ " [3.0, 4.0) 1 \n",
+ "4 [0.0, 1.0) 0 \n",
+ " [1.0, 2.0) 0 \n",
+ " [2.0, 3.0) 0 \n",
+ " [3.0, 4.0) 0 \n",
+ "\n",
+ "[16 rows x 29 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import json\n",
+ "import sparse\n",
+ "\n",
+ "S = sparse.load_npz('output-2/S_all.npz')\n",
+ "S_names = json.load(open('output-2/S_all.feature_names.json', 'r'))\n",
+ "S_index = pd.read_csv('output-2/S.ID.csv').set_index(['ID'])\n",
+ "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n",
+ "\n",
+ "X = sparse.load_npz('output-2/X_all.npz')\n",
+ "X_names = json.load(open('output-2/X_all.feature_names.json', 'r'))\n",
+ "X_index = pd.read_csv('output-2/X.ID,t_range.csv').set_index(['ID', 't_range'])\n",
+ "df_X = pd.DataFrame(X.todense().reshape(-1, X.shape[-1]), columns=X_names, index=X_index.index)\n",
+ "\n",
+ "display(df_S)\n",
+ "display(df_X)"
+ ]
+ },
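+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "With `use_ordinal_encoding: no`, each numeric feature becomes a one-hot encoding over quantile bins (columns like `AGE_value_(32.999, 35.8]`), so at most one bin fires per row; with ordinal encoding enabled, the columns would instead be cumulative `>=` indicators (compare the `AGE_value>=33.0`-style columns in the Docker-based notebook). A quick check of the one-hot property:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Each AGE bin group should have at most one active indicator per row\n",
+    "age_cols = [c for c in df_S.columns if c.startswith('AGE_value_')]\n",
+    "assert (df_S[age_cols].sum(axis=1) <= 1).all()"
+   ]
+  },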
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Test 2.1: predetermined discretization bins"
+ ]
+ },
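+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here the bin edges are not recomputed from the data but read from JSON files (per the run log below: `input/s_all.discretization.json` and `input/X_all.discretization.json`). The exact schema of these files and of the `discretization` config key is not shown in this diff, so the cell below simply prints the raw file contents:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Peek at the predetermined bin edges (file name taken from the run log below)\n",
+    "print(open('input/X_all.discretization.json').read()[:500])"
+   ]
+  },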
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!mkdir -p output-2-bins"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Input:\n",
+ " Data : ./input/data.csv\n",
+ " Population: ./input/pop.csv\n",
+ " Config : ./input/config-2-bins.yaml\n",
+ "\n",
+ "Output directory: ./output-2-bins/\n",
+ "\n",
+ "Input arguments:\n",
+ " T = 4.0\n",
+ " dt = 1.0\n",
+ " θ₁ = 0.001\n",
+ " θ₂ = 0.001\n",
+ " θ_freq = 1.0\n",
+ " k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "discretize = yes\n",
+ " S discretization bins: input/s_all.discretization.json\n",
+ " X discretization bins: input/X_all.discretization.json\n",
+ "\n",
+ "N = 4\n",
+ "L = 4\n",
+ "\n",
+ "\n",
+ "================================================================================\n",
+ "2) Transform; 3) Post-filter\n",
+ "================================================================================\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Detecting and parsing value types\n",
+ "--------------------------------------------------------------------------------\n",
+ "Saved as: ./output-2-bins/value_types.csv\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Separate time-invariant and time-dependent\n",
+ "--------------------------------------------------------------------------------\n",
+ "Variables (time-invariant): 3\n",
+ "Variables (time-dependent): 4\n",
+ "# rows (time-invariant): 9\n",
+ "# rows (time-dependent): 23\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-A) Transform time-invariant data\n",
+ "--------------------------------------------------------------------------------\n",
+ "(N × ^d) table :\t (4, 3)\n",
+ "number of missing entries :\t 3 out of 12 total\n",
+ "Time elapsed: 0.018257 seconds\n",
+ "\n",
+ "Output\n",
+ "s_all, binary features :\t (4, 10)\n",
+ "Time elapsed: 0.055306 seconds\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-B) Transform time-dependent data\n",
+ "--------------------------------------------------------------------------------\n",
+ "Total variables : 4\n",
+ "Frequent variables : ['HR']\n",
+ "M₁ = 1\n",
+ "M₂ = 3\n",
+ "k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "Transforming each example...\n",
+ "100%|█████████████████████████████████████████████| 4/4 [00:00<00:00, 5.98it/s]\n",
+ "DONE: Transforming each example...\n",
+ "(freq) number of missing entries :\t 5 out of 4×4×1=16 total\n",
+ "(freq) number of imputed entries :\t 4\n",
+ "(freq) number of not imputed entries :\t 1\n",
+ "(non-freq) number of missing entries :\t 41 out of 4×4×3=48 total\n",
+ "\n",
+ "(N × L × ^D) table :\t (4, 4, 9)\n",
+ "Time elapsed: 0.727093 seconds\n",
+ "Discretizing features...\n",
+ "\n",
+ "Processing 8 non-boolean variable columns...\n",
+ " Usng predetermined bin edges for numeric variables...\n",
+ " Discretizing variables to binary features\n",
+ "100%|█████████████████████████████████████████████| 8/8 [00:00<00:00, 99.05it/s]\n",
+ "Finished discretizing features\n",
+ "\n",
+ "Output\n",
+ "X_all: shape=(4, 4, 29), density=0.203\n",
+ "Time elapsed: 0.828591 seconds\n",
+ "\u001b[0m"
+ ]
+ }
+ ],
+ "source": [
+ "! PYTHONPATH=\"$PYTHONPATH:../../\" \\\n",
+ "python -m FIDDLE.run \\\n",
+ " --data_fname='./input/data.csv' \\\n",
+ " --population_fname='./input/pop.csv' \\\n",
+ " --config_fname='./input/config-2-bins.yaml' \\\n",
+ " --output_dir='./output-2-bins/' \\\n",
+ " --T=4 --dt=1.0 \\\n",
+ " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n",
+ " --stats_functions 'min' 'max' 'mean' \\\n",
+ " --no_prefilter --no_postfilter"
+ ]
+ },
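+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If the predetermined bin files were exported from Test 2's run (an assumption; the log above does report the same output shape and density), the feature tensors from `output-2` and `output-2-bins` should match exactly:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sparse\n",
+    "\n",
+    "# Compare Test 2 (bins computed from data) against Test 2.1 (predetermined bins)\n",
+    "X2 = sparse.load_npz('output-2/X_all.npz')\n",
+    "X2b = sparse.load_npz('output-2-bins/X_all.npz')\n",
+    "assert X2.shape == X2b.shape\n",
+    "assert (X2.todense() == X2b.todense()).all()"
+   ]
+  },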
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " AGE_value_(32.999, 35.8] | \n",
+ " AGE_value_(35.8, 38.6] | \n",
+ " AGE_value_(38.6, 42.0] | \n",
+ " AGE_value_(42.0, 46.0] | \n",
+ " AGE_value_(46.0, 50.0] | \n",
+ " ROOM_value__101 | \n",
+ " ROOM_value__102 | \n",
+ " ROOM_value__103 | \n",
+ " SEX_value_F | \n",
+ " SEX_value_M | \n",
+ "
\n",
+ " \n",
+ " ID | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " AGE_value_(32.999, 35.8] AGE_value_(35.8, 38.6] AGE_value_(38.6, 42.0] \\\n",
+ "ID \n",
+ "1 0 0 0 \n",
+ "2 1 0 0 \n",
+ "3 0 0 1 \n",
+ "4 0 0 0 \n",
+ "\n",
+ " AGE_value_(42.0, 46.0] AGE_value_(46.0, 50.0] ROOM_value__101 \\\n",
+ "ID \n",
+ "1 0 1 1 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "\n",
+ " ROOM_value__102 ROOM_value__103 SEX_value_F SEX_value_M \n",
+ "ID \n",
+ "1 0 0 0 1 \n",
+ "2 1 0 0 1 \n",
+ "3 0 1 1 0 \n",
+ "4 0 0 0 0 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " HR_mask | \n",
+ " DRUG_A_RATE_value_48 | \n",
+ " DRUG_A_ROUTE_value_Bolus | \n",
+ " DRUG_A_ROUTE_value_IV | \n",
+ " DRUG_A_ROUTE_value_Oral | \n",
+ " LAB_X_value_5 | \n",
+ " LAB_X_value_<1 | \n",
+ " HR_delta_time_(-0.001, 1.0] | \n",
+ " HR_delta_time_(1.0, 2.0] | \n",
+ " HR_value_(60.999, 69.2] | \n",
+ " ... | \n",
+ " HR_max_(60.999, 69.2] | \n",
+ " HR_max_(69.2, 73.6] | \n",
+ " HR_max_(73.6, 76.2] | \n",
+ " HR_max_(76.2, 82.0] | \n",
+ " HR_max_(82.0, 90.0] | \n",
+ " HR_mean_(60.332, 68.8] | \n",
+ " HR_mean_(68.8, 73.6] | \n",
+ " HR_mean_(73.6, 75.2] | \n",
+ " HR_mean_(75.2, 82.0] | \n",
+ " HR_mean_(82.0, 90.0] | \n",
+ "
\n",
+ " \n",
+ " ID | \n",
+ " t_range | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " [0.0, 1.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [1.0, 2.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [2.0, 3.0) | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [3.0, 4.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " [0.0, 1.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [1.0, 2.0) | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [2.0, 3.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [3.0, 4.0) | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " [0.0, 1.0) | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [1.0, 2.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " [2.0, 3.0) | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " [3.0, 4.0) | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " [0.0, 1.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [1.0, 2.0) | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [2.0, 3.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " [3.0, 4.0) | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
16 rows × 29 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " HR_mask DRUG_A_RATE_value_48 DRUG_A_ROUTE_value_Bolus \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 1 0 0 \n",
+ " [1.0, 2.0) 1 0 0 \n",
+ " [2.0, 3.0) 1 1 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "2 [0.0, 1.0) 1 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 1 0 0 \n",
+ " [3.0, 4.0) 1 1 0 \n",
+ "3 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 1 0 1 \n",
+ " [2.0, 3.0) 0 0 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "4 [0.0, 1.0) 1 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 1 0 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "\n",
+ " DRUG_A_ROUTE_value_IV DRUG_A_ROUTE_value_Oral LAB_X_value_5 \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 1 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "2 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 0 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "3 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 0 1 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "4 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 0 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "\n",
+ " LAB_X_value_<1 HR_delta_time_(-0.001, 1.0] \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 1 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 1 1 \n",
+ " [3.0, 4.0) 0 1 \n",
+ "2 [0.0, 1.0) 0 1 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 0 1 \n",
+ " [3.0, 4.0) 0 1 \n",
+ "3 [0.0, 1.0) 0 1 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 0 1 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "4 [0.0, 1.0) 0 1 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 0 1 \n",
+ " [3.0, 4.0) 0 1 \n",
+ "\n",
+ " HR_delta_time_(1.0, 2.0] HR_value_(60.999, 69.2] ... \\\n",
+ "ID t_range ... \n",
+ "1 [0.0, 1.0) 0 0 ... \n",
+ " [1.0, 2.0) 0 0 ... \n",
+ " [2.0, 3.0) 0 0 ... \n",
+ " [3.0, 4.0) 0 0 ... \n",
+ "2 [0.0, 1.0) 0 1 ... \n",
+ " [1.0, 2.0) 0 1 ... \n",
+ " [2.0, 3.0) 0 0 ... \n",
+ " [3.0, 4.0) 0 0 ... \n",
+ "3 [0.0, 1.0) 0 0 ... \n",
+ " [1.0, 2.0) 0 0 ... \n",
+ " [2.0, 3.0) 0 0 ... \n",
+ " [3.0, 4.0) 1 0 ... \n",
+ "4 [0.0, 1.0) 0 0 ... \n",
+ " [1.0, 2.0) 0 0 ... \n",
+ " [2.0, 3.0) 0 1 ... \n",
+ " [3.0, 4.0) 0 0 ... \n",
+ "\n",
+ " HR_max_(60.999, 69.2] HR_max_(69.2, 73.6] \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 1 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "2 [0.0, 1.0) 1 0 \n",
+ " [1.0, 2.0) 1 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "3 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "4 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 1 0 \n",
+ " [3.0, 4.0) 0 1 \n",
+ "\n",
+ " HR_max_(73.6, 76.2] HR_max_(76.2, 82.0] HR_max_(82.0, 90.0] \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 1 0 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "2 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 1 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "3 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 1 \n",
+ " [2.0, 3.0) 0 0 1 \n",
+ " [3.0, 4.0) 0 0 1 \n",
+ "4 [0.0, 1.0) 0 1 0 \n",
+ " [1.0, 2.0) 0 1 0 \n",
+ " [2.0, 3.0) 0 0 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "\n",
+ " HR_mean_(60.332, 68.8] HR_mean_(68.8, 73.6] \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 1 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "2 [0.0, 1.0) 1 0 \n",
+ " [1.0, 2.0) 1 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "3 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "4 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 1 0 \n",
+ " [3.0, 4.0) 0 1 \n",
+ "\n",
+ " HR_mean_(73.6, 75.2] HR_mean_(75.2, 82.0] \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 1 0 \n",
+ " [3.0, 4.0) 1 0 \n",
+ "2 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 0 1 \n",
+ " [3.0, 4.0) 1 0 \n",
+ "3 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "4 [0.0, 1.0) 0 1 \n",
+ " [1.0, 2.0) 0 1 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "\n",
+ " HR_mean_(82.0, 90.0] \n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 \n",
+ " [1.0, 2.0) 0 \n",
+ " [2.0, 3.0) 0 \n",
+ " [3.0, 4.0) 0 \n",
+ "2 [0.0, 1.0) 0 \n",
+ " [1.0, 2.0) 0 \n",
+ " [2.0, 3.0) 0 \n",
+ " [3.0, 4.0) 0 \n",
+ "3 [0.0, 1.0) 0 \n",
+ " [1.0, 2.0) 1 \n",
+ " [2.0, 3.0) 1 \n",
+ " [3.0, 4.0) 1 \n",
+ "4 [0.0, 1.0) 0 \n",
+ " [1.0, 2.0) 0 \n",
+ " [2.0, 3.0) 0 \n",
+ " [3.0, 4.0) 0 \n",
+ "\n",
+ "[16 rows x 29 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import json\n",
+ "import sparse\n",
+ "\n",
+ "S = sparse.load_npz('output-2-bins/S_all.npz')\n",
+ "S_names = json.load(open('output-2-bins/S_all.feature_names.json', 'r'))\n",
+ "S_index = pd.read_csv('output-2-bins/S.ID.csv').set_index(['ID'])\n",
+ "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n",
+ "\n",
+ "X = sparse.load_npz('output-2-bins/X_all.npz')\n",
+ "X_names = json.load(open('output-2-bins/X_all.feature_names.json', 'r'))\n",
+ "X_index = pd.read_csv('output-2-bins/X.ID,t_range.csv').set_index(['ID', 't_range'])\n",
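+     "# X_all has shape (N, L, D); flatten to (N*L, D) so rows align with the (ID, t_range) index\n",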
+ "df_X = pd.DataFrame(X.todense().reshape(-1, X.shape[-1]), columns=X_names, index=X_index.index)\n",
+ "\n",
+ "display(df_S)\n",
+ "display(df_X)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Test 3: discretize = True, use_ordinal_encoding = True"
+ ]
+ },
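+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "With `use_ordinal_encoding: yes`, each numeric variable is encoded as cumulative threshold indicators (`AGE_value>=33.0`, `AGE_value>=35.8`, ...) instead of the one-hot interval features produced when ordinal encoding is off (`HR_value_(60.999, 69.2]`-style): a value switches on every indicator whose threshold lies below it, as the `df_S` and `df_X` outputs below illustrate."
+    ]
+   },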
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!mkdir -p output-3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Input:\n",
+ " Data : ./input/data.csv\n",
+ " Population: ./input/pop.csv\n",
+ " Config : ./input/config-3.yaml\n",
+ "\n",
+ "Output directory: ./output-3/\n",
+ "\n",
+ "Input arguments:\n",
+ " T = 4.0\n",
+ " dt = 1.0\n",
+ " θ₁ = 0.001\n",
+ " θ₂ = 0.001\n",
+ " θ_freq = 1.0\n",
+ " k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "discretize = yes\n",
+ " S discretization bins: to be computed from data\n",
+ " X discretization bins: to be computed from data\n",
+ "\n",
+ "N = 4\n",
+ "L = 4\n",
+ "\n",
+ "\n",
+ "================================================================================\n",
+ "2) Transform; 3) Post-filter\n",
+ "================================================================================\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Detecting and parsing value types\n",
+ "--------------------------------------------------------------------------------\n",
+ "Saved as: ./output-3/value_types.csv\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "*) Separate time-invariant and time-dependent\n",
+ "--------------------------------------------------------------------------------\n",
+ "Variables (time-invariant): 3\n",
+ "Variables (time-dependent): 4\n",
+ "# rows (time-invariant): 9\n",
+ "# rows (time-dependent): 23\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-A) Transform time-invariant data\n",
+ "--------------------------------------------------------------------------------\n",
+ "(N × ^d) table :\t (4, 3)\n",
+ "number of missing entries :\t 3 out of 12 total\n",
+ "Time elapsed: 0.018871 seconds\n",
+ "\n",
+ "Output\n",
+ "s_all, binary features :\t (4, 10)\n",
+ "Time elapsed: 0.061661 seconds\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "2-B) Transform time-dependent data\n",
+ "--------------------------------------------------------------------------------\n",
+ "Total variables : 4\n",
+ "Frequent variables : ['HR']\n",
+ "M₁ = 1\n",
+ "M₂ = 3\n",
+ "k = 3 ['min', 'max', 'mean']\n",
+ "\n",
+ "Transforming each example...\n",
+ "100%|█████████████████████████████████████████████| 4/4 [00:00<00:00, 5.89it/s]\n",
+ "DONE: Transforming each example...\n",
+ "(freq) number of missing entries :\t 5 out of 4×4×1=16 total\n",
+ "(freq) number of imputed entries :\t 4\n",
+ "(freq) number of not imputed entries :\t 1\n",
+ "(non-freq) number of missing entries :\t 41 out of 4×4×3=48 total\n",
+ "\n",
+ "(N × L × ^D) table :\t (4, 4, 9)\n",
+ "Time elapsed: 0.735244 seconds\n",
+ "Discretizing features...\n",
+ "\n",
+ "Processing 8 non-boolean variable columns...\n",
+ " Computing bin edges for numeric variables...\n",
+ "100%|████████████████████████████████████████████| 8/8 [00:00<00:00, 313.93it/s]\n",
+ " Discretizing variables to binary features\n",
+ "100%|█████████████████████████████████████████████| 8/8 [00:00<00:00, 38.42it/s]\n",
+ "Finished discretizing features\n",
+ "\n",
+ "Output\n",
+ "X_all: shape=(4, 4, 29), density=0.420\n",
+ "Time elapsed: 0.989317 seconds\n",
+ "\u001b[0m"
+ ]
+ }
+ ],
+ "source": [
+ "! PYTHONPATH=\"$PYTHONPATH:../../\" \\\n",
+ "python -m FIDDLE.run \\\n",
+ " --data_fname='./input/data.csv' \\\n",
+ " --population_fname='./input/pop.csv' \\\n",
+ " --config_fname='./input/config-3.yaml' \\\n",
+ " --output_dir='./output-3/' \\\n",
+ " --T=4 --dt=1.0 \\\n",
+ " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n",
+ " --stats_functions 'min' 'max' 'mean' \\\n",
+ " --no_prefilter --no_postfilter"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+       "[static HTML rendering of df_S omitted: 4 rows × 10 binary features; see the text/plain output below]"
+ ],
+ "text/plain": [
+ " AGE_value>=33.0 AGE_value>=35.8 AGE_value>=38.6 AGE_value>=42.0 \\\n",
+ "ID \n",
+ "1 1 1 1 1 \n",
+ "2 0 0 0 0 \n",
+ "3 1 1 1 0 \n",
+ "4 0 0 0 0 \n",
+ "\n",
+ " AGE_value>=46.0 ROOM_value__101 ROOM_value__102 ROOM_value__103 \\\n",
+ "ID \n",
+ "1 1 1 0 0 \n",
+ "2 0 0 1 0 \n",
+ "3 0 0 0 1 \n",
+ "4 0 0 0 0 \n",
+ "\n",
+ " SEX_value_F SEX_value_M \n",
+ "ID \n",
+ "1 0 1 \n",
+ "2 0 1 \n",
+ "3 1 0 \n",
+ "4 0 0 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+       "[static HTML rendering of df_X omitted: 16 rows × 29 columns; see the text/plain output below]"
+ ],
+ "text/plain": [
+ " HR_mask DRUG_A_RATE_value_48 DRUG_A_ROUTE_value_Bolus \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 1 0 0 \n",
+ " [1.0, 2.0) 1 0 0 \n",
+ " [2.0, 3.0) 1 1 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "2 [0.0, 1.0) 1 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 1 0 0 \n",
+ " [3.0, 4.0) 1 1 0 \n",
+ "3 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 1 0 1 \n",
+ " [2.0, 3.0) 0 0 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "4 [0.0, 1.0) 1 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 1 0 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "\n",
+ " DRUG_A_ROUTE_value_IV DRUG_A_ROUTE_value_Oral LAB_X_value_5 \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 1 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "2 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 0 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "3 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 0 1 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "4 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 0 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "\n",
+ " LAB_X_value_<1 HR_delta_time>=0.0 HR_delta_time>=1.0 \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 1 0 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "2 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 1 0 \n",
+ " [2.0, 3.0) 0 0 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "3 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 0 1 0 \n",
+ " [3.0, 4.0) 0 1 1 \n",
+ "4 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 1 0 \n",
+ " [2.0, 3.0) 0 0 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "\n",
+ " HR_value>=61.0 ... HR_max>=61.0 HR_max>=69.2 HR_max>=73.6 \\\n",
+ "ID t_range ... \n",
+ "1 [0.0, 1.0) 1 ... 1 1 0 \n",
+ " [1.0, 2.0) 1 ... 1 1 0 \n",
+ " [2.0, 3.0) 1 ... 1 1 1 \n",
+ " [3.0, 4.0) 1 ... 1 1 1 \n",
+ "2 [0.0, 1.0) 0 ... 0 0 0 \n",
+ " [1.0, 2.0) 0 ... 0 0 0 \n",
+ " [2.0, 3.0) 1 ... 1 1 1 \n",
+ " [3.0, 4.0) 1 ... 1 1 1 \n",
+ "3 [0.0, 1.0) 0 ... 0 0 0 \n",
+ " [1.0, 2.0) 1 ... 1 1 1 \n",
+ " [2.0, 3.0) 1 ... 1 1 1 \n",
+ " [3.0, 4.0) 1 ... 1 1 1 \n",
+ "4 [0.0, 1.0) 1 ... 1 1 1 \n",
+ " [1.0, 2.0) 1 ... 1 1 1 \n",
+ " [2.0, 3.0) 1 ... 1 0 0 \n",
+ " [3.0, 4.0) 1 ... 1 1 0 \n",
+ "\n",
+ " HR_max>=76.2 HR_max>=82.00000000000001 \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "2 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 0 0 \n",
+ " [2.0, 3.0) 1 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "3 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 1 1 \n",
+ " [2.0, 3.0) 1 1 \n",
+ " [3.0, 4.0) 1 1 \n",
+ "4 [0.0, 1.0) 1 0 \n",
+ " [1.0, 2.0) 1 0 \n",
+ " [2.0, 3.0) 0 0 \n",
+ " [3.0, 4.0) 0 0 \n",
+ "\n",
+ " HR_mean>=60.333333333333336 HR_mean>=68.80000000000001 \\\n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 1 1 \n",
+ " [1.0, 2.0) 1 1 \n",
+ " [2.0, 3.0) 1 1 \n",
+ " [3.0, 4.0) 1 1 \n",
+ "2 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 1 0 \n",
+ " [2.0, 3.0) 1 1 \n",
+ " [3.0, 4.0) 1 1 \n",
+ "3 [0.0, 1.0) 0 0 \n",
+ " [1.0, 2.0) 1 1 \n",
+ " [2.0, 3.0) 1 1 \n",
+ " [3.0, 4.0) 1 1 \n",
+ "4 [0.0, 1.0) 1 1 \n",
+ " [1.0, 2.0) 1 1 \n",
+ " [2.0, 3.0) 1 0 \n",
+ " [3.0, 4.0) 1 1 \n",
+ "\n",
+ " HR_mean>=73.6 HR_mean>=75.2 HR_mean>=82.00000000000001 \n",
+ "ID t_range \n",
+ "1 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 1 0 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "2 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 0 0 0 \n",
+ " [2.0, 3.0) 1 1 0 \n",
+ " [3.0, 4.0) 1 0 0 \n",
+ "3 [0.0, 1.0) 0 0 0 \n",
+ " [1.0, 2.0) 1 1 1 \n",
+ " [2.0, 3.0) 1 1 1 \n",
+ " [3.0, 4.0) 1 1 1 \n",
+ "4 [0.0, 1.0) 1 1 0 \n",
+ " [1.0, 2.0) 1 1 0 \n",
+ " [2.0, 3.0) 0 0 0 \n",
+ " [3.0, 4.0) 0 0 0 \n",
+ "\n",
+ "[16 rows x 29 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import json\n",
+ "import sparse\n",
+ "\n",
+ "S = sparse.load_npz('output-3/S_all.npz')\n",
+ "S_names = json.load(open('output-3/S_all.feature_names.json', 'r'))\n",
+ "S_index = pd.read_csv('output-3/S.ID.csv').set_index(['ID'])\n",
+ "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n",
+ "\n",
+ "X = sparse.load_npz('output-3/X_all.npz')\n",
+ "X_names = json.load(open('output-3/X_all.feature_names.json', 'r'))\n",
+ "X_index = pd.read_csv('output-3/X.ID,t_range.csv').set_index(['ID', 't_range'])\n",
+ "df_X = pd.DataFrame(X.todense().reshape(-1, X.shape[-1]), columns=X_names, index=X_index.index)\n",
+ "\n",
+ "display(df_S)\n",
+ "display(df_X)"
+ ]
+ },
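+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "The interval feature names above (e.g. `HR_value_(60.999, 69.2]`) match `pandas.cut` labels. A minimal sketch, assuming `pd.cut` semantics, of how the saved edges in `input/X_all.discretization.json` map raw values onto those intervals:"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import json\n",
+     "import pandas as pd\n",
+     "\n",
+     "# Bin edges saved from an earlier run (reused above via config-2-bins.yaml)\n",
+     "bins = json.load(open('input/X_all.discretization.json'))\n",
+     "# HR_value edges: [61.0, 69.2, 73.6, 76.2, 82.00000000000001, 90.0]\n",
+     "# Expected intervals: [(60.999, 69.2], (73.6, 76.2], (82.0, 90.0]]\n",
+     "pd.cut([62, 75, 89], bins['HR_value'], include_lowest=True)"
+    ]
+   },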
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/tests/small_test/input/X_all.discretization.json b/tests/small_test/input/X_all.discretization.json
new file mode 100644
index 0000000000000000000000000000000000000000..03d039879cb3c1d58b76e4ec0f5f59a41da86118
--- /dev/null
+++ b/tests/small_test/input/X_all.discretization.json
@@ -0,0 +1 @@
+{"DRUG_A_RATE_value": null, "DRUG_A_ROUTE_value": null, "LAB_X_value": null, "HR_delta_time": [0.0, 1.0, 2.0], "HR_value": [61.0, 69.2, 73.6, 76.2, 82.00000000000001, 90.0], "HR_min": [60.0, 68.4, 73.0, 75.0, 82.00000000000001, 90.0], "HR_max": [61.0, 69.2, 73.6, 76.2, 82.00000000000001, 90.0], "HR_mean": [60.333333333333336, 68.80000000000001, 73.6, 75.2, 82.00000000000001, 90.0]}
\ No newline at end of file
diff --git a/tests/small_test/input/config-1.yaml b/tests/small_test/input/config-1.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..679862de171094f80f20a98d99579939a1d7b665
--- /dev/null
+++ b/tests/small_test/input/config-1.yaml
@@ -0,0 +1,7 @@
+discretize: no
+
+parallel: no
+n_jobs: 1
+
+value_types:
+ ROOM: Categorical
diff --git a/tests/small_test/input/config-2-bins.yaml b/tests/small_test/input/config-2-bins.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..aa79b4bd4a9e4f4cba5cb9bcc1d93291819d70fc
--- /dev/null
+++ b/tests/small_test/input/config-2-bins.yaml
@@ -0,0 +1,10 @@
+discretize: yes
+use_ordinal_encoding: no
+S_discretization_bins: 'input/S_all.discretization.json'
+X_discretization_bins: 'input/X_all.discretization.json'
+
+parallel: no
+n_jobs: 1
+
+value_types:
+ ROOM: Categorical
diff --git a/tests/small_test/input/config-2.yaml b/tests/small_test/input/config-2.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e51bb502223b6129081521e1e34056db5b8c63e5
--- /dev/null
+++ b/tests/small_test/input/config-2.yaml
@@ -0,0 +1,8 @@
+discretize: yes
+use_ordinal_encoding: no
+
+parallel: no
+n_jobs: 1
+
+value_types:
+ ROOM: Categorical
diff --git a/tests/small_test/input/config-3.yaml b/tests/small_test/input/config-3.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..370e730db136225c1e2cd563fbf2a0737eae2fd6
--- /dev/null
+++ b/tests/small_test/input/config-3.yaml
@@ -0,0 +1,8 @@
+discretize: yes
+use_ordinal_encoding: yes
+
+parallel: no
+n_jobs: 1
+
+value_types:
+ ROOM: Categorical
diff --git a/test/small_test/input_data.csv b/tests/small_test/input/data.csv
similarity index 77%
rename from test/small_test/input_data.csv
rename to tests/small_test/input/data.csv
index 86239ef9d15bc7da8f46d9897855faaf09335a8e..2dc23c5d6688c40c8745b8d3fd5a709c7432957e 100644
--- a/test/small_test/input_data.csv
+++ b/tests/small_test/input/data.csv
@@ -1,6 +1,7 @@
ID,t,variable_name,variable_value
1,NULL,AGE,50
2,NULL,AGE,33
+3,NULL,AGE,40
1,NULL,SEX,M
2,NULL,SEX,M
3,NULL,SEX,F
@@ -20,13 +21,13 @@ ID,t,variable_name,variable_value
2,2.9,HR,78
2,3.5,HR,75
3,1.7,HR,90
+4,0.7,HR,80
+4,2.5,HR,62
+4,3.9,HR,73
1,2.3,DRUG_A_RATE,48
2,3.4,DRUG_A_RATE,48
-1,2.3,DRUG_A_ROUTE,Mouth
-2,3.4,DRUG_A_ROUTE,Cont.IV
+1,2.3,DRUG_A_ROUTE,Oral
+2,3.4,DRUG_A_ROUTE,IV
3,1,DRUG_A_ROUTE,Bolus
1,2.3,LAB_X,<1
-3,2.7,LAB_X,5
-4,0.7,HR,80
-4,2.5,HR,62
-4,3.9,HR,73
\ No newline at end of file
+3,2.7,LAB_X,5
\ No newline at end of file
diff --git a/test/small_test/pop.csv b/tests/small_test/input/pop.csv
similarity index 100%
rename from test/small_test/pop.csv
rename to tests/small_test/input/pop.csv
index 4d6ba9bc759bda91e8b5dd2ab73db68a9e7bb83b..a911283dac17180ad2b16dab766a87adf56de95c 100644
--- a/test/small_test/pop.csv
+++ b/tests/small_test/input/pop.csv
@@ -1,5 +1,5 @@
ID
-1
-2
3
4
+1
+2
diff --git a/tests/small_test/input/S_all.discretization.json b/tests/small_test/input/S_all.discretization.json
new file mode 100644
index 0000000000000000000000000000000000000000..6b746e3b311206f5102f18efa0006b3476905f80
--- /dev/null
+++ b/tests/small_test/input/S_all.discretization.json
@@ -0,0 +1 @@
+{"AGE_value": [33.0, 35.8, 38.6, 42.0, 46.0, 50.0], "ROOM_value": null, "SEX_value": null}
\ No newline at end of file
diff --git a/tests/small_test/reference-1/df_S.csv b/tests/small_test/reference-1/df_S.csv
new file mode 100644
index 0000000000000000000000000000000000000000..d055870959d3625ee26ae058bd0f041737aac6ba
--- /dev/null
+++ b/tests/small_test/reference-1/df_S.csv
@@ -0,0 +1,5 @@
+ID,AGE_value,ROOM_value:_101,ROOM_value:_102,ROOM_value:_103,SEX_value:F,SEX_value:M
+1,50.0,1.0,0.0,0.0,0.0,1.0
+2,33.0,0.0,1.0,0.0,0.0,1.0
+3,40.0,0.0,0.0,1.0,1.0,0.0
+4,41.0,0.0,0.0,0.0,0.0,0.0
diff --git a/tests/small_test/reference-1/df_X.csv b/tests/small_test/reference-1/df_X.csv
new file mode 100644
index 0000000000000000000000000000000000000000..283ba54ffe73b95cfca7784a08947f5416b44ee3
--- /dev/null
+++ b/tests/small_test/reference-1/df_X.csv
@@ -0,0 +1,17 @@
+ID,t_range,HR_mask,DRUG_A_RATE_value,DRUG_A_ROUTE_value:Bolus,DRUG_A_ROUTE_value:IV,DRUG_A_ROUTE_value:Oral,LAB_X_value,HR_delta_time,HR_value,HR_min,HR_max,HR_mean,LAB_X_value_str:<1
+1,"[0.0, 1.0)",1.0,48.0,0.0,0.0,0.0,5.0,0.0,71.0,70.0,71.0,70.5,0.0
+1,"[1.0, 2.0)",1.0,48.0,0.0,0.0,0.0,5.0,0.0,73.0,72.0,73.0,72.5,0.0
+1,"[2.0, 3.0)",1.0,48.0,0.0,0.0,1.0,5.0,0.0,74.0,74.0,74.0,74.0,1.0
+1,"[3.0, 4.0)",1.0,48.0,0.0,0.0,0.0,5.0,0.0,75.0,75.0,75.0,75.0,0.0
+2,"[0.0, 1.0)",1.0,48.0,0.0,0.0,0.0,5.0,0.0,61.0,60.0,61.0,60.333333333333336,0.0
+2,"[1.0, 2.0)",0.0,48.0,0.0,0.0,0.0,5.0,1.0,61.0,61.0,61.0,61.0,0.0
+2,"[2.0, 3.0)",1.0,48.0,0.0,0.0,0.0,5.0,0.0,78.0,73.0,78.0,75.5,0.0
+2,"[3.0, 4.0)",1.0,48.0,0.0,1.0,0.0,5.0,0.0,75.0,75.0,75.0,75.0,0.0
+3,"[0.0, 1.0)",0.0,48.0,0.0,0.0,0.0,5.0,0.0,75.53333333333333,75.0,75.53333333333333,75.25555555555556,0.0
+3,"[1.0, 2.0)",1.0,48.0,1.0,0.0,0.0,5.0,0.0,90.0,90.0,90.0,90.0,0.0
+3,"[2.0, 3.0)",0.0,48.0,0.0,0.0,0.0,5.0,1.0,90.0,90.0,90.0,90.0,0.0
+3,"[3.0, 4.0)",0.0,48.0,0.0,0.0,0.0,5.0,2.0,90.0,90.0,90.0,90.0,0.0
+4,"[0.0, 1.0)",1.0,48.0,0.0,0.0,0.0,5.0,0.0,80.0,80.0,80.0,80.0,0.0
+4,"[1.0, 2.0)",0.0,48.0,0.0,0.0,0.0,5.0,1.0,80.0,80.0,80.0,80.0,0.0
+4,"[2.0, 3.0)",1.0,48.0,0.0,0.0,0.0,5.0,0.0,62.0,62.0,62.0,62.0,0.0
+4,"[3.0, 4.0)",1.0,48.0,0.0,0.0,0.0,5.0,0.0,73.0,73.0,73.0,73.0,0.0