diff --git a/.gitignore b/.gitignore index e74dffadfd3127242f2f41b25cf53ccc309c3bc9..95c242e15f8be52ead23d3efcf8fdbb1726bf68e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ data/* +**output**/ .ipynb_checkpoints *.png diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..b8b7570639559a207e2b482e0184ffa960abc1da --- /dev/null +++ b/Dockerfile @@ -0,0 +1,7 @@ +FROM python:3.8 +WORKDIR /workdir + +COPY requirements.txt . +RUN pip install -r requirements.txt + +COPY FIDDLE/ ./FIDDLE/ diff --git a/FIDDLE/config.yaml b/FIDDLE/config-default.yaml similarity index 84% rename from FIDDLE/config.yaml rename to FIDDLE/config-default.yaml index d30932a3992530e5f050c8bfcf33579b8b28e83f..b21c5fc5dca809dedee28c4f8620bfa446b87e78 100644 --- a/FIDDLE/config.yaml +++ b/FIDDLE/config-default.yaml @@ -5,10 +5,17 @@ column_names: var_name: variable_name var_value: variable_value -use_ordinal_encoding: no +parallel: yes +n_jobs: 72 +batch_size: 100 + hierarchical_sep: ":" hierarchical_levels: [0, 1, 2] +discretize: yes +use_ordinal_encoding: no +discretization: ~ + value_types: # enter the feature type that you would like to override in the following format: FIRST_WARDID: Categorical diff --git a/FIDDLE/config.py b/FIDDLE/config.py index 25fb05c2fbbe9e06ac540d3ef22b321f28bb421d..4b5f74e7d77ae559a1a797a61e297bfbb8ba7f2a 100644 --- a/FIDDLE/config.py +++ b/FIDDLE/config.py @@ -1,16 +1,27 @@ import os, yaml -with open(os.path.join(os.path.dirname(__file__), 'config.yaml')) as f: - config = yaml.full_load(f) +import copy -ID_col = config['column_names']['ID'] -var_col = config['column_names']['var_name'] -val_col = config['column_names']['var_value'] -t_col = config['column_names']['t'] -hierarchical_sep = config['hierarchical_sep'] -hierarchical_levels = config['hierarchical_levels'] +with open(os.path.join(os.path.dirname(__file__), 'config-default.yaml')) as f: + config_default = yaml.safe_load(f) -use_ordinal_encoding = config['use_ordinal_encoding'] -value_type_override = config['value_types'] +def load_config(fname): + config = copy.deepcopy(config_default) + if fname: + config_custom = yaml.safe_load(open(fname, 'r')) + for k, v in config_custom.items(): + config[k] = v + return config -parallel = True -n_jobs = 72 + +ID_col = 'ID' +t_col = 't' +var_col = 'variable_name' +val_col = 'variable_value' + +if 'column_names' in config_default: + ID_col = config_default['column_names'].get('ID', 'ID') + t_col = config_default['column_names'].get('t', 't') + var_col = config_default['column_names'].get('var_name', 'variable_name') + val_col = config_default['column_names'].get('var_value', 'variable_value') +else: + pass diff --git a/FIDDLE/helpers.py b/FIDDLE/helpers.py index 93b39916ddc100471b5fdd29f91fa4fe6afc04a2..18142e812c6eff70978e99c737f98de187bd0147 100644 --- a/FIDDLE/helpers.py +++ b/FIDDLE/helpers.py @@ -1,29 +1,19 @@ -import argparse -def str2bool(v): - if isinstance(v, bool): - return v - if v.lower() in ('yes', 'true', 't', 'y', '1'): - return True - elif v.lower() in ('no', 'false', 'f', 'n', '0'): - return False - else: - raise argparse.ArgumentTypeError('Boolean value expected.') - - -from .config import * import pandas as pd import numpy as np import scipy import sparse from collections import defaultdict - -from joblib import Parallel, delayed, parallel_backend from tqdm import tqdm from sklearn.feature_selection import VarianceThreshold import sklearn from collections import defaultdict +try: + from .config import * +except: + from 
config import * + def print_header(*content, char='='): print() print(char * 80) @@ -95,11 +85,11 @@ def get_unique_variables(df): return sorted(df[var_col].unique()) def get_frequent_numeric_variables(df_time_series, variables, threshold, args): - data_path = args.data_path + output_dir = args.output_dir df_population = args.df_population T, dt = args.T, args.dt - df_types = pd.read_csv(data_path + 'value_types.csv').set_index(var_col)['value_type'] + df_types = pd.read_csv(output_dir + 'value_types.csv').set_index(var_col)['value_type'] numeric_vars = [col for col in variables if df_types[col] == 'Numeric'] df_num_counts = calculate_variable_counts(df_time_series, df_population)[numeric_vars] #gets the count of each variable for each patient. variables_num_freq = df_num_counts.columns[df_num_counts.mean() >= threshold * np.floor(T/dt)] @@ -136,23 +126,41 @@ def select_dtype(df, dtype, dtypes=None): assert False return -def smart_qcut_dummify(x, q, use_ordinal_encoding=False): + +def compute_bin_edges(x, q): # ignore strings when performing qcut z = x.copy() z = z.apply(make_float) m = z.apply(np.isreal) + bin_edges = None if z.loc[m].dropna().nunique() > 1: # when more than one numeric values - if use_ordinal_encoding: - bin_edges = np.nanpercentile(z.loc[m].astype(float).to_numpy(), [0, 20, 40, 60, 80, 100]) - bin_edges = np.unique(bin_edges) - col_names = ['{}>={}'.format(z.name, bin_edge) for bin_edge in bin_edges[:-1]] - out = pd.DataFrame(0, z.index, col_names) - for i, bin_edge in enumerate(bin_edges[:-1]): - out.loc[m, col_names[i]] = (z.loc[m] > bin_edge).astype(int) - out = pd.concat([out, pd.get_dummies(z.where(~m, np.nan), prefix=z.name)], axis=1) + if z.loc[m].dropna().nunique() == 2: + pass + else: + bin_edges = list(np.unique(np.nanpercentile(z.loc[m].astype(float).values, np.linspace(0, 100, q+1)))) + return (x.name, bin_edges) + +def smart_qcut_dummify_parallel(first_arg): + return smart_qcut_dummify(*first_arg) + +def smart_qcut_dummify(x, bin_edges, use_ordinal_encoding=False): + # ignore strings when performing qcut + z = x.copy() + z = z.apply(make_float) + m = z.apply(np.isreal) + if z.loc[m].dropna().nunique() > 1: # when more than one unique numeric values + if z.loc[m].dropna().nunique() == 2: # when only two unique numeric values + out = pd.get_dummies(x, prefix=x.name) else: - z.loc[m] = pd.qcut(z.loc[m].to_numpy(), q=q, duplicates='drop') - out = pd.get_dummies(z, prefix=z.name) + if use_ordinal_encoding: + col_names = ['{}>={}'.format(z.name, bin_edge) for bin_edge in bin_edges[:-1]] + out = pd.DataFrame(0, z.index, col_names) + for i, bin_edge in enumerate(bin_edges[:-1]): + out.loc[m, col_names[i]] = (z.loc[m] >= bin_edge).astype(int) + out = pd.concat([out, pd.get_dummies(z.where(~m, np.nan), prefix=z.name)], axis=1) + else: + z.loc[m] = pd.cut(z.loc[m].to_numpy(), bin_edges, duplicates='drop', include_lowest=True) + out = pd.get_dummies(z, prefix=z.name) else: out = pd.get_dummies(x, prefix=x.name) return out @@ -202,13 +210,13 @@ def pivot_event_table(df): # Handle cases where the same variable is recorded multiple times with the same timestamp # Adjust the timestamps by epsilon so that all timestamps are unique eps = 1e-6 - m_dups = df.duplicated([ID_col, t_col, var_col], keep=False) + m_dups = df.duplicated([t_col, var_col], keep=False) df_dups = df[m_dups].copy() for v, df_v in df_dups.groupby(var_col): df_dups.loc[df_v.index, t_col] += eps * np.arange(len(df_v)) df = pd.concat([df[~m_dups], df_dups]) - assert not df.duplicated([ID_col, t_col, 
var_col], keep=False).any() + assert not df.duplicated([t_col, var_col], keep=False).any() return pd.pivot_table(df, val_col, t_col, var_col, 'first') diff --git a/FIDDLE/run.py b/FIDDLE/run.py old mode 100755 new mode 100644 index 7caf2f72f4d51a3495ddcd8238573d4749939a3c..c39802ac518aa432c29573d03fdb96f8176f100a --- a/FIDDLE/run.py +++ b/FIDDLE/run.py @@ -1,112 +1,141 @@ -from .config import * -import pickle import pandas as pd import numpy as np +import pickle import time import os - +import yaml +import json import argparse -from .helpers import str2bool - -parser = argparse.ArgumentParser(description='') -parser.add_argument('--T', type=float, required=True) -parser.add_argument('--dt', type=float, required=True) -parser.add_argument('--theta_1', type=float, default=0.001) -parser.add_argument('--theta_2', type=float, default=0.001) -parser.add_argument('--theta_freq', type=float, default=1.0) -parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean']) -parser.add_argument('--binarize', type=str2bool, default=True, nargs='?', const=True) - -parser.add_argument('--data_path', type=str, required=True) -parser.add_argument('--input_fname', type=str, required=False) -parser.add_argument('--population', type=str, required=True) -parser.add_argument('--N', type=int, required=False) -parser.add_argument('--Ds', nargs='+', type=int) - -parser.add_argument('--no_prefilter', dest='prefilter', action='store_false') -parser.add_argument('--no_postfilter', dest='postfilter', action='store_false') -parser.set_defaults(prefilter=True, postfilter=True) - -args = parser.parse_args() - -data_path = args.data_path -if not data_path.endswith('/'): - data_path += '/' - -population = args.population -T = int(args.T) -dt = args.dt -theta_1 = args.theta_1 -theta_2 = args.theta_2 -theta_freq = args.theta_freq -stats_functions = args.stats_functions -binarize = args.binarize - -df_population = pd.read_csv(population).set_index('ID') -N = args.N or len(df_population) -df_population = df_population.iloc[:args.N] -L = int(np.floor(T/dt)) - -args.df_population = df_population -args.N = N -args.L = L -args.parallel = parallel - -if args.input_fname and os.path.isfile(args.input_fname): - input_fname = args.input_fname - if input_fname.endswith('.p' or '.pickle'): - df_data = pd.read_pickle(input_fname) - elif input_fname.endswith('.csv'): - df_data = pd.read_csv(input_fname) + +import FIDDLE.config as FIDDLE_config +import FIDDLE.steps as FIDDLE_steps + +def main(): + ###### + # User arguments + ###### + parser = argparse.ArgumentParser(description='') + + # Files + parser.add_argument('--data_fname', type=str, required=True) + parser.add_argument('--population_fname',type=str, required=True) + parser.add_argument('--output_dir', type=str, required=True) + parser.add_argument('--config_fname', type=str, required=False) + + # Settings + parser.add_argument('--T', type=float, required=True) + parser.add_argument('--dt', type=float, required=True) + parser.add_argument('--theta_1', type=float, default=0.001) + parser.add_argument('--theta_2', type=float, default=0.001) + parser.add_argument('--theta_freq', type=float, default=1.0) + parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean']) + + # Debug + parser.add_argument('--N', type=int, required=False) + parser.add_argument('--Ds', nargs='+', type=int) + parser.add_argument('--no_prefilter', dest='prefilter', action='store_false') + parser.add_argument('--no_postfilter', dest='postfilter', action='store_false') + 
parser.set_defaults(prefilter=True, postfilter=True)
+
+    args = parser.parse_args()
+
+
+    ######
+    # Load files
+    ######
+
+    data_fname = args.data_fname
+    if data_fname.endswith(('.p', '.pickle')):
+        df_data = pd.read_pickle(data_fname)
+    elif data_fname.endswith('.csv'):
+        df_data = pd.read_csv(data_fname)
     else:
-        assert False
-elif os.path.isfile(data_path + 'input_data.p'):
-    input_fname = data_path + 'input_data.p'
-    df_data = pd.read_pickle(input_fname)
-elif os.path.isfile(data_path + 'input_data.pickle'):
-    input_fname = data_path + 'input_data.pickle'
-    df_data = pd.read_pickle(input_fname)
-elif os.path.isfile(data_path + 'input_data.csv'):
-    input_fname = data_path + 'input_data.csv'
-    df_data = pd.read_csv(input_fname)
-else:
-    raise NotImplementedError
-
-
-from .steps import *
-
-print('Input data file:', input_fname)
-print()
-print('Input arguments:')
-print('  {:<6} = {}'.format('T', T))
-print('  {:<6} = {}'.format('dt', dt))
-print('  {:<6} = {}'.format('\u03B8\u2081', theta_1))
-print('  {:<6} = {}'.format('\u03B8\u2082', theta_2))
-print('  {:<6} = {}'.format('\u03B8_freq', theta_freq))
-print('  {:<6} = {} {}'.format('k', len(stats_functions), stats_functions))
-print('{} = {}'.format('binarize', {False: 'no', True: 'yes'}[binarize]))
-print()
-print('N = {}'.format(N))
-print('L = {}'.format(L))
-print('', flush=True)
-
-
-######
-# Main
-######
-if args.prefilter:
-    print_header('1) Pre-filter')
-    df_data = pre_filter(df_data, theta_1, df_population, args)
-    df_data.to_csv(data_path + 'pre-filtered.csv', index=False)
-
-print_header('2) Transform; 3) Post-filter')
-df_data, df_types = parse_variable_data_type(df_data, value_type_override, args)
-df_time_invariant, df_time_series = split_by_timestamp_type(df_data)
-
-# Process time-invariant data
-if len(df_time_invariant) > 0:
-    s, s_feature_names, s_feature_aliases = process_time_invariant(df_time_invariant, args)
-
-# Process time-dependent data
-if len(df_time_series) > 0:
-    X, X_feature_names, X_feature_aliases = process_time_dependent(df_time_series, args)
+        raise NotImplementedError
+
+    df_population = args.df_population = pd.read_csv(args.population_fname).set_index('ID').sort_index()
+    config = FIDDLE_config.load_config(args.config_fname)
+
+
+    ## Arguments settings
+    output_dir = args.output_dir
+    if not output_dir.endswith('/'):
+        output_dir += '/'
+
+    T = args.T
+    dt = args.dt
+    theta_1 = args.theta_1
+    theta_2 = args.theta_2
+    theta_freq = args.theta_freq
+    stats_functions = args.stats_functions
+
+    args.hierarchical_sep = config.get('hierarchical_sep', ':')
+    args.hierarchical_levels = config.get('hierarchical_levels', [])
+    args.value_type_override = config.get('value_types', {})
+
+    args.discretize = config.get('discretize', True)
+    args.use_ordinal_encoding = config.get('use_ordinal_encoding', False)
+
+    args.S_discretization_bins = None
+    args.X_discretization_bins = None
+    S_discretization_bins = config.get('S_discretization_bins')
+    X_discretization_bins = config.get('X_discretization_bins')
+    if S_discretization_bins:
+        args.S_discretization_bins = json.load(open(S_discretization_bins, 'r'))
+    if X_discretization_bins:
+        args.X_discretization_bins = json.load(open(X_discretization_bins, 'r'))
+
+    args.parallel = config.get('parallel', False)
+    args.n_jobs = config.get('n_jobs', 1)
+    args.batch_size = config.get('batch_size', 100)
+
+    N = args.N = args.N or len(df_population)
+    df_population = args.df_population = df_population.iloc[:args.N]
+    L = args.L = int(np.floor(T/dt))
+
+    print('Input:')
+    print('  Data      :', 
args.data_fname) + print(' Population:', args.population_fname) + print(' Config :', args.config_fname) + print() + print('Output directory:', args.output_dir) + print() + print('Input arguments:') + print(' {:<6} = {}'.format('T', T)) + print(' {:<6} = {}'.format('dt', dt)) + print(' {:<6} = {}'.format('\u03B8\u2081', theta_1)) + print(' {:<6} = {}'.format('\u03B8\u2082', theta_2)) + print(' {:<6} = {}'.format('\u03B8_freq', theta_freq)) + print(' {:<6} = {} {}'.format('k', len(stats_functions), stats_functions)) + print() + print('{} = {}'.format('discretize', {False: 'no', True: 'yes'}[args.discretize])) + if args.discretize: + print(' S discretization bins:', S_discretization_bins or 'to be computed from data') + print(' X discretization bins:', X_discretization_bins or 'to be computed from data') + print() + print('N = {}'.format(N)) + print('L = {}'.format(L)) + print('', flush=True) + + + ###### + # Main + ###### + df_population[[]].to_csv(output_dir + 'IDs.csv') + + if args.prefilter: + FIDDLE_steps.print_header('1) Pre-filter') + df_data = FIDDLE_steps.pre_filter(df_data, theta_1, df_population, args) + df_data.to_csv(output_dir + 'pre-filtered.csv', index=False) + + FIDDLE_steps.print_header('2) Transform; 3) Post-filter') + df_data, df_types = FIDDLE_steps.parse_variable_data_type(df_data, args) + df_time_invariant, df_time_series = FIDDLE_steps.split_by_timestamp_type(df_data) + + # Process time-invariant data + S, S_feature_names, S_feature_aliases = FIDDLE_steps.process_time_invariant(df_time_invariant, args) + + # Process time-dependent data + X, X_feature_names, X_feature_aliases = FIDDLE_steps.process_time_dependent(df_time_series, args) + +if __name__ == '__main__': + main() diff --git a/FIDDLE/steps.py b/FIDDLE/steps.py index 509e2bca60bd90238ef3bf3c519b92dae1b06c57..ceb461e2c9ffa8095ea6961af997b624edaac1f3 100644 --- a/FIDDLE/steps.py +++ b/FIDDLE/steps.py @@ -4,45 +4,50 @@ FIDDLE Preprocessing steps 2. Transform 3. 
Post-filter """ -from .helpers import * +try: + from .helpers import * +except: + from helpers import * import time import json +import joblib +import multiprocessing def pre_filter(df, threshold, df_population, args): T = int(args.T) theta_1 = args.theta_1 df_population = args.df_population - + # Remove rows not in population print('Remove rows not in population') df = df[df['ID'].isin(df_population.index)] - + # Remove rows with t outside of [0, T) print('Remove rows with t outside of [0, {}]'.format(T)) df = df[pd.isnull(df[t_col]) | ((0 <= df[t_col]) & (df[t_col] < T))] - + # Data table should not contain duplicate rows with any numerical values # Check for inconsistencies - var_names = [v for v, ty in value_type_override.items() if 'hierarchical' in ty.lower() or 'categorical' in ty.lower()] + var_names = [v for v, ty in args.value_type_override.items() if 'hierarchical' in ty.lower() or 'categorical' in ty.lower()] df_tmp = df[~df[var_col].isin(var_names)] dups = df_tmp.duplicated(subset=[ID_col, t_col, var_col], keep=False) df_dups = df_tmp[dups] if any(dups) and any(is_numeric(v) for v in df_dups[val_col] if not pd.isnull(v)): print(df_dups.head()) raise Exception('Inconsistent numerical values recorded') - + # Remove variables that occur too rarely as defined by the threshold print('Remove rare variables (<= {})'.format(threshold)) - + ## Calculate overall occurrence rate of each variable based on IDs df_count = calculate_variable_counts(df, df_population) # (N x |var|) table of counts df_bool = df_count.astype(bool) # convert counts to boolean - + ## Keep variables that are recorded for more than threshold fraction of IDs variables_keep = df_bool.columns[df_bool.mean(axis=0) > threshold] df_out = df[df[var_col].isin(variables_keep)] assert set(variables_keep) == set(df_out[var_col].unique()) - + variables = sorted(df_bool.columns) variables_remove = sorted(set(variables) - set(variables_keep)) print('Total variables :', len(variables)) @@ -53,22 +58,22 @@ def pre_filter(df, threshold, df_population, args): return df_out -def parse_variable_data_type(df_data, value_type_override, args): +def parse_variable_data_type(df_data, args): # 1. parse hierarchical values (e.g. ICD codes) into strings # 2. automatically detect value types, respecting user override, and set dtypes in DataFrames # 3. pre-map duplicated non-numerical values into multiple categorical variables - data_path = args.data_path + output_dir = args.output_dir df = df_data assert val_col in df.columns print_header('*) Detecting and parsing value types', char='-') - + ## 1. 
Hierarchical values - var_names = [v for v, ty in value_type_override.items() if 'hierarchical' in ty.lower()] + var_names = [v for v, ty in args.value_type_override.items() if 'hierarchical' in ty.lower()] if len(var_names) == 0: # No hierarchical values pass - + for var_name in var_names: - var_type = value_type_override[var_name] + var_type = args.value_type_override[var_name] df_var = df.loc[df[var_col] == var_name, val_col] if var_type.lower() == 'hierarchical_icd': # need to figure out ICD version @@ -79,24 +84,24 @@ def parse_variable_data_type(df_data, value_type_override, args): df_var = df_var.apply(lambda s: map_icd_hierarchy(s, version=10)) else: df_var = df_var.apply(lambda s: s.split(hierarchical_sep)) - + # Assign mapped values back to original df df.loc[df[var_col] == var_name, val_col] = df_var - + # Only encode selected levels df_nonhier = df[~df[var_col].isin(var_names)] df_hier = df[df[var_col].isin(var_names)] df_hier_levels = [] - for hier_level in hierarchical_levels: + for hier_level in args.hierarchical_levels: # encode level if available df_hier_level = df_hier.copy() df_hier_level[val_col] = df_hier_level[val_col].apply(lambda h: h[min(hier_level, len(h))]) df_hier_levels.append(df_hier_level) df_hier_levels = pd.concat(df_hier_levels).drop_duplicates() - + # Combine hierarchical and non-hierarchical data df = pd.concat([df_nonhier, df_hier_levels]) - + ## 2. Detect value types data_types = [] @@ -110,10 +115,10 @@ def parse_variable_data_type(df_data, value_type_override, args): # Determine type of each variable for variable, values in sorted(values_by_variable.items()): # Manual override type in config - if variable in value_type_override: - data_types.append((variable, value_type_override[variable])) + if variable in args.value_type_override: + data_types.append((variable, args.value_type_override[variable])) # Force categorical values to be a string - if value_type_override[variable] == 'Categorical' and \ + if args.value_type_override[variable] == 'Categorical' and \ any(is_numeric(v) for v in values if not pd.isnull(v)): m_var = df[var_col] == variable df.loc[m_var, val_col] = df.loc[m_var, val_col].apply(lambda s: '_' + str(s)) @@ -126,14 +131,14 @@ def parse_variable_data_type(df_data, value_type_override, args): data_types.append((variable, 'Numeric + Categorical')) else: data_types.append((variable, 'Categorical')) - + df_types = pd.DataFrame(data_types, columns=['variable_name', 'value_type']) df_types[var_col] = df_types[var_col].astype(str) df_types = df_types.set_index(var_col) - fpath = data_path + 'value_types.csv' + fpath = output_dir + 'value_types.csv' df_types.to_csv(fpath, quoting=1) print('Saved as:', fpath) - + ## 3. 
Pre-map duplicated non-numerical values to separate variables var_names = [v for v, ty in data_types if 'numeric' not in ty.lower() and 'none' not in ty.lower()] df_non_num = df[df[var_col].isin(var_names)].copy() @@ -144,17 +149,17 @@ def parse_variable_data_type(df_data, value_type_override, args): df_non_num_dup[val_col] = 1 df_non_num[dup_] = df_non_num_dup df[df[var_col].isin(var_names)] = df_non_num - + return df, df_types['value_type'] def split_by_timestamp_type(df): print_header('*) Separate time-invariant and time-dependent', char='-') - + variables_inv = df[pd.isnull(df[t_col])][var_col].unique() # Invariant variables have t = NULL df_time_invariant = df[df[var_col].isin(variables_inv)] df_time_series = df[~df[var_col].isin(variables_inv)] - + print('Variables (time-invariant):', len(variables_inv)) print('Variables (time-dependent):', df[var_col].nunique() - len(variables_inv)) print('# rows (time-invariant):', len(df_time_invariant)) @@ -163,86 +168,111 @@ def split_by_timestamp_type(df): def process_time_invariant(df_data_time_invariant, args): - data_path = args.data_path + if len(df_data_time_invariant) == 0: + return None, None, None + + output_dir = args.output_dir df_population = args.df_population theta_2 = args.theta_2 - + + ############## print_header('2-A) Transform time-invariant data', char='-') - dir_path = data_path + '/' + dir_path = output_dir + '/' start_time = time.time() ## Create Nxd^ table df_time_invariant = transform_time_invariant_table(df_data_time_invariant, df_population) + df_time_invariant[[]].to_csv(dir_path + 'S.ID.csv') print('Time elapsed: %f seconds' % (time.time() - start_time)) ## Discretize - s_all, s_all_feature_names = map_time_invariant_features(df_time_invariant, args) - sparse.save_npz(dir_path + 's_all.npz', s_all) - with open(dir_path + 's_all.feature_names.json', 'w') as f: - json.dump(list(s_all_feature_names), f, sort_keys=True) - print('Time elapsed: %f seconds' % (time.time() - start_time)) + S_all, S_all_feature_names, S_discretization_bins = map_time_invariant_features(df_time_invariant, args) + sparse.save_npz(dir_path + 'S_all.npz', S_all) + json.dump(list(S_all_feature_names), open(dir_path + 'S_all.feature_names.json', 'w'), sort_keys=True) + json.dump(S_discretization_bins, open(dir_path + 'S_all.discretization.json', 'w')) - print_header('3-A) Post-filter time-invariant data', char='-') - - ## Filter - s, s_feature_names, s_feature_aliases = post_filter(s_all, s_all_feature_names, theta_2) print('Time elapsed: %f seconds' % (time.time() - start_time)) - - ## Save output - print() - print('Output') - print('s: shape={}, density={:.3f}'.format(s.shape, s.density)) - sparse.save_npz(dir_path + 's.npz', s) - - with open(dir_path + 's.feature_names.json', 'w') as f: - json.dump(list(s_feature_names), f, sort_keys=True) - with open(dir_path + 's.feature_aliases.json', 'w') as f: - json.dump(s_feature_aliases, f, sort_keys=True) - - print('Total time: %f seconds' % (time.time() - start_time)) - print('', flush=True) - return s, s_feature_names, s_feature_aliases + + if args.postfilter: + ############## + print_header('3-A) Post-filter time-invariant data', char='-') + + ## Filter + S, S_feature_names, S_feature_aliases = post_filter_time_invariant(S_all, S_all_feature_names, theta_2) + print('Time elapsed: %f seconds' % (time.time() - start_time)) + + ## Save output + print() + print('Output') + print('S: shape={}, density={:.3f}'.format(S.shape, S.density)) + sparse.save_npz(dir_path + 'S.npz', S) + + with open(dir_path + 
'S.feature_names.json', 'w') as f: + json.dump(list(S_feature_names), f, sort_keys=True) + with open(dir_path + 'S.feature_aliases.json', 'w') as f: + json.dump(S_feature_aliases, f, sort_keys=True) + + print('Total time: %f seconds' % (time.time() - start_time)) + print('', flush=True) + return S, S_feature_names, S_feature_aliases + else: + return S_all, S_all_feature_names, None def process_time_dependent(df_data_time_series, args): - data_path = args.data_path - theta_2 = args.theta_2 + if len(df_data_time_series) == 0: + return None, None, None + output_dir = args.output_dir + theta_2 = args.theta_2 + + ############## print_header('2-B) Transform time-dependent data', char='-') - dir_path = data_path + '/' + dir_path = output_dir + '/' start_time = time.time() ## Create NxLxD^ table df_time_series, dtypes_time_series = transform_time_series_table(df_data_time_series, args) print('Time elapsed: %f seconds' % (time.time() - start_time)) - + + ## Save intermediate files + joblib.dump(df_time_series, output_dir + 'df_time_series.joblib') + joblib.dump(dtypes_time_series, output_dir + 'dtypes_time_series.joblib') + df_time_series[[]].to_csv(dir_path + 'X.ID,t_range.csv') + ## Map variables to features - X_all, X_all_feature_names = map_time_series_features(df_time_series, dtypes_time_series, args) + X_all, X_all_feature_names, X_discretization_bins = map_time_series_features(df_time_series, dtypes_time_series, args) sparse.save_npz(dir_path + 'X_all.npz', X_all) - with open(dir_path + 'X_all.feature_names.json', 'w') as f: - json.dump(list(X_all_feature_names), f, sort_keys=True) - print('Time elapsed: %f seconds' % (time.time() - start_time)) - - ## Filter features - print_header('3-B) Post-filter time-dependent data', char='-') - print(X_all.shape, X_all.density) - X, X_feature_names, X_feature_aliases = post_filter_time_series(X_all, X_all_feature_names, theta_2, args) - print(X.shape, X.density) + json.dump(list(X_all_feature_names), open(dir_path + 'X_all.feature_names.json', 'w'), sort_keys=True) + json.dump(X_discretization_bins, open(dir_path + 'X_all.discretization.json', 'w')) + print('Time elapsed: %f seconds' % (time.time() - start_time)) - ## Save output - print() - print('Output') - print('X: shape={}, density={:.3f}'.format(X.shape, X.density)) - sparse.save_npz(dir_path + 'X.npz', X) - with open(dir_path + 'X.feature_names.json', 'w') as f: - json.dump(list(X_feature_names), f, sort_keys=True) - with open(dir_path + 'X.feature_aliases.json', 'w') as f: - json.dump(X_feature_aliases, f, sort_keys=True) - - print('Total time: %f seconds' % (time.time() - start_time)) - print('', flush=True) - return X, X_feature_names, X_feature_aliases + if args.postfilter: + ############## + print_header('3-B) Post-filter time-dependent data', char='-') + print(X_all.shape, X_all.density) + + ## Filter features + X, X_feature_names, X_feature_aliases = post_filter_time_series(X_all, X_all_feature_names, theta_2, args) + print(X.shape, X.density) + print('Time elapsed: %f seconds' % (time.time() - start_time)) + + ## Save output + print() + print('Output') + print('X: shape={}, density={:.3f}'.format(X.shape, X.density)) + sparse.save_npz(dir_path + 'X.npz', X) + with open(dir_path + 'X.feature_names.json', 'w') as f: + json.dump(list(X_feature_names), f, sort_keys=True) + with open(dir_path + 'X.feature_aliases.json', 'w') as f: + json.dump(X_feature_aliases, f, sort_keys=True) + + print('Total time: %f seconds' % (time.time() - start_time)) + print('', flush=True) + return X, 
X_feature_names, X_feature_aliases + else: + return X_all, X_all_feature_names, None ###### @@ -250,12 +280,12 @@ def process_time_dependent(df_data_time_series, args): ###### def transform_time_invariant_table(df_in, df_population): df_in = df_in.copy() - + # Recorded Value (np.nan if not recorded) df_value = pd.pivot_table(df_in, val_col, ID_col, var_col, 'last', np.nan) df_value = df_value.reindex(index=df_population.index, fill_value=np.nan) df_value.columns = [str(col) + '_value' for col in df_value.columns] - + print('(N \u00D7 ^d) table :\t', df_value.shape) print('number of missing entries :\t', '{} out of {} total'.format(df_value.isna().sum().sum(), df_value.size)) return df_value @@ -263,12 +293,18 @@ def transform_time_invariant_table(df_in, df_population): def map_time_invariant_features(df, args): # Categorical -> binary features # Numeric -> binary/float-valued features - if args.binarize: - out = [smart_qcut_dummify(df[col], q=5, use_ordinal_encoding=use_ordinal_encoding) for col in df.columns] + discretization_bins = None + if args.discretize: + discretization_bins = args.S_discretization_bins + if discretization_bins is None: + discretization_bins = [compute_bin_edges(df[col], q=5) for col in df.columns] + discretization_bins = dict(discretization_bins) + + out = [smart_qcut_dummify(df[col], discretization_bins[col], use_ordinal_encoding=args.use_ordinal_encoding) for col in df.columns] time_invariant_features = pd.concat(out, axis=1) feature_names_all = time_invariant_features.columns.values sdf = time_invariant_features.astype(pd.SparseDtype(int, fill_value=0)) - s_ = sparse.COO(sdf.sparse.to_coo()) + S_ = sparse.COO(sdf.sparse.to_coo()) else: # Split a mixed column into numeric and string columns for col in df.columns: @@ -279,45 +315,45 @@ def map_time_invariant_features(df, args): df[col+'_str'] = df[col].copy() df.loc[~numeric_mask, col] = np.nan df.loc[numeric_mask, col+'_str'] = np.nan - + out = [smart_dummify_impute(df[col]) for col in df.columns] time_invariant_features = pd.concat(out, axis=1) feature_names_all = time_invariant_features.columns.values sdf = time_invariant_features.astype(pd.SparseDtype(float, fill_value=0)) - s_ = sparse.COO(sdf.sparse.to_coo()) - + S_ = sparse.COO(sdf.sparse.to_coo()) + print() print('Output') - print('s_all, binary features :\t', s_.shape) - return s_, feature_names_all + print('S_all, binary features :\t', S_.shape) + return S_, feature_names_all, discretization_bins -def post_filter(s_, s_feature_names_all, threshold): +def post_filter_time_invariant(S_, S_feature_names_all, threshold): # Filter features (optional) - assert s_.shape[1] == len(s_feature_names_all) - feature_names_0 = s_feature_names_all - s0 = s_.to_scipy_sparse() + assert S_.shape[1] == len(S_feature_names_all) + feature_names_0 = S_feature_names_all + S0 = S_.to_scipy_sparse() print('Original :', len(feature_names_0)) - + ## Remove nearly-constant features (with low variance) ## a binary feature is removed if =0 (or =1) for >th fraction of examples ## i.e., variance <= (th * (1 - th)) sel_rare = VarianceThreshold(threshold=(threshold * (1 - threshold))) - s1 = sel_rare.fit_transform(s0) + S1 = sel_rare.fit_transform(S0) feature_names_1 = feature_names_0[sel_rare.get_support()] print('Nearly-constant:', len(feature_names_0) - len(feature_names_1)) - + ## Keep only first of pairwise perfectly correlated features sel_corr = CorrelationSelector() - s2 = sel_corr.fit_transform(s1) + S2 = sel_corr.fit_transform(S1) feature_names_2 = 
feature_names_1[sel_corr.get_support()]
     feature_aliases = sel_corr.get_feature_aliases(feature_names_1)
     print('Correlated     :', len(feature_names_1) - len(feature_names_2))
-    
-    s = sparse.COO(s2)
+
+    S = sparse.COO(S2)
     feature_names = feature_names_2
-    assert s.shape[1] == len(feature_names)
-    
-    return s, feature_names, feature_aliases
+    assert S.shape[1] == len(feature_names)
+
+    return S, feature_names, feature_aliases
 
 
 ######
@@ -325,14 +361,16 @@
 ######
 def func_encode_single_time_series(i, g, variables, variables_num_freq, T, dt, stats_functions, impute=True):
     try:
-        assert g['ID'].nunique() == 1
-        assert g['ID'].unique()[0] == i
+        assert g.index.nunique() == 1
+        assert g.index.unique()[0] == i
         # non-frequent
         variables_non = sorted(set(variables) - set(variables_num_freq))
-        df_j = pivot_event_table(g).reindex(columns=variables_non).sort_index()
-        df_values_j = most_recent_values(df_j, variables, T, dt)
-        df_out = df_values_j
-        
+        if len(variables_non) > 0:
+            # note: df_out is defined only when at least one non-frequent variable exists
+            df_j = pivot_event_table(g).reindex(columns=variables_non).sort_index()
+            df_values_j = most_recent_values(df_j, variables, T, dt)
+            df_out = df_values_j
+
         if len(variables_num_freq) > 0:
             # frequent
             # we're only producing mask, ffill, and statistics if the data is measured frequently enough
@@ -345,51 +383,84 @@
             if impute:
                 check_imputed_output(df_values_i)
                 check_imputed_output(df_stats_i)
-            
+
             df_out = df_out.join([mask_i, delta_t_i, df_values_i, df_stats_i])
     except:
         print(i)
        raise Exception(i)
     return i, df_out
 
+def divide_chunks(l, n):
+    # looping till length l
+    for i in range(0, len(l), n):
+        yield l[i:i + n]
+
+def form_batches_of_examples(df_in, args, batch_size):
+    grouped = df_in.set_index(ID_col)
+    IDs = list(grouped.index.unique())
+    batches_IDs = list(divide_chunks(IDs, batch_size))
+    batches = [grouped.loc[chunk] for chunk in batches_IDs]
+    return batches, batches_IDs
+
+def process_batch_time_series(first_arg):
+    batch, batch_IDs, args = first_arg
+    variables, variables_num_freq = args.variables, args.variables_num_freq
+    out = dict(
+        func_encode_single_time_series(i, batch.loc[i:i], variables, variables_num_freq, args.T, args.dt, args.stats_functions)
+        for i in batch_IDs
+    )
+    return out
+
 def transform_time_series_table(df_in, args):
-    data_path = args.data_path
+    output_dir = args.output_dir
     theta_freq = args.theta_freq
     stats_functions = args.stats_functions
     N, L = args.N, args.L
     df_population = args.df_population
     parallel = args.parallel
-    
+
     ## TODO: asserts shape of df_in
     # Determine all unique variable names
     variables = get_unique_variables(df_in)
     assert df_in[var_col].nunique() == len(variables)
     print('Total variables :', len(variables))
-    
+
     # Determine frequent variables -> we'll calculate statistics, mask, and delta_time only on these
     variables_num_freq = get_frequent_numeric_variables(df_in, variables, theta_freq, args)
     print('Frequent variables :', list(variables_num_freq))
     print('{} = {}'.format('M\u2081', len(variables_num_freq)))
     print('{} = {}'.format('M\u2082', len(variables) - len(variables_num_freq)))
     print('{} = {} {}'.format('k  ', len(stats_functions), stats_functions))
-    
+
     print()
     print('Transforming each example...')
+    args.variables = variables
+    args.variables_num_freq = variables_num_freq
+
     # Encode time series table for each patient
-    grouped = list(df_in.groupby(ID_col))
-    if parallel:
-        out = 
dict(Parallel(n_jobs=n_jobs, verbose=10)( - delayed(func_encode_single_time_series)(i, g, variables, variables_num_freq, args.T, args.dt, args.stats_functions) - for i, g in grouped[:N] + if args.parallel: + batches, batches_IDs = form_batches_of_examples(df_in, args, batch_size=args.batch_size) + print('Batches of size {}: '.format(args.batch_size), len(batches)) + pool = multiprocessing.Pool(args.n_jobs) + out = list(tqdm(pool.imap_unordered( + process_batch_time_series, + zip(batches, batches_IDs, [args]*len(batches))), total=len(batches) )) - + pool.close() + pool.join() + + out = dict((key, d[key]) for d in out for key in d) + print() + print('Parallel processing done', flush=True) + else: + grouped = list(df_in.groupby(ID_col)) out = dict( - func_encode_single_time_series(i, g, variables, variables_num_freq, args.T, args.dt, args.stats_functions) + func_encode_single_time_series(i, g.set_index(ID_col), variables, variables_num_freq, args.T, args.dt, args.stats_functions) for i, g in tqdm(grouped[:N]) ) - + # Handle IDs not in the table df_original = list(out.values())[0] df_copy = pd.DataFrame().reindex_like(df_original) @@ -421,68 +492,68 @@ def transform_time_series_table(df_in, args): D_timeseries = out D_ = len(list(D_timeseries.values())[0].columns) - + # (N*L)xD^ table ## Create MultiIndex of (ID, time_bin) - index = sum([ + index = sum([ [(ID, t_) for t_ in list(df_.index)] - for ID, df_ in sorted(D_timeseries.items()) + for ID, df_ in sorted(D_timeseries.items()) ], []) - index = pd.Index(index) + index = pd.Index(index, names=['ID', 't_range']) assert len(index) == N * L - + ## Assume all dataframes have the same columns, used after concatenation columns = list(sorted(D_timeseries.items())[0][1].columns) columns = np.array(columns) dtypes = sorted(D_timeseries.items())[0][1].dtypes - + ## Convert each df to a numpy array ## Concatenate **sorted** numpy arrays (faster than calling pd.concat) feature_values = [(ID, df_.to_numpy()) for ID, df_ in sorted(D_timeseries.items())] time_series = np.concatenate([feat_val[1] for feat_val in feature_values]) assert time_series.shape == (len(index), len(columns)) - + df_time_series = pd.DataFrame(data=time_series, index=index, columns=columns) - + # Print metadata print('DONE: Transforming each example...') ## Freq: Count missing entries using mask ts_mask = df_time_series[[col for col in df_time_series if col.endswith('_mask')]] ts_mask.columns = [col.replace('_mask', '') for col in ts_mask.columns] - print('(freq) number of missing entries :\t', + print('(freq) number of missing entries :\t', '{} out of {}={} total'.format( - (1-ts_mask).astype(int).sum().sum(), + (1-ts_mask).astype(int).sum().sum(), '\u00D7'.join(str(i) for i in [N,L,ts_mask.shape[1]]), ts_mask.size)) - + ## Freq: Count imputed entries using mask and dt ts_delta_time = df_time_series[[col for col in df_time_series if col.endswith('_delta_time')]] ts_delta_time.columns = [col.replace('_delta_time', '') for col in ts_delta_time.columns] - + imputed = (1-ts_mask).astype(bool) & (ts_delta_time > 0) - print('(freq) number of imputed entries :\t', + print('(freq) number of imputed entries :\t', '{}'.format(imputed.sum().sum(), ts_delta_time.size)) - imputed.sum().rename('count').to_csv(data_path + '/' + 'freq_imputed.csv') - + imputed.sum().rename('count').to_csv(output_dir + '/' + 'freq_imputed.csv') + not_imputed = (1-ts_mask).astype(bool) & (ts_delta_time == 0) - print('(freq) number of not imputed entries :\t', + print('(freq) number of not imputed entries :\t', 
'{}'.format(not_imputed.sum().sum(), ts_delta_time.size))
-    not_imputed.sum().rename('count').to_csv(data_path + '/' + 'freq_not_imputed.csv')
-    
+    not_imputed.sum().rename('count').to_csv(output_dir + '/' + 'freq_not_imputed.csv')
+
     ## Non-Freq: Count missing entries
     non_freq_cols = sorted([c + '_value' for c in set(variables) - set(variables_num_freq)])
     non_freqs = df_time_series[non_freq_cols]
     print('(non-freq) number of missing entries :\t',
-          '{} out of {}={} total'.format(
-              non_freqs.isna().sum().sum(), 
+          '{} out of {}={} total'.format(
+              non_freqs.isna().sum().sum(),
               '\u00D7'.join(str(i) for i in [N,L,non_freqs.shape[1]]), non_freqs.size))
-    
+
     print()
     print('(N \u00D7 L \u00D7 ^D) table :\t', (N, L, len(columns)))
     return df_time_series, dtypes
 
 
 def map_time_series_features(df_time_series, dtypes, args):
     N, L = args.N, args.L
-    
+
     df_time_series = df_time_series.dropna(axis='columns', how='all').sort_index()
 
     print('Discretizing features...')
@@ -491,24 +562,36 @@
     assert len(ts_mixed.columns) + len(ts_mask.columns) == len(df_time_series.columns)
     ts_feature_mask = ts_mask.astype(int)
     ts_mixed_cols = [ts_mixed[col] for col in ts_mixed.columns]
-    
+
     print()
-    if args.binarize:
+    discretization_bins = None
+    if args.discretize:
         dtype = int
         print('Processing', len(ts_mixed_cols), 'non-boolean variable columns...')
-        print('    Binning numeric variables by quintile...')
-        print('    Converting variables to binary features')
-        if parallel:
-            out = Parallel(n_jobs=n_jobs, verbose=10)( # Need to share global variables
-                delayed(smart_qcut_dummify)(col_data, q=5, use_ordinal_encoding=use_ordinal_encoding) for col_data in ts_mixed_cols
-            )
+        discretization_bins = args.X_discretization_bins
+        if discretization_bins is None:
+            print('    Computing bin edges for numeric variables...')
+            discretization_bins = [compute_bin_edges(col_data, q=5) for col_data in tqdm(ts_mixed_cols)]
+            discretization_bins = dict(discretization_bins)
+        else:
+            print('    Using predetermined bin edges for numeric variables...')
+
+        print('    Discretizing variables to binary features')
+        if args.parallel:
+            pool = multiprocessing.Pool(args.n_jobs)
+            out = list(tqdm(pool.imap_unordered(
+                smart_qcut_dummify_parallel,
+                [(col_data, discretization_bins[col_data.name], args.use_ordinal_encoding) for col_data in ts_mixed_cols]), total=len(ts_mixed_cols)
+            ))
+            pool.close()
+            pool.join()
         else:
-            out = [smart_qcut_dummify(col_data, q=5, use_ordinal_encoding=use_ordinal_encoding) for col_data in tqdm(ts_mixed_cols)]
+            out = [smart_qcut_dummify(col_data, discretization_bins[col_data.name], use_ordinal_encoding=args.use_ordinal_encoding) for col_data in tqdm(ts_mixed_cols)]
     else:
         dtype = float
         df = ts_mixed.copy()
-        
+
         # Split a mixed column into numeric and string columns
         for col in df.columns:
             col_data = df[col]
@@ -518,31 +601,34 @@
             df[col+'_str'] = df[col].copy()
             df.loc[~numeric_mask, col] = np.nan
             df.loc[numeric_mask, col+'_str'] = np.nan
-        
+
         ts_mixed_cols = [df[col] for col in df.columns]
-        
+
         print('Discretizing categorical features...')
-        if parallel:
-            out = Parallel(n_jobs=n_jobs, verbose=10)( # Need to share global variables? 
- delayed(smart_dummify_impute)(col_data) for col_data in ts_mixed_cols - ) + if args.parallel: + pool = multiprocessing.Pool(args.n_jobs) + out = list(tqdm(pool.imap_unordered( + smart_dummify_impute, [(col_data) for col_data in ts_mixed_cols]), total=len(ts_mixed_cols) + )) + pool.close() + pool.join() else: out = [smart_dummify_impute(col_data) for col_data in tqdm(ts_mixed_cols)] - + out = [ts_feature_mask, *out] D_all = sum(len(df_i.columns) for df_i in out) X_all_feature_names = np.asarray(sum([list(df_i.columns) for df_i in out], [])) X_dense = np.concatenate([df_i.values for df_i in out], axis=1).astype(dtype) X_all = sparse.COO(X_dense) - + print('Finished discretizing features') assert X_all.shape[0] == N * L X_all = X_all.reshape((N, L, D_all)) - + print() print('Output') print('X_all: shape={}, density={:.3f}'.format(X_all.shape, X_all.density)) - return X_all, X_all_feature_names + return X_all, X_all_feature_names, discretization_bins def post_filter_time_series(X_all, feature_names_all, threshold, args): N, L = args.N, args.L @@ -550,11 +636,11 @@ def post_filter_time_series(X_all, feature_names_all, threshold, args): assert X_all.shape[1] == L # assert X_all.dtype == int start_time = time.time() - + X0 = X_all feature_names_0 = feature_names_all print('Original :', len(feature_names_0)) - + ## Remove nearly-constant features (with low variance) sel_const = FrequencyThreshold_temporal(threshold=threshold, L=L) sel_const.fit(X0.reshape((N*L, -1))) @@ -564,7 +650,7 @@ def post_filter_time_series(X_all, feature_names_all, threshold, args): feature_names_1 = feature_names_0[m_ts_const] print('Nearly-constant:', len(feature_names_0) - len(feature_names_1)) print('*** time: ', time.time() - start_time) - + ## Keep only first of pairwise perfectly correlated features sel_ts_corr = CorrelationSelector() sel_ts_corr.fit(X1.reshape((N*L, -1))) @@ -575,14 +661,14 @@ def post_filter_time_series(X_all, feature_names_all, threshold, args): feature_aliases = sel_ts_corr.get_feature_aliases(feature_names_1) print('Correlated :', len(feature_names_1) - len(feature_names_2)) print('*** time: ', time.time() - start_time) - + X = sparse.COO(X2) feature_names = feature_names_2 assert X.shape == (N, L, len(feature_names)) - + ## Save output print() print('Output') print('X: shape={}, density={:.3f}'.format(X.shape, X.density)) - + return X, feature_names, feature_aliases diff --git a/README.md b/README.md index 5573cd19c27570ff123f0b9e019de57eb634182c..6d3a896abb9f420f1f3ab7588f1f8788efd4cd5d 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@ # FIDDLE -FIDDLE – FlexIble Data-Driven pipeLinE – is a preprocessing pipeline that transforms structured EHR data into feature vectors that can be used with ML algorithms, relying on only a small number of user-defined arguments. +FIDDLE – FlexIble Data-Driven pipeLinE – is a preprocessing pipeline that transforms structured EHR data into feature vectors that can be used with ML algorithms, relying on only a small number of user-defined arguments. -Requires python 3.6 or above. Required packages and versions are listed in `requirements.txt`. Older versions may still work but have not been tested. +Try a quick demo here: [tiny.cc/FIDDLE-demo](https://tiny.cc/FIDDLE-demo) -Note: This README contains latex equations and is best viewed on the GitLab site (https://gitlab.eecs.umich.edu/mld3/FIDDLE). +Note: This README contains latex equations and is best viewed on the [GitLab site](https://gitlab.eecs.umich.edu/mld3/FIDDLE). 
## Publications & Resources - Title: Democratizing EHR analyses with FIDDLE: a flexible data-driven preprocessing pipeline for structured clinical data. -- Authors: Shengpu Tang, Parmida Davarmanesh, Yanmeng Song, Danai Koutra, Michael W. Sjoding, and Jenna Wiens. +- Authors: Shengpu Tang, Parmida Davarmanesh, Yanmeng Song, Danai Koutra, Michael W. Sjoding, and Jenna Wiens. - Published in JAMIA (Journal of the American Medical Informatics Association), October 2020: [article link](https://doi.org/10.1093/jamia/ocaa139) - Previously presented at MLHC 2019 ([Machine Learning for Healthcare](https://www.mlforhc.org/)) as a [clinical abstract](https://www.mlforhc.org/s/Sjoding-jete.pdf) - News coverage on HealthcareITNews: [link](https://www.healthcareitnews.com/news/new-framework-helps-streamline-ehr-data-extraction) @@ -23,20 +23,39 @@ If you use FIDDLE in your research, please cite the following publication: journal = {Journal of the American Medical Informatics Association}, year = {2020}, month = {10}, - issn = {1527-974X}, doi = {10.1093/jamia/ocaa139}, } ``` +## System Requirements + +### Pip +Requires python 3.7 or above (older versions may still work but have not been tested). Required packages and versions are listed in `requirements.txt`. Run the following command to install the required packages. +```bash +pip install -r requirements.txt +``` + +### Docker +To build the docker image, run the following command: +```bash +docker build -t fiddle-v020 . +``` +Refer to the notebook `tests/small_test/Run-docker.ipynb` for an example to run FIDDLE in docker. + + ## Usage Notes FIDDLE generates feature vectors based on data within the observation period $`t\in[0,T]`$. This feature representation can be used to make predictions of adverse outcomes at t=T. More specifically, FIDDLE outputs a set of binary feature vectors for each example $`i`$, $`\{(s_i,x_i)\ \text{for}\ i=1 \dots N\}`$ where $`s_i \in R^d`$ contains time-invariant features and $`x_i \in R^{L \times D}`$ contains time-dependent features. -Input: +Input: - formatted EHR data: `.csv` or `.p`/`.pickle` file, a table with 4 columns \[`ID`, `t`, `variable_name`, `variable_value`\] - population file: a list of unique `ID`s you want processed + - the output feature matrix will correspond to IDs in lexicographically sorted order +- config file: + - specifies additional settings by providing a custom `config.yaml` file + - a default config file is located at `FIDDLE/config-default.yaml` - arguments: - - T: The time of prediction; time-dependent features will be generated using data in $`t\in[0,T]`$. - - dt: the temporal granularity at which to "window" time-dependent data. + - T: The time of prediction; time-dependent features will be generated using data in $`t\in[0,T]`$. + - dt: the temporal granularity at which to "window" time-dependent data. - theta_1: The threshold for Pre-filter. - theta_2: The threshold for Post-filter. - theta_freq: The threshold at which we deem a variable “frequent” (for which summary statistics will be calculated). 
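For illustration, a minimal custom config corresponding to the options above might look like the sketch below. The key names come from `FIDDLE/config-default.yaml` and `FIDDLE/run.py` as changed in this diff; the file name and the specific values are examples only, and any key omitted here falls back to the defaults via `load_config`.

```yaml
# custom-config.yaml, passed to FIDDLE via --config_fname (values are illustrative)
parallel: yes        # process examples in parallel batches
n_jobs: 8            # number of worker processes
batch_size: 100      # number of examples per batch

discretize: yes            # bin numeric variables and emit binary features
use_ordinal_encoding: no   # if yes, emit cumulative "v >= edge" indicators instead

value_types:               # per-variable type overrides (illustrative variable name)
  FIRST_WARDID: Categorical
```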
@@ -46,8 +65,10 @@ Output: The generated features and associated metadata are located in `{data_pat
 
-- `s.npz`: a sparse array of shape (N, d)
+- `S.npz`: a sparse array of shape (N, d)
 - `X.npz`: a sparse tensor of shape (N, L, D)
-- `s.feature_names.txt`: names of _d_ time-invariant features
-- `X.feature_names.txt`: names of _D_ time-series features
+- `S.feature_names.json`: names of _d_ time-invariant features
+- `X.feature_names.json`: names of _D_ time-series features
+- `S.feature_aliases.json`: aliases of duplicated time-invariant features
+- `X.feature_aliases.json`: aliases of duplicated time-series features
 
 To load the generated features:
@@ -70,11 +91,11 @@ python -m FIDDLE.run \
 
 ## Guidelines on argument settings
 The user-defined arguments of FIDDLE include: T, dt, theta_1, theta_2, theta_freq, and K statistics functions. The settings of these arguments could affect the features and how they can be used. We provided reasonable default values in the implementation, and here list some practical considerations: (i) prediction time and frequency, (ii) temporal density of data, and (iii) class balance.
 
-(i) The prediction time and frequency determine the appropriate settings for T and dt. The risk stratification tasks we considered all involve a single prediction at the end of a fixed prediction window. It is thus most reasonable to set T to be the length of prediction window. Another possible formulation is to make multiple predictions where each prediction depends on only data from the past (not the future), using models like LSTM or fully convolutional networks. In that case, for example, if a prediction needs to be made every 4 hours over a 48-hour period, then T should be 48 hours, whereas dt should be at most 4 hours. 
+(i) The prediction time and frequency determine the appropriate settings for T and dt. The risk stratification tasks we considered all involve a single prediction at the end of a fixed prediction window. It is thus most reasonable to set T to be the length of the prediction window. Another possible formulation is to make multiple predictions where each prediction depends only on data from the past (not the future), using models like LSTM or fully convolutional networks. In that case, for example, if a prediction needs to be made every 4 hours over a 48-hour period, then T should be 48 hours, whereas dt should be at most 4 hours.
 
-(ii) The temporal density of data, that is, how often the variables are usually measured, also affects the setting of dt. This can be achieved by plotting a histogram of recording frequency. In our case, we observed that the maximum hourly frequency is ~1.2 times, which suggests dt should not be smaller than 1 hour. While most variables are recorded on average <0.1 time per hour (most of the time not recorded), the 6 vital signs are recorded slightly >1 time per hour. Thus, given that in the ICU, vital signs are usually collected once per hour, we set dt=1. This also implies the setting of θ_freq to be 1. Besides determining the value for dt from context (how granular we want to encode the data), we can also sweep the range (if there are sufficient computational resources and time) given the prediction frequency and the temporal density of data. 
+(ii) The temporal density of data, that is, how often the variables are usually measured, also affects the setting of dt. This can be assessed by plotting a histogram of recording frequency. In our case, we observed that the maximum hourly frequency is ~1.2 times, which suggests dt should not be smaller than 1 hour. While most variables are recorded on average <0.1 time per hour (most of the time not recorded), the 6 vital signs are recorded slightly >1 time per hour. Thus, given that vital signs in the ICU are usually collected once per hour, we set dt=1. This also implies setting θ_freq to 1. Besides determining the value of dt from context (how granular we want the encoding to be), we can also sweep a range of values (given sufficient computational resources and time), guided by the prediction frequency and the temporal density of the data.
 
-(iii) We recommend setting θ_1=θ_2=θ and be conservative to avoid removing information that could be potentially useful. For binary classification, the rule-of-the-thumb we suggest is to set θ to be about 1/100 of the minority class. For example, our cohorts consist of ~10% positive cases, so setting θ=0.001 is appropriate, whereas for a cohort with only 1% positive cases, then θ=0.0001 is more appropriate. Given sufficient computational resources and time, the value of θ can also be swept and optimized. 
+(iii) We recommend setting θ_1=θ_2=θ and being conservative, to avoid removing information that could potentially be useful. For binary classification, the rule of thumb we suggest is to set θ to about 1/100 of the minority class rate. For example, our cohorts consist of ~10% positive cases, so θ=0.001 is appropriate, whereas for a cohort with only 1% positive cases, θ=0.0001 is more appropriate. Given sufficient computational resources and time, the value of θ can also be swept and optimized.
 
-Finally, for the summary statistics functions, we included by default the most basic statistics functions are minimum, maximum, and mean. If on average, we expect more than one value per time bin, then we can also include higher order statistics such as standard deviation and linear slope.
+Finally, for the summary statistics functions, we included by default the most basic ones: minimum, maximum, and mean. If, on average, we expect more than one value per time bin, then we can also include higher-order statistics such as standard deviation and linear slope.
 
 
@@ -82,4 +103,4 @@
 
 ## Experiments
 
-In order to show the flexibility and utility of FIDDLE, we conducted several experiments using data from MIMIC-III and eICU. The code to reproduce the results are located at https://gitlab.eecs.umich.edu/MLD3/FIDDLE_experiments. The experiments were performed using FIDDLE v0.1.0 and reported in the JAMIA paper; bug fixes and new functionalities have since been implemented and may affect the numerical results.
+In order to show the flexibility and utility of FIDDLE, we conducted several experiments using data from MIMIC-III and eICU. The code to reproduce the results is located at https://gitlab.eecs.umich.edu/MLD3/FIDDLE_experiments. The experiments were performed using FIDDLE v0.1.0 and reported in the JAMIA paper; bug fixes and new functionalities have since been implemented and may affect the numerical results.
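As a companion to the output list above, here is a minimal loading sketch in Python. It assumes the default post-filtered outputs were written to an `output/` directory (hypothetical; use whatever path was passed as `--output_dir`); `sparse.load_npz` is the counterpart of the `sparse.save_npz` calls in `FIDDLE/steps.py`.

```python
import json
import sparse  # pydata/sparse, as listed in requirements.txt

output_dir = 'output/'  # hypothetical; the directory passed as --output_dir

# Time-invariant features: (N, d) sparse array
S = sparse.load_npz(output_dir + 'S.npz')
S_feature_names = json.load(open(output_dir + 'S.feature_names.json'))

# Time-dependent features: (N, L, D) sparse tensor
X = sparse.load_npz(output_dir + 'X.npz')
X_feature_names = json.load(open(output_dir + 'X.feature_names.json'))

assert S.shape[1] == len(S_feature_names)
assert X.shape[2] == len(X_feature_names)
```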
diff --git a/requirements.txt b/requirements.txt index dd38396e52b8ce24c22b6ec82393b25606da7819..09232cd87101feafe96b04770f3f2b7a4996b230 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,9 @@ -numpy>=1.16 -pandas>=1.0.1 -sparse>=0.9.1 -scikit-learn>=0.22.1 -tqdm>=4.43.0 -joblib>=0.13.2 -icd9cms>=0.2.1 -icd10-cm>=0.0.4 +pyyaml>=5.3 +numpy>=1.19 +pandas>=1.1 +sparse>=0.11 +scikit-learn>=0.23 +tqdm>=4.50 +joblib>=0.16 +icd9cms==0.2.1 +icd10-cm==0.0.4 diff --git a/test/Test_Case-ICD.ipynb b/test/Test_Case-ICD.ipynb deleted file mode 100644 index f94992cb3dfa89061c2019531eaacc6c6395a0fa..0000000000000000000000000000000000000000 --- a/test/Test_Case-ICD.ipynb +++ /dev/null @@ -1,329 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "df = pd.read_csv('./icd_data/input_data.csv')\n", - "df.loc[df['variable_value'] == '71970', 'variable_value'] = '7197'\n", - "df.to_csv('./icd_data/input_data.csv', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data file: ./icd_data/input_data.csv\n", - "\n", - "Input arguments:\n", - " T = 4\n", - " dt = 1.0\n", - " θ₁ = 0.001\n", - " θ₂ = 0.001\n", - " θ_freq = 1.0\n", - " k = 3 ['min', 'max', 'mean']\n", - "binarize = yes\n", - "\n", - "N = 53122\n", - "L = 4\n", - "\n", - "\n", - "================================================================================\n", - "1) Pre-filter\n", - "================================================================================\n", - "Remove rows not in population\n", - "Remove rows with t outside of [0, 4]\n", - "Remove rare variables (<= 0.001)\n", - "Total variables : 1\n", - "Rare variables : 0\n", - "Remaining variables : 1\n", - "# rows (original) : 569007\n", - "# rows (filtered) : 569007\n", - "\n", - "================================================================================\n", - "2) Transform; 3) Post-filter\n", - "================================================================================\n", - "\n", - "--------------------------------------------------------------------------------\n", - "*) Detecting and parsing value types\n", - "--------------------------------------------------------------------------------\n", - "Saved as: ./icd_data/value_types.csv\n", - "\n", - "--------------------------------------------------------------------------------\n", - "*) Separate time-invariant and time-dependent\n", - "--------------------------------------------------------------------------------\n", - "Variables (time-invariant): 1447\n", - "Variables (time-dependent): 0\n", - "# rows (time-invariant): 1265903\n", - "# rows (time-dependent): 0\n", - "\n", - "--------------------------------------------------------------------------------\n", - "2-A) Transform time-invariant data\n", - "--------------------------------------------------------------------------------\n", - "(N × ^d) table :\t (53122, 1447)\n", - "number of missing entries :\t 75601631 out of 76867534 total\n", - "Time elapsed: 8.736094 seconds\n", - "\n", - "Output\n", - "s_all, binary features :\t (53122, 1447)\n", - "Time elapsed: 115.795696 seconds\n", - "\n", - "--------------------------------------------------------------------------------\n", - "3-A) Post-filter time-invariant data\n", - "--------------------------------------------------------------------------------\n", - "Original : 
1447\n", - "Nearly-constant: 753\n", - "Correlated : 7\n", - "Time elapsed: 116.175213 seconds\n", - "\n", - "Output\n", - "s: shape=(53122, 687), density=0.034\n", - "Total time: 116.547743 seconds\n", - "\n" - ] - } - ], - "source": [ - "! PYTHONPATH=\"$PYTHONPATH:../\" \\\n", - "python -m FIDDLE.run \\\n", - " --data_path='./icd_data/' \\\n", - " --population='./icd_data/pop.csv' \\\n", - " --T=4 --dt=1.0 \\\n", - " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n", - " --stats_functions 'min' 'max' 'mean'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data file: ./icd_test/input_data.csv\n", - "\n", - "Input arguments:\n", - " T = 4\n", - " dt = 1.0\n", - " θ₁ = 0.001\n", - " θ₂ = 0.001\n", - " θ_freq = 1.0\n", - " k = 3 ['min', 'max', 'mean']\n", - "binarize = yes\n", - "\n", - "N = 200\n", - "L = 4\n", - "\n", - "\n", - "================================================================================\n", - "1) Pre-filter\n", - "================================================================================\n", - "Remove rows not in population\n", - "Remove rows with t outside of [0, 4]\n", - "Remove rare variables (<= 0.001)\n", - "Total variables : 1\n", - "Rare variables : 0\n", - "Remaining variables : 1\n", - "# rows (original) : 1861\n", - "# rows (filtered) : 1861\n", - "\n", - "================================================================================\n", - "2) Transform; 3) Post-filter\n", - "================================================================================\n", - "\n", - "--------------------------------------------------------------------------------\n", - "*) Detecting and parsing value types\n", - "--------------------------------------------------------------------------------\n", - "Saved as: ./icd_test/value_types.csv\n", - "\n", - "--------------------------------------------------------------------------------\n", - "*) Separate time-invariant and time-dependent\n", - "--------------------------------------------------------------------------------\n", - "Variables (time-invariant): 455\n", - "Variables (time-dependent): 0\n", - "# rows (time-invariant): 4205\n", - "# rows (time-dependent): 0\n", - "\n", - "--------------------------------------------------------------------------------\n", - "2-A) Transform time-invariant data\n", - "--------------------------------------------------------------------------------\n", - "(N × ^d) table :\t (200, 455)\n", - "number of missing entries :\t 86795 out of 91000 total\n", - "Time elapsed: 0.101392 seconds\n", - "\n", - "Output\n", - "s_all, binary features :\t (200, 455)\n", - "Time elapsed: 1.779821 seconds\n", - "\n", - "--------------------------------------------------------------------------------\n", - "3-A) Post-filter time-invariant data\n", - "--------------------------------------------------------------------------------\n", - "Original : 455\n", - "Nearly-constant: 0\n", - "Correlated : 87\n", - "Time elapsed: 1.820592 seconds\n", - "\n", - "Output\n", - "s: shape=(200, 368), density=0.055\n", - "Total time: 1.827327 seconds\n", - "\n" - ] - } - ], - 
"source": [ - "! PYTHONPATH=\"$PYTHONPATH:../\" \\\n", - "python -m FIDDLE.run \\\n", - " --data_path='./icd_test/' \\\n", - " --population='./icd_test/pop.csv' \\\n", - " --T=4 --dt=1.0 \\\n", - " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n", - " --stats_functions 'min' 'max' 'mean'" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import sparse\n", - "s = sparse.load_npz('./icd_test/s.npz').todense()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(200, 368)" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[0, 0, 0, ..., 0, 0, 0],\n", - " [0, 0, 0, ..., 0, 0, 0],\n", - " [0, 0, 0, ..., 0, 0, 0],\n", - " ...,\n", - " [0, 1, 0, ..., 0, 0, 0],\n", - " [0, 1, 0, ..., 0, 0, 0],\n", - " [0, 1, 1, ..., 0, 0, 0]])" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "names = json.load(open('./icd_test/s.feature_names.json', 'rb'))" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "368" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(names)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/test/Test_Case.ipynb b/test/Test_Case.ipynb deleted file mode 100644 index b249db411e8f5f04d5d3946374baaff0061cdbe4..0000000000000000000000000000000000000000 --- a/test/Test_Case.ipynb +++ /dev/null @@ -1,348 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data file: ./small_test/input_data.csv\n", - "\n", - "Input arguments:\n", - " T = 4\n", - " dt = 1.0\n", - " θ₁ = 0.001\n", - " θ₂ = 0.001\n", - " θ_freq = 1.0\n", - " k = 3 ['min', 'max', 'mean']\n", - "\n", - "N = 4\n", - "L = 4\n", - "\n", - "\n", - "================================================================================\n", - "1) Pre-filter\n", - "================================================================================\n", - "Remove rows not in population\n", - "Remove rows with t outside of [0, 4]\n", - "Remove rare variables (<= 0.001)\n", - "Total variables : 7\n", - "Rare variables : 0\n", - "Remaining variables : 7\n", - "# rows (original) : 31\n", - "# rows (filtered) : 31\n", - "\n", - "================================================================================\n", - "2) Transform; 3) Post-filter\n", - "================================================================================\n", - "\n", - 
"--------------------------------------------------------------------------------\n", - "*) Detecting value types\n", - "--------------------------------------------------------------------------------\n", - "Saved as: ./small_test/value_types.csv\n", - "\n", - "--------------------------------------------------------------------------------\n", - "*) Separate time-invariant and time-dependent\n", - "--------------------------------------------------------------------------------\n", - "Variables (time-invariant): 3\n", - "Variables (time-dependent): 4\n", - "# rows (time-invariant): 8\n", - "# rows (time-dependent): 23\n", - "\n", - "--------------------------------------------------------------------------------\n", - "2.1) Transform time-invariant data\n", - "--------------------------------------------------------------------------------\n", - "(N × ^d) table :\t (4, 3)\n", - "number of missing entries :\t 4 out of 12 total\n", - "Time elapsed: 0.017584 seconds\n", - "\n", - "Output\n", - "s_all, binary features :\t (4, 7)\n", - "Time elapsed: 0.072829 seconds\n", - "\n", - "--------------------------------------------------------------------------------\n", - "3.1) Post-filter time-invariant data\n", - "--------------------------------------------------------------------------------\n", - "Original : 7\n", - "Nearly-constant: 0\n", - "Correlated : 3\n", - "Time elapsed: 0.076066 seconds\n", - "\n", - "Output\n", - "s: shape=(4, 4), density=0.312\n", - "Total time: 0.078834 seconds\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n", - "2.2) Transform time-dependent data\n", - "--------------------------------------------------------------------------------\n", - "Total variables : 4\n", - "Frequent variables : ['HR']\n", - "M₁ = 1\n", - "M₂ = 3\n", - "k = 3 ['min', 'max', 'mean']\n", - "\n", - "Transforming each example...\n", - "[Parallel(n_jobs=72)]: Using backend LokyBackend with 72 concurrent workers.\n", - "[Parallel(n_jobs=72)]: Done 1 tasks | elapsed: 3.8s\n", - "[Parallel(n_jobs=72)]: Done 2 out of 4 | elapsed: 3.8s remaining: 3.8s\n", - "[Parallel(n_jobs=72)]: Done 4 out of 4 | elapsed: 3.9s remaining: 0.0s\n", - "[Parallel(n_jobs=72)]: Done 4 out of 4 | elapsed: 3.9s finished\n", - "(freq) number of missing entries :\t 5 out of 4×4×1=16 total\n", - "(freq) number of imputed entries :\t 4\n", - " HR 4\n", - "(freq) number of not imputed entries :\t 1\n", - " HR 1\n", - "(non-freq) number of missing entries :\t 41 out of 4×4×3=48 total\n", - "\n", - "(N × L × ^D) table :\t (4, 4, 9)\n", - "Time elapsed: 3.977742 seconds\n", - "\n", - "Discretizing features...\n", - "Processing 8 non-boolean variable columns...\n", - " Binning numeric variables by quintile...\n", - " Converting variables to binary features\n", - "[Parallel(n_jobs=72)]: Using backend LokyBackend with 72 concurrent workers.\n", - "[Parallel(n_jobs=72)]: Done 2 out of 8 | elapsed: 1.0s remaining: 3.0s\n", - "[Parallel(n_jobs=72)]: Done 3 out of 8 | elapsed: 1.0s remaining: 1.7s\n", - "[Parallel(n_jobs=72)]: Done 4 out of 8 | elapsed: 1.0s remaining: 1.0s\n", - "[Parallel(n_jobs=72)]: Done 5 out of 8 | elapsed: 1.0s remaining: 0.6s\n", - "[Parallel(n_jobs=72)]: Done 6 out of 8 | elapsed: 1.0s remaining: 0.3s\n", - "[Parallel(n_jobs=72)]: Done 8 out of 8 | elapsed: 1.1s remaining: 0.0s\n", - "[Parallel(n_jobs=72)]: Done 8 out of 8 | elapsed: 1.1s finished\n", - "Finished discretizing features\n", - "\n", - "Output\n", - "X_all: shape=(4, 4, 29), 
density=0.203\n", - "Time elapsed: 5.103915 seconds\n", - "\n", - "--------------------------------------------------------------------------------\n", - "3.2) Post-filter time-dependent data\n", - "--------------------------------------------------------------------------------\n", - "(4, 4, 29) 0.2025862068965517\n", - "Original : 29\n", - "Nearly-constant: 0\n", - "*** time: 2.486790657043457\n", - "Correlated : 15\n", - "*** time: 4.358332395553589\n", - "\n", - "Output\n", - "X: shape=(4, 4, 14), density=0.237\n", - "(4, 4, 14) 0.23660714285714285\n", - "Time elapsed: 9.462556 seconds\n", - "\n", - "Output\n", - "X: shape=(4, 4, 14), density=0.237\n", - "Total time: 9.466846 seconds\n", - "\n" - ] - } - ], - "source": [ - "! PYTHONPATH=\"$PYTHONPATH:../\" \\\n", - "python -m FIDDLE.run \\\n", - " --data_path='./small_test/' \\\n", - " --population='./small_test/pop.csv' \\\n", - " --T=4 --dt=1.0 \\\n", - " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n", - " --stats_functions 'min' 'max' 'mean'" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input data file: ./large_test/input_data.csv\n", - "\n", - "Input arguments:\n", - " T = 4\n", - " dt = 1.0\n", - " θ₁ = 0.001\n", - " θ₂ = 0.001\n", - " θ_freq = 1.0\n", - " k = 3 ['min', 'max', 'mean']\n", - "\n", - "N = 200\n", - "L = 4\n", - "\n", - "\n", - "================================================================================\n", - "1) Pre-filter\n", - "================================================================================\n", - "Remove rows not in population\n", - "Remove rows with t outside of [0, 4]\n", - "Remove rare variables (<= 0.001)\n", - "Total variables : 1970\n", - "Rare variables : 0\n", - "Remaining variables : 1970\n", - "# rows (original) : 64777\n", - "# rows (filtered) : 64777\n", - "\n", - "================================================================================\n", - "2) Transform; 3) Post-filter\n", - "================================================================================\n", - "\n", - "--------------------------------------------------------------------------------\n", - "*) Detecting value types\n", - "--------------------------------------------------------------------------------\n", - "Saved as: ./large_test/value_types.csv\n", - "\n", - "--------------------------------------------------------------------------------\n", - "*) Separate time-invariant and time-dependent\n", - "--------------------------------------------------------------------------------\n", - "Variables (time-invariant): 12\n", - "Variables (time-dependent): 1958\n", - "# rows (time-invariant): 2400\n", - "# rows (time-dependent): 62377\n", - "\n", - "--------------------------------------------------------------------------------\n", - "2.1) Transform time-invariant data\n", - "--------------------------------------------------------------------------------\n", - "(N × ^d) table :\t (200, 12)\n", - "number of missing entries :\t 4 out of 2400 total\n", - "Time elapsed: 0.021392 seconds\n", - "\n", - "Output\n", - "s_all, binary features :\t (200, 84)\n", - "Time elapsed: 0.216294 seconds\n", - "\n", - "--------------------------------------------------------------------------------\n", - "3.1) Post-filter time-invariant data\n", - "--------------------------------------------------------------------------------\n", - "Original : 84\n", - "Nearly-constant: 0\n", - "Correlated : 7\n", - "Time 
elapsed: 0.221074 seconds\n", - "\n", - "Output\n", - "s: shape=(200, 77), density=0.145\n", - "Total time: 0.225575 seconds\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n", - "2.2) Transform time-dependent data\n", - "--------------------------------------------------------------------------------\n", - "Total variables : 1958\n", - "Frequent variables : ['DiaBP', 'HR', 'RR', 'SpO2', 'SysBP']\n", - "M₁ = 5\n", - "M₂ = 1953\n", - "k = 3 ['min', 'max', 'mean']\n", - "\n", - "Transforming each example...\n", - "[Parallel(n_jobs=72)]: Using backend LokyBackend with 72 concurrent workers.\n", - "[Parallel(n_jobs=72)]: Done 1 tasks | elapsed: 4.8s\n", - "[Parallel(n_jobs=72)]: Done 18 tasks | elapsed: 8.0s\n", - "[Parallel(n_jobs=72)]: Done 37 tasks | elapsed: 9.3s\n", - "[Parallel(n_jobs=72)]: Done 56 tasks | elapsed: 10.3s\n", - "[Parallel(n_jobs=72)]: Done 78 out of 200 | elapsed: 11.5s remaining: 18.0s\n", - "[Parallel(n_jobs=72)]: Done 99 out of 200 | elapsed: 12.7s remaining: 13.0s\n", - "[Parallel(n_jobs=72)]: Done 120 out of 200 | elapsed: 13.8s remaining: 9.2s\n", - "[Parallel(n_jobs=72)]: Done 141 out of 200 | elapsed: 14.8s remaining: 6.2s\n", - "[Parallel(n_jobs=72)]: Done 162 out of 200 | elapsed: 15.4s remaining: 3.6s\n", - "[Parallel(n_jobs=72)]: Done 183 out of 200 | elapsed: 16.1s remaining: 1.5s\n", - "[Parallel(n_jobs=72)]: Done 200 out of 200 | elapsed: 16.8s finished\n", - "(freq) number of missing entries :\t 996 out of 200×4×5=4000 total\n", - "(freq) number of imputed entries :\t 58\n", - " DiaBP 17\n", - " HR 5\n", - " RR 6\n", - " SpO2 13\n", - " SysBP 17\n", - "(freq) number of not imputed entries :\t 938\n", - " DiaBP 190\n", - " HR 180\n", - " RR 183\n", - " SpO2 195\n", - " SysBP 190\n", - "(non-freq) number of missing entries :\t 1510389 out of 200×4×1953=1562400 total\n", - "\n", - "(N × L × ^D) table :\t (200, 4, 1983)\n", - "Time elapsed: 19.099867 seconds\n", - "\n", - "Discretizing features...\n", - "Processing 1978 non-boolean variable columns...\n", - " Binning numeric variables by quintile...\n", - " Converting variables to binary features\n", - "[Parallel(n_jobs=72)]: Using backend LokyBackend with 72 concurrent workers.\n", - "[Parallel(n_jobs=72)]: Done 1 tasks | elapsed: 0.0s\n", - "[Parallel(n_jobs=72)]: Batch computation too fast (0.0419s.) Setting batch_size=8.\n", - "[Parallel(n_jobs=72)]: Batch computation too fast (0.0419s.) Setting batch_size=76.\n", - "[Parallel(n_jobs=72)]: Done 9 tasks | elapsed: 0.2s\n", - "[Parallel(n_jobs=72)]: Done 20 tasks | elapsed: 0.7s\n", - "[Parallel(n_jobs=72)]: Done 1978 out of 1978 | elapsed: 6.7s finished\n", - "Finished discretizing features\n", - "\n", - "Output\n", - "X_all: shape=(200, 4, 3406), density=0.026\n", - "Time elapsed: 26.408678 seconds\n", - "\n", - "--------------------------------------------------------------------------------\n", - "3.2) Post-filter time-dependent data\n", - "--------------------------------------------------------------------------------\n", - "(200, 4, 3406) 0.026153479154433352\n", - "Original : 3406\n", - "Nearly-constant: 5\n", - "*** time: 3.5170133113861084\n", - "Correlated : 1102\n", - "*** time: 7.688496828079224\n", - "\n", - "Output\n", - "X: shape=(200, 4, 2299), density=0.034\n", - "(200, 4, 2299) 0.034270334928229666\n", - "Time elapsed: 34.102943 seconds\n", - "\n", - "Output\n", - "X: shape=(200, 4, 2299), density=0.034\n", - "Total time: 34.251790 seconds\n", - "\n" - ] - } - ], - "source": [ - "! 
PYTHONPATH=\"$PYTHONPATH:../\" \\\n", - "python -m FIDDLE.run \\\n", - " --data_path='./large_test/' \\\n", - " --population='./large_test/pop.csv' \\\n", - " --T=4 --dt=1.0 \\\n", - " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n", - " --stats_functions 'min' 'max' 'mean'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tests/icd_test/Run.ipynb b/tests/icd_test/Run.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..c83b13c17e3f8b4b19b69af4c159bd46c62c47fd --- /dev/null +++ b/tests/icd_test/Run.ipynb @@ -0,0 +1,1924 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "zsh:1: no matches found: output-*/\n" + ] + } + ], + "source": [ + "!rm -rf output-*/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test: levels = [0]" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p output-0" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input:\n", + " Data : ./input/data.csv\n", + " Population: ./input/pop.csv\n", + " Config : ./input/config-0.yaml\n", + "\n", + "Output directory: ./output-0/\n", + "\n", + "Input arguments:\n", + " T = 4.0\n", + " dt = 1.0\n", + " θ₁ = 0.001\n", + " θ₂ = 0.001\n", + " θ_freq = 1.0\n", + " k = 3 ['min', 'max', 'mean']\n", + "\n", + "discretize = yes\n", + " S discretization bins: to be computed from data\n", + " X discretization bins: to be computed from data\n", + "\n", + "N = 200\n", + "L = 4\n", + "\n", + "\n", + "================================================================================\n", + "1) Pre-filter\n", + "================================================================================\n", + "Remove rows not in population\n", + "Remove rows with t outside of [0, 4]\n", + "Remove rare variables (<= 0.001)\n", + "Total variables : 1\n", + "Rare variables : 0\n", + "Remaining variables : 1\n", + "# rows (original) : 1861\n", + "# rows (filtered) : 1861\n", + "\n", + "================================================================================\n", + "2) Transform; 3) Post-filter\n", + "================================================================================\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Detecting and parsing value types\n", + "--------------------------------------------------------------------------------\n", + "Saved as: ./output-0/value_types.csv\n", + "/Users/shengputang/Developer/FIDDLE/FIDDLE/steps.py:148: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: 
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df_non_num_dup[var_col] = df_non_num_dup[var_col].astype(str) + ':' + df_non_num_dup[val_col].astype(str)\n", + "/Users/shengputang/Developer/FIDDLE/FIDDLE/steps.py:149: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df_non_num_dup[val_col] = 1\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Separate time-invariant and time-dependent\n", + "--------------------------------------------------------------------------------\n", + "Variables (time-invariant): 19\n", + "Variables (time-dependent): 0\n", + "# rows (time-invariant): 984\n", + "# rows (time-dependent): 0\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-A) Transform time-invariant data\n", + "--------------------------------------------------------------------------------\n", + "(N × ^d) table :\t (200, 19)\n", + "number of missing entries :\t 2816 out of 3800 total\n", + "Time elapsed: 0.025395 seconds\n", + "\n", + "Output\n", + "S_all, binary features :\t (200, 21)\n", + "Time elapsed: 0.171098 seconds\n", + "\n", + "--------------------------------------------------------------------------------\n", + "3-A) Post-filter time-invariant data\n", + "--------------------------------------------------------------------------------\n", + "Original : 21\n", + "Nearly-constant: 0\n", + "Correlated : 0\n", + "Time elapsed: 0.178303 seconds\n", + "\n", + "Output\n", + "S: shape=(200, 21), density=0.234\n", + "Total time: 0.180898 seconds\n", + "\n", + "\u001b[0m" + ] + } + ], + "source": [ + "! PYTHONPATH=\"$PYTHONPATH:../../\" \\\n", + "python -m FIDDLE.run \\\n", + " --data_fname='./input/data.csv' \\\n", + " --population_fname='./input/pop.csv' \\\n", + " --config_fname='./input/config-0.yaml' \\\n", + " --output_dir='./output-0/' \\\n", + " --T=4 --dt=1.0 \\\n", + " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n", + " --stats_functions 'min' 'max' 'mean'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
(HTML table rendering omitted; identical to the text/plain output below: 200 rows × 21 columns)
" + ], + "text/plain": [ + " ICD9_CODE_value_140-239 ICD9_CODE_value_390-459 \\\n", + "ID \n", + "100536 0 0 \n", + "101757 0 0 \n", + "102631 0 0 \n", + "103251 0 0 \n", + "104130 0 0 \n", + "... ... ... \n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 0 0 \n", + "199634 0 0 \n", + "199724 0 0 \n", + "\n", + " ICD9_CODE_value_V01-V91 ICD9_CODE:001-139_value_1 \\\n", + "ID \n", + "100536 0 1 \n", + "101757 0 1 \n", + "102631 0 0 \n", + "103251 1 0 \n", + "104130 0 1 \n", + "... ... ... \n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 0 0 \n", + "199634 0 1 \n", + "199724 0 1 \n", + "\n", + " ICD9_CODE:140-239_value_1 ICD9_CODE:240-279_value_1 \\\n", + "ID \n", + "100536 0 0 \n", + "101757 1 0 \n", + "102631 0 0 \n", + "103251 0 0 \n", + "104130 0 1 \n", + "... ... ... \n", + "197661 0 1 \n", + "198214 0 1 \n", + "198892 0 0 \n", + "199634 1 1 \n", + "199724 0 1 \n", + "\n", + " ICD9_CODE:280-289_value_1 ICD9_CODE:290-319_value_1 \\\n", + "ID \n", + "100536 1 1 \n", + "101757 1 1 \n", + "102631 0 0 \n", + "103251 0 0 \n", + "104130 1 1 \n", + "... ... ... \n", + "197661 0 0 \n", + "198214 1 0 \n", + "198892 0 0 \n", + "199634 0 0 \n", + "199724 0 1 \n", + "\n", + " ICD9_CODE:320-389_value_1 ICD9_CODE:390-459_value_1 ... \\\n", + "ID ... \n", + "100536 0 0 ... \n", + "101757 1 1 ... \n", + "102631 0 1 ... \n", + "103251 0 0 ... \n", + "104130 0 1 ... \n", + "... ... ... ... \n", + "197661 0 1 ... \n", + "198214 1 1 ... \n", + "198892 0 0 ... \n", + "199634 0 1 ... \n", + "199724 1 1 ... \n", + "\n", + " ICD9_CODE:520-579_value_1 ICD9_CODE:580-629_value_1 \\\n", + "ID \n", + "100536 1 0 \n", + "101757 0 0 \n", + "102631 0 0 \n", + "103251 0 0 \n", + "104130 0 1 \n", + "... ... ... \n", + "197661 0 1 \n", + "198214 1 0 \n", + "198892 0 0 \n", + "199634 1 0 \n", + "199724 0 0 \n", + "\n", + " ICD9_CODE:680-709_value_1 ICD9_CODE:710-739_value_1 \\\n", + "ID \n", + "100536 0 0 \n", + "101757 1 0 \n", + "102631 0 0 \n", + "103251 0 0 \n", + "104130 0 0 \n", + "... ... ... \n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 0 0 \n", + "199634 0 1 \n", + "199724 0 0 \n", + "\n", + " ICD9_CODE:740-759_value_1 ICD9_CODE:760-779_value_1 \\\n", + "ID \n", + "100536 0 0 \n", + "101757 0 0 \n", + "102631 0 0 \n", + "103251 0 0 \n", + "104130 0 0 \n", + "... ... ... \n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 0 1 \n", + "199634 0 0 \n", + "199724 0 0 \n", + "\n", + " ICD9_CODE:780-799_value_1 ICD9_CODE:800-999_value_1 \\\n", + "ID \n", + "100536 0 1 \n", + "101757 1 0 \n", + "102631 0 1 \n", + "103251 0 0 \n", + "104130 1 0 \n", + "... ... ... \n", + "197661 0 1 \n", + "198214 0 0 \n", + "198892 0 0 \n", + "199634 0 0 \n", + "199724 0 1 \n", + "\n", + " ICD9_CODE:E000-E999_value_1 ICD9_CODE:V01-V91_value_1 \n", + "ID \n", + "100536 1 0 \n", + "101757 0 1 \n", + "102631 0 0 \n", + "103251 0 0 \n", + "104130 0 1 \n", + "... ... ... 
\n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 0 1 \n", + "199634 0 1 \n", + "199724 0 0 \n", + "\n", + "[200 rows x 21 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import json\n", + "import sparse\n", + "\n", + "S = sparse.load_npz('output-0/S_all.npz')\n", + "S_names = json.load(open('output-0/S_all.feature_names.json', 'r'))\n", + "S_index = pd.read_csv('output-0/S.ID.csv').set_index(['ID'])\n", + "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n", + "\n", + "display(df_S)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test: levels = [0,1]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p output-1" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input:\n", + " Data : ./input/data.csv\n", + " Population: ./input/pop.csv\n", + " Config : ./input/config-1.yaml\n", + "\n", + "Output directory: ./output-1/\n", + "\n", + "Input arguments:\n", + " T = 4.0\n", + " dt = 1.0\n", + " θ₁ = 0.001\n", + " θ₂ = 0.001\n", + " θ_freq = 1.0\n", + " k = 3 ['min', 'max', 'mean']\n", + "\n", + "discretize = yes\n", + " S discretization bins: to be computed from data\n", + " X discretization bins: to be computed from data\n", + "\n", + "N = 200\n", + "L = 4\n", + "\n", + "\n", + "================================================================================\n", + "1) Pre-filter\n", + "================================================================================\n", + "Remove rows not in population\n", + "Remove rows with t outside of [0, 4]\n", + "Remove rare variables (<= 0.001)\n", + "Total variables : 1\n", + "Rare variables : 0\n", + "Remaining variables : 1\n", + "# rows (original) : 1861\n", + "# rows (filtered) : 1861\n", + "\n", + "================================================================================\n", + "2) Transform; 3) Post-filter\n", + "================================================================================\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Detecting and parsing value types\n", + "--------------------------------------------------------------------------------\n", + "Saved as: ./output-1/value_types.csv\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Separate time-invariant and time-dependent\n", + "--------------------------------------------------------------------------------\n", + "Variables (time-invariant): 129\n", + "Variables (time-dependent): 0\n", + "# rows (time-invariant): 2463\n", + "# rows (time-dependent): 0\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-A) Transform time-invariant data\n", + "--------------------------------------------------------------------------------\n", + "(N × ^d) table :\t (200, 129)\n", + "number of missing entries :\t 23337 out of 25800 total\n", + "Time elapsed: 0.057711 seconds\n", + "\n", + "Output\n", + "S_all, binary features :\t (200, 129)\n", + "Time elapsed: 0.830818 seconds\n", + "\n", + "--------------------------------------------------------------------------------\n", + "3-A) Post-filter time-invariant data\n", + 
"--------------------------------------------------------------------------------\n", + "Original : 129\n", + "Nearly-constant: 0\n", + "Correlated : 2\n", + "Time elapsed: 0.840801 seconds\n", + "\n", + "Output\n", + "S: shape=(200, 127), density=0.097\n", + "Total time: 0.844234 seconds\n", + "\n", + "\u001b[0m" + ] + } + ], + "source": [ + "! PYTHONPATH=\"$PYTHONPATH:../../\" \\\n", + "python -m FIDDLE.run \\\n", + " --data_fname='./input/data.csv' \\\n", + " --population_fname='./input/pop.csv' \\\n", + " --config_fname='./input/config-1.yaml' \\\n", + " --output_dir='./output-1/' \\\n", + " --T=4 --dt=1.0 \\\n", + " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n", + " --stats_functions 'min' 'max' 'mean'" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
(HTML table rendering omitted; identical to the text/plain output below: 200 rows × 129 columns)
" + ], + "text/plain": [ + " ICD9_CODE:001-009_value_1.0 ICD9_CODE:001-139_value_1.0 \\\n", + "ID \n", + "100536 0 1 \n", + "101757 0 1 \n", + "102631 0 0 \n", + "103251 0 0 \n", + "104130 0 1 \n", + "... ... ... \n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 0 0 \n", + "199634 0 1 \n", + "199724 0 1 \n", + "\n", + " ICD9_CODE:030-041_value_1.0 ICD9_CODE:042-042_value_1.0 \\\n", + "ID \n", + "100536 0 1 \n", + "101757 0 0 \n", + "102631 0 0 \n", + "103251 0 0 \n", + "104130 1 0 \n", + "... ... ... \n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 0 0 \n", + "199634 0 0 \n", + "199724 1 0 \n", + "\n", + " ICD9_CODE:070-079_value_1.0 ICD9_CODE:080-088_value_1.0 \\\n", + "ID \n", + "100536 1 0 \n", + "101757 0 0 \n", + "102631 0 0 \n", + "103251 0 0 \n", + "104130 0 0 \n", + "... ... ... \n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 0 0 \n", + "199634 0 0 \n", + "199724 0 0 \n", + "\n", + " ICD9_CODE:110-118_value_1.0 ICD9_CODE:130-136_value_1.0 \\\n", + "ID \n", + "100536 0 0 \n", + "101757 1 0 \n", + "102631 0 0 \n", + "103251 0 0 \n", + "104130 0 0 \n", + "... ... ... \n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 0 0 \n", + "199634 0 1 \n", + "199724 0 0 \n", + "\n", + " ICD9_CODE:140-239_value_1.0 ICD9_CODE:150-159_value_1.0 ... \\\n", + "ID ... \n", + "100536 0 0 ... \n", + "101757 1 0 ... \n", + "102631 0 0 ... \n", + "103251 0 0 ... \n", + "104130 0 0 ... \n", + "... ... ... ... \n", + "197661 0 0 ... \n", + "198214 0 0 ... \n", + "198892 0 0 ... \n", + "199634 1 0 ... \n", + "199724 0 0 ... \n", + "\n", + " ICD9_CODE:E960-E969_value_1.0 ICD9_CODE:V01-V09_value_1.0 \\\n", + "ID \n", + "100536 0 0 \n", + "101757 0 0 \n", + "102631 0 0 \n", + "103251 0 1 \n", + "104130 0 0 \n", + "... ... ... \n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 0 0 \n", + "199634 0 0 \n", + "199724 0 0 \n", + "\n", + " ICD9_CODE:V01-V91_value_1.0 ICD9_CODE:V10-V19_value_1.0 \\\n", + "ID \n", + "100536 0 0 \n", + "101757 1 1 \n", + "102631 0 0 \n", + "103251 1 0 \n", + "104130 1 0 \n", + "... ... ... \n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 1 0 \n", + "199634 1 0 \n", + "199724 0 0 \n", + "\n", + " ICD9_CODE:V20-V29_value_1.0 ICD9_CODE:V30-V39_value_1.0 \\\n", + "ID \n", + "100536 0 0 \n", + "101757 0 0 \n", + "102631 0 0 \n", + "103251 1 1 \n", + "104130 0 0 \n", + "... ... ... \n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 0 1 \n", + "199634 0 0 \n", + "199724 0 0 \n", + "\n", + " ICD9_CODE:V40-V49_value_1.0 ICD9_CODE:V50-V59_value_1.0 \\\n", + "ID \n", + "100536 0 0 \n", + "101757 0 0 \n", + "102631 0 0 \n", + "103251 0 1 \n", + "104130 1 0 \n", + "... ... ... \n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 0 0 \n", + "199634 0 1 \n", + "199724 0 0 \n", + "\n", + " ICD9_CODE:V60-V69_value_1.0 ICD9_CODE:V70-V82_value_1.0 \n", + "ID \n", + "100536 0 0 \n", + "101757 0 0 \n", + "102631 0 0 \n", + "103251 0 1 \n", + "104130 0 0 \n", + "... ... ... 
\n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 0 0 \n", + "199634 0 0 \n", + "199724 0 0 \n", + "\n", + "[200 rows x 129 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import json\n", + "import sparse\n", + "\n", + "S = sparse.load_npz('output-1/S_all.npz')\n", + "S_names = json.load(open('output-1/S_all.feature_names.json', 'r'))\n", + "S_index = pd.read_csv('output-1/S.ID.csv').set_index(['ID'])\n", + "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n", + "\n", + "display(df_S)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test: levels = [0,1,2]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p output-2" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input:\n", + " Data : ./input/data.csv\n", + " Population: ./input/pop.csv\n", + " Config : ./input/config-2.yaml\n", + "\n", + "Output directory: ./output-2/\n", + "\n", + "Input arguments:\n", + " T = 4.0\n", + " dt = 1.0\n", + " θ₁ = 0.001\n", + " θ₂ = 0.001\n", + " θ_freq = 1.0\n", + " k = 3 ['min', 'max', 'mean']\n", + "\n", + "discretize = yes\n", + " S discretization bins: to be computed from data\n", + " X discretization bins: to be computed from data\n", + "\n", + "N = 200\n", + "L = 4\n", + "\n", + "\n", + "================================================================================\n", + "1) Pre-filter\n", + "================================================================================\n", + "Remove rows not in population\n", + "Remove rows with t outside of [0, 4]\n", + "Remove rare variables (<= 0.001)\n", + "Total variables : 1\n", + "Rare variables : 0\n", + "Remaining variables : 1\n", + "# rows (original) : 1861\n", + "# rows (filtered) : 1861\n", + "\n", + "================================================================================\n", + "2) Transform; 3) Post-filter\n", + "================================================================================\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Detecting and parsing value types\n", + "--------------------------------------------------------------------------------\n", + "Saved as: ./output-2/value_types.csv\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Separate time-invariant and time-dependent\n", + "--------------------------------------------------------------------------------\n", + "Variables (time-invariant): 455\n", + "Variables (time-dependent): 0\n", + "# rows (time-invariant): 4205\n", + "# rows (time-dependent): 0\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-A) Transform time-invariant data\n", + "--------------------------------------------------------------------------------\n", + "(N × ^d) table :\t (200, 455)\n", + "number of missing entries :\t 86795 out of 91000 total\n", + "Time elapsed: 0.112510 seconds\n", + "\n", + "Output\n", + "S_all, binary features :\t (200, 455)\n", + "Time elapsed: 2.377939 seconds\n", + "\n", + "--------------------------------------------------------------------------------\n", + "3-A) Post-filter time-invariant data\n", + 
"--------------------------------------------------------------------------------\n", + "Original : 455\n", + "Nearly-constant: 0\n", + "Correlated : 87\n", + "Time elapsed: 2.428499 seconds\n", + "\n", + "Output\n", + "S: shape=(200, 368), density=0.055\n", + "Total time: 2.435949 seconds\n", + "\n", + "\u001b[0m" + ] + } + ], + "source": [ + "! PYTHONPATH=\"$PYTHONPATH:../../\" \\\n", + "python -m FIDDLE.run \\\n", + " --data_fname='./input/data.csv' \\\n", + " --population_fname='./input/pop.csv' \\\n", + " --config_fname='./input/config-2.yaml' \\\n", + " --output_dir='./output-2/' \\\n", + " --T=4 --dt=1.0 \\\n", + " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n", + " --stats_functions 'min' 'max' 'mean'" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
(HTML table rendering omitted; identical to the text/plain output below: 200 rows × 455 columns)
" + ], + "text/plain": [ + " ICD9_CODE:001-009_value_1.0 ICD9_CODE:001-139_value_1.0 \\\n", + "ID \n", + "100536 0 1 \n", + "101757 0 1 \n", + "102631 0 0 \n", + "103251 0 0 \n", + "104130 0 1 \n", + "... ... ... \n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 0 0 \n", + "199634 0 1 \n", + "199724 0 1 \n", + "\n", + " ICD9_CODE:008_value_1.0 ICD9_CODE:030-041_value_1.0 \\\n", + "ID \n", + "100536 0 0 \n", + "101757 0 0 \n", + "102631 0 0 \n", + "103251 0 0 \n", + "104130 0 1 \n", + "... ... ... \n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 0 0 \n", + "199634 0 0 \n", + "199724 0 1 \n", + "\n", + " ICD9_CODE:038_value_1.0 ICD9_CODE:041_value_1.0 \\\n", + "ID \n", + "100536 0 0 \n", + "101757 0 0 \n", + "102631 0 0 \n", + "103251 0 0 \n", + "104130 0 1 \n", + "... ... ... \n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 0 0 \n", + "199634 0 0 \n", + "199724 1 0 \n", + "\n", + " ICD9_CODE:042_value_1.0 ICD9_CODE:042-042_value_1.0 \\\n", + "ID \n", + "100536 1 1 \n", + "101757 0 0 \n", + "102631 0 0 \n", + "103251 0 0 \n", + "104130 0 0 \n", + "... ... ... \n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 0 0 \n", + "199634 0 0 \n", + "199724 0 0 \n", + "\n", + " ICD9_CODE:070_value_1.0 ICD9_CODE:070-079_value_1.0 ... \\\n", + "ID ... \n", + "100536 1 1 ... \n", + "101757 0 0 ... \n", + "102631 0 0 ... \n", + "103251 0 0 ... \n", + "104130 0 0 ... \n", + "... ... ... ... \n", + "197661 0 0 ... \n", + "198214 0 0 ... \n", + "198892 0 0 ... \n", + "199634 0 0 ... \n", + "199724 0 0 ... \n", + "\n", + " ICD9_CODE:V49_value_1.0 ICD9_CODE:V50_value_1.0 \\\n", + "ID \n", + "100536 0 0 \n", + "101757 0 0 \n", + "102631 0 0 \n", + "103251 0 1 \n", + "104130 0 0 \n", + "... ... ... \n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 0 0 \n", + "199634 0 0 \n", + "199724 0 0 \n", + "\n", + " ICD9_CODE:V50-V59_value_1.0 ICD9_CODE:V54_value_1.0 \\\n", + "ID \n", + "100536 0 0 \n", + "101757 0 0 \n", + "102631 0 0 \n", + "103251 1 0 \n", + "104130 0 0 \n", + "... ... ... \n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 0 0 \n", + "199634 1 0 \n", + "199724 0 0 \n", + "\n", + " ICD9_CODE:V58_value_1.0 ICD9_CODE:V60-V69_value_1.0 \\\n", + "ID \n", + "100536 0 0 \n", + "101757 0 0 \n", + "102631 0 0 \n", + "103251 0 0 \n", + "104130 0 0 \n", + "... ... ... \n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 0 0 \n", + "199634 1 0 \n", + "199724 0 0 \n", + "\n", + " ICD9_CODE:V64_value_1.0 ICD9_CODE:V66_value_1.0 \\\n", + "ID \n", + "100536 0 0 \n", + "101757 0 0 \n", + "102631 0 0 \n", + "103251 0 0 \n", + "104130 0 0 \n", + "... ... ... \n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 0 0 \n", + "199634 0 0 \n", + "199724 0 0 \n", + "\n", + " ICD9_CODE:V70-V82_value_1.0 ICD9_CODE:V72_value_1.0 \n", + "ID \n", + "100536 0 0 \n", + "101757 0 0 \n", + "102631 0 0 \n", + "103251 1 1 \n", + "104130 0 0 \n", + "... ... ... 
\n", + "197661 0 0 \n", + "198214 0 0 \n", + "198892 0 0 \n", + "199634 0 0 \n", + "199724 0 0 \n", + "\n", + "[200 rows x 455 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import json\n", + "import sparse\n", + "\n", + "S = sparse.load_npz('output-2/S_all.npz')\n", + "S_names = json.load(open('output-2/S_all.feature_names.json', 'r'))\n", + "S_index = pd.read_csv('output-2/S.ID.csv').set_index(['ID'])\n", + "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n", + "\n", + "display(df_S)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tests/icd_test/input/config-0.yaml b/tests/icd_test/input/config-0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..913d9292b81922c79edb8e62a0e5f6c87efdda27 --- /dev/null +++ b/tests/icd_test/input/config-0.yaml @@ -0,0 +1,2 @@ +hierarchical_sep: ':' +hierarchical_levels: [0] diff --git a/tests/icd_test/input/config-1.yaml b/tests/icd_test/input/config-1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2c6ecce9bfea06c237f27be76622bf9699e9898d --- /dev/null +++ b/tests/icd_test/input/config-1.yaml @@ -0,0 +1,2 @@ +hierarchical_sep: ':' +hierarchical_levels: [0,1] diff --git a/tests/icd_test/input/config-2.yaml b/tests/icd_test/input/config-2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9c6d4af6b14efe802a5f423eac094966cdc95d7c --- /dev/null +++ b/tests/icd_test/input/config-2.yaml @@ -0,0 +1,2 @@ +hierarchical_sep: ':' +hierarchical_levels: [0,1,2] diff --git a/test/icd_test/input_data.csv b/tests/icd_test/input/data.csv similarity index 100% rename from test/icd_test/input_data.csv rename to tests/icd_test/input/data.csv diff --git a/test/icd_test/icd_test_data.csv b/tests/icd_test/input/icd_test_data.csv similarity index 100% rename from test/icd_test/icd_test_data.csv rename to tests/icd_test/input/icd_test_data.csv diff --git a/test/icd_test/pop.csv b/tests/icd_test/input/pop.csv similarity index 100% rename from test/icd_test/pop.csv rename to tests/icd_test/input/pop.csv diff --git a/tests/large_test/Run.ipynb b/tests/large_test/Run.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..3033773a2fa628af2ea9d243f358ffcc6b159af6 --- /dev/null +++ b/tests/large_test/Run.ipynb @@ -0,0 +1,3664 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf output-*/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 1: discretize = True, use_ordinal_encoding = False" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p output-1-parallel" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input:\n", + " Data : ./input/data.csv\n", + " Population: ./input/pop.csv\n", + " Config : 
./input/config-1-parallel.yaml\n", + "\n", + "Output directory: ./output-1-parallel/\n", + "\n", + "Input arguments:\n", + " T = 4.0\n", + " dt = 1.0\n", + " θ₁ = 0.001\n", + " θ₂ = 0.001\n", + " θ_freq = 1.0\n", + " k = 3 ['min', 'max', 'mean']\n", + "\n", + "discretize = yes\n", + " S discretization bins: to be computed from data\n", + " X discretization bins: to be computed from data\n", + "\n", + "N = 200\n", + "L = 4\n", + "\n", + "\n", + "================================================================================\n", + "1) Pre-filter\n", + "================================================================================\n", + "Remove rows not in population\n", + "Remove rows with t outside of [0, 4]\n", + "Remove rare variables (<= 0.001)\n", + "Total variables : 1970\n", + "Rare variables : 0\n", + "Remaining variables : 1970\n", + "# rows (original) : 64777\n", + "# rows (filtered) : 64777\n", + "\n", + "================================================================================\n", + "2) Transform; 3) Post-filter\n", + "================================================================================\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Detecting and parsing value types\n", + "--------------------------------------------------------------------------------\n", + "Saved as: ./output-1-parallel/value_types.csv\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Separate time-invariant and time-dependent\n", + "--------------------------------------------------------------------------------\n", + "Variables (time-invariant): 12\n", + "Variables (time-dependent): 1958\n", + "# rows (time-invariant): 2400\n", + "# rows (time-dependent): 62377\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-A) Transform time-invariant data\n", + "--------------------------------------------------------------------------------\n", + "(N × ^d) table :\t (200, 12)\n", + "number of missing entries :\t 4 out of 2400 total\n", + "Time elapsed: 0.030966 seconds\n", + "\n", + "Output\n", + "S_all, binary features :\t (200, 84)\n", + "Time elapsed: 0.226954 seconds\n", + "\n", + "--------------------------------------------------------------------------------\n", + "3-A) Post-filter time-invariant data\n", + "--------------------------------------------------------------------------------\n", + "Original : 84\n", + "Nearly-constant: 0\n", + "Correlated : 7\n", + "Time elapsed: 0.232384 seconds\n", + "\n", + "Output\n", + "S: shape=(200, 77), density=0.145\n", + "Total time: 0.235823 seconds\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-B) Transform time-dependent data\n", + "--------------------------------------------------------------------------------\n", + "Total variables : 1958\n", + "Frequent variables : ['DiaBP', 'HR', 'RR', 'SpO2', 'SysBP']\n", + "M₁ = 5\n", + "M₂ = 1953\n", + "k = 3 ['min', 'max', 'mean']\n", + "\n", + "Transforming each example...\n", + "Batches of size 100: 2\n", + "100%|█████████████████████████████████████████████| 2/2 [00:38<00:00, 19.13s/it]\n", + "\u001b[0m\u001b[0m\u001b[0m\u001b[0m\n", + "Parallel processing done\n", + "DONE: Transforming each example...\n", + "(freq) number of missing entries :\t 996 out of 200×4×5=4000 total\n", + "(freq) number of imputed entries :\t 58\n", + "(freq) number of not imputed entries :\t 
938\n", + "(non-freq) number of missing entries :\t 1510389 out of 200×4×1953=1562400 total\n", + "\n", + "(N × L × ^D) table :\t (200, 4, 1983)\n", + "Time elapsed: 39.815167 seconds\n", + "Discretizing features...\n", + "\n", + "Processing 1978 non-boolean variable columns...\n", + " Computing bin edges for numeric variables...\n", + "100%|██████████████████████████████████████| 1978/1978 [00:06<00:00, 328.38it/s]\n", + " Discretizing variables to binary features\n", + "100%|██████████████████████████████████████| 1978/1978 [00:09<00:00, 201.94it/s]\n", + "\u001b[0m\u001b[0m\u001b[0m\u001b[0mFinished discretizing features\n", + "\n", + "Output\n", + "X_all: shape=(200, 4, 3557), density=0.025\n", + "Time elapsed: 57.075922 seconds\n", + "\n", + "--------------------------------------------------------------------------------\n", + "3-B) Post-filter time-dependent data\n", + "--------------------------------------------------------------------------------\n", + "(200, 4, 3557) 0.02504322462749508\n", + "Original : 3557\n", + "Nearly-constant: 77\n", + "*** time: 8.839366912841797\n", + "Correlated : 1137\n", + "*** time: 16.099601984024048\n", + "\n", + "Output\n", + "X: shape=(200, 4, 2343), density=0.034\n", + "(200, 4, 2343) 0.03446382842509603\n", + "Time elapsed: 73.185729 seconds\n", + "\n", + "Output\n", + "X: shape=(200, 4, 2343), density=0.034\n", + "Total time: 73.237736 seconds\n", + "\n", + "\u001b[0m" + ] + } + ], + "source": [ + "! PYTHONPATH=\"$PYTHONPATH:../../\" \\\n", + "python -m FIDDLE.run \\\n", + " --data_fname='./input/data.csv' \\\n", + " --population_fname='./input/pop.csv' \\\n", + " --config_fname='./input/config-1-parallel.yaml' \\\n", + " --output_dir='./output-1-parallel/' \\\n", + " --T=4 --dt=1.0 \\\n", + " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n", + " --stats_functions 'min' 'max' 'mean'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ADMISSION_LOCATION_value_CLINIC REFERRAL/PREMATUREADMISSION_LOCATION_value_EMERGENCY ROOM ADMITADMISSION_LOCATION_value_PHYS REFERRAL/NORMAL DELIADMISSION_LOCATION_value_TRANSFER FROM HOSP/EXTRAMADMISSION_LOCATION_value_TRANSFER FROM OTHER HEALTADMISSION_LOCATION_value_TRANSFER FROM SKILLED NURADMISSION_TYPE_value_ELECTIVEADMISSION_TYPE_value_EMERGENCYADMISSION_TYPE_value_URGENTAGE_value_(19.737, 50.02]...RELIGION_value_EPISCOPALIANRELIGION_value_GREEK ORTHODOXRELIGION_value_JEHOVAH'S WITNESSRELIGION_value_JEWISHRELIGION_value_MUSLIMRELIGION_value_NOT SPECIFIEDRELIGION_value_OTHERRELIGION_value_PROTESTANT QUAKERRELIGION_value_UNITARIAN-UNIVERSALISTRELIGION_value_UNOBTAINABLE
ID
2000011000000100...0000100000
2000101000000101...0000010000
2000160010001000...0000000000
2000331000000100...0000000001
2000340010001000...0000000001
..................................................................
2011101000000100...0000010000
2011130100000100...0000000001
2011240010001000...0000010000
2011251000000100...0001000000
2011280001000100...0000000000
\n", + "

200 rows × 84 columns

\n", + "
" + ], + "text/plain": [ + " ADMISSION_LOCATION_value_CLINIC REFERRAL/PREMATURE \\\n", + "ID \n", + "200001 1 \n", + "200010 1 \n", + "200016 0 \n", + "200033 1 \n", + "200034 0 \n", + "... ... \n", + "201110 1 \n", + "201113 0 \n", + "201124 0 \n", + "201125 1 \n", + "201128 0 \n", + "\n", + " ADMISSION_LOCATION_value_EMERGENCY ROOM ADMIT \\\n", + "ID \n", + "200001 0 \n", + "200010 0 \n", + "200016 0 \n", + "200033 0 \n", + "200034 0 \n", + "... ... \n", + "201110 0 \n", + "201113 1 \n", + "201124 0 \n", + "201125 0 \n", + "201128 0 \n", + "\n", + " ADMISSION_LOCATION_value_PHYS REFERRAL/NORMAL DELI \\\n", + "ID \n", + "200001 0 \n", + "200010 0 \n", + "200016 1 \n", + "200033 0 \n", + "200034 1 \n", + "... ... \n", + "201110 0 \n", + "201113 0 \n", + "201124 1 \n", + "201125 0 \n", + "201128 0 \n", + "\n", + " ADMISSION_LOCATION_value_TRANSFER FROM HOSP/EXTRAM \\\n", + "ID \n", + "200001 0 \n", + "200010 0 \n", + "200016 0 \n", + "200033 0 \n", + "200034 0 \n", + "... ... \n", + "201110 0 \n", + "201113 0 \n", + "201124 0 \n", + "201125 0 \n", + "201128 1 \n", + "\n", + " ADMISSION_LOCATION_value_TRANSFER FROM OTHER HEALT \\\n", + "ID \n", + "200001 0 \n", + "200010 0 \n", + "200016 0 \n", + "200033 0 \n", + "200034 0 \n", + "... ... \n", + "201110 0 \n", + "201113 0 \n", + "201124 0 \n", + "201125 0 \n", + "201128 0 \n", + "\n", + " ADMISSION_LOCATION_value_TRANSFER FROM SKILLED NUR \\\n", + "ID \n", + "200001 0 \n", + "200010 0 \n", + "200016 0 \n", + "200033 0 \n", + "200034 0 \n", + "... ... \n", + "201110 0 \n", + "201113 0 \n", + "201124 0 \n", + "201125 0 \n", + "201128 0 \n", + "\n", + " ADMISSION_TYPE_value_ELECTIVE ADMISSION_TYPE_value_EMERGENCY \\\n", + "ID \n", + "200001 0 1 \n", + "200010 0 1 \n", + "200016 1 0 \n", + "200033 0 1 \n", + "200034 1 0 \n", + "... ... ... \n", + "201110 0 1 \n", + "201113 0 1 \n", + "201124 1 0 \n", + "201125 0 1 \n", + "201128 0 1 \n", + "\n", + " ADMISSION_TYPE_value_URGENT AGE_value_(19.737, 50.02] ... \\\n", + "ID ... \n", + "200001 0 0 ... \n", + "200010 0 1 ... \n", + "200016 0 0 ... \n", + "200033 0 0 ... \n", + "200034 0 0 ... \n", + "... ... ... ... \n", + "201110 0 0 ... \n", + "201113 0 0 ... \n", + "201124 0 0 ... \n", + "201125 0 0 ... \n", + "201128 0 0 ... \n", + "\n", + " RELIGION_value_EPISCOPALIAN RELIGION_value_GREEK ORTHODOX \\\n", + "ID \n", + "200001 0 0 \n", + "200010 0 0 \n", + "200016 0 0 \n", + "200033 0 0 \n", + "200034 0 0 \n", + "... ... ... \n", + "201110 0 0 \n", + "201113 0 0 \n", + "201124 0 0 \n", + "201125 0 0 \n", + "201128 0 0 \n", + "\n", + " RELIGION_value_JEHOVAH'S WITNESS RELIGION_value_JEWISH \\\n", + "ID \n", + "200001 0 0 \n", + "200010 0 0 \n", + "200016 0 0 \n", + "200033 0 0 \n", + "200034 0 0 \n", + "... ... ... \n", + "201110 0 0 \n", + "201113 0 0 \n", + "201124 0 0 \n", + "201125 0 1 \n", + "201128 0 0 \n", + "\n", + " RELIGION_value_MUSLIM RELIGION_value_NOT SPECIFIED \\\n", + "ID \n", + "200001 1 0 \n", + "200010 0 1 \n", + "200016 0 0 \n", + "200033 0 0 \n", + "200034 0 0 \n", + "... ... ... \n", + "201110 0 1 \n", + "201113 0 0 \n", + "201124 0 1 \n", + "201125 0 0 \n", + "201128 0 0 \n", + "\n", + " RELIGION_value_OTHER RELIGION_value_PROTESTANT QUAKER \\\n", + "ID \n", + "200001 0 0 \n", + "200010 0 0 \n", + "200016 0 0 \n", + "200033 0 0 \n", + "200034 0 0 \n", + "... ... ... 
\n", + "201110 0 0 \n", + "201113 0 0 \n", + "201124 0 0 \n", + "201125 0 0 \n", + "201128 0 0 \n", + "\n", + " RELIGION_value_UNITARIAN-UNIVERSALIST RELIGION_value_UNOBTAINABLE \n", + "ID \n", + "200001 0 0 \n", + "200010 0 0 \n", + "200016 0 0 \n", + "200033 0 1 \n", + "200034 0 1 \n", + "... ... ... \n", + "201110 0 0 \n", + "201113 0 1 \n", + "201124 0 0 \n", + "201125 0 0 \n", + "201128 0 0 \n", + "\n", + "[200 rows x 84 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DiaBP_maskHR_maskRR_maskSpO2_maskSysBP_mask220046_value_(9.999, 120.0]220046_value_(120.0, 200.0]220047_value_(28.999, 50.0]220047_value_(50.0, 55.0]220047_value_(55.0, 60.0]...SysBP_max_(65.999, 105.0]SysBP_max_(105.0, 116.0]SysBP_max_(116.0, 127.0]SysBP_max_(127.0, 141.0]SysBP_max_(141.0, 214.0]SysBP_mean_(65.999, 103.4]SysBP_mean_(103.4, 114.433]SysBP_mean_(114.433, 125.0]SysBP_mean_(125.0, 138.0]SysBP_mean_(138.0, 206.0]
IDt_range
200001[0.0, 1.0)1111110001...0100001000
[1.0, 2.0)1111100000...0100001000
[2.0, 3.0)1111100000...0100000100
[3.0, 4.0)1111100000...1000010000
200010[0.0, 1.0)0000010100...0000000000
.....................................................................
201125[3.0, 4.0)1111101001...0010000100
201128[0.0, 1.0)0000000000...0000000000
[1.0, 2.0)1110110100...0001000010
[2.0, 3.0)1110100000...0010000100
[3.0, 4.0)1110100000...0010000100
\n", + "

800 rows × 3557 columns

\n", + "
" + ], + "text/plain": [ + " DiaBP_mask HR_mask RR_mask SpO2_mask SysBP_mask \\\n", + "ID t_range \n", + "200001 [0.0, 1.0) 1 1 1 1 1 \n", + " [1.0, 2.0) 1 1 1 1 1 \n", + " [2.0, 3.0) 1 1 1 1 1 \n", + " [3.0, 4.0) 1 1 1 1 1 \n", + "200010 [0.0, 1.0) 0 0 0 0 0 \n", + "... ... ... ... ... ... \n", + "201125 [3.0, 4.0) 1 1 1 1 1 \n", + "201128 [0.0, 1.0) 0 0 0 0 0 \n", + " [1.0, 2.0) 1 1 1 0 1 \n", + " [2.0, 3.0) 1 1 1 0 1 \n", + " [3.0, 4.0) 1 1 1 0 1 \n", + "\n", + " 220046_value_(9.999, 120.0] 220046_value_(120.0, 200.0] \\\n", + "ID t_range \n", + "200001 [0.0, 1.0) 1 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "200010 [0.0, 1.0) 1 0 \n", + "... ... ... \n", + "201125 [3.0, 4.0) 0 1 \n", + "201128 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 1 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "\n", + " 220047_value_(28.999, 50.0] 220047_value_(50.0, 55.0] \\\n", + "ID t_range \n", + "200001 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "200010 [0.0, 1.0) 1 0 \n", + "... ... ... \n", + "201125 [3.0, 4.0) 0 0 \n", + "201128 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 1 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "\n", + " 220047_value_(55.0, 60.0] ... SysBP_max_(65.999, 105.0] \\\n", + "ID t_range ... \n", + "200001 [0.0, 1.0) 1 ... 0 \n", + " [1.0, 2.0) 0 ... 0 \n", + " [2.0, 3.0) 0 ... 0 \n", + " [3.0, 4.0) 0 ... 1 \n", + "200010 [0.0, 1.0) 0 ... 0 \n", + "... ... ... ... \n", + "201125 [3.0, 4.0) 1 ... 0 \n", + "201128 [0.0, 1.0) 0 ... 0 \n", + " [1.0, 2.0) 0 ... 0 \n", + " [2.0, 3.0) 0 ... 0 \n", + " [3.0, 4.0) 0 ... 0 \n", + "\n", + " SysBP_max_(105.0, 116.0] SysBP_max_(116.0, 127.0] \\\n", + "ID t_range \n", + "200001 [0.0, 1.0) 1 0 \n", + " [1.0, 2.0) 1 0 \n", + " [2.0, 3.0) 1 0 \n", + " [3.0, 4.0) 0 0 \n", + "200010 [0.0, 1.0) 0 0 \n", + "... ... ... \n", + "201125 [3.0, 4.0) 0 1 \n", + "201128 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 0 1 \n", + " [3.0, 4.0) 0 1 \n", + "\n", + " SysBP_max_(127.0, 141.0] SysBP_max_(141.0, 214.0] \\\n", + "ID t_range \n", + "200001 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "200010 [0.0, 1.0) 0 0 \n", + "... ... ... \n", + "201125 [3.0, 4.0) 0 0 \n", + "201128 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 1 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "\n", + " SysBP_mean_(65.999, 103.4] SysBP_mean_(103.4, 114.433] \\\n", + "ID t_range \n", + "200001 [0.0, 1.0) 0 1 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 1 0 \n", + "200010 [0.0, 1.0) 0 0 \n", + "... ... ... \n", + "201125 [3.0, 4.0) 0 0 \n", + "201128 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "\n", + " SysBP_mean_(114.433, 125.0] SysBP_mean_(125.0, 138.0] \\\n", + "ID t_range \n", + "200001 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 1 0 \n", + " [3.0, 4.0) 0 0 \n", + "200010 [0.0, 1.0) 0 0 \n", + "... ... ... \n", + "201125 [3.0, 4.0) 1 0 \n", + "201128 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 1 0 \n", + " [3.0, 4.0) 1 0 \n", + "\n", + " SysBP_mean_(138.0, 206.0] \n", + "ID t_range \n", + "200001 [0.0, 1.0) 0 \n", + " [1.0, 2.0) 0 \n", + " [2.0, 3.0) 0 \n", + " [3.0, 4.0) 0 \n", + "200010 [0.0, 1.0) 0 \n", + "... ... 
\n", + "201125 [3.0, 4.0) 0 \n", + "201128 [0.0, 1.0) 0 \n", + " [1.0, 2.0) 0 \n", + " [2.0, 3.0) 0 \n", + " [3.0, 4.0) 0 \n", + "\n", + "[800 rows x 3557 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import json\n", + "import sparse\n", + "\n", + "S = sparse.load_npz('output-1-parallel/S_all.npz')\n", + "S_names = json.load(open('output-1-parallel/S_all.feature_names.json', 'r'))\n", + "S_index = pd.read_csv('output-1-parallel/S.ID.csv').set_index(['ID'])\n", + "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n", + "\n", + "X = sparse.load_npz('output-1-parallel/X_all.npz')\n", + "X_names = json.load(open('output-1-parallel/X_all.feature_names.json', 'r'))\n", + "X_index = pd.read_csv('output-1-parallel/X.ID,t_range.csv').set_index(['ID', 't_range'])\n", + "df_X = pd.DataFrame(X.todense().reshape(-1, X.shape[-1]), columns=X_names, index=X_index.index)\n", + "\n", + "display(df_S)\n", + "display(df_X)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 2: discretize = True, use_ordinal_encoding = True" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p output-2-parallel" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input:\n", + " Data : ./input/data.csv\n", + " Population: ./input/pop.csv\n", + " Config : ./input/config-2-parallel.yaml\n", + "\n", + "Output directory: ./output-2-parallel/\n", + "\n", + "Input arguments:\n", + " T = 4.0\n", + " dt = 1.0\n", + " θ₁ = 0.001\n", + " θ₂ = 0.001\n", + " θ_freq = 1.0\n", + " k = 3 ['min', 'max', 'mean']\n", + "\n", + "discretize = yes\n", + " S discretization bins: to be computed from data\n", + " X discretization bins: to be computed from data\n", + "\n", + "N = 200\n", + "L = 4\n", + "\n", + "\n", + "================================================================================\n", + "1) Pre-filter\n", + "================================================================================\n", + "Remove rows not in population\n", + "Remove rows with t outside of [0, 4]\n", + "Remove rare variables (<= 0.001)\n", + "Total variables : 1970\n", + "Rare variables : 0\n", + "Remaining variables : 1970\n", + "# rows (original) : 64777\n", + "# rows (filtered) : 64777\n", + "\n", + "================================================================================\n", + "2) Transform; 3) Post-filter\n", + "================================================================================\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Detecting and parsing value types\n", + "--------------------------------------------------------------------------------\n", + "Saved as: ./output-2-parallel/value_types.csv\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Separate time-invariant and time-dependent\n", + "--------------------------------------------------------------------------------\n", + "Variables (time-invariant): 12\n", + "Variables (time-dependent): 1958\n", + "# rows (time-invariant): 2400\n", + "# rows (time-dependent): 62377\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-A) Transform time-invariant data\n", + 
"--------------------------------------------------------------------------------\n", + "(N × ^d) table :\t (200, 12)\n", + "number of missing entries :\t 4 out of 2400 total\n", + "Time elapsed: 0.018090 seconds\n", + "\n", + "Output\n", + "S_all, binary features :\t (200, 84)\n", + "Time elapsed: 0.180124 seconds\n", + "\n", + "--------------------------------------------------------------------------------\n", + "3-A) Post-filter time-invariant data\n", + "--------------------------------------------------------------------------------\n", + "Original : 84\n", + "Nearly-constant: 2\n", + "Correlated : 7\n", + "Time elapsed: 0.184865 seconds\n", + "\n", + "Output\n", + "S: shape=(200, 75), density=0.176\n", + "Total time: 0.188878 seconds\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-B) Transform time-dependent data\n", + "--------------------------------------------------------------------------------\n", + "Total variables : 1958\n", + "Frequent variables : ['DiaBP', 'HR', 'RR', 'SpO2', 'SysBP']\n", + "M₁ = 5\n", + "M₂ = 1953\n", + "k = 3 ['min', 'max', 'mean']\n", + "\n", + "Transforming each example...\n", + "Batches of size 100: 2\n", + "100%|█████████████████████████████████████████████| 2/2 [00:35<00:00, 17.88s/it]\n", + "\u001b[0m\u001b[0m\u001b[0m\u001b[0m\n", + "Parallel processing done\n", + "DONE: Transforming each example...\n", + "(freq) number of missing entries :\t 996 out of 200×4×5=4000 total\n", + "(freq) number of imputed entries :\t 58\n", + "(freq) number of not imputed entries :\t 938\n", + "(non-freq) number of missing entries :\t 1510389 out of 200×4×1953=1562400 total\n", + "\n", + "(N × L × ^D) table :\t (200, 4, 1983)\n", + "Time elapsed: 37.294821 seconds\n", + "Discretizing features...\n", + "\n", + "Processing 1978 non-boolean variable columns...\n", + " Computing bin edges for numeric variables...\n", + "100%|██████████████████████████████████████| 1978/1978 [00:05<00:00, 377.85it/s]\n", + " Discretizing variables to binary features\n", + "100%|██████████████████████████████████████| 1978/1978 [00:14<00:00, 139.24it/s]\n", + "\u001b[0m\u001b[0m\u001b[0m\u001b[0mFinished discretizing features\n", + "\n", + "Output\n", + "X_all: shape=(200, 4, 3587), density=0.039\n", + "Time elapsed: 58.029910 seconds\n", + "\n", + "--------------------------------------------------------------------------------\n", + "3-B) Post-filter time-dependent data\n", + "--------------------------------------------------------------------------------\n", + "(200, 4, 3587) 0.03878101477557848\n", + "Original : 3587\n", + "Nearly-constant: 3\n", + "*** time: 7.768070220947266\n", + "Correlated : 1194\n", + "*** time: 14.502072095870972\n", + "\n", + "Output\n", + "X: shape=(200, 4, 2390), density=0.048\n", + "(200, 4, 2390) 0.04819874476987448\n", + "Time elapsed: 72.538985 seconds\n", + "\n", + "Output\n", + "X: shape=(200, 4, 2390), density=0.048\n", + "Total time: 72.603644 seconds\n", + "\n", + "\u001b[0m" + ] + } + ], + "source": [ + "! 
PYTHONPATH=\"$PYTHONPATH:../../\" \\\n", + "python -m FIDDLE.run \\\n", + " --data_fname='./input/data.csv' \\\n", + " --population_fname='./input/pop.csv' \\\n", + " --config_fname='./input/config-2-parallel.yaml' \\\n", + " --output_dir='./output-2-parallel/' \\\n", + " --T=4 --dt=1.0 \\\n", + " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n", + " --stats_functions 'min' 'max' 'mean'" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ADMISSION_LOCATION_value_CLINIC REFERRAL/PREMATUREADMISSION_LOCATION_value_EMERGENCY ROOM ADMITADMISSION_LOCATION_value_PHYS REFERRAL/NORMAL DELIADMISSION_LOCATION_value_TRANSFER FROM HOSP/EXTRAMADMISSION_LOCATION_value_TRANSFER FROM OTHER HEALTADMISSION_LOCATION_value_TRANSFER FROM SKILLED NURADMISSION_TYPE_value_ELECTIVEADMISSION_TYPE_value_EMERGENCYADMISSION_TYPE_value_URGENTAGE_value>=19.737885622780315...RELIGION_value_EPISCOPALIANRELIGION_value_GREEK ORTHODOXRELIGION_value_JEHOVAH'S WITNESSRELIGION_value_JEWISHRELIGION_value_MUSLIMRELIGION_value_NOT SPECIFIEDRELIGION_value_OTHERRELIGION_value_PROTESTANT QUAKERRELIGION_value_UNITARIAN-UNIVERSALISTRELIGION_value_UNOBTAINABLE
ID
2000011000000101...0000100000
2000101000000101...0000010000
2000160010001001...0000000000
2000331000000101...0000000001
2000340010001001...0000000001
..................................................................
2011101000000101...0000010000
2011130100000101...0000000001
2011240010001001...0000010000
2011251000000101...0001000000
2011280001000101...0000000000
\n", + "

200 rows × 84 columns

\n", + "
" + ], + "text/plain": [ + " ADMISSION_LOCATION_value_CLINIC REFERRAL/PREMATURE \\\n", + "ID \n", + "200001 1 \n", + "200010 1 \n", + "200016 0 \n", + "200033 1 \n", + "200034 0 \n", + "... ... \n", + "201110 1 \n", + "201113 0 \n", + "201124 0 \n", + "201125 1 \n", + "201128 0 \n", + "\n", + " ADMISSION_LOCATION_value_EMERGENCY ROOM ADMIT \\\n", + "ID \n", + "200001 0 \n", + "200010 0 \n", + "200016 0 \n", + "200033 0 \n", + "200034 0 \n", + "... ... \n", + "201110 0 \n", + "201113 1 \n", + "201124 0 \n", + "201125 0 \n", + "201128 0 \n", + "\n", + " ADMISSION_LOCATION_value_PHYS REFERRAL/NORMAL DELI \\\n", + "ID \n", + "200001 0 \n", + "200010 0 \n", + "200016 1 \n", + "200033 0 \n", + "200034 1 \n", + "... ... \n", + "201110 0 \n", + "201113 0 \n", + "201124 1 \n", + "201125 0 \n", + "201128 0 \n", + "\n", + " ADMISSION_LOCATION_value_TRANSFER FROM HOSP/EXTRAM \\\n", + "ID \n", + "200001 0 \n", + "200010 0 \n", + "200016 0 \n", + "200033 0 \n", + "200034 0 \n", + "... ... \n", + "201110 0 \n", + "201113 0 \n", + "201124 0 \n", + "201125 0 \n", + "201128 1 \n", + "\n", + " ADMISSION_LOCATION_value_TRANSFER FROM OTHER HEALT \\\n", + "ID \n", + "200001 0 \n", + "200010 0 \n", + "200016 0 \n", + "200033 0 \n", + "200034 0 \n", + "... ... \n", + "201110 0 \n", + "201113 0 \n", + "201124 0 \n", + "201125 0 \n", + "201128 0 \n", + "\n", + " ADMISSION_LOCATION_value_TRANSFER FROM SKILLED NUR \\\n", + "ID \n", + "200001 0 \n", + "200010 0 \n", + "200016 0 \n", + "200033 0 \n", + "200034 0 \n", + "... ... \n", + "201110 0 \n", + "201113 0 \n", + "201124 0 \n", + "201125 0 \n", + "201128 0 \n", + "\n", + " ADMISSION_TYPE_value_ELECTIVE ADMISSION_TYPE_value_EMERGENCY \\\n", + "ID \n", + "200001 0 1 \n", + "200010 0 1 \n", + "200016 1 0 \n", + "200033 0 1 \n", + "200034 1 0 \n", + "... ... ... \n", + "201110 0 1 \n", + "201113 0 1 \n", + "201124 1 0 \n", + "201125 0 1 \n", + "201128 0 1 \n", + "\n", + " ADMISSION_TYPE_value_URGENT AGE_value>=19.737885622780315 ... \\\n", + "ID ... \n", + "200001 0 1 ... \n", + "200010 0 1 ... \n", + "200016 0 1 ... \n", + "200033 0 1 ... \n", + "200034 0 1 ... \n", + "... ... ... ... \n", + "201110 0 1 ... \n", + "201113 0 1 ... \n", + "201124 0 1 ... \n", + "201125 0 1 ... \n", + "201128 0 1 ... \n", + "\n", + " RELIGION_value_EPISCOPALIAN RELIGION_value_GREEK ORTHODOX \\\n", + "ID \n", + "200001 0 0 \n", + "200010 0 0 \n", + "200016 0 0 \n", + "200033 0 0 \n", + "200034 0 0 \n", + "... ... ... \n", + "201110 0 0 \n", + "201113 0 0 \n", + "201124 0 0 \n", + "201125 0 0 \n", + "201128 0 0 \n", + "\n", + " RELIGION_value_JEHOVAH'S WITNESS RELIGION_value_JEWISH \\\n", + "ID \n", + "200001 0 0 \n", + "200010 0 0 \n", + "200016 0 0 \n", + "200033 0 0 \n", + "200034 0 0 \n", + "... ... ... \n", + "201110 0 0 \n", + "201113 0 0 \n", + "201124 0 0 \n", + "201125 0 1 \n", + "201128 0 0 \n", + "\n", + " RELIGION_value_MUSLIM RELIGION_value_NOT SPECIFIED \\\n", + "ID \n", + "200001 1 0 \n", + "200010 0 1 \n", + "200016 0 0 \n", + "200033 0 0 \n", + "200034 0 0 \n", + "... ... ... \n", + "201110 0 1 \n", + "201113 0 0 \n", + "201124 0 1 \n", + "201125 0 0 \n", + "201128 0 0 \n", + "\n", + " RELIGION_value_OTHER RELIGION_value_PROTESTANT QUAKER \\\n", + "ID \n", + "200001 0 0 \n", + "200010 0 0 \n", + "200016 0 0 \n", + "200033 0 0 \n", + "200034 0 0 \n", + "... ... ... 
\n", + "201110 0 0 \n", + "201113 0 0 \n", + "201124 0 0 \n", + "201125 0 0 \n", + "201128 0 0 \n", + "\n", + " RELIGION_value_UNITARIAN-UNIVERSALIST RELIGION_value_UNOBTAINABLE \n", + "ID \n", + "200001 0 0 \n", + "200010 0 0 \n", + "200016 0 0 \n", + "200033 0 1 \n", + "200034 0 1 \n", + "... ... ... \n", + "201110 0 0 \n", + "201113 0 1 \n", + "201124 0 0 \n", + "201125 0 0 \n", + "201128 0 0 \n", + "\n", + "[200 rows x 84 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DiaBP_maskHR_maskRR_maskSpO2_maskSysBP_mask220048_value_1220048: 1st AV (First degree AV Block) _value_1220048: 3rd AV (Complete Heart Block) _value_1220048: A Flut (Atrial Flutter) _value_1220048: AF (Atrial Fibrillation)_value_1...SysBP_mean>=66.0SysBP_mean>=103.4SysBP_mean>=114.43333333333334SysBP_mean>=125.0SysBP_mean>=138.0SysBP_max>=66.0SysBP_max>=105.0SysBP_max>=116.0SysBP_max>=127.0SysBP_max>=141.0
IDt_range
200001[0.0, 1.0)1111110000...1100011000
[1.0, 2.0)1111110000...1100011000
[2.0, 3.0)1111110000...1110011100
[3.0, 4.0)1111110000...1000010000
200010[0.0, 1.0)0000000000...0000000000
.....................................................................
201125[3.0, 4.0)1111110000...1110011100
201128[0.0, 1.0)0000000000...0000000000
[1.0, 2.0)1110110000...1111011110
[2.0, 3.0)1110110000...1110011100
[3.0, 4.0)1110110000...1110011100
\n", + "

800 rows × 3587 columns

\n", + "
" + ], + "text/plain": [ + " DiaBP_mask HR_mask RR_mask SpO2_mask SysBP_mask \\\n", + "ID t_range \n", + "200001 [0.0, 1.0) 1 1 1 1 1 \n", + " [1.0, 2.0) 1 1 1 1 1 \n", + " [2.0, 3.0) 1 1 1 1 1 \n", + " [3.0, 4.0) 1 1 1 1 1 \n", + "200010 [0.0, 1.0) 0 0 0 0 0 \n", + "... ... ... ... ... ... \n", + "201125 [3.0, 4.0) 1 1 1 1 1 \n", + "201128 [0.0, 1.0) 0 0 0 0 0 \n", + " [1.0, 2.0) 1 1 1 0 1 \n", + " [2.0, 3.0) 1 1 1 0 1 \n", + " [3.0, 4.0) 1 1 1 0 1 \n", + "\n", + " 220048_value_1 \\\n", + "ID t_range \n", + "200001 [0.0, 1.0) 1 \n", + " [1.0, 2.0) 1 \n", + " [2.0, 3.0) 1 \n", + " [3.0, 4.0) 1 \n", + "200010 [0.0, 1.0) 0 \n", + "... ... \n", + "201125 [3.0, 4.0) 1 \n", + "201128 [0.0, 1.0) 0 \n", + " [1.0, 2.0) 1 \n", + " [2.0, 3.0) 1 \n", + " [3.0, 4.0) 1 \n", + "\n", + " 220048: 1st AV (First degree AV Block) _value_1 \\\n", + "ID t_range \n", + "200001 [0.0, 1.0) 0 \n", + " [1.0, 2.0) 0 \n", + " [2.0, 3.0) 0 \n", + " [3.0, 4.0) 0 \n", + "200010 [0.0, 1.0) 0 \n", + "... ... \n", + "201125 [3.0, 4.0) 0 \n", + "201128 [0.0, 1.0) 0 \n", + " [1.0, 2.0) 0 \n", + " [2.0, 3.0) 0 \n", + " [3.0, 4.0) 0 \n", + "\n", + " 220048: 3rd AV (Complete Heart Block) _value_1 \\\n", + "ID t_range \n", + "200001 [0.0, 1.0) 0 \n", + " [1.0, 2.0) 0 \n", + " [2.0, 3.0) 0 \n", + " [3.0, 4.0) 0 \n", + "200010 [0.0, 1.0) 0 \n", + "... ... \n", + "201125 [3.0, 4.0) 0 \n", + "201128 [0.0, 1.0) 0 \n", + " [1.0, 2.0) 0 \n", + " [2.0, 3.0) 0 \n", + " [3.0, 4.0) 0 \n", + "\n", + " 220048: A Flut (Atrial Flutter) _value_1 \\\n", + "ID t_range \n", + "200001 [0.0, 1.0) 0 \n", + " [1.0, 2.0) 0 \n", + " [2.0, 3.0) 0 \n", + " [3.0, 4.0) 0 \n", + "200010 [0.0, 1.0) 0 \n", + "... ... \n", + "201125 [3.0, 4.0) 0 \n", + "201128 [0.0, 1.0) 0 \n", + " [1.0, 2.0) 0 \n", + " [2.0, 3.0) 0 \n", + " [3.0, 4.0) 0 \n", + "\n", + " 220048: AF (Atrial Fibrillation)_value_1 ... \\\n", + "ID t_range ... \n", + "200001 [0.0, 1.0) 0 ... \n", + " [1.0, 2.0) 0 ... \n", + " [2.0, 3.0) 0 ... \n", + " [3.0, 4.0) 0 ... \n", + "200010 [0.0, 1.0) 0 ... \n", + "... ... ... \n", + "201125 [3.0, 4.0) 0 ... \n", + "201128 [0.0, 1.0) 0 ... \n", + " [1.0, 2.0) 0 ... \n", + " [2.0, 3.0) 0 ... \n", + " [3.0, 4.0) 0 ... \n", + "\n", + " SysBP_mean>=66.0 SysBP_mean>=103.4 \\\n", + "ID t_range \n", + "200001 [0.0, 1.0) 1 1 \n", + " [1.0, 2.0) 1 1 \n", + " [2.0, 3.0) 1 1 \n", + " [3.0, 4.0) 1 0 \n", + "200010 [0.0, 1.0) 0 0 \n", + "... ... ... \n", + "201125 [3.0, 4.0) 1 1 \n", + "201128 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 1 1 \n", + " [2.0, 3.0) 1 1 \n", + " [3.0, 4.0) 1 1 \n", + "\n", + " SysBP_mean>=114.43333333333334 SysBP_mean>=125.0 \\\n", + "ID t_range \n", + "200001 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 1 0 \n", + " [3.0, 4.0) 0 0 \n", + "200010 [0.0, 1.0) 0 0 \n", + "... ... ... \n", + "201125 [3.0, 4.0) 1 0 \n", + "201128 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 1 1 \n", + " [2.0, 3.0) 1 0 \n", + " [3.0, 4.0) 1 0 \n", + "\n", + " SysBP_mean>=138.0 SysBP_max>=66.0 SysBP_max>=105.0 \\\n", + "ID t_range \n", + "200001 [0.0, 1.0) 0 1 1 \n", + " [1.0, 2.0) 0 1 1 \n", + " [2.0, 3.0) 0 1 1 \n", + " [3.0, 4.0) 0 1 0 \n", + "200010 [0.0, 1.0) 0 0 0 \n", + "... ... ... ... \n", + "201125 [3.0, 4.0) 0 1 1 \n", + "201128 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 1 1 \n", + " [2.0, 3.0) 0 1 1 \n", + " [3.0, 4.0) 0 1 1 \n", + "\n", + " SysBP_max>=116.0 SysBP_max>=127.0 SysBP_max>=141.0 \n", + "ID t_range \n", + "200001 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 1 0 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "200010 [0.0, 1.0) 0 0 0 \n", + "... 
... ... ... \n", + "201125 [3.0, 4.0) 1 0 0 \n", + "201128 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 1 1 0 \n", + " [2.0, 3.0) 1 0 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "\n", + "[800 rows x 3587 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import json\n", + "import sparse\n", + "\n", + "S = sparse.load_npz('output-2-parallel/S_all.npz')\n", + "S_names = json.load(open('output-2-parallel/S_all.feature_names.json', 'r'))\n", + "S_index = pd.read_csv('output-2-parallel/S.ID.csv').set_index(['ID'])\n", + "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n", + "\n", + "X = sparse.load_npz('output-2-parallel/X_all.npz')\n", + "X_names = json.load(open('output-2-parallel/X_all.feature_names.json', 'r'))\n", + "X_index = pd.read_csv('output-2-parallel/X.ID,t_range.csv').set_index(['ID', 't_range'])\n", + "df_X = pd.DataFrame(X.todense().reshape(-1, X.shape[-1]), columns=X_names, index=X_index.index)\n", + "\n", + "display(df_S)\n", + "display(df_X)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 3: discretize = False" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p output-3-parallel" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input:\n", + " Data : ./input/data.csv\n", + " Population: ./input/pop.csv\n", + " Config : ./input/config-3-parallel.yaml\n", + "\n", + "Output directory: ./output-3-parallel/\n", + "\n", + "Input arguments:\n", + " T = 4.0\n", + " dt = 1.0\n", + " θ₁ = 0.001\n", + " θ₂ = 0.001\n", + " θ_freq = 1.0\n", + " k = 3 ['min', 'max', 'mean']\n", + "\n", + "discretize = no\n", + "\n", + "N = 200\n", + "L = 4\n", + "\n", + "\n", + "================================================================================\n", + "1) Pre-filter\n", + "================================================================================\n", + "Remove rows not in population\n", + "Remove rows with t outside of [0, 4]\n", + "Remove rare variables (<= 0.001)\n", + "Total variables : 1970\n", + "Rare variables : 0\n", + "Remaining variables : 1970\n", + "# rows (original) : 64777\n", + "# rows (filtered) : 64777\n", + "\n", + "================================================================================\n", + "2) Transform; 3) Post-filter\n", + "================================================================================\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Detecting and parsing value types\n", + "--------------------------------------------------------------------------------\n", + "Saved as: ./output-3-parallel/value_types.csv\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Separate time-invariant and time-dependent\n", + "--------------------------------------------------------------------------------\n", + "Variables (time-invariant): 12\n", + "Variables (time-dependent): 1958\n", + "# rows (time-invariant): 2400\n", + "# rows (time-dependent): 62377\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-A) Transform time-invariant data\n", + "--------------------------------------------------------------------------------\n", + "(N × ^d) table :\t (200, 12)\n", + "number of missing 
entries :\t 4 out of 2400 total\n", + "Time elapsed: 0.018502 seconds\n", + "\n", + "Output\n", + "S_all, binary features :\t (200, 76)\n", + "Time elapsed: 0.116800 seconds\n", + "\n", + "--------------------------------------------------------------------------------\n", + "3-A) Post-filter time-invariant data\n", + "--------------------------------------------------------------------------------\n", + "Original : 76\n", + "Nearly-constant: 0\n", + "Correlated : 7\n", + "Time elapsed: 0.121063 seconds\n", + "\n", + "Output\n", + "S: shape=(200, 69), density=0.162\n", + "Total time: 0.125685 seconds\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-B) Transform time-dependent data\n", + "--------------------------------------------------------------------------------\n", + "Total variables : 1958\n", + "Frequent variables : ['DiaBP', 'HR', 'RR', 'SpO2', 'SysBP']\n", + "M₁ = 5\n", + "M₂ = 1953\n", + "k = 3 ['min', 'max', 'mean']\n", + "\n", + "Transforming each example...\n", + "Batches of size 100: 2\n", + "100%|█████████████████████████████████████████████| 2/2 [00:30<00:00, 15.46s/it]\n", + "\u001b[0m\u001b[0m\u001b[0m\u001b[0m\n", + "Parallel processing done\n", + "DONE: Transforming each example...\n", + "(freq) number of missing entries :\t 996 out of 200×4×5=4000 total\n", + "(freq) number of imputed entries :\t 58\n", + "(freq) number of not imputed entries :\t 938\n", + "(non-freq) number of missing entries :\t 1510389 out of 200×4×1953=1562400 total\n", + "\n", + "(N × L × ^D) table :\t (200, 4, 1983)\n", + "Time elapsed: 32.386383 seconds\n", + "Discretizing features...\n", + "\n", + "Discretizing categorical features...\n", + "100%|██████████████████████████████████████| 1990/1990 [00:10<00:00, 190.17it/s]\n", + "\u001b[0m\u001b[0m\u001b[0m\u001b[0mFinished discretizing features\n", + "\n", + "Output\n", + "X_all: shape=(200, 4, 2588), density=0.582\n", + "Time elapsed: 46.796057 seconds\n", + "\n", + "--------------------------------------------------------------------------------\n", + "3-B) Post-filter time-dependent data\n", + "--------------------------------------------------------------------------------\n", + "(200, 4, 2588) 0.5818387751159196\n", + "Original : 2588\n", + "Nearly-constant: 1064\n", + "*** time: 10.0768461227417\n", + "/Users/shengputang/Developer/FIDDLE/FIDDLE/helpers.py:426: RuntimeWarning: invalid value encountered in sqrt\n", + " coeffs = C / np.sqrt(np.outer(d, d))\n", + "/Users/shengputang/Developer/FIDDLE/FIDDLE/helpers.py:426: RuntimeWarning: divide by zero encountered in true_divide\n", + " coeffs = C / np.sqrt(np.outer(d, d))\n", + "/Users/shengputang/Developer/FIDDLE/FIDDLE/helpers.py:426: RuntimeWarning: invalid value encountered in true_divide\n", + " coeffs = C / np.sqrt(np.outer(d, d))\n", + "/Users/shengputang/Developer/FIDDLE/FIDDLE/helpers.py:376: RuntimeWarning: invalid value encountered in multiply\n", + " self.corr_matrix *= np.tri(*self.corr_matrix.shape)\n", + "Correlated : 310\n", + "*** time: 16.394930124282837\n", + "\n", + "Output\n", + "X: shape=(200, 4, 1214), density=0.366\n", + "(200, 4, 1214) 0.366085255354201\n", + "Time elapsed: 63.195710 seconds\n", + "\n", + "Output\n", + "X: shape=(200, 4, 1214), density=0.366\n", + "Total time: 63.452329 seconds\n", + "\n", + "\u001b[0m" + ] + } + ], + "source": [ + "! 
PYTHONPATH=\"$PYTHONPATH:../../\" \\\n", + "python -m FIDDLE.run \\\n", + " --data_fname='./input/data.csv' \\\n", + " --population_fname='./input/pop.csv' \\\n", + " --config_fname='./input/config-3-parallel.yaml' \\\n", + " --output_dir='./output-3-parallel/' \\\n", + " --T=4 --dt=1.0 \\\n", + " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n", + " --stats_functions 'min' 'max' 'mean'" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ADMISSION_LOCATION_value:CLINIC REFERRAL/PREMATUREADMISSION_LOCATION_value:EMERGENCY ROOM ADMITADMISSION_LOCATION_value:PHYS REFERRAL/NORMAL DELIADMISSION_LOCATION_value:TRANSFER FROM HOSP/EXTRAMADMISSION_LOCATION_value:TRANSFER FROM OTHER HEALTADMISSION_LOCATION_value:TRANSFER FROM SKILLED NURADMISSION_TYPE_value:ELECTIVEADMISSION_TYPE_value:EMERGENCYADMISSION_TYPE_value:URGENTAGE_value...RELIGION_value:EPISCOPALIANRELIGION_value:GREEK ORTHODOXRELIGION_value:JEHOVAH'S WITNESSRELIGION_value:JEWISHRELIGION_value:MUSLIMRELIGION_value:NOT SPECIFIEDRELIGION_value:OTHERRELIGION_value:PROTESTANT QUAKERRELIGION_value:UNITARIAN-UNIVERSALISTRELIGION_value:UNOBTAINABLE
ID
2000011.00.00.00.00.00.00.01.00.061.111770...0.00.00.00.01.00.00.00.00.00.0
2000101.00.00.00.00.00.00.01.00.027.271125...0.00.00.00.00.01.00.00.00.00.0
2000160.00.01.00.00.00.01.00.00.067.281277...0.00.00.00.00.00.00.00.00.00.0
2000331.00.00.00.00.00.00.01.00.067.191089...0.00.00.00.00.00.00.00.00.01.0
2000340.00.01.00.00.00.01.00.00.054.077903...0.00.00.00.00.00.00.00.00.01.0
..................................................................
2011101.00.00.00.00.00.00.01.00.054.746702...0.00.00.00.00.01.00.00.00.00.0
2011130.01.00.00.00.00.00.01.00.050.456861...0.00.00.00.00.00.00.00.00.01.0
2011240.00.01.00.00.00.01.00.00.070.488207...0.00.00.00.00.01.00.00.00.00.0
2011251.00.00.00.00.00.00.01.00.079.555041...0.00.00.01.00.00.00.00.00.00.0
2011280.00.00.01.00.00.00.01.00.074.814954...0.00.00.00.00.00.00.00.00.00.0
\n", + "

200 rows × 76 columns

\n", + "
" + ], + "text/plain": [ + " ADMISSION_LOCATION_value:CLINIC REFERRAL/PREMATURE \\\n", + "ID \n", + "200001 1.0 \n", + "200010 1.0 \n", + "200016 0.0 \n", + "200033 1.0 \n", + "200034 0.0 \n", + "... ... \n", + "201110 1.0 \n", + "201113 0.0 \n", + "201124 0.0 \n", + "201125 1.0 \n", + "201128 0.0 \n", + "\n", + " ADMISSION_LOCATION_value:EMERGENCY ROOM ADMIT \\\n", + "ID \n", + "200001 0.0 \n", + "200010 0.0 \n", + "200016 0.0 \n", + "200033 0.0 \n", + "200034 0.0 \n", + "... ... \n", + "201110 0.0 \n", + "201113 1.0 \n", + "201124 0.0 \n", + "201125 0.0 \n", + "201128 0.0 \n", + "\n", + " ADMISSION_LOCATION_value:PHYS REFERRAL/NORMAL DELI \\\n", + "ID \n", + "200001 0.0 \n", + "200010 0.0 \n", + "200016 1.0 \n", + "200033 0.0 \n", + "200034 1.0 \n", + "... ... \n", + "201110 0.0 \n", + "201113 0.0 \n", + "201124 1.0 \n", + "201125 0.0 \n", + "201128 0.0 \n", + "\n", + " ADMISSION_LOCATION_value:TRANSFER FROM HOSP/EXTRAM \\\n", + "ID \n", + "200001 0.0 \n", + "200010 0.0 \n", + "200016 0.0 \n", + "200033 0.0 \n", + "200034 0.0 \n", + "... ... \n", + "201110 0.0 \n", + "201113 0.0 \n", + "201124 0.0 \n", + "201125 0.0 \n", + "201128 1.0 \n", + "\n", + " ADMISSION_LOCATION_value:TRANSFER FROM OTHER HEALT \\\n", + "ID \n", + "200001 0.0 \n", + "200010 0.0 \n", + "200016 0.0 \n", + "200033 0.0 \n", + "200034 0.0 \n", + "... ... \n", + "201110 0.0 \n", + "201113 0.0 \n", + "201124 0.0 \n", + "201125 0.0 \n", + "201128 0.0 \n", + "\n", + " ADMISSION_LOCATION_value:TRANSFER FROM SKILLED NUR \\\n", + "ID \n", + "200001 0.0 \n", + "200010 0.0 \n", + "200016 0.0 \n", + "200033 0.0 \n", + "200034 0.0 \n", + "... ... \n", + "201110 0.0 \n", + "201113 0.0 \n", + "201124 0.0 \n", + "201125 0.0 \n", + "201128 0.0 \n", + "\n", + " ADMISSION_TYPE_value:ELECTIVE ADMISSION_TYPE_value:EMERGENCY \\\n", + "ID \n", + "200001 0.0 1.0 \n", + "200010 0.0 1.0 \n", + "200016 1.0 0.0 \n", + "200033 0.0 1.0 \n", + "200034 1.0 0.0 \n", + "... ... ... \n", + "201110 0.0 1.0 \n", + "201113 0.0 1.0 \n", + "201124 1.0 0.0 \n", + "201125 0.0 1.0 \n", + "201128 0.0 1.0 \n", + "\n", + " ADMISSION_TYPE_value:URGENT AGE_value ... \\\n", + "ID ... \n", + "200001 0.0 61.111770 ... \n", + "200010 0.0 27.271125 ... \n", + "200016 0.0 67.281277 ... \n", + "200033 0.0 67.191089 ... \n", + "200034 0.0 54.077903 ... \n", + "... ... ... ... \n", + "201110 0.0 54.746702 ... \n", + "201113 0.0 50.456861 ... \n", + "201124 0.0 70.488207 ... \n", + "201125 0.0 79.555041 ... \n", + "201128 0.0 74.814954 ... \n", + "\n", + " RELIGION_value:EPISCOPALIAN RELIGION_value:GREEK ORTHODOX \\\n", + "ID \n", + "200001 0.0 0.0 \n", + "200010 0.0 0.0 \n", + "200016 0.0 0.0 \n", + "200033 0.0 0.0 \n", + "200034 0.0 0.0 \n", + "... ... ... \n", + "201110 0.0 0.0 \n", + "201113 0.0 0.0 \n", + "201124 0.0 0.0 \n", + "201125 0.0 0.0 \n", + "201128 0.0 0.0 \n", + "\n", + " RELIGION_value:JEHOVAH'S WITNESS RELIGION_value:JEWISH \\\n", + "ID \n", + "200001 0.0 0.0 \n", + "200010 0.0 0.0 \n", + "200016 0.0 0.0 \n", + "200033 0.0 0.0 \n", + "200034 0.0 0.0 \n", + "... ... ... \n", + "201110 0.0 0.0 \n", + "201113 0.0 0.0 \n", + "201124 0.0 0.0 \n", + "201125 0.0 1.0 \n", + "201128 0.0 0.0 \n", + "\n", + " RELIGION_value:MUSLIM RELIGION_value:NOT SPECIFIED \\\n", + "ID \n", + "200001 1.0 0.0 \n", + "200010 0.0 1.0 \n", + "200016 0.0 0.0 \n", + "200033 0.0 0.0 \n", + "200034 0.0 0.0 \n", + "... ... ... 
\n", + "201110 0.0 1.0 \n", + "201113 0.0 0.0 \n", + "201124 0.0 1.0 \n", + "201125 0.0 0.0 \n", + "201128 0.0 0.0 \n", + "\n", + " RELIGION_value:OTHER RELIGION_value:PROTESTANT QUAKER \\\n", + "ID \n", + "200001 0.0 0.0 \n", + "200010 0.0 0.0 \n", + "200016 0.0 0.0 \n", + "200033 0.0 0.0 \n", + "200034 0.0 0.0 \n", + "... ... ... \n", + "201110 0.0 0.0 \n", + "201113 0.0 0.0 \n", + "201124 0.0 0.0 \n", + "201125 0.0 0.0 \n", + "201128 0.0 0.0 \n", + "\n", + " RELIGION_value:UNITARIAN-UNIVERSALIST RELIGION_value:UNOBTAINABLE \n", + "ID \n", + "200001 0.0 0.0 \n", + "200010 0.0 0.0 \n", + "200016 0.0 0.0 \n", + "200033 0.0 1.0 \n", + "200034 0.0 1.0 \n", + "... ... ... \n", + "201110 0.0 0.0 \n", + "201113 0.0 1.0 \n", + "201124 0.0 0.0 \n", + "201125 0.0 0.0 \n", + "201128 0.0 0.0 \n", + "\n", + "[200 rows x 76 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DiaBP_maskHR_maskRR_maskSpO2_maskSysBP_mask220046_value220047_value220048_value220048: 1st AV (First degree AV Block) _value220048: 3rd AV (Complete Heart Block) _value...51492_value_str:NEG51492_value_str:TR51493_value_str:0-251493_value_str:>5051498_value_str:>=1.03551514_value_str:NEG51516_value_str:0-251516_value_str:3-551516_value_str:6-1051516_value_str:>50
IDt_range
200001[0.0, 1.0)1.01.01.01.01.0120.00000060.0000001.01.01.0...0.00.00.00.00.00.00.00.00.00.0
[1.0, 2.0)1.01.01.01.01.0120.94059453.9359611.01.01.0...0.00.00.00.00.00.00.00.00.00.0
[2.0, 3.0)1.01.01.01.01.0120.94059453.9359611.01.01.0...0.00.00.00.00.00.00.00.00.00.0
[3.0, 4.0)1.01.01.01.01.0120.94059453.9359611.01.01.0...0.00.00.00.00.00.00.00.00.00.0
200010[0.0, 1.0)0.00.00.00.00.0120.00000050.0000001.01.01.0...0.00.00.00.00.00.00.00.00.00.0
.....................................................................
201125[3.0, 4.0)1.01.01.01.01.0130.00000060.0000001.01.01.0...0.00.01.00.00.01.00.01.00.00.0
201128[0.0, 1.0)0.00.00.00.00.0120.94059453.9359611.01.01.0...0.00.00.00.00.00.00.00.00.00.0
[1.0, 2.0)1.01.01.00.01.0120.00000050.0000001.01.01.0...0.00.00.00.00.00.00.00.00.00.0
[2.0, 3.0)1.01.01.00.01.0120.94059453.9359611.01.01.0...0.00.00.00.00.00.00.00.00.00.0
[3.0, 4.0)1.01.01.00.01.0120.94059453.9359611.01.01.0...0.00.00.00.00.00.00.00.00.00.0
\n", + "

800 rows × 2588 columns

\n", + "
" + ], + "text/plain": [ + " DiaBP_mask HR_mask RR_mask SpO2_mask SysBP_mask \\\n", + "ID t_range \n", + "200001 [0.0, 1.0) 1.0 1.0 1.0 1.0 1.0 \n", + " [1.0, 2.0) 1.0 1.0 1.0 1.0 1.0 \n", + " [2.0, 3.0) 1.0 1.0 1.0 1.0 1.0 \n", + " [3.0, 4.0) 1.0 1.0 1.0 1.0 1.0 \n", + "200010 [0.0, 1.0) 0.0 0.0 0.0 0.0 0.0 \n", + "... ... ... ... ... ... \n", + "201125 [3.0, 4.0) 1.0 1.0 1.0 1.0 1.0 \n", + "201128 [0.0, 1.0) 0.0 0.0 0.0 0.0 0.0 \n", + " [1.0, 2.0) 1.0 1.0 1.0 0.0 1.0 \n", + " [2.0, 3.0) 1.0 1.0 1.0 0.0 1.0 \n", + " [3.0, 4.0) 1.0 1.0 1.0 0.0 1.0 \n", + "\n", + " 220046_value 220047_value 220048_value \\\n", + "ID t_range \n", + "200001 [0.0, 1.0) 120.000000 60.000000 1.0 \n", + " [1.0, 2.0) 120.940594 53.935961 1.0 \n", + " [2.0, 3.0) 120.940594 53.935961 1.0 \n", + " [3.0, 4.0) 120.940594 53.935961 1.0 \n", + "200010 [0.0, 1.0) 120.000000 50.000000 1.0 \n", + "... ... ... ... \n", + "201125 [3.0, 4.0) 130.000000 60.000000 1.0 \n", + "201128 [0.0, 1.0) 120.940594 53.935961 1.0 \n", + " [1.0, 2.0) 120.000000 50.000000 1.0 \n", + " [2.0, 3.0) 120.940594 53.935961 1.0 \n", + " [3.0, 4.0) 120.940594 53.935961 1.0 \n", + "\n", + " 220048: 1st AV (First degree AV Block) _value \\\n", + "ID t_range \n", + "200001 [0.0, 1.0) 1.0 \n", + " [1.0, 2.0) 1.0 \n", + " [2.0, 3.0) 1.0 \n", + " [3.0, 4.0) 1.0 \n", + "200010 [0.0, 1.0) 1.0 \n", + "... ... \n", + "201125 [3.0, 4.0) 1.0 \n", + "201128 [0.0, 1.0) 1.0 \n", + " [1.0, 2.0) 1.0 \n", + " [2.0, 3.0) 1.0 \n", + " [3.0, 4.0) 1.0 \n", + "\n", + " 220048: 3rd AV (Complete Heart Block) _value ... \\\n", + "ID t_range ... \n", + "200001 [0.0, 1.0) 1.0 ... \n", + " [1.0, 2.0) 1.0 ... \n", + " [2.0, 3.0) 1.0 ... \n", + " [3.0, 4.0) 1.0 ... \n", + "200010 [0.0, 1.0) 1.0 ... \n", + "... ... ... \n", + "201125 [3.0, 4.0) 1.0 ... \n", + "201128 [0.0, 1.0) 1.0 ... \n", + " [1.0, 2.0) 1.0 ... \n", + " [2.0, 3.0) 1.0 ... \n", + " [3.0, 4.0) 1.0 ... \n", + "\n", + " 51492_value_str:NEG 51492_value_str:TR \\\n", + "ID t_range \n", + "200001 [0.0, 1.0) 0.0 0.0 \n", + " [1.0, 2.0) 0.0 0.0 \n", + " [2.0, 3.0) 0.0 0.0 \n", + " [3.0, 4.0) 0.0 0.0 \n", + "200010 [0.0, 1.0) 0.0 0.0 \n", + "... ... ... \n", + "201125 [3.0, 4.0) 0.0 0.0 \n", + "201128 [0.0, 1.0) 0.0 0.0 \n", + " [1.0, 2.0) 0.0 0.0 \n", + " [2.0, 3.0) 0.0 0.0 \n", + " [3.0, 4.0) 0.0 0.0 \n", + "\n", + " 51493_value_str:0-2 51493_value_str:>50 \\\n", + "ID t_range \n", + "200001 [0.0, 1.0) 0.0 0.0 \n", + " [1.0, 2.0) 0.0 0.0 \n", + " [2.0, 3.0) 0.0 0.0 \n", + " [3.0, 4.0) 0.0 0.0 \n", + "200010 [0.0, 1.0) 0.0 0.0 \n", + "... ... ... \n", + "201125 [3.0, 4.0) 1.0 0.0 \n", + "201128 [0.0, 1.0) 0.0 0.0 \n", + " [1.0, 2.0) 0.0 0.0 \n", + " [2.0, 3.0) 0.0 0.0 \n", + " [3.0, 4.0) 0.0 0.0 \n", + "\n", + " 51498_value_str:>=1.035 51514_value_str:NEG \\\n", + "ID t_range \n", + "200001 [0.0, 1.0) 0.0 0.0 \n", + " [1.0, 2.0) 0.0 0.0 \n", + " [2.0, 3.0) 0.0 0.0 \n", + " [3.0, 4.0) 0.0 0.0 \n", + "200010 [0.0, 1.0) 0.0 0.0 \n", + "... ... ... \n", + "201125 [3.0, 4.0) 0.0 1.0 \n", + "201128 [0.0, 1.0) 0.0 0.0 \n", + " [1.0, 2.0) 0.0 0.0 \n", + " [2.0, 3.0) 0.0 0.0 \n", + " [3.0, 4.0) 0.0 0.0 \n", + "\n", + " 51516_value_str:0-2 51516_value_str:3-5 \\\n", + "ID t_range \n", + "200001 [0.0, 1.0) 0.0 0.0 \n", + " [1.0, 2.0) 0.0 0.0 \n", + " [2.0, 3.0) 0.0 0.0 \n", + " [3.0, 4.0) 0.0 0.0 \n", + "200010 [0.0, 1.0) 0.0 0.0 \n", + "... ... ... 
\n", + "201125 [3.0, 4.0) 0.0 1.0 \n", + "201128 [0.0, 1.0) 0.0 0.0 \n", + " [1.0, 2.0) 0.0 0.0 \n", + " [2.0, 3.0) 0.0 0.0 \n", + " [3.0, 4.0) 0.0 0.0 \n", + "\n", + " 51516_value_str:6-10 51516_value_str:>50 \n", + "ID t_range \n", + "200001 [0.0, 1.0) 0.0 0.0 \n", + " [1.0, 2.0) 0.0 0.0 \n", + " [2.0, 3.0) 0.0 0.0 \n", + " [3.0, 4.0) 0.0 0.0 \n", + "200010 [0.0, 1.0) 0.0 0.0 \n", + "... ... ... \n", + "201125 [3.0, 4.0) 0.0 0.0 \n", + "201128 [0.0, 1.0) 0.0 0.0 \n", + " [1.0, 2.0) 0.0 0.0 \n", + " [2.0, 3.0) 0.0 0.0 \n", + " [3.0, 4.0) 0.0 0.0 \n", + "\n", + "[800 rows x 2588 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import json\n", + "import sparse\n", + "\n", + "S = sparse.load_npz('output-3-parallel/S_all.npz')\n", + "S_names = json.load(open('output-3-parallel/S_all.feature_names.json', 'r'))\n", + "S_index = pd.read_csv('output-3-parallel/S.ID.csv').set_index(['ID'])\n", + "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n", + "\n", + "X = sparse.load_npz('output-3-parallel/X_all.npz')\n", + "X_names = json.load(open('output-3-parallel/X_all.feature_names.json', 'r'))\n", + "X_index = pd.read_csv('output-3-parallel/X.ID,t_range.csv').set_index(['ID', 't_range'])\n", + "df_X = pd.DataFrame(X.todense().reshape(-1, X.shape[-1]), columns=X_names, index=X_index.index)\n", + "\n", + "display(df_S)\n", + "display(df_X)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tests/large_test/input/config-1-parallel.yaml b/tests/large_test/input/config-1-parallel.yaml new file mode 100644 index 0000000000000000000000000000000000000000..04de805b0f77f01e6c290e4459f329cbe7b9a10a --- /dev/null +++ b/tests/large_test/input/config-1-parallel.yaml @@ -0,0 +1,5 @@ +discretize: yes +use_ordinal_encoding: no + +parallel: yes +n_jobs: 4 diff --git a/tests/large_test/input/config-2-parallel.yaml b/tests/large_test/input/config-2-parallel.yaml new file mode 100644 index 0000000000000000000000000000000000000000..26bff1b866b98520a2d6afac91df41330b681fd4 --- /dev/null +++ b/tests/large_test/input/config-2-parallel.yaml @@ -0,0 +1,5 @@ +discretize: yes +use_ordinal_encoding: yes + +parallel: yes +n_jobs: 4 diff --git a/tests/large_test/input/config-3-parallel.yaml b/tests/large_test/input/config-3-parallel.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a5b04c2b87d9937f7a94c686bced1793c12fba1 --- /dev/null +++ b/tests/large_test/input/config-3-parallel.yaml @@ -0,0 +1,4 @@ +discretize: no + +parallel: yes +n_jobs: 4 diff --git a/test/large_test/input_data.csv b/tests/large_test/input/data.csv similarity index 100% rename from test/large_test/input_data.csv rename to tests/large_test/input/data.csv diff --git a/test/large_test/pop.csv b/tests/large_test/input/pop.csv similarity index 100% rename from test/large_test/pop.csv rename to tests/large_test/input/pop.csv diff --git a/tests/small_test/Run-docker.ipynb b/tests/small_test/Run-docker.ipynb new file mode 100644 
index 0000000000000000000000000000000000000000..a18976b9795b6d3485faed12fd96ba7d795120d9 --- /dev/null +++ b/tests/small_test/Run-docker.ipynb @@ -0,0 +1,2473 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf output-*/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 1: discretize = False" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p output-1" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input:\n", + " Data : /datadir/input/data.csv\n", + " Population: /datadir/input/pop.csv\n", + " Config : /datadir/input/config-1.yaml\n", + "\n", + "Output directory: /datadir/output-1/\n", + "\n", + "Input arguments:\n", + " T = 4.0\n", + " dt = 1.0\n", + " θ₁ = 0.001\n", + " θ₂ = 0.001\n", + " θ_freq = 1.0\n", + " k = 3 ['min', 'max', 'mean']\n", + "\n", + "discretize = no\n", + "\n", + "N = 4\n", + "L = 4\n", + "\n", + "\n", + "================================================================================\n", + "2) Transform; 3) Post-filter\n", + "================================================================================\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Detecting and parsing value types\n", + "--------------------------------------------------------------------------------\n", + "Saved as: /datadir/output-1/value_types.csv\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Separate time-invariant and time-dependent\n", + "--------------------------------------------------------------------------------\n", + "Variables (time-invariant): 3\n", + "Variables (time-dependent): 4\n", + "# rows (time-invariant): 9\n", + "# rows (time-dependent): 23\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-A) Transform time-invariant data\n", + "--------------------------------------------------------------------------------\n", + "(N × ^d) table :\t (4, 3)\n", + "number of missing entries :\t 3 out of 12 total\n", + "Time elapsed: 0.044580 seconds\n", + "\n", + "Output\n", + "S_all, binary features :\t (4, 6)\n", + "Time elapsed: 0.274032 seconds\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-B) Transform time-dependent data\n", + "--------------------------------------------------------------------------------\n", + "Total variables : 4\n", + "Frequent variables : ['HR']\n", + "M₁ = 1\n", + "M₂ = 3\n", + "k = 3 ['min', 'max', 'mean']\n", + "\n", + "Transforming each example...\n", + "100%|█████████████████████████████████████████████| 4/4 [00:01<00:00, 3.60it/s]\n", + "DONE: Transforming each example...\n", + "(freq) number of missing entries :\t 5 out of 4×4×1=16 total\n", + "(freq) number of imputed entries :\t 4\n", + "(freq) number of not imputed entries :\t 1\n", + "(non-freq) number of missing entries :\t 41 out of 4×4×3=48 total\n", + "\n", + "(N × L × ^D) table :\t (4, 4, 9)\n", + "Time elapsed: 1.221479 seconds\n", + "Discretizing features...\n", + "\n", + "Discretizing categorical features...\n", + "100%|████████████████████████████████████████████| 9/9 [00:00<00:00, 132.10it/s]\n", + "Finished discretizing features\n", + "\n", + "Output\n", + "X_all: shape=(4, 
4, 12), density=0.599\n", + "Time elapsed: 1.397339 seconds\n" + ] + } + ], + "source": [ + "!docker run -it \\\n", + " --mount type='bind',src=\"$(pwd)\",target='/datadir' \\\n", + " fiddle-v020 \\\n", + "python -m FIDDLE.run \\\n", + " --data_fname='/datadir/input/data.csv' \\\n", + " --population_fname='/datadir/input/pop.csv' \\\n", + " --config_fname='/datadir/input/config-1.yaml' \\\n", + " --output_dir='/datadir/output-1/' \\\n", + " --T=4 --dt=1.0 \\\n", + " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n", + " --stats_functions 'min' 'max' 'mean' \\\n", + " --no_prefilter --no_postfilter" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AGE_valueROOM_value:_101ROOM_value:_102ROOM_value:_103SEX_value:FSEX_value:M
ID
150.01.00.00.00.01.0
233.00.01.00.00.01.0
340.00.00.01.01.00.0
441.00.00.00.00.00.0
\n", + "
" + ], + "text/plain": [ + " AGE_value ROOM_value:_101 ROOM_value:_102 ROOM_value:_103 SEX_value:F \\\n", + "ID \n", + "1 50.0 1.0 0.0 0.0 0.0 \n", + "2 33.0 0.0 1.0 0.0 0.0 \n", + "3 40.0 0.0 0.0 1.0 1.0 \n", + "4 41.0 0.0 0.0 0.0 0.0 \n", + "\n", + " SEX_value:M \n", + "ID \n", + "1 1.0 \n", + "2 1.0 \n", + "3 0.0 \n", + "4 0.0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HR_maskDRUG_A_RATE_valueDRUG_A_ROUTE_value:BolusDRUG_A_ROUTE_value:IVDRUG_A_ROUTE_value:OralLAB_X_valueHR_delta_timeHR_valueHR_minHR_maxHR_meanLAB_X_value_str:<1
IDt_range
1[0.0, 1.0)1.048.00.00.00.05.00.071.00000070.071.00000070.5000000.0
[1.0, 2.0)1.048.00.00.00.05.00.073.00000072.073.00000072.5000000.0
[2.0, 3.0)1.048.00.00.01.05.00.074.00000074.074.00000074.0000001.0
[3.0, 4.0)1.048.00.00.00.05.00.075.00000075.075.00000075.0000000.0
2[0.0, 1.0)1.048.00.00.00.05.00.061.00000060.061.00000060.3333330.0
[1.0, 2.0)0.048.00.00.00.05.01.061.00000061.061.00000061.0000000.0
[2.0, 3.0)1.048.00.00.00.05.00.078.00000073.078.00000075.5000000.0
[3.0, 4.0)1.048.00.01.00.05.00.075.00000075.075.00000075.0000000.0
3[0.0, 1.0)0.048.00.00.00.05.00.075.53333375.075.53333375.2555560.0
[1.0, 2.0)1.048.01.00.00.05.00.090.00000090.090.00000090.0000000.0
[2.0, 3.0)0.048.00.00.00.05.01.090.00000090.090.00000090.0000000.0
[3.0, 4.0)0.048.00.00.00.05.02.090.00000090.090.00000090.0000000.0
4[0.0, 1.0)1.048.00.00.00.05.00.080.00000080.080.00000080.0000000.0
[1.0, 2.0)0.048.00.00.00.05.01.080.00000080.080.00000080.0000000.0
[2.0, 3.0)1.048.00.00.00.05.00.062.00000062.062.00000062.0000000.0
[3.0, 4.0)1.048.00.00.00.05.00.073.00000073.073.00000073.0000000.0
\n", + "
" + ], + "text/plain": [ + " HR_mask DRUG_A_RATE_value DRUG_A_ROUTE_value:Bolus \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 1.0 48.0 0.0 \n", + " [1.0, 2.0) 1.0 48.0 0.0 \n", + " [2.0, 3.0) 1.0 48.0 0.0 \n", + " [3.0, 4.0) 1.0 48.0 0.0 \n", + "2 [0.0, 1.0) 1.0 48.0 0.0 \n", + " [1.0, 2.0) 0.0 48.0 0.0 \n", + " [2.0, 3.0) 1.0 48.0 0.0 \n", + " [3.0, 4.0) 1.0 48.0 0.0 \n", + "3 [0.0, 1.0) 0.0 48.0 0.0 \n", + " [1.0, 2.0) 1.0 48.0 1.0 \n", + " [2.0, 3.0) 0.0 48.0 0.0 \n", + " [3.0, 4.0) 0.0 48.0 0.0 \n", + "4 [0.0, 1.0) 1.0 48.0 0.0 \n", + " [1.0, 2.0) 0.0 48.0 0.0 \n", + " [2.0, 3.0) 1.0 48.0 0.0 \n", + " [3.0, 4.0) 1.0 48.0 0.0 \n", + "\n", + " DRUG_A_ROUTE_value:IV DRUG_A_ROUTE_value:Oral LAB_X_value \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0.0 0.0 5.0 \n", + " [1.0, 2.0) 0.0 0.0 5.0 \n", + " [2.0, 3.0) 0.0 1.0 5.0 \n", + " [3.0, 4.0) 0.0 0.0 5.0 \n", + "2 [0.0, 1.0) 0.0 0.0 5.0 \n", + " [1.0, 2.0) 0.0 0.0 5.0 \n", + " [2.0, 3.0) 0.0 0.0 5.0 \n", + " [3.0, 4.0) 1.0 0.0 5.0 \n", + "3 [0.0, 1.0) 0.0 0.0 5.0 \n", + " [1.0, 2.0) 0.0 0.0 5.0 \n", + " [2.0, 3.0) 0.0 0.0 5.0 \n", + " [3.0, 4.0) 0.0 0.0 5.0 \n", + "4 [0.0, 1.0) 0.0 0.0 5.0 \n", + " [1.0, 2.0) 0.0 0.0 5.0 \n", + " [2.0, 3.0) 0.0 0.0 5.0 \n", + " [3.0, 4.0) 0.0 0.0 5.0 \n", + "\n", + " HR_delta_time HR_value HR_min HR_max HR_mean \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0.0 71.000000 70.0 71.000000 70.500000 \n", + " [1.0, 2.0) 0.0 73.000000 72.0 73.000000 72.500000 \n", + " [2.0, 3.0) 0.0 74.000000 74.0 74.000000 74.000000 \n", + " [3.0, 4.0) 0.0 75.000000 75.0 75.000000 75.000000 \n", + "2 [0.0, 1.0) 0.0 61.000000 60.0 61.000000 60.333333 \n", + " [1.0, 2.0) 1.0 61.000000 61.0 61.000000 61.000000 \n", + " [2.0, 3.0) 0.0 78.000000 73.0 78.000000 75.500000 \n", + " [3.0, 4.0) 0.0 75.000000 75.0 75.000000 75.000000 \n", + "3 [0.0, 1.0) 0.0 75.533333 75.0 75.533333 75.255556 \n", + " [1.0, 2.0) 0.0 90.000000 90.0 90.000000 90.000000 \n", + " [2.0, 3.0) 1.0 90.000000 90.0 90.000000 90.000000 \n", + " [3.0, 4.0) 2.0 90.000000 90.0 90.000000 90.000000 \n", + "4 [0.0, 1.0) 0.0 80.000000 80.0 80.000000 80.000000 \n", + " [1.0, 2.0) 1.0 80.000000 80.0 80.000000 80.000000 \n", + " [2.0, 3.0) 0.0 62.000000 62.0 62.000000 62.000000 \n", + " [3.0, 4.0) 0.0 73.000000 73.0 73.000000 73.000000 \n", + "\n", + " LAB_X_value_str:<1 \n", + "ID t_range \n", + "1 [0.0, 1.0) 0.0 \n", + " [1.0, 2.0) 0.0 \n", + " [2.0, 3.0) 1.0 \n", + " [3.0, 4.0) 0.0 \n", + "2 [0.0, 1.0) 0.0 \n", + " [1.0, 2.0) 0.0 \n", + " [2.0, 3.0) 0.0 \n", + " [3.0, 4.0) 0.0 \n", + "3 [0.0, 1.0) 0.0 \n", + " [1.0, 2.0) 0.0 \n", + " [2.0, 3.0) 0.0 \n", + " [3.0, 4.0) 0.0 \n", + "4 [0.0, 1.0) 0.0 \n", + " [1.0, 2.0) 0.0 \n", + " [2.0, 3.0) 0.0 \n", + " [3.0, 4.0) 0.0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import json\n", + "import sparse\n", + "\n", + "S = sparse.load_npz('output-1/S_all.npz')\n", + "S_names = json.load(open('output-1/S_all.feature_names.json', 'r'))\n", + "S_index = pd.read_csv('output-1/S.ID.csv').set_index(['ID'])\n", + "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n", + "\n", + "X = sparse.load_npz('output-1/X_all.npz')\n", + "X_names = json.load(open('output-1/X_all.feature_names.json', 'r'))\n", + "X_index = pd.read_csv('output-1/X.ID,t_range.csv').set_index(['ID', 't_range'])\n", + "df_X = pd.DataFrame(X.todense().reshape(-1, X.shape[-1]), columns=X_names, index=X_index.index)\n", + "\n", + "display(df_S)\n", + "display(df_X)" + ] + }, + 
{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 2: discretize = True, use_ordinal_encoding = False" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p output-2" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input:\n", + " Data : /datadir/input/data.csv\n", + " Population: /datadir/input/pop.csv\n", + " Config : /datadir/input/config-2.yaml\n", + "\n", + "Output directory: /datadir/output-2/\n", + "\n", + "Input arguments:\n", + " T = 4.0\n", + " dt = 1.0\n", + " θ₁ = 0.001\n", + " θ₂ = 0.001\n", + " θ_freq = 1.0\n", + " k = 3 ['min', 'max', 'mean']\n", + "\n", + "discretize = yes\n", + " S discretization bins: to be computed from data\n", + " X discretization bins: to be computed from data\n", + "\n", + "N = 4\n", + "L = 4\n", + "\n", + "\n", + "================================================================================\n", + "2) Transform; 3) Post-filter\n", + "================================================================================\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Detecting and parsing value types\n", + "--------------------------------------------------------------------------------\n", + "Saved as: /datadir/output-2/value_types.csv\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Separate time-invariant and time-dependent\n", + "--------------------------------------------------------------------------------\n", + "Variables (time-invariant): 3\n", + "Variables (time-dependent): 4\n", + "# rows (time-invariant): 9\n", + "# rows (time-dependent): 23\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-A) Transform time-invariant data\n", + "--------------------------------------------------------------------------------\n", + "(N × ^d) table :\t (4, 3)\n", + "number of missing entries :\t 3 out of 12 total\n", + "Time elapsed: 0.057177 seconds\n", + "\n", + "Output\n", + "S_all, binary features :\t (4, 10)\n", + "Time elapsed: 0.212313 seconds\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-B) Transform time-dependent data\n", + "--------------------------------------------------------------------------------\n", + "Total variables : 4\n", + "Frequent variables : ['HR']\n", + "M₁ = 1\n", + "M₂ = 3\n", + "k = 3 ['min', 'max', 'mean']\n", + "\n", + "Transforming each example...\n", + "100%|█████████████████████████████████████████████| 4/4 [00:01<00:00, 2.82it/s]\n", + "DONE: Transforming each example...\n", + "(freq) number of missing entries :\t 5 out of 4×4×1=16 total\n", + "(freq) number of imputed entries :\t 4\n", + "(freq) number of not imputed entries :\t 1\n", + "(non-freq) number of missing entries :\t 41 out of 4×4×3=48 total\n", + "\n", + "(N × L × ^D) table :\t (4, 4, 9)\n", + "Time elapsed: 1.567708 seconds\n", + "Discretizing features...\n", + "\n", + "Processing 8 non-boolean variable columns...\n", + " Computing bin edges for numeric variables...\n", + "100%|█████████████████████████████████████████████| 8/8 [00:00<00:00, 72.63it/s]\n", + " Discretizing variables to binary features\n", + "100%|█████████████████████████████████████████████| 8/8 [00:00<00:00, 28.40it/s]\n", + "Finished discretizing features\n", + "\n", + 
"Output\n", + "X_all: shape=(4, 4, 29), density=0.203\n", + "Time elapsed: 2.102018 seconds\n" + ] + } + ], + "source": [ + "!docker run -it \\\n", + " --mount type='bind',src=\"$(pwd)\",target='/datadir' \\\n", + " fiddle-v020 \\\n", + "python -m FIDDLE.run \\\n", + " --data_fname='/datadir/input/data.csv' \\\n", + " --population_fname='/datadir/input/pop.csv' \\\n", + " --config_fname='/datadir/input/config-2.yaml' \\\n", + " --output_dir='/datadir/output-2/' \\\n", + " --T=4 --dt=1.0 \\\n", + " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n", + " --stats_functions 'min' 'max' 'mean' \\\n", + " --no_prefilter --no_postfilter" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AGE_value_(32.999, 35.8]AGE_value_(35.8, 38.6]AGE_value_(38.6, 42.0]AGE_value_(42.0, 46.0]AGE_value_(46.0, 50.0]ROOM_value__101ROOM_value__102ROOM_value__103SEX_value_FSEX_value_M
ID
10000110001
21000001001
30010000110
40000000000
\n", + "
" + ], + "text/plain": [ + " AGE_value_(32.999, 35.8] AGE_value_(35.8, 38.6] AGE_value_(38.6, 42.0] \\\n", + "ID \n", + "1 0 0 0 \n", + "2 1 0 0 \n", + "3 0 0 1 \n", + "4 0 0 0 \n", + "\n", + " AGE_value_(42.0, 46.0] AGE_value_(46.0, 50.0] ROOM_value__101 \\\n", + "ID \n", + "1 0 1 1 \n", + "2 0 0 0 \n", + "3 0 0 0 \n", + "4 0 0 0 \n", + "\n", + " ROOM_value__102 ROOM_value__103 SEX_value_F SEX_value_M \n", + "ID \n", + "1 0 0 0 1 \n", + "2 1 0 0 1 \n", + "3 0 1 1 0 \n", + "4 0 0 0 0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HR_maskDRUG_A_RATE_value_48DRUG_A_ROUTE_value_BolusDRUG_A_ROUTE_value_IVDRUG_A_ROUTE_value_OralLAB_X_value_5LAB_X_value_<1HR_delta_time_(-0.001, 1.0]HR_delta_time_(1.0, 2.0]HR_value_(60.999, 69.2]...HR_max_(60.999, 69.2]HR_max_(69.2, 73.6]HR_max_(73.6, 76.2]HR_max_(76.2, 82.0]HR_max_(82.0, 90.0]HR_mean_(60.332, 68.8]HR_mean_(68.8, 73.6]HR_mean_(73.6, 75.2]HR_mean_(75.2, 82.0]HR_mean_(82.0, 90.0]
IDt_range
1[0.0, 1.0)1000000100...0100001000
[1.0, 2.0)1000000100...0100001000
[2.0, 3.0)1100101100...0010000100
[3.0, 4.0)1000000100...0010000100
2[0.0, 1.0)1000000101...1000010000
[1.0, 2.0)0000000101...1000010000
[2.0, 3.0)1000000100...0001000010
[3.0, 4.0)1101000100...0010000100
3[0.0, 1.0)0000000100...0000000000
[1.0, 2.0)1010000100...0000100001
[2.0, 3.0)0000010100...0000100001
[3.0, 4.0)0000000010...0000100001
4[0.0, 1.0)1000000100...0001000010
[1.0, 2.0)0000000100...0001000010
[2.0, 3.0)1000000101...1000010000
[3.0, 4.0)1000000100...0100001000
\n", + "

16 rows × 29 columns

\n", + "
" + ], + "text/plain": [ + " HR_mask DRUG_A_RATE_value_48 DRUG_A_ROUTE_value_Bolus \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 1 0 0 \n", + " [1.0, 2.0) 1 0 0 \n", + " [2.0, 3.0) 1 1 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "2 [0.0, 1.0) 1 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 1 0 0 \n", + " [3.0, 4.0) 1 1 0 \n", + "3 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 1 0 1 \n", + " [2.0, 3.0) 0 0 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "4 [0.0, 1.0) 1 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 1 0 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "\n", + " DRUG_A_ROUTE_value_IV DRUG_A_ROUTE_value_Oral LAB_X_value_5 \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 1 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "2 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 0 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "3 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 0 1 \n", + " [3.0, 4.0) 0 0 0 \n", + "4 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 0 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "\n", + " LAB_X_value_<1 HR_delta_time_(-0.001, 1.0] \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 1 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 1 1 \n", + " [3.0, 4.0) 0 1 \n", + "2 [0.0, 1.0) 0 1 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 0 1 \n", + " [3.0, 4.0) 0 1 \n", + "3 [0.0, 1.0) 0 1 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 0 1 \n", + " [3.0, 4.0) 0 0 \n", + "4 [0.0, 1.0) 0 1 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 0 1 \n", + " [3.0, 4.0) 0 1 \n", + "\n", + " HR_delta_time_(1.0, 2.0] HR_value_(60.999, 69.2] ... \\\n", + "ID t_range ... \n", + "1 [0.0, 1.0) 0 0 ... \n", + " [1.0, 2.0) 0 0 ... \n", + " [2.0, 3.0) 0 0 ... \n", + " [3.0, 4.0) 0 0 ... \n", + "2 [0.0, 1.0) 0 1 ... \n", + " [1.0, 2.0) 0 1 ... \n", + " [2.0, 3.0) 0 0 ... \n", + " [3.0, 4.0) 0 0 ... \n", + "3 [0.0, 1.0) 0 0 ... \n", + " [1.0, 2.0) 0 0 ... \n", + " [2.0, 3.0) 0 0 ... \n", + " [3.0, 4.0) 1 0 ... \n", + "4 [0.0, 1.0) 0 0 ... \n", + " [1.0, 2.0) 0 0 ... \n", + " [2.0, 3.0) 0 1 ... \n", + " [3.0, 4.0) 0 0 ... 
\n", + "\n", + " HR_max_(60.999, 69.2] HR_max_(69.2, 73.6] \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 1 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "2 [0.0, 1.0) 1 0 \n", + " [1.0, 2.0) 1 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "3 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "4 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 1 0 \n", + " [3.0, 4.0) 0 1 \n", + "\n", + " HR_max_(73.6, 76.2] HR_max_(76.2, 82.0] HR_max_(82.0, 90.0] \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 1 0 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "2 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 1 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "3 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 1 \n", + " [2.0, 3.0) 0 0 1 \n", + " [3.0, 4.0) 0 0 1 \n", + "4 [0.0, 1.0) 0 1 0 \n", + " [1.0, 2.0) 0 1 0 \n", + " [2.0, 3.0) 0 0 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "\n", + " HR_mean_(60.332, 68.8] HR_mean_(68.8, 73.6] \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 1 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "2 [0.0, 1.0) 1 0 \n", + " [1.0, 2.0) 1 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "3 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "4 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 1 0 \n", + " [3.0, 4.0) 0 1 \n", + "\n", + " HR_mean_(73.6, 75.2] HR_mean_(75.2, 82.0] \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 1 0 \n", + " [3.0, 4.0) 1 0 \n", + "2 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 0 1 \n", + " [3.0, 4.0) 1 0 \n", + "3 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "4 [0.0, 1.0) 0 1 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "\n", + " HR_mean_(82.0, 90.0] \n", + "ID t_range \n", + "1 [0.0, 1.0) 0 \n", + " [1.0, 2.0) 0 \n", + " [2.0, 3.0) 0 \n", + " [3.0, 4.0) 0 \n", + "2 [0.0, 1.0) 0 \n", + " [1.0, 2.0) 0 \n", + " [2.0, 3.0) 0 \n", + " [3.0, 4.0) 0 \n", + "3 [0.0, 1.0) 0 \n", + " [1.0, 2.0) 1 \n", + " [2.0, 3.0) 1 \n", + " [3.0, 4.0) 1 \n", + "4 [0.0, 1.0) 0 \n", + " [1.0, 2.0) 0 \n", + " [2.0, 3.0) 0 \n", + " [3.0, 4.0) 0 \n", + "\n", + "[16 rows x 29 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import json\n", + "import sparse\n", + "\n", + "S = sparse.load_npz('output-2/S_all.npz')\n", + "S_names = json.load(open('output-2/S_all.feature_names.json', 'r'))\n", + "S_index = pd.read_csv('output-2/S.ID.csv').set_index(['ID'])\n", + "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n", + "\n", + "X = sparse.load_npz('output-2/X_all.npz')\n", + "X_names = json.load(open('output-2/X_all.feature_names.json', 'r'))\n", + "X_index = pd.read_csv('output-2/X.ID,t_range.csv').set_index(['ID', 't_range'])\n", + "df_X = pd.DataFrame(X.todense().reshape(-1, X.shape[-1]), columns=X_names, index=X_index.index)\n", + "\n", + "display(df_S)\n", + "display(df_X)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 3: discretize = True, use_ordinal_encoding = True" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p output-3" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input:\n", + " Data : /datadir/input/data.csv\n", + " Population: /datadir/input/pop.csv\n", + " Config : /datadir/input/config-3.yaml\n", + "\n", + "Output directory: /datadir/output-3/\n", + "\n", + "Input arguments:\n", + " T = 4.0\n", + " dt = 1.0\n", + " θ₁ = 0.001\n", + " θ₂ = 0.001\n", + " θ_freq = 1.0\n", + " k = 3 ['min', 'max', 'mean']\n", + "\n", + "discretize = yes\n", + " S discretization bins: to be computed from data\n", + " X discretization bins: to be computed from data\n", + "\n", + "N = 4\n", + "L = 4\n", + "\n", + "\n", + "================================================================================\n", + "2) Transform; 3) Post-filter\n", + "================================================================================\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Detecting and parsing value types\n", + "--------------------------------------------------------------------------------\n", + "Saved as: /datadir/output-3/value_types.csv\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Separate time-invariant and time-dependent\n", + "--------------------------------------------------------------------------------\n", + "Variables (time-invariant): 3\n", + "Variables (time-dependent): 4\n", + "# rows (time-invariant): 9\n", + "# rows (time-dependent): 23\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-A) Transform time-invariant data\n", + "--------------------------------------------------------------------------------\n", + "(N × ^d) table :\t (4, 3)\n", + "number of missing entries :\t 3 out of 12 total\n", + "Time elapsed: 0.047925 seconds\n", + "\n", + "Output\n", + "S_all, binary features :\t (4, 10)\n", + "Time elapsed: 0.147781 seconds\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-B) Transform time-dependent data\n", + "--------------------------------------------------------------------------------\n", + "Total variables : 4\n", + "Frequent variables : ['HR']\n", + "M₁ = 1\n", + "M₂ = 3\n", + "k = 3 ['min', 'max', 'mean']\n", + "\n", + "Transforming each example...\n", + "100%|█████████████████████████████████████████████| 4/4 [00:01<00:00, 3.53it/s]\n", + "DONE: Transforming each example...\n", + "(freq) number of missing entries :\t 5 out of 4×4×1=16 total\n", + "(freq) number of imputed entries :\t 4\n", + "(freq) number of not imputed entries :\t 1\n", + "(non-freq) number of missing entries :\t 41 out of 4×4×3=48 total\n", + "\n", + "(N × L × ^D) table :\t (4, 4, 9)\n", + "Time elapsed: 1.239074 seconds\n", + "Discretizing features...\n", + "\n", + "Processing 8 non-boolean variable columns...\n", + " Computing bin edges for numeric variables...\n", + "100%|████████████████████████████████████████████| 8/8 [00:00<00:00, 165.76it/s]\n", + " Discretizing variables to binary features\n", + "100%|█████████████████████████████████████████████| 8/8 [00:00<00:00, 24.97it/s]\n", + "Finished discretizing features\n", + "\n", + "Output\n", + "X_all: shape=(4, 4, 29), density=0.474\n", + "Time elapsed: 1.718451 seconds\n" + ] + } + ], + "source": [ + "!docker run -it \\\n", + " --mount type='bind',src=\"$(pwd)\",target='/datadir' \\\n", + " fiddle-v020 \\\n", + "python -m FIDDLE.run \\\n", + " --data_fname='/datadir/input/data.csv' \\\n", + " 
--population_fname='/datadir/input/pop.csv' \\\n", + " --config_fname='/datadir/input/config-3.yaml' \\\n", + " --output_dir='/datadir/output-3/' \\\n", + " --T=4 --dt=1.0 \\\n", + " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n", + " --stats_functions 'min' 'max' 'mean' \\\n", + " --no_prefilter --no_postfilter" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AGE_value>=33.0AGE_value>=35.8AGE_value>=38.6AGE_value>=42.0AGE_value>=46.0ROOM_value__101ROOM_value__102ROOM_value__103SEX_value_FSEX_value_M
ID
11111110001
21000001001
31110000110
40000000000
\n", + "
" + ], + "text/plain": [ + " AGE_value>=33.0 AGE_value>=35.8 AGE_value>=38.6 AGE_value>=42.0 \\\n", + "ID \n", + "1 1 1 1 1 \n", + "2 1 0 0 0 \n", + "3 1 1 1 0 \n", + "4 0 0 0 0 \n", + "\n", + " AGE_value>=46.0 ROOM_value__101 ROOM_value__102 ROOM_value__103 \\\n", + "ID \n", + "1 1 1 0 0 \n", + "2 0 0 1 0 \n", + "3 0 0 0 1 \n", + "4 0 0 0 0 \n", + "\n", + " SEX_value_F SEX_value_M \n", + "ID \n", + "1 0 1 \n", + "2 0 1 \n", + "3 1 0 \n", + "4 0 0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HR_maskDRUG_A_RATE_value_48DRUG_A_ROUTE_value_BolusDRUG_A_ROUTE_value_IVDRUG_A_ROUTE_value_OralLAB_X_value_5LAB_X_value_<1HR_delta_time>=0.0HR_delta_time>=1.0HR_value>=61.0...HR_max>=61.0HR_max>=69.2HR_max>=73.6HR_max>=76.2HR_max>=82.00000000000001HR_mean>=60.333333333333336HR_mean>=68.80000000000001HR_mean>=73.6HR_mean>=75.2HR_mean>=82.00000000000001
IDt_range
1[0.0, 1.0)1000000101...1100011000
[1.0, 2.0)1000000101...1100011000
[2.0, 3.0)1100101101...1110011100
[3.0, 4.0)1000000101...1110011100
2[0.0, 1.0)1000000101...1000010000
[1.0, 2.0)0000000111...1000010000
[2.0, 3.0)1000000101...1111011110
[3.0, 4.0)1101000101...1110011100
3[0.0, 1.0)0000000100...0000000000
[1.0, 2.0)1010000101...1111111111
[2.0, 3.0)0000010111...1111111111
[3.0, 4.0)0000000111...1111111111
4[0.0, 1.0)1000000101...1111011110
[1.0, 2.0)0000000111...1111011110
[2.0, 3.0)1000000101...1000010000
[3.0, 4.0)1000000101...1100011000
\n", + "

16 rows × 29 columns

\n", + "
" + ], + "text/plain": [ + " HR_mask DRUG_A_RATE_value_48 DRUG_A_ROUTE_value_Bolus \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 1 0 0 \n", + " [1.0, 2.0) 1 0 0 \n", + " [2.0, 3.0) 1 1 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "2 [0.0, 1.0) 1 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 1 0 0 \n", + " [3.0, 4.0) 1 1 0 \n", + "3 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 1 0 1 \n", + " [2.0, 3.0) 0 0 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "4 [0.0, 1.0) 1 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 1 0 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "\n", + " DRUG_A_ROUTE_value_IV DRUG_A_ROUTE_value_Oral LAB_X_value_5 \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 1 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "2 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 0 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "3 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 0 1 \n", + " [3.0, 4.0) 0 0 0 \n", + "4 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 0 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "\n", + " LAB_X_value_<1 HR_delta_time>=0.0 HR_delta_time>=1.0 \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 1 0 \n", + " [1.0, 2.0) 0 1 0 \n", + " [2.0, 3.0) 1 1 0 \n", + " [3.0, 4.0) 0 1 0 \n", + "2 [0.0, 1.0) 0 1 0 \n", + " [1.0, 2.0) 0 1 1 \n", + " [2.0, 3.0) 0 1 0 \n", + " [3.0, 4.0) 0 1 0 \n", + "3 [0.0, 1.0) 0 1 0 \n", + " [1.0, 2.0) 0 1 0 \n", + " [2.0, 3.0) 0 1 1 \n", + " [3.0, 4.0) 0 1 1 \n", + "4 [0.0, 1.0) 0 1 0 \n", + " [1.0, 2.0) 0 1 1 \n", + " [2.0, 3.0) 0 1 0 \n", + " [3.0, 4.0) 0 1 0 \n", + "\n", + " HR_value>=61.0 ... HR_max>=61.0 HR_max>=69.2 HR_max>=73.6 \\\n", + "ID t_range ... \n", + "1 [0.0, 1.0) 1 ... 1 1 0 \n", + " [1.0, 2.0) 1 ... 1 1 0 \n", + " [2.0, 3.0) 1 ... 1 1 1 \n", + " [3.0, 4.0) 1 ... 1 1 1 \n", + "2 [0.0, 1.0) 1 ... 1 0 0 \n", + " [1.0, 2.0) 1 ... 1 0 0 \n", + " [2.0, 3.0) 1 ... 1 1 1 \n", + " [3.0, 4.0) 1 ... 1 1 1 \n", + "3 [0.0, 1.0) 0 ... 0 0 0 \n", + " [1.0, 2.0) 1 ... 1 1 1 \n", + " [2.0, 3.0) 1 ... 1 1 1 \n", + " [3.0, 4.0) 1 ... 1 1 1 \n", + "4 [0.0, 1.0) 1 ... 1 1 1 \n", + " [1.0, 2.0) 1 ... 1 1 1 \n", + " [2.0, 3.0) 1 ... 1 0 0 \n", + " [3.0, 4.0) 1 ... 
1 1 0 \n", + "\n", + " HR_max>=76.2 HR_max>=82.00000000000001 \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "2 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 1 0 \n", + " [3.0, 4.0) 0 0 \n", + "3 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 1 1 \n", + " [2.0, 3.0) 1 1 \n", + " [3.0, 4.0) 1 1 \n", + "4 [0.0, 1.0) 1 0 \n", + " [1.0, 2.0) 1 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "\n", + " HR_mean>=60.333333333333336 HR_mean>=68.80000000000001 \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 1 1 \n", + " [1.0, 2.0) 1 1 \n", + " [2.0, 3.0) 1 1 \n", + " [3.0, 4.0) 1 1 \n", + "2 [0.0, 1.0) 1 0 \n", + " [1.0, 2.0) 1 0 \n", + " [2.0, 3.0) 1 1 \n", + " [3.0, 4.0) 1 1 \n", + "3 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 1 1 \n", + " [2.0, 3.0) 1 1 \n", + " [3.0, 4.0) 1 1 \n", + "4 [0.0, 1.0) 1 1 \n", + " [1.0, 2.0) 1 1 \n", + " [2.0, 3.0) 1 0 \n", + " [3.0, 4.0) 1 1 \n", + "\n", + " HR_mean>=73.6 HR_mean>=75.2 HR_mean>=82.00000000000001 \n", + "ID t_range \n", + "1 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 1 0 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "2 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 1 1 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "3 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 1 1 1 \n", + " [2.0, 3.0) 1 1 1 \n", + " [3.0, 4.0) 1 1 1 \n", + "4 [0.0, 1.0) 1 1 0 \n", + " [1.0, 2.0) 1 1 0 \n", + " [2.0, 3.0) 0 0 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "\n", + "[16 rows x 29 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import json\n", + "import sparse\n", + "\n", + "S = sparse.load_npz('output-3/S_all.npz')\n", + "S_names = json.load(open('output-3/S_all.feature_names.json', 'r'))\n", + "S_index = pd.read_csv('output-3/S.ID.csv').set_index(['ID'])\n", + "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n", + "\n", + "X = sparse.load_npz('output-3/X_all.npz')\n", + "X_names = json.load(open('output-3/X_all.feature_names.json', 'r'))\n", + "X_index = pd.read_csv('output-3/X.ID,t_range.csv').set_index(['ID', 't_range'])\n", + "df_X = pd.DataFrame(X.todense().reshape(-1, X.shape[-1]), columns=X_names, index=X_index.index)\n", + "\n", + "display(df_S)\n", + "display(df_X)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tests/small_test/Run.ipynb b/tests/small_test/Run.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..b9a073f0aefaf94cee655782941896fd07baf23e --- /dev/null +++ b/tests/small_test/Run.ipynb @@ -0,0 +1,3390 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf output-*/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 1: discretize = False" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p output-1" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + 
"metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input:\n", + " Data : ./input/data.csv\n", + " Population: ./input/pop.csv\n", + " Config : ./input/config-1.yaml\n", + "\n", + "Output directory: ./output-1/\n", + "\n", + "Input arguments:\n", + " T = 4.0\n", + " dt = 1.0\n", + " θ₁ = 0.001\n", + " θ₂ = 0.001\n", + " θ_freq = 1.0\n", + " k = 3 ['min', 'max', 'mean']\n", + "\n", + "discretize = no\n", + "\n", + "N = 4\n", + "L = 4\n", + "\n", + "\n", + "================================================================================\n", + "2) Transform; 3) Post-filter\n", + "================================================================================\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Detecting and parsing value types\n", + "--------------------------------------------------------------------------------\n", + "Saved as: ./output-1/value_types.csv\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Separate time-invariant and time-dependent\n", + "--------------------------------------------------------------------------------\n", + "Variables (time-invariant): 3\n", + "Variables (time-dependent): 4\n", + "# rows (time-invariant): 9\n", + "# rows (time-dependent): 23\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-A) Transform time-invariant data\n", + "--------------------------------------------------------------------------------\n", + "(N × ^d) table :\t (4, 3)\n", + "number of missing entries :\t 3 out of 12 total\n", + "Time elapsed: 0.027288 seconds\n", + "\n", + "Output\n", + "s_all, binary features :\t (4, 6)\n", + "Time elapsed: 0.058650 seconds\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-B) Transform time-dependent data\n", + "--------------------------------------------------------------------------------\n", + "Total variables : 4\n", + "Frequent variables : ['HR']\n", + "M₁ = 1\n", + "M₂ = 3\n", + "k = 3 ['min', 'max', 'mean']\n", + "\n", + "Transforming each example...\n", + "100%|█████████████████████████████████████████████| 4/4 [00:00<00:00, 4.82it/s]\n", + "DONE: Transforming each example...\n", + "(freq) number of missing entries :\t 5 out of 4×4×1=16 total\n", + "(freq) number of imputed entries :\t 4\n", + "(freq) number of not imputed entries :\t 1\n", + "(non-freq) number of missing entries :\t 41 out of 4×4×3=48 total\n", + "\n", + "(N × L × ^D) table :\t (4, 4, 9)\n", + "Time elapsed: 0.917519 seconds\n", + "Discretizing features...\n", + "\n", + "Discretizing categorical features...\n", + "100%|████████████████████████████████████████████| 9/9 [00:00<00:00, 202.11it/s]\n", + "Finished discretizing features\n", + "\n", + "Output\n", + "X_all: shape=(4, 4, 12), density=0.599\n", + "Time elapsed: 1.008456 seconds\n", + "\u001b[0m" + ] + } + ], + "source": [ + "! 
PYTHONPATH=\"$PYTHONPATH:../../\" \\\n", + "python -m FIDDLE.run \\\n", + " --data_fname='./input/data.csv' \\\n", + " --population_fname='./input/pop.csv' \\\n", + " --config_fname='./input/config-1.yaml' \\\n", + " --output_dir='./output-1/' \\\n", + " --T=4 --dt=1.0 \\\n", + " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n", + " --stats_functions 'min' 'max' 'mean' \\\n", + " --no_prefilter --no_postfilter" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AGE_valueROOM_value:_101ROOM_value:_102ROOM_value:_103SEX_value:FSEX_value:M
ID
150.01.00.00.00.01.0
233.00.01.00.00.01.0
340.00.00.01.01.00.0
441.00.00.00.00.00.0
\n", + "
" + ], + "text/plain": [ + " AGE_value ROOM_value:_101 ROOM_value:_102 ROOM_value:_103 SEX_value:F \\\n", + "ID \n", + "1 50.0 1.0 0.0 0.0 0.0 \n", + "2 33.0 0.0 1.0 0.0 0.0 \n", + "3 40.0 0.0 0.0 1.0 1.0 \n", + "4 41.0 0.0 0.0 0.0 0.0 \n", + "\n", + " SEX_value:M \n", + "ID \n", + "1 1.0 \n", + "2 1.0 \n", + "3 0.0 \n", + "4 0.0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HR_maskDRUG_A_RATE_valueDRUG_A_ROUTE_value:BolusDRUG_A_ROUTE_value:IVDRUG_A_ROUTE_value:OralLAB_X_valueHR_delta_timeHR_valueHR_minHR_maxHR_meanLAB_X_value_str:<1
IDt_range
1[0.0, 1.0)1.048.00.00.00.05.00.071.00000070.071.00000070.5000000.0
[1.0, 2.0)1.048.00.00.00.05.00.073.00000072.073.00000072.5000000.0
[2.0, 3.0)1.048.00.00.01.05.00.074.00000074.074.00000074.0000001.0
[3.0, 4.0)1.048.00.00.00.05.00.075.00000075.075.00000075.0000000.0
2[0.0, 1.0)1.048.00.00.00.05.00.061.00000060.061.00000060.3333330.0
[1.0, 2.0)0.048.00.00.00.05.01.061.00000061.061.00000061.0000000.0
[2.0, 3.0)1.048.00.00.00.05.00.078.00000073.078.00000075.5000000.0
[3.0, 4.0)1.048.00.01.00.05.00.075.00000075.075.00000075.0000000.0
3[0.0, 1.0)0.048.00.00.00.05.00.075.53333375.075.53333375.2555560.0
[1.0, 2.0)1.048.01.00.00.05.00.090.00000090.090.00000090.0000000.0
[2.0, 3.0)0.048.00.00.00.05.01.090.00000090.090.00000090.0000000.0
[3.0, 4.0)0.048.00.00.00.05.02.090.00000090.090.00000090.0000000.0
4[0.0, 1.0)1.048.00.00.00.05.00.080.00000080.080.00000080.0000000.0
[1.0, 2.0)0.048.00.00.00.05.01.080.00000080.080.00000080.0000000.0
[2.0, 3.0)1.048.00.00.00.05.00.062.00000062.062.00000062.0000000.0
[3.0, 4.0)1.048.00.00.00.05.00.073.00000073.073.00000073.0000000.0
\n", + "
" + ], + "text/plain": [ + " HR_mask DRUG_A_RATE_value DRUG_A_ROUTE_value:Bolus \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 1.0 48.0 0.0 \n", + " [1.0, 2.0) 1.0 48.0 0.0 \n", + " [2.0, 3.0) 1.0 48.0 0.0 \n", + " [3.0, 4.0) 1.0 48.0 0.0 \n", + "2 [0.0, 1.0) 1.0 48.0 0.0 \n", + " [1.0, 2.0) 0.0 48.0 0.0 \n", + " [2.0, 3.0) 1.0 48.0 0.0 \n", + " [3.0, 4.0) 1.0 48.0 0.0 \n", + "3 [0.0, 1.0) 0.0 48.0 0.0 \n", + " [1.0, 2.0) 1.0 48.0 1.0 \n", + " [2.0, 3.0) 0.0 48.0 0.0 \n", + " [3.0, 4.0) 0.0 48.0 0.0 \n", + "4 [0.0, 1.0) 1.0 48.0 0.0 \n", + " [1.0, 2.0) 0.0 48.0 0.0 \n", + " [2.0, 3.0) 1.0 48.0 0.0 \n", + " [3.0, 4.0) 1.0 48.0 0.0 \n", + "\n", + " DRUG_A_ROUTE_value:IV DRUG_A_ROUTE_value:Oral LAB_X_value \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0.0 0.0 5.0 \n", + " [1.0, 2.0) 0.0 0.0 5.0 \n", + " [2.0, 3.0) 0.0 1.0 5.0 \n", + " [3.0, 4.0) 0.0 0.0 5.0 \n", + "2 [0.0, 1.0) 0.0 0.0 5.0 \n", + " [1.0, 2.0) 0.0 0.0 5.0 \n", + " [2.0, 3.0) 0.0 0.0 5.0 \n", + " [3.0, 4.0) 1.0 0.0 5.0 \n", + "3 [0.0, 1.0) 0.0 0.0 5.0 \n", + " [1.0, 2.0) 0.0 0.0 5.0 \n", + " [2.0, 3.0) 0.0 0.0 5.0 \n", + " [3.0, 4.0) 0.0 0.0 5.0 \n", + "4 [0.0, 1.0) 0.0 0.0 5.0 \n", + " [1.0, 2.0) 0.0 0.0 5.0 \n", + " [2.0, 3.0) 0.0 0.0 5.0 \n", + " [3.0, 4.0) 0.0 0.0 5.0 \n", + "\n", + " HR_delta_time HR_value HR_min HR_max HR_mean \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0.0 71.000000 70.0 71.000000 70.500000 \n", + " [1.0, 2.0) 0.0 73.000000 72.0 73.000000 72.500000 \n", + " [2.0, 3.0) 0.0 74.000000 74.0 74.000000 74.000000 \n", + " [3.0, 4.0) 0.0 75.000000 75.0 75.000000 75.000000 \n", + "2 [0.0, 1.0) 0.0 61.000000 60.0 61.000000 60.333333 \n", + " [1.0, 2.0) 1.0 61.000000 61.0 61.000000 61.000000 \n", + " [2.0, 3.0) 0.0 78.000000 73.0 78.000000 75.500000 \n", + " [3.0, 4.0) 0.0 75.000000 75.0 75.000000 75.000000 \n", + "3 [0.0, 1.0) 0.0 75.533333 75.0 75.533333 75.255556 \n", + " [1.0, 2.0) 0.0 90.000000 90.0 90.000000 90.000000 \n", + " [2.0, 3.0) 1.0 90.000000 90.0 90.000000 90.000000 \n", + " [3.0, 4.0) 2.0 90.000000 90.0 90.000000 90.000000 \n", + "4 [0.0, 1.0) 0.0 80.000000 80.0 80.000000 80.000000 \n", + " [1.0, 2.0) 1.0 80.000000 80.0 80.000000 80.000000 \n", + " [2.0, 3.0) 0.0 62.000000 62.0 62.000000 62.000000 \n", + " [3.0, 4.0) 0.0 73.000000 73.0 73.000000 73.000000 \n", + "\n", + " LAB_X_value_str:<1 \n", + "ID t_range \n", + "1 [0.0, 1.0) 0.0 \n", + " [1.0, 2.0) 0.0 \n", + " [2.0, 3.0) 1.0 \n", + " [3.0, 4.0) 0.0 \n", + "2 [0.0, 1.0) 0.0 \n", + " [1.0, 2.0) 0.0 \n", + " [2.0, 3.0) 0.0 \n", + " [3.0, 4.0) 0.0 \n", + "3 [0.0, 1.0) 0.0 \n", + " [1.0, 2.0) 0.0 \n", + " [2.0, 3.0) 0.0 \n", + " [3.0, 4.0) 0.0 \n", + "4 [0.0, 1.0) 0.0 \n", + " [1.0, 2.0) 0.0 \n", + " [2.0, 3.0) 0.0 \n", + " [3.0, 4.0) 0.0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import json\n", + "import sparse\n", + "\n", + "S = sparse.load_npz('output-1/S_all.npz')\n", + "S_names = json.load(open('output-1/S_all.feature_names.json', 'r'))\n", + "S_index = pd.read_csv('output-1/S.ID.csv').set_index(['ID'])\n", + "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n", + "\n", + "X = sparse.load_npz('output-1/X_all.npz')\n", + "X_names = json.load(open('output-1/X_all.feature_names.json', 'r'))\n", + "X_index = pd.read_csv('output-1/X.ID,t_range.csv').set_index(['ID', 't_range'])\n", + "df_X = pd.DataFrame(X.todense().reshape(-1, X.shape[-1]), columns=X_names, index=X_index.index)\n", + "\n", + "display(df_S)\n", + "display(df_X)" + ] + }, + 
{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 2: discretize = True, use_ordinal_encoding = False" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p output-2" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input:\n", + " Data : ./input/data.csv\n", + " Population: ./input/pop.csv\n", + " Config : ./input/config-2.yaml\n", + "\n", + "Output directory: ./output-2/\n", + "\n", + "Input arguments:\n", + " T = 4.0\n", + " dt = 1.0\n", + " θ₁ = 0.001\n", + " θ₂ = 0.001\n", + " θ_freq = 1.0\n", + " k = 3 ['min', 'max', 'mean']\n", + "\n", + "discretize = yes\n", + " S discretization bins: to be computed from data\n", + " X discretization bins: to be computed from data\n", + "\n", + "N = 4\n", + "L = 4\n", + "\n", + "\n", + "================================================================================\n", + "2) Transform; 3) Post-filter\n", + "================================================================================\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Detecting and parsing value types\n", + "--------------------------------------------------------------------------------\n", + "Saved as: ./output-2/value_types.csv\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Separate time-invariant and time-dependent\n", + "--------------------------------------------------------------------------------\n", + "Variables (time-invariant): 3\n", + "Variables (time-dependent): 4\n", + "# rows (time-invariant): 9\n", + "# rows (time-dependent): 23\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-A) Transform time-invariant data\n", + "--------------------------------------------------------------------------------\n", + "(N × ^d) table :\t (4, 3)\n", + "number of missing entries :\t 3 out of 12 total\n", + "Time elapsed: 0.020094 seconds\n", + "\n", + "Output\n", + "s_all, binary features :\t (4, 10)\n", + "Time elapsed: 0.065039 seconds\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-B) Transform time-dependent data\n", + "--------------------------------------------------------------------------------\n", + "Total variables : 4\n", + "Frequent variables : ['HR']\n", + "M₁ = 1\n", + "M₂ = 3\n", + "k = 3 ['min', 'max', 'mean']\n", + "\n", + "Transforming each example...\n", + "100%|█████████████████████████████████████████████| 4/4 [00:00<00:00, 6.65it/s]\n", + "DONE: Transforming each example...\n", + "(freq) number of missing entries :\t 5 out of 4×4×1=16 total\n", + "(freq) number of imputed entries :\t 4\n", + "(freq) number of not imputed entries :\t 1\n", + "(non-freq) number of missing entries :\t 41 out of 4×4×3=48 total\n", + "\n", + "(N × L × ^D) table :\t (4, 4, 9)\n", + "Time elapsed: 0.653901 seconds\n", + "Discretizing features...\n", + "\n", + "Processing 8 non-boolean variable columns...\n", + " Computing bin edges for numeric variables...\n", + "100%|████████████████████████████████████████████| 8/8 [00:00<00:00, 228.60it/s]\n", + " Discretizing variables to binary features\n", + "100%|█████████████████████████████████████████████| 8/8 [00:00<00:00, 87.10it/s]\n", + "Finished discretizing features\n", + "\n", + "Output\n", + "X_all: shape=(4, 4, 
29), density=0.203\n", + "Time elapsed: 0.800083 seconds\n", + "\u001b[0m" + ] + } + ], + "source": [ + "! PYTHONPATH=\"$PYTHONPATH:../../\" \\\n", + "python -m FIDDLE.run \\\n", + " --data_fname='./input/data.csv' \\\n", + " --population_fname='./input/pop.csv' \\\n", + " --config_fname='./input/config-2.yaml' \\\n", + " --output_dir='./output-2/' \\\n", + " --T=4 --dt=1.0 \\\n", + " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n", + " --stats_functions 'min' 'max' 'mean' \\\n", + " --no_prefilter --no_postfilter" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AGE_value_(32.999, 35.8]AGE_value_(35.8, 38.6]AGE_value_(38.6, 42.0]AGE_value_(42.0, 46.0]AGE_value_(46.0, 50.0]ROOM_value__101ROOM_value__102ROOM_value__103SEX_value_FSEX_value_M
ID
10000110001
21000001001
30010000110
40000000000
\n", + "
" + ], + "text/plain": [ + " AGE_value_(32.999, 35.8] AGE_value_(35.8, 38.6] AGE_value_(38.6, 42.0] \\\n", + "ID \n", + "1 0 0 0 \n", + "2 1 0 0 \n", + "3 0 0 1 \n", + "4 0 0 0 \n", + "\n", + " AGE_value_(42.0, 46.0] AGE_value_(46.0, 50.0] ROOM_value__101 \\\n", + "ID \n", + "1 0 1 1 \n", + "2 0 0 0 \n", + "3 0 0 0 \n", + "4 0 0 0 \n", + "\n", + " ROOM_value__102 ROOM_value__103 SEX_value_F SEX_value_M \n", + "ID \n", + "1 0 0 0 1 \n", + "2 1 0 0 1 \n", + "3 0 1 1 0 \n", + "4 0 0 0 0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HR_maskDRUG_A_RATE_value_48DRUG_A_ROUTE_value_BolusDRUG_A_ROUTE_value_IVDRUG_A_ROUTE_value_OralLAB_X_value_5LAB_X_value_<1HR_delta_time_(-0.001, 1.0]HR_delta_time_(1.0, 2.0]HR_value_(60.999, 69.2]...HR_max_(60.999, 69.2]HR_max_(69.2, 73.6]HR_max_(73.6, 76.2]HR_max_(76.2, 82.0]HR_max_(82.0, 90.0]HR_mean_(60.332, 68.8]HR_mean_(68.8, 73.6]HR_mean_(73.6, 75.2]HR_mean_(75.2, 82.0]HR_mean_(82.0, 90.0]
IDt_range
1[0.0, 1.0)1000000100...0100001000
[1.0, 2.0)1000000100...0100001000
[2.0, 3.0)1100101100...0010000100
[3.0, 4.0)1000000100...0010000100
2[0.0, 1.0)1000000101...1000010000
[1.0, 2.0)0000000101...1000010000
[2.0, 3.0)1000000100...0001000010
[3.0, 4.0)1101000100...0010000100
3[0.0, 1.0)0000000100...0000000000
[1.0, 2.0)1010000100...0000100001
[2.0, 3.0)0000010100...0000100001
[3.0, 4.0)0000000010...0000100001
4[0.0, 1.0)1000000100...0001000010
[1.0, 2.0)0000000100...0001000010
[2.0, 3.0)1000000101...1000010000
[3.0, 4.0)1000000100...0100001000
\n", + "

16 rows × 29 columns

\n", + "
" + ], + "text/plain": [ + " HR_mask DRUG_A_RATE_value_48 DRUG_A_ROUTE_value_Bolus \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 1 0 0 \n", + " [1.0, 2.0) 1 0 0 \n", + " [2.0, 3.0) 1 1 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "2 [0.0, 1.0) 1 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 1 0 0 \n", + " [3.0, 4.0) 1 1 0 \n", + "3 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 1 0 1 \n", + " [2.0, 3.0) 0 0 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "4 [0.0, 1.0) 1 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 1 0 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "\n", + " DRUG_A_ROUTE_value_IV DRUG_A_ROUTE_value_Oral LAB_X_value_5 \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 1 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "2 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 0 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "3 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 0 1 \n", + " [3.0, 4.0) 0 0 0 \n", + "4 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 0 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "\n", + " LAB_X_value_<1 HR_delta_time_(-0.001, 1.0] \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 1 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 1 1 \n", + " [3.0, 4.0) 0 1 \n", + "2 [0.0, 1.0) 0 1 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 0 1 \n", + " [3.0, 4.0) 0 1 \n", + "3 [0.0, 1.0) 0 1 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 0 1 \n", + " [3.0, 4.0) 0 0 \n", + "4 [0.0, 1.0) 0 1 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 0 1 \n", + " [3.0, 4.0) 0 1 \n", + "\n", + " HR_delta_time_(1.0, 2.0] HR_value_(60.999, 69.2] ... \\\n", + "ID t_range ... \n", + "1 [0.0, 1.0) 0 0 ... \n", + " [1.0, 2.0) 0 0 ... \n", + " [2.0, 3.0) 0 0 ... \n", + " [3.0, 4.0) 0 0 ... \n", + "2 [0.0, 1.0) 0 1 ... \n", + " [1.0, 2.0) 0 1 ... \n", + " [2.0, 3.0) 0 0 ... \n", + " [3.0, 4.0) 0 0 ... \n", + "3 [0.0, 1.0) 0 0 ... \n", + " [1.0, 2.0) 0 0 ... \n", + " [2.0, 3.0) 0 0 ... \n", + " [3.0, 4.0) 1 0 ... \n", + "4 [0.0, 1.0) 0 0 ... \n", + " [1.0, 2.0) 0 0 ... \n", + " [2.0, 3.0) 0 1 ... \n", + " [3.0, 4.0) 0 0 ... 
\n", + "\n", + " HR_max_(60.999, 69.2] HR_max_(69.2, 73.6] \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 1 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "2 [0.0, 1.0) 1 0 \n", + " [1.0, 2.0) 1 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "3 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "4 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 1 0 \n", + " [3.0, 4.0) 0 1 \n", + "\n", + " HR_max_(73.6, 76.2] HR_max_(76.2, 82.0] HR_max_(82.0, 90.0] \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 1 0 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "2 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 1 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "3 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 1 \n", + " [2.0, 3.0) 0 0 1 \n", + " [3.0, 4.0) 0 0 1 \n", + "4 [0.0, 1.0) 0 1 0 \n", + " [1.0, 2.0) 0 1 0 \n", + " [2.0, 3.0) 0 0 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "\n", + " HR_mean_(60.332, 68.8] HR_mean_(68.8, 73.6] \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 1 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "2 [0.0, 1.0) 1 0 \n", + " [1.0, 2.0) 1 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "3 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "4 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 1 0 \n", + " [3.0, 4.0) 0 1 \n", + "\n", + " HR_mean_(73.6, 75.2] HR_mean_(75.2, 82.0] \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 1 0 \n", + " [3.0, 4.0) 1 0 \n", + "2 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 0 1 \n", + " [3.0, 4.0) 1 0 \n", + "3 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "4 [0.0, 1.0) 0 1 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "\n", + " HR_mean_(82.0, 90.0] \n", + "ID t_range \n", + "1 [0.0, 1.0) 0 \n", + " [1.0, 2.0) 0 \n", + " [2.0, 3.0) 0 \n", + " [3.0, 4.0) 0 \n", + "2 [0.0, 1.0) 0 \n", + " [1.0, 2.0) 0 \n", + " [2.0, 3.0) 0 \n", + " [3.0, 4.0) 0 \n", + "3 [0.0, 1.0) 0 \n", + " [1.0, 2.0) 1 \n", + " [2.0, 3.0) 1 \n", + " [3.0, 4.0) 1 \n", + "4 [0.0, 1.0) 0 \n", + " [1.0, 2.0) 0 \n", + " [2.0, 3.0) 0 \n", + " [3.0, 4.0) 0 \n", + "\n", + "[16 rows x 29 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import json\n", + "import sparse\n", + "\n", + "S = sparse.load_npz('output-2/S_all.npz')\n", + "S_names = json.load(open('output-2/S_all.feature_names.json', 'r'))\n", + "S_index = pd.read_csv('output-2/S.ID.csv').set_index(['ID'])\n", + "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n", + "\n", + "X = sparse.load_npz('output-2/X_all.npz')\n", + "X_names = json.load(open('output-2/X_all.feature_names.json', 'r'))\n", + "X_index = pd.read_csv('output-2/X.ID,t_range.csv').set_index(['ID', 't_range'])\n", + "df_X = pd.DataFrame(X.todense().reshape(-1, X.shape[-1]), columns=X_names, index=X_index.index)\n", + "\n", + "display(df_S)\n", + "display(df_X)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test 2.1: predetermined discretization bins" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p output-2-bins" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + 
{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Input:\n", + " Data : ./input/data.csv\n", + " Population: ./input/pop.csv\n", + " Config : ./input/config-2-bins.yaml\n", + "\n", + "Output directory: ./output-2-bins/\n", + "\n", + "Input arguments:\n", + " T = 4.0\n", + " dt = 1.0\n", + " θ₁ = 0.001\n", + " θ₂ = 0.001\n", + " θ_freq = 1.0\n", + " k = 3 ['min', 'max', 'mean']\n", + "\n", + "discretize = yes\n", + " S discretization bins: input/s_all.discretization.json\n", + " X discretization bins: input/X_all.discretization.json\n", + "\n", + "N = 4\n", + "L = 4\n", + "\n", + "\n", + "================================================================================\n", + "2) Transform; 3) Post-filter\n", + "================================================================================\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Detecting and parsing value types\n", + "--------------------------------------------------------------------------------\n", + "Saved as: ./output-2-bins/value_types.csv\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Separate time-invariant and time-dependent\n", + "--------------------------------------------------------------------------------\n", + "Variables (time-invariant): 3\n", + "Variables (time-dependent): 4\n", + "# rows (time-invariant): 9\n", + "# rows (time-dependent): 23\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-A) Transform time-invariant data\n", + "--------------------------------------------------------------------------------\n", + "(N × ^d) table :\t (4, 3)\n", + "number of missing entries :\t 3 out of 12 total\n", + "Time elapsed: 0.018257 seconds\n", + "\n", + "Output\n", + "s_all, binary features :\t (4, 10)\n", + "Time elapsed: 0.055306 seconds\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-B) Transform time-dependent data\n", + "--------------------------------------------------------------------------------\n", + "Total variables : 4\n", + "Frequent variables : ['HR']\n", + "M₁ = 1\n", + "M₂ = 3\n", + "k = 3 ['min', 'max', 'mean']\n", + "\n", + "Transforming each example...\n", + "100%|█████████████████████████████████████████████| 4/4 [00:00<00:00, 5.98it/s]\n", + "DONE: Transforming each example...\n", + "(freq) number of missing entries :\t 5 out of 4×4×1=16 total\n", + "(freq) number of imputed entries :\t 4\n", + "(freq) number of not imputed entries :\t 1\n", + "(non-freq) number of missing entries :\t 41 out of 4×4×3=48 total\n", + "\n", + "(N × L × ^D) table :\t (4, 4, 9)\n", + "Time elapsed: 0.727093 seconds\n", + "Discretizing features...\n", + "\n", + "Processing 8 non-boolean variable columns...\n", + " Usng predetermined bin edges for numeric variables...\n", + " Discretizing variables to binary features\n", + "100%|█████████████████████████████████████████████| 8/8 [00:00<00:00, 99.05it/s]\n", + "Finished discretizing features\n", + "\n", + "Output\n", + "X_all: shape=(4, 4, 29), density=0.203\n", + "Time elapsed: 0.828591 seconds\n", + "\u001b[0m" + ] + } + ], + "source": [ + "! 
PYTHONPATH=\"$PYTHONPATH:../../\" \\\n", + "python -m FIDDLE.run \\\n", + " --data_fname='./input/data.csv' \\\n", + " --population_fname='./input/pop.csv' \\\n", + " --config_fname='./input/config-2-bins.yaml' \\\n", + " --output_dir='./output-2-bins/' \\\n", + " --T=4 --dt=1.0 \\\n", + " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n", + " --stats_functions 'min' 'max' 'mean' \\\n", + " --no_prefilter --no_postfilter" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AGE_value_(32.999, 35.8]AGE_value_(35.8, 38.6]AGE_value_(38.6, 42.0]AGE_value_(42.0, 46.0]AGE_value_(46.0, 50.0]ROOM_value__101ROOM_value__102ROOM_value__103SEX_value_FSEX_value_M
ID
10000110001
21000001001
30010000110
40000000000
\n", + "
" + ], + "text/plain": [ + " AGE_value_(32.999, 35.8] AGE_value_(35.8, 38.6] AGE_value_(38.6, 42.0] \\\n", + "ID \n", + "1 0 0 0 \n", + "2 1 0 0 \n", + "3 0 0 1 \n", + "4 0 0 0 \n", + "\n", + " AGE_value_(42.0, 46.0] AGE_value_(46.0, 50.0] ROOM_value__101 \\\n", + "ID \n", + "1 0 1 1 \n", + "2 0 0 0 \n", + "3 0 0 0 \n", + "4 0 0 0 \n", + "\n", + " ROOM_value__102 ROOM_value__103 SEX_value_F SEX_value_M \n", + "ID \n", + "1 0 0 0 1 \n", + "2 1 0 0 1 \n", + "3 0 1 1 0 \n", + "4 0 0 0 0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HR_maskDRUG_A_RATE_value_48DRUG_A_ROUTE_value_BolusDRUG_A_ROUTE_value_IVDRUG_A_ROUTE_value_OralLAB_X_value_5LAB_X_value_<1HR_delta_time_(-0.001, 1.0]HR_delta_time_(1.0, 2.0]HR_value_(60.999, 69.2]...HR_max_(60.999, 69.2]HR_max_(69.2, 73.6]HR_max_(73.6, 76.2]HR_max_(76.2, 82.0]HR_max_(82.0, 90.0]HR_mean_(60.332, 68.8]HR_mean_(68.8, 73.6]HR_mean_(73.6, 75.2]HR_mean_(75.2, 82.0]HR_mean_(82.0, 90.0]
IDt_range
1[0.0, 1.0)1000000100...0100001000
[1.0, 2.0)1000000100...0100001000
[2.0, 3.0)1100101100...0010000100
[3.0, 4.0)1000000100...0010000100
2[0.0, 1.0)1000000101...1000010000
[1.0, 2.0)0000000101...1000010000
[2.0, 3.0)1000000100...0001000010
[3.0, 4.0)1101000100...0010000100
3[0.0, 1.0)0000000100...0000000000
[1.0, 2.0)1010000100...0000100001
[2.0, 3.0)0000010100...0000100001
[3.0, 4.0)0000000010...0000100001
4[0.0, 1.0)1000000100...0001000010
[1.0, 2.0)0000000100...0001000010
[2.0, 3.0)1000000101...1000010000
[3.0, 4.0)1000000100...0100001000
\n", + "

16 rows × 29 columns

\n", + "
" + ], + "text/plain": [ + " HR_mask DRUG_A_RATE_value_48 DRUG_A_ROUTE_value_Bolus \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 1 0 0 \n", + " [1.0, 2.0) 1 0 0 \n", + " [2.0, 3.0) 1 1 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "2 [0.0, 1.0) 1 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 1 0 0 \n", + " [3.0, 4.0) 1 1 0 \n", + "3 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 1 0 1 \n", + " [2.0, 3.0) 0 0 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "4 [0.0, 1.0) 1 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 1 0 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "\n", + " DRUG_A_ROUTE_value_IV DRUG_A_ROUTE_value_Oral LAB_X_value_5 \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 1 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "2 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 0 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "3 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 0 1 \n", + " [3.0, 4.0) 0 0 0 \n", + "4 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 0 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "\n", + " LAB_X_value_<1 HR_delta_time_(-0.001, 1.0] \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 1 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 1 1 \n", + " [3.0, 4.0) 0 1 \n", + "2 [0.0, 1.0) 0 1 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 0 1 \n", + " [3.0, 4.0) 0 1 \n", + "3 [0.0, 1.0) 0 1 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 0 1 \n", + " [3.0, 4.0) 0 0 \n", + "4 [0.0, 1.0) 0 1 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 0 1 \n", + " [3.0, 4.0) 0 1 \n", + "\n", + " HR_delta_time_(1.0, 2.0] HR_value_(60.999, 69.2] ... \\\n", + "ID t_range ... \n", + "1 [0.0, 1.0) 0 0 ... \n", + " [1.0, 2.0) 0 0 ... \n", + " [2.0, 3.0) 0 0 ... \n", + " [3.0, 4.0) 0 0 ... \n", + "2 [0.0, 1.0) 0 1 ... \n", + " [1.0, 2.0) 0 1 ... \n", + " [2.0, 3.0) 0 0 ... \n", + " [3.0, 4.0) 0 0 ... \n", + "3 [0.0, 1.0) 0 0 ... \n", + " [1.0, 2.0) 0 0 ... \n", + " [2.0, 3.0) 0 0 ... \n", + " [3.0, 4.0) 1 0 ... \n", + "4 [0.0, 1.0) 0 0 ... \n", + " [1.0, 2.0) 0 0 ... \n", + " [2.0, 3.0) 0 1 ... \n", + " [3.0, 4.0) 0 0 ... 
\n", + "\n", + " HR_max_(60.999, 69.2] HR_max_(69.2, 73.6] \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 1 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "2 [0.0, 1.0) 1 0 \n", + " [1.0, 2.0) 1 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "3 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "4 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 1 0 \n", + " [3.0, 4.0) 0 1 \n", + "\n", + " HR_max_(73.6, 76.2] HR_max_(76.2, 82.0] HR_max_(82.0, 90.0] \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 1 0 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "2 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 1 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "3 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 1 \n", + " [2.0, 3.0) 0 0 1 \n", + " [3.0, 4.0) 0 0 1 \n", + "4 [0.0, 1.0) 0 1 0 \n", + " [1.0, 2.0) 0 1 0 \n", + " [2.0, 3.0) 0 0 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "\n", + " HR_mean_(60.332, 68.8] HR_mean_(68.8, 73.6] \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 1 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "2 [0.0, 1.0) 1 0 \n", + " [1.0, 2.0) 1 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "3 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "4 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 1 0 \n", + " [3.0, 4.0) 0 1 \n", + "\n", + " HR_mean_(73.6, 75.2] HR_mean_(75.2, 82.0] \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 1 0 \n", + " [3.0, 4.0) 1 0 \n", + "2 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 0 1 \n", + " [3.0, 4.0) 1 0 \n", + "3 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "4 [0.0, 1.0) 0 1 \n", + " [1.0, 2.0) 0 1 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "\n", + " HR_mean_(82.0, 90.0] \n", + "ID t_range \n", + "1 [0.0, 1.0) 0 \n", + " [1.0, 2.0) 0 \n", + " [2.0, 3.0) 0 \n", + " [3.0, 4.0) 0 \n", + "2 [0.0, 1.0) 0 \n", + " [1.0, 2.0) 0 \n", + " [2.0, 3.0) 0 \n", + " [3.0, 4.0) 0 \n", + "3 [0.0, 1.0) 0 \n", + " [1.0, 2.0) 1 \n", + " [2.0, 3.0) 1 \n", + " [3.0, 4.0) 1 \n", + "4 [0.0, 1.0) 0 \n", + " [1.0, 2.0) 0 \n", + " [2.0, 3.0) 0 \n", + " [3.0, 4.0) 0 \n", + "\n", + "[16 rows x 29 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import json\n", + "import sparse\n", + "\n", + "S = sparse.load_npz('output-2-bins/S_all.npz')\n", + "S_names = json.load(open('output-2-bins/S_all.feature_names.json', 'r'))\n", + "S_index = pd.read_csv('output-2-bins/S.ID.csv').set_index(['ID'])\n", + "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n", + "\n", + "X = sparse.load_npz('output-2-bins/X_all.npz')\n", + "X_names = json.load(open('output-2-bins/X_all.feature_names.json', 'r'))\n", + "X_index = pd.read_csv('output-2-bins/X.ID,t_range.csv').set_index(['ID', 't_range'])\n", + "df_X = pd.DataFrame(X.todense().reshape(-1, X.shape[-1]), columns=X_names, index=X_index.index)\n", + "\n", + "display(df_S)\n", + "display(df_X)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test 3: discretize = True, use_ordinal_encoding = True" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p output-3" + ] + }, + { + "cell_type": "code", + "execution_count": 
12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input:\n", + " Data : ./input/data.csv\n", + " Population: ./input/pop.csv\n", + " Config : ./input/config-3.yaml\n", + "\n", + "Output directory: ./output-3/\n", + "\n", + "Input arguments:\n", + " T = 4.0\n", + " dt = 1.0\n", + " θ₁ = 0.001\n", + " θ₂ = 0.001\n", + " θ_freq = 1.0\n", + " k = 3 ['min', 'max', 'mean']\n", + "\n", + "discretize = yes\n", + " S discretization bins: to be computed from data\n", + " X discretization bins: to be computed from data\n", + "\n", + "N = 4\n", + "L = 4\n", + "\n", + "\n", + "================================================================================\n", + "2) Transform; 3) Post-filter\n", + "================================================================================\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Detecting and parsing value types\n", + "--------------------------------------------------------------------------------\n", + "Saved as: ./output-3/value_types.csv\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*) Separate time-invariant and time-dependent\n", + "--------------------------------------------------------------------------------\n", + "Variables (time-invariant): 3\n", + "Variables (time-dependent): 4\n", + "# rows (time-invariant): 9\n", + "# rows (time-dependent): 23\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-A) Transform time-invariant data\n", + "--------------------------------------------------------------------------------\n", + "(N × ^d) table :\t (4, 3)\n", + "number of missing entries :\t 3 out of 12 total\n", + "Time elapsed: 0.018871 seconds\n", + "\n", + "Output\n", + "s_all, binary features :\t (4, 10)\n", + "Time elapsed: 0.061661 seconds\n", + "\n", + "--------------------------------------------------------------------------------\n", + "2-B) Transform time-dependent data\n", + "--------------------------------------------------------------------------------\n", + "Total variables : 4\n", + "Frequent variables : ['HR']\n", + "M₁ = 1\n", + "M₂ = 3\n", + "k = 3 ['min', 'max', 'mean']\n", + "\n", + "Transforming each example...\n", + "100%|█████████████████████████████████████████████| 4/4 [00:00<00:00, 5.89it/s]\n", + "DONE: Transforming each example...\n", + "(freq) number of missing entries :\t 5 out of 4×4×1=16 total\n", + "(freq) number of imputed entries :\t 4\n", + "(freq) number of not imputed entries :\t 1\n", + "(non-freq) number of missing entries :\t 41 out of 4×4×3=48 total\n", + "\n", + "(N × L × ^D) table :\t (4, 4, 9)\n", + "Time elapsed: 0.735244 seconds\n", + "Discretizing features...\n", + "\n", + "Processing 8 non-boolean variable columns...\n", + " Computing bin edges for numeric variables...\n", + "100%|████████████████████████████████████████████| 8/8 [00:00<00:00, 313.93it/s]\n", + " Discretizing variables to binary features\n", + "100%|█████████████████████████████████████████████| 8/8 [00:00<00:00, 38.42it/s]\n", + "Finished discretizing features\n", + "\n", + "Output\n", + "X_all: shape=(4, 4, 29), density=0.420\n", + "Time elapsed: 0.989317 seconds\n", + "\u001b[0m" + ] + } + ], + "source": [ + "! 
PYTHONPATH=\"$PYTHONPATH:../../\" \\\n", + "python -m FIDDLE.run \\\n", + " --data_fname='./input/data.csv' \\\n", + " --population_fname='./input/pop.csv' \\\n", + " --config_fname='./input/config-3.yaml' \\\n", + " --output_dir='./output-3/' \\\n", + " --T=4 --dt=1.0 \\\n", + " --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n", + " --stats_functions 'min' 'max' 'mean' \\\n", + " --no_prefilter --no_postfilter" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AGE_value>=33.0AGE_value>=35.8AGE_value>=38.6AGE_value>=42.0AGE_value>=46.0ROOM_value__101ROOM_value__102ROOM_value__103SEX_value_FSEX_value_M
ID
11111110001
20000001001
31110000110
40000000000
\n", + "
" + ], + "text/plain": [ + " AGE_value>=33.0 AGE_value>=35.8 AGE_value>=38.6 AGE_value>=42.0 \\\n", + "ID \n", + "1 1 1 1 1 \n", + "2 0 0 0 0 \n", + "3 1 1 1 0 \n", + "4 0 0 0 0 \n", + "\n", + " AGE_value>=46.0 ROOM_value__101 ROOM_value__102 ROOM_value__103 \\\n", + "ID \n", + "1 1 1 0 0 \n", + "2 0 0 1 0 \n", + "3 0 0 0 1 \n", + "4 0 0 0 0 \n", + "\n", + " SEX_value_F SEX_value_M \n", + "ID \n", + "1 0 1 \n", + "2 0 1 \n", + "3 1 0 \n", + "4 0 0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HR_maskDRUG_A_RATE_value_48DRUG_A_ROUTE_value_BolusDRUG_A_ROUTE_value_IVDRUG_A_ROUTE_value_OralLAB_X_value_5LAB_X_value_<1HR_delta_time>=0.0HR_delta_time>=1.0HR_value>=61.0...HR_max>=61.0HR_max>=69.2HR_max>=73.6HR_max>=76.2HR_max>=82.00000000000001HR_mean>=60.333333333333336HR_mean>=68.80000000000001HR_mean>=73.6HR_mean>=75.2HR_mean>=82.00000000000001
IDt_range
1[0.0, 1.0)1000000001...1100011000
[1.0, 2.0)1000000001...1100011000
[2.0, 3.0)1100101001...1110011100
[3.0, 4.0)1000000001...1110011100
2[0.0, 1.0)1000000000...0000000000
[1.0, 2.0)0000000100...0000010000
[2.0, 3.0)1000000001...1111011110
[3.0, 4.0)1101000001...1110011100
3[0.0, 1.0)0000000000...0000000000
[1.0, 2.0)1010000001...1111111111
[2.0, 3.0)0000010101...1111111111
[3.0, 4.0)0000000111...1111111111
4[0.0, 1.0)1000000001...1111011110
[1.0, 2.0)0000000101...1111011110
[2.0, 3.0)1000000001...1000010000
[3.0, 4.0)1000000001...1100011000
\n", + "

16 rows × 29 columns

\n", + "
" + ], + "text/plain": [ + " HR_mask DRUG_A_RATE_value_48 DRUG_A_ROUTE_value_Bolus \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 1 0 0 \n", + " [1.0, 2.0) 1 0 0 \n", + " [2.0, 3.0) 1 1 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "2 [0.0, 1.0) 1 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 1 0 0 \n", + " [3.0, 4.0) 1 1 0 \n", + "3 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 1 0 1 \n", + " [2.0, 3.0) 0 0 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "4 [0.0, 1.0) 1 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 1 0 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "\n", + " DRUG_A_ROUTE_value_IV DRUG_A_ROUTE_value_Oral LAB_X_value_5 \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 1 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "2 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 0 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "3 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 0 1 \n", + " [3.0, 4.0) 0 0 0 \n", + "4 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 0 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "\n", + " LAB_X_value_<1 HR_delta_time>=0.0 HR_delta_time>=1.0 \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 1 0 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "2 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 1 0 \n", + " [2.0, 3.0) 0 0 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "3 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 0 1 0 \n", + " [3.0, 4.0) 0 1 1 \n", + "4 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 1 0 \n", + " [2.0, 3.0) 0 0 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "\n", + " HR_value>=61.0 ... HR_max>=61.0 HR_max>=69.2 HR_max>=73.6 \\\n", + "ID t_range ... \n", + "1 [0.0, 1.0) 1 ... 1 1 0 \n", + " [1.0, 2.0) 1 ... 1 1 0 \n", + " [2.0, 3.0) 1 ... 1 1 1 \n", + " [3.0, 4.0) 1 ... 1 1 1 \n", + "2 [0.0, 1.0) 0 ... 0 0 0 \n", + " [1.0, 2.0) 0 ... 0 0 0 \n", + " [2.0, 3.0) 1 ... 1 1 1 \n", + " [3.0, 4.0) 1 ... 1 1 1 \n", + "3 [0.0, 1.0) 0 ... 0 0 0 \n", + " [1.0, 2.0) 1 ... 1 1 1 \n", + " [2.0, 3.0) 1 ... 1 1 1 \n", + " [3.0, 4.0) 1 ... 1 1 1 \n", + "4 [0.0, 1.0) 1 ... 1 1 1 \n", + " [1.0, 2.0) 1 ... 1 1 1 \n", + " [2.0, 3.0) 1 ... 1 0 0 \n", + " [3.0, 4.0) 1 ... 
1 1 0 \n", + "\n", + " HR_max>=76.2 HR_max>=82.00000000000001 \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "2 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 0 0 \n", + " [2.0, 3.0) 1 0 \n", + " [3.0, 4.0) 0 0 \n", + "3 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 1 1 \n", + " [2.0, 3.0) 1 1 \n", + " [3.0, 4.0) 1 1 \n", + "4 [0.0, 1.0) 1 0 \n", + " [1.0, 2.0) 1 0 \n", + " [2.0, 3.0) 0 0 \n", + " [3.0, 4.0) 0 0 \n", + "\n", + " HR_mean>=60.333333333333336 HR_mean>=68.80000000000001 \\\n", + "ID t_range \n", + "1 [0.0, 1.0) 1 1 \n", + " [1.0, 2.0) 1 1 \n", + " [2.0, 3.0) 1 1 \n", + " [3.0, 4.0) 1 1 \n", + "2 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 1 0 \n", + " [2.0, 3.0) 1 1 \n", + " [3.0, 4.0) 1 1 \n", + "3 [0.0, 1.0) 0 0 \n", + " [1.0, 2.0) 1 1 \n", + " [2.0, 3.0) 1 1 \n", + " [3.0, 4.0) 1 1 \n", + "4 [0.0, 1.0) 1 1 \n", + " [1.0, 2.0) 1 1 \n", + " [2.0, 3.0) 1 0 \n", + " [3.0, 4.0) 1 1 \n", + "\n", + " HR_mean>=73.6 HR_mean>=75.2 HR_mean>=82.00000000000001 \n", + "ID t_range \n", + "1 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 1 0 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "2 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 0 0 0 \n", + " [2.0, 3.0) 1 1 0 \n", + " [3.0, 4.0) 1 0 0 \n", + "3 [0.0, 1.0) 0 0 0 \n", + " [1.0, 2.0) 1 1 1 \n", + " [2.0, 3.0) 1 1 1 \n", + " [3.0, 4.0) 1 1 1 \n", + "4 [0.0, 1.0) 1 1 0 \n", + " [1.0, 2.0) 1 1 0 \n", + " [2.0, 3.0) 0 0 0 \n", + " [3.0, 4.0) 0 0 0 \n", + "\n", + "[16 rows x 29 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import json\n", + "import sparse\n", + "\n", + "S = sparse.load_npz('output-3/S_all.npz')\n", + "S_names = json.load(open('output-3/S_all.feature_names.json', 'r'))\n", + "S_index = pd.read_csv('output-3/S.ID.csv').set_index(['ID'])\n", + "df_S = pd.DataFrame(S.todense(), columns=S_names, index=S_index.index)\n", + "\n", + "X = sparse.load_npz('output-3/X_all.npz')\n", + "X_names = json.load(open('output-3/X_all.feature_names.json', 'r'))\n", + "X_index = pd.read_csv('output-3/X.ID,t_range.csv').set_index(['ID', 't_range'])\n", + "df_X = pd.DataFrame(X.todense().reshape(-1, X.shape[-1]), columns=X_names, index=X_index.index)\n", + "\n", + "display(df_S)\n", + "display(df_X)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tests/small_test/input/X_all.discretization.json b/tests/small_test/input/X_all.discretization.json new file mode 100644 index 0000000000000000000000000000000000000000..03d039879cb3c1d58b76e4ec0f5f59a41da86118 --- /dev/null +++ b/tests/small_test/input/X_all.discretization.json @@ -0,0 +1 @@ +{"DRUG_A_RATE_value": null, "DRUG_A_ROUTE_value": null, "LAB_X_value": null, "HR_delta_time": [0.0, 1.0, 2.0], "HR_value": [61.0, 69.2, 73.6, 76.2, 82.00000000000001, 90.0], "HR_min": [60.0, 68.4, 73.0, 75.0, 82.00000000000001, 90.0], "HR_max": [61.0, 69.2, 73.6, 76.2, 82.00000000000001, 90.0], "HR_mean": [60.333333333333336, 68.80000000000001, 73.6, 75.2, 82.00000000000001, 90.0]} \ 
No newline at end of file diff --git a/tests/small_test/input/config-1.yaml b/tests/small_test/input/config-1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..679862de171094f80f20a98d99579939a1d7b665 --- /dev/null +++ b/tests/small_test/input/config-1.yaml @@ -0,0 +1,7 @@ +discretize: no + +parallel: no +n_jobs: 1 + +value_types: + ROOM: Categorical diff --git a/tests/small_test/input/config-2-bins.yaml b/tests/small_test/input/config-2-bins.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aa79b4bd4a9e4f4cba5cb9bcc1d93291819d70fc --- /dev/null +++ b/tests/small_test/input/config-2-bins.yaml @@ -0,0 +1,10 @@ +discretize: yes +use_ordinal_encoding: no +S_discretization_bins: 'input/S_all.discretization.json' +X_discretization_bins: 'input/X_all.discretization.json' + +parallel: no +n_jobs: 1 + +value_types: + ROOM: Categorical diff --git a/tests/small_test/input/config-2.yaml b/tests/small_test/input/config-2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e51bb502223b6129081521e1e34056db5b8c63e5 --- /dev/null +++ b/tests/small_test/input/config-2.yaml @@ -0,0 +1,8 @@ +discretize: yes +use_ordinal_encoding: no + +parallel: no +n_jobs: 1 + +value_types: + ROOM: Categorical diff --git a/tests/small_test/input/config-3.yaml b/tests/small_test/input/config-3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..370e730db136225c1e2cd563fbf2a0737eae2fd6 --- /dev/null +++ b/tests/small_test/input/config-3.yaml @@ -0,0 +1,8 @@ +discretize: yes +use_ordinal_encoding: yes + +parallel: no +n_jobs: 1 + +value_types: + ROOM: Categorical diff --git a/test/small_test/input_data.csv b/tests/small_test/input/data.csv similarity index 77% rename from test/small_test/input_data.csv rename to tests/small_test/input/data.csv index 86239ef9d15bc7da8f46d9897855faaf09335a8e..2dc23c5d6688c40c8745b8d3fd5a709c7432957e 100644 --- a/test/small_test/input_data.csv +++ b/tests/small_test/input/data.csv @@ -1,6 +1,7 @@ ID,t,variable_name,variable_value 1,NULL,AGE,50 2,NULL,AGE,33 +3,NULL,AGE,40 1,NULL,SEX,M 2,NULL,SEX,M 3,NULL,SEX,F @@ -20,13 +21,13 @@ ID,t,variable_name,variable_value 2,2.9,HR,78 2,3.5,HR,75 3,1.7,HR,90 +4,0.7,HR,80 +4,2.5,HR,62 +4,3.9,HR,73 1,2.3,DRUG_A_RATE,48 2,3.4,DRUG_A_RATE,48 -1,2.3,DRUG_A_ROUTE,Mouth -2,3.4,DRUG_A_ROUTE,Cont.IV +1,2.3,DRUG_A_ROUTE,Oral +2,3.4,DRUG_A_ROUTE,IV 3,1,DRUG_A_ROUTE,Bolus 1,2.3,LAB_X,<1 -3,2.7,LAB_X,5 -4,0.7,HR,80 -4,2.5,HR,62 -4,3.9,HR,73 \ No newline at end of file +3,2.7,LAB_X,5 \ No newline at end of file diff --git a/test/small_test/pop.csv b/tests/small_test/input/pop.csv similarity index 100% rename from test/small_test/pop.csv rename to tests/small_test/input/pop.csv index 4d6ba9bc759bda91e8b5dd2ab73db68a9e7bb83b..a911283dac17180ad2b16dab766a87adf56de95c 100644 --- a/test/small_test/pop.csv +++ b/tests/small_test/input/pop.csv @@ -1,5 +1,5 @@ ID -1 -2 3 4 +1 +2 diff --git a/tests/small_test/input/S_all.discretization.json b/tests/small_test/input/S_all.discretization.json new file mode 100644 index 0000000000000000000000000000000000000000..6b746e3b311206f5102f18efa0006b3476905f80 --- /dev/null +++ b/tests/small_test/input/S_all.discretization.json @@ -0,0 +1 @@ +{"AGE_value": [33.0, 35.8, 38.6, 42.0, 46.0, 50.0], "ROOM_value": null, "SEX_value": null} \ No newline at end of file diff --git a/tests/small_test/reference-1/df_S.csv b/tests/small_test/reference-1/df_S.csv new file mode 100644 index
0000000000000000000000000000000000000000..d055870959d3625ee26ae058bd0f041737aac6ba --- /dev/null +++ b/tests/small_test/reference-1/df_S.csv @@ -0,0 +1,5 @@ +ID,AGE_value,ROOM_value:_101,ROOM_value:_102,ROOM_value:_103,SEX_value:F,SEX_value:M +1,50.0,1.0,0.0,0.0,0.0,1.0 +2,33.0,0.0,1.0,0.0,0.0,1.0 +3,40.0,0.0,0.0,1.0,1.0,0.0 +4,41.0,0.0,0.0,0.0,0.0,0.0 diff --git a/tests/small_test/reference-1/df_X.csv b/tests/small_test/reference-1/df_X.csv new file mode 100644 index 0000000000000000000000000000000000000000..283ba54ffe73b95cfca7784a08947f5416b44ee3 --- /dev/null +++ b/tests/small_test/reference-1/df_X.csv @@ -0,0 +1,17 @@ +ID,t_range,HR_mask,DRUG_A_RATE_value,DRUG_A_ROUTE_value:Bolus,DRUG_A_ROUTE_value:IV,DRUG_A_ROUTE_value:Oral,LAB_X_value,HR_delta_time,HR_value,HR_min,HR_max,HR_mean,LAB_X_value_str:<1 +1,"[0.0, 1.0)",1.0,48.0,0.0,0.0,0.0,5.0,0.0,71.0,70.0,71.0,70.5,0.0 +1,"[1.0, 2.0)",1.0,48.0,0.0,0.0,0.0,5.0,0.0,73.0,72.0,73.0,72.5,0.0 +1,"[2.0, 3.0)",1.0,48.0,0.0,0.0,1.0,5.0,0.0,74.0,74.0,74.0,74.0,1.0 +1,"[3.0, 4.0)",1.0,48.0,0.0,0.0,0.0,5.0,0.0,75.0,75.0,75.0,75.0,0.0 +2,"[0.0, 1.0)",1.0,48.0,0.0,0.0,0.0,5.0,0.0,61.0,60.0,61.0,60.333333333333336,0.0 +2,"[1.0, 2.0)",0.0,48.0,0.0,0.0,0.0,5.0,1.0,61.0,61.0,61.0,61.0,0.0 +2,"[2.0, 3.0)",1.0,48.0,0.0,0.0,0.0,5.0,0.0,78.0,73.0,78.0,75.5,0.0 +2,"[3.0, 4.0)",1.0,48.0,0.0,1.0,0.0,5.0,0.0,75.0,75.0,75.0,75.0,0.0 +3,"[0.0, 1.0)",0.0,48.0,0.0,0.0,0.0,5.0,0.0,75.53333333333333,75.0,75.53333333333333,75.25555555555556,0.0 +3,"[1.0, 2.0)",1.0,48.0,1.0,0.0,0.0,5.0,0.0,90.0,90.0,90.0,90.0,0.0 +3,"[2.0, 3.0)",0.0,48.0,0.0,0.0,0.0,5.0,1.0,90.0,90.0,90.0,90.0,0.0 +3,"[3.0, 4.0)",0.0,48.0,0.0,0.0,0.0,5.0,2.0,90.0,90.0,90.0,90.0,0.0 +4,"[0.0, 1.0)",1.0,48.0,0.0,0.0,0.0,5.0,0.0,80.0,80.0,80.0,80.0,0.0 +4,"[1.0, 2.0)",0.0,48.0,0.0,0.0,0.0,5.0,1.0,80.0,80.0,80.0,80.0,0.0 +4,"[2.0, 3.0)",1.0,48.0,0.0,0.0,0.0,5.0,0.0,62.0,62.0,62.0,62.0,0.0 +4,"[3.0, 4.0)",1.0,48.0,0.0,0.0,0.0,5.0,0.0,73.0,73.0,73.0,73.0,0.0
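Note on the discretization fixtures above: each JSON maps a feature name to its pre-computed bin edges (null for non-numeric features). Below is a minimal sketch, not FIDDLE's own code, of how edges such as the HR_value entry in X_all.discretization.json yield the indicator columns visible in the notebook display; the input values in the series are illustrative.

import pandas as pd

# Bin edges copied from tests/small_test/input/X_all.discretization.json.
hr_edges = [61.0, 69.2, 73.6, 76.2, 82.00000000000001, 90.0]
hr = pd.Series([61.0, 73.0, 74.0, 90.0], name='HR_value')  # illustrative values

# With discretize: yes and use_ordinal_encoding: no (config-2-bins.yaml),
# each value is assigned to one interval between consecutive edges,
# then one-hot encoded into HR_value_(a, b]-style columns.
binned = pd.cut(hr, hr_edges, include_lowest=True, duplicates='drop')
print(pd.get_dummies(binned, prefix=hr.name))

# With use_ordinal_encoding: yes (config-3.yaml), the encoding is cumulative:
# one "HR_value>=edge" column per edge except the last, matching the
# HR_max>=... / HR_mean>=... columns in the notebook output above.
ordinal = pd.DataFrame(
    {'{}>={}'.format(hr.name, e): (hr >= e).astype(int) for e in hr_edges[:-1]})
print(ordinal)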
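Since reference-1/ pins the expected DataFrames for a config-1.yaml run, a regression check can rebuild df_X exactly as in the notebook cell and compare it against the checked-in CSV. A minimal sketch, assuming FIDDLE has already been run on tests/small_test with config-1.yaml and that its outputs landed in a hypothetical output-1/ directory (the directory name and the comparison code are illustrative, not the repo's actual test harness):

import json
import pandas as pd
import sparse

# Rebuild the time-dependent feature table, as in the notebook cell above.
X = sparse.load_npz('output-1/X_all.npz')
X_names = json.load(open('output-1/X_all.feature_names.json', 'r'))
X_index = pd.read_csv('output-1/X.ID,t_range.csv').set_index(['ID', 't_range'])
df_X = pd.DataFrame(X.todense().reshape(-1, X.shape[-1]),
                    columns=X_names, index=X_index.index)

# Compare against the expected values checked into the repo.
df_ref = pd.read_csv('tests/small_test/reference-1/df_X.csv') \
           .set_index(['ID', 't_range'])
pd.testing.assert_frame_equal(
    df_X[df_ref.columns],  # align column order before comparing
    df_ref,
    check_dtype=False,     # sparse output is float; read_csv may infer ints
    check_exact=False,     # tolerate float noise in the imputed means
)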