diff --git a/FIDDLE/config.py b/FIDDLE/config.py
index 823ab736b6ec8bfcb88c5f4a9e91c5b5a3332b97..ce408a367b3e8d172b22b6f916e21f7feec0259c 100644
--- a/FIDDLE/config.py
+++ b/FIDDLE/config.py
@@ -7,6 +7,7 @@
 var_col = config['column_names']['var_name']
 val_col = config['column_names']['var_value']
 t_col = config['column_names']['t']
+use_ordinal_encoding = config.get('use_ordinal_encoding', False)
 value_type_override = config['value_types']
 
 parallel = True
diff --git a/FIDDLE/config.yaml b/FIDDLE/config.yaml
index 6fedd116e2ce2562887d83e5931a0d51cbaa92a5..d5c48a1b714e1484117b400fcba34d46dbc6bdbe 100644
--- a/FIDDLE/config.yaml
+++ b/FIDDLE/config.yaml
@@ -5,6 +5,8 @@ column_names:
     var_name: variable_name
     var_value: variable_value
 
+use_ordinal_encoding: no
+
 value_types:
     # enter the feature type that you would like to override in the following format:
     FIRST_WARDID: Categorical
diff --git a/FIDDLE/helpers.py b/FIDDLE/helpers.py
index c1b6fd3b196db52a62815b17ad26a5755f52a9a0..82aafb75413a9e2c1193e170a640fffdfc9e7df3 100644
--- a/FIDDLE/helpers.py
+++ b/FIDDLE/helpers.py
@@ -80,20 +80,26 @@ def select_dtype(df, dtype, dtypes=None):
         assert False
     return
 
-def smart_qcut_dummify(x, q):
-    z = smart_qcut(x, q)
-    return pd.get_dummies(z, prefix=z.name)
-
-def smart_qcut(x, q):
+def smart_qcut_dummify(x, q, use_ordinal_encoding=False):
     # ignore strings when performing qcut
-    x = x.copy()
-    x = x.apply(make_float)
-    m = x.apply(np.isreal)
-    if x.loc[m].dropna().nunique() > 1: # when more than one numeric values
-        x.loc[m] = pd.qcut(x.loc[m].to_numpy(), q=q, duplicates='drop')
-#         bins = np.percentile(x.loc[m].to_numpy(), [0, 20, 40, 60, 80, 100])
-#         x.loc[m] = pd.cut(x, bins)
-    return x
+    z = x.copy()
+    z = z.apply(make_float)
+    m = z.apply(np.isreal)
+    if z.loc[m].dropna().nunique() > 1: # when more than one numeric values
+        if use_ordinal_encoding:
+            bin_edges = np.nanpercentile(z.loc[m].astype(float).to_numpy(), [0, 20, 40, 60, 80, 100])
+            bin_edges = np.unique(bin_edges)
+            col_names = ['{}>{}'.format(z.name, bin_edge) for bin_edge in bin_edges[:-1]]
+            out = pd.DataFrame(0, z.index, col_names)
+            for i, bin_edge in enumerate(bin_edges[:-1]):
+                out.loc[m, col_names[i]] = (z.loc[m] > bin_edge).astype(int)
+            out = pd.concat([out, pd.get_dummies(z.where(~m, np.nan), prefix=z.name)], axis=1)
+        else:
+            z.loc[m] = pd.qcut(z.loc[m].to_numpy(), q=q, duplicates='drop')
+            out = pd.get_dummies(z, prefix=z.name)
+    else:
+        out = pd.get_dummies(x, prefix=x.name)
+    return out
 
 def smart_dummify_impute(x):
     x = x.copy()
diff --git a/FIDDLE/steps.py b/FIDDLE/steps.py
index 98930bbbc4ceca343e7765e909e78a64f71d0afe..c94f91737a332c291e7dd11ba239a564bb1e973e 100644
--- a/FIDDLE/steps.py
+++ b/FIDDLE/steps.py
@@ -107,6 +107,7 @@ def split_by_timestamp_type(df):
     print('# rows (time-dependent):', len(df_time_series))
     return df_time_invariant, df_time_series
 
+
 def process_time_invariant(df_data_time_invariant, args):
     data_path = args.data_path
     df_population = args.df_population
@@ -121,7 +122,7 @@ def process_time_invariant(df_data_time_invariant, args):
     print('Time elapsed: %f seconds' % (time.time() - start_time))
 
     ## Discretize
-    s_all, s_all_feature_names = map_time_invariant_features(df_time_invariant, args.binarize)
+    s_all, s_all_feature_names = map_time_invariant_features(df_time_invariant, args)
     sparse.save_npz(dir_path + 's_all.npz', s_all)
     with open(dir_path + 's_all.feature_names.json', 'w') as f:
         json.dump(list(s_all_feature_names), f, sort_keys=True)
@@ -205,16 +206,11 @@ def transform_time_invariant_table(df_in, df_population):
     print('number of missing entries :\t', '{} out of {} total'.format(df_value.isna().sum().sum(), df_value.size))
     return df_value
 
-def map_time_invariant_features(df, bin_numeric=True):
+def map_time_invariant_features(df, args):
     # Categorical -> binary features
     # Numeric -> binary/float-valued features
-    if bin_numeric:
-#         df_mixed = df.apply(smart_qcut, q=5)
-#         features_mixed = pd.get_dummies(df_mixed, columns=df_mixed.columns, prefix_sep=':')
-#         time_invariant_features = features_mixed
-#         assert time_invariant_features.astype(int).dtypes.nunique() == 1
-
-        out = [smart_qcut_dummify(df[col], q=5) for col in df.columns]
+    if args.binarize:
+        out = [smart_qcut_dummify(df[col], q=5, use_ordinal_encoding=use_ordinal_encoding) for col in df.columns]
     time_invariant_features = pd.concat(out, axis=1)
     feature_names_all = time_invariant_features.columns.values
     sdf = time_invariant_features.astype(pd.SparseDtype(int, fill_value=0))
@@ -451,10 +447,10 @@ def map_time_series_features(df_time_series, dtypes, args):
         print('    Converting variables to binary features')
         if parallel:
             out = Parallel(n_jobs=n_jobs, verbose=10)( # Need to share global variables
-                delayed(smart_qcut_dummify)(col_data, q=5) for col_data in ts_mixed_cols
+                delayed(smart_qcut_dummify)(col_data, q=5, use_ordinal_encoding=use_ordinal_encoding) for col_data in ts_mixed_cols
             )
         else:
-            out = [smart_qcut_dummify(col_data, q=5) for col_data in tqdm(ts_mixed_cols)]
+            out = [smart_qcut_dummify(col_data, q=5, use_ordinal_encoding=use_ordinal_encoding) for col_data in tqdm(ts_mixed_cols)]
     else:
         dtype = float
         df = ts_mixed.copy()