From f87a0b3c9a70f7158758487bab15fdc1b54f0c50 Mon Sep 17 00:00:00 2001 From: Shengpu Tang Date: Wed, 26 Feb 2020 17:54:36 -0500 Subject: [PATCH 1/2] add support for ordinal encoding --- FIDDLE/config.py | 1 + FIDDLE/config.yaml | 2 ++ FIDDLE/helpers.py | 32 +++++++++++++++++++------------- FIDDLE/steps.py | 18 +++++++----------- 4 files changed, 29 insertions(+), 24 deletions(-) diff --git a/FIDDLE/config.py b/FIDDLE/config.py index 823ab73..ce408a3 100644 --- a/FIDDLE/config.py +++ b/FIDDLE/config.py @@ -7,6 +7,7 @@ var_col = config['column_names']['var_name'] val_col = config['column_names']['var_value'] t_col = config['column_names']['t'] +use_ordinal_encoding = config['use_ordinal_encoding'] value_type_override = config['value_types'] parallel = True diff --git a/FIDDLE/config.yaml b/FIDDLE/config.yaml index 6fedd11..d5c48a1 100644 --- a/FIDDLE/config.yaml +++ b/FIDDLE/config.yaml @@ -5,6 +5,8 @@ column_names: var_name: variable_name var_value: variable_value +use_ordinal_encoding: no + value_types: # enter the feature type that you would like to override in the following format: FIRST_WARDID: Categorical diff --git a/FIDDLE/helpers.py b/FIDDLE/helpers.py index c1b6fd3..521f4c6 100644 --- a/FIDDLE/helpers.py +++ b/FIDDLE/helpers.py @@ -80,20 +80,26 @@ def select_dtype(df, dtype, dtypes=None): assert False return -def smart_qcut_dummify(x, q): - z = smart_qcut(x, q) - return pd.get_dummies(z, prefix=z.name) - -def smart_qcut(x, q): +def smart_qcut_dummify(x, q, use_ordinal_encoding=False): # ignore strings when performing qcut - x = x.copy() - x = x.apply(make_float) - m = x.apply(np.isreal) - if x.loc[m].dropna().nunique() > 1: # when more than one numeric values - x.loc[m] = pd.qcut(x.loc[m].to_numpy(), q=q, duplicates='drop') -# bins = np.percentile(x.loc[m].to_numpy(), [0, 20, 40, 60, 80, 100]) -# x.loc[m] = pd.cut(x, bins) - return x + z = x.copy() + z = z.apply(make_float) + m = z.apply(np.isreal) + if z.loc[m].dropna().nunique() > 1: # when more than one 
numeric values + if use_ordinal_encoding: + bin_edges = np.nanpercentile(z.loc[m].to_numpy(), [0, 20, 40, 60, 80, 100]) + bin_edges = np.unique(bin_edges) + col_names = ['{}>={}'.format(z.name, bin_edge) for bin_edge in bin_edges[:-1]] + out = pd.DataFrame(0, z.index, col_names) + for i, bin_edge in enumerate(bin_edges[:-1]): + out.loc[m, col_names[i]] = (z.loc[m] >= bin_edge).astype(int) + out = pd.concat([out, pd.get_dummies(z.where(~m, np.nan), prefix=z.name)], axis=1) + else: + z.loc[m] = pd.qcut(z.loc[m].to_numpy(), q=q, duplicates='drop') + out = pd.get_dummies(z, prefix=z.name) + else: + out = pd.get_dummies(x, prefix=x.name) + return out def smart_dummify_impute(x): x = x.copy() diff --git a/FIDDLE/steps.py b/FIDDLE/steps.py index 98930bb..c94f917 100644 --- a/FIDDLE/steps.py +++ b/FIDDLE/steps.py @@ -107,6 +107,7 @@ def split_by_timestamp_type(df): print('# rows (time-dependent):', len(df_time_series)) return df_time_invariant, df_time_series + def process_time_invariant(df_data_time_invariant, args): data_path = args.data_path df_population = args.df_population @@ -121,7 +122,7 @@ def process_time_invariant(df_data_time_invariant, args): print('Time elapsed: %f seconds' % (time.time() - start_time)) ## Discretize - s_all, s_all_feature_names = map_time_invariant_features(df_time_invariant, args.binarize) + s_all, s_all_feature_names = map_time_invariant_features(df_time_invariant, args) sparse.save_npz(dir_path + 's_all.npz', s_all) with open(dir_path + 's_all.feature_names.json', 'w') as f: json.dump(list(s_all_feature_names), f, sort_keys=True) @@ -205,16 +206,11 @@ def transform_time_invariant_table(df_in, df_population): print('number of missing entries :\t', '{} out of {} total'.format(df_value.isna().sum().sum(), df_value.size)) return df_value -def map_time_invariant_features(df, bin_numeric=True): +def map_time_invariant_features(df, args): # Categorical -> binary features # Numeric -> binary/float-valued features - if bin_numeric: -# df_mixed = 
df.apply(smart_qcut, q=5) -# features_mixed = pd.get_dummies(df_mixed, columns=df_mixed.columns, prefix_sep=':') -# time_invariant_features = features_mixed -# assert time_invariant_features.astype(int).dtypes.nunique() == 1 - - out = [smart_qcut_dummify(df[col], q=5) for col in df.columns] + if args.binarize: + out = [smart_qcut_dummify(df[col], q=5, use_ordinal_encoding=use_ordinal_encoding) for col in df.columns] time_invariant_features = pd.concat(out, axis=1) feature_names_all = time_invariant_features.columns.values sdf = time_invariant_features.astype(pd.SparseDtype(int, fill_value=0)) @@ -451,10 +447,10 @@ def map_time_series_features(df_time_series, dtypes, args): print(' Converting variables to binary features') if parallel: out = Parallel(n_jobs=n_jobs, verbose=10)( # Need to share global variables - delayed(smart_qcut_dummify)(col_data, q=5) for col_data in ts_mixed_cols + delayed(smart_qcut_dummify)(col_data, q=5, use_ordinal_encoding=use_ordinal_encoding) for col_data in ts_mixed_cols ) else: - out = [smart_qcut_dummify(col_data, q=5) for col_data in tqdm(ts_mixed_cols)] + out = [smart_qcut_dummify(col_data, q=5, use_ordinal_encoding=use_ordinal_encoding) for col_data in tqdm(ts_mixed_cols)] else: dtype = float df = ts_mixed.copy() -- GitLab From ea76413cfec87e0c1e9033e5ada28a9b701d3464 Mon Sep 17 00:00:00 2001 From: Shengpu Tang Date: Fri, 6 Mar 2020 10:36:29 -0500 Subject: [PATCH 2/2] attempt to fix bug --- FIDDLE/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FIDDLE/helpers.py b/FIDDLE/helpers.py index 521f4c6..82aafb7 100644 --- a/FIDDLE/helpers.py +++ b/FIDDLE/helpers.py @@ -87,7 +87,7 @@ def smart_qcut_dummify(x, q, use_ordinal_encoding=False): m = z.apply(np.isreal) if z.loc[m].dropna().nunique() > 1: # when more than one numeric values if use_ordinal_encoding: - bin_edges = np.nanpercentile(z.loc[m].to_numpy(), [0, 20, 40, 60, 80, 100]) + bin_edges = np.nanpercentile(z.loc[m].astype(float).to_numpy(), [0, 20, 40, 
60, 80, 100]) bin_edges = np.unique(bin_edges) col_names = ['{}>={}'.format(z.name, bin_edge) for bin_edge in bin_edges[:-1]] out = pd.DataFrame(0, z.index, col_names) -- GitLab