Commit f3a7ee10 authored by Shengpu Tang (tangsp)
Browse files

Ordinal encoding

parent b65693ab
......@@ -7,6 +7,7 @@ var_col = config['column_names']['var_name']
# Column-name settings looked up in the `column_names` section of `config`.
val_col = config['column_names']['var_value']
t_col = config['column_names']['t']
# Value of the `use_ordinal_encoding` config entry.
use_ordinal_encoding = config['use_ordinal_encoding']
# Value of the `value_types` config entry; presumably per-feature type
# overrides, going by the name -- TODO confirm against where it is consumed.
value_type_override = config['value_types']
# Hard-coded switch; presumably toggles parallel execution -- verify against its users.
parallel = True
......
......@@ -5,6 +5,8 @@ column_names:
var_name: variable_name
var_value: variable_value
# NOTE(review): a bare `no` is a boolean false under YAML 1.1 loaders but a plain
# string under strict YAML 1.2 loaders -- confirm which loader reads this file.
use_ordinal_encoding: no
value_types:
# enter the feature type that you would like to override in the following format:
# <variable name>: <feature type>
FIRST_WARDID: Categorical
......
......@@ -80,20 +80,26 @@ def select_dtype(df, dtype, dtypes=None):
assert False
return
def smart_qcut_dummify(x, q, use_ordinal_encoding=False):
    """Turn one mixed-type column into a frame of indicator (dummy) features.

    Entries that ``np.isreal`` reports as real after ``make_float`` are treated
    as numeric and discretized by quantiles; every other entry (e.g. strings)
    is one-hot encoded as-is.

    Parameters
    ----------
    x : pandas.Series
        Column to encode. It is not modified; a copy is processed.
    q : int or sequence of float
        Quantile specification, with the same meaning as the ``q`` argument of
        ``pd.qcut``: a number of equal-frequency bins, or explicit quantiles
        in ``[0, 1]``.
    use_ordinal_encoding : bool, default False
        If False, numeric entries are replaced by their quantile interval and
        the whole column is one-hot encoded. If True, each numeric entry gets
        cumulative ``"<name>>=<edge>"`` indicators, one per unique lower bin
        edge, and the non-numeric entries are one-hot encoded alongside them.

    Returns
    -------
    pandas.DataFrame
        Indicator columns, indexed like ``x``.
    """
    # ignore strings when performing qcut
    z = x.copy()
    z = z.apply(make_float)  # make_float is defined elsewhere in this file
    m = z.apply(np.isreal)   # mask of entries treated as numeric (NaN included)
    if z.loc[m].dropna().nunique() > 1:  # when more than one numeric values
        if use_ordinal_encoding:
            # Derive the percentile grid from `q` rather than hard-coding
            # quintiles; q=5 yields exactly [0, 20, 40, 60, 80, 100].
            if np.isscalar(q):
                percentiles = np.linspace(0, 100, int(q) + 1)
            else:
                percentiles = np.asarray(q, dtype=float) * 100
            numeric = z.loc[m].astype(float)
            bin_edges = np.nanpercentile(numeric.to_numpy(), percentiles)
            # Repeated edges (heavily tied data) would give duplicate columns.
            bin_edges = np.unique(bin_edges)
            lower_edges = bin_edges[:-1]
            col_names = ['{}>={}'.format(z.name, bin_edge) for bin_edge in lower_edges]
            out = pd.DataFrame(0, z.index, col_names)
            for col_name, bin_edge in zip(col_names, lower_edges):
                # Inclusive comparison so the values match the ">=" column
                # label; NaN compares False and therefore stays 0.
                out.loc[m, col_name] = (numeric >= bin_edge).astype(int)
            # Blank out the numeric entries so only the remaining
            # (non-numeric) values receive one-hot columns.
            out = pd.concat([out, pd.get_dummies(z.where(~m, np.nan), prefix=z.name)], axis=1)
        else:
            z.loc[m] = pd.qcut(z.loc[m].to_numpy(), q=q, duplicates='drop')
            out = pd.get_dummies(z, prefix=z.name)
    else:
        # At most one distinct numeric value: nothing to bin.
        # NOTE(review): this branch encodes the raw input `x`, not the
        # float-converted `z` -- confirm that is intended.
        out = pd.get_dummies(x, prefix=x.name)
    return out
def smart_dummify_impute(x):
x = x.copy()
......
......@@ -107,6 +107,7 @@ def split_by_timestamp_type(df):
print('# rows (time-dependent):', len(df_time_series))
return df_time_invariant, df_time_series
def process_time_invariant(df_data_time_invariant, args):
data_path = args.data_path
df_population = args.df_population
......@@ -121,7 +122,7 @@ def process_time_invariant(df_data_time_invariant, args):
print('Time elapsed: %f seconds' % (time.time() - start_time))
## Discretize
s_all, s_all_feature_names = map_time_invariant_features(df_time_invariant, args.binarize)
s_all, s_all_feature_names = map_time_invariant_features(df_time_invariant, args)
sparse.save_npz(dir_path + 's_all.npz', s_all)
with open(dir_path + 's_all.feature_names.json', 'w') as f:
json.dump(list(s_all_feature_names), f, sort_keys=True)
......@@ -205,16 +206,11 @@ def transform_time_invariant_table(df_in, df_population):
print('number of missing entries :\t', '{} out of {} total'.format(df_value.isna().sum().sum(), df_value.size))
return df_value
def map_time_invariant_features(df, bin_numeric=True):
def map_time_invariant_features(df, args):
# Categorical -> binary features
# Numeric -> binary/float-valued features
if bin_numeric:
# df_mixed = df.apply(smart_qcut, q=5)
# features_mixed = pd.get_dummies(df_mixed, columns=df_mixed.columns, prefix_sep=':')
# time_invariant_features = features_mixed
# assert time_invariant_features.astype(int).dtypes.nunique() == 1
out = [smart_qcut_dummify(df[col], q=5) for col in df.columns]
if args.binarize:
out = [smart_qcut_dummify(df[col], q=5, use_ordinal_encoding=use_ordinal_encoding) for col in df.columns]
time_invariant_features = pd.concat(out, axis=1)
feature_names_all = time_invariant_features.columns.values
sdf = time_invariant_features.astype(pd.SparseDtype(int, fill_value=0))
......@@ -451,10 +447,10 @@ def map_time_series_features(df_time_series, dtypes, args):
print(' Converting variables to binary features')
if parallel:
out = Parallel(n_jobs=n_jobs, verbose=10)( # Need to share global variables
delayed(smart_qcut_dummify)(col_data, q=5) for col_data in ts_mixed_cols
delayed(smart_qcut_dummify)(col_data, q=5, use_ordinal_encoding=use_ordinal_encoding) for col_data in ts_mixed_cols
)
else:
out = [smart_qcut_dummify(col_data, q=5) for col_data in tqdm(ts_mixed_cols)]
out = [smart_qcut_dummify(col_data, q=5, use_ordinal_encoding=use_ordinal_encoding) for col_data in tqdm(ts_mixed_cols)]
else:
dtype = float
df = ts_mixed.copy()
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment