Commit 93ef213f authored by Shengpu Tang (tangsp)'s avatar Shengpu Tang (tangsp)
Browse files

added an option for disabling binarization of float variables

parent d9f08806
...@@ -77,12 +77,23 @@ def smart_qcut(x, q): ...@@ -77,12 +77,23 @@ def smart_qcut(x, q):
x = x.copy() x = x.copy()
x = x.apply(make_float) x = x.apply(make_float)
m = x.apply(np.isreal) m = x.apply(np.isreal)
if x.loc[m].dropna().nunique() > 1: if x.loc[m].dropna().nunique() > 1: # when more than one numeric values
x.loc[m] = pd.qcut(x.loc[m].to_numpy(), q=q, duplicates='drop') x.loc[m] = pd.qcut(x.loc[m].to_numpy(), q=q, duplicates='drop')
# bins = np.percentile(x.loc[m].to_numpy(), [0, 20, 40, 60, 80, 100]) # bins = np.percentile(x.loc[m].to_numpy(), [0, 20, 40, 60, 80, 100])
# x.loc[m] = pd.cut(x, bins) # x.loc[m] = pd.cut(x, bins)
return x return x
def smart_dummify_impute(x):
    """Turn a Series into model-ready features without binning.

    All-string columns are one-hot encoded; columns with any numeric
    values are kept as floats with missing entries mean-imputed.
    Returns a DataFrame in both cases.
    """
    x = x.copy()
    x = x.apply(make_float)
    numeric_mask = x.apply(np.isreal)  # NaN counts as numeric here
    if x.loc[numeric_mask].dropna().nunique() == 0:
        # No numeric values at all: treat as categorical and one-hot encode.
        return pd.get_dummies(x, prefix=x.name, prefix_sep=':')
    # Numeric column: keep the values, simple mean imputation for missing.
    frame = pd.DataFrame(x)
    return frame.fillna(frame.mean())
def make_float(v): def make_float(v):
try: try:
return float(v) return float(v)
......
...@@ -5,6 +5,16 @@ import numpy as np ...@@ -5,6 +5,16 @@ import numpy as np
import time import time
import os import os
def str2bool(v):
    """Parse a command-line boolean flag value.

    Real bools pass through unchanged; common textual spellings
    (case-insensitive) map to True/False. Anything else raises
    argparse.ArgumentTypeError so argparse emits a proper usage error.
    """
    if isinstance(v, bool):
        return v
    text = v.lower()
    if text in {'yes', 'true', 't', 'y', '1'}:
        return True
    if text in {'no', 'false', 'f', 'n', '0'}:
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
import argparse import argparse
parser = argparse.ArgumentParser(description='') parser = argparse.ArgumentParser(description='')
parser.add_argument('--data_path', type=str, required=True) parser.add_argument('--data_path', type=str, required=True)
...@@ -15,6 +25,7 @@ parser.add_argument('--theta_1', type=float, default=0.001) ...@@ -15,6 +25,7 @@ parser.add_argument('--theta_1', type=float, default=0.001)
parser.add_argument('--theta_2', type=float, default=0.001) parser.add_argument('--theta_2', type=float, default=0.001)
parser.add_argument('--theta_freq', type=float, default=1.0) parser.add_argument('--theta_freq', type=float, default=1.0)
parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean']) parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean'])
parser.add_argument('--binarize', type=str2bool, default=True, nargs='?', const=True)
args = parser.parse_args() args = parser.parse_args()
data_path = args.data_path data_path = args.data_path
...@@ -28,6 +39,7 @@ theta_1 = args.theta_1 ...@@ -28,6 +39,7 @@ theta_1 = args.theta_1
theta_2 = args.theta_2 theta_2 = args.theta_2
theta_freq = args.theta_freq theta_freq = args.theta_freq
stats_functions = args.stats_functions stats_functions = args.stats_functions
binarize = args.binarize
df_population = pd.read_csv(population).set_index('ID') df_population = pd.read_csv(population).set_index('ID')
N = len(df_population) N = len(df_population)
...@@ -60,6 +72,7 @@ print(' {:<6} = {}'.format('\u03B8\u2081', theta_1)) ...@@ -60,6 +72,7 @@ print(' {:<6} = {}'.format('\u03B8\u2081', theta_1))
print(' {:<6} = {}'.format('\u03B8\u2082', theta_2)) print(' {:<6} = {}'.format('\u03B8\u2082', theta_2))
print(' {:<6} = {}'.format('\u03B8_freq', theta_freq)) print(' {:<6} = {}'.format('\u03B8_freq', theta_freq))
print(' {:<6} = {} {}'.format('k', len(stats_functions), stats_functions)) print(' {:<6} = {} {}'.format('k', len(stats_functions), stats_functions))
print('{} = {}'.format('binarize', {False: 'no', True: 'yes'}[binarize]))
print() print()
print('N = {}'.format(N)) print('N = {}'.format(N))
print('L = {}'.format(L)) print('L = {}'.format(L))
......
...@@ -120,7 +120,7 @@ def transform_time_invariant(df_data_time_invariant, args): ...@@ -120,7 +120,7 @@ def transform_time_invariant(df_data_time_invariant, args):
print('Time elapsed: %f seconds' % (time.time() - start_time)) print('Time elapsed: %f seconds' % (time.time() - start_time))
## Discretize ## Discretize
s_all, s_all_feature_names = map_time_invariant_features(df_time_invariant) s_all, s_all_feature_names = map_time_invariant_features(df_time_invariant, args.binarize)
sparse.save_npz(dir_path + 's_all.npz', s_all) sparse.save_npz(dir_path + 's_all.npz', s_all)
np.savetxt(dir_path + 's_all.feature_names.txt', s_all_feature_names, '"%s"') np.savetxt(dir_path + 's_all.feature_names.txt', s_all_feature_names, '"%s"')
print('Time elapsed: %f seconds' % (time.time() - start_time)) print('Time elapsed: %f seconds' % (time.time() - start_time))
...@@ -203,16 +203,32 @@ def map_time_invariant_features(df, bin_numeric=True): ...@@ -203,16 +203,32 @@ def map_time_invariant_features(df, bin_numeric=True):
# Categorical -> binary features # Categorical -> binary features
# Numeric -> binary/float-valued features # Numeric -> binary/float-valued features
if bin_numeric: if bin_numeric:
df_mixed = df.apply(smart_qcut, q=5) # df_mixed = df.apply(smart_qcut, q=5)
features_mixed = pd.get_dummies(df_mixed, columns=df_mixed.columns, prefix_sep=':') # features_mixed = pd.get_dummies(df_mixed, columns=df_mixed.columns, prefix_sep=':')
# time_invariant_features = features_mixed
# assert time_invariant_features.astype(int).dtypes.nunique() == 1
out = [smart_qcut_dummify(df[col], q=5) for col in df.columns]
time_invariant_features = pd.concat(out, axis=1)
feature_names_all = time_invariant_features.columns.values
sdf = time_invariant_features.astype(pd.SparseDtype(int, fill_value=0))
s_ = sparse.COO(sdf.sparse.to_coo())
else: else:
raise NotImplementedError # Split a mixed column into numeric and string columns
for col in df.columns:
time_invariant_features = features_mixed col_data = df[col]
assert time_invariant_features.astype(int).dtypes.nunique() == 1 col_is_numeric = [is_numeric(v) for v in col_data if not pd.isnull(v)]
sdf = time_invariant_features.astype(int).to_sparse(fill_value=0) if not all(col_is_numeric) and any(col_is_numeric): # have mixed type values
feature_names_all = time_invariant_features.columns.values numeric_mask = col_data.apply(is_numeric)
s_ = sparse.COO(sdf.to_coo()) df[col+'_str'] = df[col].copy()
df.loc[~numeric_mask, col] = np.nan
df.loc[numeric_mask, col+'_str'] = np.nan
out = [smart_dummify_impute(df[col]) for col in df.columns]
time_invariant_features = pd.concat(out, axis=1)
feature_names_all = time_invariant_features.columns.values
sdf = time_invariant_features.astype(pd.SparseDtype(float, fill_value=0))
s_ = sparse.COO(sdf.sparse.to_coo())
print() print()
print('Output') print('Output')
...@@ -373,7 +389,7 @@ def process_time_series_table(df_in, args): ...@@ -373,7 +389,7 @@ def process_time_series_table(df_in, args):
df_time_series = pd.DataFrame(data=time_series, index=index, columns=columns) df_time_series = pd.DataFrame(data=time_series, index=index, columns=columns)
# Print metadata # Print metadata
## Freq: Count misisng entries using mask ## Freq: Count missing entries using mask
ts_mask = df_time_series[[col for col in df_time_series if col.endswith('_mask')]] ts_mask = df_time_series[[col for col in df_time_series if col.endswith('_mask')]]
ts_mask.columns = [col.replace('_mask', '') for col in ts_mask.columns] ts_mask.columns = [col.replace('_mask', '') for col in ts_mask.columns]
print('(freq) number of missing entries :\t', print('(freq) number of missing entries :\t',
...@@ -410,50 +426,57 @@ def process_time_series_table(df_in, args): ...@@ -410,50 +426,57 @@ def process_time_series_table(df_in, args):
def map_time_series_features(df_time_series, dtypes, args): def map_time_series_features(df_time_series, dtypes, args):
N, L = args.N, args.L N, L = args.N, args.L
print()
print('Discretizing features...')
df_time_series = df_time_series.dropna(axis='columns', how='all').sort_index() df_time_series = df_time_series.dropna(axis='columns', how='all').sort_index()
# time_series = df_time_series[df_time_series.index.get_level_values(0).isin(population.index)]
print('Discretizing features...')
ts_mask = select_dtype(df_time_series, 'mask', dtypes) ts_mask = select_dtype(df_time_series, 'mask', dtypes)
ts_mixed = select_dtype(df_time_series, '~mask', dtypes) ts_mixed = select_dtype(df_time_series, '~mask', dtypes)
assert len(ts_mixed.columns) + len(ts_mask.columns) == len(df_time_series.columns) assert len(ts_mixed.columns) + len(ts_mask.columns) == len(df_time_series.columns)
ts_feature_mask = ts_mask.astype(int) ts_feature_mask = ts_mask.astype(int)
ts_mixed_cols = [ts_mixed[col] for col in ts_mixed.columns] ts_mixed_cols = [ts_mixed[col] for col in ts_mixed.columns]
print('Processing', len(ts_mixed_cols), 'non-boolean variable columns...')
print()
print(' Binning numeric variables by quintile...') if args.binarize:
print(' Converting variables to binary features') dtype = int
if parallel: print('Processing', len(ts_mixed_cols), 'non-boolean variable columns...')
out = Parallel(n_jobs=n_jobs, verbose=10)( # Need to share global variables
delayed(smart_qcut_dummify)(col_data, q=5) for col_data in ts_mixed_cols print(' Binning numeric variables by quintile...')
) print(' Converting variables to binary features')
else: if parallel:
out = [smart_qcut_dummify(col_data, q=5) for col_data in tqdm(ts_mixed_cols)] out = Parallel(n_jobs=n_jobs, verbose=10)( # Need to share global variables
delayed(smart_qcut_dummify)(col_data, q=5) for col_data in ts_mixed_cols
if False: )
# ts_mixed_cut = ts_mixed.progress_apply(smart_qcut, q=5) else:
# ts_feature_mixed = pd.get_dummies(ts_mixed_cut, prefix_sep='_', columns=ts_mixed_cut.columns) out = [smart_qcut_dummify(col_data, q=5) for col_data in tqdm(ts_mixed_cols)]
ts_feature_mixed = pd.concat(out, axis=1)
time_series_features = ts_feature_mask.join([ts_feature_mixed]).astype(int)
assert time_series_features.astype(int).dtypes.nunique() == 1
Xdf = time_series_features.to_sparse(fill_value=0)
X_all_feature_names = time_series_features.columns.values
X_all = sparse.COO(Xdf.to_coo())
_, D_all = X_all.shape
else: else:
out = [ts_feature_mask, *out] dtype = float
D_all = sum(len(df_i.columns) for df_i in out) df = ts_mixed.copy()
X_all_feature_names = np.asarray(sum([list(df_i.columns) for df_i in out], []))
X_dense = np.concatenate([df_i.values for df_i in out], axis=1).astype(int) # Split a mixed column into numeric and string columns
X_all = sparse.COO(X_dense) for col in df.columns:
col_data = df[col]
col_is_numeric = [is_numeric(v) for v in col_data if not pd.isnull(v)]
if not all(col_is_numeric) and any(col_is_numeric): # have mixed type values
numeric_mask = col_data.apply(is_numeric)
df[col+'_str'] = df[col].copy()
df.loc[~numeric_mask, col] = np.nan
df.loc[numeric_mask, col+'_str'] = np.nan
ts_mixed_cols = [df[col] for col in df.columns]
print('Discretizing categorical features...')
if parallel:
out = Parallel(n_jobs=n_jobs, verbose=10)( # Need to share global variables?
delayed(smart_dummify_impute)(col_data) for col_data in ts_mixed_cols
)
else:
out = [smart_dummify_impute(col_data) for col_data in tqdm(ts_mixed_cols)]
out = [ts_feature_mask, *out]
D_all = sum(len(df_i.columns) for df_i in out)
X_all_feature_names = np.asarray(sum([list(df_i.columns) for df_i in out], []))
X_dense = np.concatenate([df_i.values for df_i in out], axis=1).astype(dtype)
X_all = sparse.COO(X_dense)
print('Finished discretizing features') print('Finished discretizing features')
assert X_all.shape[0] == N * L assert X_all.shape[0] == N * L
...@@ -468,7 +491,7 @@ def post_filter_time_series(X_all, feature_names_all, threshold, args): ...@@ -468,7 +491,7 @@ def post_filter_time_series(X_all, feature_names_all, threshold, args):
N, L = args.N, args.L N, L = args.N, args.L
assert X_all.shape[0] == N assert X_all.shape[0] == N
assert X_all.shape[1] == L assert X_all.shape[1] == L
assert X_all.dtype == int # assert X_all.dtype == int
start_time = time.time() start_time = time.time()
X0 = X_all X0 = X_all
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment