Commit 93ef213f authored by Shengpu Tang (tangsp)

added an option for disabling binarization of float variables

parent d9f08806
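Usage sketch (the script name and paths below are hypothetical; only the new --binarize flag is introduced by this commit). Passing a no/false value keeps numeric variables as float-valued features instead of quintile indicator columns:

    python make_features.py --data_path ./data/ --binarize no

Omitting the flag, or passing --binarize without a value, keeps the previous behavior (binarize = True).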
@@ -77,12 +77,23 @@ def smart_qcut(x, q):
x = x.copy()
x = x.apply(make_float)
m = x.apply(np.isreal)
if x.loc[m].dropna().nunique() > 1:
if x.loc[m].dropna().nunique() > 1: # when there is more than one numeric value
x.loc[m] = pd.qcut(x.loc[m].to_numpy(), q=q, duplicates='drop')
# bins = np.percentile(x.loc[m].to_numpy(), [0, 20, 40, 60, 80, 100])
# x.loc[m] = pd.cut(x, bins)
return x
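# smart_dummify_impute (below) is the counterpart of smart_qcut_dummify used when binarization
# is disabled: all-string columns are one-hot encoded, while numeric columns are kept as floats
# with simple mean imputation for missing values.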
def smart_dummify_impute(x):
x = x.copy()
x = x.apply(make_float)
m = x.apply(np.isreal)
if x.loc[m].dropna().nunique() == 0: # all string values
return pd.get_dummies(x, prefix=x.name, prefix_sep=':')
else:
x = pd.DataFrame(x)
x = x.fillna(x.mean()) # simple mean imputation
return x
def make_float(v):
try:
return float(v)
......
@@ -5,6 +5,16 @@ import numpy as np
import time
import os
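# Helper for parsing boolean command-line flags: argparse's type=bool would treat any
# non-empty string (including 'False') as True, so accept explicit yes/no-style strings instead.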
def str2bool(v):
if isinstance(v, bool):
return v
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
import argparse
parser = argparse.ArgumentParser(description='')
parser.add_argument('--data_path', type=str, required=True)
@@ -15,6 +25,7 @@ parser.add_argument('--theta_1', type=float, default=0.001)
parser.add_argument('--theta_2', type=float, default=0.001)
parser.add_argument('--theta_freq', type=float, default=1.0)
parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean'])
parser.add_argument('--binarize', type=str2bool, default=True, nargs='?', const=True)
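# New option: --binarize accepts yes/no-style values via str2bool; nargs='?' with const=True
# means a bare --binarize (no value) also enables binarization, and the default (flag absent) is True.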
args = parser.parse_args()
data_path = args.data_path
@@ -28,6 +39,7 @@ theta_1 = args.theta_1
theta_2 = args.theta_2
theta_freq = args.theta_freq
stats_functions = args.stats_functions
binarize = args.binarize
df_population = pd.read_csv(population).set_index('ID')
N = len(df_population)
@@ -60,6 +72,7 @@ print(' {:<6} = {}'.format('\u03B8\u2081', theta_1))
print(' {:<6} = {}'.format('\u03B8\u2082', theta_2))
print(' {:<6} = {}'.format('\u03B8_freq', theta_freq))
print(' {:<6} = {} {}'.format('k', len(stats_functions), stats_functions))
print('{} = {}'.format('binarize', {False: 'no', True: 'yes'}[binarize]))
print()
print('N = {}'.format(N))
print('L = {}'.format(L))
......
@@ -120,7 +120,7 @@ def transform_time_invariant(df_data_time_invariant, args):
print('Time elapsed: %f seconds' % (time.time() - start_time))
## Discretize
s_all, s_all_feature_names = map_time_invariant_features(df_time_invariant)
s_all, s_all_feature_names = map_time_invariant_features(df_time_invariant, args.binarize)
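# The new binarize option is forwarded to the feature mapping step (the bin_numeric parameter below).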
sparse.save_npz(dir_path + 's_all.npz', s_all)
np.savetxt(dir_path + 's_all.feature_names.txt', s_all_feature_names, '"%s"')
print('Time elapsed: %f seconds' % (time.time() - start_time))
@@ -203,16 +203,32 @@ def map_time_invariant_features(df, bin_numeric=True):
# Categorical -> binary features
# Numeric -> binary/float-valued features
if bin_numeric:
df_mixed = df.apply(smart_qcut, q=5)
features_mixed = pd.get_dummies(df_mixed, columns=df_mixed.columns, prefix_sep=':')
# df_mixed = df.apply(smart_qcut, q=5)
# features_mixed = pd.get_dummies(df_mixed, columns=df_mixed.columns, prefix_sep=':')
# time_invariant_features = features_mixed
# assert time_invariant_features.astype(int).dtypes.nunique() == 1
out = [smart_qcut_dummify(df[col], q=5) for col in df.columns]
time_invariant_features = pd.concat(out, axis=1)
feature_names_all = time_invariant_features.columns.values
sdf = time_invariant_features.astype(pd.SparseDtype(int, fill_value=0))
s_ = sparse.COO(sdf.sparse.to_coo())
else:
raise NotImplementedError
time_invariant_features = features_mixed
assert time_invariant_features.astype(int).dtypes.nunique() == 1
sdf = time_invariant_features.astype(int).to_sparse(fill_value=0)
feature_names_all = time_invariant_features.columns.values
s_ = sparse.COO(sdf.to_coo())
# Split a mixed column into numeric and string columns
for col in df.columns:
col_data = df[col]
col_is_numeric = [is_numeric(v) for v in col_data if not pd.isnull(v)]
if not all(col_is_numeric) and any(col_is_numeric): # column has mixed-type values
numeric_mask = col_data.apply(is_numeric)
df[col+'_str'] = df[col].copy()
df.loc[~numeric_mask, col] = np.nan
df.loc[numeric_mask, col+'_str'] = np.nan
out = [smart_dummify_impute(df[col]) for col in df.columns]
time_invariant_features = pd.concat(out, axis=1)
feature_names_all = time_invariant_features.columns.values
sdf = time_invariant_features.astype(pd.SparseDtype(float, fill_value=0))
s_ = sparse.COO(sdf.sparse.to_coo())
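# Illustrative example (hypothetical values): a mixed column [7.2, 'ERR', 8.1, NaN] is split into
# a numeric part [7.2, NaN, 8.1, NaN], mean-imputed to [7.2, 7.65, 8.1, 7.65], plus a string part
# that becomes a 0/1 dummy column (e.g. '<name>_str:ERR'), instead of the quintile indicator
# columns produced by the bin_numeric=True branch above.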
print()
print('Output')
@@ -373,7 +389,7 @@ def process_time_series_table(df_in, args):
df_time_series = pd.DataFrame(data=time_series, index=index, columns=columns)
# Print metadata
## Freq: Count misisng entries using mask
## Freq: Count missing entries using mask
ts_mask = df_time_series[[col for col in df_time_series if col.endswith('_mask')]]
ts_mask.columns = [col.replace('_mask', '') for col in ts_mask.columns]
print('(freq) number of missing entries :\t',
@@ -410,50 +426,57 @@ def process_time_series_table(df_in, args):
def map_time_series_features(df_time_series, dtypes, args):
N, L = args.N, args.L
print()
print('Discretizing features...')
df_time_series = df_time_series.dropna(axis='columns', how='all').sort_index()
# time_series = df_time_series[df_time_series.index.get_level_values(0).isin(population.index)]
print('Discretizing features...')
ts_mask = select_dtype(df_time_series, 'mask', dtypes)
ts_mixed = select_dtype(df_time_series, '~mask', dtypes)
assert len(ts_mixed.columns) + len(ts_mask.columns) == len(df_time_series.columns)
ts_feature_mask = ts_mask.astype(int)
ts_mixed_cols = [ts_mixed[col] for col in ts_mixed.columns]
print('Processing', len(ts_mixed_cols), 'non-boolean variable columns...')
print(' Binning numeric variables by quintile...')
print(' Converting variables to binary features')
if parallel:
out = Parallel(n_jobs=n_jobs, verbose=10)( # Need to share global variables
delayed(smart_qcut_dummify)(col_data, q=5) for col_data in ts_mixed_cols
)
else:
out = [smart_qcut_dummify(col_data, q=5) for col_data in tqdm(ts_mixed_cols)]
if False:
# ts_mixed_cut = ts_mixed.progress_apply(smart_qcut, q=5)
# ts_feature_mixed = pd.get_dummies(ts_mixed_cut, prefix_sep='_', columns=ts_mixed_cut.columns)
ts_feature_mixed = pd.concat(out, axis=1)
time_series_features = ts_feature_mask.join([ts_feature_mixed]).astype(int)
assert time_series_features.astype(int).dtypes.nunique() == 1
Xdf = time_series_features.to_sparse(fill_value=0)
X_all_feature_names = time_series_features.columns.values
X_all = sparse.COO(Xdf.to_coo())
_, D_all = X_all.shape
print()
if args.binarize:
dtype = int
print('Processing', len(ts_mixed_cols), 'non-boolean variable columns...')
print(' Binning numeric variables by quintile...')
print(' Converting variables to binary features')
if parallel:
out = Parallel(n_jobs=n_jobs, verbose=10)( # Need to share global variables
delayed(smart_qcut_dummify)(col_data, q=5) for col_data in ts_mixed_cols
)
else:
out = [smart_qcut_dummify(col_data, q=5) for col_data in tqdm(ts_mixed_cols)]
else:
out = [ts_feature_mask, *out]
D_all = sum(len(df_i.columns) for df_i in out)
X_all_feature_names = np.asarray(sum([list(df_i.columns) for df_i in out], []))
X_dense = np.concatenate([df_i.values for df_i in out], axis=1).astype(int)
X_all = sparse.COO(X_dense)
dtype = float
df = ts_mixed.copy()
# Split a mixed column into numeric and string columns
for col in df.columns:
col_data = df[col]
col_is_numeric = [is_numeric(v) for v in col_data if not pd.isnull(v)]
if not all(col_is_numeric) and any(col_is_numeric): # column has mixed-type values
numeric_mask = col_data.apply(is_numeric)
df[col+'_str'] = df[col].copy()
df.loc[~numeric_mask, col] = np.nan
df.loc[numeric_mask, col+'_str'] = np.nan
ts_mixed_cols = [df[col] for col in df.columns]
print('Discretizing categorical features...')
if parallel:
out = Parallel(n_jobs=n_jobs, verbose=10)( # Need to share global variables?
delayed(smart_dummify_impute)(col_data) for col_data in ts_mixed_cols
)
else:
out = [smart_dummify_impute(col_data) for col_data in tqdm(ts_mixed_cols)]
out = [ts_feature_mask, *out]
D_all = sum(len(df_i.columns) for df_i in out)
X_all_feature_names = np.asarray(sum([list(df_i.columns) for df_i in out], []))
X_dense = np.concatenate([df_i.values for df_i in out], axis=1).astype(dtype)
X_all = sparse.COO(X_dense)
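# With args.binarize, all features are 0/1 indicators and the dense matrix is cast to int;
# without it, numeric features remain float (mask columns are still 0/1), so it is cast to float
# before constructing the sparse.COO array.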
print('Finished discretizing features')
assert X_all.shape[0] == N * L
@@ -468,7 +491,7 @@ def post_filter_time_series(X_all, feature_names_all, threshold, args):
N, L = args.N, args.L
assert X_all.shape[0] == N
assert X_all.shape[1] == L
assert X_all.dtype == int
# assert X_all.dtype == int
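# Relaxed because X_all may now contain float values when binarization is disabled.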
start_time = time.time()
X0 = X_all
......