Commit 608320bd authored by Shengpu Tang (tangsp)

bug fixes

parent 83a6d599
@@ -17,101 +17,11 @@ def print_header(*content, char='='):
    print(*content)
    print(char * 80, flush=True)
######
# Post-filter: feature selection classes
######
class FrequencyThreshold_temporal(
    sklearn.base.BaseEstimator,
    sklearn.feature_selection.base.SelectorMixin,
):
    def __init__(self, threshold=0., L=None):
        assert L is not None
        self.threshold = threshold
        self.L = L

    def fit(self, X, y=None):
        # Reshape to a 3-dimensional array
        NL, D = X.shape
        X = X.reshape((int(NL / self.L), self.L, D))

        # Collapse the time dimension, generating an NxD matrix
        X_notalways0 = X.any(axis=1)
        X_notalways1 = (1 - X).any(axis=1)
        if hasattr(X, "toarray"):
            X_notalways0 = X_notalways0.toarray()
            X_notalways1 = X_notalways1.toarray()
        if hasattr(X, "todense"):
            X_notalways0 = X_notalways0.todense()
            X_notalways1 = X_notalways1.todense()

        self.freqs_notalways0 = np.mean(X_notalways0, axis=0)
        self.freqs_notalways1 = np.mean(X_notalways1, axis=0)
        return self

    def _get_support_mask(self):
        return np.logical_and(
            self.freqs_notalways0 > self.threshold,
            self.freqs_notalways1 > self.threshold,
        )
# Keep only the first feature in each group of pairwise perfectly correlated features
class CorrelationSelector(
    sklearn.base.BaseEstimator,
    sklearn.feature_selection.base.SelectorMixin,
):
    def __init__(self):
        super().__init__()

    def fit(self, X, y=None):
        if hasattr(X, "toarray"):  # sparse matrix
            X = X.toarray()
        if hasattr(X, "todense"):  # sparse matrix
            X = X.todense()

        # Calculate the correlation matrix, keeping only the lower triangle
        self.corr_matrix = np.corrcoef(X.T)
        np.fill_diagonal(self.corr_matrix, 0)
        self.corr_matrix *= np.tri(*self.corr_matrix.shape)

        # A coefficient close to +/-1 means perfectly correlated; compare each
        # feature to all previous features (smaller indices) and drop it if it
        # has a correlation of 1 with any of them
        corr = abs(self.corr_matrix)
        to_drop = np.isclose(corr, 1.0).sum(axis=1).astype(bool)
        self.to_keep = ~to_drop
        return self

    def _get_support_mask(self):
        return self.to_keep

    def get_feature_aliases(self, feature_names):
        feature_names = [str(n) for n in feature_names]
        corr_matrix = self.corr_matrix
        flags = np.isclose(abs(corr_matrix), 1.0)
        alias_map = defaultdict(list)
        for i in range(1, corr_matrix.shape[0]):
            for j in range(i):
                if flags[i, j]:
                    if np.isclose(corr_matrix[i, j], 1.0):
                        alias_map[feature_names[j]].append(feature_names[i])
                    elif np.isclose(corr_matrix[i, j], -1.0):
                        alias_map[feature_names[j]].append('~{' + feature_names[i] + '}')
                    else:
                        assert False
                    # Only save the alias for the first feature in the group
                    break
        return dict(alias_map)
######
# Transform
######
def get_unique_variables(df):
    return sorted(df[var_col].unique())
@@ -193,7 +103,9 @@ def is_numeric(v):
######
def _get_time_bins(T, dt):
    return np.arange(0, T+dt, dt)
    # Defines the boundaries of time bins [0, dt, 2*dt, ..., k*dt]
    # where k*dt <= T and (k+1)*dt > T
    return np.arange(0, dt*(T//dt+1), dt)

def _get_time_bins_index(T, dt):
    return pd.Index(pd.interval_range(start=0, end=T, freq=dt, closed='left'))
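For concreteness, here is what the two versions of `_get_time_bins` return for the example values used in the README below (T=24, dt=5). Since `L = int(np.floor(T/dt)) = 4`, the old version produced one bin boundary too many; a quick sanity check, not part of the commit:

```python
import numpy as np

T, dt = 24, 5
np.arange(0, T + dt, dt)              # old: [ 0  5 10 15 20 25] -- edge 25 > T
np.arange(0, dt * (T // dt + 1), dt)  # new: [ 0  5 10 15 20]
```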
@@ -305,3 +217,95 @@ def check_imputed_output(df_v):
        assert pd.isnull(x[:(last_null_idx+1)]).all()    # all values up to here are nan
        assert (~pd.isnull(x[(last_null_idx+1):])).all() # all values after here are not nan
    return
######
# Post-filter: feature selection classes
######
class FrequencyThreshold_temporal(
    sklearn.base.BaseEstimator,
    sklearn.feature_selection.base.SelectorMixin,
):
    def __init__(self, threshold=0., L=None):
        assert L is not None
        self.threshold = threshold
        self.L = L

    def fit(self, X, y=None):
        # Reshape to a 3-dimensional array
        NL, D = X.shape
        X = X.reshape((int(NL / self.L), self.L, D))

        # Collapse the time dimension, generating an NxD matrix
        X_notalways0 = X.any(axis=1)
        X_notalways1 = (1 - X).any(axis=1)
        if hasattr(X, "toarray"):
            X_notalways0 = X_notalways0.toarray()
            X_notalways1 = X_notalways1.toarray()
        if hasattr(X, "todense"):
            X_notalways0 = X_notalways0.todense()
            X_notalways1 = X_notalways1.todense()

        self.freqs_notalways0 = np.mean(X_notalways0, axis=0)
        self.freqs_notalways1 = np.mean(X_notalways1, axis=0)
        return self

    def _get_support_mask(self):
        return np.logical_and(
            self.freqs_notalways0 > self.threshold,
            self.freqs_notalways1 > self.threshold,
        )
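A minimal toy run of this selector on made-up dense data: it keeps a feature only if, in more than a `threshold` fraction of instances, the feature is not constantly 0 across time, and likewise not constantly 1.

```python
import numpy as np

# N=3 instances, L=2 time steps, D=3 binary features, flattened to (N*L, D)
X = np.array([
    [1, 0, 1],
    [1, 0, 0],
    [1, 1, 1],
    [1, 0, 0],
    [1, 1, 1],
    [1, 0, 0],
])
sel = FrequencyThreshold_temporal(threshold=0., L=2).fit(X)
# Feature 0 equals 1 at every time step of every instance, so
# freqs_notalways1[0] == 0 and the feature is dropped
print(sel.get_support())       # [False  True  True]
print(sel.transform(X).shape)  # (6, 2)
```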
# Keep only the first feature in each group of pairwise perfectly correlated features
class CorrelationSelector(
    sklearn.base.BaseEstimator,
    sklearn.feature_selection.base.SelectorMixin,
):
    def __init__(self):
        super().__init__()

    def fit(self, X, y=None):
        if hasattr(X, "toarray"):  # sparse matrix
            X = X.toarray()
        if hasattr(X, "todense"):  # sparse matrix
            X = X.todense()

        # Calculate the correlation matrix, keeping only the lower triangle
        self.corr_matrix = np.corrcoef(X.T)
        np.fill_diagonal(self.corr_matrix, 0)
        self.corr_matrix *= np.tri(*self.corr_matrix.shape)

        # A coefficient close to +/-1 means perfectly correlated; compare each
        # feature to all previous features (smaller indices) and drop it if it
        # has a correlation of 1 with any of them
        corr = abs(self.corr_matrix)
        to_drop = np.isclose(corr, 1.0).sum(axis=1).astype(bool)
        self.to_keep = ~to_drop
        return self

    def _get_support_mask(self):
        return self.to_keep

    def get_feature_aliases(self, feature_names):
        feature_names = [str(n) for n in feature_names]
        corr_matrix = self.corr_matrix
        flags = np.isclose(abs(corr_matrix), 1.0)
        alias_map = defaultdict(list)
        for i in range(1, corr_matrix.shape[0]):
            for j in range(i):
                if flags[i, j]:
                    if np.isclose(corr_matrix[i, j], 1.0):
                        alias_map[feature_names[j]].append(feature_names[i])
                    elif np.isclose(corr_matrix[i, j], -1.0):
                        alias_map[feature_names[j]].append('~{' + feature_names[i] + '}')
                    else:
                        assert False
                    # Only save the alias for the first feature in the group
                    break
        return dict(alias_map)
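A similar toy check for `CorrelationSelector`, again with made-up dense data: column 2 exactly copies column 1 and column 3 is its exact negation, so both are dropped and reported as aliases of column 1.

```python
import numpy as np

X = np.array([
    [1., 0., 0., 1.],
    [0., 1., 1., 0.],
    [1., 1., 1., 0.],
    [0., 0., 0., 1.],
])
sel = CorrelationSelector().fit(X)
print(sel.get_support())                              # [ True  True False False]
print(sel.get_feature_aliases(['a', 'b', 'c', 'd']))  # {'b': ['c', '~{d}']}
```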
@@ -7,14 +7,14 @@ import os
import argparse
parser = argparse.ArgumentParser(description='')
parser.add_argument('--data_path', type=str, required=True)
parser.add_argument('--population', type=str, required=True)
parser.add_argument('--T', type=float, required=True)
parser.add_argument('--dt', type=float, required=True)
parser.add_argument('--theta_1', type=float, default=0.001)
parser.add_argument('--theta_2', type=float, default=0.001)
parser.add_argument('--theta_freq', type=float, default=1.0)
parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean'])
parser.add_argument('--data_path', type=str, required=True)
parser.add_argument('--population', type=str, required=True)
parser.add_argument('--T', type=float, required=True)
parser.add_argument('--dt', type=float, required=True)
parser.add_argument('--theta_1', type=float, default=0.001)
parser.add_argument('--theta_2', type=float, default=0.001)
parser.add_argument('--theta_freq', type=float, default=1.0)
parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean'])
args = parser.parse_args()
data_path = args.data_path
@@ -29,13 +29,14 @@ theta_2 = args.theta_2
theta_freq = args.theta_freq
stats_functions = args.stats_functions
df_population = pd.read_csv(population).rename(columns={'ICUSTAY_ID': 'ID'}).set_index('ID')
df_population = pd.read_csv(population).set_index('ID')
N = len(df_population)
L = int(np.floor(T/dt))
args.df_population = df_population
args.N = N
args.L = L
args.parallel = parallel
if os.path.isfile(data_path + 'input_data.p'):
    input_fname = data_path + 'input_data.p'
@@ -47,7 +48,7 @@ elif os.path.isfile(data_path + 'input_data.csv'):
    input_fname = data_path + 'input_data.csv'
    df_data = pd.read_csv(input_fname)

## Import helpers after parsing arguments so that they share global variables
from .steps import *

print('Input data file:', input_fname)
@@ -280,12 +280,13 @@ def func_encode_single_time_series(i, g, variables, variables_num_freq, T, dt, s
        raise Exception(i)
    return i, df_out

def process_time_series_table(df_in, args, parallel=True):
def process_time_series_table(df_in, args):
    data_path = args.data_path
    theta_freq = args.theta_freq
    stats_functions = args.stats_functions
    N, L = args.N, args.L
    df_population = args.df_population
    parallel = args.parallel

    ## TODO: assert shape of df_in
@@ -312,7 +313,10 @@ def process_time_series_table(df_in, args, parallel=True):
        ))
    else:
        out = dict(func_encode_single_time_series(i, g, variables, variables_num_freq) for i, g in tqdm(grouped[:N]))
        out = dict(
            func_encode_single_time_series(i, g, variables, variables_num_freq, args.T, args.dt, args.stats_functions)
            for i, g in tqdm(grouped[:N])
        )

    # Handle IDs not in the table
    df_original = list(out.values())[0]
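The parallel branch of this dispatch is elided above (only its closing `))` is visible). Given that joblib is listed in the requirements, it plausibly follows the standard `Parallel`/`delayed` pattern; the following is a sketch under that assumption, not the repository's exact code:

```python
from joblib import Parallel, delayed

out = dict(Parallel(n_jobs=-1)(
    delayed(func_encode_single_time_series)(
        i, g, variables, variables_num_freq, args.T, args.dt, args.stats_functions,
    )
    for i, g in grouped[:N]
))
```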
# FIDDLE
Required packages:
- numpy
- pandas
- sparse
- sklearn
- tqdm
- joblib
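All of these are installable from PyPI; note that `sklearn` is provided by the `scikit-learn` package and `sparse` is the PyData sparse package (e.g. `pip install numpy pandas sparse scikit-learn tqdm joblib`).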
Example usage:
```bash
python -m FIDDLE.run \
--data_path='./test/small_test/' \
--population='./test/small_test/pop.csv' \
--T=24 --dt=5 \
--theta_1=0.001 --theta_2=0.001 --theta_freq=1 \
--stats_functions 'min' 'max' 'mean'
```
The generated features and associated metadata are located in `{data_path}/`:
- `s.npz`: a sparse array of shape (N, d)
- `X.npz`: a sparse tensor of shape (N, L, D)
- `s.feature_names.txt`: names of _d_ time-invariant features
- `X.feature_names.txt`: names of _D_ time-series features
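Here _N_ is the number of instances (rows of the population file), _L_ = floor(T/dt) is the number of time bins, _d_ is the number of time-invariant features, and _D_ is the number of time-dependent features.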
To load the generated features:
```python
X = sparse.load_npz('{data_path}/X.npz'.format(data_path=...)).todense()
s = sparse.load_npz('{data_path}/s.npz'.format(data_path=...)).todense()
```
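Assuming each `*.feature_names.txt` file stores one feature name per line (verify against your generated output), the names can be paired with the loaded arrays:

```python
with open('{data_path}/X.feature_names.txt'.format(data_path=...)) as f:
    X_feature_names = [line.rstrip('\n') for line in f]
assert X.shape[-1] == len(X_feature_names)  # D time-dependent features
```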