Commit e8eb066b authored by Shengpu Tang (tangsp)
Browse files

Optimize `post-filter` for large sparse matrices

parent 0aa11f54
...@@ -42,7 +42,7 @@ def calculate_variable_counts(df_data, df_population): ...@@ -42,7 +42,7 @@ def calculate_variable_counts(df_data, df_population):
""" """
df = df_data.copy() df = df_data.copy()
df['count'] = 1 df['count'] = 1
df_count = df.groupby([ID_col, var_col]).count()[['count']].unstack(1, fill_value=0) df_count = df[[ID_col, var_col, 'count']].groupby([ID_col, var_col]).count().unstack(1, fill_value=0)
df_count.columns = df_count.columns.droplevel() df_count.columns = df_count.columns.droplevel()
df_count = df_count.reindex(df_population.index, fill_value=0) df_count = df_count.reindex(df_population.index, fill_value=0)
## Slower version ## Slower version
...@@ -240,22 +240,21 @@ class FrequencyThreshold_temporal( ...@@ -240,22 +240,21 @@ class FrequencyThreshold_temporal(
# Collapse time dimension, generating NxD matrix # Collapse time dimension, generating NxD matrix
X_notalways0 = X.any(axis=1) X_notalways0 = X.any(axis=1)
X_notalways1 = (1-X).any(axis=1) X_notalways1 = (1-X).any(axis=1)
if hasattr(X, "toarray"):
X_notalways0 = X_notalways0.toarray()
X_notalways1 = X_notalways1.toarray()
if hasattr(X, "todense"):
X_notalways0 = X_notalways0.todense()
X_notalways1 = X_notalways1.todense()
self.freqs_notalways0 = np.mean(X_notalways0, axis=0) self.freqs_notalways0 = np.mean(X_notalways0, axis=0)
self.freqs_notalways1 = np.mean(X_notalways1, axis=0) self.freqs_notalways1 = np.mean(X_notalways1, axis=0)
return self return self
def _get_support_mask(self): def _get_support_mask(self):
return np.logical_and( mask = np.logical_and(
self.freqs_notalways0 > self.threshold, self.freqs_notalways0 > self.threshold,
self.freqs_notalways1 > self.threshold, self.freqs_notalways1 > self.threshold,
) )
if hasattr(mask, "toarray"):
mask = mask.toarray()
if hasattr(mask, "todense"):
mask = mask.todense()
return mask
# Keep only first feature in a pairwise perfectly correlated feature group # Keep only first feature in a pairwise perfectly correlated feature group
class CorrelationSelector( class CorrelationSelector(
...@@ -266,14 +265,15 @@ class CorrelationSelector( ...@@ -266,14 +265,15 @@ class CorrelationSelector(
super().__init__() super().__init__()
def fit(self, X, y=None): def fit(self, X, y=None):
if hasattr(X, "toarray"): # sparse matrix if hasattr(X, "to_scipy_sparse"): # sparse matrix
X = X.toarray() X = X.to_scipy_sparse()
if hasattr(X, "todense"): # sparse matrix
X = X.todense()
# Calculate correlation matrix # Calculate correlation matrix
# Keep only lower triangular matrix # Keep only lower triangular matrix
self.corr_matrix = np.corrcoef(X.T) if scipy.sparse.issparse(X):
self.corr_matrix = sparse_corrcoef(X.T)
else:
self.corr_matrix = np.corrcoef(X.T)
np.fill_diagonal(self.corr_matrix, 0) np.fill_diagonal(self.corr_matrix, 0)
self.corr_matrix *= np.tri(*self.corr_matrix.shape) self.corr_matrix *= np.tri(*self.corr_matrix.shape)
...@@ -309,3 +309,22 @@ class CorrelationSelector( ...@@ -309,3 +309,22 @@ class CorrelationSelector(
break break
return dict(alias_map) return dict(alias_map)
# https://stackoverflow.com/questions/19231268/correlation-coefficients-for-sparse-matrix-in-python
def sparse_corrcoef(A, B=None):
    """Return the Pearson correlation matrix of the rows of a sparse matrix.

    Each row of ``A`` is treated as one variable and each column as one
    observation, so the result has shape ``(n_rows, n_rows)``. Only the
    ``n_rows x n_rows`` covariance matrix is held densely; ``A`` itself is
    never converted to a dense array.

    Parameters
    ----------
    A : scipy.sparse matrix or array
        Input of shape ``(n_variables, n_observations)``.
    B : scipy.sparse matrix or array, optional
        Additional variables with the same number of columns as ``A``.
        When given, its rows are stacked below those of ``A`` before the
        correlations are computed.

    Returns
    -------
    numpy.ndarray
        Dense correlation matrix. Rows with zero variance produce NaN
        entries (division by zero), as the computation divides by the
        product of the standard deviations.
    """
    # Local import: the bare name `sparse` is not guaranteed to be bound at
    # module level (elsewhere the file refers to `scipy.sparse`).
    from scipy import sparse as sp_sparse

    if B is not None:
        A = sp_sparse.vstack((A, B), format='csr')

    A = A.astype(np.float64)
    n = A.shape[1]

    # Force a 2-D column vector. Sparse *matrices* return an np.matrix of
    # shape (rows, 1) from sum(axis=1), whereas sparse *arrays* return a 1-D
    # ndarray, for which `rowsum.dot(rowsum.T)` would collapse to a scalar.
    rowsum = np.asarray(A.sum(axis=1)).reshape(-1, 1)
    centering = rowsum.dot(rowsum.T.conjugate()) / n

    # Gram matrix of the rows; densify explicitly so the subtraction below is
    # plain ndarray arithmetic regardless of the sparse container type.
    gram = A.dot(A.T.conjugate())
    if sp_sparse.issparse(gram):
        gram = gram.toarray()

    # Covariance matrix: (sum_k a_ik a_jk - sum_i * sum_j / n) / (n - 1)
    C = (np.asarray(gram) - centering) / (n - 1)

    # The correlation coefficients are given by
    # C_{i,j} / sqrt(C_{i,i} * C_{j,j})
    d = np.diag(C)
    return C / np.sqrt(np.outer(d, d))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment