from .config import *
import pandas as pd
import numpy as np
import scipy
import sparse
from collections import defaultdict

from joblib import Parallel, delayed, parallel_backend
from tqdm import tqdm

from sklearn.feature_selection import VarianceThreshold
import sklearn

def print_header(*content, char='='):
    print()
    print(char * 80)
    print(*content)
    print(char * 80, flush=True)


######
# Transform
######

def get_unique_variables(df):
    return sorted(df[var_col].unique())

def get_frequent_numeric_variables(df_time_series, variables, threshold, args):
    data_path = args.data_path
    df_population = args.df_population
    T, dt = args.T, args.dt
    
    df_types = pd.read_csv(data_path + 'value_types.csv').set_index(var_col)['value_type']
    numeric_vars = [col for col in variables if df_types[col] == 'Numeric']
    df_num_counts = calculate_variable_counts(df_time_series, df_population)[numeric_vars]  # count of each variable for each patient
    variables_num_freq = df_num_counts.columns[df_num_counts.mean() >= threshold * np.floor(T/dt)]
    return variables_num_freq

def calculate_variable_counts(df_data, df_population):
    """
    df_data in raw format with four columns: ID_col, t_col, var_col, val_col
    """
    df = df_data.copy()
    df['count'] = 1
    df_count = df[[ID_col, var_col, 'count']].groupby([ID_col, var_col]).count().unstack(1, fill_value=0)
    df_count.columns = df_count.columns.droplevel()
    df_count = df_count.reindex(df_population.index, fill_value=0)
    ## Slower version
    # df_count = df[['ID', 'variable_name', 'count']].pivot_table(index='ID', columns='variable_name', aggfunc='count', fill_value=0)
    return df_count
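
# Illustrative example (hypothetical IDs and variable names, added for clarity):
#   df_data rows (ID, variable): (1, 'HR'), (1, 'HR'), (1, 'SBP'), (2, 'HR')
#   -> df_count, indexed by ID with one column per variable:
#        ID=1: HR=2, SBP=1
#        ID=2: HR=1, SBP=0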

def select_dtype(df, dtype, dtypes=None):
    if dtypes is None:
        ## Need to assert dtypes are not all objects
        assert not all(df.dtypes == 'object')
        if dtype == 'mask':
            return df.select_dtypes('bool')
        elif dtype == '~mask':
            return df.select_dtypes(exclude='bool')
        else:
            assert False
    else:
        ## Need to assert df.columns and dtypes.index are the same
        if dtype == 'mask':
            return df.loc[:, (dtypes == 'bool')].astype(bool)
        elif dtype == '~mask':
            return df.loc[:, (dtypes != 'bool')]
        else:
            assert False

def smart_qcut_dummify(x, q):
    z = smart_qcut(x, q)
    return pd.get_dummies(z, prefix=z.name)

def smart_qcut(x, q):
    # ignore strings when performing qcut
    x = x.copy()
    x = x.apply(make_float)
    m = x.apply(np.isreal)
    if x.loc[m].dropna().nunique() > 1: # when there is more than one numeric value
        x.loc[m] = pd.qcut(x.loc[m].to_numpy(), q=q, duplicates='drop')
#         bins = np.percentile(x.loc[m].to_numpy(), [0, 20, 40, 60, 80, 100])
#         x.loc[m] = pd.cut(x, bins)
    return x
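
# Illustrative example (hypothetical values, added for clarity):
#   smart_qcut(pd.Series([1, 2, 3, 4, 'ERROR'], name='HR'), q=2)
#   -> numeric entries are replaced by their quantile interval
#      (e.g. (0.999, 2.5] and (2.5, 4.0]) while the string 'ERROR' is left as-is;
#      smart_qcut_dummify then one-hot encodes the resulting categories.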

def smart_dummify_impute(x):
    x = x.copy()
    x = x.apply(make_float)
    m = x.apply(np.isreal)
    if x.loc[m].dropna().nunique() == 0: # all string values
        return pd.get_dummies(x, prefix=x.name, prefix_sep=':')
    else:
        x = pd.DataFrame(x)
        x = x.fillna(x.mean()) # simple mean imputation
        return x
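
# Illustrative examples (hypothetical values, added for clarity):
#   - an all-string column, e.g. pd.Series(['A', 'B', 'A'], name='drug'),
#     is one-hot encoded into columns 'drug:A' and 'drug:B';
#   - a numeric column with gaps, e.g. pd.Series([1.0, np.nan, 3.0]),
#     has missing entries filled with the column mean (here 2.0).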

def make_float(v):
    try:
        return float(v)
    except ValueError:
        return v

def is_numeric(v):
    try:
        float(v)
        return True
    except ValueError:
        return False


######
# Time-series internals
######

def _get_time_bins(T, dt):
    # Defines the boundaries of time bins [0, dt, 2*dt, ..., k*dt] 
    # where k*dt <= T and (k+1)*dt > T
    return np.arange(0, dt*(np.floor(T/dt)+1), dt)

def _get_time_bins_index(T, dt):
    return pd.cut([], _get_time_bins(T, dt), right=False).categories
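
# Illustrative example (added for clarity): with T=5 and dt=2,
#   _get_time_bins(5, 2)       -> array([0., 2., 4.])
#   _get_time_bins_index(5, 2) -> the two left-closed intervals [0.0, 2.0) and [2.0, 4.0)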

def pivot_event_table(df):
    df = df.copy()
    
    # Handle cases where the same variable is recorded multiple times with the same timestamp
    # Adjust the timestamps by epsilon so that all timestamps are unique
    eps = 1e-6
    m_dups = df.duplicated([ID_col, t_col, var_col], keep=False)
    df_dups = df[m_dups].copy()
    for v, df_v in df_dups.groupby(var_col):
        df_dups.loc[df_v.index, t_col] += eps * np.arange(len(df_v))
    
    df = pd.concat([df[~m_dups], df_dups])
    assert not df.duplicated([ID_col, t_col, var_col], keep=False).any()
    
    return pd.pivot_table(df, values=val_col, index=t_col, columns=var_col, aggfunc='first')

def presence_mask(df_i, variables, T, dt):
    # for each itemid
    # for each time bin, whether there is real measurement
    if len(df_i) == 0:
        mask_i = pd.DataFrame().reindex(index=_get_time_bins_index(T, dt), columns=list(variables), fill_value=False)
    else:
        mask_i = df_i.groupby(
            pd.cut(df_i.index, _get_time_bins(T, dt), right=False)
        ).apply(lambda x: x.notnull().any())
        mask_i = mask_i.reindex(columns=variables, fill_value=False)
    
    mask_i.columns = [str(col) + '_mask' for col in mask_i.columns]
    return mask_i

def get_delta_time(mask_i):
    a = 1 - mask_i
    b = a.cumsum()
    c = mask_i.cumsum()
    dt_i = b - b.where(~a.astype(bool)).ffill().fillna(0).astype(int)
    
    # the delta time for itemid's for which there are no measurements must be 0
    # or where there's no previous measurement and no imputation
    dt_i[c == 0] = 0
    
    dt_i.columns = [str(col).replace('_mask', '_delta_time') for col in dt_i.columns]
    return dt_i
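
# Illustrative example (one hypothetical mask column over 5 time bins, added for clarity):
#   mask       = [False, True, False, False, True]
#   delta_time = [0,     0,    1,     2,     0   ]
# i.e. the number of bins since the most recent recorded measurement, set to 0
# both at bins that contain a measurement and at bins before the first measurement.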

def impute_ffill(df, columns, T, dt, mask=None):
    if len(df) == 0:
        return pd.DataFrame().reindex(columns=columns, fill_value=np.nan)
    
    if mask is None:
        mask = presence_mask(df, columns, T, dt)

    # Calculate time bins, sorted by time
    df_bin = df.copy()
    df_bin.index = pd.cut(df_bin.index, _get_time_bins(T, dt), right=False)
    
    # Compute the values used for imputation
    ## Collapse duplicate time bins, keeping latest values for each time bin
    df_imp = df_bin.ffill()
    df_imp = df_imp[~df_imp.index.duplicated(keep='last')]
    ## Reindex to make sure every time bin exists
    df_imp = df_imp.reindex(_get_time_bins_index(T, dt))
    ## Forward fill the missing time bins
    df_imp = df_imp.ffill()

    df_ff = df_imp
    df_ff[mask.to_numpy()] = np.nan
    df_ff.index = df_ff.index.mid ## Imputed values lie at the middle of a time bin
    df_ff = pd.concat([df, df_ff]).dropna(how='all')
    df_ff.sort_index(inplace=True)
    return df_ff

def most_recent_values(df_i, columns, T, dt):
    df_bin = df_i.copy()
    df_bin.index = pd.cut(df_bin.index, _get_time_bins(T, dt), right=False)
    df_v = df_bin.groupby(level=0).last()
    df_v.columns = [str(col) + '_value' for col in df_v.columns]
    df_v = df_v.reindex(_get_time_bins_index(T, dt))
    return df_v

def summary_statistics(df_i, columns, stats_functions, T, dt):
    # e.g. stats_functions=['mean', 'min', 'max']
    if len(columns) == 0:
        return pd.DataFrame().reindex(_get_time_bins_index(T, dt))
    else:
        # Encode statistics for numeric, frequent variables
        df_numeric = df_i[columns]
        df = df_numeric.copy().astype(float)
        df.index = pd.cut(df.index, _get_time_bins(T, dt), right=False)
        df_v = df.reset_index().groupby('index').agg(stats_functions)
        df_v.columns = list(map('_'.join, df_v.columns.values))
        df_v = df_v.reindex(_get_time_bins_index(T, dt))
        return df_v

def check_imputed_output(df_v):
    # Check imputation is successful
    ## If column is all null -> OK
    ## If column is all non-null -> OK
    ## If column has some null -> should only occur at the beginning
    not_null = df_v.notnull().all()
    all_null = df_v.isnull().all()
    cols_to_check = list(df_v.columns[~(not_null | all_null)])

    for col in cols_to_check:
        x = df_v[col].to_numpy()
        last_null_idx = np.where(pd.isnull(x))[0].max() # Find index of last nan
        assert pd.isnull(x[:(last_null_idx+1)]).all() # all values up to here are nan
        assert (~pd.isnull(x[(last_null_idx+1):])).all() # all values after here are not nan
    return


######
# Post-filter: feature selection classes
######
try:
    from sklearn.feature_selection._base import SelectorMixin
except ImportError:
    from sklearn.feature_selection.base import SelectorMixin

class FrequencyThreshold_temporal(
    sklearn.base.BaseEstimator,
    SelectorMixin
):
    def __init__(self, threshold=0., L=None):
        assert L is not None
        self.threshold = threshold
        self.L = L
    
    def fit(self, X, y=None):
        # Reshape to be 3-dimensional array
        NL, D = X.shape
        X = X.reshape((int(NL/self.L), self.L, D))
        
        # Collapse time dimension, generating NxD matrix
        X_notalways0 = X.any(axis=1)
        X_notalways1 = (1-X).any(axis=1)
        
        self.freqs_notalways0 = np.mean(X_notalways0, axis=0)
        self.freqs_notalways1 = np.mean(X_notalways1, axis=0)
        return self

    def _get_support_mask(self):
        mask = np.logical_and(
            self.freqs_notalways0 > self.threshold,
            self.freqs_notalways1 > self.threshold,
        )
        if hasattr(mask, "toarray"):
            mask = mask.toarray()
        if hasattr(mask, "todense"):
            mask = mask.todense()
        return mask
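
# Illustrative usage (hypothetical shapes, added for clarity):
#   X_mask is a binary matrix of shape (N*L, D): N examples with L time bins each,
#   stacked along axis 0.
#   sel = FrequencyThreshold_temporal(threshold=0.05, L=L)
#   X_kept = sel.fit_transform(X_mask)
#   # a column is kept only if it is not identically 0 over time in more than 5% of
#   # examples, and also not identically 1 over time in more than 5% of examples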

# Keep only first feature in a pairwise perfectly correlated feature group
class CorrelationSelector(
    sklearn.base.BaseEstimator,
    SelectorMixin,
):
    def __init__(self):
        super().__init__()
    
    def fit(self, X, y=None):
        if hasattr(X, "to_scipy_sparse"):   # sparse matrix
            X = X.to_scipy_sparse()
        
        # Calculate correlation matrix
        # Keep only lower triangular matrix
        if scipy.sparse.issparse(X):
            self.corr_matrix = sparse_corrcoef(X.T)
        else:
            self.corr_matrix = np.corrcoef(X.T)
        np.fill_diagonal(self.corr_matrix, 0)
        self.corr_matrix *= np.tri(*self.corr_matrix.shape)
        
        # get absolute value
        corr = abs(self.corr_matrix)
        
        # coefficient close to 1 means perfectly correlated
        # Compare each feature to previous feature (smaller index) to see if they have correlation of 1
        to_drop = np.isclose(corr, 1.0).sum(axis=1).astype(bool)
        self.to_keep = ~to_drop
        
        return self

    def _get_support_mask(self):
        return self.to_keep
    
    def get_feature_aliases(self, feature_names):
        feature_names = [str(n) for n in feature_names]
        corr_matrix = self.corr_matrix
        flags = np.isclose(abs(corr_matrix), 1.0)
        alias_map = defaultdict(list)
        for i in range(1, corr_matrix.shape[0]):
            for j in range(i):
                if flags[i,j]:
                    if np.isclose(corr_matrix[i,j], 1.0):
                        alias_map[feature_names[j]].append(feature_names[i])
                    elif np.isclose(corr_matrix[i,j], -1.0):
                        alias_map[feature_names[j]].append('~{' + feature_names[i] + '}')
                    else:
                        assert False

                    # Only save alias for first in the list
                    break
        return dict(alias_map)
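
# Illustrative usage (hypothetical inputs, added for clarity):
#   sel = CorrelationSelector().fit(X)                # X: (n_samples, n_features)
#   X_uncorr = sel.transform(X)                       # drops features perfectly correlated
#                                                     # (|r| close to 1) with an earlier feature
#   aliases = sel.get_feature_aliases(feature_names)
#   # maps each kept feature name to the dropped names it stands in for;
#   # '~{name}' marks a perfect negative correlation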

# https://stackoverflow.com/questions/19231268/correlation-coefficients-for-sparse-matrix-in-python
def sparse_corrcoef(A, B=None):
    if B is not None:
        A = scipy.sparse.vstack((A, B), format='csr')
    
    A = A.astype(np.float64)
    n = A.shape[1]

    # Compute the covariance matrix
    rowsum = A.sum(1)
    centering = rowsum.dot(rowsum.T.conjugate()) / n
    C = (A.dot(A.T.conjugate()) - centering) / (n - 1)

    # The correlation coefficients are given by
    # C_{i,j} / sqrt(C_{i} * C_{j})
    d = np.diag(C)
    coeffs = C / np.sqrt(np.outer(d, d))

    return np.array(coeffs)
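
# Illustrative note: like np.corrcoef, sparse_corrcoef expects variables in rows and
# observations in columns; e.g. for a scipy.sparse feature matrix X of shape
# (n_samples, n_features), sparse_corrcoef(X.T) returns a dense
# (n_features, n_features) array of Pearson correlation coefficients.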