Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
MLD3
FIDDLE
Commits
608320bd
Commit
608320bd
authored
Jun 17, 2019
by
Shengpu Tang (tangsp)
Browse files
bug fixes
parent
83a6d599
Changes
4
Hide whitespace changes
Inline
Side-by-side
FIDDLE/helpers.py
View file @
608320bd
...
...
@@ -17,101 +17,11 @@ def print_header(*content, char='='):
print
(
*
content
)
print
(
char
*
80
,
flush
=
True
)
######
# Post-filter: feature selection classes
######
class FrequencyThreshold_temporal(
    sklearn.base.BaseEstimator,
    sklearn.feature_selection.base.SelectorMixin,
):
    """Select temporal features that are not constantly 0 or constantly 1.

    Operates on a stacked design matrix of shape (N*L, D), where N is the
    number of examples, L the number of time steps per example, and D the
    number of features. A feature is kept only if the fraction of examples
    whose series is not all zeros AND the fraction whose series is not all
    ones both exceed ``threshold``.

    Parameters
    ----------
    threshold : float, default 0.0
        Minimum frequency (exclusive) required in both directions to keep
        a feature.
    L : int
        Number of time steps per example. Required; the ``None`` default
        exists only so sklearn's get_params/set_params introspection works.
    """

    def __init__(self, threshold=0., L=None):
        assert L is not None
        self.threshold = threshold
        self.L = L

    def fit(self, X, y=None):
        """Record per-feature frequencies of non-constant time series.

        Parameters
        ----------
        X : array-like of shape (N*L, D)
            Stacked time-series matrix; N*L is assumed divisible by L.
        y : ignored
            Present for sklearn API compatibility.
        """
        # Reshape to a 3-dimensional (N, L, D) array. Floor division keeps
        # the example count an exact int (int(NL / self.L) goes through
        # float division, which can lose precision for very large NL).
        NL, D = X.shape
        X = X.reshape((NL // self.L, self.L, D))

        # Collapse the time dimension, generating an (N, D) matrix:
        # True where a series is ever nonzero / ever differs from one.
        X_notalways0 = X.any(axis=1)
        X_notalways1 = (1 - X).any(axis=1)

        # Densify sparse results so np.mean yields per-column frequencies.
        # NOTE(review): the hasattr checks probe X (post-reshape), not the
        # collapsed matrices — presumably X is a 3-D sparse tensor whose
        # reductions are sparse too; confirm against the caller.
        if hasattr(X, "toarray"):
            X_notalways0 = X_notalways0.toarray()
            X_notalways1 = X_notalways1.toarray()
        if hasattr(X, "todense"):
            X_notalways0 = X_notalways0.todense()
            X_notalways1 = X_notalways1.todense()

        self.freqs_notalways0 = np.mean(X_notalways0, axis=0)
        self.freqs_notalways1 = np.mean(X_notalways1, axis=0)
        return self

    def _get_support_mask(self):
        # Keep features informative in both directions; mask is consumed
        # by SelectorMixin.transform.
        return np.logical_and(
            self.freqs_notalways0 > self.threshold,
            self.freqs_notalways1 > self.threshold,
        )
# Keep only first feature in a pairwise perfectly correlated feature group
class CorrelationSelector(
    sklearn.base.BaseEstimator,
    sklearn.feature_selection.base.SelectorMixin,
):
    """Drop features perfectly correlated (or anti-correlated) with an
    earlier-indexed feature, keeping only the first of each group.
    """

    def __init__(self):
        super().__init__()

    def fit(self, X, y=None):
        """Compute a lower-triangular correlation matrix over X's columns
        and mark every column that duplicates an earlier one.

        X : array-like of shape (n_samples, n_features); y is ignored
        (present only for sklearn API compatibility).
        """
        if hasattr(X, "toarray"):
            # sparse matrix
            X = X.toarray()
        if hasattr(X, "todense"):
            # sparse matrix
            X = X.todense()

        # Calculate correlation matrix
        # Keep only lower triangular matrix
        # (diagonal zeroed so a feature never matches itself)
        self.corr_matrix = np.corrcoef(X.T)
        np.fill_diagonal(self.corr_matrix, 0)
        self.corr_matrix *= np.tri(*self.corr_matrix.shape)

        # get absolute value
        corr = abs(self.corr_matrix)

        # coefficient close to 1 means perfectly correlated
        # Compare each feature to previous feature (smaller index) to see if they have correlation of 1
        to_drop = np.isclose(corr, 1.0).sum(axis=1).astype(bool)
        self.to_keep = ~to_drop
        return self

    def _get_support_mask(self):
        # Boolean mask consumed by SelectorMixin.transform.
        return self.to_keep

    def get_feature_aliases(self, feature_names):
        """Map each kept feature name to the dropped names it represents.

        Anti-correlated aliases are wrapped as '~{name}'. Only the first
        (smallest-index) correlated partner records the alias, mirroring
        the keep-first policy of fit().
        """
        feature_names = [str(n) for n in feature_names]
        corr_matrix = self.corr_matrix
        flags = np.isclose(abs(corr_matrix), 1.0)
        alias_map = defaultdict(list)
        for i in range(1, corr_matrix.shape[0]):
            for j in range(i):
                if flags[i, j]:
                    if np.isclose(corr_matrix[i, j], 1.0):
                        alias_map[feature_names[j]].append(feature_names[i])
                    elif np.isclose(corr_matrix[i, j], -1.0):
                        # flags already established |corr| ~ 1, so one of
                        # the two signed branches must match
                        alias_map[feature_names[j]].append('~{' + feature_names[i] + '}')
                    else:
                        assert False
                    # Only save alias for first in the list
                    break
        return dict(alias_map)
######
# Transform
######
def get_unique_variables(df):
    """Return the sorted distinct variable names found in ``df``.

    Relies on the module-level ``var_col`` for the variable column name.
    """
    distinct_names = df[var_col].unique()
    return sorted(distinct_names)
...
...
@@ -193,7 +103,9 @@ def is_numeric(v):
######
def
_get_time_bins
(
T
,
dt
):
return
np
.
arange
(
0
,
T
+
dt
,
dt
)
# Defines the boundaries of time bins [0, dt, 2*dt, ..., k*dt]
# where k*dt <= T and (k+1)*dt > T
return
np
.
arange
(
0
,
dt
*
(
T
//
dt
+
1
),
dt
)
def
_get_time_bins_index
(
T
,
dt
):
return
pd
.
Index
(
pd
.
interval_range
(
start
=
0
,
end
=
T
,
freq
=
dt
,
closed
=
'left'
))
...
...
@@ -305,3 +217,95 @@ def check_imputed_output(df_v):
assert
pd
.
isnull
(
x
[:(
last_null_idx
+
1
)]).
all
()
# all values up to here are nan
assert
(
~
pd
.
isnull
(
x
[(
last_null_idx
+
1
):])).
all
()
# all values after here are not nan
return
######
# Post-filter: feature selection classes
######
class FrequencyThreshold_temporal(
        sklearn.base.BaseEstimator,
        sklearn.feature_selection.base.SelectorMixin):
    """Feature selector that keeps only non-constant time-series features.

    Works on a stacked (N*L, D) matrix, where each example contributes L
    consecutive rows. A feature survives when both the fraction of
    examples whose series is not all zeros and the fraction whose series
    is not all ones exceed ``threshold``.
    """

    def __init__(self, threshold=0., L=None):
        # L has no sensible default; the None placeholder only satisfies
        # sklearn's parameter introspection.
        assert L is not None
        self.threshold = threshold
        self.L = L

    def fit(self, X, y=None):
        """Record per-feature 'not always 0' / 'not always 1' frequencies."""
        # View the stacked matrix as (N, L, D): one L-step series per example.
        n_rows, n_features = X.shape
        n_examples = int(n_rows / self.L)
        X = X.reshape((n_examples, self.L, n_features))

        # Collapse the time axis: does each series ever leave 0 / leave 1?
        not_all_zero = X.any(axis=1)
        not_all_one = (1 - X).any(axis=1)

        # Convert sparse reduction results to dense before averaging.
        if hasattr(X, "toarray"):
            not_all_zero = not_all_zero.toarray()
            not_all_one = not_all_one.toarray()
        if hasattr(X, "todense"):
            not_all_zero = not_all_zero.todense()
            not_all_one = not_all_one.todense()

        self.freqs_notalways0 = np.mean(not_all_zero, axis=0)
        self.freqs_notalways1 = np.mean(not_all_one, axis=0)
        return self

    def _get_support_mask(self):
        """Mask of features informative in both directions."""
        above0 = self.freqs_notalways0 > self.threshold
        above1 = self.freqs_notalways1 > self.threshold
        return np.logical_and(above0, above1)
# Keep only first feature in a pairwise perfectly correlated feature group
class CorrelationSelector(
    sklearn.base.BaseEstimator,
    sklearn.feature_selection.base.SelectorMixin,
):
    """Drop features perfectly correlated (or anti-correlated) with an
    earlier-indexed feature, keeping only the first of each group.
    """

    def __init__(self):
        super().__init__()

    def fit(self, X, y=None):
        """Compute a lower-triangular correlation matrix over X's columns
        and mark every column that duplicates an earlier one.

        X : array-like of shape (n_samples, n_features); y is ignored
        (present only for sklearn API compatibility).
        """
        if hasattr(X, "toarray"):
            # sparse matrix
            X = X.toarray()
        if hasattr(X, "todense"):
            # sparse matrix
            X = X.todense()

        # Calculate correlation matrix
        # Keep only lower triangular matrix
        # (diagonal zeroed so a feature never matches itself)
        self.corr_matrix = np.corrcoef(X.T)
        np.fill_diagonal(self.corr_matrix, 0)
        self.corr_matrix *= np.tri(*self.corr_matrix.shape)

        # get absolute value
        corr = abs(self.corr_matrix)

        # coefficient close to 1 means perfectly correlated
        # Compare each feature to previous feature (smaller index) to see if they have correlation of 1
        to_drop = np.isclose(corr, 1.0).sum(axis=1).astype(bool)
        self.to_keep = ~to_drop
        return self

    def _get_support_mask(self):
        # Boolean mask consumed by SelectorMixin.transform.
        return self.to_keep

    def get_feature_aliases(self, feature_names):
        """Map each kept feature name to the dropped names it represents.

        Anti-correlated aliases are wrapped as '~{name}'. Only the first
        (smallest-index) correlated partner records the alias, mirroring
        the keep-first policy of fit().
        """
        feature_names = [str(n) for n in feature_names]
        corr_matrix = self.corr_matrix
        flags = np.isclose(abs(corr_matrix), 1.0)
        alias_map = defaultdict(list)
        for i in range(1, corr_matrix.shape[0]):
            for j in range(i):
                if flags[i, j]:
                    if np.isclose(corr_matrix[i, j], 1.0):
                        alias_map[feature_names[j]].append(feature_names[i])
                    elif np.isclose(corr_matrix[i, j], -1.0):
                        # flags already established |corr| ~ 1, so one of
                        # the two signed branches must match
                        alias_map[feature_names[j]].append('~{' + feature_names[i] + '}')
                    else:
                        assert False
                    # Only save alias for first in the list
                    break
        return dict(alias_map)
FIDDLE/run.py
View file @
608320bd
...
...
@@ -7,14 +7,14 @@ import os
import argparse

# Command-line interface for FIDDLE feature extraction.
# NOTE: the original text registered every option twice; argparse raises
# ArgumentError on duplicate option strings, so each option is declared
# exactly once here.
parser = argparse.ArgumentParser(description='')
parser.add_argument('--data_path', type=str, required=True)
parser.add_argument('--population', type=str, required=True)
parser.add_argument('--T', type=float, required=True)      # observation window length
parser.add_argument('--dt', type=float, required=True)     # time-bin width
parser.add_argument('--theta_1', type=float, default=0.001)
parser.add_argument('--theta_2', type=float, default=0.001)
parser.add_argument('--theta_freq', type=float, default=1.0)
parser.add_argument('--stats_functions', nargs='+', default=['min', 'max', 'mean'])

args = parser.parse_args()

data_path = args.data_path
...
...
@@ -29,13 +29,14 @@ theta_2 = args.theta_2
theta_freq = args.theta_freq
stats_functions = args.stats_functions

# Cohort table: one row per example, indexed by ID.
# The original text read the population CSV twice (the first read, with an
# ICUSTAY_ID->ID rename, was immediately overwritten); only the final form
# is kept. The population CSV is expected to already contain an 'ID'
# column — TODO confirm against the data files.
df_population = pd.read_csv(population).set_index('ID')
N = len(df_population)

# Number of dt-wide time bins that fit in the observation window T.
L = int(np.floor(T / dt))

# Stash shared values on args so downstream steps can read them.
args.df_population = df_population
args.N = N
args.L = L
args.parallel = parallel
if
os
.
path
.
isfile
(
data_path
+
'input_data.p'
):
input_fname
=
data_path
+
'input_data.p'
...
...
@@ -47,7 +48,7 @@ elif os.path.isfile(data_path + 'input_data.csv'):
input_fname
=
data_path
+
'input_data.csv'
df_data
=
pd
.
read_csv
(
input_fname
)
## Import helper after parsing arguments to share global variables
from
.steps
import
*
print
(
'Input data file:'
,
input_fname
)
...
...
FIDDLE/steps.py
View file @
608320bd
...
...
@@ -280,12 +280,13 @@ def func_encode_single_time_series(i, g, variables, variables_num_freq, T, dt, s
raise
Exception
(
i
)
return
i
,
df_out
def
process_time_series_table
(
df_in
,
args
,
parallel
=
True
):
def
process_time_series_table
(
df_in
,
args
):
data_path
=
args
.
data_path
theta_freq
=
args
.
theta_freq
stats_functions
=
args
.
stats_functions
N
,
L
=
args
.
N
,
args
.
L
df_population
=
args
.
df_population
parallel
=
args
.
parallel
## TODO: asserts shape of df_in
...
...
@@ -312,7 +313,10 @@ def process_time_series_table(df_in, args, parallel=True):
))
else
:
out
=
dict
(
func_encode_single_time_series
(
i
,
g
,
variables
,
variables_num_freq
)
for
i
,
g
in
tqdm
(
grouped
[:
N
]))
out
=
dict
(
func_encode_single_time_series
(
i
,
g
,
variables
,
variables_num_freq
,
args
.
T
,
args
.
dt
,
args
.
stats_functions
)
for
i
,
g
in
tqdm
(
grouped
[:
N
])
)
# Handle IDs not in the table
df_original
=
list
(
out
.
values
())[
0
]
...
...
README.md
View file @
608320bd
# FIDDLE
Required packages:
-
numpy
-
pandas
-
sparse
-
scikit-learn (imported as `sklearn`)
-
tqdm
-
joblib
Example usage:
```
bash
python
-m
FIDDLE.run
\
--data_path
=
'./test/small_test/'
\
--population
=
'./test/small_test/pop.csv'
\
--T
=
24
--dt
=
5
\
--theta_1
=
0.001
--theta_2
=
0.001
--theta_freq
=
1
\
--stats_functions
'min'
'max'
'mean'
```
The generated features and associated metadata are located in
`{data_path}/`
:
-
`s.npz`
: a sparse array of shape (N, d)
-
`X.npz`
: a sparse tensor of shape (N, L, D)
-
`s.feature_names.txt`
: names of _d_ time-invariant features
-
`X.feature_names.txt`
: names of _D_ time-series features
To load the generated features:
```
python
X
=
sparse
.
load_npz
(
'{data_path}/X.npz'
.
format
(
data_path
=
...)).
todense
()
s
=
sparse
.
load_npz
(
'{data_path}/s.npz'
.
format
(
data_path
=
...)).
todense
()
```
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment