GitLab · MLD3 / FIDDLE · Commits

Commit 353d67b7
Authored Feb 25, 2020 by Shengpu Tang (tangsp)

    Refactor 2020-Feb

Parent: 5432c2f3
Changes: 3 files (FIDDLE/helpers.py, FIDDLE/run.py, FIDDLE/steps.py)
FIDDLE/helpers.py

```diff
+import argparse
+
+def str2bool(v):
+    if isinstance(v, bool):
+        return v
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
+
 from .config import *
 import pandas as pd
 import numpy as np
```
...
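A note on why `str2bool` exists at all: `argparse` with `type=bool` silently treats every non-empty string as `True`, so `--flag False` still parses as `True`. A minimal usage sketch; the `--binarize` flag name is borrowed from the `args.binarize` attribute used later in steps.py, and the default value is an assumption:

```python
import argparse
from FIDDLE.helpers import str2bool  # assumes the FIDDLE package is importable

parser = argparse.ArgumentParser()
# With type=bool, bool('no') would be True (any non-empty string is truthy);
# str2bool maps the usual yes/no spellings to real booleans instead.
parser.add_argument('--binarize', type=str2bool, default=True)

args = parser.parse_args(['--binarize', 'no'])
assert args.binarize is False
```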
FIDDLE/run.py

```diff
@@ -5,17 +5,9 @@ import numpy as np
 import time
 import os
 
-def str2bool(v):
-    if isinstance(v, bool):
-        return v
-    if v.lower() in ('yes', 'true', 't', 'y', '1'):
-        return True
-    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
-        return False
-    else:
-        raise argparse.ArgumentTypeError('Boolean value expected.')
 import argparse
+from .helpers import str2bool
 
 parser = argparse.ArgumentParser(description='')
 parser.add_argument('--T',  type=float, required=True)
 parser.add_argument('--dt', type=float, required=True)
```
...
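run.py later prints N (number of examples) and L (number of time bins); in FIDDLE, the required window length `--T` and bin width `--dt` determine L. A sketch of that relationship, where the floor rounding is an assumption rather than something visible in this diff:

```python
import numpy as np

T, dt = 48.0, 1.0            # e.g. a 48-hour window in 1-hour bins
L = int(np.floor(T / dt))    # number of time bins per example (assumed rounding)
print('L = {}'.format(L))    # -> L = 48, matching the 'L = {}' printout in run.py
```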
```diff
@@ -96,6 +88,10 @@ print('N = {}'.format(N))
 print('L = {}'.format(L))
 print('', flush=True)
 
 ######
 # Main
 ######
+if args.prefilter:
+    print_header('1) Pre-filter')
+    df_data = pre_filter(df_data, theta_1, df_population, args)
+
```
...
```diff
@@ -106,7 +102,7 @@ df_data, df_types = detect_variable_data_type(df_data, value_type_override, args
 df_time_invariant, df_time_series = split_by_timestamp_type(df_data)
 
 # Process time-invariant data
-s, s_feature_names, s_feature_aliases = transform_time_invariant(df_time_invariant, args)
+s, s_feature_names, s_feature_aliases = process_time_invariant(df_time_invariant, args)
 
 # Process time-dependent data
-X, X_feature_names, X_feature_aliases = transform_time_dependent(df_time_series, args)
+X, X_feature_names, X_feature_aliases = process_time_dependent(df_time_series, args)
```
FIDDLE/steps.py

```diff
@@ -6,6 +6,7 @@ FIDDLE Preprocessing steps
 """
 from .helpers import *
 import time
+import json
 
 def pre_filter(df, threshold, df_population, args):
     T = int(args.T)
```
...
```diff
@@ -106,26 +107,27 @@ def split_by_timestamp_type(df):
     print('# rows (time-dependent):', len(df_time_series))
     return df_time_invariant, df_time_series
 
-def transform_time_invariant(df_data_time_invariant, args):
+def process_time_invariant(df_data_time_invariant, args):
     data_path = args.data_path
     df_population = args.df_population
     theta_2 = args.theta_2
 
-    print_header('2.1) Transform time-invariant data', char='-')
+    print_header('2-A) Transform time-invariant data', char='-')
     dir_path = data_path + '/'
     start_time = time.time()
 
     ## Create Nxd^ table
-    df_time_invariant = process_time_invariant_table(df_data_time_invariant, df_population)
+    df_time_invariant = transform_time_invariant_table(df_data_time_invariant, df_population)
     print('Time elapsed: %f seconds' % (time.time() - start_time))
 
     ## Discretize
     s_all, s_all_feature_names = map_time_invariant_features(df_time_invariant, args.binarize)
     sparse.save_npz(dir_path + 's_all.npz', s_all)
     np.savetxt(dir_path + 's_all.feature_names.txt', s_all_feature_names, '"%s"')
+    with open(dir_path + 's_all.feature_names.json', 'w') as f:
+        json.dump(list(s_all_feature_names), f, sort_keys=True)
     print('Time elapsed: %f seconds' % (time.time() - start_time))
 
-    print_header('3.1) Post-filter time-invariant data', char='-')
+    print_header('3-A) Post-filter time-invariant data', char='-')
 
     ## Filter
     s, s_feature_names, s_feature_aliases = post_filter(s_all, s_all_feature_names, theta_2)
```
...
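The refactor adds JSON copies of the feature-name lists next to the existing .txt files. A sketch of reading these artifacts back; it assumes the `sparse` name in steps.py refers to the pydata/sparse package, and `data_path` is a hypothetical output directory:

```python
import json
import sparse  # pydata/sparse, assumed to be the `sparse` used in steps.py

data_path = './output/'  # hypothetical output directory
s_all = sparse.load_npz(data_path + 's_all.npz')
with open(data_path + 's_all.feature_names.json') as f:
    s_all_feature_names = json.load(f)

# One name per feature column; names align with the matrix's second axis.
assert s_all.shape[1] == len(s_all_feature_names)
```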
```diff
@@ -136,35 +138,38 @@ def transform_time_invariant(df_data_time_invariant, args):
     print('Output')
     print('s: shape={}, density={:.3f}'.format(s.shape, s.density))
     sparse.save_npz(dir_path + 's.npz', s)
     np.savetxt(dir_path + 's.feature_names.txt', s_feature_names, '"%s"')
     with open(dir_path + 's.feature_aliases.yml', 'w') as f:
         yaml.dump(s_feature_aliases, f, default_flow_style=False)
+    with open(dir_path + 's.feature_names.json', 'w') as f:
+        json.dump(list(s_feature_names), f, sort_keys=True)
+    with open(dir_path + 's.feature_aliases.json', 'w') as f:
+        json.dump(s_feature_aliases, f, sort_keys=True)
     print('Total time: %f seconds' % (time.time() - start_time))
     print('', flush=True)
 
     return s, s_feature_names, s_feature_aliases
 
-def transform_time_dependent(df_data_time_series, args):
+def process_time_dependent(df_data_time_series, args):
     data_path = args.data_path
     theta_2 = args.theta_2
 
-    print_header('2.2) Transform time-dependent data', char='-')
+    print_header('2-B) Transform time-dependent data', char='-')
     dir_path = data_path + '/'
     start_time = time.time()
 
     ## Create NxLxD^ table
-    df_time_series, dtypes_time_series = process_time_series_table(df_data_time_series, args)
+    df_time_series, dtypes_time_series = transform_time_series_table(df_data_time_series, args)
     print('Time elapsed: %f seconds' % (time.time() - start_time))
 
     ## Map variables to features
     X_all, X_all_feature_names = map_time_series_features(df_time_series, dtypes_time_series, args)
     sparse.save_npz(dir_path + 'X_all.npz', X_all)
     np.savetxt(dir_path + 'X_all.feature_names.txt', X_all_feature_names, '"%s"')
+    with open(dir_path + 'X_all.feature_names.json', 'w') as f:
+        json.dump(list(X_all_feature_names), f, sort_keys=True)
     print('Time elapsed: %f seconds' % (time.time() - start_time))
 
     ## Filter features
-    print_header('3.2) Post-filter time-dependent data', char='-')
+    print_header('3-B) Post-filter time-dependent data', char='-')
     print(X_all.shape, X_all.density)
     X, X_feature_names, X_feature_aliases = post_filter_time_series(X_all, X_all_feature_names, theta_2, args)
     print(X.shape, X.density)
```
...
```diff
@@ -175,9 +180,10 @@ def transform_time_dependent(df_data_time_series, args):
     print('Output')
     print('X: shape={}, density={:.3f}'.format(X.shape, X.density))
     sparse.save_npz(dir_path + 'X.npz', X)
     np.savetxt(dir_path + 'X.feature_names.txt', X_feature_names, '"%s"')
     with open(dir_path + 'X.feature_aliases.yml', 'w') as f:
         yaml.dump(X_feature_aliases, f, default_flow_style=False)
+    with open(dir_path + 's.feature_names.json', 'w') as f:  # note: 's.' here looks like a typo for 'X.'
+        json.dump(list(X_feature_names), f, sort_keys=True)
+    with open(dir_path + 'X.feature_aliases.json', 'w') as f:
+        json.dump(X_feature_aliases, f, sort_keys=True)
     print('Total time: %f seconds' % (time.time() - start_time))
     print('', flush=True)
```
...
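Per the "Create NxLxD^ table" comment, X is a three-dimensional tensor (N examples × L time bins × D features), which scipy.sparse cannot represent; that is consistent with the pydata/sparse package assumed below. A sketch of loading the saved outputs, with the package choice and output directory as assumptions:

```python
import json
import sparse  # pydata/sparse; supports the 3-D COO tensors implied by NxLxD^

data_path = './output/'  # hypothetical output directory
X = sparse.load_npz(data_path + 'X.npz')
with open(data_path + 'X.feature_aliases.json') as f:
    X_feature_aliases = json.load(f)

N, L, D = X.shape          # examples x time bins x features
X_dense = X.todense()      # numpy array; only advisable when N*L*D is small
print('X: shape={}, density={:.3f}'.format(X.shape, X.density))
```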
```diff
@@ -187,7 +193,7 @@ def transform_time_dependent(df_data_time_series, args):
 ######
 # Time-invariant routines
 ######
 
-def process_time_invariant_table(df_in, df_population):
+def transform_time_invariant_table(df_in, df_population):
     df_in = df_in.copy()
     # Recorded Value (np.nan if not recorded)
```
...
```diff
@@ -296,7 +302,7 @@ def func_encode_single_time_series(i, g, variables, variables_num_freq, T, dt, s
         raise Exception(i)
     return i, df_out
 
-def process_time_series_table(df_in, args):
+def transform_time_series_table(df_in, args):
     data_path = args.data_path
     theta_freq = args.theta_freq
     stats_functions = args.stats_functions
```
...
```diff
@@ -389,6 +395,7 @@ def process_time_series_table(df_in, args):
     df_time_series = pd.DataFrame(data=time_series, index=index, columns=columns)
 
     # Print metadata
+    print('DONE: Transforming each example...')
 
     ## Freq: Count missing entries using mask
     ts_mask = df_time_series[[col for col in df_time_series if col.endswith('_mask')]]
     ts_mask.columns = [col.replace('_mask', '') for col in ts_mask.columns]
```
...
```diff
@@ -404,14 +411,14 @@ def process_time_series_table(df_in, args):
     imputed = (1 - ts_mask).astype(bool) & (ts_delta_time > 0)
     print('(freq) number of imputed entries :\t', '{}'.format(imputed.sum().sum(), ts_delta_time.size))
-    print(imputed.sum().reset_index().to_string(header=None, index=None))
+    imputed.sum().rename('count').to_csv(data_path + '/' + 'freq_imputed.csv')
 
     not_imputed = (1 - ts_mask).astype(bool) & (ts_delta_time == 0)
     print('(freq) number of not imputed entries :\t', '{}'.format(not_imputed.sum().sum(), ts_delta_time.size))
-    print(not_imputed.sum().reset_index().to_string(header=None, index=None))
+    not_imputed.sum().rename('count').to_csv(data_path + '/' + 'freq_not_imputed.csv')
 
-    ## Non-Freq: Count misisng entries
+    ## Non-Freq: Count missing entries
     non_freq_cols = sorted([c + '_value' for c in set(variables) - set(variables_num_freq)])
     non_freqs = df_time_series[non_freq_cols]
     print('(non-freq) number of missing entries :\t',
```
...
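The imputed/not-imputed split keys off two parallel frames: ts_mask (1 where a value was actually recorded in that bin) and ts_delta_time (time since the last recording). A toy reconstruction of that boolean logic on made-up data; the column names and frame contents are invented, and the reading of delta_time == 0 as "nothing recorded earlier to carry forward" is an interpretation of this commit's convention:

```python
import pandas as pd

# Hypothetical mask (1 = recorded) and time-since-last-recording frames
ts_mask = pd.DataFrame({'HR': [1, 0, 0], 'SpO2': [1, 1, 0]})
ts_delta_time = pd.DataFrame({'HR': [0, 1, 2], 'SpO2': [0, 0, 1]})

# Carried forward (imputed): not recorded now, but recorded at an earlier bin
imputed = (1 - ts_mask).astype(bool) & (ts_delta_time > 0)
# Not imputed: not recorded now and no earlier recording to carry forward
not_imputed = (1 - ts_mask).astype(bool) & (ts_delta_time == 0)

print(imputed.sum().rename('count'))      # HR: 2, SpO2: 1
print(not_imputed.sum().rename('count'))  # HR: 0, SpO2: 0
```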