Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
MLD3
FIDDLE
Commits
6d3811f0
Commit
6d3811f0
authored
Feb 25, 2020
by
Shengpu Tang (tangsp)
Browse files
Merge branch 'refactor' into 'master'
Refactor See merge request
!1
parents
5432c2f3
353d67b7
Changes
3
Hide whitespace changes
Inline
Side-by-side
FIDDLE/helpers.py
View file @
6d3811f0
import
argparse
def str2bool(v):
    """Convert a textual yes/no style value into a bool.

    Raises ``argparse.ArgumentTypeError`` on unrecognised input, which is the
    exception argparse expects from a ``type=`` converter.

    Args:
        v: A bool (returned unchanged), or a value whose string form is one of
            'yes', 'true', 't', 'y', '1' (-> True) or
            'no', 'false', 'f', 'n', '0' (-> False). Matching ignores case
            and surrounding whitespace.

    Returns:
        The corresponding bool.

    Raises:
        argparse.ArgumentTypeError: If the value is not a recognised spelling.
    """
    if isinstance(v, bool):
        return v
    # Normalise once. Going through str() means non-string inputs (e.g. the
    # int 1, or None) end in a boolean or ArgumentTypeError rather than an
    # AttributeError from calling .lower() on them.
    token = str(v).strip().lower()
    if token in ('yes', 'true', 't', 'y', '1'):
        return True
    if token in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
from
.config
import
*
import
pandas
as
pd
import
numpy
as
np
...
...
FIDDLE/run.py
View file @
6d3811f0
...
...
@@ -5,17 +5,9 @@ import numpy as np
import
time
import
os
def str2bool(v):
    """Map a yes/no style value onto a bool.

    Args:
        v: Either a bool, which is passed straight through, or a value whose
            string form (case-insensitive, surrounding whitespace ignored) is
            'yes'/'true'/'t'/'y'/'1' or 'no'/'false'/'f'/'n'/'0'.

    Returns:
        True or False according to the spelling given.

    Raises:
        argparse.ArgumentTypeError: If the value matches neither set.
    """
    if isinstance(v, bool):
        return v

    truthy = {'yes', 'true', 't', 'y', '1'}
    falsy = {'no', 'false', 'f', 'n', '0'}

    # str() first so that a non-string argument is reported through
    # ArgumentTypeError instead of failing on a missing .lower() method.
    text = str(v).strip().lower()
    if text in truthy:
        return True
    if text in falsy:
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
import
argparse
from
.helpers
import
str2bool
parser
=
argparse
.
ArgumentParser
(
description
=
''
)
parser
.
add_argument
(
'--T'
,
type
=
float
,
required
=
True
)
parser
.
add_argument
(
'--dt'
,
type
=
float
,
required
=
True
)
...
...
@@ -96,6 +88,10 @@ print('N = {}'.format(N))
print
(
'L = {}'
.
format
(
L
))
print
(
''
,
flush
=
True
)
######
# Main
######
if
args
.
prefilter
:
print_header
(
'1) Pre-filter'
)
df_data
=
pre_filter
(
df_data
,
theta_1
,
df_population
,
args
)
...
...
@@ -106,7 +102,7 @@ df_data, df_types = detect_variable_data_type(df_data, value_type_override, args
df_time_invariant
,
df_time_series
=
split_by_timestamp_type
(
df_data
)
# Process time-invariant data
s
,
s_feature_names
,
s_feature_aliases
=
transform
_time_invariant
(
df_time_invariant
,
args
)
s
,
s_feature_names
,
s_feature_aliases
=
process
_time_invariant
(
df_time_invariant
,
args
)
# Process time-dependent data
X
,
X_feature_names
,
X_feature_aliases
=
transform
_time_dependent
(
df_time_series
,
args
)
X
,
X_feature_names
,
X_feature_aliases
=
process
_time_dependent
(
df_time_series
,
args
)
FIDDLE/steps.py
View file @
6d3811f0
...
...
@@ -6,6 +6,7 @@ FIDDLE Preprocessing steps
"""
from
.helpers
import
*
import
time
import
json
def
pre_filter
(
df
,
threshold
,
df_population
,
args
):
T
=
int
(
args
.
T
)
...
...
@@ -106,26 +107,27 @@ def split_by_timestamp_type(df):
print
(
'# rows (time-dependent):'
,
len
(
df_time_series
))
return
df_time_invariant
,
df_time_series
def
transform
_time_invariant
(
df_data_time_invariant
,
args
):
def
process
_time_invariant
(
df_data_time_invariant
,
args
):
data_path
=
args
.
data_path
df_population
=
args
.
df_population
theta_2
=
args
.
theta_2
print_header
(
'2
.1
) Transform time-invariant data'
,
char
=
'-'
)
print_header
(
'2
-A
) Transform time-invariant data'
,
char
=
'-'
)
dir_path
=
data_path
+
'/'
start_time
=
time
.
time
()
## Create Nxd^ table
df_time_invariant
=
process
_time_invariant_table
(
df_data_time_invariant
,
df_population
)
df_time_invariant
=
transform
_time_invariant_table
(
df_data_time_invariant
,
df_population
)
print
(
'Time elapsed: %f seconds'
%
(
time
.
time
()
-
start_time
))
## Discretize
s_all
,
s_all_feature_names
=
map_time_invariant_features
(
df_time_invariant
,
args
.
binarize
)
sparse
.
save_npz
(
dir_path
+
's_all.npz'
,
s_all
)
np
.
savetxt
(
dir_path
+
's_all.feature_names.txt'
,
s_all_feature_names
,
'"%s"'
)
with
open
(
dir_path
+
's_all.feature_names.json'
,
'w'
)
as
f
:
json
.
dump
(
list
(
s_all_feature_names
),
f
,
sort_keys
=
True
)
print
(
'Time elapsed: %f seconds'
%
(
time
.
time
()
-
start_time
))
print_header
(
'3
.1
) Post-filter time-invariant data'
,
char
=
'-'
)
print_header
(
'3
-A
) Post-filter time-invariant data'
,
char
=
'-'
)
## Filter
s
,
s_feature_names
,
s_feature_aliases
=
post_filter
(
s_all
,
s_all_feature_names
,
theta_2
)
...
...
@@ -136,35 +138,38 @@ def transform_time_invariant(df_data_time_invariant, args):
print
(
'Output'
)
print
(
's: shape={}, density={:.3f}'
.
format
(
s
.
shape
,
s
.
density
))
sparse
.
save_npz
(
dir_path
+
's.npz'
,
s
)
np
.
savetxt
(
dir_path
+
's.feature_names.txt'
,
s_feature_names
,
'"%s"'
)
with
open
(
dir_path
+
's.feature_aliases.yml'
,
'w'
)
as
f
:
yaml
.
dump
(
s_feature_aliases
,
f
,
default_flow_style
=
False
)
with
open
(
dir_path
+
's.feature_names.json'
,
'w'
)
as
f
:
json
.
dump
(
list
(
s_feature_names
),
f
,
sort_keys
=
True
)
with
open
(
dir_path
+
's.feature_aliases.json'
,
'w'
)
as
f
:
json
.
dump
(
s_feature_aliases
,
f
,
sort_keys
=
True
)
print
(
'Total time: %f seconds'
%
(
time
.
time
()
-
start_time
))
print
(
''
,
flush
=
True
)
return
s
,
s_feature_names
,
s_feature_aliases
def
transform
_time_dependent
(
df_data_time_series
,
args
):
def
process
_time_dependent
(
df_data_time_series
,
args
):
data_path
=
args
.
data_path
theta_2
=
args
.
theta_2
print_header
(
'2
.2
) Transform time-dependent data'
,
char
=
'-'
)
print_header
(
'2
-B
) Transform time-dependent data'
,
char
=
'-'
)
dir_path
=
data_path
+
'/'
start_time
=
time
.
time
()
## Create NxLxD^ table
df_time_series
,
dtypes_time_series
=
process
_time_series_table
(
df_data_time_series
,
args
)
df_time_series
,
dtypes_time_series
=
transform
_time_series_table
(
df_data_time_series
,
args
)
print
(
'Time elapsed: %f seconds'
%
(
time
.
time
()
-
start_time
))
## Map variables to features
X_all
,
X_all_feature_names
=
map_time_series_features
(
df_time_series
,
dtypes_time_series
,
args
)
sparse
.
save_npz
(
dir_path
+
'X_all.npz'
,
X_all
)
np
.
savetxt
(
dir_path
+
'X_all.feature_names.txt'
,
X_all_feature_names
,
'"%s"'
)
with
open
(
dir_path
+
'X_all.feature_names.json'
,
'w'
)
as
f
:
json
.
dump
(
list
(
X_all_feature_names
),
f
,
sort_keys
=
True
)
print
(
'Time elapsed: %f seconds'
%
(
time
.
time
()
-
start_time
))
## Filter features
print_header
(
'3
.2
) Post-filter time-dependent data'
,
char
=
'-'
)
print_header
(
'3
-B
) Post-filter time-dependent data'
,
char
=
'-'
)
print
(
X_all
.
shape
,
X_all
.
density
)
X
,
X_feature_names
,
X_feature_aliases
=
post_filter_time_series
(
X_all
,
X_all_feature_names
,
theta_2
,
args
)
print
(
X
.
shape
,
X
.
density
)
...
...
@@ -175,9 +180,10 @@ def transform_time_dependent(df_data_time_series, args):
print
(
'Output'
)
print
(
'X: shape={}, density={:.3f}'
.
format
(
X
.
shape
,
X
.
density
))
sparse
.
save_npz
(
dir_path
+
'X.npz'
,
X
)
np
.
savetxt
(
dir_path
+
'X.feature_names.txt'
,
X_feature_names
,
'"%s"'
)
with
open
(
dir_path
+
'X.feature_aliases.yml'
,
'w'
)
as
f
:
yaml
.
dump
(
X_feature_aliases
,
f
,
default_flow_style
=
False
)
# Time-dependent feature names go to an X-prefixed file. Writing them to
# 's.feature_names.json' would overwrite the time-invariant feature names
# saved under the same dir_path.
with open(dir_path + 'X.feature_names.json', 'w') as f:
    json.dump(list(X_feature_names), f, sort_keys=True)
with
open
(
dir_path
+
'X.feature_aliases.json'
,
'w'
)
as
f
:
json
.
dump
(
X_feature_aliases
,
f
,
sort_keys
=
True
)
print
(
'Total time: %f seconds'
%
(
time
.
time
()
-
start_time
))
print
(
''
,
flush
=
True
)
...
...
@@ -187,7 +193,7 @@ def transform_time_dependent(df_data_time_series, args):
######
# Time-invariant routines
######
def
process
_time_invariant_table
(
df_in
,
df_population
):
def
transform
_time_invariant_table
(
df_in
,
df_population
):
df_in
=
df_in
.
copy
()
# Recorded Value (np.nan if not recorded)
...
...
@@ -296,7 +302,7 @@ def func_encode_single_time_series(i, g, variables, variables_num_freq, T, dt, s
raise
Exception
(
i
)
return
i
,
df_out
def
process
_time_series_table
(
df_in
,
args
):
def
transform
_time_series_table
(
df_in
,
args
):
data_path
=
args
.
data_path
theta_freq
=
args
.
theta_freq
stats_functions
=
args
.
stats_functions
...
...
@@ -389,6 +395,7 @@ def process_time_series_table(df_in, args):
df_time_series
=
pd
.
DataFrame
(
data
=
time_series
,
index
=
index
,
columns
=
columns
)
# Print metadata
print
(
'DONE: Transforming each example...'
)
## Freq: Count missing entries using mask
ts_mask
=
df_time_series
[[
col
for
col
in
df_time_series
if
col
.
endswith
(
'_mask'
)]]
ts_mask
.
columns
=
[
col
.
replace
(
'_mask'
,
''
)
for
col
in
ts_mask
.
columns
]
...
...
@@ -404,14 +411,14 @@ def process_time_series_table(df_in, args):
imputed
=
(
1
-
ts_mask
).
astype
(
bool
)
&
(
ts_delta_time
>
0
)
print
(
'(freq) number of imputed entries :
\t
'
,
'{}'
.
format
(
imputed
.
sum
().
sum
(),
ts_delta_time
.
size
))
print
(
imputed
.
sum
().
re
set_index
().
to_string
(
header
=
None
,
index
=
None
)
)
imputed
.
sum
().
re
name
(
'count'
).
to_csv
(
data_path
+
'/'
+
'freq_imputed.csv'
)
not_imputed
=
(
1
-
ts_mask
).
astype
(
bool
)
&
(
ts_delta_time
==
0
)
print
(
'(freq) number of not imputed entries :
\t
'
,
'{}'
.
format
(
not_imputed
.
sum
().
sum
(),
ts_delta_time
.
size
))
print
(
not_imputed
.
sum
().
re
set_index
().
to_string
(
header
=
None
,
index
=
None
)
)
not_imputed
.
sum
().
re
name
(
'count'
).
to_csv
(
data_path
+
'/'
+
'freq_not_imputed.csv'
)
## Non-Freq: Count mis
i
sng entries
## Non-Freq: Count miss
i
ng entries
non_freq_cols
=
sorted
([
c
+
'_value'
for
c
in
set
(
variables
)
-
set
(
variables_num_freq
)])
non_freqs
=
df_time_series
[
non_freq_cols
]
print
(
'(non-freq) number of missing entries :
\t
'
,
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment