Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
MLD3
FIDDLE
Commits
f3a7ee10
Commit
f3a7ee10
authored
May 07, 2020
by
Shengpu Tang (tangsp)
Browse files
Ordinal encoding
parent
b65693ab
Changes
4
Hide whitespace changes
Inline
Side-by-side
FIDDLE/config.py
View file @
f3a7ee10
...
...
@@ -7,6 +7,7 @@ var_col = config['column_names']['var_name']
val_col
=
config
[
'column_names'
][
'var_value'
]
t_col
=
config
[
'column_names'
][
't'
]
use_ordinal_encoding
=
config
[
'use_ordinal_encoding'
]
value_type_override
=
config
[
'value_types'
]
parallel
=
True
...
...
FIDDLE/config.yaml
View file @
f3a7ee10
...
...
@@ -5,6 +5,8 @@ column_names:
var_name
:
variable_name
var_value
:
variable_value
use_ordinal_encoding
:
no
value_types
:
# enter the feature type that you would like to override in the following format:
FIRST_WARDID
:
Categorical
...
...
FIDDLE/helpers.py
View file @
f3a7ee10
...
...
@@ -80,20 +80,26 @@ def select_dtype(df, dtype, dtypes=None):
assert
False
return
def smart_qcut_dummify(x, q, use_ordinal_encoding=False):
    """Turn one mixed-type column into a frame of indicator (dummy) columns.

    Every element of ``x`` is first passed through ``make_float`` (defined
    elsewhere in this module; presumably it converts numeric-looking values
    to float and leaves other values unchanged -- confirm against its
    definition). Elements for which ``np.isreal`` is true are treated as
    numeric; everything else is kept as a category.

    Parameters
    ----------
    x : pandas.Series
        Column to encode. It is not modified; a copy is used internally.
    q : int or array-like of float
        Quantile specification, as accepted by ``pandas.qcut``: either the
        number of quantile bins or a sequence of quantile fractions in
        ``[0, 1]``.
    use_ordinal_encoding : bool, default False
        If False, numeric values are replaced by their ``pandas.qcut``
        interval and the whole column is one-hot encoded.
        If True, one cumulative column named ``'<name>>=<edge>'`` is
        produced per lower bin edge, set to 1 for numeric values that are
        greater than or equal to that edge, followed by one-hot columns for
        the non-numeric values.

    Returns
    -------
    pandas.DataFrame
        Indicator columns indexed like ``x``.
    """
    # ignore strings when performing qcut
    z = x.copy()
    z = z.apply(make_float)
    m = z.apply(np.isreal)

    if z.loc[m].dropna().nunique() > 1:  # when more than one numeric values
        if use_ordinal_encoding:
            # Derive percentile cut points from ``q`` so that both encodings
            # use the same quantiles (q=5 -> 0, 20, 40, 60, 80, 100).
            if np.isscalar(q):
                percentiles = np.linspace(0, 100, int(q) + 1)
            else:
                percentiles = np.asarray(q, dtype=float) * 100

            numeric = z.loc[m].astype(float)
            bin_edges = np.nanpercentile(numeric.to_numpy(), percentiles)
            # Repeated edges would create duplicate column names.
            bin_edges = np.unique(bin_edges)
            lower_edges = bin_edges[:-1]

            col_names = ['{}>={}'.format(z.name, bin_edge) for bin_edge in lower_edges]
            out = pd.DataFrame(0, z.index, col_names)
            for col_name, bin_edge in zip(col_names, lower_edges):
                # The comparison must be ``>=`` to agree with the column
                # label; NaN compares False and therefore stays 0.
                out.loc[m, col_name] = (numeric >= bin_edge).astype(int)

            # Non-numeric entries keep ordinary one-hot columns; numeric
            # entries are masked to NaN so they add no dummy columns here.
            out = pd.concat(
                [out, pd.get_dummies(z.where(~m, np.nan), prefix=z.name)],
                axis=1,
            )
        else:
            z.loc[m] = pd.qcut(z.loc[m].to_numpy(), q=q, duplicates='drop')
            out = pd.get_dummies(z, prefix=z.name)
    else:
        # At most one distinct numeric value: nothing to bin, so one-hot
        # encode the raw values as they are.
        out = pd.get_dummies(x, prefix=x.name)
    return out
def
smart_dummify_impute
(
x
):
x
=
x
.
copy
()
...
...
FIDDLE/steps.py
View file @
f3a7ee10
...
...
@@ -107,6 +107,7 @@ def split_by_timestamp_type(df):
print
(
'# rows (time-dependent):'
,
len
(
df_time_series
))
return
df_time_invariant
,
df_time_series
def
process_time_invariant
(
df_data_time_invariant
,
args
):
data_path
=
args
.
data_path
df_population
=
args
.
df_population
...
...
@@ -121,7 +122,7 @@ def process_time_invariant(df_data_time_invariant, args):
print
(
'Time elapsed: %f seconds'
%
(
time
.
time
()
-
start_time
))
## Discretize
s_all
,
s_all_feature_names
=
map_time_invariant_features
(
df_time_invariant
,
args
.
binarize
)
s_all
,
s_all_feature_names
=
map_time_invariant_features
(
df_time_invariant
,
args
)
sparse
.
save_npz
(
dir_path
+
's_all.npz'
,
s_all
)
with
open
(
dir_path
+
's_all.feature_names.json'
,
'w'
)
as
f
:
json
.
dump
(
list
(
s_all_feature_names
),
f
,
sort_keys
=
True
)
...
...
@@ -205,16 +206,11 @@ def transform_time_invariant_table(df_in, df_population):
print
(
'number of missing entries :
\t
'
,
'{} out of {} total'
.
format
(
df_value
.
isna
().
sum
().
sum
(),
df_value
.
size
))
return
df_value
def
map_time_invariant_features
(
df
,
bin_numeric
=
True
):
def
map_time_invariant_features
(
df
,
args
):
# Categorical -> binary features
# Numeric -> binary/float-valued features
if
bin_numeric
:
# df_mixed = df.apply(smart_qcut, q=5)
# features_mixed = pd.get_dummies(df_mixed, columns=df_mixed.columns, prefix_sep=':')
# time_invariant_features = features_mixed
# assert time_invariant_features.astype(int).dtypes.nunique() == 1
out
=
[
smart_qcut_dummify
(
df
[
col
],
q
=
5
)
for
col
in
df
.
columns
]
if
args
.
binarize
:
out
=
[
smart_qcut_dummify
(
df
[
col
],
q
=
5
,
use_ordinal_encoding
=
use_ordinal_encoding
)
for
col
in
df
.
columns
]
time_invariant_features
=
pd
.
concat
(
out
,
axis
=
1
)
feature_names_all
=
time_invariant_features
.
columns
.
values
sdf
=
time_invariant_features
.
astype
(
pd
.
SparseDtype
(
int
,
fill_value
=
0
))
...
...
@@ -451,10 +447,10 @@ def map_time_series_features(df_time_series, dtypes, args):
print
(
' Converting variables to binary features'
)
if
parallel
:
out
=
Parallel
(
n_jobs
=
n_jobs
,
verbose
=
10
)(
# Need to share global variables
delayed
(
smart_qcut_dummify
)(
col_data
,
q
=
5
)
for
col_data
in
ts_mixed_cols
delayed
(
smart_qcut_dummify
)(
col_data
,
q
=
5
,
use_ordinal_encoding
=
use_ordinal_encoding
)
for
col_data
in
ts_mixed_cols
)
else
:
out
=
[
smart_qcut_dummify
(
col_data
,
q
=
5
)
for
col_data
in
tqdm
(
ts_mixed_cols
)]
out
=
[
smart_qcut_dummify
(
col_data
,
q
=
5
,
use_ordinal_encoding
=
use_ordinal_encoding
)
for
col_data
in
tqdm
(
ts_mixed_cols
)]
else
:
dtype
=
float
df
=
ts_mixed
.
copy
()
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment