Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
MLD3
FIDDLE
Commits
e3fc4342
Commit
e3fc4342
authored
Jun 20, 2019
by
Shengpu Tang (tangsp)
Browse files
experiments: data extraction
parent
f4a1c687
Changes
16
Hide whitespace changes
Inline
Side-by-side
mimic3_experiments/1_data_extraction/InclusionExclusion.ipynb
0 → 100644
View file @
e3fc4342
%% Cell type:code id: tags:
```
python
import
numpy
as
np
import
pandas
as
pd
import
os
,
sys
,
time
from
datetime
import
datetime
,
timedelta
import
pickle
from
collections
import
Counter
```
%% Cell type:code id: tags:
```
python
import yaml

# Load the project configuration. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
with open('../config.yaml') as f:
    config = yaml.safe_load(f)
data_path = config['data_path']
mimic3_path = config['mimic3_path']

import pathlib

# Create <data_path>/population (and any missing parents) so that the
# later cells can write their outputs there; a no-op if it already exists.
pathlib.Path(data_path, 'population').mkdir(parents=True, exist_ok=True)
```
%% Cell type:code id: tags:
```
python
# Raw MIMIC-III tables: only the columns needed below are read.
patients = pd.read_csv(
    mimic3_path + 'PATIENTS.csv',
    parse_dates=['DOB', 'DOD'],
    usecols=['SUBJECT_ID', 'DOB', 'DOD'],
)
admissions = pd.read_csv(
    mimic3_path + 'ADMISSIONS.csv',
    parse_dates=['DEATHTIME'],
    usecols=['SUBJECT_ID', 'HADM_ID', 'DEATHTIME', 'HOSPITAL_EXPIRE_FLAG'],
)

# ICU stays (only Metavision), ordered by ICUSTAY_ID.
examples = pd.read_csv(
    data_path + 'prep/icustays_MV.csv',
    parse_dates=['INTIME', 'OUTTIME'],
).sort_values(by='ICUSTAY_ID')

# Attach patient-level and admission-level columns to every ICU stay.
examples = examples.merge(patients, on='SUBJECT_ID', how='left')
examples = examples.merge(admissions, on=['SUBJECT_ID', 'HADM_ID'], how='left')

# Age in years at ICU admission, computed row by row.
# NOTE(review): the row-wise apply is presumably deliberate (a vectorised
# INTIME - DOB may overflow for heavily shifted birth dates) - confirm
# before vectorising.
examples['AGE'] = examples.apply(
    lambda row: (row['INTIME'] - row['DOB']).total_seconds(),
    axis=1,
) / 3600 / 24 / 365.25

# Convert to hours
examples['LOS'] *= 24
```
%% Cell type:code id: tags:
```
python
# Outcomes handled by this notebook, and for each one the label table
# read from <data_path>/labels/<task>.csv.
tasks = ['ARF', 'Shock']
label_defs = {
    name: pd.read_csv(data_path + 'labels/{}.csv'.format(name))
    for name in tasks
}
```
%% Cell type:code id: tags:
```
python
# Start: size of the source population = number of distinct ICU stay ids.
# dropna=False mirrors len(unique()), which counts a missing id as a value.
N = examples['ICUSTAY_ID'].nunique(dropna=False)
print('Source population', N)
```
%%%% Output: stream
Source population 23620
%% Cell type:code id: tags:
```
python
# Sanity checks on the source table: every stay must end no earlier than it
# starts, and every row must come from the Metavision system.
assert examples['INTIME'].le(examples['OUTTIME']).all()
assert examples['DBSOURCE'].eq('metavision').all()
```
%% Cell type:code id: tags:
```
python
# Remove non-adults: keep stays whose age lies in [min_age, max_age].
min_age = 18
max_age = np.inf  # no max age
examples = examples[examples['AGE'].between(min_age, max_age)]
print('Exclude non-adults', examples['ICUSTAY_ID'].nunique())

# Snapshot of the adult cohort; the prediction-time loops filter from this.
examples_ = examples
```
%%%% Output: stream
Exclude non-adults 23593
%% Cell type:code id: tags:
```
python
# Build the study population for every onset task at each prediction time T
# (hours). For each (task, T) pair this writes, under <data_path>/population/:
#   {task}_{T}h.csv       ICUSTAY_ID, {task}_ONSET_HOUR and {task}_LABEL
#   mask_{task}_{T}h.npy  boolean mask of length N marking the kept stays
#   mask_{task}_{T}h.txt  the same mask written as 0/1 text
for T in [4, 12]:
    print('======')
    print('prediction time', T, 'hour')

    # Remove died before cutoff hour (stays without a recorded death are kept)
    survived = (examples_.DEATHTIME >= examples_.INTIME + timedelta(hours=T)) | (examples_.DEATHTIME.isnull())
    examples = examples_[survived]
    print('Exclude deaths', examples['ICUSTAY_ID'].nunique())

    # Remove LOS < cutoff hour
    examples = examples[examples['LOS'] >= T]
    print('Exclude discharges', examples['ICUSTAY_ID'].nunique())

    # Re-created for every T, so after the loop it holds the last T only.
    populations = {}

    # Remove event onset before (cutoff)
    for task in tasks:
        print('---')
        print('Outcome', task)
        label_def = label_defs[task]
        onset_col = '{}_ONSET_HOUR'.format(task)
        label_col = '{}_LABEL'.format(task)

        # reset_index()/set_index('index') round-trip is needed to preserve
        # the original row index of `examples` across the merge.
        pop = examples[['ICUSTAY_ID']].reset_index() \
            .merge(label_def[['ICUSTAY_ID', onset_col]], on='ICUSTAY_ID', how='left') \
            .set_index('index')

        # Keep stays whose onset is at/after the cutoff or that never had an
        # onset. Copy AFTER filtering so that the label assignment below
        # writes to an independent frame (avoids SettingWithCopyWarning).
        pop = pop[(pop[onset_col] >= T) | pop[onset_col].isnull()].copy()
        pop[label_col] = pop[onset_col].notnull().astype(int)
        pop.to_csv(data_path + 'population/{}_{}h.csv'.format(task, T), index=False)

        # Construct boolean mask
        ## NOTE: uses pop.index here, assuming index is preserved and that it
        ## holds positions in [0, N). A slower alternative would look up each
        ## ICUSTAY_ID's index in `examples` individually.
        idx = pop.index
        mask_array = np.zeros(N, dtype=bool)
        mask_array[idx] = True

        # Save population boolean mask
        np.save(data_path + 'population/mask_{}_{}h.npy'.format(task, T), mask_array)
        np.savetxt(data_path + 'population/mask_{}_{}h.txt'.format(task, T), mask_array, fmt='%i')

        populations[task] = pop
        print('Exclude onset', len(pop))
```
%%%% Output: stream
======
prediction time 4 hour
Exclude deaths 23499
Exclude discharges 23401
---
Outcome ARF
Exclude onset 15873
---
Outcome Shock
Exclude onset 19342
======
prediction time 12 hour
Exclude deaths 23319
Exclude discharges 23060
---
Outcome ARF
Exclude onset 14174
---
Outcome Shock
Exclude onset 17588
%% Cell type:code id: tags:
```
python
# Build the in-hospital mortality population at a 48 hour prediction time and
# write it to <data_path>/population/mortality_48h.csv.
for T in [48]:
    print('======')
    print('prediction time', T, 'hour')

    # Remove died before cutoff hour (stays without a recorded death are kept)
    examples = examples_[(examples_.DEATHTIME >= examples_.INTIME + timedelta(hours=T)) | (examples_.DEATHTIME.isnull())]
    print('Exclude deaths', examples['ICUSTAY_ID'].nunique())

    # Remove LOS < cutoff hour. Copy so that the label column added below is
    # written to an independent frame rather than to a filtered slice of
    # `examples_` (avoids SettingWithCopyWarning).
    examples = examples[examples['LOS'] >= T].copy()
    print('Exclude discharges', examples['ICUSTAY_ID'].nunique())

    # No onset-based exclusion applies here: the label is taken directly
    # from HOSPITAL_EXPIRE_FLAG, so every remaining stay is kept.
    for task in ['mortality']:
        print('---')
        print('Outcome', task)
        label_col = '{}_LABEL'.format(task)
        examples[label_col] = examples.HOSPITAL_EXPIRE_FLAG
        pop = examples[['ICUSTAY_ID', label_col]]
        pop.to_csv(data_path + 'population/{}_{}h.csv'.format(task, T), index=False)
        print('Exclude onset', len(pop))
```
%%%% Output: stream
======
prediction time 48 hour
Exclude deaths 22776
Exclude discharges 11695
---
Outcome mortality
Exclude onset 11695
%% Cell type:code id: tags:
```
python
```
mimic3_experiments/1_data_extraction/LabelDistributions.ipynb
0 → 100644
View file @
e3fc4342
%% Cell type:code id: tags:
```
python
import
pandas
as
pd
import
numpy
as
np
import
yaml
# Read the data directory from the project configuration. The context
# manager closes the file handle instead of leaking it.
with open('../config.yaml') as f:
    data_path = yaml.full_load(f)['data_path']
from
matplotlib
import
pyplot
as
plt
import
matplotlib
```
%% Cell type:code id: tags:
```
python
# Global plot defaults for this notebook: 8x8 figures and a font size of 15.
matplotlib.rcParams.update({
    'figure.figsize': [8, 8],
    'font.size': 15,
})
```
%% Cell type:code id: tags:
```
python
def visualize_labels(df, task):
    """Plot, save and show a histogram of onset hours for one task.

    Args:
        df: DataFrame containing a '<task>_ONSET_HOUR' column.
        task: Task name used to pick the column, label the x-axis and name
            the output file 'Onset_<task>-histogram.png' (saved at 300 dpi
            in the current working directory).
    """
    onset_hours = df['{}_ONSET_HOUR'.format(task)]
    half_hour_bins = np.arange(-5, 75, 0.5)

    onset_hours.plot.hist(bins=half_hour_bins, alpha=0.9)
    plt.xlim(-4, 100)
    plt.xlabel('{} onset hour'.format(task))

    # Save before show(): showing the figure first may leave nothing to save.
    plt.savefig('Onset_{}-histogram.png'.format(task), dpi=300)
    plt.show()
```
%% Cell type:code id: tags:
```
python
# Label tables for both tasks, indexed by ICU stay id.
df_ARF = pd.read_csv(
    data_path + 'labels/ARF.csv',
    index_col='ICUSTAY_ID',
)
df_Shock = pd.read_csv(
    data_path + 'labels/Shock.csv',
    index_col='ICUSTAY_ID',
)
```
%% Cell type:code id: tags:
```
python
visualize_labels
(
df_ARF
,
'ARF'
)
visualize_labels
(
df_Shock
,
'Shock'
)
```
%%%% Output: display_data

%%%% Output: display_data

%% Cell type:markdown id: tags:
---
%% Cell type:code id: tags:
```
python
from
datetime
import
timedelta
# Prediction cutoff (hours) applied by the exclusion cells further down.
cutoff_h = 4

# Read the MIMIC-III directory from the same configuration file that
# data_path was read from earlier in this notebook ('../config.yaml'); the
# previous bare 'config.yaml' pointed at a different location. The context
# manager closes the file handle instead of leaking it.
with open('../config.yaml') as f:
    mimic3_path = yaml.full_load(f)['mimic3_path']
```
%% Cell type:code id: tags:
```
python
# Raw MIMIC-III tables: only the columns needed below are read.
patients = pd.read_csv(
    mimic3_path + 'PATIENTS.csv',
    parse_dates=['DOB', 'DOD'],
    usecols=['SUBJECT_ID', 'DOB', 'DOD'],
)
admissions = pd.read_csv(
    mimic3_path + 'ADMISSIONS.csv',
    parse_dates=['DEATHTIME'],
    usecols=['SUBJECT_ID', 'HADM_ID', 'DEATHTIME'],
)

# ICU stays (only Metavision), ordered by ICUSTAY_ID.
examples = pd.read_csv(
    data_path + 'prep/icustays_MV.csv',
    parse_dates=['INTIME', 'OUTTIME'],
).sort_values(by='ICUSTAY_ID')

# Attach patient-level and admission-level columns to every ICU stay.
examples = examples.merge(patients, on='SUBJECT_ID', how='left')
examples = examples.merge(admissions, on=['SUBJECT_ID', 'HADM_ID'], how='left')

# Age in years at ICU admission, computed row by row.
# NOTE(review): the row-wise apply is presumably deliberate (a vectorised
# INTIME - DOB may overflow for heavily shifted birth dates) - confirm
# before vectorising.
examples['AGE'] = examples.apply(
    lambda row: (row['INTIME'] - row['DOB']).total_seconds(),
    axis=1,
) / 3600 / 24 / 365.25

# Convert to hours
examples['LOS'] *= 24
```
%% Cell type:code id: tags:
```
python
# Summary statistics of LOS with the earlier "* 24" scaling undone.
examples[['LOS']].div(24.).describe()
```
%%%% Output: execute_result
LOS
count 23620.000000
mean 3.593499
std 4.971162
min 0.000400
25% 1.151600
50% 1.996050
75% 3.835000
max 101.739000
%% Cell type:code id: tags:
```
python
# Remove non-adults: keep stays whose age lies in [min_age, max_age].
min_age = 18
max_age = np.inf  # no max age
examples = examples[examples['AGE'].between(min_age, max_age)]
```
%% Cell type:code id: tags:
```
python
# Number of distinct ICU stays left after the age filter (shown as the cell output).
examples['ICUSTAY_ID'].nunique()
```
%%%% Output: execute_result
23593
%% Cell type:code id: tags:
```
python
# Preview stay id, death time and ICU admission time, skipping the first 10 rows.
examples[['ICUSTAY_ID', 'DEATHTIME', 'INTIME']].iloc[10:]
```
%%%% Output: execute_result
ICUSTAY_ID DEATHTIME INTIME
10 200038 NaT 2143-10-24 20:35:24
11 200040 NaT 2153-10-24 16:01:41
12 200049 NaT 2118-08-28 08:56:44
13 200050 NaT 2149-07-14 17:51:18
14 200053 NaT 2166-02-27 18:45:49
15 200055 NaT 2179-03-17 09:59:19
16 200061 NaT 2134-01-23 16:38:46
17 200063 NaT 2141-03-09 23:20:49
18 200067 NaT 2126-06-29 18:45:37
19 200069 NaT 2195-12-12 02:12:35
20 200072 NaT 2106-03-03 19:39:49
21 200075 NaT 2159-09-23 00:13:20
22 200079 NaT 2158-01-02 20:54:10
23 200081 NaT 2142-03-01 16:46:10
24 200087 NaT 2196-08-30 11:19:49
25 200094 NaT 2198-06-21 12:27:37
26 200095 2113-10-31 00:20:00 2113-10-27 15:23:21
27 200098 NaT 2136-03-27 11:50:32
28 200099 NaT 2163-06-21 19:49:09
29 200104 NaT 2198-02-24 12:33:19
30 200105 2104-10-23 00:01:00 2104-10-23 18:30:01
31 200108 NaT 2110-12-10 01:18:17
32 200116 NaT 2198-03-19 20:16:11
33 200126 NaT 2126-12-14 11:29:28
34 200131 NaT 2176-10-30 12:05:18
35 200133 NaT 2172-02-14 14:53:10
36 200135 NaT 2145-10-12 18:46:47
37 200138 NaT 2190-07-28 19:41:12
38 200141 NaT 2143-10-15 11:24:29
39 200143 2191-04-28 14:31:00 2191-04-01 21:45:49
... ... ... ...
23590 299867 2186-08-02 02:31:00 2186-07-15 16:55:40
23591 299871 NaT 2170-02-02 22:33:55
23592 299872 NaT 2192-02-22 14:52:52
23593 299875 NaT 2155-06-05 15:59:09
23594 299879 NaT 2174-08-15 09:23:17
23595 299880 NaT 2159-06-02 20:04:41
23596 299883 NaT 2165-05-24 17:07:19
23597 299889 2172-06-09 04:30:00 2172-05-30 09:05:45
23598 299898 NaT 2169-09-16 04:29:52
23599 299901 NaT 2186-11-26 17:23:47
23600 299904 NaT 2187-01-21 00:25:29
23601 299909 NaT 2182-04-25 04:08:35
23602 299913 NaT 2121-07-21 02:08:50
23603 299914 NaT 2141-08-16 23:08:56
23604 299921 NaT 2103-05-08 09:43:51
23605 299923 2158-12-13 17:36:00 2158-12-13 13:14:38
23606 299928 NaT 2137-11-06 17:23:04
23607 299929 NaT 2145-01-14 22:33:54
23608 299930 NaT 2194-07-04 12:50:24
23609 299933 NaT 2105-08-03 14:40:37
23610 299943 NaT 2151-08-25 03:29:06
23611 299947 NaT 2116-11-27 21:10:23
23612 299948 NaT 2119-05-25 13:47:31
23613 299949 NaT 2118-07-16 05:42:45
23614 299950 NaT 2122-06-20 13:25:29
23615 299956 NaT 2177-05-29 07:38:54
23616 299957 NaT 2132-10-13 10:41:17
23617 299962 NaT 2195-11-25 19:04:06
23618 299979 NaT 2127-12-05 02:16:42
23619 299998 NaT 2181-07-05 18:47:40
[23583 rows x 3 columns]
%% Cell type:code id: tags:
```
python
# Death time < intime: number of stays whose recorded death precedes ICU
# admission. Rows without a death time compare as False and are not counted.
int((examples['DEATHTIME'] - examples['INTIME']).dt.total_seconds().lt(0).sum())
```
%%%% Output: execute_result
27
%% Cell type:code id: tags:
```
python
# LOS vs OUTTIME - INTIME: distribution of the difference (in hours) between
# the stay duration derived from the timestamps and the LOS column.
(examples['OUTTIME'] - examples['INTIME']) \
    .dt.total_seconds() \
    .div(3600) \
    .sub(examples['LOS']) \
    .describe()
```
%%%% Output: execute_result
count 2.359300e+04
mean -7.175857e-06
std 6.949819e-04
min -1.200000e-03
25% -6.111111e-04
50% -7.105427e-15
75% 5.888889e-04
max 1.188889e-03
dtype: float64
%% Cell type:code id: tags:
```
python
# Hours from ICU admission to death (NaN where no death time is recorded),
# computed once and reused for both the histogram and the annotation.
survival_h = (examples['DEATHTIME'] - examples['INTIME']).dt.total_seconds() / 3600

survival_h.hist(bins=np.arange(-5, 200, 1), alpha=0.9)
plt.xlabel('Survival time (h)')
plt.ylabel('Number of ICU stays')

# Annotate the plot with the counts inside and outside the plotted range.
annotation = 'Death <= 200h: {}\nDeath > 200h: {} (not shown)\nDeath = null: {} (not shown)'.format(
    (survival_h <= 200).sum(),
    (survival_h > 200).sum(),
    examples['DEATHTIME'].isnull().sum(),
)
plt.text(80, 30, annotation)
plt.show()
()
```
%%%% Output: display_data

%% Cell type:code id: tags:
```
python
# Number of distinct ICU stays currently in `examples` (shown as the cell output).
examples['ICUSTAY_ID'].nunique()
```
%%%% Output: execute_result
23593
%% Cell type:code id: tags:
```
python
# Stays with LOS at or below 200 vs. above 200 (LOS was multiplied by 24
# when the table was loaded), shown as a (<=200, >200) tuple of plain ints.
(
    int(examples['LOS'].le(200).sum()),
    int(examples['LOS'].gt(200).sum()),
)
```
%%%% Output: execute_result
(21439, 2154)
%% Cell type:code id: tags:
```
python
# Histogram of LOS in 1-wide bins starting at -5; longer stays are only
# reported in the text annotation.
examples['LOS'].hist(bins=np.arange(-5, 200, 1), alpha=0.9)
plt.xlabel('Length of stay (h)')
plt.ylabel('Number of ICU stays')

annotation = 'LOS <= 200h: {}\nLOS > 200h: {} (not shown)'.format(
    (examples['LOS'] <= 200).sum(),
    (examples['LOS'] > 200).sum(),
)
plt.text(80, 500, annotation)
plt.show()
()
```
%%%% Output: display_data

%% Cell type:code id: tags:
```
python
# Remove died before cutoff hour: keep stays with no recorded death, or
# whose death occurs at least cutoff_h hours after ICU admission.
examples = examples[
    examples.DEATHTIME.isnull()
    | (examples.DEATHTIME >= examples.INTIME + timedelta(hours=cutoff_h))
]
```
%% Cell type:code id: tags:
```
python
# Remove LOS < cutoff hour: keep stays lasting at least cutoff_h.
examples = examples[examples['LOS'].ge(cutoff_h)]
```
%% Cell type:code id: tags:
```
python
```
mimic3_experiments/1_data_extraction/PopulationSummary.ipynb
0 → 100644
View file @
e3fc4342
%% Cell type:code id: tags:
```
python
import
yaml
# Load the project configuration and pull out the two directories used by
# this notebook (looked up in the same order as before).
with open('../config.yaml') as f:
    config = yaml.full_load(f)
data_path, mimic3_path = (config[key] for key in ('data_path', 'mimic3_path'))
import
pandas
as
pd
import
itertools
from
collections
import
Counter
```
%% Cell type:code id: tags:
```
python
# Partition assignment of every ICU stay, indexed by stay id.
icustays = pd.read_csv(data_path + 'prep/icustays_MV.csv')
partition = icustays.set_index('ICUSTAY_ID')[['partition']]

# Label tables for every (task, prediction time) pair, keyed by that pair and
# indexed by stay id so they can be joined with `partition`.
tasks = ['ARF', 'Shock']
Ts = [4, 12]
populations = {}
for task, T in itertools.product(tasks, Ts):
    pop = pd.read_csv(data_path + 'population/{}_{}h.csv'.format(task, T))
    populations[task, T] = pop.set_index('ICUSTAY_ID')[['{}_LABEL'.format(task)]]

# The mortality population comes from a fixed file name (no placeholders, so
# the former .format('mortality', 48) call was a no-op and has been dropped).
# NOTE(review): this file is indexed by 'ID' rather than 'ICUSTAY_ID' and is
# presumably produced outside this notebook - confirm the ids match.
populations['mortality', 48] = pd.read_csv(data_path + 'population/pop.mortality_benchmark.csv') \
    .set_index('ID')[['{}_LABEL'.format('mortality')]]
```
%% Cell type:code id: tags:
```
python
# One summary row per (task, T): total size and mean label value, followed by
# the size and mean label value of each of the train/val/test partitions.
df_out = []
for (task, T), labels in populations.items():
    df = labels.join(partition)
    label_col = '{}_LABEL'.format(task)

    c = Counter(df['partition'])
    frac = df.groupby('partition').mean()[label_col]

    row = [task, T, len(df), df[label_col].mean()]
    for split in ('train', 'val', 'test'):
        row.extend((c[split], frac[split]))
    df_out.append(row)
```
%% Cell type:code id: tags:
```
python
# Turn the collected rows into a table; the column order matches the order in
# which each row's values were appended.
df_out = pd.DataFrame(
    data=df_out,
    columns=[
        'task', 'T',
        'TOTAL_N', 'TOTAL_%',
        'train_N', 'train_%',
        'val_N', 'val_%',
        'test_N', 'test_%',
    ],
)
```
%% Cell type:code id: tags: