Add all files

fd3064e9 · Shengpu Tang (tangsp) · fe6333fb · fd3064e9 · fd3064e9 · fd3064e9
Commit fd3064e9 authored 5 years ago by Shengpu Tang (tangsp)
--- a/1_Extract_Features.ipynb
+++ b/1_Extract_Features.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import scipy.stats\n",
+    "import pickle, os, time\n",
+    "import itertools\n",
+    "from datetime import datetime, timedelta\n",
+    "from collections import Counter, defaultdict, namedtuple\n",
+    "from PIL import Image\n",
+    "import yaml\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "import seaborn as sns\n",
+    "from matplotlib import pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_dir = './data/'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the population, labels, and baseline features\n",
+    "pop = pd.read_csv(data_dir + 'population/d10_with_vitals.csv').set_index('BMT_ID')\n",
+    "\n",
+    "df_label_full = pop.join(pd.read_csv(data_dir + 'prep/label.csv', index_col='BMT_ID'), how='left')\n",
+    "df_static = pop.join(pd.read_csv(data_dir + 'features/static.csv', index_col='BMT_ID'), how='left')\n",
+    "df_static.index.rename('id', inplace=True)\n",
+    "\n",
+    "df_label = df_label_full['Label_GVHD']\n",
+    "df_label34 = (df_label_full['GVHD_max_grade'] >= 3).astype(int)\n",
+    "assert not df_static[df_static.isnull().any(axis=1)].any().any()\n",
+    "\n",
+    "# Load the vital sign time series\n",
+    "ts_vitals_by_bmt = pickle.load(open(data_dir + 'features/ts_vitals_by_bmt_2014_2017_MiChart.p', 'rb'))\n",
+    "ts_vitals_by_bmt = {ID: ts_vitals_by_bmt[ID] for ID in list(pop.index)}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Population size: 324\n",
+      "\n",
+      "Class balance\n",
+      "{0,1} vs. {2,3,4}: \t31.8%\n",
+      "{0,1,2} vs. {3,4}: \t13.6%\n"
+     ]
+    }
+   ],
+   "source": [
+    "print('Population size:', len(ts_vitals_by_bmt))\n",
+    "print()\n",
+    "print('Class balance')\n",
+    "print('{{0,1}} vs. {{2,3,4}}: \\t{:.1%}'.format(df_label.mean()))\n",
+    "print('{{0,1,2}} vs. {{3,4}}: \\t{:.1%}'.format(df_label34.mean()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Feature Extraction: 100%|██████████| 278/278 [00:01<00:00, 263.91it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Extract vital sign features\n",
+    "variables = ['HR', 'RR', 'SysBP', 'DiaBP', 'Temp', 'SpO2']\n",
+    "t0, T = 0, 10\n",
+    "dt = 1\n",
+    "\n",
+    "import tsfresh\n",
+    "\n",
+    "def get_trend_features(t0, T, dt):\n",
+    "    # Recursively summarizing\n",
+    "    # First computes summary for every daily window (dt)\n",
+    "    # Then computes trend features\n",
+    "    \n",
+    "    # Daily vitals summary statistics\n",
+    "    D_tmp = {}\n",
+    "    for key, df in ts_vitals_by_bmt.items():\n",
+    "        df = df[(t0 <= df['t']) & (df['t'] < T)].set_index('t').copy()\n",
+    "        df = df.rename(columns={v: '{}_dt={}'.format(v, dt) for v in variables})\n",
+    "        df = df.groupby(\n",
+    "            pd.cut(df.index, np.arange(t0, T+dt, dt), right=False)\n",
+    "        ).agg([\n",
+    "            'mean', 'std',\n",
+    "            'min', 'max',\n",
+    "        ])\n",
+    "        df.index.rename('t', inplace=True)\n",
+    "        D_tmp[key] = df.reset_index()\n",
+    "    \n",
+    "    timeseries = pd.concat(D_tmp)\n",
+    "    timeseries.index.rename('id', level=0, inplace=True)\n",
+    "    timeseries = timeseries.sort_index()\n",
+    "    timeseries = timeseries.reset_index(level=0).set_index(['id', 't'])\n",
+    "\n",
+    "    timeseries.columns = ['_'.join(col).strip() for col in timeseries.columns.values]\n",
+    "    stacked_ts = timeseries.stack().copy()\n",
+    "    stacked_ts.index.rename('variable', level=-1, inplace=True)\n",
+    "    stacked_ts.rename('value', inplace=True)\n",
+    "    stacked_ts = stacked_ts.reset_index()\n",
+    "\n",
+    "    assert not pd.isnull(stacked_ts['value']).any()\n",
+    "    \n",
+    "    feature_params = {\n",
+    "        'mean': None,\n",
+    "        'linear_trend': [{'attr': 'slope'}],\n",
+    "        'sample_entropy': None,\n",
+    "        'fft_coefficient': [\n",
+    "            {'coeff': 1, 'attr': 'abs'},\n",
+    "            {'coeff': 1, 'attr': 'angle'},\n",
+    "        ],\n",
+    "    }\n",
+    "    extracted_features = tsfresh.extract_features(\n",
+    "        stacked_ts, column_id='id', column_sort='t', column_kind='variable', column_value='value',\n",
+    "        default_fc_parameters=feature_params,\n",
+    "    )\n",
+    "    \n",
+    "    return extracted_features\n",
+    "\n",
+    "\n",
+    "extracted_features = get_trend_features(t0, T, dt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "extracted_features.to_csv('data/ts_features.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(324, 652)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Bin values by quintiles\n",
+    "df_features = df_static.join(\n",
+    "    pd.get_dummies(extracted_features.apply(pd.qcut, q=5, duplicates='drop'), prefix_sep='_')\n",
+    ")\n",
+    "print(df_features.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Age_(-0.001, 18.0]</th>\n",
+       "      <th>Age_(18.0, 45.0]</th>\n",
+       "      <th>Age_(45.0, 75.0]</th>\n",
+       "      <th>Disease Code category_Malignant</th>\n",
+       "      <th>Disease Code category_Non-malignant</th>\n",
+       "      <th>Disease Risk_0 - Non-malignant</th>\n",
+       "      <th>Disease Risk_1 - Low</th>\n",
+       "      <th>Disease Risk_2 - Intermediate</th>\n",
+       "      <th>Disease Risk_3 - High</th>\n",
+       "      <th>Intensity_0 - Full</th>\n",
+       "      <th>...</th>\n",
+       "      <th>Temp_dt=1_std__mean_(0.0977, 0.157]</th>\n",
+       "      <th>Temp_dt=1_std__mean_(0.157, 0.194]</th>\n",
+       "      <th>Temp_dt=1_std__mean_(0.194, 0.235]</th>\n",
+       "      <th>Temp_dt=1_std__mean_(0.235, 0.292]</th>\n",
+       "      <th>Temp_dt=1_std__mean_(0.292, 0.564]</th>\n",
+       "      <th>Temp_dt=1_std__sample_entropy_(0.67, 1.861]</th>\n",
+       "      <th>Temp_dt=1_std__sample_entropy_(1.861, 2.197]</th>\n",
+       "      <th>Temp_dt=1_std__sample_entropy_(2.197, 2.42]</th>\n",
+       "      <th>Temp_dt=1_std__sample_entropy_(2.42, 2.708]</th>\n",
+       "      <th>Temp_dt=1_std__sample_entropy_(2.708, 3.807]</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>id</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>201406001</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>201406002</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>201406003</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>201406004</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>201406005</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 652 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "           Age_(-0.001, 18.0]  Age_(18.0, 45.0]  Age_(45.0, 75.0]  \\\n",
+       "id                                                                  \n",
+       "201406001                   0                 0                 1   \n",
+       "201406002                   0                 0                 1   \n",
+       "201406003                   0                 0                 1   \n",
+       "201406004                   0                 0                 1   \n",
+       "201406005                   0                 0                 1   \n",
+       "\n",
+       "           Disease Code category_Malignant  \\\n",
+       "id                                           \n",
+       "201406001                                1   \n",
+       "201406002                                1   \n",
+       "201406003                                1   \n",
+       "201406004                                1   \n",
+       "201406005                                1   \n",
+       "\n",
+       "           Disease Code category_Non-malignant  \\\n",
+       "id                                               \n",
+       "201406001                                    0   \n",
+       "201406002                                    0   \n",
+       "201406003                                    0   \n",
+       "201406004                                    0   \n",
+       "201406005                                    0   \n",
+       "\n",
+       "           Disease Risk_0 - Non-malignant  Disease Risk_1 - Low  \\\n",
+       "id                                                                \n",
+       "201406001                               0                     0   \n",
+       "201406002                               0                     0   \n",
+       "201406003                               0                     1   \n",
+       "201406004                               0                     0   \n",
+       "201406005                               0                     0   \n",
+       "\n",
+       "           Disease Risk_2 - Intermediate  Disease Risk_3 - High  \\\n",
+       "id                                                                \n",
+       "201406001                              1                      0   \n",
+       "201406002                              1                      0   \n",
+       "201406003                              0                      0   \n",
+       "201406004                              0                      1   \n",
+       "201406005                              0                      1   \n",
+       "\n",
+       "           Intensity_0 - Full                      ...                       \\\n",
+       "id                                                 ...                        \n",
+       "201406001                   1                      ...                        \n",
+       "201406002                   1                      ...                        \n",
+       "201406003                   1                      ...                        \n",
+       "201406004                   0                      ...                        \n",
+       "201406005                   1                      ...                        \n",
+       "\n",
+       "           Temp_dt=1_std__mean_(0.0977, 0.157]  \\\n",
+       "id                                               \n",
+       "201406001                                    0   \n",
+       "201406002                                    0   \n",
+       "201406003                                    0   \n",
+       "201406004                                    0   \n",
+       "201406005                                    0   \n",
+       "\n",
+       "           Temp_dt=1_std__mean_(0.157, 0.194]  \\\n",
+       "id                                              \n",
+       "201406001                                   0   \n",
+       "201406002                                   0   \n",
+       "201406003                                   0   \n",
+       "201406004                                   0   \n",
+       "201406005                                   0   \n",
+       "\n",
+       "           Temp_dt=1_std__mean_(0.194, 0.235]  \\\n",
+       "id                                              \n",
+       "201406001                                   1   \n",
+       "201406002                                   0   \n",
+       "201406003                                   0   \n",
+       "201406004                                   1   \n",
+       "201406005                                   0   \n",
+       "\n",
+       "           Temp_dt=1_std__mean_(0.235, 0.292]  \\\n",
+       "id                                              \n",
+       "201406001                                   0   \n",
+       "201406002                                   1   \n",
+       "201406003                                   1   \n",
+       "201406004                                   0   \n",
+       "201406005                                   1   \n",
+       "\n",
+       "           Temp_dt=1_std__mean_(0.292, 0.564]  \\\n",
+       "id                                              \n",
+       "201406001                                   0   \n",
+       "201406002                                   0   \n",
+       "201406003                                   0   \n",
+       "201406004                                   0   \n",
+       "201406005                                   0   \n",
+       "\n",
+       "           Temp_dt=1_std__sample_entropy_(0.67, 1.861]  \\\n",
+       "id                                                       \n",
+       "201406001                                            0   \n",
+       "201406002                                            0   \n",
+       "201406003                                            0   \n",
+       "201406004                                            1   \n",
+       "201406005                                            0   \n",
+       "\n",
+       "           Temp_dt=1_std__sample_entropy_(1.861, 2.197]  \\\n",
+       "id                                                        \n",
+       "201406001                                             0   \n",
+       "201406002                                             0   \n",
+       "201406003                                             1   \n",
+       "201406004                                             0   \n",
+       "201406005                                             1   \n",
+       "\n",
+       "           Temp_dt=1_std__sample_entropy_(2.197, 2.42]  \\\n",
+       "id                                                       \n",
+       "201406001                                            1   \n",
+       "201406002                                            0   \n",
+       "201406003                                            0   \n",
+       "201406004                                            0   \n",
+       "201406005                                            0   \n",
+       "\n",
+       "           Temp_dt=1_std__sample_entropy_(2.42, 2.708]  \\\n",
+       "id                                                       \n",
+       "201406001                                            0   \n",
+       "201406002                                            1   \n",
+       "201406003                                            0   \n",
+       "201406004                                            0   \n",
+       "201406005                                            0   \n",
+       "\n",
+       "           Temp_dt=1_std__sample_entropy_(2.708, 3.807]  \n",
+       "id                                                       \n",
+       "201406001                                             0  \n",
+       "201406002                                             0  \n",
+       "201406003                                             0  \n",
+       "201406004                                             0  \n",
+       "201406005                                             0  \n",
+       "\n",
+       "[5 rows x 652 columns]"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_features.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_features.to_csv('data/df_features.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X = df_features.values\n",
+    "y = df_label.values\n",
+    "\n",
+    "# Make sure there are no nan values\n",
+    "assert not np.isnan(X).any()\n",
+    "assert not np.isnan(y).any()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y34 = df_label34.values\n",
+    "assert not np.isnan(y34).any()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "((324, 652), (324,))"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X.shape, y.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "np.savez('data/Xy.npz', X=X, y=y, y34=y34)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "gvhd_python3",
+   "language": "python",
+   "name": "gvhd_venv"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
+%% Cell type:code id: tags:
+
+``` python
+import pandas as pd
+import numpy as np
+import scipy.stats
+import pickle, os, time
+import itertools
+from datetime import datetime, timedelta
+from collections import Counter, defaultdict, namedtuple
+from PIL import Image
+import yaml
+from tqdm import tqdm
+
+import seaborn as sns
+from matplotlib import pyplot as plt
+```
+
+%% Cell type:code id: tags:
+
+``` python
+data_dir = './data/'
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# Load the population, labels, and baseline features
+pop = pd.read_csv(data_dir + 'population/d10_with_vitals.csv').set_index('BMT_ID')
+
+df_label_full = pop.join(pd.read_csv(data_dir + 'prep/label.csv', index_col='BMT_ID'), how='left')
+df_static = pop.join(pd.read_csv(data_dir + 'features/static.csv', index_col='BMT_ID'), how='left')
+df_static.index.rename('id', inplace=True)
+
+df_label = df_label_full['Label_GVHD']
+df_label34 = (df_label_full['GVHD_max_grade'] >= 3).astype(int)
+assert not df_static[df_static.isnull().any(axis=1)].any().any()
+
+# Load the vital sign time series
+ts_vitals_by_bmt = pickle.load(open(data_dir + 'features/ts_vitals_by_bmt_2014_2017_MiChart.p', 'rb'))
+ts_vitals_by_bmt = {ID: ts_vitals_by_bmt[ID] for ID in list(pop.index)}
+```
+
+%% Cell type:code id: tags:
+
+``` python
+print('Population size:', len(ts_vitals_by_bmt))
+print()
+print('Class balance')
+print('{{0,1}} vs. {{2,3,4}}: \t{:.1%}'.format(df_label.mean()))
+print('{{0,1,2}} vs. {{3,4}}: \t{:.1%}'.format(df_label34.mean()))
+```
+
+%% Output
+
+    Population size: 324
+    
+    Class balance
+    {0,1} vs. {2,3,4}: 	31.8%
+    {0,1,2} vs. {3,4}: 	13.6%
+
+%% Cell type:code id: tags:
+
+``` python
+# Extract vital sign features
+variables = ['HR', 'RR', 'SysBP', 'DiaBP', 'Temp', 'SpO2']
+t0, T = 0, 10
+dt = 1
+
+import tsfresh
+
+def get_trend_features(t0, T, dt):
+    # Recursively summarizing
+    # First computes summary for every daily window (dt)
+    # Then computes trend features
+
+    # Daily vitals summary statistics
+    D_tmp = {}
+    for key, df in ts_vitals_by_bmt.items():
+        df = df[(t0 <= df['t']) & (df['t'] < T)].set_index('t').copy()
+        df = df.rename(columns={v: '{}_dt={}'.format(v, dt) for v in variables})
+        df = df.groupby(
+            pd.cut(df.index, np.arange(t0, T+dt, dt), right=False)
+        ).agg([
+            'mean', 'std',
+            'min', 'max',
+        ])
+        df.index.rename('t', inplace=True)
+        D_tmp[key] = df.reset_index()
+
+    timeseries = pd.concat(D_tmp)
+    timeseries.index.rename('id', level=0, inplace=True)
+    timeseries = timeseries.sort_index()
+    timeseries = timeseries.reset_index(level=0).set_index(['id', 't'])
+
+    timeseries.columns = ['_'.join(col).strip() for col in timeseries.columns.values]
+    stacked_ts = timeseries.stack().copy()
+    stacked_ts.index.rename('variable', level=-1, inplace=True)
+    stacked_ts.rename('value', inplace=True)
+    stacked_ts = stacked_ts.reset_index()
+
+    assert not pd.isnull(stacked_ts['value']).any()
+
+    feature_params = {
+        'mean': None,
+        'linear_trend': [{'attr': 'slope'}],
+        'sample_entropy': None,
+        'fft_coefficient': [
+            {'coeff': 1, 'attr': 'abs'},
+            {'coeff': 1, 'attr': 'angle'},
+        ],
+    }
+    extracted_features = tsfresh.extract_features(
+        stacked_ts, column_id='id', column_sort='t', column_kind='variable', column_value='value',
+        default_fc_parameters=feature_params,
+    )
+
+    return extracted_features
+
+
+extracted_features = get_trend_features(t0, T, dt)
+```
+
+%% Output
+
+    Feature Extraction: 100%|██████████| 278/278 [00:01<00:00, 263.91it/s]
+
+%% Cell type:code id: tags:
+
+``` python
+extracted_features.to_csv('data/ts_features.csv')
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# Bin values by quintiles
+df_features = df_static.join(
+    pd.get_dummies(extracted_features.apply(pd.qcut, q=5, duplicates='drop'), prefix_sep='_')
+)
+print(df_features.shape)
+```
+
+%% Output
+
+    (324, 652)
+
+%% Cell type:code id: tags:
+
+``` python
+df_features.head()
+```
+
+%% Output
+
+               Age_(-0.001, 18.0]  Age_(18.0, 45.0]  Age_(45.0, 75.0]  \
+    id
+    201406001                   0                 0                 1
+    201406002                   0                 0                 1
+    201406003                   0                 0                 1
+    201406004                   0                 0                 1
+    201406005                   0                 0                 1
+    
+               Disease Code category_Malignant  \
+    id
+    201406001                                1
+    201406002                                1
+    201406003                                1
+    201406004                                1
+    201406005                                1
+    
+               Disease Code category_Non-malignant  \
+    id
+    201406001                                    0
+    201406002                                    0
+    201406003                                    0
+    201406004                                    0
+    201406005                                    0
+    
+               Disease Risk_0 - Non-malignant  Disease Risk_1 - Low  \
+    id
+    201406001                               0                     0
+    201406002                               0                     0
+    201406003                               0                     1
+    201406004                               0                     0
+    201406005                               0                     0
+    
+               Disease Risk_2 - Intermediate  Disease Risk_3 - High  \
+    id
+    201406001                              1                      0
+    201406002                              1                      0
+    201406003                              0                      0
+    201406004                              0                      1
+    201406005                              0                      1
+    
+               Intensity_0 - Full                      ...                       \
+    id                                                 ...
+    201406001                   1                      ...
+    201406002                   1                      ...
+    201406003                   1                      ...
+    201406004                   0                      ...
+    201406005                   1                      ...
+    
+               Temp_dt=1_std__mean_(0.0977, 0.157]  \
+    id
+    201406001                                    0
+    201406002                                    0
+    201406003                                    0
+    201406004                                    0
+    201406005                                    0
+    
+               Temp_dt=1_std__mean_(0.157, 0.194]  \
+    id
+    201406001                                   0
+    201406002                                   0
+    201406003                                   0
+    201406004                                   0
+    201406005                                   0
+    
+               Temp_dt=1_std__mean_(0.194, 0.235]  \
+    id
+    201406001                                   1
+    201406002                                   0
+    201406003                                   0
+    201406004                                   1
+    201406005                                   0
+    
+               Temp_dt=1_std__mean_(0.235, 0.292]  \
+    id
+    201406001                                   0
+    201406002                                   1
+    201406003                                   1
+    201406004                                   0
+    201406005                                   1
+    
+               Temp_dt=1_std__mean_(0.292, 0.564]  \
+    id
+    201406001                                   0
+    201406002                                   0
+    201406003                                   0
+    201406004                                   0
+    201406005                                   0
+    
+               Temp_dt=1_std__sample_entropy_(0.67, 1.861]  \
+    id
+    201406001                                            0
+    201406002                                            0
+    201406003                                            0
+    201406004                                            1
+    201406005                                            0
+    
+               Temp_dt=1_std__sample_entropy_(1.861, 2.197]  \
+    id
+    201406001                                             0
+    201406002                                             0
+    201406003                                             1
+    201406004                                             0
+    201406005                                             1
+    
+               Temp_dt=1_std__sample_entropy_(2.197, 2.42]  \
+    id
+    201406001                                            1
+    201406002                                            0
+    201406003                                            0
+    201406004                                            0
+    201406005                                            0
+    
+               Temp_dt=1_std__sample_entropy_(2.42, 2.708]  \
+    id
+    201406001                                            0
+    201406002                                            1
+    201406003                                            0
+    201406004                                            0
+    201406005                                            0
+    
+               Temp_dt=1_std__sample_entropy_(2.708, 3.807]
+    id
+    201406001                                             0
+    201406002                                             0
+    201406003                                             0
+    201406004                                             0
+    201406005                                             0
+    
+    [5 rows x 652 columns]
+
+%% Cell type:code id: tags:
+
+``` python
+df_features.to_csv('data/df_features.csv')
+```
+
+%% Cell type:code id: tags:
+
+``` python
+X = df_features.values
+y = df_label.values
+
+# Make sure there are no nan values
+assert not np.isnan(X).any()
+assert not np.isnan(y).any()
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# Second label vector, taken from df_label34; it must not contain NaN either.
+y34 = df_label34.values
+assert not np.isnan(y34).any()
+```
+
+%% Cell type:code id: tags:
+
+``` python
+X.shape, y.shape
+```
+
+%% Output
+
+    ((324, 652), (324,))
+
+%% Cell type:code id: tags:
+
+``` python
+np.savez('data/Xy.npz', X=X, y=y, y34=y34)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+```
--- a/2_ML_Models.ipynb
+++ b/2_ML_Models.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import random\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from joblib import dump, load\n",
+    "np.random.seed(42)\n",
+    "random.seed(42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_dir = './data/'\n",
+    "with np.load('data/Xy.npz') as f:\n",
+    "    X = f['X']\n",
+    "    y = f['y']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Perform temporal split of data into train/test sets\n",
+    "pop = pd.read_csv(data_dir + 'population/d10_with_vitals.csv').set_index('BMT_ID')\n",
+    "\n",
+    "split_date = 201701001\n",
+    "split_idx = -85\n",
+    "\n",
+    "assert (pop[:split_idx].index < split_date).all()\n",
+    "assert (pop[split_idx:].index >= split_date).all()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn import preprocessing, model_selection, metrics, utils\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from tqdm import tqdm\n",
+    "from joblib import Parallel, delayed\n",
+    "from sklearn.base import clone"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Specify hyperparameters and cv parameters\n",
+    "base_estimator = LogisticRegression(penalty='l2', class_weight='balanced', solver='liblinear')\n",
+    "param_grid = {\n",
+    "    'C': [10. ** n for n in range(-6, 7)],\n",
+    "    'penalty': ['l2'],\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Train model with baseline+vitals"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Xtr, Xte = X[:split_idx], X[split_idx:]\n",
+    "ytr, yte = y[:split_idx], y[split_idx:]\n",
+    "\n",
+    "cv_splits, cv_repeat = 5, 20\n",
+    "cv = model_selection.RepeatedStratifiedKFold(cv_splits, cv_repeat, random_state=0)\n",
+    "clf = model_selection.GridSearchCV(\n",
+    "    clone(base_estimator), param_grid, \n",
+    "    cv=cv, scoring='roc_auc', n_jobs=5,\n",
+    ")\n",
+    "clf.fit(Xtr, ytr)\n",
+    "test_score = metrics.roc_auc_score(yte, clf.decision_function(Xte))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                    \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Test AUC: 0.658 (0.536, 0.784)\n",
+      "Test AUC: 0.659 ± 0.063\n"
+     ]
+    }
+   ],
+   "source": [
+    "y_true = yte\n",
+    "y_score = clf.decision_function(Xte)\n",
+    "\n",
+    "def boostrap_func(i, y_true, y_score):\n",
+    "    yte_true_b, yte_pred_b = utils.resample(y_true, y_score, replace=True, random_state=i)\n",
+    "    return metrics.roc_curve(yte_true_b, yte_pred_b), metrics.roc_auc_score(yte_true_b, yte_pred_b)\n",
+    "\n",
+    "roc_curves, auc_scores = zip(*Parallel(n_jobs=4)(delayed(boostrap_func)(i, y_true, y_score) for i in tqdm(range(1000), leave=False)))\n",
+    "print('Test AUC: {:.3f} ({:.3f}, {:.3f})'.format(np.median(auc_scores), np.percentile(auc_scores, 2.5), np.percentile(auc_scores, 97.5)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['data/model_combined.joblib']"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dump(clf, 'data/model_combined.joblib') "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Train model with baseline features only"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data4/tangsp/venv/lib/python3.7/site-packages/sklearn/model_selection/_search.py:814: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.\n",
+      "  DeprecationWarning)\n"
+     ]
+    }
+   ],
+   "source": [
+    "Xtr, Xte = X[:split_idx, :52], X[split_idx:, :52]\n",
+    "ytr, yte = y[:split_idx], y[split_idx:]\n",
+    "\n",
+    "cv_splits, cv_repeat = 5, 20\n",
+    "cv = model_selection.RepeatedStratifiedKFold(cv_splits, cv_repeat, random_state=0)\n",
+    "clf = model_selection.GridSearchCV(\n",
+    "    clone(base_estimator), param_grid, \n",
+    "    cv=cv, scoring='roc_auc', n_jobs=5,\n",
+    ")\n",
+    "clf.fit(Xtr, ytr)\n",
+    "test_score = metrics.roc_auc_score(yte, clf.decision_function(Xte))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                  \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Test AUC: 0.512 (0.364, 0.643)\n"
+     ]
+    }
+   ],
+   "source": [
+    "y_true = yte\n",
+    "y_score = clf.decision_function(Xte)\n",
+    "\n",
+    "def boostrap_func(i, y_true, y_score):\n",
+    "    yte_true_b, yte_pred_b = utils.resample(y_true, y_score, replace=True, random_state=i)\n",
+    "    return metrics.roc_curve(yte_true_b, yte_pred_b), metrics.roc_auc_score(yte_true_b, yte_pred_b)\n",
+    "\n",
+    "roc_curves, auc_scores = zip(*Parallel(n_jobs=4)(delayed(boostrap_func)(i, y_true, y_score) for i in tqdm(range(1000), leave=False)))\n",
+    "print('Test AUC: {:.3f} ({:.3f}, {:.3f})'.format(np.median(auc_scores), np.percentile(auc_scores, 2.5), np.percentile(auc_scores, 97.5)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['data/model_baseline.joblib']"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dump(clf, 'data/model_baseline.joblib') "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Train model with vitals features only"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Xtr, Xte = X[:split_idx, 52:], X[split_idx:, 52:]\n",
+    "ytr, yte = y[:split_idx], y[split_idx:]\n",
+    "\n",
+    "cv_splits, cv_repeat = 5, 20\n",
+    "cv = model_selection.RepeatedStratifiedKFold(cv_splits, cv_repeat, random_state=0)\n",
+    "clf = model_selection.GridSearchCV(\n",
+    "    clone(base_estimator), param_grid, \n",
+    "    cv=cv, scoring='roc_auc', n_jobs=5,\n",
+    ")\n",
+    "clf.fit(Xtr, ytr)\n",
+    "test_score = metrics.roc_auc_score(yte, clf.decision_function(Xte))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                   \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Test AUC: 0.633 (0.507, 0.757)\n"
+     ]
+    }
+   ],
+   "source": [
+    "y_true = yte\n",
+    "y_score = clf.decision_function(Xte)\n",
+    "\n",
+    "def boostrap_func(i, y_true, y_score):\n",
+    "    yte_true_b, yte_pred_b = utils.resample(y_true, y_score, replace=True, random_state=i)\n",
+    "    return metrics.roc_curve(yte_true_b, yte_pred_b), metrics.roc_auc_score(yte_true_b, yte_pred_b)\n",
+    "\n",
+    "roc_curves, auc_scores = zip(*Parallel(n_jobs=4)(delayed(boostrap_func)(i, y_true, y_score) for i in tqdm(range(1000), leave=False)))\n",
+    "print('Test AUC: {:.3f} ({:.3f}, {:.3f})'.format(np.median(auc_scores), np.percentile(auc_scores, 2.5), np.percentile(auc_scores, 97.5)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['data/model_vitals.joblib']"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dump(clf, 'data/model_vitals.joblib') "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
+%% Cell type:code id: tags:
+
+``` python
+import random
+import numpy as np
+import pandas as pd
+from joblib import dump, load
+# Seed NumPy's global RNG and Python's `random` module with a fixed value.
+np.random.seed(42)
+random.seed(42)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+data_dir = './data/'
+with np.load('data/Xy.npz') as f:
+    X = f['X']
+    y = f['y']
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# Perform temporal split of data into train/test sets
+pop = pd.read_csv(data_dir + 'population/d10_with_vitals.csv').set_index('BMT_ID')
+
+split_date = 201701001
+split_idx = -85
+
+assert (pop[:split_idx].index < split_date).all()
+assert (pop[split_idx:].index >= split_date).all()
+```
+
+%% Cell type:code id: tags:
+
+``` python
+from sklearn import preprocessing, model_selection, metrics, utils
+from sklearn.linear_model import LogisticRegression
+from tqdm import tqdm
+from joblib import Parallel, delayed
+from sklearn.base import clone
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# Specify hyperparameters and cv parameters
+base_estimator = LogisticRegression(penalty='l2', class_weight='balanced', solver='liblinear')
+param_grid = {
+    'C': [10. ** n for n in range(-6, 7)],
+    'penalty': ['l2'],
+}
+```
+
+%% Cell type:markdown id: tags:
+
+## Train model with baseline+vitals
+
+%% Cell type:code id: tags:
+
+``` python
+Xtr, Xte = X[:split_idx], X[split_idx:]
+ytr, yte = y[:split_idx], y[split_idx:]
+
+cv_splits, cv_repeat = 5, 20
+cv = model_selection.RepeatedStratifiedKFold(cv_splits, cv_repeat, random_state=0)
+clf = model_selection.GridSearchCV(
+    clone(base_estimator), param_grid,
+    cv=cv, scoring='roc_auc', n_jobs=5,
+)
+clf.fit(Xtr, ytr)
+test_score = metrics.roc_auc_score(yte, clf.decision_function(Xte))
+```
+
+%% Cell type:code id: tags:
+
+``` python
+y_true = yte
+y_score = clf.decision_function(Xte)
+
+def boostrap_func(i, y_true, y_score):
+    yte_true_b, yte_pred_b = utils.resample(y_true, y_score, replace=True, random_state=i)
+    return metrics.roc_curve(yte_true_b, yte_pred_b), metrics.roc_auc_score(yte_true_b, yte_pred_b)
+
+roc_curves, auc_scores = zip(*Parallel(n_jobs=4)(delayed(boostrap_func)(i, y_true, y_score) for i in tqdm(range(1000), leave=False)))
+print('Test AUC: {:.3f} ({:.3f}, {:.3f})'.format(np.median(auc_scores), np.percentile(auc_scores, 2.5), np.percentile(auc_scores, 97.5)))
+```
+
+%% Output
+
+    
+
+    Test AUC: 0.658 (0.536, 0.784)
+    Test AUC: 0.659 ± 0.063
+
+%% Cell type:code id: tags:
+
+``` python
+dump(clf, 'data/model_combined.joblib')
+```
+
+%% Output
+
+    ['data/model_combined.joblib']
+
+%% Cell type:markdown id: tags:
+
+## Train model with baseline features only
+
+%% Cell type:code id: tags:
+
+``` python
+Xtr, Xte = X[:split_idx, :52], X[split_idx:, :52]
+ytr, yte = y[:split_idx], y[split_idx:]
+
+cv_splits, cv_repeat = 5, 20
+cv = model_selection.RepeatedStratifiedKFold(cv_splits, cv_repeat, random_state=0)
+clf = model_selection.GridSearchCV(
+    clone(base_estimator), param_grid,
+    cv=cv, scoring='roc_auc', n_jobs=5,
+)
+clf.fit(Xtr, ytr)
+test_score = metrics.roc_auc_score(yte, clf.decision_function(Xte))
+```
+
+%% Output
+
+    /data4/tangsp/venv/lib/python3.7/site-packages/sklearn/model_selection/_search.py:814: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
+      DeprecationWarning)
+
+%% Cell type:code id: tags:
+
+``` python
+y_true = yte
+y_score = clf.decision_function(Xte)
+
+def boostrap_func(i, y_true, y_score):
+    yte_true_b, yte_pred_b = utils.resample(y_true, y_score, replace=True, random_state=i)
+    return metrics.roc_curve(yte_true_b, yte_pred_b), metrics.roc_auc_score(yte_true_b, yte_pred_b)
+
+roc_curves, auc_scores = zip(*Parallel(n_jobs=4)(delayed(boostrap_func)(i, y_true, y_score) for i in tqdm(range(1000), leave=False)))
+print('Test AUC: {:.3f} ({:.3f}, {:.3f})'.format(np.median(auc_scores), np.percentile(auc_scores, 2.5), np.percentile(auc_scores, 97.5)))
+```
+
+%% Output
+
+    
+
+    Test AUC: 0.512 (0.364, 0.643)
+
+%% Cell type:code id: tags:
+
+``` python
+dump(clf, 'data/model_baseline.joblib')
+```
+
+%% Output
+
+    ['data/model_baseline.joblib']
+
+%% Cell type:markdown id: tags:
+
+## Train model with vitals features only
+
+%% Cell type:code id: tags:
+
+``` python
+Xtr, Xte = X[:split_idx, 52:], X[split_idx:, 52:]
+ytr, yte = y[:split_idx], y[split_idx:]
+
+cv_splits, cv_repeat = 5, 20
+cv = model_selection.RepeatedStratifiedKFold(cv_splits, cv_repeat, random_state=0)
+clf = model_selection.GridSearchCV(
+    clone(base_estimator), param_grid,
+    cv=cv, scoring='roc_auc', n_jobs=5,
+)
+clf.fit(Xtr, ytr)
+test_score = metrics.roc_auc_score(yte, clf.decision_function(Xte))
+```
+
+%% Cell type:code id: tags:
+
+``` python
+y_true = yte
+y_score = clf.decision_function(Xte)
+
+def boostrap_func(i, y_true, y_score):
+    yte_true_b, yte_pred_b = utils.resample(y_true, y_score, replace=True, random_state=i)
+    return metrics.roc_curve(yte_true_b, yte_pred_b), metrics.roc_auc_score(yte_true_b, yte_pred_b)
+
+roc_curves, auc_scores = zip(*Parallel(n_jobs=4)(delayed(boostrap_func)(i, y_true, y_score) for i in tqdm(range(1000), leave=False)))
+print('Test AUC: {:.3f} ({:.3f}, {:.3f})'.format(np.median(auc_scores), np.percentile(auc_scores, 2.5), np.percentile(auc_scores, 97.5)))
+```
+
+%% Output
+
+    
+
+    Test AUC: 0.633 (0.507, 0.757)
+
+%% Cell type:code id: tags:
+
+``` python
+dump(clf, 'data/model_vitals.joblib')
+```
+
+%% Output
+
+    ['data/model_vitals.joblib']
+
+%% Cell type:code id: tags:
+
+``` python
+```
--- a/3_Evaluations.ipynb
+++ b/3_Evaluations.ipynb
--- a/4_Feature_Analyses.ipynb
+++ b/4_Feature_Analyses.ipynb
--- a/README.md
+++ b/README.md
 # JCO CCI - aGVHD_prediction

+## Overview
+- This is the code repository for the manuscript "Predicting Acute Graft-versus-Host Disease Using Machine Learning and Vital Sign Data from Electronic Health Records".
+- Authors: Shengpu Tang, Grant Chappell, Amanda Mazzoli, Muneesh Tewari, Sung Won Choi\*, and Jenna Wiens\*
--- a/Supplemental_Analyses.ipynb
+++ b/Supplemental_Analyses.ipynb
--- a/Vitals_visualize.ipynb
+++ b/Vitals_visualize.ipynb