Commit 56f2f65b authored by Shengpu Tang (tangsp)'s avatar Shengpu Tang (tangsp)
Browse files

Hierarchical type

parent f3a7ee10
......@@ -6,6 +6,8 @@ ID_col = config['column_names']['ID']
# Names of the variable-name, variable-value and time columns, taken from
# the 'column_names' section of the config.
var_col = config['column_names']['var_name']
val_col = config['column_names']['var_value']
t_col = config['column_names']['t']
# Separator used to split a generic hierarchical value into its levels.
hierarchical_sep = config['hierarchical_sep']
# Indices of the hierarchy levels that should be encoded.
hierarchical_levels = config['hierarchical_levels']
use_ordinal_encoding = config['use_ordinal_encoding']
# User-specified value types keyed by variable name (config key 'value_types').
value_type_override = config['value_types']
......
......@@ -6,10 +6,12 @@ column_names:
var_value: variable_value
use_ordinal_encoding: no
hierarchical_sep: ":"
hierarchical_levels: [0, 1, 2]
value_types:
# enter the feature type that you would like to override in the following format:
FIRST_WARDID: Categorical
MedA:
AMOUNT: Numeric
ROUTE: Categorical
MedA_AMOUNT: Numeric
MedA_ROUTE: Categorical
ICD9_CODE: hierarchical_ICD9
......@@ -31,6 +31,62 @@ def print_header(*content, char='='):
print(char * 80, flush=True)
######
# Hierarchical value type
# - Currently supports parsing/mapping ICD9, ICD10; will add support for CPT, DRG
# - Requires the user to specify which level(s) of the hierarchy to encode
# - Each hierarchical variable is mapped to multiple rows with string values after Pre-filter
# as the first step of Transform, and will be treated as categorical variables
######
from icd9cms import icd9
import icd10
def _icd9_hierarchy(code9):
    """Return the ICD-9 path of ``code9`` as a list ending with its ``alt_code``.

    The list is ``[code9.alt_code] + code9.ancestors()`` reversed.
    NOTE(review): presumably ``ancestors()`` lists the nearest parent first,
    so the result runs from the top of the hierarchy down to the code itself;
    confirm against the icd9cms package.
    """
    return list(reversed([code9.alt_code] + code9.ancestors()))


def _icd10_hierarchy(code10):
    """Return ``[chapter, block, first 3 characters of the code, full code]``."""
    return [code10.chapter, code10.block, code10.code[:3], code10.code]


def map_icd_hierarchy(s, version=9):
    """Map an ICD code to the list of its hierarchy levels.

    Parameters
    ----------
    s : value convertible to ``str``
        The ICD code to look up. It is searched in both the ICD-9 and the
        ICD-10 catalogues.
    version : int, default 9
        Which hierarchy to produce: ``9`` or ``10``.

    Returns
    -------
    list
        For ``version=9``: the ICD-9 path built by ``_icd9_hierarchy``.
        For ``version=10``: ``[chapter, block, 3-character prefix, code]``.
        If the code is only known as ICD-9, a 9-to-10 conversion is
        attempted; when that conversion yields no ICD-10 entry (or fails),
        the ICD-9 path is returned instead.

    Raises
    ------
    Exception
        ``("Invalid ICD code", s)`` if the code is found in neither
        catalogue; ``("Invalid ICD version", s)`` if ``version`` is not 9 or
        10, or if ``version=9`` is requested for a code that is not ICD-9.
    """
    s = str(s)
    code9 = icd9.search(s)
    code10 = icd10.find(s)
    if code9 is None and code10 is None:
        raise Exception("Invalid ICD code", s)

    if version == 9:
        if code9 is None:
            raise Exception("Invalid ICD version", s)
        return _icd9_hierarchy(code9)

    if version == 10:
        if code10 is not None:
            return _icd10_hierarchy(code10)
        # Only an ICD-9 match exists here (the both-None case raised above).
        try:
            # Attempt to convert from version 9 to 10
            converted = icd10.find(convert_icd_9_to_10(code9.alt_code))
            if converted is not None:
                return _icd10_hierarchy(converted)
        except Exception:
            # `except Exception` rather than a bare `except` so that
            # KeyboardInterrupt / SystemExit still propagate.
            warnings.warn('Conversion failed: ' + str(s))
        # Fall back to version 9
        return _icd9_hierarchy(code9)

    raise Exception("Invalid ICD version", s)
import warnings
# ICD-9-CM -> ICD-10-CM lookup table. NOTE: this is fetched over the network
# when the module is imported. The file is '|'-separated with no header row;
# its three columns are named ICD9, ICD10 and Description here.
_df_icd_mapping = pd.read_csv('https://raw.githubusercontent.com/bhanratt/ICD9CMtoICD10CM/master/icd9to10dictionary.txt', sep='|', header=None, names=['ICD9', 'ICD10', 'Description'])
# Plain dict from the ICD9 column to the ICD10 column, used by convert_icd_9_to_10.
_icd_mapping_9_to_10 = dict(_df_icd_mapping[['ICD9', 'ICD10']].values)
def convert_icd_9_to_10(s):
    """Convert an ICD-9 code to an ICD-10 code using ``_icd_mapping_9_to_10``.

    The lookup tries, in order, the code itself, the ``alt_code`` of its
    ICD-9 parent, and the ``alt_code`` of its grandparent; the first truthy
    mapping wins.

    Parameters
    ----------
    s : str
        ICD-9 code, in the form used as keys of ``_icd_mapping_9_to_10``.

    Returns
    -------
    str
        The mapped ICD-10 code. If no mapping is found, or the lookup fails
        (e.g. the code or one of its parents does not exist), a warning is
        issued and ``s`` is returned unchanged.
    """
    try:
        mapped = _icd_mapping_9_to_10.get(s)
        if not mapped:
            # Search once and walk up the hierarchy; a missing node or parent
            # surfaces as an exception handled below.
            parent = icd9.search(s).parent
            mapped = (
                _icd_mapping_9_to_10.get(parent.alt_code)
                or _icd_mapping_9_to_10.get(parent.parent.alt_code)
            )
    except Exception:
        # `except Exception` rather than a bare `except` so that
        # KeyboardInterrupt / SystemExit still propagate.
        warnings.warn('Conversion failed: ' + str(s))
        return s
    if not mapped:
        # Previously this case produced the literal string 'None'; treat it
        # like any other failed conversion instead.
        warnings.warn('Conversion failed: ' + str(s))
        return s
    return str(mapped)
######
# Transform
######
......
......@@ -100,7 +100,7 @@ if args.prefilter:
df_data.to_csv(data_path + 'pre-filtered.csv', index=False)
print_header('2) Transform; 3) Post-filter')
df_data, df_types = detect_variable_data_type(df_data, value_type_override, args)
df_data, df_types = parse_variable_data_type(df_data, value_type_override, args)
df_time_invariant, df_time_series = split_by_timestamp_type(df_data)
# Process time-invariant data
......
......@@ -21,12 +21,15 @@ def pre_filter(df, threshold, df_population, args):
print('Remove rows with t outside of [0, {}]'.format(T))
df = df[pd.isnull(df[t_col]) | ((0 <= df[t_col]) & (df[t_col] < T))]
# Data tables should not contain duplicate rows
# Data table should not contain duplicate rows with any numerical values
# Check for inconsistencies
dups = df.duplicated(subset=[ID_col, t_col, var_col], keep=False)
if any(dups):
print(df[dups].head())
raise Exception('Inconsistent values recorded')
var_names = [v for v, ty in value_type_override.items() if 'hierarchical' in ty.lower() or 'categorical' in ty.lower()]
df_tmp = df[~df[var_col].isin(var_names)]
dups = df_tmp.duplicated(subset=[ID_col, t_col, var_col], keep=False)
df_dups = df_tmp[dups]
if any(dups) and any(is_numeric(v) for v in df_dups[val_col] if not pd.isnull(v)):
print(df_dups.head())
raise Exception('Inconsistent numerical values recorded')
# Remove variables that occur too rarely as defined by the threshold
print('Remove rare variables (<= {})'.format(threshold))
......@@ -50,13 +53,52 @@ def pre_filter(df, threshold, df_population, args):
return df_out
def detect_variable_data_type(df_data, value_type_override, args):
def parse_variable_data_type(df_data, value_type_override, args):
# 1. parse hierarchical values (e.g. ICD codes) into strings
# 2. automatically detect value types, respecting user override, and set dtypes in DataFrames
# 3. pre-map duplicated non-numerical values into multiple categorical variables
data_path = args.data_path
print_header('*) Detecting value types', char='-')
data_types = []
df = df_data
assert val_col in df.columns
print_header('*) Detecting and parsing value types', char='-')
## 1. Hierarchical values
var_names = [v for v, ty in value_type_override.items() if 'hierarchical' in ty.lower()]
if len(var_names) == 0: # No hierarchical values
pass
for var_name in var_names:
var_type = value_type_override[var_name]
df_var = df.loc[df[var_col] == var_name, val_col]
if var_type.lower() == 'hierarchical_icd':
# need to figure out ICD version
raise NotImplementedError
elif var_type.lower() == 'hierarchical_icd9':
df_var = df_var.apply(lambda s: map_icd_hierarchy(s, version=9))
elif var_type.lower() == 'hierarchical_icd10':
df_var = df_var.apply(lambda s: map_icd_hierarchy(s, version=10))
else:
df_var = df_var.apply(lambda s: s.split(hierarchical_sep))
# Assign mapped values back to original df
df.loc[df[var_col] == var_name, val_col] = df_var
# Only encode selected levels
df_nonhier = df[~df[var_col].isin(var_names)]
df_hier = df[df[var_col].isin(var_names)]
df_hier_levels = []
for hier_level in hierarchical_levels:
# encode level if available
df_hier_level = df_hier.copy()
df_hier_level[val_col] = df_hier_level[val_col].apply(lambda h: h[min(hier_level, len(h))])
df_hier_levels.append(df_hier_level)
df_hier_levels = pd.concat(df_hier_levels).drop_duplicates()
# Combine hierarchical and non-hierarchical data
df = pd.concat([df_nonhier, df_hier_levels])
## 2. Detect value types
data_types = []
# Collect the unique values of each variable
# values_by_variable: dict(variable_name -> [value1, value2, ...])
......@@ -91,6 +133,18 @@ def detect_variable_data_type(df_data, value_type_override, args):
fpath = data_path + 'value_types.csv'
df_types.to_csv(fpath, quoting=1)
print('Saved as:', fpath)
## 3. Pre-map duplicated non-numerical values to separate variables
var_names = [v for v, ty in data_types if 'numeric' not in ty.lower() and 'none' not in ty.lower()]
df_non_num = df[df[var_col].isin(var_names)].copy()
dup_ = df_non_num.duplicated(subset=[ID_col, t_col, var_col], keep=False)
df_non_num_dup = df_non_num[dup_]
dup_var_names = df_non_num_dup[var_col].unique()
df_non_num_dup[var_col] = df_non_num_dup[var_col].astype(str) + ':' + df_non_num_dup[val_col].astype(str)
df_non_num_dup[val_col] = 1
df_non_num[dup_] = df_non_num_dup
df[df[var_col].isin(var_names)] = df_non_num
return df, df_types['value_type']
......
......@@ -4,3 +4,5 @@ sparse>=0.9.1
scikit-learn>=0.22.1
tqdm>=4.43.0
joblib>=0.13.2
icd9cms>=0.2.1
icd10-cm>=0.0.4
{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"df = pd.read_csv('./icd_data/input_data.csv')\n",
"df.loc[df['variable_value'] == '71970', 'variable_value'] = '7197'\n",
"df.to_csv('./icd_data/input_data.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Input data file: ./icd_data/input_data.csv\n",
"\n",
"Input arguments:\n",
" T = 4\n",
" dt = 1.0\n",
" θ₁ = 0.001\n",
" θ₂ = 0.001\n",
" θ_freq = 1.0\n",
" k = 3 ['min', 'max', 'mean']\n",
"binarize = yes\n",
"\n",
"N = 53122\n",
"L = 4\n",
"\n",
"\n",
"================================================================================\n",
"1) Pre-filter\n",
"================================================================================\n",
"Remove rows not in population\n",
"Remove rows with t outside of [0, 4]\n",
"Remove rare variables (<= 0.001)\n",
"Total variables : 1\n",
"Rare variables : 0\n",
"Remaining variables : 1\n",
"# rows (original) : 569007\n",
"# rows (filtered) : 569007\n",
"\n",
"================================================================================\n",
"2) Transform; 3) Post-filter\n",
"================================================================================\n",
"\n",
"--------------------------------------------------------------------------------\n",
"*) Detecting and parsing value types\n",
"--------------------------------------------------------------------------------\n",
"Saved as: ./icd_data/value_types.csv\n",
"\n",
"--------------------------------------------------------------------------------\n",
"*) Separate time-invariant and time-dependent\n",
"--------------------------------------------------------------------------------\n",
"Variables (time-invariant): 1447\n",
"Variables (time-dependent): 0\n",
"# rows (time-invariant): 1265903\n",
"# rows (time-dependent): 0\n",
"\n",
"--------------------------------------------------------------------------------\n",
"2-A) Transform time-invariant data\n",
"--------------------------------------------------------------------------------\n",
"(N × ^d) table :\t (53122, 1447)\n",
"number of missing entries :\t 75601631 out of 76867534 total\n",
"Time elapsed: 8.736094 seconds\n",
"\n",
"Output\n",
"s_all, binary features :\t (53122, 1447)\n",
"Time elapsed: 115.795696 seconds\n",
"\n",
"--------------------------------------------------------------------------------\n",
"3-A) Post-filter time-invariant data\n",
"--------------------------------------------------------------------------------\n",
"Original : 1447\n",
"Nearly-constant: 753\n",
"Correlated : 7\n",
"Time elapsed: 116.175213 seconds\n",
"\n",
"Output\n",
"s: shape=(53122, 687), density=0.034\n",
"Total time: 116.547743 seconds\n",
"\n"
]
}
],
"source": [
"! PYTHONPATH=\"$PYTHONPATH:../\" \\\n",
"python -m FIDDLE.run \\\n",
" --data_path='./icd_data/' \\\n",
" --population='./icd_data/pop.csv' \\\n",
" --T=4 --dt=1.0 \\\n",
" --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n",
" --stats_functions 'min' 'max' 'mean'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Input data file: ./icd_test/input_data.csv\n",
"\n",
"Input arguments:\n",
" T = 4\n",
" dt = 1.0\n",
" θ₁ = 0.001\n",
" θ₂ = 0.001\n",
" θ_freq = 1.0\n",
" k = 3 ['min', 'max', 'mean']\n",
"binarize = yes\n",
"\n",
"N = 200\n",
"L = 4\n",
"\n",
"\n",
"================================================================================\n",
"1) Pre-filter\n",
"================================================================================\n",
"Remove rows not in population\n",
"Remove rows with t outside of [0, 4]\n",
"Remove rare variables (<= 0.001)\n",
"Total variables : 1\n",
"Rare variables : 0\n",
"Remaining variables : 1\n",
"# rows (original) : 1861\n",
"# rows (filtered) : 1861\n",
"\n",
"================================================================================\n",
"2) Transform; 3) Post-filter\n",
"================================================================================\n",
"\n",
"--------------------------------------------------------------------------------\n",
"*) Detecting and parsing value types\n",
"--------------------------------------------------------------------------------\n",
"Saved as: ./icd_test/value_types.csv\n",
"\n",
"--------------------------------------------------------------------------------\n",
"*) Separate time-invariant and time-dependent\n",
"--------------------------------------------------------------------------------\n",
"Variables (time-invariant): 455\n",
"Variables (time-dependent): 0\n",
"# rows (time-invariant): 4205\n",
"# rows (time-dependent): 0\n",
"\n",
"--------------------------------------------------------------------------------\n",
"2-A) Transform time-invariant data\n",
"--------------------------------------------------------------------------------\n",
"(N × ^d) table :\t (200, 455)\n",
"number of missing entries :\t 86795 out of 91000 total\n",
"Time elapsed: 0.101392 seconds\n",
"\n",
"Output\n",
"s_all, binary features :\t (200, 455)\n",
"Time elapsed: 1.779821 seconds\n",
"\n",
"--------------------------------------------------------------------------------\n",
"3-A) Post-filter time-invariant data\n",
"--------------------------------------------------------------------------------\n",
"Original : 455\n",
"Nearly-constant: 0\n",
"Correlated : 87\n",
"Time elapsed: 1.820592 seconds\n",
"\n",
"Output\n",
"s: shape=(200, 368), density=0.055\n",
"Total time: 1.827327 seconds\n",
"\n"
]
}
],
"source": [
"! PYTHONPATH=\"$PYTHONPATH:../\" \\\n",
"python -m FIDDLE.run \\\n",
" --data_path='./icd_test/' \\\n",
" --population='./icd_test/pop.csv' \\\n",
" --T=4 --dt=1.0 \\\n",
" --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n",
" --stats_functions 'min' 'max' 'mean'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import sparse\n",
"s = sparse.load_npz('./icd_test/s.npz').todense()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(200, 368)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s.shape"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0, 0, 0, ..., 0, 0, 0],\n",
" [0, 0, 0, ..., 0, 0, 0],\n",
" [0, 0, 0, ..., 0, 0, 0],\n",
" ...,\n",
" [0, 1, 0, ..., 0, 0, 0],\n",
" [0, 1, 0, ..., 0, 0, 0],\n",
" [0, 1, 1, ..., 0, 0, 0]])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"names = json.load(open('./icd_test/s.feature_names.json', 'rb'))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"368"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(names)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
This diff is collapsed.
This diff is collapsed.
ID
172335
173633
174105
109976
178393
114585
127203
140784
164853
147035
135738
188923
135750
195632
112906
134369
138376
172461
157348
176860
141647
180872
164174
113323
198214
171781
160192
191817
167887
192180
199634
184644
151583
105764
172056
143430
104518
155252
186474
108329
170467
190201
178596
111944
156857
194730
182637
112086
142768
100536
181542
115385
175016
110641
158569
120969
166401
112077
116630
190243
190659
123010
188646
121205
142807
160481
140037
183686
160891
170324
127870
188606
187373
153952
175533
195700
130744
133550
164025
128744
161160
163353
145834
185777
178980
107064
118037
159514
150750
184167
194540
112213
143045
103251
161087
194023
188822
109235
157681
109451
111970