Commit 56f2f65b authored by Shengpu Tang (tangsp)'s avatar Shengpu Tang (tangsp)
Browse files

Hierarchical type

parent f3a7ee10
......@@ -6,6 +6,8 @@ ID_col = config['column_names']['ID']
var_col = config['column_names']['var_name']
val_col = config['column_names']['var_value']
t_col = config['column_names']['t']
hierarchical_sep = config['hierarchical_sep']
hierarchical_levels = config['hierarchical_levels']
use_ordinal_encoding = config['use_ordinal_encoding']
value_type_override = config['value_types']
......
......@@ -6,10 +6,12 @@ column_names:
var_value: variable_value
use_ordinal_encoding: no
hierarchical_sep: ":"
hierarchical_levels: [0, 1, 2]
value_types:
# enter the feature type that you would like to override in the following format:
FIRST_WARDID: Categorical
MedA:
AMOUNT: Numeric
ROUTE: Categorical
MedA_AMOUNT: Numeric
MedA_ROUTE: Categorical
ICD9_CODE: hierarchical_ICD9
......@@ -31,6 +31,62 @@ def print_header(*content, char='='):
print(char * 80, flush=True)
######
# Hierarchical value type
# - Currently supports parsing/mapping ICD9, ICD10; will add support for CPT, DRG
# - Requires the user to specify which level(s) of the hierarchy to encode
# - Each hierarchical variable is mapped to multiple rows with string values after Pre-filter
# as the first step of Transform, and will be treated as categorical variables
######
from icd9cms import icd9
import icd10
def _icd9_hierarchy(code):
    # List built from an icd9cms node: its ancestors() in reversed order,
    # with the node's own alt_code as the last element.
    return list(reversed([code.alt_code] + code.ancestors()))


def _icd10_hierarchy(code):
    # [chapter, block, first three characters of the code, full code].
    return [code.chapter, code.block, code.code[:3], code.code]


def map_icd_hierarchy(s, version=9):
    """Map an ICD code to a list describing its place in the ICD hierarchy.

    Args:
        s: The ICD code. It is converted with ``str()`` before any lookup.
        version: The ICD version to encode with; must be 9 or 10.

    Returns:
        A list of strings whose last element is the code itself.
        For version 9 this is the code's ``alt_code`` preceded by its
        ``ancestors()`` in reversed order (presumably most general first --
        confirm against icd9cms). For version 10 this is
        ``[chapter, block, code[:3], code]``. When version 10 is requested
        but ``s`` is only found as an ICD-9 code, the code is converted with
        ``convert_icd_9_to_10``; if that conversion does not produce a known
        ICD-10 code, the ICD-9 hierarchy is returned instead.

    Raises:
        Exception: If ``version`` is not 9 or 10, if ``s`` is found in
            neither ICD-9 nor ICD-10, or if version 9 is requested but ``s``
            is only found in ICD-10.
    """
    s = str(s)

    # Validate the version up front so an unsupported version is reported
    # without depending on the outcome of the code lookups below.
    if version not in (9, 10):
        raise Exception("Invalid ICD version", s)

    code9 = icd9.search(s)
    code10 = icd10.find(s)
    if code9 is None and code10 is None:
        raise Exception("Invalid ICD code", s)

    if version == 9:
        if code9 is None:
            # The code only exists in ICD-10.
            raise Exception("Invalid ICD version", s)
        return _icd9_hierarchy(code9)

    # version == 10
    if code10 is not None:
        return _icd10_hierarchy(code10)

    # Here code9 is guaranteed to be set (both-None was rejected above).
    try:
        # Attempt to convert from version 9 to 10
        converted = icd10.find(convert_icd_9_to_10(code9.alt_code))
        if converted is None:
            # Fall back to version 9
            return _icd9_hierarchy(code9)
        return _icd10_hierarchy(converted)
    except Exception:
        # Best effort: keep the ICD-9 hierarchy rather than failing the run.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        warnings.warn('Conversion failed: ' + str(s))
        return _icd9_hierarchy(code9)
import warnings
# Lookup table used to convert ICD-9 codes to ICD-10 codes.
# NOTE: the mapping file is downloaded when this module is imported, so the
# import requires network access.
# dtype=str keeps every field as text. Without it pandas infers column types,
# and numeric-looking codes may be parsed as numbers (losing leading zeros),
# which would make string lookups in the dictionary below miss.
_df_icd_mapping = pd.read_csv(
    'https://raw.githubusercontent.com/bhanratt/ICD9CMtoICD10CM/master/icd9to10dictionary.txt',
    sep='|',
    header=None,
    names=['ICD9', 'ICD10', 'Description'],
    dtype=str,
)
# dict(ICD9 code -> ICD10 code)
_icd_mapping_9_to_10 = dict(_df_icd_mapping[['ICD9', 'ICD10']].values)
def convert_icd_9_to_10(s):
    """Convert an ICD-9 code to an ICD-10 code using the 9-to-10 lookup table.

    The code itself is looked up first; if it has no entry, its parent in the
    ICD-9 hierarchy is tried, then its grandparent.

    Args:
        s: The ICD-9 code to convert.

    Returns:
        The mapped ICD-10 code as a string. If no mapping is found, or the
        lookup fails (for example the code has no parent), a warning is
        issued and ``s`` is returned unchanged.
    """
    try:
        mapped = _icd_mapping_9_to_10.get(s)
        if not mapped:
            # Look the ICD-9 node up only when the direct mapping is missing.
            parent = icd9.search(s).parent
            mapped = _icd_mapping_9_to_10.get(parent.alt_code)
            if not mapped:
                mapped = _icd_mapping_9_to_10.get(parent.parent.alt_code)
    except Exception:
        # Best effort: Exception (not a bare except) so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        warnings.warn('Conversion failed: ' + str(s))
        return s

    if not mapped:
        # No entry at any level: report it like any other failed conversion
        # instead of returning the literal string 'None'.
        warnings.warn('Conversion failed: ' + str(s))
        return s
    return str(mapped)
######
# Transform
######
......
......@@ -100,7 +100,7 @@ if args.prefilter:
df_data.to_csv(data_path + 'pre-filtered.csv', index=False)
print_header('2) Transform; 3) Post-filter')
df_data, df_types = detect_variable_data_type(df_data, value_type_override, args)
df_data, df_types = parse_variable_data_type(df_data, value_type_override, args)
df_time_invariant, df_time_series = split_by_timestamp_type(df_data)
# Process time-invariant data
......
......@@ -21,12 +21,15 @@ def pre_filter(df, threshold, df_population, args):
print('Remove rows with t outside of [0, {}]'.format(T))
df = df[pd.isnull(df[t_col]) | ((0 <= df[t_col]) & (df[t_col] < T))]
# Data tables should not contain duplicate rows
# Data table should not contain duplicate rows with any numerical values
# Check for inconsistencies
dups = df.duplicated(subset=[ID_col, t_col, var_col], keep=False)
if any(dups):
print(df[dups].head())
raise Exception('Inconsistent values recorded')
var_names = [v for v, ty in value_type_override.items() if 'hierarchical' in ty.lower() or 'categorical' in ty.lower()]
df_tmp = df[~df[var_col].isin(var_names)]
dups = df_tmp.duplicated(subset=[ID_col, t_col, var_col], keep=False)
df_dups = df_tmp[dups]
if any(dups) and any(is_numeric(v) for v in df_dups[val_col] if not pd.isnull(v)):
print(df_dups.head())
raise Exception('Inconsistent numerical values recorded')
# Remove variables that occur too rarely as defined by the threshold
print('Remove rare variables (<= {})'.format(threshold))
......@@ -50,13 +53,52 @@ def pre_filter(df, threshold, df_population, args):
return df_out
def detect_variable_data_type(df_data, value_type_override, args):
def parse_variable_data_type(df_data, value_type_override, args):
# 1. parse hierarchical values (e.g. ICD codes) into strings
# 2. automatically detect value types, respecting user override, and set dtypes in DataFrames
# 3. pre-map duplicated non-numerical values into multiple categorical variables
data_path = args.data_path
print_header('*) Detecting value types', char='-')
data_types = []
df = df_data
assert val_col in df.columns
print_header('*) Detecting and parsing value types', char='-')
## 1. Hierarchical values
var_names = [v for v, ty in value_type_override.items() if 'hierarchical' in ty.lower()]
if len(var_names) == 0: # No hierarchical values
pass
for var_name in var_names:
var_type = value_type_override[var_name]
df_var = df.loc[df[var_col] == var_name, val_col]
if var_type.lower() == 'hierarchical_icd':
# need to figure out ICD version
raise NotImplementedError
elif var_type.lower() == 'hierarchical_icd9':
df_var = df_var.apply(lambda s: map_icd_hierarchy(s, version=9))
elif var_type.lower() == 'hierarchical_icd10':
df_var = df_var.apply(lambda s: map_icd_hierarchy(s, version=10))
else:
df_var = df_var.apply(lambda s: s.split(hierarchical_sep))
# Assign mapped values back to original df
df.loc[df[var_col] == var_name, val_col] = df_var
# Only encode selected levels
df_nonhier = df[~df[var_col].isin(var_names)]
df_hier = df[df[var_col].isin(var_names)]
df_hier_levels = []
for hier_level in hierarchical_levels:
# encode level if available
df_hier_level = df_hier.copy()
df_hier_level[val_col] = df_hier_level[val_col].apply(lambda h: h[min(hier_level, len(h))])
df_hier_levels.append(df_hier_level)
df_hier_levels = pd.concat(df_hier_levels).drop_duplicates()
# Combine hierarchical and non-hierarchical data
df = pd.concat([df_nonhier, df_hier_levels])
## 2. Detect value types
data_types = []
# Collect the unique values of each variable
# values_by_variable: dict(variable_name -> [value1, value2, ...])
......@@ -91,6 +133,18 @@ def detect_variable_data_type(df_data, value_type_override, args):
fpath = data_path + 'value_types.csv'
df_types.to_csv(fpath, quoting=1)
print('Saved as:', fpath)
## 3. Pre-map duplicated non-numerical values to separate variables
var_names = [v for v, ty in data_types if 'numeric' not in ty.lower() and 'none' not in ty.lower()]
df_non_num = df[df[var_col].isin(var_names)].copy()
dup_ = df_non_num.duplicated(subset=[ID_col, t_col, var_col], keep=False)
df_non_num_dup = df_non_num[dup_]
dup_var_names = df_non_num_dup[var_col].unique()
df_non_num_dup[var_col] = df_non_num_dup[var_col].astype(str) + ':' + df_non_num_dup[val_col].astype(str)
df_non_num_dup[val_col] = 1
df_non_num[dup_] = df_non_num_dup
df[df[var_col].isin(var_names)] = df_non_num
return df, df_types['value_type']
......
......@@ -4,3 +4,5 @@ sparse>=0.9.1
scikit-learn>=0.22.1
tqdm>=4.43.0
joblib>=0.13.2
icd9cms>=0.2.1
icd10-cm>=0.0.4
%% Cell type:code id: tags:
``` python
import pandas as pd

# Rewrite one ICD value in the example input file, in place.
input_csv = './icd_data/input_data.csv'
df = pd.read_csv(input_csv)

# Rows recorded as '71970' are changed to '7197'
# (presumably because '71970' is not recognized by the ICD-9 lookup -- confirm).
needs_fix = df['variable_value'] == '71970'
df.loc[needs_fix, 'variable_value'] = '7197'

df.to_csv(input_csv, index=False)
```
%% Cell type:code id: tags:
``` python
! PYTHONPATH="$PYTHONPATH:../" \
python -m FIDDLE.run \
--data_path='./icd_data/' \
--population='./icd_data/pop.csv' \
--T=4 --dt=1.0 \
--theta_1=0.001 --theta_2=0.001 --theta_freq=1 \
--stats_functions 'min' 'max' 'mean'
```
%% Output
Input data file: ./icd_data/input_data.csv
Input arguments:
T = 4
dt = 1.0
θ₁ = 0.001
θ₂ = 0.001
θ_freq = 1.0
k = 3 ['min', 'max', 'mean']
binarize = yes
N = 53122
L = 4
================================================================================
1) Pre-filter
================================================================================
Remove rows not in population
Remove rows with t outside of [0, 4]
Remove rare variables (<= 0.001)
Total variables : 1
Rare variables : 0
Remaining variables : 1
# rows (original) : 569007
# rows (filtered) : 569007
================================================================================
2) Transform; 3) Post-filter
================================================================================
--------------------------------------------------------------------------------
*) Detecting and parsing value types
--------------------------------------------------------------------------------
Saved as: ./icd_data/value_types.csv
--------------------------------------------------------------------------------
*) Separate time-invariant and time-dependent
--------------------------------------------------------------------------------
Variables (time-invariant): 1447
Variables (time-dependent): 0
# rows (time-invariant): 1265903
# rows (time-dependent): 0
--------------------------------------------------------------------------------
2-A) Transform time-invariant data
--------------------------------------------------------------------------------
(N × ^d) table : (53122, 1447)
number of missing entries : 75601631 out of 76867534 total
Time elapsed: 8.736094 seconds
Output
s_all, binary features : (53122, 1447)
Time elapsed: 115.795696 seconds
--------------------------------------------------------------------------------
3-A) Post-filter time-invariant data
--------------------------------------------------------------------------------
Original : 1447
Nearly-constant: 753
Correlated : 7
Time elapsed: 116.175213 seconds
Output
s: shape=(53122, 687), density=0.034
Total time: 116.547743 seconds
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
! PYTHONPATH="$PYTHONPATH:../" \
python -m FIDDLE.run \
--data_path='./icd_test/' \
--population='./icd_test/pop.csv' \
--T=4 --dt=1.0 \
--theta_1=0.001 --theta_2=0.001 --theta_freq=1 \
--stats_functions 'min' 'max' 'mean'
```