Commit e364178c authored by Shengpu Tang (tangsp)
Browse files

add test case

parent 9423ec40
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Input data file: ./small_test/input_data.csv\n",
"\n",
"Input arguments:\n",
" T = 4\n",
" dt = 1.0\n",
" θ₁ = 0.001\n",
" θ₂ = 0.001\n",
" θ_freq = 1.0\n",
" k = 3 ['min', 'max', 'mean']\n",
"\n",
"N = 4\n",
"L = 4\n",
"\n",
"\n",
"================================================================================\n",
"1) Pre-filter\n",
"================================================================================\n",
"Remove rows not in population\n",
"Remove rows with t outside of [0, 4]\n",
"Remove rare variables (<= 0.001)\n",
"Total variables : 7\n",
"Rare variables : 0\n",
"Remaining variables : 7\n",
"# rows (original) : 31\n",
"# rows (filtered) : 31\n",
"\n",
"================================================================================\n",
"2) Transform; 3) Post-filter\n",
"================================================================================\n",
"\n",
"--------------------------------------------------------------------------------\n",
"*) Detecting value types\n",
"--------------------------------------------------------------------------------\n",
"Saved as: ./small_test/value_types.csv\n",
"\n",
"--------------------------------------------------------------------------------\n",
"*) Separate time-invariant and time-dependent\n",
"--------------------------------------------------------------------------------\n",
"Variables (time-invariant): 3\n",
"Variables (time-dependent): 4\n",
"# rows (time-invariant): 8\n",
"# rows (time-dependent): 23\n",
"\n",
"--------------------------------------------------------------------------------\n",
"2.1) Transform time-invariant data\n",
"--------------------------------------------------------------------------------\n",
"(N × ^d) table :\t (4, 3)\n",
"number of missing entries :\t 4 out of 12 total\n",
"Time elapsed: 0.017584 seconds\n",
"\n",
"Output\n",
"s_all, binary features :\t (4, 7)\n",
"Time elapsed: 0.072829 seconds\n",
"\n",
"--------------------------------------------------------------------------------\n",
"3.1) Post-filter time-invariant data\n",
"--------------------------------------------------------------------------------\n",
"Original : 7\n",
"Nearly-constant: 0\n",
"Correlated : 3\n",
"Time elapsed: 0.076066 seconds\n",
"\n",
"Output\n",
"s: shape=(4, 4), density=0.312\n",
"Total time: 0.078834 seconds\n",
"\n",
"\n",
"--------------------------------------------------------------------------------\n",
"2.2) Transform time-dependent data\n",
"--------------------------------------------------------------------------------\n",
"Total variables : 4\n",
"Frequent variables : ['HR']\n",
"M₁ = 1\n",
"M₂ = 3\n",
"k = 3 ['min', 'max', 'mean']\n",
"\n",
"Transforming each example...\n",
"[Parallel(n_jobs=72)]: Using backend LokyBackend with 72 concurrent workers.\n",
"[Parallel(n_jobs=72)]: Done 1 tasks | elapsed: 3.8s\n",
"[Parallel(n_jobs=72)]: Done 2 out of 4 | elapsed: 3.8s remaining: 3.8s\n",
"[Parallel(n_jobs=72)]: Done 4 out of 4 | elapsed: 3.9s remaining: 0.0s\n",
"[Parallel(n_jobs=72)]: Done 4 out of 4 | elapsed: 3.9s finished\n",
"(freq) number of missing entries :\t 5 out of 4×4×1=16 total\n",
"(freq) number of imputed entries :\t 4\n",
" HR 4\n",
"(freq) number of not imputed entries :\t 1\n",
" HR 1\n",
"(non-freq) number of missing entries :\t 41 out of 4×4×3=48 total\n",
"\n",
"(N × L × ^D) table :\t (4, 4, 9)\n",
"Time elapsed: 3.977742 seconds\n",
"\n",
"Discretizing features...\n",
"Processing 8 non-boolean variable columns...\n",
" Binning numeric variables by quintile...\n",
" Converting variables to binary features\n",
"[Parallel(n_jobs=72)]: Using backend LokyBackend with 72 concurrent workers.\n",
"[Parallel(n_jobs=72)]: Done 2 out of 8 | elapsed: 1.0s remaining: 3.0s\n",
"[Parallel(n_jobs=72)]: Done 3 out of 8 | elapsed: 1.0s remaining: 1.7s\n",
"[Parallel(n_jobs=72)]: Done 4 out of 8 | elapsed: 1.0s remaining: 1.0s\n",
"[Parallel(n_jobs=72)]: Done 5 out of 8 | elapsed: 1.0s remaining: 0.6s\n",
"[Parallel(n_jobs=72)]: Done 6 out of 8 | elapsed: 1.0s remaining: 0.3s\n",
"[Parallel(n_jobs=72)]: Done 8 out of 8 | elapsed: 1.1s remaining: 0.0s\n",
"[Parallel(n_jobs=72)]: Done 8 out of 8 | elapsed: 1.1s finished\n",
"Finished discretizing features\n",
"\n",
"Output\n",
"X_all: shape=(4, 4, 29), density=0.203\n",
"Time elapsed: 5.103915 seconds\n",
"\n",
"--------------------------------------------------------------------------------\n",
"3.2) Post-filter time-dependent data\n",
"--------------------------------------------------------------------------------\n",
"(4, 4, 29) 0.2025862068965517\n",
"Original : 29\n",
"Nearly-constant: 0\n",
"*** time: 2.486790657043457\n",
"Correlated : 15\n",
"*** time: 4.358332395553589\n",
"\n",
"Output\n",
"X: shape=(4, 4, 14), density=0.237\n",
"(4, 4, 14) 0.23660714285714285\n",
"Time elapsed: 9.462556 seconds\n",
"\n",
"Output\n",
"X: shape=(4, 4, 14), density=0.237\n",
"Total time: 9.466846 seconds\n",
"\n"
]
}
],
"source": [
"! PYTHONPATH=\"$PYTHONPATH:../\" \\\n",
"python -m FIDDLE.run \\\n",
" --data_path='./small_test/' \\\n",
" --population='./small_test/pop.csv' \\\n",
" --T=4 --dt=1.0 \\\n",
" --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n",
" --stats_functions 'min' 'max' 'mean'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Input data file: ./large_test/input_data.csv\n",
"\n",
"Input arguments:\n",
" T = 4\n",
" dt = 1.0\n",
" θ₁ = 0.001\n",
" θ₂ = 0.001\n",
" θ_freq = 1.0\n",
" k = 3 ['min', 'max', 'mean']\n",
"\n",
"N = 200\n",
"L = 4\n",
"\n",
"\n",
"================================================================================\n",
"1) Pre-filter\n",
"================================================================================\n",
"Remove rows not in population\n",
"Remove rows with t outside of [0, 4]\n",
"Remove rare variables (<= 0.001)\n",
"Total variables : 1970\n",
"Rare variables : 0\n",
"Remaining variables : 1970\n",
"# rows (original) : 64777\n",
"# rows (filtered) : 64777\n",
"\n",
"================================================================================\n",
"2) Transform; 3) Post-filter\n",
"================================================================================\n",
"\n",
"--------------------------------------------------------------------------------\n",
"*) Detecting value types\n",
"--------------------------------------------------------------------------------\n",
"Saved as: ./large_test/value_types.csv\n",
"\n",
"--------------------------------------------------------------------------------\n",
"*) Separate time-invariant and time-dependent\n",
"--------------------------------------------------------------------------------\n",
"Variables (time-invariant): 12\n",
"Variables (time-dependent): 1958\n",
"# rows (time-invariant): 2400\n",
"# rows (time-dependent): 62377\n",
"\n",
"--------------------------------------------------------------------------------\n",
"2.1) Transform time-invariant data\n",
"--------------------------------------------------------------------------------\n",
"(N × ^d) table :\t (200, 12)\n",
"number of missing entries :\t 4 out of 2400 total\n",
"Time elapsed: 0.021392 seconds\n",
"\n",
"Output\n",
"s_all, binary features :\t (200, 84)\n",
"Time elapsed: 0.216294 seconds\n",
"\n",
"--------------------------------------------------------------------------------\n",
"3.1) Post-filter time-invariant data\n",
"--------------------------------------------------------------------------------\n",
"Original : 84\n",
"Nearly-constant: 0\n",
"Correlated : 7\n",
"Time elapsed: 0.221074 seconds\n",
"\n",
"Output\n",
"s: shape=(200, 77), density=0.145\n",
"Total time: 0.225575 seconds\n",
"\n",
"\n",
"--------------------------------------------------------------------------------\n",
"2.2) Transform time-dependent data\n",
"--------------------------------------------------------------------------------\n",
"Total variables : 1958\n",
"Frequent variables : ['DiaBP', 'HR', 'RR', 'SpO2', 'SysBP']\n",
"M₁ = 5\n",
"M₂ = 1953\n",
"k = 3 ['min', 'max', 'mean']\n",
"\n",
"Transforming each example...\n",
"[Parallel(n_jobs=72)]: Using backend LokyBackend with 72 concurrent workers.\n",
"[Parallel(n_jobs=72)]: Done 1 tasks | elapsed: 4.8s\n",
"[Parallel(n_jobs=72)]: Done 18 tasks | elapsed: 8.0s\n",
"[Parallel(n_jobs=72)]: Done 37 tasks | elapsed: 9.3s\n",
"[Parallel(n_jobs=72)]: Done 56 tasks | elapsed: 10.3s\n",
"[Parallel(n_jobs=72)]: Done 78 out of 200 | elapsed: 11.5s remaining: 18.0s\n",
"[Parallel(n_jobs=72)]: Done 99 out of 200 | elapsed: 12.7s remaining: 13.0s\n",
"[Parallel(n_jobs=72)]: Done 120 out of 200 | elapsed: 13.8s remaining: 9.2s\n",
"[Parallel(n_jobs=72)]: Done 141 out of 200 | elapsed: 14.8s remaining: 6.2s\n",
"[Parallel(n_jobs=72)]: Done 162 out of 200 | elapsed: 15.4s remaining: 3.6s\n",
"[Parallel(n_jobs=72)]: Done 183 out of 200 | elapsed: 16.1s remaining: 1.5s\n",
"[Parallel(n_jobs=72)]: Done 200 out of 200 | elapsed: 16.8s finished\n",
"(freq) number of missing entries :\t 996 out of 200×4×5=4000 total\n",
"(freq) number of imputed entries :\t 58\n",
" DiaBP 17\n",
" HR 5\n",
" RR 6\n",
" SpO2 13\n",
" SysBP 17\n",
"(freq) number of not imputed entries :\t 938\n",
" DiaBP 190\n",
" HR 180\n",
" RR 183\n",
" SpO2 195\n",
" SysBP 190\n",
"(non-freq) number of missing entries :\t 1510389 out of 200×4×1953=1562400 total\n",
"\n",
"(N × L × ^D) table :\t (200, 4, 1983)\n",
"Time elapsed: 19.099867 seconds\n",
"\n",
"Discretizing features...\n",
"Processing 1978 non-boolean variable columns...\n",
" Binning numeric variables by quintile...\n",
" Converting variables to binary features\n",
"[Parallel(n_jobs=72)]: Using backend LokyBackend with 72 concurrent workers.\n",
"[Parallel(n_jobs=72)]: Done 1 tasks | elapsed: 0.0s\n",
"[Parallel(n_jobs=72)]: Batch computation too fast (0.0419s.) Setting batch_size=8.\n",
"[Parallel(n_jobs=72)]: Batch computation too fast (0.0419s.) Setting batch_size=76.\n",
"[Parallel(n_jobs=72)]: Done 9 tasks | elapsed: 0.2s\n",
"[Parallel(n_jobs=72)]: Done 20 tasks | elapsed: 0.7s\n",
"[Parallel(n_jobs=72)]: Done 1978 out of 1978 | elapsed: 6.7s finished\n",
"Finished discretizing features\n",
"\n",
"Output\n",
"X_all: shape=(200, 4, 3406), density=0.026\n",
"Time elapsed: 26.408678 seconds\n",
"\n",
"--------------------------------------------------------------------------------\n",
"3.2) Post-filter time-dependent data\n",
"--------------------------------------------------------------------------------\n",
"(200, 4, 3406) 0.026153479154433352\n",
"Original : 3406\n",
"Nearly-constant: 5\n",
"*** time: 3.5170133113861084\n",
"Correlated : 1102\n",
"*** time: 7.688496828079224\n",
"\n",
"Output\n",
"X: shape=(200, 4, 2299), density=0.034\n",
"(200, 4, 2299) 0.034270334928229666\n",
"Time elapsed: 34.102943 seconds\n",
"\n",
"Output\n",
"X: shape=(200, 4, 2299), density=0.034\n",
"Total time: 34.251790 seconds\n",
"\n"
]
}
],
"source": [
"! PYTHONPATH=\"$PYTHONPATH:../\" \\\n",
"python -m FIDDLE.run \\\n",
" --data_path='./large_test/' \\\n",
" --population='./large_test/pop.csv' \\\n",
" --T=4 --dt=1.0 \\\n",
" --theta_1=0.001 --theta_2=0.001 --theta_freq=1 \\\n",
" --stats_functions 'min' 'max' 'mean'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
This diff is collapsed.
ID
200001
200010
200016
200033
200034
200035
200038
200040
200049
200050
200055
200061
200063
200072
200075
200079
200081
200087
200094
200098
200099
200104
200116
200126
200133
200135
200143
200150
200153
200163
200169
200172
200174
200191
200192
200200
200215
200217
200223
200224
200225
200227
200231
200234
200256
200262
200263
200265
200267
200277
200282
200284
200286
200292
200295
200311
200315
200330
200346
200349
200352
200357
200359
200368
200383
200392
200399
200402
200408
200416
200430
200439
200441
200445
200450
200451
200454
200457
200463
200481
200486
200487
200489
200509
200525
200529
200530
200550
200552
200553
200562
200563
200566
200571
200575
200579
200580
200583
200591
200599
200603
200606
200610
200612
200616
200621
200622
200625
200642
200659
200661
200662
200664
200665
200671
200683
200696
200697
200701
200708
200710
200711
200715
200723
200724
200729
200735
200737
200746
200747
200748
200749
200758
200759
200764
200773
200783
200788
200806
200813
200824
200825
200827
200855
200861
200873
200888
200897
200899
200908
200909
200913
200916
200932
200934
200935
200937
200940
200943
200944
200946
200966
200969
200972
200973
200975
200977
200983
200985
200988
200989
200993
200998
200999
201004
201018
201027
201029
201034
201039
201045
201046
201047
201053
201065
201073
201080
201081
201082
201083
201091
201092
201095
201104
201109
201110
201113
201124
201125
201128
ID,t,variable_name,variable_value
1,NULL,AGE,50
2,NULL,AGE,33
1,NULL,SEX,M
2,NULL,SEX,M
3,NULL,SEX,F
1,NULL,ROOM,101
2,NULL,ROOM,102
3,NULL,ROOM,103
1,0.1,HR,70
1,0.9,HR,71
1,1.5,HR,72
1,1.9,HR,73
1,2.9,HR,74
1,3.5,HR,75
2,0.1,HR,60
2,0.8,HR,60
2,0.9,HR,61
2,2.1,HR,73
2,2.9,HR,78
2,3.5,HR,75
3,1.7,HR,90
1,2.3,DRUG_A_RATE,48
2,3.4,DRUG_A_RATE,48
1,2.3,DRUG_A_ROUTE,Mouth
2,3.4,DRUG_A_ROUTE,Cont.IV
3,1,DRUG_A_ROUTE,Bolus
1,2.3,LAB_X,<1
3,2.7,LAB_X,5
4,0.7,HR,80
4,2.5,HR,62
4,3.9,HR,73
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment