Commit 10e86ced authored by mccrabb

Fixed Soft Voting calcs, broke old encoding

parent 604e0120
@@ -92,7 +92,7 @@ RandomForest::RandomForest(vector <vector<string>> &data, vector<int> &trainingI
this->trees = decisionTrees;
}
// "Wrapper"ish function to simplify getting predictions
void RandomForest::predict(string voting, vector <string> &test, RandomForest *forest, vector <string> &features){
if(voting == "SOFT"){
this->getSoftVoting(test, forest, features);
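// Usage (see the main.cpp hunk later in this commit):
//   randomForest->predict("SOFT", testData, randomForest, encodedfeatures);
// passing "HARD" instead presumably takes the hard-majority branch of this dispatch.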
@@ -106,28 +106,47 @@ string RandomForest::getSoftVoting(vector <string> &test, RandomForest *forest,
cout << "Trees in forest: " << to_string(forest->trees.size()) << endl;
cout << "Predicting: Soft Voting" << endl;
map<string, double> votes;
vector<PredictionReport *> explanations(forest->trees.size());
// First pass: collect each tree's (prediction, entropy) pair and count the unique prediction classes
unordered_set<string> uniquePredictions;
int numUniquePredictions = 0;
vector<pair<string, double>> predictions;
for (int treeIdx = 0; treeIdx < forest->trees.size(); treeIdx++) {
// get prediction and entropy
PredictionReport *report = new PredictionReport();
DecisionTree *tree = forest->trees[treeIdx];
pair<string, double> pred_pair = tree->predictSingle(test, tree->root, report);
predictions.emplace_back(pred_pair);
string prediction = pred_pair.first;
numUniquePredictions += (uniquePredictions.count(prediction) ? 0 : 1);
uniquePredictions.emplace(prediction);
// add prediction and explanation to forest's total
explanations[treeIdx] = report;
}
// Second pass: convert each tree's entropy into a confidence-weighted vote
pair<string, double> pred_pair;
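// max possible entropy over k distinct predicted classes: -1*log2(1/k) == log2(k)
// (if k == 1 this is 0, which is harmless here: there is only one candidate label)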
double maxEntropy = -1*log2(1/(double)numUniquePredictions);
for (int treeIdx = 0; treeIdx < forest->trees.size(); treeIdx++) {
pred_pair = predictions.at(treeIdx);
string prediction = pred_pair.first;
double entropy = pred_pair.second;
// add this tree's confidence (maxEntropy - entropy) to the running total for its predicted label
if(votes.count(prediction)){
votes[prediction] += (maxEntropy - entropy);
}else{
votes[prediction] = (maxEntropy - entropy);
}
cout << prediction << ": " << to_string(votes[prediction]) << endl;
// print all the prediction reports
PredictionReport *report = explanations[treeIdx];
map<string, vector<std::pair<int, double>>>::iterator itr;
cout << "Explanation " << treeIdx << " classified " << report->classification << " because ";
for (itr = report->path.begin(); itr != report->path.end(); ++itr) {
@@ -137,14 +156,14 @@ string RandomForest::getSoftVoting(vector <string> &test, RandomForest *forest,
}
}
//get prediction with highest confidence
double maxConf = 0.0;
string label;
map<string, double>::iterator itr;
// find the label with the highest accumulated confidence
for(itr=votes.begin(); itr != votes.end(); ++itr){
if(maxConf < itr->second){
maxConf = itr->second;
label = itr->first;
}
}
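For readers skimming the diff, here is a minimal, self-contained sketch of the aggregation this commit introduces: each tree contributes a (label, entropy) pair, each vote is weighted by confidence = maxEntropy - entropy, and the label with the largest total wins. softVote and the sample values below are illustrative, not part of this repository; the one deliberate deviation is clamping k to at least 2, since the diff's -1*log2(1/k) is 0 when every tree predicts the same class.

#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

// Entropy-weighted soft voting: a confident tree (entropy near 0) contributes
// close to the maximum weight; a maximally uncertain one contributes almost nothing.
std::string softVote(const std::vector<std::pair<std::string, double>> &preds) {
    std::set<std::string> labels;
    for (const auto &p : preds) labels.insert(p.first);
    // the diff computes -1*log2(1/k) == log2(k); clamping k >= 2 so that a
    // unanimous forest does not zero every weight is an assumption, not the diff's code
    double maxEntropy = std::log2((double)std::max<std::size_t>(labels.size(), 2));
    std::map<std::string, double> votes;
    for (const auto &p : preds) votes[p.first] += (maxEntropy - p.second);
    std::string best;
    double bestConf = -std::numeric_limits<double>::infinity();
    for (const auto &v : votes)
        if (v.second > bestConf) { bestConf = v.second; best = v.first; }
    return best;
}

int main() {
    // three trees: two low-entropy "good" votes outweigh one noisy "bad" vote
    std::vector<std::pair<std::string, double>> preds =
        {{"good", 0.10}, {"good", 0.25}, {"bad", 0.90}};
    std::cout << softVote(preds) << "\n"; // prints "good"
    return 0;
}

This also explains the minEntropy-to-maxConf flip in the hunk above: with weights of (maxEntropy - entropy) rather than raw entropy, the best label is the one with the largest total, so the selection loop now maximizes.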
@@ -86,3 +86,59 @@
100 20 0.0467269 0.7 25.4386 0.155197
------ Report ------
5 10 0.0764719 0.7 32.5 0.00779979
------ Report ------
5 10 0.0764719 0.7 31.5278 0.00825167
------ Report ------
5 10 0.0764719 0.7 31.1111 0.00758075
------ Report ------
5 10 0.0764719 0.7 32.4306 0.00716863
------ Report ------
5 10 0.0764719 0.7 30 0.00636382
------ Report ------
5 10 0.0764719 0.7 32.7083 0.00726345
------ Report ------
5 10 0.0764719 0.7 33.3333 0.00619241
------ Report ------
5 10 0.0764719 0.7 28.6806 0.00683623
------ Report ------
5 10 0.0764719 0.7 25.4861 0.00663246
------ Report ------
5 10 0.0764719 0.7 29.4444 0.00790174
------ Report ------
5 10 0.0764719 0.7 26.25 0.00773915
------ Report ------
5 10 0.0764719 0.7 29.4444 0.00670105
------ Report ------
5 3 0.0764719 0.7 23.2639 0.00523816
------ Report ------
5 3 0.0764719 0.7 30 0.00484361
------ Report ------
5 3 0.0764719 0.7 17.4306 0.00556822
------ Report ------
5 3 0.0467269 0.7 21.6862 0.0185424
------ Report ------
5 1 0.0764719 0.7 33.125 0.00363061
------ Report ------
5 1 0.0764719 0.7 21.5972 0.00370751
------ Report ------
5 1 0.223607 0.7 31.5278 0.00509295
------ Report ------
5 1 0.223607 0.7 33.3333 0.00396328
------ Report ------
5 1 0.223607 0.7 33.3333 0.0050132
------ Report ------
5 1 0.223607 0.7 20.6944 0.00282304
------ Report ------
5 1 0.223607 0.7 33.3333 0.00468833
------ Report ------
5 1 0.223607 0.7 23.125 0.00607161
------ Report ------
5 1 0.223607 0.7 26.9444 0.00550026
------ Report ------
5 1 0.223607 0.7 22.1528 0.00496346
------ Report ------
5 1 0.223607 0.7 24.5139 0.00437157
------ Report ------
5 1 0.223607 0.7 25.3472 0.0053871
------ Report ------
5 1 0.223607 0.7 33.3333 0.0049446
@@ -102,3 +102,223 @@ Label bad was predicted right 227 times
Label good was predicted right 241 times
Label bad was predicted wrong 10 times
Label good was predicted wrong 2 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 10 and feature selection weight 0.0764719
Total tested data is 480
The accuracy of the tree is 94.5833% with 454 correct predictions and 26 incorrect predictions
Label bad was predicted right 222 times
Label good was predicted right 232 times
Label bad was predicted wrong 5 times
Label good was predicted wrong 21 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 10 and feature selection weight 0.0764719
Total tested data is 480
The accuracy of the tree is 93.3333% with 448 correct predictions and 32 incorrect predictions
Label bad was predicted right 200 times
Label good was predicted right 248 times
Label bad was predicted wrong 23 times
Label good was predicted wrong 9 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 10 and feature selection weight 0.0764719
Total tested data is 480
The accuracy of the tree is 97.2917% with 467 correct predictions and 13 incorrect predictions
Label bad was predicted right 227 times
Label good was predicted right 240 times
Label bad was predicted wrong 1 times
Label good was predicted wrong 12 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 10 and feature selection weight 0.0764719
Total tested data is 480
The accuracy of the tree is 90% with 432 correct predictions and 48 incorrect predictions
Label bad was predicted right 205 times
Label good was predicted right 227 times
Label bad was predicted wrong 12 times
Label good was predicted wrong 36 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 10 and feature selection weight 0.0764719
Total tested data is 480
The accuracy of the tree is 98.125% with 471 correct predictions and 9 incorrect predictions
Label bad was predicted right 220 times
Label good was predicted right 251 times
Label bad was predicted wrong 7 times
Label good was predicted wrong 2 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 10 and feature selection weight 0.0764719
Total tested data is 480
The accuracy of the tree is 100% with 480 correct predictions and 0 incorrect predictions
Label bad was predicted right 233 times
Label good was predicted right 247 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 10 and feature selection weight 0.0764719
Total tested data is 480
The accuracy of the tree is 86.0417% with 413 correct predictions and 67 incorrect predictions
Label bad was predicted right 158 times
Label good was predicted right 255 times
Label bad was predicted wrong 60 times
Label good was predicted wrong 7 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 10 and feature selection weight 0.0764719
Total tested data is 480
The accuracy of the tree is 76.4583% with 367 correct predictions and 113 incorrect predictions
Label bad was predicted right 207 times
Label good was predicted right 160 times
Label bad was predicted wrong 33 times
Label good was predicted wrong 80 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 10 and feature selection weight 0.0764719
Total tested data is 480
The accuracy of the tree is 88.3333% with 424 correct predictions and 56 incorrect predictions
Label bad was predicted right 215 times
Label good was predicted right 209 times
Label bad was predicted wrong 11 times
Label good was predicted wrong 45 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 10 and feature selection weight 0.0764719
Total tested data is 480
The accuracy of the tree is 78.75% with 378 correct predictions and 102 incorrect predictions
Label bad was predicted right 151 times
Label good was predicted right 227 times
Label bad was predicted wrong 64 times
Label good was predicted wrong 38 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 10 and feature selection weight 0.0764719
Total tested data is 480
The accuracy of the tree is 88.3333% with 424 correct predictions and 56 incorrect predictions
Label bad was predicted right 187 times
Label good was predicted right 237 times
Label bad was predicted wrong 44 times
Label good was predicted wrong 12 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 3 and feature selection weight 0.0764719
Total tested data is 480
The accuracy of the tree is 69.7917% with 335 correct predictions and 145 incorrect predictions
Label bad was predicted right 184 times
Label good was predicted right 151 times
Label bad was predicted wrong 24 times
Label good was predicted wrong 121 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 3 and feature selection weight 0.0764719
Total tested data is 480
The accuracy of the tree is 90% with 432 correct predictions and 48 incorrect predictions
Label bad was predicted right 213 times
Label good was predicted right 219 times
Label bad was predicted wrong 2 times
Label good was predicted wrong 46 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 3 and feature selection weight 0.0764719
Total tested data is 480
The accuracy of the tree is 52.2917% with 251 correct predictions and 229 incorrect predictions
Label bad was predicted right 220 times
Label good was predicted right 31 times
Label bad was predicted wrong 5 times
Label good was predicted wrong 224 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 3 and feature selection weight 0.0980581
Total tested data is 9049
The accuracy of the tree is 62.6478% with 5669 correct predictions and 3380 incorrect predictions
Label <=50K was predicted right 3616 times
Label >50K was predicted right 2053 times
Label <=50K was predicted wrong 3208 times
Label >50K was predicted wrong 172 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 3 and feature selection weight 0.0467269
Total tested data is 2052
The accuracy of the tree is 65.0585% with 1335 correct predictions and 717 incorrect predictions
Label H was predicted right 641 times
Label NH was predicted right 694 times
Label H was predicted wrong 306 times
Label NH was predicted wrong 411 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 1 and feature selection weight 0.0764719
Total tested data is 480
The accuracy of the tree is 99.375% with 477 correct predictions and 3 incorrect predictions
Label bad was predicted right 214 times
Label good was predicted right 263 times
Label good was predicted wrong 3 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 1 and feature selection weight 0.0764719
Total tested data is 480
The accuracy of the tree is 64.7917% with 311 correct predictions and 169 incorrect predictions
Label bad was predicted right 124 times
Label good was predicted right 187 times
Label bad was predicted wrong 112 times
Label good was predicted wrong 57 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 1 and feature selection weight 0.223607
Total tested data is 480
The accuracy of the tree is 94.5833% with 454 correct predictions and 26 incorrect predictions
Label bad was predicted right 220 times
Label good was predicted right 234 times
Label bad was predicted wrong 26 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 1 and feature selection weight 0.223607
Total tested data is 480
The accuracy of the tree is 100% with 480 correct predictions and 0 incorrect predictions
Label bad was predicted right 234 times
Label good was predicted right 246 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 1 and feature selection weight 0.223607
Total tested data is 480
The accuracy of the tree is 100% with 480 correct predictions and 0 incorrect predictions
Label bad was predicted right 223 times
Label good was predicted right 257 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 1 and feature selection weight 0.223607
Total tested data is 480
The accuracy of the tree is 62.0833% with 298 correct predictions and 182 incorrect predictions
Label bad was predicted right 187 times
Label good was predicted right 111 times
Label bad was predicted wrong 45 times
Label good was predicted wrong 137 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 1 and feature selection weight 0.223607
Total tested data is 480
The accuracy of the tree is 100% with 480 correct predictions and 0 incorrect predictions
Label bad was predicted right 221 times
Label good was predicted right 259 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 1 and feature selection weight 0.223607
Total tested data is 480
The accuracy of the tree is 69.375% with 333 correct predictions and 147 incorrect predictions
Label bad was predicted right 225 times
Label good was predicted right 108 times
Label bad was predicted wrong 15 times
Label good was predicted wrong 132 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 1 and feature selection weight 0.223607
Total tested data is 480
The accuracy of the tree is 80.8333% with 388 correct predictions and 92 incorrect predictions
Label bad was predicted right 203 times
Label good was predicted right 185 times
Label bad was predicted wrong 11 times
Label good was predicted wrong 81 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 1 and feature selection weight 0.223607
Total tested data is 480
The accuracy of the tree is 66.4583% with 319 correct predictions and 161 incorrect predictions
Label bad was predicted right 204 times
Label good was predicted right 115 times
Label bad was predicted wrong 33 times
Label good was predicted wrong 128 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 1 and feature selection weight 0.223607
Total tested data is 480
The accuracy of the tree is 73.5417% with 353 correct predictions and 127 incorrect predictions
Label bad was predicted right 158 times
Label good was predicted right 195 times
Label bad was predicted wrong 63 times
Label good was predicted wrong 64 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 1 and feature selection weight 0.223607
Total tested data is 480
The accuracy of the tree is 76.0417% with 365 correct predictions and 115 incorrect predictions
Label bad was predicted right 190 times
Label good was predicted right 175 times
Label bad was predicted wrong 31 times
Label good was predicted wrong 84 times
---------- Report--------------
Testing accuracy for forest with 5 trees depth 1 and feature selection weight 0.223607
Total tested data is 480
The accuracy of the tree is 100% with 480 correct predictions and 0 incorrect predictions
Label bad was predicted right 218 times
Label good was predicted right 262 times
@@ -4,7 +4,12 @@
void encodeData(vector <vector<string>> datasetAsString, vector <vector<string>>& encodedDatasetAsString,
vector <string> features, vector <string>& encodedFeatures, vector<int> featuresToEncode,vector <FeatureType> featureTypes,vector <FeatureType>& encodedFeatureTypes){
encodedDatasetAsString = datasetAsString;
encodedFeatures = features;
encodedFeatureTypes = featureTypes;
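// NOTE: with the block below commented out, encodeData is currently a passthrough:
// data, feature names, and feature types are copied through unencoded
// (the "broke old encoding" half of this commit's message).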
/**
map<string, int> uniqueValues;
map<int,map<string, int>,greater <int>> featureUniqueValues; // maps featureIdx --> map(string --> String encoded as a number)
// for each feature
@@ -58,6 +63,7 @@ void encodeData(vector <vector<string>> datasetAsString, vector <vector<string>>
// erase original feature
encodedDatasetAsString.erase(encodedDatasetAsString.begin()+featIdx);
}
**/
}
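The disabled block above previously rewrote each categorical feature column as a set of one-hot columns and erased the original. As a reference for restoring it, here is a minimal sketch of that idea in the same column-major layout used elsewhere in this commit (encodedDatasetAsString.at(featIdx)[rowIdx]); oneHotColumn is an illustrative name, not this repo's API:

#include <cstddef>
#include <map>
#include <string>
#include <vector>

// Expand one categorical column into one 0/1 column per distinct value.
// `column` holds the raw string values of a single feature, indexed by row.
std::vector<std::vector<std::string>> oneHotColumn(const std::vector<std::string> &column) {
    std::map<std::string, int> index; // value --> new column index
    for (const auto &v : column)
        index.emplace(v, (int)index.size()); // no-op if v was already seen
    std::vector<std::vector<std::string>> out(
        index.size(), std::vector<std::string>(column.size(), "0"));
    for (std::size_t row = 0; row < column.size(); ++row)
        out[index[column[row]]][row] = "1"; // single 1 in the value's column
    return out;
}

The new columns would then replace the original feature column, as the erase(...) call in the disabled block suggests.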
// create feature-specific one-hot encoding vector
vector<int> binaryShift(int size, int value){
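// binaryShift's body is elided in this view. Judging from the comment above and
// the signature, a plausible implementation (an assumption, not the author's code;
// it relies on the file's existing `using namespace std`) returns a zero vector
// with a single 1 at the encoded position:
//
//     vector<int> binaryShiftSketch(int size, int value){
//         vector<int> v(size, 0);   // all zeros
//         if (value >= 0 && value < size)
//             v[value] = 1;         // one-hot: single 1 at the encoded position
//         return v;
//     }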
@@ -121,7 +121,7 @@ int main(int argc, char *argv[]) {
cout << encodedfeatures.at(featIndex) << "=" << encodedDatasetAsString.at(featIndex)[testingIdxs[0]] << ", ";
}
cout << endl;
randomForest->predict("HARD",testData, randomForest, encodedfeatures);
randomForest->predict("SOFT",testData, randomForest, encodedfeatures);
// Delete trees (necessary for multiple identical trials)
for (int i = 0; i<randomForest->trees.size(); i++){
@@ -2,7 +2,7 @@
#for i in {5..100..10}
for i in {5..5..1}
do
./race $i 1 ../datasets/wine
# for j in {5..7}
# do
# ./race $i $j