Commit 604e0120 authored by mccrabb

More cleaning

parent 468673fd
src/basoutput.txt → src/baseoutput.txt
src/avg.txt
@@ -50,7 +50,6 @@ Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
// cout<<"No more split"<<endl;
return leaf;
}
// cout<<"splitting data"<<endl;
//split data
//TODO send data and index vector
@@ -87,23 +86,23 @@ Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
}
}
// Recursive function to get prediction from one test entry
pair<string, double> DecisionTree::predictSingle(vector <string> &test, Node *treeRoot, PredictionReport *report) {
// Check if current node is leaf
if (treeRoot->isLeaf == true) {
std::pair<string, double> prediction(treeRoot->classification,
treeRoot->originalEntropy);
return prediction;
}
// Get Node "Question" (condition + Information Gain from condition check) + add to report
Question *question = treeRoot->question;
int splitIndex = question->splitFeatureIndex;
// cout << splitIndex << endl;
std::pair<int, double> featureInfoGain(question->splitFeatureIndex,
treeRoot->informationGainFromParent); // value init
treeRoot->informationGainFromParent);
report->path[treeRoot->classification].push_back(featureInfoGain);
string splitValue = question->splitValue;
FeatureType featureType = question->splitFeatureType;
// Get Next node
Node *answer;
if (featureType == CATEGORICAL) {
if (test[splitIndex] == splitValue) {
@@ -118,33 +117,33 @@ pair<string, double> DecisionTree::predictSingle(vector <string> &test, Node *tr
answer = treeRoot->falseBranch;
}
}
// If next node is leaf, return the prediction (class + entropy at node)
if (answer->isLeaf) {
report->classification = answer->classification;
std::pair<string, double> prediction(answer->classification,
answer->originalEntropy);
return prediction;
// Else, recurse
} else {
return predictSingle(test, answer, report);
}
}
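For context, a hypothetical call site for the report-producing overload might look like the sketch below; tree is assumed to be a pointer to a trained DecisionTree, and the row values are made up, so none of this is part of the commit.

// Hypothetical usage sketch: classify one row and read back the
// (classification, leaf entropy) pair plus the per-node report.
vector<string> row = {"sunny", "71", "91", "true"};
PredictionReport *report = new PredictionReport();
pair<string, double> pred = tree->predictSingle(row, tree->root, report);
cout << pred.first << " (leaf entropy " << pred.second << ")" << endl;
delete report;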
pair<string, double> DecisionTree::predictSingle(vector <string> &test, Node *treeRoot) {
if (treeRoot->isLeaf == true) {
// Check if current node is leaf
if (treeRoot->isLeaf == true) {
std::pair<string, double> prediction(treeRoot->classification,
treeRoot->originalEntropy);
return prediction;
}
// Get Node "Question" (condition + Information Gain from condition check)
Question *question = treeRoot->question;
int splitIndex = question->splitFeatureIndex;
std::pair<int, double> featureInfoGain(question->splitFeatureIndex,
treeRoot->informationGainFromParent); // value init
string splitValue = question->splitValue;
FeatureType featureType = question->splitFeatureType;
// Get Next node
Node *answer;
if (featureType == CATEGORICAL) {
if (test[splitIndex] == splitValue) {
@@ -159,11 +158,12 @@ pair<string, double> DecisionTree::predictSingle(vector <string> &test, Node *tr
answer = treeRoot->falseBranch;
}
}
// If next node is leaf, return the prediction (class + entropy at node)
if (answer->isLeaf) {
std::pair<string, double> prediction(answer->classification,
answer->originalEntropy);
return prediction;
// Else, recurse
} else {
return predictSingle(test, answer);
}
@@ -187,4 +187,4 @@ void DecisionTree::printTree(Node *node, int space) {
cout << " " << node->classification << "\n";
}
}
\ No newline at end of file
}
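A hypothetical invocation of the recursive printer above, assuming tree is a trained DecisionTree and 0 is the starting indentation:

tree->printTree(tree->root, 0); // print the whole tree starting at the root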
@@ -102,8 +102,7 @@ void RandomForest::predict(string voting, vector <string> &test, RandomForest *f
}
}
string
RandomForest::getSoftVoting(vector <string> &test, RandomForest *forest, vector <string> &features) {
string RandomForest::getSoftVoting(vector <string> &test, RandomForest *forest, vector <string> &features) {
cout << "Trees in forest: " << to_string(forest->trees.size()) << endl;
cout << "Predicting: Soft Voting" << endl;
map<string, double> votes;
@@ -111,24 +110,24 @@ RandomForest::getSoftVoting(vector <string> &test, RandomForest *forest, vector
vector<PredictionReport *> explanations(forest->trees.size());
//get every tree's prediction
// for each tree in forest
for (int treeIdx = 0; treeIdx < forest->trees.size(); treeIdx++) {
// get prediction and entropy
PredictionReport *report = new PredictionReport();
DecisionTree *tree = forest->trees[treeIdx];
pair<string, double> pred_pair = tree->predictSingle(test, tree->root, report);
string prediction = pred_pair.first;
double entropy = pred_pair.second;
// add entropy for this prediction from this tree to the total for the forest
if(votes.count(prediction)){
votes[prediction] += entropy;
}else{
votes[prediction] = entropy;
}
// add prediction and explanation to forest's total
predictions[treeIdx] = prediction;
explanations[treeIdx] = report;
//TODO remove maybe: printing all the prediction reports
// print all the prediction reports
map < string, vector < std::pair < int, double > >>::iterator itr;
cout << "Explanation " << treeIdx << " classified " << report->classification << " because ";
for (itr = report->path.begin(); itr != report->path.end(); ++itr) {
@@ -142,7 +141,7 @@ RandomForest::getSoftVoting(vector <string> &test, RandomForest *forest, vector
double minEntropy = 999999;
string label;
map<string, double>::iterator itr;
// find the prediction with the lowest accumulated entropy
for(itr=votes.begin(); itr != votes.end(); ++itr){
if(minEntropy > itr->second){
minEntropy = itr->second;
......
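As a standalone illustration of the aggregation above (not part of this commit; the three label/entropy votes are hypothetical), the sketch below mirrors the code as written: every tree adds its leaf entropy to its predicted label, and the label with the lowest accumulated entropy wins, so a label predicted by more trees can accumulate more entropy and lose.

#include <iostream>
#include <map>
#include <string>
#include <utility>
using namespace std;

int main() {
    // hypothetical (label, leaf entropy) predictions from three trees
    pair<string, double> preds[] = {{"H", 0.2}, {"H", 0.5}, {"NH", 0.3}};
    map<string, double> votes; // label -> accumulated entropy
    for (const auto &p : preds) votes[p.first] += p.second; // H: 0.7, NH: 0.3
    string label;
    double minEntropy = 999999; // same sentinel as getSoftVoting
    for (const auto &kv : votes) {
        if (kv.second < minEntropy) { minEntropy = kv.second; label = kv.first; }
    }
    cout << label << endl; // prints NH
    return 0;
}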
@@ -84,3 +84,5 @@
5 5 0.0467269 0.7 24.4314 0.0239732
------ Report ------
100 20 0.0467269 0.7 25.4386 0.155197
------ Report ------
5 10 0.0764719 0.7 32.5 0.00779979
@@ -94,3 +94,11 @@ Label H was predicted right 789 times
Label NH was predicted right 777 times
Label H was predicted wrong 161 times
Label NH was predicted wrong 325 times
---------- Report --------------
Testing accuracy for forest with 5 trees depth 10 and feature selection weight 0.0764719
Total tested data is 480
The accuracy of the tree is 97.5% with 468 correct predictions and 12 incorrect predictions
Label bad was predicted right 227 times
Label good was predicted right 241 times
Label bad was predicted wrong 10 times
Label good was predicted wrong 2 times
@@ -452,6 +452,7 @@ string classifyData(vector <vector<string>> data) {
return label;
}
// Calculate the entropy of the data entries indicated by the indices (+ most popular label)
std::pair<string, double> classifyWithEntropy(vector <vector<string>> &data, vector<int> &indices) {
//TODO get data reference and vector of index
auto start = high_resolution_clock::now();
@@ -459,33 +460,31 @@ std::pair<string, double> classifyWithEntropy(vector <vector<string>> &data, vec
double entropy = 0.0;
int maxVote = 0;
string label;
// count the number of times each label is present
for (int i: indices) {
if (dataCount.count(data[data.size() - 1][i])) {
dataCount[data[data.size() - 1][i]] += 1;
} else {
dataCount[data[data.size() - 1][i]] = 1;
}
}
// for each label option
map<string, int>::iterator itr;
for (itr = dataCount.begin(); itr != dataCount.end(); ++itr) {
// cout<<itr->first<<" "<<itr->second<<endl;
// incrementally calculate (actual) entropy
double probability = (double) itr->second / (double) indices.size();
if (probability > 0) {
entropy -= (probability) * log2(probability);
}
// find most frequent label
if (maxVote < itr->second) {
maxVote = itr->second;
label = itr->first;
}
}
// create the pair to be returned (label and entropy at this point)
std::pair<string, double> classificationWithEntropy(label, entropy);
return classificationWithEntropy;
}
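For reference, the loop above accumulates the standard Shannon entropy H = -sum_i p_i * log2(p_i) while tracking the majority label. A standalone sketch with hypothetical counts (8 rows labeled H, 2 labeled NH):

#include <cmath>
#include <iostream>
#include <map>
#include <string>
using namespace std;

int main() {
    map<string, int> counts = {{"H", 8}, {"NH", 2}}; // hypothetical label counts
    int total = 10;
    double entropy = 0.0;
    for (const auto &kv : counts) {
        double p = (double) kv.second / total;
        if (p > 0) entropy -= p * log2(p); // -0.8*log2(0.8) - 0.2*log2(0.2)
    }
    cout << entropy << endl; // ~0.7219 bits
    return 0;
}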
......