Commit 521708f7 authored by sgebreeg
Browse files

Merge branch 'soft_voting' into encoding

# Conflicts:
#	src/main.cpp
parents 738f0a86 df470359
......@@ -20,7 +20,7 @@ DecisionTree::DecisionTree(vector <vector<string>> &data, vector<int> &trainingI
};
Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
double parentEntropy, int currentDepth, int maxDepth, float featureWeight, vector<int> nodeDatasetIndices ) {
double parentEntropy, int currentDepth, int maxDepth, float featureWeight, vector<int> nodeDatasetIndices) {
//TODO pass data pointer and index vector
std::pair<string, double> classificationAndEntropy = classifyWithEntropy(data, nodeDatasetIndices);
......@@ -42,10 +42,10 @@ Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
//TODO send data vector of index
//find best split point
BestSplitPoint bestSplit = findBestSplit(parentEntropy, currentDepth, data,
featureType, featureWeight, nodeDatasetIndices);
BestSplitPoint bestSplit = findBestSplit(parentEntropy, currentDepth, data,
featureType, featureWeight, nodeDatasetIndices);
if(bestSplit.featureIdx == -1 || bestSplit.featureIdx > data.size()-1 ){
if (bestSplit.featureIdx == -1 || bestSplit.featureIdx > data.size() - 1) {
Node *leaf = new Node(NULL, NULL, NULL, true, classification, originalEntropy, informationGainFromParent);
// cout<<"No more split"<<endl;
return leaf;
......@@ -87,13 +87,15 @@ Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
}
}
string DecisionTree::predictSingle(vector <string>& test, Node *treeRoot, PredictionReport *report) {
pair<string, double> DecisionTree::predictSingle(vector <string> &test, Node *treeRoot, PredictionReport *report) {
if (treeRoot->isLeaf == true) {
return treeRoot->classification;
std::pair<string, double> prediction(treeRoot->classification,
treeRoot->originalEntropy);
return prediction;
}
Question *question = treeRoot->question;
int splitIndex = question->splitFeatureIndex;
cout<<splitIndex<<endl;
// cout << splitIndex << endl;
std::pair<int, double> featureInfoGain(question->splitFeatureIndex,
treeRoot->informationGainFromParent); // value init
......@@ -119,15 +121,20 @@ string DecisionTree::predictSingle(vector <string>& test, Node *treeRoot, Predic
if (answer->isLeaf) {
report->classification = answer->classification;
return answer->classification;
std::pair<string, double> prediction(answer->classification,
answer->originalEntropy);
return prediction;
} else {
return predictSingle(test, answer, report);
}
}
string DecisionTree::predictSingle(vector <string>& test, Node *treeRoot) {
pair<string, double> DecisionTree::predictSingle(vector <string> &test, Node *treeRoot) {
if (treeRoot->isLeaf == true) {
return treeRoot->classification;
std::pair<string, double> prediction(treeRoot->classification,
treeRoot->originalEntropy);
return prediction;
}
Question *question = treeRoot->question;
int splitIndex = question->splitFeatureIndex;
......@@ -146,7 +153,7 @@ string DecisionTree::predictSingle(vector <string>& test, Node *treeRoot) {
answer = treeRoot->falseBranch;
}
} else {
if ( stod(test[splitIndex]) <= stod(splitValue)) {
if (stod(test[splitIndex]) <= stod(splitValue)) {
answer = treeRoot->trueBranch;
} else {
answer = treeRoot->falseBranch;
......@@ -154,7 +161,9 @@ string DecisionTree::predictSingle(vector <string>& test, Node *treeRoot) {
}
if (answer->isLeaf) {
return answer->classification;
std::pair<string, double> prediction(answer->classification,
answer->originalEntropy);
return prediction;
} else {
return predictSingle(test, answer);
}
......
......@@ -14,8 +14,8 @@ class DecisionTree {
public:
DecisionTree(vector <vector<string>> &data, vector<int> &trainingIndx, int maxDepth, float featureWeight, vector <FeatureType> &featureType);
string predictSingle(vector <string>& test, Node *treeRoot, PredictionReport * report);
string predictSingle(vector <string>& test, Node *treeRoot);
pair<string, double> predictSingle(vector <string>& test, Node *treeRoot, PredictionReport * report);
pair<string, double> predictSingle(vector <string>& test, Node *treeRoot);
void printTree(Node *node, int space);
......
......@@ -46,7 +46,7 @@ RandomForest::RandomForest(vector <vector<string>> &data, vector<int> &trainingI
this->featureWeight = featureWeight;
this->depth = maxDepth;
unsigned num_cpus = std::thread::hardware_concurrency()/2;
unsigned num_cpus = std::thread::hardware_concurrency() / 2;
// unsigned num_cpus = 12;
if (numTrees < num_cpus)
num_cpus = numTrees;
......@@ -63,7 +63,7 @@ RandomForest::RandomForest(vector <vector<string>> &data, vector<int> &trainingI
maxDepth, featureWeight, &featureTypes, &decisionTrees] {
for (int j = 0; j < temp.at(i); j++) {
vector <int> baggedData = bagData(trainingIndx, baggingWeight); //TODO fix this
vector<int> baggedData = bagData(trainingIndx, baggingWeight); //TODO fix this
cout << "Training tree " << j << " in thread " << i << endl;
DecisionTree *tree = new DecisionTree(data, baggedData, maxDepth, featureWeight, featureTypes);
cout << "Done training tree " << j << " in thread " << i << endl;
......@@ -87,10 +87,84 @@ RandomForest::RandomForest(vector <vector<string>> &data, vector<int> &trainingI
}
/**
 * Classifies one sample with the forest using the requested voting scheme.
 *
 * @param voting   Voting scheme selector. "SOFT" dispatches to getSoftVoting,
 *                 "HARD" dispatches to getForestPrediction. The comparison is
 *                 exact (case-sensitive).
 * @param test     Feature values of the sample to classify.
 * @param forest   Forest whose trees are queried.
 * @param features Forwarded unchanged to the selected voting routine.
 *
 * The value returned by the selected routine is discarded; the routines print
 * their result to stdout. An unrecognised selector performs no prediction and
 * is reported on stderr instead of being ignored silently.
 */
void RandomForest::predict(string voting, vector <string> &test, RandomForest *forest, vector <string> &features) {
    if (voting == "SOFT") {
        this->getSoftVoting(test, forest, features);
    } else if (voting == "HARD") {
        this->getForestPrediction(test, forest, features);
    } else {
        // Previously this case fell through without any output, which made a
        // typo in the selector indistinguishable from a successful call.
        cerr << "Unknown voting mode '" << voting << "'; expected \"SOFT\" or \"HARD\"" << endl;
    }
}
/**
 * Soft-voting prediction for a single sample.
 *
 * Every tree in `forest` classifies `test`. For each predicted label the
 * entropy values returned by predictSingle are summed, and the label with the
 * smallest total is returned. Ties keep the label that comes first in the
 * map's key order.
 *
 * NOTE(review): because totals are summed rather than averaged, a label
 * predicted by few trees can win over a label predicted by many confident
 * trees - confirm this is the intended weighting.
 *
 * @param test     Feature values of the sample to classify.
 * @param forest   Forest whose trees are queried.
 * @param features Forwarded unchanged to the explanation printing helpers.
 * @return The winning label, or an empty string when the forest has no trees.
 *
 * Side effects: prints per-tree explanations and the final decision to stdout.
 */
string
RandomForest::getSoftVoting(vector <string> &test, RandomForest *forest, vector <string> &features) {
    cout << "Trees in forest: " << to_string(forest->trees.size()) << endl;
    cout << "Predicting: Soft Voting" << endl;

    const size_t treeCount = forest->trees.size();
    map<string, double> votes;
    // Reports are owned by this function and released before returning.
    vector<PredictionReport *> explanations(treeCount);

    // Collect every tree's prediction and accumulate its entropy per label.
    for (size_t treeIdx = 0; treeIdx < treeCount; treeIdx++) {
        PredictionReport *report = new PredictionReport();
        DecisionTree *tree = forest->trees[treeIdx];
        pair<string, double> pred_pair = tree->predictSingle(test, tree->root, report);

        // operator[] value-initialises a missing entry to 0.0, so a separate
        // "first vote" branch is unnecessary.
        votes[pred_pair.first] += pred_pair.second;
        explanations[treeIdx] = report;

        //TODO remove maybe: printing all the prediction reports
        cout << "Explanation " << treeIdx << " classified " << report->classification << " because ";
        // Keys are unique, so a direct lookup replaces the scan over all paths.
        auto pathIt = report->path.find(report->classification);
        if (pathIt != report->path.end()) {
            sort(pathIt->second, test, features);
        }
    }

    // Pick the label with the lowest accumulated entropy. The running minimum
    // is seeded from the first entry rather than from a magic sentinel.
    string label;
    double minEntropy = 0.0;
    bool haveCandidate = false;
    for (map<string, double>::iterator vote = votes.begin(); vote != votes.end(); ++vote) {
        if (!haveCandidate || vote->second < minEntropy) {
            minEntropy = vote->second;
            label = vote->first;
            haveCandidate = true;
        }
    }

    cout<<"Soft voting Classified "<<label<<endl;
    cout<<"Because: ";
    map<int, double> reports = explain(label, explanations);
    sort(reports, test, features);

    for (size_t j = 0; j < explanations.size(); j++) {
        delete explanations[j];
    }
    return label;
}
vector <string>
RandomForest::getForestPrediction(vector <string>& test, RandomForest *forest, vector <string> &features) {
RandomForest::getForestPrediction(vector <string> &test, RandomForest *forest, vector <string> &features) {
cout << "Trees in forest: " << to_string(forest->trees.size()) << endl;
cout << "Predicting" << endl;
cout << "Predicting: Hard Voting" << endl;
vector <string> predictions(forest->trees.size());
vector < PredictionReport * > explanations(forest->trees.size());
map<string, int> votes;
......@@ -98,8 +172,9 @@ RandomForest::getForestPrediction(vector <string>& test, RandomForest *forest, v
for (int treeIdx = 0; treeIdx < forest->trees.size(); treeIdx++) {
PredictionReport *report = new PredictionReport();
DecisionTree *tree = forest->trees[treeIdx];
string prediction = tree->predictSingle(test, tree->root, report);
// cout << prediction << endl;
pair<string, double> pred_pair = tree->predictSingle(test, tree->root, report);
string prediction = pred_pair.first;
// cout << prediction << endl;
if (votes.count(prediction)) {
votes[prediction] += 1;
......@@ -137,12 +212,7 @@ RandomForest::getForestPrediction(vector <string>& test, RandomForest *forest, v
cout << "Because: ";
map<int, double> reports = explain(label, explanations);
map<int, double>::iterator
exp;
// for (exp = reports.begin(); exp != reports.end(); ++exp) {
// cout << "Feature no " << exp->first << " with info gain " << exp->second << endl;
// }
sort(reports, test, features);
......@@ -248,7 +318,8 @@ vector <pair<int, double>> sort(map<int, double> &M, vector <string> test, vecto
return A;
}
vector <string> getBatchPrediction(vector <vector<string>>& datasetAsString,vector <int>& testIdxs, RandomForest *forest) {
vector <string>
getBatchPrediction(vector <vector<string>> &datasetAsString, vector<int> &testIdxs, RandomForest *forest) {
vector <string> predictions;
......@@ -268,7 +339,8 @@ vector <string> getBatchPrediction(vector <vector<string>>& datasetAsString,vect
//Get every tree in the forests prediction
for (int treeIdx = 0; treeIdx < forest->trees.size(); treeIdx++) {
DecisionTree *tree = forest->trees[treeIdx];
string prediction = tree->predictSingle(test, tree->root);
pair<string, double> pred_pair = tree->predictSingle(test, tree->root);
string prediction = pred_pair.first;
if (votes.count(prediction)) {
votes[prediction] += 1;
......@@ -297,11 +369,11 @@ vector <string> getBatchPrediction(vector <vector<string>>& datasetAsString,vect
}
accuracyReport RandomForest::getAccuracy(vector <vector<string>>& datasetAsString,vector <int> & testIdxs) {
vector <string> predictions = getBatchPrediction(datasetAsString,testIdxs, this);
accuracyReport RandomForest::getAccuracy(vector <vector<string>> &datasetAsString, vector<int> &testIdxs) {
vector <string> predictions = getBatchPrediction(datasetAsString, testIdxs, this);
vector <string> labels;
for(int dataidx:testIdxs){
labels.push_back(datasetAsString.at(datasetAsString.size()-1).at(dataidx));
for (int dataidx:testIdxs) {
labels.push_back(datasetAsString.at(datasetAsString.size() - 1).at(dataidx));
}
std::map<std::string, int> incorrectLables;
std::map<std::string, int> correctLables;
......
......@@ -19,9 +19,12 @@ public:
accuracyReport getAccuracy(vector <vector<string>>& datasetAsString,vector <int> &testIdxs);
void printAccuracyReport(accuracyReport report);
void printAccuracyReportFile(accuracyReport report);
string getSoftVoting(vector <string> &test, RandomForest *forest, vector <string> &features);
void predict(string voting, vector <string> &test, RandomForest *forest, vector <string> &features);
};
};
vector <string> getBatchPrediction(vector <vector<string>>& datasetAsString,vector <int>& testIdxs, RandomForest *forest);
......
......@@ -56,7 +56,7 @@ int main(int argc, char *argv[]) {
featureTypes = parseFeatureTypes("../datasets/adult.featureTypes");
features = parseFeatures("../datasets/adult.features");
}
encodedDatasetAsString = datasetAsString;
encodedDatasetAsString.pop_back();
encodedfeatures = features;
......@@ -65,7 +65,7 @@ int main(int argc, char *argv[]) {
featuresToEncode.push_back(5);
encodeData(datasetAsString,encodedDatasetAsString,features,encodedfeatures,featuresToEncode);
for(string feature:encodedfeatures){
cout << " " << feature << " ";
}
......@@ -128,10 +128,10 @@ int main(int argc, char *argv[]) {
// cout << endl;
// randomForest->getForestPrediction(testData, randomForest, features);
// randomForest->predict("HARD",testData, randomForest, features);
// for (int i = 0; i<randomForest->trees.size(); i++){
// cleanTree(randomForest->trees[i]->root);
// delete randomForest->trees[i];
// }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment