Commit 222e9378 authored by sgebreeg's avatar sgebreeg
Browse files

decision tree prediction sends entropy

parent 5461658f
......@@ -20,7 +20,7 @@ DecisionTree::DecisionTree(vector <vector<string>> &data, vector<int> &trainingI
};
Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
double parentEntropy, int currentDepth, int maxDepth, float featureWeight, vector<int> nodeDatasetIndices ) {
double parentEntropy, int currentDepth, int maxDepth, float featureWeight, vector<int> nodeDatasetIndices) {
//TODO pass data pointer and index vector
std::pair<string, double> classificationAndEntropy = classifyWithEntropy(data, nodeDatasetIndices);
......@@ -42,10 +42,10 @@ Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
//TODO send data vector of index
//find best split point
BestSplitPoint bestSplit = findBestSplit(parentEntropy, currentDepth, data,
featureType, featureWeight, nodeDatasetIndices);
BestSplitPoint bestSplit = findBestSplit(parentEntropy, currentDepth, data,
featureType, featureWeight, nodeDatasetIndices);
if(bestSplit.featureIdx == -1 || bestSplit.featureIdx > data.size()-1 ){
if (bestSplit.featureIdx == -1 || bestSplit.featureIdx > data.size() - 1) {
Node *leaf = new Node(NULL, NULL, NULL, true, classification, originalEntropy, informationGainFromParent);
// cout<<"No more split"<<endl;
return leaf;
......@@ -87,13 +87,15 @@ Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
}
}
string DecisionTree::predictSingle(vector <string>& test, Node *treeRoot, PredictionReport *report) {
pair<string, double> DecisionTree::predictSingle(vector <string> &test, Node *treeRoot, PredictionReport *report) {
if (treeRoot->isLeaf == true) {
return treeRoot->classification;
std::pair<string, double> prediction(treeRoot->classification,
treeRoot->originalEntropy);
return prediction;
}
Question *question = treeRoot->question;
int splitIndex = question->splitFeatureIndex;
cout<<splitIndex<<endl;
cout << splitIndex << endl;
std::pair<int, double> featureInfoGain(question->splitFeatureIndex,
treeRoot->informationGainFromParent); // value init
......@@ -119,15 +121,20 @@ string DecisionTree::predictSingle(vector <string>& test, Node *treeRoot, Predic
if (answer->isLeaf) {
report->classification = answer->classification;
return answer->classification;
std::pair<string, double> prediction(answer->classification,
answer->originalEntropy);
return prediction;
} else {
return predictSingle(test, answer, report);
}
}
string DecisionTree::predictSingle(vector <string>& test, Node *treeRoot) {
pair<string, double> DecisionTree::predictSingle(vector <string> &test, Node *treeRoot) {
if (treeRoot->isLeaf == true) {
return treeRoot->classification;
std::pair<string, double> prediction(treeRoot->classification,
treeRoot->originalEntropy);
return prediction;
}
Question *question = treeRoot->question;
int splitIndex = question->splitFeatureIndex;
......@@ -146,7 +153,7 @@ string DecisionTree::predictSingle(vector <string>& test, Node *treeRoot) {
answer = treeRoot->falseBranch;
}
} else {
if ( stod(test[splitIndex]) <= stod(splitValue)) {
if (stod(test[splitIndex]) <= stod(splitValue)) {
answer = treeRoot->trueBranch;
} else {
answer = treeRoot->falseBranch;
......@@ -154,7 +161,9 @@ string DecisionTree::predictSingle(vector <string>& test, Node *treeRoot) {
}
if (answer->isLeaf) {
return answer->classification;
std::pair<string, double> prediction(answer->classification,
answer->originalEntropy);
return prediction;
} else {
return predictSingle(test, answer);
}
......
......@@ -14,8 +14,8 @@ class DecisionTree {
public:
DecisionTree(vector <vector<string>> &data, vector<int> &trainingIndx, int maxDepth, float featureWeight, vector <FeatureType> &featureType);
string predictSingle(vector <string>& test, Node *treeRoot, PredictionReport * report);
string predictSingle(vector <string>& test, Node *treeRoot);
pair<string, double> predictSingle(vector <string>& test, Node *treeRoot, PredictionReport * report);
pair<string, double> predictSingle(vector <string>& test, Node *treeRoot);
void printTree(Node *node, int space);
......
......@@ -46,7 +46,7 @@ RandomForest::RandomForest(vector <vector<string>> &data, vector<int> &trainingI
this->featureWeight = featureWeight;
this->depth = maxDepth;
unsigned num_cpus = std::thread::hardware_concurrency()/2;
unsigned num_cpus = std::thread::hardware_concurrency() / 2;
// unsigned num_cpus = 12;
if (numTrees < num_cpus)
num_cpus = numTrees;
......@@ -63,7 +63,7 @@ RandomForest::RandomForest(vector <vector<string>> &data, vector<int> &trainingI
maxDepth, featureWeight, &featureTypes, &decisionTrees] {
for (int j = 0; j < temp.at(i); j++) {
vector <int> baggedData = bagData(trainingIndx, baggingWeight); //TODO fix this
vector<int> baggedData = bagData(trainingIndx, baggingWeight); //TODO fix this
cout << "Training tree " << j << " in thread " << i << endl;
DecisionTree *tree = new DecisionTree(data, baggedData, maxDepth, featureWeight, featureTypes);
cout << "Done training tree " << j << " in thread " << i << endl;
......@@ -87,8 +87,31 @@ RandomForest::RandomForest(vector <vector<string>> &data, vector<int> &trainingI
}
/**
 * Soft-voting prediction for a single sample.
 *
 * Every tree in `forest` classifies `test`. The second element of the pair
 * returned by DecisionTree::predictSingle (read here as the entropy attached
 * to that prediction) is added to a running score for the predicted label.
 *
 * @param test     feature values of the sample to classify
 * @param forest   forest whose trees are queried
 * @param features not used by this function
 * @return a vector holding the single label with the highest accumulated
 *         score (on a tie, the label that orders first as a map key), or an
 *         empty vector when no tree produced a prediction
 */
vector <string>
RandomForest::getSoftVoting(vector <string> &test, RandomForest *forest, vector <string> &features) {
    cout << "Trees in forest: " << to_string(forest->trees.size()) << endl;
    cout << "Predicting" << endl;

    // Accumulated score per predicted label.
    map<string, double> votes;

    // Get every tree's prediction. size_t index avoids the signed/unsigned
    // comparison against trees.size().
    for (size_t treeIdx = 0; treeIdx < forest->trees.size(); treeIdx++) {
        DecisionTree *tree = forest->trees[treeIdx];
        pair<string, double> pred_pair = tree->predictSingle(test, tree->root);
        // operator[] value-initialises a missing entry to 0.0, so no
        // separate "first vote" branch is needed.
        votes[pred_pair.first] += pred_pair.second;
    }

    // Previously the function fell off the end without returning a value,
    // which is undefined behaviour for a non-void function.
    vector <string> result;
    if (votes.empty()) {
        return result;
    }

    // NOTE(review): the score is the raw entropy sum, so a larger score does
    // not obviously mean higher confidence (lower entropy is usually the
    // purer leaf). Confirm whether the weight should be inverted before
    // relying on this winner.
    auto best = votes.begin();
    for (auto it = votes.begin(); it != votes.end(); ++it) {
        if (it->second > best->second) {
            best = it;
        }
    }
    result.push_back(best->first);
    return result;
}
vector <string>
RandomForest::getForestPrediction(vector <string>& test, RandomForest *forest, vector <string> &features) {
RandomForest::getForestPrediction(vector <string> &test, RandomForest *forest, vector <string> &features) {
cout << "Trees in forest: " << to_string(forest->trees.size()) << endl;
cout << "Predicting" << endl;
vector <string> predictions(forest->trees.size());
......@@ -98,8 +121,9 @@ RandomForest::getForestPrediction(vector <string>& test, RandomForest *forest, v
for (int treeIdx = 0; treeIdx < forest->trees.size(); treeIdx++) {
PredictionReport *report = new PredictionReport();
DecisionTree *tree = forest->trees[treeIdx];
string prediction = tree->predictSingle(test, tree->root, report);
// cout << prediction << endl;
pair<string, double> pred_pair = tree->predictSingle(test, tree->root);
string prediction = pred_pair.first;
// cout << prediction << endl;
if (votes.count(prediction)) {
votes[prediction] += 1;
......@@ -248,7 +272,8 @@ vector <pair<int, double>> sort(map<int, double> &M, vector <string> test, vecto
return A;
}
vector <string> getBatchPrediction(vector <vector<string>>& datasetAsString,vector <int>& testIdxs, RandomForest *forest) {
vector <string>
getBatchPrediction(vector <vector<string>> &datasetAsString, vector<int> &testIdxs, RandomForest *forest) {
vector <string> predictions;
......@@ -268,7 +293,8 @@ vector <string> getBatchPrediction(vector <vector<string>>& datasetAsString,vect
//Get every tree in the forests prediction
for (int treeIdx = 0; treeIdx < forest->trees.size(); treeIdx++) {
DecisionTree *tree = forest->trees[treeIdx];
string prediction = tree->predictSingle(test, tree->root);
pair<string, double> pred_pair = tree->predictSingle(test, tree->root);
string prediction = pred_pair.first;
if (votes.count(prediction)) {
votes[prediction] += 1;
......@@ -297,11 +323,11 @@ vector <string> getBatchPrediction(vector <vector<string>>& datasetAsString,vect
}
accuracyReport RandomForest::getAccuracy(vector <vector<string>>& datasetAsString,vector <int> & testIdxs) {
vector <string> predictions = getBatchPrediction(datasetAsString,testIdxs, this);
accuracyReport RandomForest::getAccuracy(vector <vector<string>> &datasetAsString, vector<int> &testIdxs) {
vector <string> predictions = getBatchPrediction(datasetAsString, testIdxs, this);
vector <string> labels;
for(int dataidx:testIdxs){
labels.push_back(datasetAsString.at(datasetAsString.size()-1).at(dataidx));
for (int dataidx:testIdxs) {
labels.push_back(datasetAsString.at(datasetAsString.size() - 1).at(dataidx));
}
std::map<std::string, int> incorrectLables;
std::map<std::string, int> correctLables;
......
......@@ -19,9 +19,11 @@ public:
accuracyReport getAccuracy(vector <vector<string>>& datasetAsString,vector <int> &testIdxs);
void printAccuracyReport(accuracyReport report);
void printAccuracyReportFile(accuracyReport report);
vector <string> getSoftVoting(vector <string> &test, RandomForest *forest, vector <string> &features);
};
};
vector <string> getBatchPrediction(vector <vector<string>>& datasetAsString,vector <int>& testIdxs, RandomForest *forest);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment