Commit 1c24f161 authored by sgebreeg

unnecessary doubles changed to floats

parent 16d0aaaa
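The values this commit retypes are entropies and probabilities, which lie in [0, 1] or [0, log2(k)] for k labels, so single precision is plenty here. A minimal sketch (illustrative only, not part of the commit) of how close float and double entropies are for a typical split:

#include <cmath>
#include <cstdio>

// Illustrative only: entropy of a 6/2 label split at both precisions.
// The difference appears around the 8th significant digit, far below
// anything the entropy comparisons in this code depend on.
int main() {
    double pd = 6.0 / 8.0;
    float pf = 6.0f / 8.0f;
    double hd = -(pd * std::log2(pd) + (1.0 - pd) * std::log2(1.0 - pd));
    float hf = -(pf * std::log2(pf) + (1.0f - pf) * std::log2(1.0f - pf));
    std::printf("double: %.9f\nfloat:  %.9f\n", hd, (double) hf);
    return 0;
}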
@@ -20,12 +20,12 @@ DecisionTree::DecisionTree(vector <vector<string>> &data, vector<int> &trainingI
 }
 Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
-            double parentEntropy, int currentDepth, int maxDepth, float featureWeight, vector<int> nodeDatasetIndices) {
+            float parentEntropy, int currentDepth, int maxDepth, float featureWeight, vector<int> nodeDatasetIndices) {
     //get classification and entropy of current data
-    std::pair<string, double> classificationAndEntropy = classifyWithEntropy(data, nodeDatasetIndices);
+    std::pair<string, float> classificationAndEntropy = classifyWithEntropy(data, nodeDatasetIndices);
     string classification = classificationAndEntropy.first;
-    double originalEntropy = classificationAndEntropy.second;
+    float originalEntropy = classificationAndEntropy.second;
     //if depth exceeds the max depth or the data is pure make a leaf
     if (currentDepth > maxDepth || originalEntropy == 0.0) {
@@ -28,7 +28,7 @@ private:
 };
 Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
-            double parentEntropy, int currentDepth, int maxDepth, float featureWeight, vector<int> nodeDatasetIndices );
+            float parentEntropy, int currentDepth, int maxDepth, float featureWeight, vector<int> nodeDatasetIndices );
 #endif //RACER_DECISIONTREE_HPP
@@ -3,7 +3,7 @@
 #include "Question.hpp"
 using namespace std;
 Node::Node(Question *question, Node *leftBranch, Node *rightBranch, bool isLeaf, string classification,
-           double originalEntropy) {
+           float originalEntropy) {
     this->question = question;
     this->leftBranch = leftBranch;
     this->rightBranch = rightBranch;
@@ -6,7 +6,7 @@
 class Node {
 public:
     Node(Question *question, Node *leftBranch, Node *rightBranch, bool isLeaf, std::string classification,
-         double originalEntropy);
+         float originalEntropy);
     Question *question;
@@ -14,7 +14,7 @@ public:
     Node *rightBranch;
     bool isLeaf;
     string classification;
-    double originalEntropy;
+    float originalEntropy;
 };
 #endif //RACER_NODE_H
@@ -98,7 +98,8 @@ RandomForest::RandomForest(vector <vector<string>> &data, vector<int> &trainingI
 }
-//run test data through all trees in forest and return the label with maximum votes
+//run test data through all trees in forest
+// return the label with maximum votes
 string getForestPrediction(vector <string>& test, RandomForest *forest) {
     //to save labels and vote counts
     map<string, int> votes;
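For readers without the collapsed context: the voting here is a plain majority vote over per-tree predictions. A self-contained sketch of the same pattern (function name hypothetical; the real body continues past this hunk):

#include <map>
#include <string>
#include <vector>

// Hypothetical stand-in for the collapsed body of getForestPrediction:
// tally one vote per tree's predicted label, then return the label
// holding the maximum vote count.
std::string majorityVote(const std::vector<std::string> &treePredictions) {
    std::map<std::string, int> votes;
    for (const std::string &label : treePredictions) {
        votes[label]++;
    }
    std::string winner;
    int maxVotes = 0;
    for (const auto &entry : votes) {
        if (entry.second > maxVotes) {
            maxVotes = entry.second;
            winner = entry.first;
        }
    }
    return winner;
}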
@@ -173,7 +174,7 @@ accuracyReport RandomForest::getAccuracy(vector <vector<string>>& datasetAsStrin
     std::map<std::string, int> incorrectlabels;
     std::map<std::string, int> correctlabels;
-    double accuracy;
+    float accuracy;
     int correct = 0;
     int incorrect = 0;
     int total = predictions.size();
@@ -199,7 +200,7 @@ accuracyReport RandomForest::getAccuracy(vector <vector<string>>& datasetAsStrin
         }
-        accuracy = ((double) correct / (double) total) * 100;
+        accuracy = ((float) correct / (float) total) * 100;
     } else {
         cerr << "Predictions and labels are not equal" << endl;
     }
@@ -212,6 +213,7 @@ accuracyReport RandomForest::getAccuracy(vector <vector<string>>& datasetAsStrin
 //print accuracy on console
 void RandomForest::printAccuracyReport(accuracyReport report) {
+    cout << "********************* Forest accuracy *****************" << endl;
     cout << "Testing accuracy for forest with " << this->trees.size() << " trees depth " << this->depth
          << " and feature selection weight " << this->featureWeight << endl;
     cout << "Total tested data is " << report.total << endl;
@@ -230,10 +232,11 @@ void RandomForest::printAccuracyReport(accuracyReport report) {
 }
-//saves accuracy into a file basoutput.txt
+//saves accuracy into a file accuracy-report.txt
 void RandomForest::printAccuracyReportFile(accuracyReport report) {
+    cout << "Printing accuracy into accuracy-report.txt ..." << endl;
     ofstream outfile;
-    outfile.open("basoutput.txt", ios::app);
+    outfile.open("accuracy-report.txt", ios::app);
     outfile << "---------- Report--------------" << "\n";
     outfile << "Testing accuracy for forest with " << this->trees.size() << " trees depth " << this->depth
             << " and feature selection weight " << this->featureWeight << endl;
@@ -230,20 +230,20 @@ float calculateEntropy(vector <vector<string>> &data, vector<int> indices) {
 // rightLabelCount = map of labels and their count on the right side,
 //leftCount = total count of left side, rightCount = total count of right side,
 // Returns entropy
-double
+float
 calculateSplitEntropy(std::map<std::string, int> leftLabelCount, std::map<std::string, int> rightLabelCount,
                       int leftCount,
                       int rightCount) {
-    double totalData = leftCount + rightCount;
-    double probabilityRight = rightCount / totalData;
-    double probabilityLeft = leftCount / totalData;
+    float totalData = leftCount + rightCount;
+    float probabilityRight = rightCount / totalData;
+    float probabilityLeft = leftCount / totalData;
-    double leftEntropy = 0.0;
-    double rightEntropy = 0.0;
+    float leftEntropy = 0.0;
+    float rightEntropy = 0.0;
     map<string, int>::iterator leftitr;
     for (leftitr = leftLabelCount.begin(); leftitr != leftLabelCount.end(); ++leftitr) {
-        double probability = (double) leftitr->second / (double) leftCount;
+        float probability = (float) leftitr->second / (float) leftCount;
         if (probability > 0) {
             leftEntropy -= probability * log2(probability);
         }
@@ -251,14 +251,14 @@ calculateSplitEntropy(std::map<std::string, int> leftLabelCount, std::map<std::s
     map<string, int>::iterator rightitr;
     for (rightitr = rightLabelCount.begin(); rightitr != rightLabelCount.end(); ++rightitr) {
-        double probability = (double) rightitr->second / (double) rightCount;
+        float probability = (float) rightitr->second / (float) rightCount;
         if (probability > 0) {
             rightEntropy -= probability * log2(probability);
         }
     }
     //calculate split entropy by calculating entropy of both left and right data
-    double splitEntropy = (probabilityLeft * leftEntropy) + (probabilityRight * rightEntropy);
+    float splitEntropy = (probabilityLeft * leftEntropy) + (probabilityRight * rightEntropy);
     return splitEntropy;
 }
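As a worked check of the weighted average above: for left = {A:3, B:1} and right = {B:4}, leftEntropy ≈ 0.811, rightEntropy = 0, and with equal-sized sides splitEntropy ≈ 0.5 · 0.811 ≈ 0.406. A small usage sketch, assuming it links against the function in this diff:

#include <cstdio>
#include <map>
#include <string>

// Declaration matching the header in this commit.
float calculateSplitEntropy(std::map<std::string, int> leftLabelCount,
                            std::map<std::string, int> rightLabelCount,
                            int leftCount, int rightCount);

int main() {
    std::map<std::string, int> left{{"A", 3}, {"B", 1}};   // entropy ~0.811
    std::map<std::string, int> right{{"B", 4}};            // pure side, entropy 0
    std::printf("%f\n", calculateSplitEntropy(left, right, 4, 4)); // ~0.405639
    return 0;
}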
@@ -346,13 +346,13 @@ sortDataByFeature(int featIdx, vector <vector<string>> &data, vector <pair<int,
 //featureWeight = feature selection weight for random feature selection
 //nodeDatasetIndices = indices of data entries to use
 // Returns BestSplitPoint, best split feature index and best split feature value
-BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, vector <vector<string>> &data,
+BestSplitPoint findBestSplit(float parentEntropy, int currentDepth, vector <vector<string>> &data,
                              vector <FeatureType> featureTypes, float featureWeight, vector<int> &nodeDatasetIndices) {
     vector<int> randomFeatures = randomSelect_WithoutReplacement(data.size() - 1, featureWeight);
     int bestFeatureIndex = randomFeatures[0];
     //start with maximum entropy
-    double minEntropy = 99999;
+    float minEntropy = 99999;
     string bestSplitValue = "";
@@ -411,7 +411,7 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, vector <vec
             if (indx == featureData.size()) {
                 continue;
             }
-            double splitEntropy = calculateSplitEntropy(leftLabelCount, rightLabelCount, leftSize, rightSize);
+            float splitEntropy = calculateSplitEntropy(leftLabelCount, rightLabelCount, leftSize, rightSize);
             if (splitEntropy < minEntropy) {
                 // cout<<"Best split at "<< featureIndex <<" value "<<threshold<<" Entropy "<< splitEntropy<<endl;
@@ -433,7 +433,7 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, vector <vec
         for (splitItr = uniqueValues.begin(); splitItr != uniqueValues.end(); splitItr++) {
             FeatureSplitDataIndx featSplitData = splitData(data, featureIndex, featureTypes, (*splitItr),
                                                            nodeDatasetIndices);
-            double splitEntropy = (double) calculateSplitEntropy(featSplitData, data);
+            float splitEntropy = (float) calculateSplitEntropy(featSplitData, data);
             if (splitEntropy < minEntropy) {
                 minEntropy = splitEntropy;
@@ -460,10 +460,10 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, vector <vec
 //data = a two dimensional vector of strings, dataset divided first in columns then in rows
 //indices = indices of data entries to use
 // Returns both classification and entropy of data entries
-std::pair<string, double> classifyWithEntropy(vector <vector<string>> &data, vector<int> &indices) {
+std::pair<string, float> classifyWithEntropy(vector <vector<string>> &data, vector<int> &indices) {
    auto start = high_resolution_clock::now();
    std::map<std::string, int> dataCount;
-   double entropy = 0.0;
+   float entropy = 0.0;
    int maxVote = 0;
    string label;
    //iterate through all data entries in the node
@@ -478,7 +478,7 @@ std::pair<string, double> classifyWithEntropy(vector <vector<string>> &data, vec
     }
     map<string, int>::iterator itr;
     for (itr = dataCount.begin(); itr != dataCount.end(); ++itr) {
-        double probability = (double) itr->second / (double) indices.size();
+        float probability = (float) itr->second / (float) indices.size();
         if (probability > 0) {
             entropy -= (probability) * log2(probability);
@@ -491,7 +491,7 @@ std::pair<string, double> classifyWithEntropy(vector <vector<string>> &data, vec
     }
-    std::pair<string, double> classificationWithEntropy(label, entropy);
+    std::pair<string, float> classificationWithEntropy(label, entropy);
     return classificationWithEntropy;
 }
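Worked example of the pair this function builds: for eight entries labeled {yes: 6, no: 2}, the majority label is "yes" and the entropy is -(0.75 · log2 0.75 + 0.25 · log2 0.25) ≈ 0.811, so the function returns ("yes", 0.811).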
@@ -569,7 +569,7 @@ vector<int> oversample(vector <vector<string>> &data, vector<int> &indices) {
     while (difference > 0) {
         if (difference < labelSize) {
             vector<int> selection = randSelectIdxWithReplacement(idxs.size(),
-                                                                 (double(difference) / double(labelSize)));
+                                                                 (float(difference) / float(labelSize)));
             for (int selectionIdx:selection) {
                 toAdd.push_back(idxs.at(selectionIdx));
             }
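For scale on the ratio above: if the minority label is short by difference = 3 entries and currently has labelSize = 10, the call draws roughly 30% of that label's indices with replacement, topping the class up toward balance.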
@@ -30,15 +30,15 @@ vector <string> parseFeatures(string fileName);
 vector<int> randSelectIdxWithoutReplacement(int originalNum, float percent);
 vector<int> randSelectIdxWithReplacement(int originalNum, float percent);
 vector<int> splitTrainingAndTesting(vector<int> trainingIndices, vector <vector<string>> &dataString);
-std::pair<string,double> classifyWithEntropy(vector<vector<string>> &data, vector<int> &indices);
+std::pair<string,float> classifyWithEntropy(vector<vector<string>> &data, vector<int> &indices);
 FeatureSplitDataIndx splitData(vector<vector<string>>& data, int splitFeature,vector<FeatureType> featureTypes, string splitValue, vector<int> &nodeDatasetIndices );
 float calculateEntropy(vector <vector<string>>& data, vector<int> indices) ;
-double calculateSplitEntropy(std::map<std::string, int> leftLabelCount, std::map<std::string, int> rightLabelCount, int leftCount,int rightCount);
+float calculateSplitEntropy(std::map<std::string, int> leftLabelCount, std::map<std::string, int> rightLabelCount, int leftCount,int rightCount);
 float calculateSplitEntropy (FeatureSplitDataIndx featsplitData, vector<vector<string>> &data);
 vector <int> bootstrapData(vector <int> &indices, float baggingWeight);
 vector<int> randomSelect_WithoutReplacement(int originalNum, float percentTraining);
 vector<int> oversample(vector<vector<string>> &data, vector<int> &indices);
-BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, vector <vector<string>> &data,
+BestSplitPoint findBestSplit(float parentEntropy, int currentDepth, vector <vector<string>> &data,
                              vector <FeatureType> featureType, float featureWeight, vector<int>& nodeDatasetIndices );
 void cleanTree(Node *node);
@@ -55,10 +55,15 @@ int main(int argc, char *argv[]) {
     //get forest accuracy
     cout << "********************* Forest accuracy *****************" << endl;
     accuracyReport report = randomForest->getAccuracy(datasetAsString, testingIdxs);
+    //prints accuracy to console
+    randomForest->printAccuracyReport(report);
+    //uncomment the following line to print accuracy into file accuracy-report.txt
+    // randomForest->printAccuracyReportFile(report);
+    //free up memory
     for (DecisionTree *tree : randomForest->trees) {
         cleanTree(tree->root);
@@ -8,7 +8,7 @@ enum FeatureType {
 };
 struct accuracyReport{
-    double accuracy;
+    float accuracy;
     std::map<std::string, int> correctlabels;
     std::map<std::string, int> incorrectlabels;
     int correct;