Commit 89427173 authored by sgebreeg

parallel training added

parent c89f8366
@@ -38,24 +38,11 @@ Node *train(vector <vector<string>> data, vector <FeatureType> featureType,
informationGainFromParent = parentEntropy - originalEntropy;
}
// cout << "Depth: " << currentDepth << endl;
if (currentDepth > maxDepth || originalEntropy == 0.0) {
// cout << "data is pure -ish" << endl;
Node *leaf = new Node(NULL, NULL, NULL, true, classification, originalEntropy, informationGainFromParent);
return leaf;
} else {
// cout << "data is not pure" << endl;
// if ((int) parentEntropy != 0 && originalEntropy > parentEntropy) { //There is no need to split the data here
// Node *leaf = new Node(NULL, NULL, NULL, true, classification, originalEntropy, informationGainFromParent);
// // cout<<"No more split"<<endl;
// return leaf;
// }
// cout<<"Finding splits"<<endl;
//create a random subspace
@@ -69,7 +56,7 @@ Node *train(vector <vector<string>> data, vector <FeatureType> featureType,
featureType);
//cout << "------ best split index "<<bestSplit.featureIdx<<endl;
if(bestSplit.featureIdx == -1){
if(bestSplit.featureIdx == -1 || bestSplit.featureIdx > data.size()-1 ){
Node *leaf = new Node(NULL, NULL, NULL, true, classification, originalEntropy, informationGainFromParent);
// cout<<"No more split"<<endl;
return leaf;
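A note on the widened guard above: bestSplit.featureIdx is a signed int while data.size() - 1 is an unsigned size_t, so the comparison converts the int to unsigned. Because the == -1 test short-circuits first (and any other negative index converts to a huge value and still falls into the leaf branch), the guard behaves as intended, but an explicit form avoids relying on that conversion. A minimal sketch; the helper name isValidFeatureIndex is illustrative and not part of this repo:

#include <cstddef>
#include <string>
#include <vector>

// Hypothetical helper: true when a signed feature index addresses a row of the
// feature-major data matrix. Checking featureIdx >= 0 before casting sidesteps
// the signed/unsigned comparison.
bool isValidFeatureIndex(int featureIdx, const std::vector<std::vector<std::string>> &data) {
    return featureIdx >= 0 && static_cast<std::size_t>(featureIdx) < data.size();
}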
@@ -113,9 +100,6 @@ string DecisionTree::predictSingle(vector <string> test, Node *treeRoot, Predict
return treeRoot->classification;
}
Question *question = treeRoot->question;
// cout << to_string(question->splitFeatureIndex) <<
// " Classified as " << treeRoot->classification <<
// " with infoGain " << treeRoot->informationGainFromParent << endl;
int splitIndex = question->splitFeatureIndex;
std::pair<int, double> featureInfoGain(question->splitFeatureIndex,
@@ -133,7 +117,7 @@ string DecisionTree::predictSingle(vector <string> test, Node *treeRoot, Predict
answer = treeRoot->falseBranch;
}
} else {
if (test[splitIndex] >= splitValue) {
if (stod(test[splitIndex]) <= stod(splitValue)) {
answer = treeRoot->trueBranch;
} else {
answer = treeRoot->falseBranch;
@@ -153,9 +137,6 @@ string DecisionTree::predictSingle(vector <string> test, Node *treeRoot) {
return treeRoot->classification;
}
Question *question = treeRoot->question;
// cout << to_string(question->splitFeatureIndex) <<
// " Classified as " << treeRoot->classification <<
// " with infoGain " << treeRoot->informationGainFromParent << endl;
int splitIndex = question->splitFeatureIndex;
std::pair<int, double> featureInfoGain(question->splitFeatureIndex,
@@ -172,7 +153,7 @@ string DecisionTree::predictSingle(vector <string> test, Node *treeRoot) {
answer = treeRoot->falseBranch;
}
} else {
if ( stod(test[splitIndex]) >= stod(splitValue)) {
if ( stod(test[splitIndex]) <= stod(splitValue)) {
answer = treeRoot->trueBranch;
} else {
answer = treeRoot->falseBranch;
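Both predictSingle overloads now parse the feature value and the split value with stod before comparing (and the test direction changes to <=, matching the same change in splitData further down). The old string comparison ordered values lexicographically, which misranks multi-digit numbers. A minimal stand-alone illustration of the difference:

#include <iostream>
#include <string>

int main() {
    std::string a = "9", b = "10";
    std::cout << std::boolalpha
              << (a >= b) << "\n"                          // true: '9' > '1', so "9" sorts after "10"
              << (std::stod(a) >= std::stod(b)) << "\n";   // false: 9 < 10 numerically
    return 0;
}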
@@ -10,7 +10,7 @@ RACEOBJS = $(patsubst %.cpp, %.o, $(RACESOURCES))
all: race
race: $(RACEOBJS)
$(CC) $(FLAGS) $^ -o $@
$(CC) $(FLAGS) $^ -o $@ -pthread
%.o: %.cpp
$(CC) $(FLAGS) -c -o $@ $<
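The Makefile change appends -pthread to the link line, which GCC and Clang need once the program uses std::thread (as the new RandomForest constructor does); without it the link can fail or the first std::thread construction can throw std::system_error at runtime. A minimal check, assuming a scratch file named threadcheck.cpp (the file name is illustrative):

// Build with:  g++ -std=c++11 threadcheck.cpp -o threadcheck -pthread
#include <iostream>
#include <thread>

int main() {
    std::thread t([] { std::cout << "hello from a worker thread\n"; });
    t.join();   // join (or detach) before the std::thread object is destroyed
    return 0;
}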
@@ -6,33 +6,79 @@
#include <map>
#include <vector>
#include <typeinfo>
#include <pthread.h>
using namespace std::chrono;
vector<int> getParts(int trees, int cpus) {
vector<int> temp;
if (cpus > trees) {
for (int i = 0; i < cpus; i++) {
temp.push_back(1);
}
} else if (trees % cpus == 0) {
for (int i = 0; i < cpus; i++)
temp.push_back(trees / cpus);
} else {
// The first cpus - (trees % cpus) parts get trees / cpus trees each;
// the remaining trees % cpus parts get trees / cpus + 1.
int zp = cpus - (trees % cpus);
int pp = trees / cpus;
for (int i = 0; i < cpus; i++) {
if (i >= zp)
temp.push_back(pp + 1);
else
temp.push_back(pp);
}
}
return temp;
}
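getParts divides numTrees across the available cores: when the division is even every part gets trees / cpus, otherwise the last trees % cpus parts get one extra tree. The hypothetical driver below (it assumes getParts from this file is visible) prints the resulting partitions; note that the cpus > trees branch currently pushes a 1 for every core, so the forest would train cpus trees rather than the numTrees that were requested:

// Hypothetical driver, not part of the repo; assumes getParts() is declared above.
#include <iostream>
#include <vector>

int main() {
    for (int n : getParts(10, 4)) std::cout << n << ' ';   // prints: 2 2 3 3
    std::cout << '\n';
    for (int n : getParts(3, 8)) std::cout << n << ' ';    // prints: 1 1 1 1 1 1 1 1  (8 parts for 3 requested trees)
    std::cout << '\n';
    // One possible guard for the cpus > trees case (an assumption, not what this
    // commit does): temp.push_back(i < trees ? 1 : 0);
    return 0;
}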
RandomForest::RandomForest(vector <vector<string>> trainingData, vector <FeatureType> featureTypes, int numTrees,
float baggingWeight, float featureWeight, int maxDepth) {
vector < DecisionTree * > decisionTrees;
this->featureWeight = featureWeight;
this->depth = maxDepth;
for (int treeidx = 0; treeidx < numTrees; treeidx++) {
auto start = high_resolution_clock::now();
cout << "------ Training Tree " << treeidx << "\n";
vector <vector<string>> baggedData = bagData(trainingData, baggingWeight);
DecisionTree *tree = new DecisionTree(baggedData, maxDepth, featureWeight, featureTypes);
decisionTrees.push_back(tree);
cout << "------ Finished Training Tree " << treeidx << "\n";
double time = (high_resolution_clock::now() - start).count() / 1000000000.0;
unsigned num_cpus = std::thread::hardware_concurrency();
// A mutex serializes pushes into the shared decisionTrees vector.
std::mutex iomutex;
std::vector <std::thread> threads(num_cpus);
cout << "Launching " << num_cpus << " jobs for training trees.\n";
vector<int> temp = getParts(numTrees, num_cpus); //determine how many trees to run in parallel
for (int i = 0; i < num_cpus; i++) {
if (i < temp.size())
threads[i] = std::thread([&iomutex, i, temp, trainingData,baggingWeight,
maxDepth, featureWeight, featureTypes, &decisionTrees] {
for (int j = 0; j < temp.at(i); j++) {
vector <vector<string>> baggedData = bagData(trainingData, baggingWeight);
cout<<"Training tree "<< j<<" in thread "<<i<<endl;
DecisionTree *tree = new DecisionTree(baggedData, maxDepth, featureWeight, featureTypes);
cout<<"Done training tree "<< j<<" in thread "<<i<<endl;
{
// Use a lexical scope and lock_guard so the mutex is held only for
// the duration of the vector push.
std::lock_guard <std::mutex> iolock(iomutex);
decisionTrees.push_back(tree);
}
}
ofstream outfile;
outfile.open("timer.txt", ios::app);
outfile<< "------ Training Tree " << treeidx << " ";
outfile<< time<<"\n\n";
outfile.close();
});
}
for (auto& t : threads) {
t.join();
}
this->trees = decisionTrees;
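The constructor now replaces the sequential training loop with one worker thread per hardware core: getParts decides how many trees each thread builds, every thread bags and trains its trees independently, and only the push into the shared decisionTrees vector is serialized through the lock_guard. The self-contained sketch below reproduces the same fan-out / lock / join pattern with a stand-in work function (trainOneTree is hypothetical; the real code constructs a DecisionTree):

#include <algorithm>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

// Hypothetical stand-in for training one decision tree.
int trainOneTree(int id) { return id * id; }

int main() {
    const unsigned numTrees = 8;
    const unsigned numCpus  = std::max(1u, std::thread::hardware_concurrency());

    std::mutex resultsMutex;            // guards the shared results vector
    std::vector<int> results;
    std::vector<std::thread> workers;

    for (unsigned c = 0; c < numCpus; ++c) {
        workers.emplace_back([c, numCpus, numTrees, &resultsMutex, &results] {
            // Thread c handles trees c, c + numCpus, c + 2*numCpus, ...
            // (a strided partition; the commit uses getParts() instead).
            for (unsigned t = c; t < numTrees; t += numCpus) {
                int model = trainOneTree(static_cast<int>(t));  // unsynchronized work
                std::lock_guard<std::mutex> lock(resultsMutex); // only the push is serialized
                results.push_back(model);
            }
        });
    }
    for (auto &w : workers) w.join();
    std::cout << "trained " << results.size() << " trees\n";
    return 0;
}

One design note on the actual lambda above: it captures trainingData, featureTypes, and temp by value, so every worker holds its own copy of the training set in addition to the bagged sample it builds; for large datasets a const-reference capture would avoid the per-thread copy at the cost of shared read access.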
@@ -151,13 +197,13 @@ vector <pair<int, double>> sort(map<int, double> &M, vector <string> test, vecto
return A;
}
vector <string> getBatchPrediction(vector<vector<string>> testData, RandomForest *forest){
vector <string> getBatchPrediction(vector <vector<string>> testData, RandomForest *forest) {
vector<string> predictions;
vector <string> predictions;
for(int testIndex = 0; testIndex < testData.at(0).size(); testIndex++){
for (int testIndex = 0; testIndex < testData.at(0).size(); testIndex++) {
map<string, int> votes;
vector<string> test;
vector <string> test;
string emptystring;
for (int featIndex = 0; featIndex < testData.size(); featIndex++) {
@@ -169,7 +215,7 @@ vector <string> getBatchPrediction(vector<vector<string>> testData, RandomForest
}
//Get a prediction from every tree in the forest
for(int treeIdx =0; treeIdx < forest->trees.size();treeIdx++){
for (int treeIdx = 0; treeIdx < forest->trees.size(); treeIdx++) {
DecisionTree *tree = forest->trees[treeIdx];
string prediction = tree->predictSingle(test, tree->root);
if (votes.count(prediction)) {
@@ -181,7 +227,7 @@ vector <string> getBatchPrediction(vector<vector<string>> testData, RandomForest
}
}
int maxVote = 0;
string label;
//get highest voted label
@@ -192,100 +238,95 @@ vector <string> getBatchPrediction(vector<vector<string>> testData, RandomForest
label = itr->first;
}
}
predictions.push_back(label);
}
return predictions;
}
accuracyReport RandomForest::getAccuracy(vector<vector<string>> testData){
vector<string> predictions = getBatchPrediction(testData,this);
vector<string> labels = testData.at(testData.size()-1);
accuracyReport RandomForest::getAccuracy(vector <vector<string>> testData) {
vector <string> predictions = getBatchPrediction(testData, this);
vector <string> labels = testData.at(testData.size() - 1);
std::map<std::string, int> incorrectLables;
std::map<std::string, int> correctLables;
double accuracy;
int correct = 0;
int incorrect = 0 ;
int incorrect = 0;
int total = predictions.size();
if(predictions.size() == labels.size()){
for (int itr =0; itr < total; itr ++){
if(predictions[itr] == labels[itr]){
if (predictions.size() == labels.size()) {
for (int itr = 0; itr < total; itr++) {
if (predictions[itr] == labels[itr]) {
correct += 1;
if(correctLables.count(labels[itr])){
if (correctLables.count(labels[itr])) {
correctLables[labels[itr]] += 1;
} else {
correctLables[labels[itr]] = 1;
}
else
{
correctLables[labels[itr]] = 1;
}
}
else
{
} else {
incorrect += 1;
if(incorrectLables.count(labels[itr])){
if (incorrectLables.count(labels[itr])) {
incorrectLables[labels[itr]] += 1;
} else {
incorrectLables[labels[itr]] = 1;
}
else
{
incorrectLables[labels[itr]] = 1;
}
}
}
accuracy = ((double)correct/(double)total) *100;
}
else
{
cerr << "Predictions and lables are not equal" <<endl;
accuracy = ((double) correct / (double) total) * 100;
} else {
cerr << "Predictions and lables are not equal" << endl;
}
accuracyReport report = {accuracy,correctLables,incorrectLables,correct,incorrect,total};
accuracyReport report = {accuracy, correctLables, incorrectLables, correct, incorrect, total};
return report;
}
void RandomForest::printAccuracyReport(accuracyReport report){
cout << "Testing accuracy for forest with "<< this->trees.size() << " trees depth "<< this->depth << " and feature selection weight " << this->featureWeight<< endl;
cout << "Total tested data is " <<report.total<<endl;
cout << "The accuracy of the tree is "<<report.accuracy <<"% ";
cout << " with "<< report.correct << " correct predictions ";
cout << " and "<<report.incorrect << " incorrect predictions "<<endl;
void RandomForest::printAccuracyReport(accuracyReport report) {
cout << "Testing accuracy for forest with " << this->trees.size() << " trees depth " << this->depth
<< " and feature selection weight " << this->featureWeight << endl;
cout << "Total tested data is " << report.total << endl;
cout << "The accuracy of the tree is " << report.accuracy << "% ";
cout << " with " << report.correct << " correct predictions ";
cout << " and " << report.incorrect << " incorrect predictions " << endl;
map<string, int>::iterator citr;
for (citr = report.correctLables.begin(); citr != report.correctLables.end(); ++citr) {
cout << "Lable " << citr->first << " was predicted right " << citr->second << " times \n";
cout << "Lable " << citr->first << " was predicted right " << citr->second << " times \n";
}
map<string, int>::iterator itr;
for (itr = report.incorrectLables.begin(); itr != report.incorrectLables.end(); ++itr) {
cout << "Lable " << itr->first << " was predicted wrong " << itr->second << " times \n";
cout << "Lable " << itr->first << " was predicted wrong " << itr->second << " times \n";
}
}
void RandomForest::printAccuracyReportFile(accuracyReport report){
void RandomForest::printAccuracyReportFile(accuracyReport report) {
ofstream outfile;
outfile.open("basoutput.txt", ios::app);
outfile << "---------- Report--------------" << "\n";
outfile << "Testing accuracy for forest with "<< this->trees.size() << " trees depth "<< this->depth << " and feature selection weight " << this->featureWeight<< endl;
outfile << "Total tested data is " <<report.total<<endl;
outfile << "The accuracy of the tree is "<<report.accuracy <<"% ";
outfile << " with "<< report.correct << " correct predictions ";
outfile << " and "<<report.incorrect << " incorrect predictions"<<endl;
outfile << "Testing accuracy for forest with " << this->trees.size() << " trees depth " << this->depth
<< " and feature selection weight " << this->featureWeight << endl;
outfile << "Total tested data is " << report.total << endl;
outfile << "The accuracy of the tree is " << report.accuracy << "% ";
outfile << " with " << report.correct << " correct predictions ";
outfile << " and " << report.incorrect << " incorrect predictions" << endl;
map<string, int>::iterator citr;
for (citr = report.correctLables.begin(); citr != report.correctLables.end(); ++citr) {
outfile << "Lable " << citr->first << " was predicted right " << citr->second << " times \n";
outfile << "Lable " << citr->first << " was predicted right " << citr->second << " times \n";
}
map<string, int>::iterator itr;
for (itr = report.incorrectLables.begin(); itr != report.incorrectLables.end(); ++itr) {
outfile<< "Lable " << itr->first << " was predicted wrong " << itr->second << " times \n";
outfile << "Lable " << itr->first << " was predicted wrong " << itr->second << " times \n";
}
outfile.close();
}
@@ -199,12 +199,9 @@ float calculateEntropy(vector <vector<string>> data) {
for (itr = dataCount.begin(); itr != dataCount.end(); ++itr) {
float probability = (float) itr->second / (float) data[data.size() - 1].size();
if (probability > 0) {
entropy -= (probability) * log2(probability);
entropy -= probability * log2(probability);
}
}
if(entropy == 1){
//cout << "size with one entropy "<< dataCount.size()<<endl;
}
return entropy;
}
@@ -212,14 +209,13 @@ float calculateSplitEntropy(FeatureSplitData featsplitData) {
vector <vector<string>> splitDataTrue = featsplitData.dataTrue;
vector <vector<string>> splitDataFalse = featsplitData.dataFalse;
float toatalData = splitDataTrue.at(0).size() + splitDataFalse.at(0).size();
float probabilityDataTrue = (float) splitDataTrue.at(0).size() / toatalData;
float probabilityDataFalse = (float) splitDataFalse.at(0).size() / toatalData;
float totalData = splitDataTrue.at(0).size() + splitDataFalse.at(0).size();
float probabilityDataTrue = (float) splitDataTrue.at(0).size() / totalData;
float probabilityDataFalse = (float) splitDataFalse.at(0).size() / totalData;
float splitEntropy = (probabilityDataTrue * calculateEntropy(splitDataTrue)) +
(probabilityDataFalse * calculateEntropy(splitDataFalse));
//cout << "Split Entropy "<<splitEntropy<<endl;
return splitEntropy;
}
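For reference, calculateEntropy is the Shannon entropy of the label row, H = -sum_i p_i * log2(p_i), and calculateSplitEntropy is the size-weighted average of the two children's entropies; train() records informationGainFromParent as the parent's entropy minus the node's own entropy, and findBestSplit picks the split whose weighted child entropy is lowest. A small worked check, assuming calculateEntropy, calculateSplitEntropy, and FeatureSplitData from this file are visible, the feature-major layout with labels in the last row, and the same two-member braced initialization that findBestSplit uses below:

// Hypothetical sanity check, not part of the repo.
#include <iostream>
#include <string>
#include <vector>

int main() {
    // Parent node with four "yes" and four "no" labels: entropy = 1 bit.
    std::vector<std::vector<std::string>> parent =
            {{"yes", "yes", "yes", "yes", "no", "no", "no", "no"}};
    // A perfect split isolates the classes, so each child has entropy 0.
    std::vector<std::vector<std::string>> leftChild  = {{"yes", "yes", "yes", "yes"}};
    std::vector<std::vector<std::string>> rightChild = {{"no", "no", "no", "no"}};

    float parentEntropy = calculateEntropy(parent);                        // ~1.0
    float splitEntropy  = calculateSplitEntropy({leftChild, rightChild});  // 0.5*0 + 0.5*0 = 0
    std::cout << "information gain = " << parentEntropy - splitEntropy << "\n";  // ~1.0
    return 0;
}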
@@ -255,7 +251,7 @@ splitData(vector <vector<string>> data, int splitFeature, vector <FeatureType> f
}
} else {
for (int dataIdx = 0; dataIdx < splitFeatureData.size(); dataIdx++) {
if (stod(splitFeatureData[dataIdx]) >= stod(splitValue)) {
if (stod(splitFeatureData[dataIdx]) <= stod(splitValue)) {
for (int featureIdx = 0; featureIdx < data.size(); featureIdx++) {
splitTrue.at(featureIdx).push_back(data.at(featureIdx).at(dataIdx));
}
@@ -274,7 +270,8 @@ splitData(vector <vector<string>> data, int splitFeature, vector <FeatureType> f
return featSplitData;
}
BestSplitPoint findBestSplit(double parentEntropy, int currentDepth,std::map<int, set<string>> potentialSplits, vector <vector<string>> data,
BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, std::map<int, set<string>> potentialSplits,
vector <vector<string>> data,
vector <FeatureType> featureTypes) {
auto start = high_resolution_clock::now();
float entropy = 9999;
@@ -284,77 +281,84 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth,std::map<int
map < int, set < string >> ::iterator
itr;
vector <vector<string>> dataTrue;
vector <vector<string>> dataFalse;
for (itr = potentialSplits.begin(); itr != potentialSplits.end(); ++itr) {
if (itr->second.size() == 0){
cout << itr->first << " " << itr->second.size() << endl;
cout << "feature types "<<featureTypes[itr->first]<<endl;
}
dataTrue.clear();
dataFalse.clear();
float localEntropy;
bool firstsplit = true;
set <string> splits = itr->second;
if (splits.size() > 0) {
set<string>::iterator splitItr;
for (splitItr = splits.begin(); splitItr != splits.end(); splitItr++) {
FeatureSplitData featSplitData = splitData(data, itr->first, featureTypes, (*splitItr));
if (featureTypes[itr->first] == CONTINUOUS) {
float splitEntropy = calculateSplitEntropy(featSplitData);
if (featureTypes[itr->first] ==CONTINUOUS) {
if(firstsplit){
float splitEntropy;
if (firstsplit) {
firstsplit = false;
FeatureSplitData featSplitData = splitData(data, itr->first, featureTypes, (*splitItr));
splitEntropy = calculateSplitEntropy(featSplitData);
localEntropy = splitEntropy;
}
else{
if (localEntropy > splitEntropy) {
dataTrue = featSplitData.dataTrue;
dataFalse = featSplitData.dataFalse;
} else {
FeatureSplitData featSplitData = splitData(data, itr->first, featureTypes, (*splitItr));
//TODO maybe let's re-evaluate this?
// cout<<"data true before insert "<<dataTrue.size()<<endl;
// dataTrue.insert(dataTrue.end(), featSplitData.dataTrue.begin(), featSplitData.dataTrue.end());
// for(int x=0; x<dataTrue.size(); x++){
// for(int y=0; y<featSplitData.dataTrue.at(0).size(); y++){
// dataTrue.at(x).push_back(featSplitData.dataTrue.at(x).at(y));
// }
// }
// dataFalse = featSplitData.dataFalse;
splitEntropy = calculateSplitEntropy({featSplitData.dataTrue, featSplitData.dataFalse});
if(localEntropy >= splitEntropy){
localEntropy = splitEntropy;
if(splitEntropy ==1){
//cout << "It is one \n";
}
}
else
{
else{
break;
}
}
}
if (first_iteration || splitEntropy <= entropy) {
first_iteration = false;
entropy = splitEntropy;
// cout<<itr->first<<" ";
bestSplitFeature = itr->first;
bestSplitValue = (*splitItr);
if (splitEntropy <= entropy) {
entropy = splitEntropy;
bestSplitFeature = itr->first;
bestSplitValue = (*splitItr);
}
} else {
FeatureSplitData featSplitData = splitData(data, itr->first, featureTypes, (*splitItr));
float splitEntropy = calculateSplitEntropy(featSplitData);
if (first_iteration || splitEntropy <= entropy) {
first_iteration = false;
entropy = splitEntropy;
bestSplitFeature = itr->first;
bestSplitValue = (*splitItr);
}
}
}
}
}
//cout <<"!!!!!!!!!! "<< bestSplitValue << endl;
if(currentDepth!=0 && parentEntropy<entropy){
if (currentDepth != 0 && parentEntropy < entropy) {
bestSplitFeature = -1;
bestSplitValue = "";
}
if(bestSplitFeature>15 || bestSplitFeature <-1){
map < int, set < string >> ::iterator
itr;
for (itr = potentialSplits.begin(); itr != potentialSplits.end(); ++itr) {
cout << "------------- "<<endl;
cout << itr->first << " " << itr->second.size() << endl;
}
}
BestSplitPoint splitPoint = {bestSplitFeature, bestSplitValue};
//cout << "best split point "<< bestSplitFeature << " " << bestSplitValue << endl;
double time = (high_resolution_clock::now() - start).count() / NANOSECONDS_IN_SECOND;
//cout << "Time to find best split point "<< time << "\n";
return splitPoint;
}
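For continuous features, findBestSplit now tracks a running localEntropy over the candidate thresholds and breaks out of the inner loop as soon as the split entropy stops improving, on the assumption that it will not improve again for later candidates; note also that the candidates come from a std::set<std::string>, so they are visited in lexicographic rather than numeric order. The stand-alone sketch below (not the repo's function) shows the same minimum-entropy threshold search with that early break, over thresholds that are already in numeric order:

// Illustrative threshold search over one numeric feature with two classes.
#include <cmath>
#include <iostream>
#include <limits>
#include <utility>
#include <vector>

// Entropy of a two-class count pair.
double binaryEntropy(int pos, int neg) {
    double n = pos + neg, h = 0.0;
    if (pos > 0) { double p = pos / n; h -= p * std::log2(p); }
    if (neg > 0) { double p = neg / n; h -= p * std::log2(p); }
    return h;
}

int main() {
    // (value, label) pairs, already sorted by value.
    std::vector<std::pair<double, int>> rows = {{1, 0}, {2, 0}, {3, 0}, {4, 1}, {5, 1}, {6, 1}};
    std::vector<double> thresholds = {1.5, 2.5, 3.5, 4.5, 5.5};

    double bestEntropy = std::numeric_limits<double>::max();
    double bestThreshold = thresholds.front();
    for (double t : thresholds) {
        int posL = 0, negL = 0, posR = 0, negR = 0;
        for (const auto &r : rows) {
            if (r.first <= t) (r.second ? posL : negL)++;
            else              (r.second ? posR : negR)++;
        }
        double nL = posL + negL, nR = posR + negR, n = nL + nR;
        double e = (nL / n) * binaryEntropy(posL, negL) + (nR / n) * binaryEntropy(posR, negR);
        if (e <= bestEntropy) { bestEntropy = e; bestThreshold = t; }
        else break;  // split entropy started rising: stop early, as the code above does
    }
    std::cout << "best threshold " << bestThreshold << " (entropy " << bestEntropy << ")\n";
    // prints: best threshold 3.5 (entropy 0)
    return 0;
}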
@@ -452,80 +456,74 @@ findAllSplittingPoints(vector <vector<string>> data, vector <FeatureType> featur
vector<int> index = randomSelect_WithoutReplacement(data.size(), featureWeight);
// vector<int> index (data.size());
// for ( int j = 0; j< data.size(); j++){
// index[j] = j;
// }
for (int i = 0; i < index.size(); i++) {
temp.clear();
if (index[i] != data.size() - 1) { //because the last entry is the label
if (featureType[index[i]] == CATEGORICAL) {
for (int j = 0; j < data[index[i]].size(); j++) {
temp.insert(data[index[i]][j]);
}
possibleSplits[index[i]] = temp;
// cout<<temp.size()<<" of feature Categorical "<< index[i]<<endl;
} else if (featureType[index[i]] == CONTINUOUS) {
vector <string> continuousData = data[index[i]];
sort(continuousData.begin(), continuousData.end());
//Unsupervised binning for continuous data
int K = 100;
if(stod(continuousData[continuousData.size() - 1]) == stod(continuousData[0])){
continue;
}
else
{
double w = (stod(continuousData[continuousData.size() - 1]) - stod(continuousData[0])) / K;
if (w) {
for (int i = 0; i <= K; i++) {
int splitter = stod(continuousData[0]) + (i * w);
temp.insert(to_string(splitter));
}
}
// cout<<"Size of unique data "<< continuousData.size()<<endl;
// for (int j = 1; j < continuousData.size(); j++) {
// string average = to_string((stod(continuousData[j - 1]) + stod(continuousData[j])) / 2);
// temp.insert(average);
// temp.insert(continuousData[j]);
// }
possibleSplits[index[i]] = temp;
if(temp.size() == 0){
cout<< "Size is zerooooo"<<endl;
}
}
// cout<<temp.size()<<" of feature Continuous "<< index[i]<<endl;
// if (featureType[index[i]] == CATEGORICAL) {
for (int j = 0; j < data[index[i]].size(); j++) {
temp.insert(data[index[i]][j]);
}
possibleSplits[index[i]] = temp;
// } else if (featureType[index[i]] == CONTINUOUS) {
// vector <string> continuousData = data[index[i]];
// sort(continuousData.begin(), continuousData.end());
// //Unsupervised binning for continuous data
// int K = 100;
// if(stod(continuousData[continuousData.size() - 1]) == stod(continuousData[0])){
// continue;