Commit 72646146 authored by mccrabb's avatar mccrabb
Browse files

PIM implemented for training

parent 95eaf21a
continuous,categorical,continuous,categorical,continuous,categorical,categorical,categorical,categorical,categorical,continuous,continuous,continuous,categorical
continuous,categorical,continuous,categorical,continuous,categorical,categorical,categorical,categorical,categorical,continuous,continuous,continuous,categorical,categorical
......@@ -5,6 +5,7 @@
#include <vector>
#include <string>
#include <map>
#include <limits>
#include "Node.hpp"
#include "Question.hpp"
......@@ -17,15 +18,46 @@ DecisionTree::DecisionTree(vector <vector<string>> &data, vector<int> &trainingI
this->featureWeight = featureWeight;
vector<Path> * tempVec = new vector<Path>();
this->paths = tempVec;
//cout << "Path Size: " << to_string(paths->size());
//TODO int<vector> take all iteration
this->root = train(data, featureType, 0.0, 0, maxDepth, featureWeight, trainingIndx, paths);
// dataMinsAndMaxes maps each feature index to that feature's value constraints:
// continuous: the min and max values seen for the feature (stored as strings)
// categorical: the set of distinct categories seen for the feature
unordered_map<int, vector<string>> dataMinsAndMaxes;
float thisMin, thisMax;
for (int feat = 0; feat < featureType.size(); feat++){
if (featureType.at(feat) == CONTINUOUS) {
thisMin = stof(data.at(feat).at(0));
thisMax = stof(data.at(feat).at(0));
for (int entryIdx = 0; entryIdx < data.at(feat).size(); entryIdx++ ) {
thisMin = min(stof(data.at(feat).at(entryIdx)), thisMin);
thisMax = max(stof(data.at(feat).at(entryIdx)), thisMax);
}
vector<string> tempStringVector{to_string(thisMin), to_string(thisMax)};
dataMinsAndMaxes.emplace(feat, tempStringVector);
} else if (featureType.at(feat) == CATEGORICAL) {
unordered_set<string> tempCats;
for (int entryIdx = 0; entryIdx < data.at(feat).size(); entryIdx++ ) {
tempCats.insert(data.at(feat).at(entryIdx));
}
vector<string> tempStringVector;
for (auto thisCat : tempCats) {
tempStringVector.emplace_back(thisCat);
}
dataMinsAndMaxes.emplace(feat, tempStringVector);
}
}
auto trainStart = high_resolution_clock::now();
this->root = train(data, dataMinsAndMaxes, featureType, 0.0, 0, maxDepth, featureWeight, trainingIndx, paths);
auto trainStop = high_resolution_clock::now();
cout << "Train Time: " << to_string(duration_cast<microseconds>(trainStop - trainStart).count()) << endl;
assert(paths->size() >= 2);
// this->printTree(this->root, 0);
};
Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
double parentEntropy, int currentDepth, int maxDepth, float featureWeight,
vector<int> nodeDatasetIndices, vector<Path> * paths) {
Node *train(vector <vector<string>> &data,
unordered_map<int, vector<string>> dataMinsAndMaxes,
vector <FeatureType> &featureType, double parentEntropy, int currentDepth,
int maxDepth, float featureWeight, vector<int> nodeDatasetIndices,
vector<Path> * paths) {
//TODO pass data pointer and index vector
std::pair<string, double> classificationAndEntropy = classifyWithEntropy(data, nodeDatasetIndices);
......@@ -51,9 +83,11 @@ Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
//TODO send data vector of index
//find best split point
BestSplitPoint bestSplit = findBestSplit(parentEntropy, currentDepth, data,
featureType, featureWeight, nodeDatasetIndices);
featureType, featureWeight,
nodeDatasetIndices, dataMinsAndMaxes);
if (bestSplit.featureIdx == -1 || bestSplit.featureIdx > data.size() - 1) {
//cout << "(!) Best Feature == -1" << endl;
Node *leaf = new Node(NULL, NULL, NULL, nodeDatasetIndices.size(), true, classification, originalEntropy, informationGainFromParent);
leaf->pathIndices.emplace_back(paths->size());
paths->emplace_back(Path(paths->size(), emptyPathConditions));
......@@ -64,17 +98,13 @@ Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
//split data
//TODO send data and index vector
//TODO return indices for left and right
cout << "--FSDI--: " << to_string(nodeDatasetIndices.size()) << endl;
//cout << "Split: Feat=" << to_string(bestSplit.featureIdx) << " / value=" << bestSplit.splitpoint << endl;
FeatureSplitDataIndx featureSplitData = splitData(data, bestSplit.featureIdx, featureType,
bestSplit.splitpoint, nodeDatasetIndices);
cout << "--FSDI--: " << to_string(featureSplitData.dataTrue.size()) << "|"
<< to_string(featureSplitData.dataFalse.size()) << endl;
//No longer splittable
if (featureSplitData.dataTrue.size() < 1 || featureSplitData.dataFalse.size() < 1) {
//cout << "(!) all data on same side of condition" << endl;
Node *leaf = new Node(NULL, NULL, NULL, nodeDatasetIndices.size(), true, classification, originalEntropy, informationGainFromParent);
leaf->pathIndices.emplace_back(paths->size());
paths->emplace_back(Path(paths->size(), emptyPathConditions));
......@@ -88,9 +118,9 @@ Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
//call train for left and right data
//TODO pass int vector from splits
Node *leftNode = train(data, featureType, originalEntropy, currentDepth + 1, maxDepth,
Node *leftNode = train(data, dataMinsAndMaxes, featureType, originalEntropy, currentDepth + 1, maxDepth,
featureWeight, featureSplitData.dataTrue, paths);
Node *rightNode = train(data, featureType, originalEntropy, currentDepth + 1, maxDepth,
Node *rightNode = train(data, dataMinsAndMaxes, featureType, originalEntropy, currentDepth + 1, maxDepth,
featureWeight, featureSplitData.dataFalse, paths);
......@@ -103,12 +133,10 @@ Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
node->pathIndices.insert(node->pathIndices.end(),
rightNode->pathIndices.begin(),
rightNode->pathIndices.end());
cout << "pathIndicies L Size: " << to_string(leftNode->pathIndices.size()) << endl;
cout << "pathIndicies R Size: " << to_string(rightNode->pathIndices.size()) << endl;
cout << "pathIndicies Size: " << to_string(node->pathIndices.size()) << endl;
//cout << "pathIndicies L Size: " << to_string(leftNode->pathIndices.size()) << endl;
//cout << "pathIndicies R Size: " << to_string(rightNode->pathIndices.size()) << endl;
//cout << "pathIndicies Size: " << to_string(node->pathIndices.size()) << endl;
for (int idx = 0; idx < node->pathIndices.size(); idx++) {
cout << "--FSDI--e: " << to_string(featureSplitData.dataTrue.size()) << "|"
<< to_string(featureSplitData.dataFalse.size()) << endl;
paths->at(node->pathIndices.at(idx)).pathConditionVector.emplace_back(
PathCondition(bestSplit.featureIdx,
bestSplit.splitpoint,
......@@ -117,7 +145,6 @@ Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
(idx < leftNode->pathIndices.size()) ?
featureSplitData.dataTrue.size() :
featureSplitData.dataFalse.size() ) );
cout << "--FSDI/PC: " << to_string(paths->at(node->pathIndices.at(idx)).pathConditionVector.back().trainingCount) << endl;
}
return node;
}
......
......@@ -32,7 +32,9 @@ private:
};
Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
Node *train(vector <vector<string>> &data,
unordered_map<int, vector<string>> dataMinsAndMaxes,
vector <FeatureType> &featureType,
double entropy, int currentDepth, int maxDepth, float featureWeight,
vector<int> nodeDatasetIndices, vector<Path>* paths);
......
No preview for this file type
......@@ -101,7 +101,7 @@ RandomForest::RandomForest(vector <vector<string>> &data, vector<int> &trainingI
cout << (tempPC.at(condIdx).leq ? "<=" : ">")
<< tempPC.at(condIdx).splitValue << "," << flush;
else
cout << "\"" << tempPC.at(condIdx).splitValue << "\"" << flush;
cout << "\"" << tempPC.at(condIdx).splitValue << "\"," << flush;
cout << to_string(tempPC.at(condIdx).trainingCount) << ")" << flush;
}
cout << endl;
......
No preview for this file type
......@@ -404,3 +404,161 @@
3 3 0.447214 0.7 29.2929 0.00476495
------ Report ------
3 3 0.258199 0.7 23.3604 0.440066
------ Report ------
3 3 0.447214 0.7 28.2828 0.0225976
------ Report ------
3 3 0.447214 0.7 29.2929 0.021092
------ Report ------
3 3 0.447214 0.7 32.3232 0.0290484
------ Report ------
1 3 0.447214 0.7 32.3232 0.0130981
------ Report ------
1 3 0.447214 0.7 14.1414 0.00900185
------ Report ------
1 3 0.447214 0.7 19.1919 0.0158988
------ Report ------
3 3 0.447214 0.7 24.2424 0.0382358
------ Report ------
5 3 0.447214 0.7 11.1111 0.0340896
------ Report ------
5 3 0.447214 0.7 24.2424 0.0551786
------ Report ------
5 3 0.447214 0.7 20.202 0.0487771
------ Report ------
5 3 0.447214 0.7 24.2424 0.0913152
------ Report ------
5 3 0.447214 0.7 23.2323 0.0562915
------ Report ------
5 3 0.447214 0.7 14.1414 0.0650814
------ Report ------
5 3 0.447214 0.7 11.1111 0.0318738
------ Report ------
5 3 0.447214 0.7 9.09091 0.0316227
------ Report ------
5 3 0.447214 0.7 20.202 0.0540896
------ Report ------
5 3 0.447214 0.7 19.1919 0.0368526
------ Report ------
5 3 0.447214 0.7 18.1818 0.0390601
------ Report ------
5 3 0.447214 0.7 23.2323 0.0448191
------ Report ------
5 3 0.447214 0.7 14.1414 0.0378914
------ Report ------
10 3 0.447214 0.7 24.2424 0.131557
------ Report ------
10 3 0.447214 0.7 18.1818 0.107642
------ Report ------
10 3 0.447214 0.7 10.101 0.124079
------ Report ------
10 3 0.447214 0.7 12.1212 0.0957903
------ Report ------
10 3 0.447214 0.7 27.2727 0.0979522
------ Report ------
10 3 0.447214 0.7 21.2121 0.104765
------ Report ------
10 3 0.447214 0.7 17.1717 0.133699
------ Report ------
10 3 0.447214 0.7 20.202 0.0891907
------ Report ------
10 3 0.447214 0.7 24.2424 0.131753
------ Report ------
10 3 0.447214 0.7 22.2222 0.114073
------ Report ------
10 3 0.447214 0.7 24.2424 0.119861
------ Report ------
10 3 0.447214 0.7 10.101 0.0897593
------ Report ------
1 3 0.447214 0.7 24.2424 0.0159847
------ Report ------
1 3 0.447214 0.7 28.2828 0.0164335
------ Report ------
1 3 0.447214 0.7 27.2727 0.01611
------ Report ------
1 3 0.447214 0.7 29.2929 0.0121766
------ Report ------
1 3 0.447214 0.7 23.2323 0.0166609
------ Report ------
1 3 0.447214 0.7 24.2424 0.0149495
------ Report ------
1 3 0.447214 0.7 26.2626 0.0159854
------ Report ------
1 3 0.447214 0.7 23.2323 0.0146134
------ Report ------
1 3 0.447214 0.7 25.2525 0.0167769
------ Report ------
1 3 0.447214 0.7 23.2323 0.0114175
------ Report ------
1 3 0.447214 0.7 26.2626 0.0106759
------ Report ------
1 3 0.447214 0.7 31.3131 0.0131831
------ Report ------
1 3 0.447214 0.7 24.2424 0.0118592
------ Report ------
1 3 0.447214 0.7 29.2929 0.0134454
------ Report ------
1 3 0.447214 0.7 20.202 0.0126252
------ Report ------
1 3 0.447214 0.7 31.3131 0.0274278
------ Report ------
1 3 0.447214 0.7 31.3131 0.0229685
------ Report ------
1 3 0.447214 0.7 27.2727 0.0298843
------ Report ------
1 3 0.447214 0.7 31.3131 0.0311799
------ Report ------
1 3 0.447214 0.7 30.303 0.0199681
------ Report ------
1 3 0.447214 0.7 24.2424 0.0325232
------ Report ------
1 3 0.447214 0.7 32.3232 0.01984
------ Report ------
1 3 0.447214 0.7 32.3232 0.018665
------ Report ------
1 3 0.447214 0.7 29.2929 0.019095
------ Report ------
1 3 0.447214 0.7 21.2121 0.0230168
------ Report ------
3 3 0.447214 0.7 28.2828 0.047687
------ Report ------
3 3 0.447214 0.7 30.303 0.0564906
------ Report ------
5 3 0.447214 0.7 32.3232 0.0852136
------ Report ------
9 3 0.447214 0.7 31.3131 0.154293
------ Report ------
9 3 0.447214 0.7 32.3232 0.12969
------ Report ------
9 3 0.447214 0.7 32.3232 0.109408
------ Report ------
99 3 0.447214 0.7 30.303 1.89003
------ Report ------
99 3 0.447214 0.7 32.3232 1.81639
------ Report ------
1 3 0.258199 0.7 24.7196 1.76445
------ Report ------
1 3 1 0.7 31.3131 0.0305578
------ Report ------
1 3 1 0.7 24.8743 2.88538
------ Report ------
3 3 1 0.7 25.8688 4.94117
------ Report ------
3 3 1 0.7 25.9186 4.47059
------ Report ------
3 3 1 0.7 26.239 4.62393
------ Report ------
3 3 1 0.7 26.0843 4.66885
------ Report ------
3 3 1 0.7 25.7583 4.62924
------ Report ------
3 3 1 0.7 25.8688 4.38307
------ Report ------
3 3 1 0.7 20.1061 4.32778
------ Report ------
3 3 1 0.7 30.303 0.0300562
------ Report ------
25 3 1 0.7 25.8357 8.98491
------ Report ------
21 3 1 0.7 25.9462 7.53338
------ Report ------
3 3 1 0.7 25.8136 4.61247
......@@ -8,15 +8,20 @@
#include <sstream>
#include <limits>
#include <random>
#include <bitset>
#include <bits/stdc++.h>
#include <boost/dynamic_bitset.hpp>
#include "helpers.hpp"
#include "DecisionTree.hpp"
#include "Node.hpp"
using namespace std::chrono;
using namespace boost;
#define NANOSECONDS_IN_SECOND 1000000000.0
int global_PimTime;
vector <FeatureType> parseFeatureTypes(string fileName) {
ifstream fIn;
fIn.open(fileName, ifstream::in);
......@@ -211,7 +216,10 @@ calSplitEntropy(std::map<std::string, int> leftLabelCount, std::map<std::string,
double totalData = leftCount + rightCount;
double probabilityRight = rightCount / totalData;
double probabilityLeft = leftCount / totalData;
//cout << "LLCsize=" << to_string(leftLabelCount.size())
// << " RLCsize=" << to_string(rightLabelCount.size())
// << " Lcount=" << to_string(rightCount)
// << " Rcount=" << to_string(leftCount) << endl;
double leftEntropy = 0.0;
double rightEntropy = 0.0;
......@@ -230,7 +238,10 @@ calSplitEntropy(std::map<std::string, int> leftLabelCount, std::map<std::string,
rightEntropy -= probability * log2(probability);
}
}
//cout << "ProbL=" << to_string(probabilityLeft)
// << " entL=" << to_string(leftEntropy)
// << " ProbR=" << to_string(probabilityRight)
// << " entR=" << to_string(rightEntropy) << endl;
double splitEntropy = (probabilityLeft * leftEntropy) + (probabilityRight * rightEntropy);
return splitEntropy;
......@@ -281,9 +292,11 @@ splitData(vector <vector<string>> &data, int splitFeature, vector <FeatureType>
}
}
} else {
} else if (featureTypes.at(splitFeature) == CONTINUOUS) {
for (int i: nodeDatasetIndices) {
try {
//cout << "Continuous." << endl;
//cout << "splitData: " << data.at(splitFeature).at(i) << "-" << splitValue << endl;
if (stod(data.at(splitFeature).at(i)) <= stod(splitValue)) {
splitTrue.push_back(i);
} else {
......@@ -297,8 +310,6 @@ splitData(vector <vector<string>> &data, int splitFeature, vector <FeatureType>
}
}
}
//TODO change to vector of int check
......@@ -319,18 +330,35 @@ sortDataByFeature(int featIdx, vector <vector<string>> &data, vector <pair<int,
}
BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, vector <vector<string>> &data,
vector <FeatureType> featureTypes, float featureWeight, vector<int> &nodeDatasetIndices) {
//TODO accept data and vector of index check
vector <FeatureType> featureTypes, float featureWeight, vector<int> &nodeDatasetIndices,
unordered_map<int, vector<string>> dataMinsAndMaxes) {
// Select subset of features to test
vector<int> randomFeatures = randomSelect_WithoutReplacement(data.size(), featureWeight);
assert(randomFeatures.size() > 0);
// General setup
int bestFeatureIndex = randomFeatures[0];
double minEntropy = 99999;
double thisMinEntropy;
string bestSplitValue = "";
string thisBestSplitValue;
// Create starting/original bitset
boost::dynamic_bitset<> originalBitset (data.at(0).size());
for (int entry : nodeDatasetIndices) {
//cout << to_string(entry) << endl;
originalBitset[entry] = true;
}
/**
for (int i = 0; i < data.at(0).size(); i++) {
cout << originalBitset[i] << endl;
}
**/
/**
// BEGIN PIM TEST
cout << "PIM TEST" << endl;
int pimOutputLength = data.at(0).size(); // TEMPORARY! CHANGE TO "data.at(0).size()""
int pimOutputLength = data.at(0).size();
bool pimOutputArray[pimOutputLength] = {true};
for (int entryIdx = 1; entryIdx < pimOutputLength; entryIdx++) {
pimOutputArray[entryIdx] = true;
......@@ -356,35 +384,146 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, vector <vec
}
cout << "PIM TEST DONE" << endl;
// END PIM TEST
**/
for (auto featureIndex: randomFeatures) {
if (featureIndex != data.size() - 1) {//because last column is label
boost::dynamic_bitset<> lowerPimOutputArray, upperPimOutputArray;
boost::dynamic_bitset<> pimOutputArray;
std::map<std::string, int> leftLabelCount;
std::map<std::string, int> rightLabelCount;
int leftSize, rightSize; // size of left/rightLabelCounts.
//cout << "FEATURE " << to_string(featureIndex)
// << " (" << (featureTypes.at(featureIndex) == CONTINUOUS ? "CON" : "CAT") << ")" << endl;
if (featureTypes.at(featureIndex) == CONTINUOUS) {
//initialize variables
string threshold = "";
int dataIndex;
std::map<std::string, int> leftLabelCount;
std::map<std::string, int> rightLabelCount;
//count right side labels
for (int i : nodeDatasetIndices) { //TODO check
if (rightLabelCount.count(data[data.size() - 1][i])) {
rightLabelCount[data[data.size() - 1][i]] += 1;
} else {
rightLabelCount[data[data.size() - 1][i]] = 1;
}
}
int leftSize = 0;
int rightSize = nodeDatasetIndices.size(); //TODO check
vector <pair<int, string>> featureData;
featureData.reserve(nodeDatasetIndices.size()); //TODO check
//done initializing variables
// Loop for binary search for best split values
float binSearchLowerBound = stof(dataMinsAndMaxes.at(featureIndex).at(0));
float binSearchUpperBound = stof(dataMinsAndMaxes.at(featureIndex).at(1));
float binSearchMean, lowerSplitValue, upperSplitValue;
float lowerEntropy = 1.1; // Just need to start as different values
float upperEntropy = 1.2;
float continuousEntropy;
//cout << "Lower/Upper Bounds: " << to_string(binSearchLowerBound)
// << " / " << to_string(binSearchUpperBound) << endl;
// While the upper and lower binary search sections have
// meaningfully different entropy values...
while(abs(lowerEntropy-upperEntropy) > 0.001) {
//cout << "." << originalBitset.count() << endl;
// Get value to split on for this iteration
binSearchMean = (binSearchLowerBound + binSearchUpperBound) / 2;
lowerSplitValue = (binSearchLowerBound + binSearchMean) / 2;
upperSplitValue = (binSearchUpperBound + binSearchMean) / 2;
// -- LOWER BINARY SEARCH HALF --
// PIM Function (Lower binary half)
//cout << "ENTER pimCompare" << endl;
lowerPimOutputArray = pimCompare(data.at(featureIndex), CONTINUOUS, to_string(lowerSplitValue));
//cout << "EXIT pimCompare" << endl;
//for (int b = 0; b < lowerPimOutputArray.size(); b++) {cout << lowerPimOutputArray[b] << endl;}
// Split L & R to calculate entropy
lowerPimOutputArray &= originalBitset;
//for (int b = 0; b < lowerPimOutputArray.size(); b++) {cout << lowerPimOutputArray[b] << endl;}
for (int entryIdx = 0; entryIdx < originalBitset.size(); entryIdx++) {
if (originalBitset[entryIdx]) {
// output bit remains true (<= lower mean)
if (lowerPimOutputArray[entryIdx]) {
// class mapping exists
if (leftLabelCount.count(data[data.size() - 1][entryIdx])) {
leftLabelCount[data[data.size() - 1][entryIdx]] += 1;
} else { // class mapping doesn't exist yet
leftLabelCount[data[data.size() - 1][entryIdx]] = 1;
}
} else { // output bit is now false (> lower mean)
// class mapping exists
if (rightLabelCount.count(data[data.size() - 1][entryIdx])) {
rightLabelCount[data[data.size() - 1][entryIdx]] += 1;
} else { // class mapping doesn't exist yet
rightLabelCount[data[data.size() - 1][entryIdx]] = 1;
}
}
}
}
// Accum. left and right label counts
leftSize = 0; rightSize = 0;
for (auto leftIter : leftLabelCount) {
leftSize += leftIter.second;
}
for (auto rightIter : rightLabelCount) {
rightSize += rightIter.second;
}
// Calculate entropy
lowerEntropy = calSplitEntropy(leftLabelCount, rightLabelCount, leftSize, rightSize);
// -- UPPER BINARY SEARCH HALF --
// PIM Function (Upper binary half)
upperPimOutputArray = pimCompare(data.at(featureIndex), CONTINUOUS, to_string(upperSplitValue));
//for (int b = 0; b < upperPimOutputArray.size(); b++) {cout << upperPimOutputArray[b] << endl;}
// Split L & R to calculate entropy
upperPimOutputArray &= originalBitset;
//for (int b = 0; b < upperPimOutputArray.size(); b++) {cout << upperPimOutputArray[b] << endl;}
for (int entryIdx = 0; entryIdx < originalBitset.size(); entryIdx++) {
if (originalBitset[entryIdx]) {
// output bit remains true (<= upper mean)
if (upperPimOutputArray[entryIdx]) {
// class mapping exists
if (leftLabelCount.count(data[data.size() - 1][entryIdx])) {
leftLabelCount[data[data.size() - 1][entryIdx]] += 1;
} else { // class mapping doesn't exist yet
leftLabelCount[data[data.size() - 1][entryIdx]] = 1;
}
} else { // output bit is now false (> Upper mean)
// class mapping exists
if (rightLabelCount.count(data[data.size() - 1][entryIdx])) {
rightLabelCount[data[data.size() - 1][entryIdx]] += 1;
} else { // class mapping doesn't exist yet
rightLabelCount[data[data.size() - 1][entryIdx]] = 1;
}
}
}
}
// Accum. left and right label counts
leftSize = 0; rightSize = 0;
for (auto leftIter : leftLabelCount) {
leftSize += leftIter.second;
}
for (auto rightIter : rightLabelCount) {
rightSize += rightIter.second;
}
// Calculate entropy
upperEntropy = calSplitEntropy(leftLabelCount, rightLabelCount, leftSize, rightSize);
// -- END BINARY SEARCH HALVES --
// Save and prep for next loop
thisMinEntropy = min(lowerEntropy, upperEntropy);
if (lowerEntropy <= upperEntropy) {
//cout << "LOWER WIN" << endl;
binSearchUpperBound = binSearchMean;
thisBestSplitValue = to_string(lowerSplitValue);
} else {
//cout << "UPPER WIN" << endl;
binSearchLowerBound = binSearchMean;
thisBestSplitValue = to_string(upperSplitValue);
}
/**
cout << "lower/upper Entropies: " << to_string(lowerEntropy)
<< " / " << to_string(upperEntropy) << endl;
cout << "Lower/Upper Bounds: " << to_string(binSearchLowerBound)
<< " / " << to_string(binSearchUpperBound) << endl;
cout << "thisBestSplitValue: " << bestSplitValue << endl;
cout << "minEntropy " << to_string(minEntropy) << endl;
**/
if (thisMinEntropy < minEntropy) {
minEntropy = thisMinEntropy;
bestFeatureIndex = featureIndex;
bestSplitValue = thisBestSplitValue;
}
}
/**
//sort data with selected feature
sortDataByFeature(featureIndex, data, featureData, nodeDatasetIndices); //TODO check
......@@ -423,33 +562,62 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, vector <vec
}
**/
// CATEGORICAL
} else {
set <string> uniqueValues;
for (int i: nodeDatasetIndices) {
uniqueValues.insert(data[featureIndex][i]);
}
set<string>::iterator splitItr;
for (splitItr = uniqueValues.begin(); splitItr != uniqueValues.end(); splitItr++) {
FeatureSplitDataIndx featSplitData = splitData(data, featureIndex, featureTypes, (*splitItr),
nodeDatasetIndices);
double splitEntropy = (double) calculateSplitEntropy(featSplitData, data);
for (string thisCat : dataMinsAndMaxes.at(featureIndex)) {
pimOutputArray = pimCompare(data.at(featureIndex), CATEGORICAL, thisCat);
pimOutputArray &= originalBitset;
// LEFT = matches thisCat. RIGHT = doesn't match thisCat.
for (int entryIdx = 0; entryIdx < originalBitset.size(); entryIdx++) {
if (originalBitset[entryIdx]) {
// output bit remains true (matches thisCat)
if (pimOutputArray[entryIdx]) {
// class mapping exists
if (leftLabelCount.count(data[data.size() - 1][entryIdx])) {
leftLabelCount[data[data.size() - 1][entryIdx]] += 1;
} else { // class mapping doesn't exist yet
leftLabelCount[data[data.size() - 1][entryIdx]] = 1;
}
} else { // output bit is now false (doesn't match thisCat)
// class mapping exists
if (rightLabelCount.count(data[data.size() - 1][entryIdx])) {
rightLabelCount[data[data.size() - 1][entryIdx]] += 1;