Commit 27425f7f authored by sgebreeg's avatar sgebreeg
Browse files

revisions on finding best splitting point

parent dea77701
......@@ -44,16 +44,9 @@ Node *train(vector <vector<string>> data, vector <FeatureType> featureType,
} else {
// cout<<"Finding splits"<<endl;
//create a random subspace
//find possible splits
std::map<int, set<string>> potentialSplits = findAllSplittingPoints(data, featureType, featureWeight);
// cout<<"Finding best split"<<endl;
//find best split
BestSplitPoint bestSplit = findBestSplit(parentEntropy, currentDepth, potentialSplits, data,
featureType);
BestSplitPoint bestSplit = findBestSplitTrial(parentEntropy, currentDepth, data,
featureType, featureWeight);
//cout << "------ best split index "<<bestSplit.featureIdx<<endl;
if(bestSplit.featureIdx == -1 || bestSplit.featureIdx > data.size()-1 ){
......
......@@ -205,6 +205,37 @@ float calculateEntropy(vector <vector<string>> data) {
return entropy;
}
/**
 * Weighted entropy of a binary split.
 *
 * @param leftLabelCount  label -> count histogram of the samples routed left
 * @param rightLabelCount label -> count histogram of the samples routed right
 * @param leftCount       total number of samples on the left side
 * @param rightCount      total number of samples on the right side
 * @return p_left * H(left) + p_right * H(right); 0.0 for an empty split
 *         (the original divided 0/0 and produced NaN when both sides were empty)
 */
double
calSplitEntropy(std::map<std::string, int> leftLabelCount, std::map<std::string, int> rightLabelCount, int leftCount,
                int rightCount) {
    const double totalData = leftCount + rightCount;
    if (totalData <= 0.0) {
        return 0.0; // guard: an empty split carries no entropy (avoids 0/0 -> NaN)
    }

    // Shannon entropy of one side, given its label histogram and its size.
    auto sideEntropy = [](const std::map<std::string, int> &labelCount, int count) {
        double entropy = 0.0;
        for (const auto &entry : labelCount) {
            double probability = (double) entry.second / (double) count;
            if (probability > 0) {
                entropy -= probability * std::log2(probability);
            }
        }
        return entropy;
    };

    const double probabilityLeft = leftCount / totalData;
    const double probabilityRight = rightCount / totalData;
    return probabilityLeft * sideEntropy(leftLabelCount, leftCount)
           + probabilityRight * sideEntropy(rightLabelCount, rightCount);
}
float calculateSplitEntropy(FeatureSplitData featsplitData) {
vector <vector<string>> splitDataTrue = featsplitData.dataTrue;
vector <vector<string>> splitDataFalse = featsplitData.dataFalse;
......@@ -270,6 +301,93 @@ splitData(vector <vector<string>> data, int splitFeature, vector <FeatureType> f
return featSplitData;
}
/**
 * Builds (originalIndex, value) pairs for feature featIdx and sorts them by value, ascending.
 *
 * data is feature-major: data[featIdx][dataIdx] is the value of feature featIdx for sample
 * dataIdx, and data[0].size() is the number of samples.
 *
 * @param featIdx     index of the feature row to sort by
 * @param data        the dataset; taken by const reference — it is only read, and the
 *                    original by-value parameter copied the whole dataset on every call
 * @param featureData output: pairs of (original sample index, feature value), sorted by value
 */
void
sortDataByFeature(int featIdx, const std::vector<std::vector<std::string>> &data,
                  std::vector<std::pair<int, std::string>> &featureData) {
    for (int dataIdx = 0; dataIdx < (int) data[0].size(); dataIdx++) {
        featureData.emplace_back(dataIdx, data[featIdx].at(dataIdx));
    }
    std::sort(featureData.begin(), featureData.end(),
              [](const std::pair<int, std::string> &a, const std::pair<int, std::string> &b) {
                  return a.second < b.second;
              });
}
/**
 * Finds the best (feature, threshold) split over a random subset of features by sorting each
 * candidate feature once and sweeping the threshold left-to-right, updating the left/right
 * label histograms incrementally.
 *
 * data is feature-major: data[f][i] is feature f of sample i; the LAST row holds the labels.
 *
 * Fixes vs. the previous revision:
 *  - labels were counted with the sorted sweep position `indx` instead of the original sample
 *    row `featureData[indx].first`, corrupting both histograms whenever sorting permuted rows;
 *  - `featureData[indx].first` was read after `indx++` without a bounds check, indexing one
 *    past the end on the final iteration (undefined behavior).
 *
 * @param parentEntropy entropy of the node being split; splits that do not improve on it are
 *                      rejected (except at depth 0)
 * @param currentDepth  depth of the node in the tree
 * @param data          feature-major dataset, last row = labels
 * @param featureTypes  per-feature types (unused here; kept for interface compatibility)
 * @param featureWeight fraction of features sampled for the random subspace
 * @return best split point, or featureIdx == -1 when no improving split exists
 */
BestSplitPoint findBestSplitTrial(double parentEntropy, int currentDepth, vector <vector<string>> data,
                                  vector <FeatureType> featureTypes, float featureWeight) {
    vector<int> randomFeatures = randomSelect_WithoutReplacement(data.size(), featureWeight);
    const int labelRow = (int) data.size() - 1;       // last row is the label column
    const vector <string> &labels = data[labelRow];

    int bestFeatureIndex = randomFeatures[0];
    double minEntropy = 99999;
    string bestSplitValue = "";

    for (auto featureIndex: randomFeatures) {
        if (featureIndex == labelRow) {
            continue; // never split on the label column itself
        }
        // Start with every sample on the right side.
        std::map<std::string, int> leftLabelCount;
        std::map<std::string, int> rightLabelCount;
        for (const string &label: labels) {
            rightLabelCount[label] += 1; // operator[] value-initializes missing keys to 0
        }
        int leftSize = 0;
        int rightSize = (int) labels.size();

        // Sort (original index, value) pairs by this feature's value.
        vector <pair<int, string>> featureData;
        featureData.reserve(labels.size());
        sortDataByFeature(featureIndex, data, featureData);

        size_t indx = 0;
        while (indx < featureData.size()) {
            const string threshold = featureData[indx].second;
            // Move every sample with value <= threshold from the right side to the left.
            while (indx < featureData.size() && featureData[indx].second <= threshold) {
                const int row = featureData[indx].first; // ORIGINAL sample index, not sweep position
                leftSize++;
                rightSize--;
                leftLabelCount[labels[row]] += 1;
                rightLabelCount[labels[row]] -= 1;
                indx++;
            }
            if (indx == featureData.size()) {
                break; // everything moved left: not a split, nothing to evaluate
            }
            double splitEntropy = calSplitEntropy(leftLabelCount, rightLabelCount, leftSize, rightSize);
            if (splitEntropy < minEntropy) {
                minEntropy = splitEntropy;
                bestFeatureIndex = featureIndex;
                bestSplitValue = threshold;
            }
        }
    }
    // Reject the split when it does not improve on the parent's entropy (root is exempt).
    if (minEntropy >= parentEntropy && currentDepth != 0) {
        bestFeatureIndex = -1;
        bestSplitValue = "";
    }
    return {bestFeatureIndex, bestSplitValue};
}
BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, std::map<int, set<string>> potentialSplits,
vector <vector<string>> data,
vector <FeatureType> featureTypes) {
......@@ -319,10 +437,9 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, std::map<in
// }
// dataFalse = featSplitData.dataFalse;
splitEntropy = calculateSplitEntropy({featSplitData.dataTrue, featSplitData.dataFalse});
if(localEntropy >= splitEntropy){
if (localEntropy >= splitEntropy) {
localEntropy = splitEntropy;
}
else{
} else {
break;
}
}
......
......@@ -41,6 +41,7 @@ float calculateSplitEntropy (FeatureSplitData featsplitData);
vector<vector<string>> bagData(vector<vector<string>> data, float baggingWeight);
vector<int> randomSelect_WithoutReplacement(int originalNum, float percentTraining);
vector<vector<string>> oversample(vector<vector<string>> data);
BestSplitPoint findBestSplitTrial(double parentEntropy, int currentDepth, vector <vector<string>> data,
vector <FeatureType> featureType, float featureWeight);
#endif
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment