Commit 955e75c9 authored by sgebreeg's avatar sgebreeg
Browse files

code clean up

parent 27425f7f
......@@ -45,10 +45,10 @@ Node *train(vector <vector<string>> data, vector <FeatureType> featureType,
} else {
BestSplitPoint bestSplit = findBestSplitTrial(parentEntropy, currentDepth, data,
//find best split point
BestSplitPoint bestSplit = findBestSplit(parentEntropy, currentDepth, data,
featureType, featureWeight);
//cout << "------ best split index "<<bestSplit.featureIdx<<endl;
if(bestSplit.featureIdx == -1 || bestSplit.featureIdx > data.size()-1 ){
Node *leaf = new Node(NULL, NULL, NULL, true, classification, originalEntropy, informationGainFromParent);
// cout<<"No more split"<<endl;
......
......@@ -311,7 +311,7 @@ sortDataByFeature(int featIdx, vector <vector<string>> data, vector <pair<int, s
});
}
BestSplitPoint findBestSplitTrial(double parentEntropy, int currentDepth, vector <vector<string>> data,
BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, vector <vector<string>> data,
vector <FeatureType> featureTypes, float featureWeight) {
vector<int> randomFeatures = randomSelect_WithoutReplacement(data.size(), featureWeight);
......@@ -388,117 +388,7 @@ BestSplitPoint findBestSplitTrial(double parentEntropy, int currentDepth, vector
return {bestFeatureIndex, bestSplitValue};
}
/**
 * Scans pre-computed candidate split values and returns the one with the lowest
 * split entropy.
 *
 * For a CONTINUOUS feature the candidates are visited in the order the set stores
 * them, and the scan of that feature stops as soon as a candidate's entropy is
 * higher than the previous candidate's. For any other feature type every
 * candidate is evaluated.
 *
 * @param parentEntropy   entropy of the node being split.
 * @param currentDepth    depth of the node; the parent-entropy check below is
 *                        skipped at depth 0.
 * @param potentialSplits candidate split values, keyed by feature index.
 * @param data            data handed to splitData() for every candidate.
 * @param featureTypes    type of each feature, indexed by feature index.
 * @return {featureIndex, splitValue} of the best candidate. The result is
 *         {-1, ""} when no candidate was evaluated, or when currentDepth != 0
 *         and parentEntropy is lower than the best split entropy found.
 */
BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, std::map<int, std::set<std::string>> potentialSplits,
                             std::vector<std::vector<std::string>> data,
                             std::vector<FeatureType> featureTypes) {
    float bestEntropy = 9999;  // sentinel used as the initial "worst" entropy
    // Start from the "no split" marker: previously this was left uninitialised and
    // an indeterminate value was returned when nothing was evaluated at depth 0.
    int bestSplitFeature = -1;
    std::string bestSplitValue;
    // NOTE(review): the first non-continuous candidate is accepted even if a
    // continuous candidate seen earlier had lower entropy — kept as-is, confirm intent.
    bool firstCategoricalSplit = true;

    for (const auto &candidate : potentialSplits) {
        const int featureIdx = candidate.first;
        bool firstSplitOfFeature = true;
        float featureEntropy = 0;  // always overwritten by the feature's first candidate before it is read

        for (const std::string &splitValue : candidate.second) {
            const FeatureSplitData parts = splitData(data, featureIdx, featureTypes, splitValue);
            float splitEntropy;

            if (featureTypes[featureIdx] == CONTINUOUS) {
                if (firstSplitOfFeature) {
                    firstSplitOfFeature = false;
                    splitEntropy = calculateSplitEntropy(parts);
                } else {
                    splitEntropy = calculateSplitEntropy({parts.dataTrue, parts.dataFalse});
                    // Entropy got worse than the previous candidate: give up on this feature.
                    if (!(featureEntropy >= splitEntropy)) {
                        break;
                    }
                }
                featureEntropy = splitEntropy;

                if (splitEntropy <= bestEntropy) {
                    bestEntropy = splitEntropy;
                    bestSplitFeature = featureIdx;
                    bestSplitValue = splitValue;
                }
            } else {
                splitEntropy = calculateSplitEntropy(parts);
                if (firstCategoricalSplit || splitEntropy <= bestEntropy) {
                    firstCategoricalSplit = false;
                    bestEntropy = splitEntropy;
                    bestSplitFeature = featureIdx;
                    bestSplitValue = splitValue;
                }
            }
        }
    }

    // Below the root, refuse a split that does not improve on the parent's entropy.
    if (currentDepth != 0 && parentEntropy < bestEntropy) {
        bestSplitFeature = -1;
        bestSplitValue = "";
    }

    return {bestSplitFeature, bestSplitValue};
}
/**
 * Reports whether every entry of the label row carries the same value.
 *
 * The labels are read from the last row of `data`.
 *
 * @param data rows of string values; only the last row is inspected.
 * @return true when the last row holds exactly one distinct value; false when it
 *         holds several distinct values, when it is empty, or when `data` itself
 *         is empty.
 */
bool dataIsPure(std::vector<std::vector<std::string>> data) {
    // Guard first: on an empty table data.size() - 1 wraps around and the row
    // lookup would read out of bounds.
    if (data.empty()) {
        return false;
    }

    const std::vector<std::string> &labels = data.back();
    if (labels.empty()) {
        return false;  // zero distinct labels is not treated as pure
    }

    // Pure means every label equals the first one; stop at the first mismatch.
    for (std::size_t i = 1; i < labels.size(); ++i) {
        if (labels[i] != labels.front()) {
            return false;
        }
    }
    return true;
}
string classifyData(vector <vector<string>> data) {
std::map<std::string, int> dataCount;
......@@ -563,75 +453,6 @@ std::pair<string, double> classifyWithEntropy(vector <vector<string>> data) {
return classificationWithEntropy;
}
std::map<int, set<string>>
findAllSplittingPoints(vector <vector<string>> data, vector <FeatureType> featureType, float featureWeight) {
auto start = high_resolution_clock::now();
std::map<int, set<string>> possibleSplits;
set <string> temp;
vector<int> index = randomSelect_WithoutReplacement(data.size(), featureWeight);
// vector<int> index (data.size());
// for ( int j = 0; j< data.size(); j++){
// index[j] = j;
// }
for (int i = 0; i < index.size(); i++) {
temp.clear();
if (index[i] != data.size() - 1) { //because the last entry is the label
// if (featureType[index[i]] == CATEGORICAL) {
for (int j = 0; j < data[index[i]].size(); j++) {
temp.insert(data[index[i]][j]);
}
possibleSplits[index[i]] = temp;
// } else if (featureType[index[i]] == CONTINUOUS) {
// vector <string> continuousData = data[index[i]];
// sort(continuousData.begin(), continuousData.end());
// //Unsupervised binning for continuous data
// int K = 100;
// if(stod(continuousData[continuousData.size() - 1]) == stod(continuousData[0])){
// continue;
// }
// else
// {
// double w = (stod(continuousData[continuousData.size() - 1]) - stod(continuousData[0])) / K;
//
//
// if (w) {
// for (int i = 0; i <= K; i++) {
// int splitter = stod(continuousData[0]) + (i * w);
// temp.insert(to_string(splitter));
//
// }
// }
//
//
//
//// cout<<"Size of unique data "<< continuousData.size()<<endl;
//
//// for (int j = 1; j < continuousData.size(); j++) {
//// string average = to_string((stod(continuousData[j - 1]) + stod(continuousData[j])) / 2);
//// temp.insert(average);
//// temp.insert(continuousData[j]);
//
//// }
//
//
// possibleSplits[index[i]] = temp;
//
// }
//
//
// }
}
}
double time = (high_resolution_clock::now() - start).count() / NANOSECONDS_IN_SECOND;
//cout << "Time for finding possible splits: "<<time <<"\n";
return possibleSplits;
}
vector <vector<string>> bagData(vector <vector<string>> data, float baggingWeight) {
vector <vector<string>> newData;
......
......@@ -29,19 +29,15 @@ vector<int> randSelectIdxWithoutReplacement(int originalNum, float percentTraini
// NOTE(review): the definitions of dataIsPure and findAllSplittingPoints treat the
// last row of `data` as the label row; the other helpers presumably share that
// layout — confirm against their definitions.
vector<int> randSelectIdxWithReplacement(int originalNum, float percent);
void splitTrainingAndTesting(vector<int> trainingIndecies,vector<vector<string>> dataString,
vector<vector<string>>& trainingDataString,vector<vector<string>>& testDataString);
// True when the last row of `data` contains exactly one distinct value.
bool dataIsPure(vector <vector<string>> data);
string classifyData(vector <vector<string>> data);
std::pair<string,double> classifyWithEntropy(vector<vector<string>> data);
// Maps each randomly selected feature index (the last row is excluded) to the set
// of distinct values found in that row of `data`. `featureType` is not used by
// the definition.
std::map<int,set<string>>
findAllSplittingPoints(vector <vector<string>> data, vector <FeatureType> featureType, float featureWeight);
// The returned FeatureSplitData exposes `dataTrue` and `dataFalse`; presumably the
// two sides of the split on `splitValue` — TODO confirm.
FeatureSplitData splitData(vector<vector<string>>data, int splitFeature,vector<FeatureType> featureTypes, string splitValue);
// Evaluates the candidates in `potentialSplits` with splitData/calculateSplitEntropy
// and returns the lowest-entropy one; the feature index is set to -1 when
// currentDepth != 0 and parentEntropy is lower than the best entropy found.
BestSplitPoint findBestSplit(double parentEntropy, int currentDepth,std::map<int, set<string>> potentialSplits, vector<vector<string>> data, vector<FeatureType> featureTypes);
float calculateEntropy(vector<vector<string>> data);
float calculateSplitEntropy (FeatureSplitData featsplitData);
vector<vector<string>> bagData(vector<vector<string>> data, float baggingWeight);
// The returned indices are used as feature (row) indices into `data` by the split finders.
vector<int> randomSelect_WithoutReplacement(int originalNum, float percentTraining);
vector<vector<string>> oversample(vector<vector<string>> data);
BestSplitPoint findBestSplitTrial(double parentEntropy, int currentDepth, vector <vector<string>> data,
BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, vector <vector<string>> data,
vector <FeatureType> featureType, float featureWeight);
#endif
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment