Commit c89f8366 authored by ahmedaj
Browse files

fixed local entropy in best splits

parent 4651fe3b
...@@ -13,7 +13,7 @@ using namespace std; ...@@ -13,7 +13,7 @@ using namespace std;
DecisionTree::DecisionTree(vector <vector<string>> data, int maxDepth, float featureWeight, DecisionTree::DecisionTree(vector <vector<string>> data, int maxDepth, float featureWeight,
vector <FeatureType> featureType) { vector <FeatureType> featureType) {
// vector<int> index = randomSelect_WithoutReplacement(15, featureWeight); // vector<int> index = randomSelect_WithoutReplacement(data.size(), featureWeight);
// for(int i=0; i<index.size();i++){ // for(int i=0; i<index.size();i++){
// for(int j=1; j<data[index[i]].size(); j++){ // for(int j=1; j<data[index[i]].size(); j++){
// data[index[i]][j] = data[index[i]][0]; // data[index[i]][j] = data[index[i]][0];
...@@ -68,6 +68,7 @@ Node *train(vector <vector<string>> data, vector <FeatureType> featureType, ...@@ -68,6 +68,7 @@ Node *train(vector <vector<string>> data, vector <FeatureType> featureType,
BestSplitPoint bestSplit = findBestSplit(parentEntropy, currentDepth, potentialSplits, data, BestSplitPoint bestSplit = findBestSplit(parentEntropy, currentDepth, potentialSplits, data,
featureType); featureType);
//cout << "------ best split index "<<bestSplit.featureIdx<<endl;
if(bestSplit.featureIdx == -1){ if(bestSplit.featureIdx == -1){
Node *leaf = new Node(NULL, NULL, NULL, true, classification, originalEntropy, informationGainFromParent); Node *leaf = new Node(NULL, NULL, NULL, true, classification, originalEntropy, informationGainFromParent);
// cout<<"No more split"<<endl; // cout<<"No more split"<<endl;
...@@ -171,7 +172,7 @@ string DecisionTree::predictSingle(vector <string> test, Node *treeRoot) { ...@@ -171,7 +172,7 @@ string DecisionTree::predictSingle(vector <string> test, Node *treeRoot) {
answer = treeRoot->falseBranch; answer = treeRoot->falseBranch;
} }
} else { } else {
if (test[splitIndex] >= splitValue) { if ( stod(test[splitIndex]) >= stod(splitValue)) {
answer = treeRoot->trueBranch; answer = treeRoot->trueBranch;
} else { } else {
answer = treeRoot->falseBranch; answer = treeRoot->falseBranch;
......
...@@ -245,7 +245,6 @@ accuracyReport RandomForest::getAccuracy(vector<vector<string>> testData){ ...@@ -245,7 +245,6 @@ accuracyReport RandomForest::getAccuracy(vector<vector<string>> testData){
} }
accuracyReport report = {accuracy,correctLables,incorrectLables,correct,incorrect,total}; accuracyReport report = {accuracy,correctLables,incorrectLables,correct,incorrect,total};
cout << "here"<< endl;
return report; return report;
......
...@@ -202,6 +202,9 @@ float calculateEntropy(vector <vector<string>> data) { ...@@ -202,6 +202,9 @@ float calculateEntropy(vector <vector<string>> data) {
entropy -= (probability) * log2(probability); entropy -= (probability) * log2(probability);
} }
} }
if(entropy == 1){
//cout << "size with one entropy "<< dataCount.size()<<endl;
}
return entropy; return entropy;
} }
...@@ -215,7 +218,8 @@ float calculateSplitEntropy(FeatureSplitData featsplitData) { ...@@ -215,7 +218,8 @@ float calculateSplitEntropy(FeatureSplitData featsplitData) {
float splitEntropy = (probabilityDataTrue * calculateEntropy(splitDataTrue)) + float splitEntropy = (probabilityDataTrue * calculateEntropy(splitDataTrue)) +
(probabilityDataFalse * calculateEntropy(splitDataFalse)); (probabilityDataFalse * calculateEntropy(splitDataFalse));
//cout << "Split Entropy "<<splitEntropy<<endl;
return splitEntropy; return splitEntropy;
} }
...@@ -251,7 +255,7 @@ splitData(vector <vector<string>> data, int splitFeature, vector <FeatureType> f ...@@ -251,7 +255,7 @@ splitData(vector <vector<string>> data, int splitFeature, vector <FeatureType> f
} }
} else { } else {
for (int dataIdx = 0; dataIdx < splitFeatureData.size(); dataIdx++) { for (int dataIdx = 0; dataIdx < splitFeatureData.size(); dataIdx++) {
if (splitFeatureData[dataIdx] >= splitValue) { if (stod(splitFeatureData[dataIdx]) >= stod(splitValue)) {
for (int featureIdx = 0; featureIdx < data.size(); featureIdx++) { for (int featureIdx = 0; featureIdx < data.size(); featureIdx++) {
splitTrue.at(featureIdx).push_back(data.at(featureIdx).at(dataIdx)); splitTrue.at(featureIdx).push_back(data.at(featureIdx).at(dataIdx));
} }
...@@ -276,28 +280,48 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth,std::map<int ...@@ -276,28 +280,48 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth,std::map<int
float entropy = 9999; float entropy = 9999;
int bestSplitFeature; int bestSplitFeature;
string bestSplitValue; string bestSplitValue;
bool first_iteration = true;
map < int, set < string >> ::iterator map < int, set < string >> ::iterator
itr; itr;
for (itr = potentialSplits.begin(); itr != potentialSplits.end(); ++itr) { for (itr = potentialSplits.begin(); itr != potentialSplits.end(); ++itr) {
//cout << itr->first << " " << itr->second.size() << endl; if (itr->second.size() == 0){
float localEntropy = 1; cout << itr->first << " " << itr->second.size() << endl;
cout << "feature types "<<featureTypes[itr->first]<<endl;
}
float localEntropy;
bool firstsplit = true;;
set <string> splits = itr->second; set <string> splits = itr->second;
if (splits.size() > 0) { if (splits.size() > 0) {
set<string>::iterator splitItr; set<string>::iterator splitItr;
for (splitItr = splits.begin(); splitItr != splits.end(); splitItr++) { for (splitItr = splits.begin(); splitItr != splits.end(); splitItr++) {
// cout<<"Spliting "<<endl;
FeatureSplitData featSplitData = splitData(data, itr->first, featureTypes, (*splitItr)); FeatureSplitData featSplitData = splitData(data, itr->first, featureTypes, (*splitItr));
float splitEntropy = calculateSplitEntropy(featSplitData); float splitEntropy = calculateSplitEntropy(featSplitData);
if (featureTypes[itr->first] ==CONTINUOUS) { if (featureTypes[itr->first] ==CONTINUOUS) {
if (localEntropy > splitEntropy) { if(firstsplit){
firstsplit = false;
localEntropy = splitEntropy; localEntropy = splitEntropy;
} else {
break;
} }
else{
if (localEntropy > splitEntropy) {
localEntropy = splitEntropy;
if(splitEntropy ==1){
//cout << "It is one \n";
}
}
else
{
break;
}
}
} }
if (splitEntropy <= entropy) { if (first_iteration || splitEntropy <= entropy) {
first_iteration = false;
entropy = splitEntropy; entropy = splitEntropy;
// cout<<itr->first<<" "; // cout<<itr->first<<" ";
bestSplitFeature = itr->first; bestSplitFeature = itr->first;
...@@ -310,9 +334,21 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth,std::map<int ...@@ -310,9 +334,21 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth,std::map<int
} }
} }
//cout <<"!!!!!!!!!! "<< bestSplitValue << endl;
if(currentDepth!=0 && parentEntropy<entropy){ if(currentDepth!=0 && parentEntropy<entropy){
BestSplitPoint splitPoint = {-1, ""}; bestSplitFeature = -1;
return splitPoint; bestSplitValue = "";
}
if(bestSplitFeature>15 || bestSplitFeature <-1){
map < int, set < string >> ::iterator
itr;
for (itr = potentialSplits.begin(); itr != potentialSplits.end(); ++itr) {
cout << "------------- "<<endl;
cout << itr->first << " " << itr->second.size() << endl;
}
} }
BestSplitPoint splitPoint = {bestSplitFeature, bestSplitValue}; BestSplitPoint splitPoint = {bestSplitFeature, bestSplitValue};
...@@ -414,6 +450,11 @@ findAllSplittingPoints(vector <vector<string>> data, vector <FeatureType> featur ...@@ -414,6 +450,11 @@ findAllSplittingPoints(vector <vector<string>> data, vector <FeatureType> featur
set <string> temp; set <string> temp;
vector<int> index = randomSelect_WithoutReplacement(data.size(), featureWeight); vector<int> index = randomSelect_WithoutReplacement(data.size(), featureWeight);
// vector<int> index (data.size());
// for ( int j = 0; j< data.size(); j++){
// index[j] = j;
// }
for (int i = 0; i < index.size(); i++) { for (int i = 0; i < index.size(); i++) {
temp.clear(); temp.clear();
if (index[i] != data.size() - 1) { //because the last entry is the label if (index[i] != data.size() - 1) { //because the last entry is the label
...@@ -428,11 +469,13 @@ findAllSplittingPoints(vector <vector<string>> data, vector <FeatureType> featur ...@@ -428,11 +469,13 @@ findAllSplittingPoints(vector <vector<string>> data, vector <FeatureType> featur
vector <string> continuousData = data[index[i]]; vector <string> continuousData = data[index[i]];
sort(continuousData.begin(), continuousData.end()); sort(continuousData.begin(), continuousData.end());
//Unsupervised binning for continuous data //Unsupervised binning for continuous data
int K = 20; int K = 100;
if((continuousData[continuousData.size() - 1]) == continuousData[0]){ if(stod(continuousData[continuousData.size() - 1]) == stod(continuousData[0])){
continue; continue;
} }
double w = (stod(continuousData[continuousData.size() - 1]) - stod(continuousData[0])) / K; else
{
double w = (stod(continuousData[continuousData.size() - 1]) - stod(continuousData[0])) / K;
if (w) { if (w) {
...@@ -453,8 +496,16 @@ findAllSplittingPoints(vector <vector<string>> data, vector <FeatureType> featur ...@@ -453,8 +496,16 @@ findAllSplittingPoints(vector <vector<string>> data, vector <FeatureType> featur
// temp.insert(continuousData[j]); // temp.insert(continuousData[j]);
// } // }
possibleSplits[index[i]] = temp; possibleSplits[index[i]] = temp;
if(temp.size() == 0){
cout<< "Size is zerooooo"<<endl;
}
}
// cout<<temp.size()<<" of feature Continuous "<< index[i]<<endl; // cout<<temp.size()<<" of feature Continuous "<< index[i]<<endl;
......
...@@ -35,7 +35,7 @@ int main(int argc, char *argv[]) { ...@@ -35,7 +35,7 @@ int main(int argc, char *argv[]) {
int numTrees = atoi(argv[1]); int numTrees = atoi(argv[1]);
float depth = atoi(argv[2]); float depth = atoi(argv[2]);
float baggingWeight = 0.7; float baggingWeight = 0.7;
float featWeight = 0.7; float featWeight = 0.3;
//float baggingWeight = atoi(argv[2]) * 0.1; //float baggingWeight = atoi(argv[2]) * 0.1;
// double featWeight = numFeatures * 0.1; // double featWeight = numFeatures * 0.1;
...@@ -45,9 +45,9 @@ int main(int argc, char *argv[]) { ...@@ -45,9 +45,9 @@ int main(int argc, char *argv[]) {
vector <vector<string>> datasetAsString; vector <vector<string>> datasetAsString;
vector <FeatureType> featureTypes; vector <FeatureType> featureTypes;
vector <string> features; vector <string> features;
datasetAsString = parseDataToString("../datasets/adult.data"); datasetAsString = parseDataToString("../datasets/loan.data");
featureTypes = parseFeatureTypes("../datasets/adult.featureTypes"); featureTypes = parseFeatureTypes("../datasets/loan.featureTypes");
features = parseFeatures("../datasets/adult.features"); features = parseFeatures("../datasets/loan.features");
double accuracy = 0.0; double accuracy = 0.0;
......
...@@ -2,10 +2,8 @@ ...@@ -2,10 +2,8 @@
for i in {5..100..5} for i in {5..100..5}
do do
# ./race $i 10 # ./race $i 10
for j in {5..10} for j in {5..7}
do do
./race $i $j ./race $i $j
./race $i $j
./race $i $j
done done
done done
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment