Commit c89f8366 authored by ahmedaj
Browse files

fixed local entropy in best splits

parent 4651fe3b
......@@ -13,7 +13,7 @@ using namespace std;
DecisionTree::DecisionTree(vector <vector<string>> data, int maxDepth, float featureWeight,
vector <FeatureType> featureType) {
// vector<int> index = randomSelect_WithoutReplacement(15, featureWeight);
// vector<int> index = randomSelect_WithoutReplacement(data.size(), featureWeight);
// for(int i=0; i<index.size();i++){
// for(int j=1; j<data[index[i]].size(); j++){
// data[index[i]][j] = data[index[i]][0];
......@@ -68,6 +68,7 @@ Node *train(vector <vector<string>> data, vector <FeatureType> featureType,
BestSplitPoint bestSplit = findBestSplit(parentEntropy, currentDepth, potentialSplits, data,
featureType);
//cout << "------ best split index "<<bestSplit.featureIdx<<endl;
if(bestSplit.featureIdx == -1){
Node *leaf = new Node(NULL, NULL, NULL, true, classification, originalEntropy, informationGainFromParent);
// cout<<"No more split"<<endl;
......@@ -171,7 +172,7 @@ string DecisionTree::predictSingle(vector <string> test, Node *treeRoot) {
answer = treeRoot->falseBranch;
}
} else {
if (test[splitIndex] >= splitValue) {
if ( stod(test[splitIndex]) >= stod(splitValue)) {
answer = treeRoot->trueBranch;
} else {
answer = treeRoot->falseBranch;
......
......@@ -245,7 +245,6 @@ accuracyReport RandomForest::getAccuracy(vector<vector<string>> testData){
}
accuracyReport report = {accuracy,correctLables,incorrectLables,correct,incorrect,total};
cout << "here"<< endl;
return report;
......
......@@ -202,6 +202,9 @@ float calculateEntropy(vector <vector<string>> data) {
entropy -= (probability) * log2(probability);
}
}
if(entropy == 1){
//cout << "size with one entropy "<< dataCount.size()<<endl;
}
return entropy;
}
......@@ -216,6 +219,7 @@ float calculateSplitEntropy(FeatureSplitData featsplitData) {
float splitEntropy = (probabilityDataTrue * calculateEntropy(splitDataTrue)) +
(probabilityDataFalse * calculateEntropy(splitDataFalse));
//cout << "Split Entropy "<<splitEntropy<<endl;
return splitEntropy;
}
......@@ -251,7 +255,7 @@ splitData(vector <vector<string>> data, int splitFeature, vector <FeatureType> f
}
} else {
for (int dataIdx = 0; dataIdx < splitFeatureData.size(); dataIdx++) {
if (splitFeatureData[dataIdx] >= splitValue) {
if (stod(splitFeatureData[dataIdx]) >= stod(splitValue)) {
for (int featureIdx = 0; featureIdx < data.size(); featureIdx++) {
splitTrue.at(featureIdx).push_back(data.at(featureIdx).at(dataIdx));
}
......@@ -276,28 +280,48 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth,std::map<int
float entropy = 9999;
int bestSplitFeature;
string bestSplitValue;
bool first_iteration = true;
map < int, set < string >> ::iterator
itr;
for (itr = potentialSplits.begin(); itr != potentialSplits.end(); ++itr) {
//cout << itr->first << " " << itr->second.size() << endl;
float localEntropy = 1;
if (itr->second.size() == 0){
cout << itr->first << " " << itr->second.size() << endl;
cout << "feature types "<<featureTypes[itr->first]<<endl;
}
float localEntropy;
bool firstsplit = true;;
set <string> splits = itr->second;
if (splits.size() > 0) {
set<string>::iterator splitItr;
for (splitItr = splits.begin(); splitItr != splits.end(); splitItr++) {
// cout<<"Spliting "<<endl;
FeatureSplitData featSplitData = splitData(data, itr->first, featureTypes, (*splitItr));
float splitEntropy = calculateSplitEntropy(featSplitData);
if (featureTypes[itr->first] ==CONTINUOUS) {
if(firstsplit){
firstsplit = false;
localEntropy = splitEntropy;
}
else{
if (localEntropy > splitEntropy) {
localEntropy = splitEntropy;
} else {
if(splitEntropy ==1){
//cout << "It is one \n";
}
}
else
{
break;
}
}
if (splitEntropy <= entropy) {
}
if (first_iteration || splitEntropy <= entropy) {
first_iteration = false;
entropy = splitEntropy;
// cout<<itr->first<<" ";
bestSplitFeature = itr->first;
......@@ -310,9 +334,21 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth,std::map<int
}
}
//cout <<"!!!!!!!!!! "<< bestSplitValue << endl;
if(currentDepth!=0 && parentEntropy<entropy){
BestSplitPoint splitPoint = {-1, ""};
return splitPoint;
bestSplitFeature = -1;
bestSplitValue = "";
}
if(bestSplitFeature>15 || bestSplitFeature <-1){
map < int, set < string >> ::iterator
itr;
for (itr = potentialSplits.begin(); itr != potentialSplits.end(); ++itr) {
cout << "------------- "<<endl;
cout << itr->first << " " << itr->second.size() << endl;
}
}
BestSplitPoint splitPoint = {bestSplitFeature, bestSplitValue};
......@@ -414,6 +450,11 @@ findAllSplittingPoints(vector <vector<string>> data, vector <FeatureType> featur
set <string> temp;
vector<int> index = randomSelect_WithoutReplacement(data.size(), featureWeight);
// vector<int> index (data.size());
// for ( int j = 0; j< data.size(); j++){
// index[j] = j;
// }
for (int i = 0; i < index.size(); i++) {
temp.clear();
if (index[i] != data.size() - 1) { //because the last entry is the label
......@@ -428,10 +469,12 @@ findAllSplittingPoints(vector <vector<string>> data, vector <FeatureType> featur
vector <string> continuousData = data[index[i]];
sort(continuousData.begin(), continuousData.end());
//Unsupervised binning for continuous data
int K = 20;
if((continuousData[continuousData.size() - 1]) == continuousData[0]){
int K = 100;
if(stod(continuousData[continuousData.size() - 1]) == stod(continuousData[0])){
continue;
}
else
{
double w = (stod(continuousData[continuousData.size() - 1]) - stod(continuousData[0])) / K;
......@@ -454,7 +497,15 @@ findAllSplittingPoints(vector <vector<string>> data, vector <FeatureType> featur
// }
possibleSplits[index[i]] = temp;
if(temp.size() == 0){
cout<< "Size is zerooooo"<<endl;
}
}
// cout<<temp.size()<<" of feature Continuous "<< index[i]<<endl;
......
......@@ -35,7 +35,7 @@ int main(int argc, char *argv[]) {
int numTrees = atoi(argv[1]);
float depth = atoi(argv[2]);
float baggingWeight = 0.7;
float featWeight = 0.7;
float featWeight = 0.3;
//float baggingWeight = atoi(argv[2]) * 0.1;
// double featWeight = numFeatures * 0.1;
......@@ -45,9 +45,9 @@ int main(int argc, char *argv[]) {
vector <vector<string>> datasetAsString;
vector <FeatureType> featureTypes;
vector <string> features;
datasetAsString = parseDataToString("../datasets/adult.data");
featureTypes = parseFeatureTypes("../datasets/adult.featureTypes");
features = parseFeatures("../datasets/adult.features");
datasetAsString = parseDataToString("../datasets/loan.data");
featureTypes = parseFeatureTypes("../datasets/loan.featureTypes");
features = parseFeatures("../datasets/loan.features");
double accuracy = 0.0;
......
......@@ -2,10 +2,8 @@
for i in {5..100..5}
do
# ./race $i 10
for j in {5..10}
for j in {5..7}
do
./race $i $j
./race $i $j
./race $i $j
done
done
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment