Commit 9e235da4 authored by ahmedaj
Browse files

fixing negative information gains

parent 955e75c9
......@@ -36,6 +36,10 @@ Node *train(vector <vector<string>> data, vector <FeatureType> featureType,
informationGainFromParent = 0.0;
} else {
informationGainFromParent = parentEntropy - originalEntropy;
// if(informationGainFromParent < 0){
// cout << "PARENT ENTROPY: "<<parentEntropy<<endl;
// cout << "ORIGINAL ENTROPY: "<<originalEntropy<<endl;
// }
}
if (currentDepth > maxDepth || originalEntropy == 0.0) {
......
......@@ -198,6 +198,9 @@ float calculateEntropy(vector <vector<string>> data) {
map<string, int>::iterator itr;
for (itr = dataCount.begin(); itr != dataCount.end(); ++itr) {
float probability = (float) itr->second / (float) data[data.size() - 1].size();
if(probability>1){
cout <<"probability more than one"<<endl;
}
if (probability > 0) {
entropy -= probability * log2(probability);
}
......@@ -207,17 +210,20 @@ float calculateEntropy(vector <vector<string>> data) {
double
calSplitEntropy(std::map<std::string, int> leftLabelCount, std::map<std::string, int> rightLabelCount, int leftCount,
int rightCount) {
int rightCount,double &leftEntropy ,double &rightEntropy) {
double totalData = leftCount + rightCount;
double probabilityRight = rightCount / totalData;
double probabilityLeft = leftCount / totalData;
double leftEntropy = 0.0;
double rightEntropy = 0.0;
leftEntropy = 0.0;
rightEntropy = 0.0;
map<string, int>::iterator leftitr;
for (leftitr = leftLabelCount.begin(); leftitr != leftLabelCount.end(); ++leftitr) {
double probability = (double) leftitr->second / (double) leftCount;
if(probability>1){
}
if (probability > 0) {
leftEntropy -= probability * log2(probability);
}
......@@ -226,26 +232,30 @@ calSplitEntropy(std::map<std::string, int> leftLabelCount, std::map<std::string,
map<string, int>::iterator rightitr;
for (rightitr = rightLabelCount.begin(); rightitr != rightLabelCount.end(); ++rightitr) {
double probability = (double) rightitr->second / (double) rightCount;
if(probability>1){
}
if (probability > 0) {
rightEntropy -= probability * log2(probability);
}
}
double splitEntropy = (probabilityLeft * leftEntropy) + (probabilityRight * rightEntropy);
return splitEntropy;
}
float calculateSplitEntropy(FeatureSplitData featsplitData) {
float calculateSplitEntropy(FeatureSplitData featsplitData, double& leftEntropy, double& rightEntropy) {
vector <vector<string>> splitDataTrue = featsplitData.dataTrue;
vector <vector<string>> splitDataFalse = featsplitData.dataFalse;
float totalData = splitDataTrue.at(0).size() + splitDataFalse.at(0).size();
float probabilityDataTrue = (float) splitDataTrue.at(0).size() / totalData;
float probabilityDataFalse = (float) splitDataFalse.at(0).size() / totalData;
float splitEntropy = (probabilityDataTrue * calculateEntropy(splitDataTrue)) +
(probabilityDataFalse * calculateEntropy(splitDataFalse));
leftEntropy = calculateEntropy(splitDataTrue);
rightEntropy = calculateEntropy(splitDataFalse);
float splitEntropy = (probabilityDataTrue * leftEntropy ) +
(probabilityDataFalse * rightEntropy);
return splitEntropy;
......@@ -317,71 +327,112 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, vector <vec
vector<int> randomFeatures = randomSelect_WithoutReplacement(data.size(), featureWeight);
int bestFeatureIndex = randomFeatures[0];
double minEntropy = 99999;
double minleftEntropy = 99999;
double minrightEntropy = 99999;
string bestSplitValue = "";
for (auto featureIndex: randomFeatures) {
if ( featureIndex != data.size()-1 ) { //because last column is label
//initialize variables
string threshold = "";
int dataIndex;
std::map<std::string, int> leftLabelCount;
std::map<std::string, int> rightLabelCount;
//count right side labels
for (int i = 0; i < data[data.size() - 1].size(); i++) {
if (rightLabelCount.count(data[data.size() - 1][i])) {
rightLabelCount[data[data.size() - 1][i]] += 1;
if(featureTypes[featureIndex] == CONTINUOUS){
string threshold = "";
int dataIndex;
std::map<std::string, int> leftLabelCount;
std::map<std::string, int> rightLabelCount;
double leftEntropy;
double rightEntropy;
//count right side labels
for (int i = 0; i < data[data.size() - 1].size(); i++) {
if (rightLabelCount.count(data[data.size() - 1][i])) {
rightLabelCount[data[data.size() - 1][i]] += 1;
} else {
rightLabelCount[data[data.size() - 1][i]] = 1;
} else {
rightLabelCount[data[data.size() - 1][i]] = 1;
}
}
}
int leftSize = 0;
int rightSize = data.at(featureIndex).size();
vector <pair<int, string>> featureData;
featureData.reserve(data[0].size());
//done initializing variables
int leftSize = 0;
int rightSize = data.at(featureIndex).size();
vector <pair<int, string>> featureData;
featureData.reserve(data[0].size());
//done initializing variables
//sort data with selected feature
//sort data with selected feature
sortDataByFeature(featureIndex, data, featureData);
sortDataByFeature(featureIndex, data, featureData);
for (int indx = 0; indx < featureData.size();) {
threshold = featureData.at(indx).second;
dataIndex = featureData.at(indx).first;
for (int indx = 0; indx < featureData.size();) {
threshold = featureData.at(indx).second;
dataIndex = featureData.at(indx).first;
while (indx < data.at(featureIndex).size() && featureData.at(indx).second <= threshold) {
leftSize++;
rightSize--;
if (leftLabelCount.count(data[data.size() - 1][indx])) {
leftLabelCount[data[data.size() - 1][indx]] += 1;
} else {
leftLabelCount[data[data.size() - 1][indx]] = 1;
while (indx < data.at(featureIndex).size() && featureData.at(indx).second <= threshold) {
leftSize++;
rightSize--;
if (leftLabelCount.count(data[data.size() - 1][indx])) {
leftLabelCount[data[data.size() - 1][indx]] += 1;
} else {
leftLabelCount[data[data.size() - 1][indx]] = 1;
}
rightLabelCount[data[data.size() - 1][indx]] -= 1;
indx++;
dataIndex = featureData[indx].first;
}
if (indx == data[0].size()) {
continue;
}
double splitEntropy = calSplitEntropy(leftLabelCount, rightLabelCount, leftSize, rightSize,leftEntropy,rightEntropy);
if (splitEntropy < minEntropy) {
// cout<<"Best split at "<< featureIndex <<" value "<<threshold<<" Entropy "<< splitEntropy<<endl;
minEntropy = splitEntropy;
minleftEntropy = leftEntropy;
minrightEntropy = rightEntropy;
bestFeatureIndex = featureIndex;
bestSplitValue = threshold;
}
rightLabelCount[data[data.size() - 1][indx]] -= 1;
indx++;
dataIndex = featureData[indx].first;
}
if (indx == data[0].size()) {
continue;
}
double splitEntropy = calSplitEntropy(leftLabelCount, rightLabelCount, leftSize, rightSize);
if (splitEntropy < minEntropy) {
// cout<<"Best split at "<< featureIndex <<" value "<<threshold<<" Entropy "<< splitEntropy<<endl;
minEntropy = splitEntropy;
bestFeatureIndex = featureIndex;
bestSplitValue = threshold;
}
else
{
set <string> splitPoints;
double leftEntropy;
double rightEntropy;
for (int dataIdx = 0; dataIdx<data[0].size(); dataIdx++){
splitPoints.insert(data[featureIndex][dataIdx]);
}
if(splitPoints.size()>0){
set<string>::iterator splitItr;
for (splitItr = splitPoints.begin(); splitItr != splitPoints.end(); splitItr++) {
FeatureSplitData featSplitData = splitData(data, featureIndex, featureTypes, (*splitItr));
//calculate entropy for split data
float splitEntropy = calculateSplitEntropy(featSplitData,leftEntropy,rightEntropy);
//check if it's first iteration or if current split entropy is less than global entropy
if (splitEntropy < minEntropy) {
// cout<<"Best split at "<< featureIndex <<" value "<<threshold<<" Entropy "<< splitEntropy<<endl;
minEntropy = splitEntropy;
minleftEntropy = leftEntropy;
minrightEntropy = rightEntropy;
bestFeatureIndex = featureIndex;
bestSplitValue = (*splitItr);
}
}
}
}
}
}
if (minEntropy >= parentEntropy && currentDepth != 0){
if ((minleftEntropy >= parentEntropy || minrightEntropy>=parentEntropy) && currentDepth != 0){
bestFeatureIndex = -1;
bestSplitValue = "";
}
......
......@@ -35,7 +35,7 @@ int main(int argc, char *argv[]) {
int numTrees = atoi(argv[1]);
float baggingWeight = 0.7;
int depth = atoi(argv[2]);
float featureWeight = 0.3;
float featureWeight = 0.7;
// double featWeight = numFeatures * 0.1;
// cout << featWeight << "\n";
......@@ -44,9 +44,9 @@ int main(int argc, char *argv[]) {
vector <vector<string>> datasetAsString;
vector <FeatureType> featureTypes;
vector <string> features;
datasetAsString = parseDataToString("../datasets/adult.data");
featureTypes = parseFeatureTypes("../datasets/adult.featureTypes");
features = parseFeatures("../datasets/adult.features");
datasetAsString = parseDataToString("../datasets/loan.data");
featureTypes = parseFeatureTypes("../datasets/loan.featureTypes");
features = parseFeatures("../datasets/loan.features");
double accuracy = 0.0;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment