Commit cf528184 authored by sgebreeg's avatar sgebreeg
Browse files

added unique values for categorical entries

parent 5603edaf
......@@ -20,24 +20,17 @@ DecisionTree::DecisionTree(vector <vector<string>> &data, vector<int> &trainingI
};
Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
double parentEntropy, int currentDepth, int maxDepth, float featureWeight, vector<int> nodeDatasetIndices ) {
double parentEntropy, int currentDepth, int maxDepth, float featureWeight, vector<int> nodeDatasetIndices ) {
//TODO pass data pointer and index vector
std::pair<string, double> classificationAndEntropy = classifyWithEntropy(data, nodeDatasetIndices);
string classification = classificationAndEntropy.first;
double originalEntropy = classificationAndEntropy.second;
if(originalEntropy<0 || originalEntropy>1){
cout<<"ERROR with entropy "<<originalEntropy<<endl;
}
double informationGainFromParent;
if (currentDepth == 0) {
informationGainFromParent = 0.0;
} else {
informationGainFromParent = parentEntropy - originalEntropy;
// if(informationGainFromParent < 0){
// cout << "PARENT ENTROPY: "<<parentEntropy<<endl;
// cout << "ORIGINAL ENTROPY: "<<originalEntropy<<endl;
// }
}
if (currentDepth > maxDepth || originalEntropy == 0.0) {
......@@ -110,7 +103,7 @@ string DecisionTree::predictSingle(vector <string> test, Node *treeRoot, Predict
Node *answer;
if (featureType == CATEGORICAL) {
if (test[splitIndex] == splitValue) {
if (test[splitIndex] <= splitValue) {
answer = treeRoot->trueBranch;
} else {
answer = treeRoot->falseBranch;
......
......@@ -30,7 +30,7 @@ private:
};
Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
double entropy, int currentDepth, int maxDepth, float featureWeight, vector<int> nodeDatasetIndices);
double entropy, int currentDepth, int maxDepth, float featureWeight, vector<int> nodeDatasetIndices);
#endif //RACER_DECISIONTREE_HPP
......@@ -39,14 +39,16 @@ vector<int> getParts(int trees, int cpus) {
return temp;
}
RandomForest::RandomForest(vector <vector<string>> &data, vector<int> &trainingIndx, vector <FeatureType> &featureTypes, int numTrees,
RandomForest::RandomForest(vector <vector<string>> &data, vector<int> &trainingIndx, vector <FeatureType> &featureTypes,
int numTrees,
float baggingWeight, float featureWeight, int maxDepth) {
vector < DecisionTree * > decisionTrees;
this->featureWeight = featureWeight;
this->depth = maxDepth;
unsigned num_cpus = std::thread::hardware_concurrency();
if(numTrees < num_cpus)
// unsigned num_cpus = 12;
if (numTrees < num_cpus)
num_cpus = numTrees;
// A mutex ensures orderly access.
std::mutex iomutex;
......@@ -61,10 +63,10 @@ RandomForest::RandomForest(vector <vector<string>> &data, vector<int> &trainingI
maxDepth, featureWeight, &featureTypes, &decisionTrees] {
for (int j = 0; j < temp.at(i); j++) {
// vector <int> baggedData = bagData(trainingIndx, baggingWeight); //TODO fix this
cout<<"Training tree "<< j<<" in thread "<<i<<endl;
DecisionTree *tree = new DecisionTree(data, trainingIndx, maxDepth, featureWeight, featureTypes);
cout<<"Done training tree "<< j<<" in thread "<<i<<endl;
vector <int> baggedData = bagData(trainingIndx, baggingWeight); //TODO fix this
cout << "Training tree " << j << " in thread " << i << endl;
DecisionTree *tree = new DecisionTree(data, baggedData, maxDepth, featureWeight, featureTypes);
cout << "Done training tree " << j << " in thread " << i << endl;
{
// Use a lexical scope and lock_guard to safely lock the mutex only for
// the duration of vector push.
......@@ -74,11 +76,10 @@ RandomForest::RandomForest(vector <vector<string>> &data, vector<int> &trainingI
}
});
}
for (auto& t : threads) {
for (auto &t : threads) {
t.join();
}
......@@ -111,10 +112,10 @@ RandomForest::getForestPrediction(vector <string> test, RandomForest *forest, ve
explanations[treeIdx] = report;
map<string, vector<std::pair < int, double> >>::iterator itr;
cout << "Explanation "<<treeIdx<< " classified "<< report->classification << " because ";
for (itr = report->path.begin() ; itr != report->path.end(); ++itr) {
if(itr->first == report->classification){
map < string, vector < std::pair < int, double > >>::iterator itr;
cout << "Explanation " << treeIdx << " classified " << report->classification << " because ";
for (itr = report->path.begin(); itr != report->path.end(); ++itr) {
if (itr->first == report->classification) {
sort(itr->second, test, features);
}
}
......@@ -145,7 +146,7 @@ RandomForest::getForestPrediction(vector <string> test, RandomForest *forest, ve
sort(reports, test, features);
for (int j = 0; j<explanations.size(); j++){
for (int j = 0; j < explanations.size(); j++) {
delete explanations[j];
}
......@@ -179,7 +180,7 @@ bool cmp(pair<int, double> &a,
return a.second > b.second;
}
vector <pair<int, double>> sort(vector<std::pair < int, double> > &M, vector <string> test, vector <string> features) {
vector <pair<int, double>> sort(vector <std::pair<int, double>> &M, vector <string> test, vector <string> features) {
// Declare vector of pairs
vector <pair<int, double>> A;
......@@ -201,7 +202,7 @@ vector <pair<int, double>> sort(vector<std::pair < int, double> > &M, vector <st
if (count > 2) {
break;
}
cout << features[it.first] << " is " << test[it.first]<< "(information gain: "<<it.second << "), ";
cout << features[it.first] << " is " << test[it.first] << "(information gain: " << it.second << "), ";
count++;
}
cout << endl;
......
......@@ -163,7 +163,7 @@ void splitTrainingAndTesting(vector<int> trainingIndecies, vector <vector<string
int currentTrainingIndex = trainingIndecies.back();
trainingIndecies.pop_back();
int numDataEntries = dataString.at(0).size();
int trainingCount = 1;
int trainingCount = 1;
for (int dataIdx = numDataEntries - 1; dataIdx >= 0; dataIdx--) {
if (currentTrainingIndex == dataIdx) {
......@@ -171,12 +171,12 @@ void splitTrainingAndTesting(vector<int> trainingIndecies, vector <vector<string
for (int featureIdx = 0; featureIdx < dataString.size(); featureIdx++) {
trainingDataString.at(featureIdx).push_back(dataString.at(featureIdx).at(dataIdx));
}
if(trainingCount<trainingSize){
if (trainingCount < trainingSize) {
currentTrainingIndex = trainingIndecies.back();
trainingIndecies.pop_back();
trainingCount ++;
trainingCount++;
}
} else {
for (int featureIdx = 0; featureIdx < dataString.size(); featureIdx++) {
testDataString.at(featureIdx).push_back(dataString.at(featureIdx).at(dataIdx));
......@@ -186,9 +186,9 @@ void splitTrainingAndTesting(vector<int> trainingIndecies, vector <vector<string
}
}
float calculateEntropy(vector <vector<string>> data) {
float calculateEntropy(vector <vector<string>> &data, vector<int> indices) {
std::map<std::string, int> dataCount;
for (int i = 0; i < data[data.size() - 1].size(); i++) {
for (int i :indices) {
if (dataCount.count(data[data.size() - 1][i])) {
dataCount[data[data.size() - 1][i]] += 1;
......@@ -203,7 +203,7 @@ float calculateEntropy(vector <vector<string>> data) {
map<string, int>::iterator itr;
for (itr = dataCount.begin(); itr != dataCount.end(); ++itr) {
float probability = (float) itr->second / (float) data[data.size() - 1].size();
float probability = (float) itr->second / (float) indices.size();
if (probability > 0) {
entropy -= probability * log2(probability);
}
......@@ -243,24 +243,36 @@ calSplitEntropy(std::map<std::string, int> leftLabelCount, std::map<std::string,
}
//TODO
float calculateSplitEntropy(FeatureSplitData featsplitData) {
vector <vector<string>> splitDataTrue = featsplitData.dataTrue;
vector <vector<string>> splitDataFalse = featsplitData.dataFalse;
float calculateSplitEntropy(FeatureSplitDataIndx featsplitData, vector <vector<string>> &data) {
float totalData = splitDataTrue.at(0).size() + splitDataFalse.at(0).size();
float probabilityDataTrue = (float) splitDataTrue.at(0).size() / totalData;
float probabilityDataFalse = (float) splitDataFalse.at(0).size() / totalData;
float splitEntropy = (probabilityDataTrue * calculateEntropy(splitDataTrue)) +
(probabilityDataFalse * calculateEntropy(splitDataFalse));
vector<int> splitDataTrue = featsplitData.dataTrue;
vector<int> splitDataFalse = featsplitData.dataFalse;
float totalData = splitDataTrue.size() + splitDataFalse.size();
float probabilityDataTrue = (float) splitDataTrue.size() / totalData;
float probabilityDataFalse = (float) splitDataFalse.size() / totalData;
float splitEntropy = (probabilityDataTrue * calculateEntropy(data, splitDataTrue)) +
(probabilityDataFalse * calculateEntropy(data, splitDataFalse));
return splitEntropy;
}
//float calculateSplitEntropy(FeatureSplitData featsplitData) {
// vector <vector<string>> splitDataTrue = featsplitData.dataTrue;
// vector <vector<string>> splitDataFalse = featsplitData.dataFalse;
//
// float totalData = splitDataTrue.at(0).size() + splitDataFalse.at(0).size();
// float probabilityDataTrue = (float) splitDataTrue.at(0).size() / totalData;
// float probabilityDataFalse = (float) splitDataFalse.at(0).size() / totalData;
//
// float splitEntropy = (probabilityDataTrue * calculateEntropy(splitDataTrue)) +
// (probabilityDataFalse * calculateEntropy(splitDataFalse));
//
// return splitEntropy;
//
//}
//TODO accept data reference and vector index
FeatureSplitDataIndx
splitData(vector <vector<string>> data, int splitFeature, vector <FeatureType> featureTypes, string splitValue,
splitData(vector <vector<string>> &data, int splitFeature, vector <FeatureType> featureTypes, string splitValue,
vector<int> &nodeDatasetIndices) {
//cout << "split feature " << splitFeature << " split val "<< splitValue<<endl;
FeatureSplitDataIndx featSplitData;
......@@ -294,7 +306,7 @@ splitData(vector <vector<string>> data, int splitFeature, vector <FeatureType> f
}
void
sortDataByFeature(int featIdx, vector <vector<string>> data, vector <pair<int, string>> &featureData,
sortDataByFeature(int featIdx, vector <vector<string>> &data, vector <pair<int, string>> &featureData,
vector<int> &nodeDatasetIndices) {
for (int dataIdx = 0; dataIdx < nodeDatasetIndices.size(); dataIdx++) { //TODO check
featureData.emplace_back(nodeDatasetIndices[dataIdx], data[featIdx].at(nodeDatasetIndices[dataIdx]));
......@@ -305,7 +317,7 @@ sortDataByFeature(int featIdx, vector <vector<string>> data, vector <pair<int, s
}
BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, vector <vector<string>> &data,
vector <FeatureType> featureTypes, float featureWeight, vector<int>& nodeDatasetIndices) {
vector <FeatureType> featureTypes, float featureWeight, vector<int> &nodeDatasetIndices) {
//TODO accept data and vector of index check
......@@ -316,69 +328,91 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, vector <vec
string bestSplitValue = "";
for (auto featureIndex: randomFeatures) {
if (featureIndex != data.size() - 1) { //because last column is label
//initialize variables
string threshold = "";
int dataIndex;
std::map<std::string, int> leftLabelCount;
std::map<std::string, int> rightLabelCount;
//count right side labels
for (int i : nodeDatasetIndices) { //TODO check
if (rightLabelCount.count(data[data.size() - 1][i])) {
rightLabelCount[data[data.size() - 1][i]] += 1;
if (featureIndex != data.size() - 1) {//because last column is label
if (featureTypes.at(featureIndex) == CONTINUOUS) {
//initialize variables
string threshold = "";
int dataIndex;
std::map<std::string, int> leftLabelCount;
std::map<std::string, int> rightLabelCount;
//count right side labels
for (int i : nodeDatasetIndices) { //TODO check
if (rightLabelCount.count(data[data.size() - 1][i])) {
rightLabelCount[data[data.size() - 1][i]] += 1;
} else {
rightLabelCount[data[data.size() - 1][i]] = 1;
} else {
rightLabelCount[data[data.size() - 1][i]] = 1;
}
}
}
int leftSize = 0;
int rightSize = nodeDatasetIndices.size(); //TODO check
vector <pair<int, string>> featureData;
featureData.reserve(nodeDatasetIndices.size()); //TODO check
//done initializing variables
int leftSize = 0;
int rightSize = nodeDatasetIndices.size(); //TODO check
vector <pair<int, string>> featureData;
featureData.reserve(nodeDatasetIndices.size()); //TODO check
//done initializing variables
//sort data with selected feature
//sort data with selected feature
sortDataByFeature(featureIndex, data, featureData, nodeDatasetIndices); //TODO check
sortDataByFeature(featureIndex, data, featureData, nodeDatasetIndices); //TODO check
for (int indx = 0; indx < featureData.size();) {
threshold = featureData.at(indx).second;
dataIndex = featureData.at(indx).first;
for (int indx = 0; indx < featureData.size();) {
threshold = featureData.at(indx).second;
dataIndex = featureData.at(indx).first;
while (indx < featureData.size() && featureData.at(indx).second <= threshold) { //TODO check
leftSize++;
rightSize--;
if (leftLabelCount.count(data[data.size() - 1][featureData.at(indx).first])) {
leftLabelCount[data[data.size() - 1][featureData.at(indx).first]] += 1;
} else {
leftLabelCount[data[data.size() - 1][featureData.at(indx).first]] = 1;
}
rightLabelCount[data[data.size() - 1][featureData.at(indx).first]] -= 1;
indx++;
if (indx < featureData.size()) {
dataIndex = featureData[indx].first;
}
while (indx < featureData.size() && featureData.at(indx).second <= threshold) { //TODO check
leftSize++;
rightSize--;
if (leftLabelCount.count(data[data.size() - 1][featureData.at(indx).first])) {
leftLabelCount[data[data.size() - 1][featureData.at(indx).first]] += 1;
} else {
leftLabelCount[data[data.size() - 1][featureData.at(indx).first]] = 1;
}
rightLabelCount[data[data.size() - 1][featureData.at(indx).first]] -= 1;
if (indx == featureData.size()) { //TODO check
continue;
}
double splitEntropy = calSplitEntropy(leftLabelCount, rightLabelCount, leftSize, rightSize);
indx++;
if(indx <featureData.size()){
dataIndex = featureData[indx].first;
if (splitEntropy < minEntropy) {
// cout<<"Best split at "<< featureIndex <<" value "<<threshold<<" Entropy "<< splitEntropy<<endl;
minEntropy = splitEntropy;
bestFeatureIndex = featureIndex;
bestSplitValue = threshold;
}
}
if (indx == featureData.size()) { //TODO check
continue;
}
double splitEntropy = calSplitEntropy(leftLabelCount, rightLabelCount, leftSize, rightSize);
if (splitEntropy < 0 || splitEntropy > 1) {
cout << "Checkpoint ERROR" << endl;
}
if (splitEntropy < minEntropy) {
// cout<<"Best split at "<< featureIndex <<" value "<<threshold<<" Entropy "<< splitEntropy<<endl;
minEntropy = splitEntropy;
bestFeatureIndex = featureIndex;
bestSplitValue = threshold;
} else {
set <string> uniqueValues;
for (int i: nodeDatasetIndices) {
uniqueValues.insert(data[featureIndex][i]);
}
set<string>::iterator splitItr;
for (splitItr = uniqueValues.begin(); splitItr != uniqueValues.end(); splitItr++) {
FeatureSplitDataIndx featSplitData = splitData(data, featureIndex, featureTypes, (*splitItr),
nodeDatasetIndices);
double splitEntropy = (double) calculateSplitEntropy(featSplitData, data);
if (splitEntropy < minEntropy) {
cout << "CATEGORICAL Best split at " << featureIndex << " value " << (*splitItr) << " Entropy "
<< splitEntropy << endl;
minEntropy = splitEntropy;
bestFeatureIndex = featureIndex;
bestSplitValue = (*splitItr);
}
}
}
}
......@@ -416,7 +450,7 @@ string classifyData(vector <vector<string>> data) {
return label;
}
std::pair<string, double> classifyWithEntropy(vector <vector<string>> &data, vector<int> indices) {
std::pair<string, double> classifyWithEntropy(vector <vector<string>> &data, vector<int> &indices) {
//TODO get data reference and vector of index
auto start = high_resolution_clock::now();
std::map<std::string, int> dataCount;
......@@ -433,8 +467,9 @@ std::pair<string, double> classifyWithEntropy(vector <vector<string>> &data, vec
}
map<string, int>::iterator itr;
for (itr = dataCount.begin(); itr != dataCount.end(); ++itr) {
// cout<<itr->first<<itr->second<<endl;
// cout<<itr->first<<" "<<itr->second<<endl;
double probability = (double) itr->second / (double) indices.size();
if (probability > 0) {
entropy -= (probability) * log2(probability);
}
......@@ -445,28 +480,29 @@ std::pair<string, double> classifyWithEntropy(vector <vector<string>> &data, vec
}
}
std::pair<string, double> classificationWithEntropy(label, entropy);
return classificationWithEntropy;
}
vector <int> bagData(vector <int> &indices, float baggingWeight) {
vector <int> newData;
vector<int> bagData(vector<int> &indices, float baggingWeight) {
vector<int> newData;
vector<int> selection = randSelectIdxWithReplacement(indices.size(), baggingWeight);
sort(selection.begin(), selection.end());
for (int i = 0; i < selection.size(); i++) {
newData.push_back(indices.at(i));
newData.push_back(indices.at(selection[i]));
}
return newData;
}
vector <int> oversample(vector <vector<string>> &data, vector<int> &indices) {
vector<int> oversample(vector <vector<string>> &data, vector<int> &indices) {
int lableIdx = data.size() - 1;
vector <int> oversampled;
vector<int> oversampled;
vector <string> emptyVecString;
vector<int> toAdd;
int highestCount = 0;
......@@ -527,8 +563,8 @@ vector <int> oversample(vector <vector<string>> &data, vector<int> &indices) {
}
void cleanTree(Node *node){
if(node->isLeaf){
void cleanTree(Node *node) {
if (node->isLeaf) {
delete node->question;
delete node;
return;
......
......@@ -36,10 +36,10 @@ vector<int> randSelectIdxWithReplacement(int originalNum, float percent);
void splitTrainingAndTesting(vector<int> trainingIndecies,vector<vector<string>> dataString,
vector<vector<string>>& trainingDataString,vector<vector<string>>& testDataString);
string classifyData(vector <vector<string>> data);
std::pair<string,double> classifyWithEntropy(vector<vector<string>> &data, vector<int> indices);
FeatureSplitDataIndx splitData(vector<vector<string>>data, int splitFeature,vector<FeatureType> featureTypes, string splitValue, vector<int> &nodeDatasetIndices );
float calculateEntropy(vector<vector<string>> data);
float calculateSplitEntropy (FeatureSplitData featsplitData);
std::pair<string,double> classifyWithEntropy(vector<vector<string>> &data, vector<int> &indices);
FeatureSplitDataIndx splitData(vector<vector<string>>& data, int splitFeature,vector<FeatureType> featureTypes, string splitValue, vector<int> &nodeDatasetIndices );
float calculateEntropy(vector <vector<string>>& data, vector<int> indices) ;
float calculateSplitEntropy (FeatureSplitDataIndx featsplitData, vector<vector<string>> &data);
vector <int> bagData(vector <int> &indices, float baggingWeight);
vector<int> randomSelect_WithoutReplacement(int originalNum, float percentTraining);
vector<int> oversample(vector<vector<string>> &data, vector<int> &indices);
......
......@@ -35,7 +35,7 @@ int main(int argc, char *argv[]) {
int numTrees = atoi(argv[1]);
float baggingWeight = 0.7;
int depth = atoi(argv[2]);
float featureWeight = 0.7;
float featureWeight = 0.3;
// double featWeight = numFeatures * 0.1;
// cout << featWeight << "\n";
......@@ -44,9 +44,9 @@ int main(int argc, char *argv[]) {
vector <vector<string>> datasetAsString;
vector <FeatureType> featureTypes;
vector <string> features;
datasetAsString = parseDataToString("../datasets/loan.data");
featureTypes = parseFeatureTypes("../datasets/loan.featureTypes");
features = parseFeatures("../datasets/loan.features");
datasetAsString = parseDataToString("../datasets/adult.data");
featureTypes = parseFeatureTypes("../datasets/adult.featureTypes");
features = parseFeatures("../datasets/adult.features");
double accuracy = 0.0;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment