Commit 93f63439 authored by sgebreeg

merge conflict resolved

parent a0364fe1
@@ -33,7 +33,7 @@ Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
informationGainFromParent = parentEntropy - originalEntropy;
}
-if (*currentDepth > maxDepth || originalEntropy == 0.0) {
+if (currentDepth > maxDepth || originalEntropy == 0.0) {
Node *leaf = new Node(NULL, NULL, NULL, true, classification, originalEntropy, informationGainFromParent);
return leaf;
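Note on the hunk above: the recursion guard now compares currentDepth by value instead of dereferencing a pointer, which matches the int currentDepth parameter in the train() declaration further down. A minimal standalone sketch of that stopping rule, assuming the value-passed int (the shouldStop name is illustrative, not the project's code):

#include <iostream>

// Sketch: stop growing the tree when the depth limit is hit or the node is pure.
// Assumes currentDepth is an int passed by value, as in the updated signature.
bool shouldStop(int currentDepth, int maxDepth, double originalEntropy) {
    return currentDepth > maxDepth || originalEntropy == 0.0;
}

int main() {
    std::cout << shouldStop(6, 5, 0.42) << "\n";  // depth limit exceeded -> 1
    std::cout << shouldStop(2, 5, 0.0)  << "\n";  // pure node -> 1
    std::cout << shouldStop(2, 5, 0.42) << "\n";  // keep splitting -> 0
}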
@@ -43,7 +43,7 @@ Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
//TODO send data vector of index
//find best split point
BestSplitPoint bestSplit = findBestSplit(parentEntropy, currentDepth, data,
-featureType, featureWeight, nodeDatasetIndices);
+featureType, featureWeight, nodeDatasetIndices);
if(bestSplit.featureIdx == -1 || bestSplit.featureIdx > data.size()-1 ){
Node *leaf = new Node(NULL, NULL, NULL, true, classification, originalEntropy, informationGainFromParent);
@@ -56,7 +56,7 @@ Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
//TODO send data and index vector
//TODO return indices for left and right
FeatureSplitDataIndx featureSplitData = splitData(data, bestSplit.featureIdx, featureType,
-bestSplit.splitpoint, nodeDatasetIndices);
+bestSplit.splitpoint, nodeDatasetIndices);
@@ -177,4 +177,4 @@ void DecisionTree::printTree(Node *node, int space) {
cout << " " << node->classification << "\n";
}
}
}
\ No newline at end of file
@@ -21,7 +21,6 @@ public:
Node *root;
int maxDepth;
int maxDepthReached;
float featureWeight;
@@ -34,4 +33,4 @@ Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
double entropy, int currentDepth, int maxDepth, float featureWeight, vector<int> nodeDatasetIndices);
-#endif //RACER_DECISIONTREE_HPP
+#endif //RACER_DECISIONTREE_HPP
\ No newline at end of file
@@ -119,8 +119,6 @@ RandomForest::getForestPrediction(vector <string>& test, RandomForest *forest, v
sort(itr->second, test, features);
}
}
cout << "Maximum depth reached = "<<tree->maxDepthReached<< " ";
cout << endl;
}
//pick the biggest voted label
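The hunk above drops the per-tree "Maximum depth reached" printout from the prediction loop; the surviving comment notes that the most-voted label is picked next. A hedged sketch of that majority-vote step, assuming votes are tallied in a map<string,int> (an assumption for illustration; the forest's actual bookkeeping may differ):

#include <iostream>
#include <map>
#include <string>

// Sketch: pick the label with the highest vote count from a tally map.
std::string majorityVote(const std::map<std::string, int> &votes) {
    std::string best;
    int bestCount = -1;
    for (const auto &entry : votes) {
        if (entry.second > bestCount) {
            bestCount = entry.second;
            best = entry.first;
        }
    }
    return best;
}

int main() {
    std::map<std::string, int> votes{{"<=50K", 7}, {">50K", 3}};
    std::cout << majorityVote(votes) << "\n";  // prints "<=50K"
}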
@@ -207,6 +205,7 @@ vector <pair<int, double>> sort(vector <std::pair<int, double>> &M, vector <stri
cout << features[it.first] << " is " << test[it.first] << "(information gain: " << it.second << "), ";
count++;
}
+cout << endl;
return A;
}
@@ -258,7 +257,7 @@ vector <string> getBatchPrediction(vector <vector<string>>& datasetAsString,vect
test.at(featIndex) = datasetAsString.at(featIndex).at(testIdxs[testIndex]);
}
-//Get every tree in the forests prediction
+//Get every tree in the forests prediction
for (int treeIdx = 0; treeIdx < forest->trees.size(); treeIdx++) {
DecisionTree *tree = forest->trees[treeIdx];
string prediction = tree->predictSingle(test, tree->root);
@@ -327,7 +326,7 @@ accuracyReport RandomForest::getAccuracy(vector <vector<string>>& datasetAsStrin
accuracy = ((double) correct / (double) total) * 100;
} else {
cerr << "Predictions and lables are not equal" << endl;
cerr << "Predictions and labels are not equal" << endl;
}
accuracyReport report = {accuracy, correctLables, incorrectLables, correct, incorrect, total};
@@ -345,12 +344,12 @@ void RandomForest::printAccuracyReport(accuracyReport report) {
cout << " and " << report.incorrect << " incorrect predictions " << endl;
map<string, int>::iterator citr;
for (citr = report.correctLables.begin(); citr != report.correctLables.end(); ++citr) {
cout << "Lable " << citr->first << " was predicted right " << citr->second << " times \n";
cout << "Label " << citr->first << " was predicted right " << citr->second << " times \n";
}
map<string, int>::iterator itr;
for (itr = report.incorrectLables.begin(); itr != report.incorrectLables.end(); ++itr) {
cout << "Lable " << itr->first << " was predicted wrong " << itr->second << " times \n";
cout << "Label " << itr->first << " was predicted wrong " << itr->second << " times \n";
}
}
@@ -368,14 +367,13 @@ void RandomForest::printAccuracyReportFile(accuracyReport report) {
map<string, int>::iterator citr;
for (citr = report.correctLables.begin(); citr != report.correctLables.end(); ++citr) {
outfile << "Lable " << citr->first << " was predicted right " << citr->second << " times \n";
outfile << "Label " << citr->first << " was predicted right " << citr->second << " times \n";
}
map<string, int>::iterator itr;
for (itr = report.incorrectLables.begin(); itr != report.incorrectLables.end(); ++itr) {
outfile << "Lable " << itr->first << " was predicted wrong " << itr->second << " times \n";
outfile << "Label " << itr->first << " was predicted wrong " << itr->second << " times \n";
}
outfile.close();
}
@@ -153,7 +153,7 @@ vector<int> randSelectIdxWithReplacement(int originalNum, float percent) {
vector<int> splitTrainingAndTesting(vector<int> trainingIndecies, vector <vector<string>> &dataString) {
-vector <int> testIdxs;
+vector<int> testIdxs;
int trainingSize = trainingIndecies.size();
int currentTrainingIndex = trainingIndecies.back();
trainingIndecies.pop_back();
@@ -162,7 +162,7 @@ vector<int> splitTrainingAndTesting(vector<int> trainingIndecies, vector <vector
for (int dataIdx = numDataEntries - 1; dataIdx >= 0; dataIdx--) {
if (currentTrainingIndex == dataIdx) {
if (trainingCount < trainingSize) {
currentTrainingIndex = trainingIndecies.back();
trainingIndecies.pop_back();
@@ -175,7 +175,7 @@ vector<int> splitTrainingAndTesting(vector<int> trainingIndecies, vector <vector
}
-sort(testIdxs.begin(),testIdxs.end());
+sort(testIdxs.begin(), testIdxs.end());
return testIdxs;
}
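splitTrainingAndTesting, touched above only for spacing, builds the test set as the complement of the training indices over the whole dataset and returns it sorted. A hedged sketch of that complement step (illustrative only; the function walks the indices from the back, but with the same effect when the training indices are unique and sorted):

#include <algorithm>
#include <iostream>
#include <unordered_set>
#include <vector>

// Sketch: every row index not used for training becomes a test index.
std::vector<int> complementIndices(const std::vector<int> &trainingIdxs, int numDataEntries) {
    std::unordered_set<int> training(trainingIdxs.begin(), trainingIdxs.end());
    std::vector<int> testIdxs;
    for (int dataIdx = 0; dataIdx < numDataEntries; dataIdx++) {
        if (training.find(dataIdx) == training.end()) testIdxs.push_back(dataIdx);
    }
    std::sort(testIdxs.begin(), testIdxs.end());  // already ascending here; kept to mirror the hunk
    return testIdxs;
}

int main() {
    for (int idx : complementIndices({0, 2, 4}, 6)) std::cout << idx << " ";  // 1 3 5
    std::cout << "\n";
}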
@@ -289,10 +289,10 @@ splitData(vector <vector<string>> &data, int splitFeature, vector <FeatureType>
} else {
splitFalse.push_back(i);
}
-}catch(const std::exception& e){
-cout<<"ERROR"<<endl;
-cout<<"Dataset " << data.at(splitFeature).at(i)<<endl;
-cout<<"split value "<<splitValue<<endl;
+} catch (const std::exception &e) {
+cout << "ERROR" << endl;
+cout << "Dataset " << data.at(splitFeature).at(i) << endl;
+cout << "split value " << splitValue << endl;
exit(1);
}
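The reformatted catch block above guards the point where a dataset cell is compared against the numeric split value; if that throws, the offending value and split point are printed and the run aborts. A minimal sketch of the same guard-and-report pattern, assuming the numeric parse is done with std::stod and a <= comparison (both assumptions; the project's splitData may differ):

#include <cstdlib>
#include <iostream>
#include <string>

// Sketch: route a row left/right on a numeric split, aborting loudly on bad data.
bool goesLeft(const std::string &cell, double splitValue) {
    try {
        return std::stod(cell) <= splitValue;  // parse and compare; direction is illustrative
    } catch (const std::exception &e) {
        std::cout << "ERROR" << std::endl;
        std::cout << "Dataset " << cell << std::endl;
        std::cout << "split value " << splitValue << std::endl;
        std::exit(1);
    }
}

int main() {
    std::cout << goesLeft("37", 45.5) << "\n";  // 1: 37 <= 45.5
    std::cout << goesLeft("52", 45.5) << "\n";  // 0
}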
@@ -406,6 +406,8 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, vector <vec
nodeDatasetIndices);
double splitEntropy = (double) calculateSplitEntropy(featSplitData, data);
if (splitEntropy < minEntropy) {
// cout << "CATEGORICAL Best split at " << featureIndex << " value " << (*splitItr) << " Entropy "
// << splitEntropy << endl;
minEntropy = splitEntropy;
bestFeatureIndex = featureIndex;
bestSplitValue = (*splitItr);
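For context on the entropy comparison in the hunk above: the split entropy being minimized is, presumably, the size-weighted average of the two child nodes' entropies. A small sketch of that weighted average (the function name and two-child layout are illustrative, not the project's calculateSplitEntropy):

#include <cstddef>
#include <iostream>

// Sketch: entropy of a candidate split as the size-weighted mean of child entropies.
double weightedSplitEntropy(double leftEntropy, std::size_t leftCount,
                            double rightEntropy, std::size_t rightCount) {
    double total = static_cast<double>(leftCount + rightCount);
    return (leftCount / total) * leftEntropy + (rightCount / total) * rightEntropy;
}

int main() {
    // 60 rows with entropy 0.25 on the left, 40 rows with entropy 0.9 on the right.
    std::cout << weightedSplitEntropy(0.25, 60, 0.9, 40) << "\n";  // 0.51
}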
@@ -541,14 +543,15 @@ vector<int> oversample(vector <vector<string>> &data, vector<int> &indices) {
vector<int> idxs = labelWithIdx[lable];
while (difference > 0) {
if (difference < lablesize) {
-vector<int> selection = randSelectIdxWithReplacement(idxs.size(), (double(difference)/double(lablesize)));
-for (int selectionIdx:selection){
+vector<int> selection = randSelectIdxWithReplacement(idxs.size(),
+(double(difference) / double(lablesize)));
+for (int selectionIdx:selection) {
toAdd.push_back(idxs.at(selectionIdx));
}
difference = 0;
} else {
vector<int> selection = randSelectIdxWithReplacement(idxs.size(), 1);
-for (int selectionIdx:selection){
+for (int selectionIdx:selection) {
toAdd.push_back(idxs.at(selectionIdx));
}
difference -= lablesize;
@@ -559,7 +562,7 @@ vector<int> oversample(vector <vector<string>> &data, vector<int> &indices) {
}
}
cout << "lables to add "<<toAdd.size()<<endl;
cout << "lables to add " << toAdd.size() << endl;
return toAdd;
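The oversample() hunk above keeps drawing minority-class indices with replacement until the class-count difference is covered. A hedged sketch of the with-replacement draw it relies on, assuming randSelectIdxWithReplacement(n, fraction) returns roughly fraction * n indices in [0, n); the real helper's rounding and RNG may differ:

#include <cstdlib>
#include <iostream>
#include <vector>

// Sketch: draw approximately fraction * originalNum indices uniformly, with replacement.
// Name and behavior mimic the call sites above, but this is an illustrative stand-in.
std::vector<int> sketchSelectWithReplacement(int originalNum, float fraction) {
    int count = static_cast<int>(originalNum * fraction);
    std::vector<int> picks;
    picks.reserve(count);
    for (int i = 0; i < count; i++) {
        picks.push_back(std::rand() % originalNum);  // duplicates allowed
    }
    return picks;
}

int main() {
    std::srand(7);
    for (int idx : sketchSelectWithReplacement(10, 0.5f)) std::cout << idx << " ";
    std::cout << "\n";  // five indices in [0, 10), possibly repeated
}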
@@ -43,7 +43,7 @@ vector <int> bagData(vector <int> &indices, float baggingWeight);
vector<int> randomSelect_WithoutReplacement(int originalNum, float percentTraining);
vector<int> oversample(vector<vector<string>> &data, vector<int> &indices);
BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, vector <vector<string>> &data,
-vector <FeatureType> featureType, float featureWeight, vector<int>& nodeDatasetIndices );
+vector <FeatureType> featureType, float featureWeight, vector<int>& nodeDatasetIndices );
void cleanTree(Node *node);
#endif
\ No newline at end of file
@@ -35,7 +35,7 @@ int main(int argc, char *argv[]) {
int numTrees = atoi(argv[1]);
float baggingWeight = 0.7;
int depth = atoi(argv[2]);
-float featureWeight = 0.005;
+float featureWeight = 0.3;
// double featWeight = numFeatures * 0.1;
// cout << featWeight << "\n";
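On the featureWeight bump above (0.005 to 0.3): featureWeight appears to be the fraction of features considered per split (an interpretation, not confirmed by this diff). Under that reading, 0.3 samples a useful subset, while 0.005 would round to zero candidate features on a low-dimensional dataset such as adult with its 14 attributes. A tiny sketch of that arithmetic:

#include <cmath>
#include <iostream>

// Sketch: if featureWeight is the per-split feature-sampling fraction (an assumption),
// this is roughly how many candidate features each split would see.
int candidateFeatures(int numFeatures, float featureWeight) {
    return static_cast<int>(std::round(numFeatures * featureWeight));
}

int main() {
    std::cout << candidateFeatures(14, 0.005f) << "\n";  // 0 -> effectively no features
    std::cout << candidateFeatures(14, 0.3f)   << "\n";  // 4 candidate features per split
}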
@@ -44,15 +44,15 @@ int main(int argc, char *argv[]) {
vector <vector<string>> datasetAsString;
vector <FeatureType> featureTypes;
vector <string> features;
datasetAsString = parseDataToString("../datasets/mnist.data");
featureTypes = parseFeatureTypes("../datasets/mnist.featureTypes");
features = parseFeatures("../datasets/mnist.features");
datasetAsString = parseDataToString("../datasets/adult.data");
featureTypes = parseFeatureTypes("../datasets/adult.featureTypes");
features = parseFeatures("../datasets/adult.features");
double accuracy = 0.0;
double time = 0.0;
for (int x = 0; x < 3; x++) {
-vector<int> trainingIdxs = randSelectIdxWithoutReplacement(datasetAsString.at(0).size(), 0.7);
+vector<int> trainingIdxs = randomSelect_WithoutReplacement(datasetAsString.at(0).size(), 0.7);
//vector <vector<string>> trainingData;
vector <int> testingIdxs = splitTrainingAndTesting(trainingIdxs, datasetAsString);
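The last hunk switches the experiment from the mnist files to the adult files and calls the renamed randomSelect_WithoutReplacement to carve out 70% of the rows for training. A hedged sketch of one common way such a without-replacement selection works (shuffle and truncate; the project's implementation and RNG may differ):

#include <algorithm>
#include <cmath>
#include <iostream>
#include <numeric>
#include <random>
#include <vector>

// Sketch: pick about percent * originalNum distinct row indices for training.
// Illustrative stand-in for randomSelect_WithoutReplacement; not the project's code.
std::vector<int> sketchSelectWithoutReplacement(int originalNum, float percent) {
    std::vector<int> indices(originalNum);
    std::iota(indices.begin(), indices.end(), 0);        // 0, 1, ..., originalNum-1
    std::mt19937 rng(std::random_device{}());
    std::shuffle(indices.begin(), indices.end(), rng);   // random order
    indices.resize(static_cast<std::size_t>(std::round(originalNum * percent)));
    return indices;
}

int main() {
    std::cout << sketchSelectWithoutReplacement(10, 0.7f).size() << "\n";  // 7 distinct indices
}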