Commit f06c4721 authored by ahmedaj

testing with indices

parent ef1706f0
@@ -87,7 +87,7 @@ Node *train(vector <vector<string>> &data, vector <FeatureType> &featureType,
}
}
-string DecisionTree::predictSingle(vector <string> test, Node *treeRoot, PredictionReport *report) {
+string DecisionTree::predictSingle(vector <string>& test, Node *treeRoot, PredictionReport *report) {
if (treeRoot->isLeaf == true) {
return treeRoot->classification;
}
@@ -124,7 +124,7 @@ string DecisionTree::predictSingle(vector <string> test, Node *treeRoot, Predict
}
}
-string DecisionTree::predictSingle(vector <string> test, Node *treeRoot) {
+string DecisionTree::predictSingle(vector <string>& test, Node *treeRoot) {
if (treeRoot->isLeaf == true) {
return treeRoot->classification;
}
......
@@ -14,8 +14,8 @@ class DecisionTree {
public:
DecisionTree(vector <vector<string>> &data, vector<int> &trainingIndx, int maxDepth, float featureWeight, vector <FeatureType> &featureType);
-string predictSingle(vector <string> test, Node *treeRoot, PredictionReport * report);
-string predictSingle(vector <string> test, Node *treeRoot);
+string predictSingle(vector <string>& test, Node *treeRoot, PredictionReport * report);
+string predictSingle(vector <string>& test, Node *treeRoot);
void printTree(Node *node, int space);
......
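Both `predictSingle` overloads now take the test row by reference, so the feature vector is no longer copied on every call (and, if the traversal recurses, on every recursion step). Below is a minimal sketch of that calling convention for categorical splits only; the simplified `Node` fields and the name `predictSingleSketch` are illustrative assumptions, not the repository's actual layout.

```cpp
#include <map>
#include <string>
#include <vector>

// Hypothetical, simplified node type -- just enough for the sketch.
struct Node {
    bool isLeaf = false;
    std::string classification;            // label stored at (or near) this node
    int splitFeature = -1;                 // index into the test row
    std::map<std::string, Node*> children; // one child per categorical value
};

// Traversal with the test row passed by reference: no per-call copy of `test`.
std::string predictSingleSketch(const std::vector<std::string>& test, Node* node) {
    if (node->isLeaf) {
        return node->classification;
    }
    auto child = node->children.find(test.at(node->splitFeature));
    if (child == node->children.end()) {
        return node->classification;       // unseen value: fall back to this node's label
    }
    return predictSingleSketch(test, child->second);
}
```

A `const` reference would additionally document that prediction never mutates the row, which the non-const `vector<string>&` in the diff does not guarantee.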
@@ -88,7 +88,7 @@ RandomForest::RandomForest(vector <vector<string>> &data, vector<int> &trainingI
}
vector <string>
-RandomForest::getForestPrediction(vector <string> test, RandomForest *forest, vector <string> features) {
+RandomForest::getForestPrediction(vector <string>& test, RandomForest *forest, vector <string> &features) {
cout << "Trees in forest: " << to_string(forest->trees.size()) << endl;
cout << "Predicting" << endl;
vector <string> predictions(forest->trees.size());
@@ -240,21 +240,21 @@ vector <pair<int, double>> sort(map<int, double> &M, vector <string> test, vecto
return A;
}
-vector <string> getBatchPrediction(vector <vector<string>> testData, RandomForest *forest) {
+vector <string> getBatchPrediction(vector <vector<string>>& datasetAsString,vector <int>& testIdxs, RandomForest *forest) {
vector <string> predictions;
-for (int testIndex = 0; testIndex < testData.at(0).size(); testIndex++) {
+for (int testIndex = 0; testIndex < testIdxs.size(); testIndex++) {
map<string, int> votes;
vector <string> test;
string emptystring;
-for (int featIndex = 0; featIndex < testData.size(); featIndex++) {
+for (int featIndex = 0; featIndex < datasetAsString.size(); featIndex++) {
test.push_back(emptystring);
}
-for (int featIndex = 0; featIndex < testData.size(); featIndex++) {
-test.at(featIndex) = testData.at(featIndex).at(testIndex);
+for (int featIndex = 0; featIndex < datasetAsString.size(); featIndex++) {
+test.at(featIndex) = datasetAsString.at(featIndex).at(testIdxs[testIndex]);
}
//Get every tree in the forests prediction
@@ -289,9 +289,12 @@ vector <string> getBatchPrediction(vector <vector<string>> testData, RandomFores
}
-accuracyReport RandomForest::getAccuracy(vector <vector<string>> testData) {
-vector <string> predictions = getBatchPrediction(testData, this);
-vector <string> labels = testData.at(testData.size() - 1);
+accuracyReport RandomForest::getAccuracy(vector <vector<string>>& datasetAsString,vector <int> & testIdxs) {
+vector <string> predictions = getBatchPrediction(datasetAsString,testIdxs, this);
+vector <string> labels;
+for(int dataidx:testIdxs){
+labels.push_back(datasetAsString.at(datasetAsString.size()-1).at(dataidx));
+}
std::map<std::string, int> incorrectLables;
std::map<std::string, int> correctLables;
......
@@ -15,15 +15,15 @@ public:
RandomForest(vector <vector<string>> &data, vector<int> &trainingIndx, vector <FeatureType> &featureType, int numTrees,
float baggingWeight, float featureWeight, int maxDepth);
-vector <string> getForestPrediction(vector <string> test, RandomForest *forest, vector <string> features);
-accuracyReport getAccuracy(vector<vector<string>>);
+vector <string> getForestPrediction(vector <string>& test, RandomForest *forest, vector <string>& features);
+accuracyReport getAccuracy(vector <vector<string>>& datasetAsString,vector <int> &testIdxs);
void printAccuracyReport(accuracyReport report);
void printAccuracyReportFile(accuracyReport report);
};
-vector <string> getBatchPrediction(vector<vector<string>> testData, RandomForest *forest);
+vector <string> getBatchPrediction(vector <vector<string>>& datasetAsString,vector <int>& testIdxs, RandomForest *forest);
map<int, double> explain(string classification, vector<PredictionReport *> reports);
......
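`getBatchPrediction` and `getAccuracy` now receive the whole column-major dataset plus a list of test row indices instead of a pre-copied `testData` matrix, so each test row is materialised one at a time. A minimal sketch of that row-assembly step follows; it assumes a `datasetAsString[feature][row]` layout with the class label in the last column (as the accuracy code above suggests), and `classify` is a hypothetical stand-in for the forest's per-row vote.

```cpp
#include <cstddef>
#include <functional>
#include <string>
#include <vector>

// Build each test row on the fly from a column-major dataset and classify it.
// `classify` is a hypothetical stand-in for RandomForest's per-row voting.
std::vector<std::string> batchPredictSketch(
        const std::vector<std::vector<std::string>>& datasetAsString, // [feature][row]
        const std::vector<int>& testIdxs,
        const std::function<std::string(const std::vector<std::string>&)>& classify) {
    std::vector<std::string> predictions;
    predictions.reserve(testIdxs.size());
    for (int rowIdx : testIdxs) {
        // Gather this row's value from every feature column, mirroring the
        // inner loop of getBatchPrediction in the diff above.
        std::vector<std::string> test(datasetAsString.size());
        for (std::size_t feat = 0; feat < datasetAsString.size(); ++feat) {
            test[feat] = datasetAsString[feat].at(rowIdx);
        }
        predictions.push_back(classify(test));
    }
    return predictions;
}
```

The point of the new signature is that no second copy of the test split ever exists: only one row-sized vector is built per prediction.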
@@ -151,14 +151,9 @@ vector<int> randSelectIdxWithReplacement(int originalNum, float percent) {
return selections;
}
-void splitTrainingAndTesting(vector<int> trainingIndecies, vector <vector<string>> dataString,
-vector <vector<string>> &trainingDataString, vector <vector<string>> &testDataString) {
+vector<int> splitTrainingAndTesting(vector<int> trainingIndecies, vector <vector<string>> &dataString) {
vector <string> emptyStringVector;
-for (int idx = 0; idx < dataString.size(); idx++) {
-trainingDataString.push_back(emptyStringVector);
-testDataString.push_back(emptyStringVector);
-}
+vector <int> testIdxs;
int trainingSize = trainingIndecies.size();
int currentTrainingIndex = trainingIndecies.back();
trainingIndecies.pop_back();
@@ -167,10 +162,7 @@ void splitTrainingAndTesting(vector<int> trainingIndecies, vector <vector<string
for (int dataIdx = numDataEntries - 1; dataIdx >= 0; dataIdx--) {
if (currentTrainingIndex == dataIdx) {
//cout << "here"<<endl;
-for (int featureIdx = 0; featureIdx < dataString.size(); featureIdx++) {
-trainingDataString.at(featureIdx).push_back(dataString.at(featureIdx).at(dataIdx));
-}
if (trainingCount < trainingSize) {
currentTrainingIndex = trainingIndecies.back();
trainingIndecies.pop_back();
@@ -178,12 +170,14 @@ void splitTrainingAndTesting(vector<int> trainingIndecies, vector <vector<string
}
} else {
-for (int featureIdx = 0; featureIdx < dataString.size(); featureIdx++) {
-testDataString.at(featureIdx).push_back(dataString.at(featureIdx).at(dataIdx));
-}
+testIdxs.push_back(dataIdx);
}
}
+sort(testIdxs.begin(),testIdxs.end());
+return testIdxs;
}
float calculateEntropy(vector <vector<string>> &data, vector<int> indices) {
......
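`splitTrainingAndTesting` no longer copies feature columns into `trainingDataString`/`testDataString`; it only reports which rows are not in the training set and returns them sorted. Below is a roughly equivalent sketch under the assumption that the training indices were drawn without replacement from `0..numRows-1`; the helper name `complementIndices` is illustrative only.

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Return the row indices NOT selected for training, sorted ascending.
// Same idea as the new splitTrainingAndTesting: nothing is copied out of the
// column-major dataset, only indices into it are produced.
std::vector<int> complementIndices(std::vector<int> trainingIdxs, int numRows) {
    std::sort(trainingIdxs.begin(), trainingIdxs.end());
    std::vector<int> testIdxs;
    std::size_t t = 0;
    for (int row = 0; row < numRows; ++row) {
        if (t < trainingIdxs.size() && trainingIdxs[t] == row) {
            ++t;                     // row belongs to the training split
        } else {
            testIdxs.push_back(row); // everything else is test data
        }
    }
    return testIdxs;
}
```

Callers that previously held `testDataString` now index back into `datasetAsString` with the returned `testIdxs`, as `getAccuracy` and `main` do in the other hunks.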
@@ -33,8 +33,7 @@ vector<vector<string>> parseDataToString(string dataFile);
vector <FeatureType> parseFeatureTypes(string fileName);
vector<int> randSelectIdxWithoutReplacement(int originalNum, float percentTraining);
vector<int> randSelectIdxWithReplacement(int originalNum, float percent);
-void splitTrainingAndTesting(vector<int> trainingIndecies,vector<vector<string>> dataString,
-vector<vector<string>>& trainingDataString,vector<vector<string>>& testDataString);
+vector<int> splitTrainingAndTesting(vector<int> trainingIndecies, vector <vector<string>> &dataString);
string classifyData(vector <vector<string>> data);
std::pair<string,double> classifyWithEntropy(vector<vector<string>> &data, vector<int> &indices);
FeatureSplitDataIndx splitData(vector<vector<string>>& data, int splitFeature,vector<FeatureType> featureTypes, string splitValue, vector<int> &nodeDatasetIndices );
......
@@ -35,7 +35,7 @@ int main(int argc, char *argv[]) {
int numTrees = atoi(argv[1]);
float baggingWeight = 0.7;
int depth = atoi(argv[2]);
-float featureWeight = 0.3;
+float featureWeight = 0.7;
// double featWeight = numFeatures * 0.1;
// cout << featWeight << "\n";
@@ -44,19 +44,17 @@ int main(int argc, char *argv[]) {
vector <vector<string>> datasetAsString;
vector <FeatureType> featureTypes;
vector <string> features;
datasetAsString = parseDataToString("../datasets/adult.data");
featureTypes = parseFeatureTypes("../datasets/adult.featureTypes");
features = parseFeatures("../datasets/adult.features");
datasetAsString = parseDataToString("../datasets/loan.data");
featureTypes = parseFeatureTypes("../datasets/loan.featureTypes");
features = parseFeatures("../datasets/loan.features");
double accuracy = 0.0;
double time = 0.0;
for (int x = 0; x < 3; x++) {
vector<int> trainingIdxs = randSelectIdxWithoutReplacement(datasetAsString.at(0).size(), 0.7);
-vector <vector<string>> trainingData;
-vector <vector<string>> testingData;
-splitTrainingAndTesting(trainingIdxs, datasetAsString, trainingData, testingData);
+//vector <vector<string>> trainingData;
+vector <int> testingIdxs = splitTrainingAndTesting(trainingIdxs, datasetAsString);
cout << "Over sampling training data " << endl;
@@ -67,7 +65,7 @@ int main(int argc, char *argv[]) {
vector <string> testData;
string emptystring;
-for (int featIndex = 0; featIndex < testingData.size(); featIndex++) {
+for (int featIndex = 0; featIndex < datasetAsString.size(); featIndex++) {
testData.push_back(emptystring);
}
@@ -83,7 +81,7 @@ int main(int argc, char *argv[]) {
cout << endl;
cout << "********************* Forest accuracy *****************" << endl;
-accuracyReport report = randomForest->getAccuracy(testingData);
+accuracyReport report = randomForest->getAccuracy(datasetAsString,testingIdxs);
accuracy += report.accuracy;
randomForest->printAccuracyReportFile(report);
@@ -91,9 +89,9 @@ int main(int argc, char *argv[]) {
cout << "**************** prediction with explanation ********** " << endl;
-for (int featIndex = 0; featIndex < testingData.size(); featIndex++) {
-testData.at(featIndex) = testingData.at(featIndex)[0];
-cout << testingData.at(featIndex).at(0) << ", ";
+for (int featIndex = 0; featIndex < datasetAsString.size(); featIndex++) {
+testData.at(featIndex) = datasetAsString.at(featIndex)[testingIdxs[0]];
+cout << datasetAsString.at(featIndex)[testingIdxs[0]] << ", ";
}
cout << endl;
......