Commit d02744cc authored by ahmedaj
Browse files

prediction for encoding

parent 3b24b859
#include "encoder.hpp" #include "encoder.hpp"
#include <iostream>
#include "util.hpp"
void encodeData(vector <vector<string>> datasetAsString, vector <vector<string>>& encodedDatasetAsString, void encodeData(vector <vector<string>> datasetAsString, vector <vector<string>>& encodedDatasetAsString,
vector <string> features, vector <string>& encodedFeatures, vector<int> featuresToEncode){ vector <string> features, vector <string>& encodedFeatures, vector<int> featuresToEncode,vector <FeatureType> featureTypes,vector <FeatureType>& encodedFeatureTypes){
map<string, int> uniqueValues; map<string, int> uniqueValues;
map<int,map<string, int>> featureUniqueValues; map<int,map<string, int>,greater <int>> featureUniqueValues;
for (int encodeidx :featuresToEncode){ for (int encodeidx :featuresToEncode){
int uniqueCounter = 0 ; int uniqueCounter = 0 ;
for(int dataidx =0; dataidx<datasetAsString[encodeidx].size();dataidx++){ for(int dataidx =0; dataidx<datasetAsString[encodeidx].size();dataidx++){
...@@ -27,13 +29,16 @@ void encodeData(vector <vector<string>> datasetAsString, vector <vector<string>> ...@@ -27,13 +29,16 @@ void encodeData(vector <vector<string>> datasetAsString, vector <vector<string>>
for (featUniqueItr = featureUniqueValues.begin(); featUniqueItr != featureUniqueValues.end(); featUniqueItr++) { for (featUniqueItr = featureUniqueValues.begin(); featUniqueItr != featureUniqueValues.end(); featUniqueItr++) {
int featIdx = featUniqueItr ->first; int featIdx = featUniqueItr ->first;
cout<<featIdx<<endl;
encodedFeatures.erase(encodedFeatures.begin()+featIdx); encodedFeatures.erase(encodedFeatures.begin()+featIdx);
encodedFeatureTypes.erase(encodedFeatureTypes.begin()+featIdx);
map<string, int> unique = featUniqueItr->second; map<string, int> unique = featUniqueItr->second;
map<string, int>::iterator uniqueItr; map<string, int>::iterator uniqueItr;
vector <string> emptyRow(datasetAsString[0].size()); vector <string> emptyRow(datasetAsString[0].size());
for(uniqueItr = unique.begin(); uniqueItr!= unique.end(); uniqueItr++){ for(uniqueItr = unique.begin(); uniqueItr!= unique.end(); uniqueItr++){
encodedFeatures.push_back(uniqueItr->first); encodedFeatures.push_back(uniqueItr->first);
encodedFeatureTypes.push_back(CATEGORICAL);
encodedDatasetAsString.push_back(emptyRow); encodedDatasetAsString.push_back(emptyRow);
} }
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#include <vector> #include <vector>
#include <map> #include <map>
#include <iterator> #include <iterator>
#include "util.hpp"
#ifndef CODE_ENCODER_H #ifndef CODE_ENCODER_H
#define CODE_ENCODER_H #define CODE_ENCODER_H
...@@ -9,7 +10,7 @@ ...@@ -9,7 +10,7 @@
using namespace std; using namespace std;
void encodeData(vector <vector<string>> datasetAsString, vector <vector<string>>& encodedDatasetAsString, void encodeData(vector <vector<string>> datasetAsString, vector <vector<string>>& encodedDatasetAsString,
vector <string> features, vector <string>& encodedFeatures, vector<int> featuresToEncode); vector <string> features, vector <string>& encodedFeatures, vector<int> featuresToEncode,vector <FeatureType> featureTypes,vector <FeatureType>& encodedFeatureTypes);
vector<int> binaryShift(int size, int value); vector<int> binaryShift(int size, int value);
......
...@@ -43,7 +43,7 @@ int main(int argc, char *argv[]) { ...@@ -43,7 +43,7 @@ int main(int argc, char *argv[]) {
// cout << featWeight << "\n"; // cout << featWeight << "\n";
vector <vector<string>> datasetAsString,encodedDatasetAsString; vector <vector<string>> datasetAsString,encodedDatasetAsString;
vector <FeatureType> featureTypes; vector <FeatureType> featureTypes, encodedFeatureTypes;
vector <string> features,encodedfeatures; vector <string> features,encodedfeatures;
if (argc == 4) { if (argc == 4) {
cout << "Dataset: " << argv[3] << endl; cout << "Dataset: " << argv[3] << endl;
...@@ -53,101 +53,113 @@ int main(int argc, char *argv[]) { ...@@ -53,101 +53,113 @@ int main(int argc, char *argv[]) {
} }
else { else {
cout << "WARNING: No dataset provided as an argument!" << endl; cout << "WARNING: No dataset provided as an argument!" << endl;
datasetAsString = parseDataToString("../datasets/adult1.data"); datasetAsString = parseDataToString("../datasets/adult.data");
featureTypes = parseFeatureTypes("../datasets/adult.featureTypes"); featureTypes = parseFeatureTypes("../datasets/adult.featureTypes");
features = parseFeatures("../datasets/adult.features"); features = parseFeatures("../datasets/adult.features");
} }
encodedDatasetAsString = datasetAsString; encodedDatasetAsString = datasetAsString;
vector<string> finalLables = encodedDatasetAsString.back();
encodedDatasetAsString.pop_back(); encodedDatasetAsString.pop_back();
encodedfeatures = features; encodedfeatures = features;
encodedFeatureTypes = featureTypes;
vector<int> featuresToEncode; vector<int> featuresToEncode;
featuresToEncode.push_back(5); featuresToEncode.push_back(5);
featuresToEncode.push_back(6);
encodeData(datasetAsString,encodedDatasetAsString,features,encodedfeatures,featuresToEncode); std::sort(featuresToEncode.begin(), featuresToEncode.end(), std::greater<int>());
for(string feature:encodedfeatures){
cout << " " << feature << " ";
}
cout<<endl;
for(int dataIdx=0; dataIdx < encodedDatasetAsString[0].size(); dataIdx++){ encodeData(datasetAsString,encodedDatasetAsString,features,encodedfeatures,featuresToEncode,featureTypes,encodedFeatureTypes);
for(int i = 0; i< encodedDatasetAsString.size();i++){ encodedDatasetAsString.push_back(finalLables);
cout << encodedDatasetAsString[i][dataIdx]<<","; // for(string feature:encodedfeatures){
} // cout << " " << feature << " ";
cout<<endl; // }
} // cout<<endl;
// for(FeatureType ft:encodedFeatureTypes){
// cout<< " "<<ft<< " ";
// }
// cout<<endl;
// //pick number of features to select for random sub-spacing // for(int dataIdx=0; dataIdx < encodedDatasetAsString[0].size(); dataIdx++){
// float featureWeight = sqrt(features.size())/features.size(); // for(int i = 0; i< encodedDatasetAsString.size();i++){
// cout << encodedDatasetAsString[i][dataIdx]<<",";
// }
// cout<<endl;
// }
// double accuracy = 0.0; //pick number of features to select for random sub-spacing
// double time = 0.0; float featureWeight = sqrt(encodedfeatures.size())/encodedfeatures.size();
// for (int x = 0; x < 3; x++) {
// vector<int> trainingIdxs = randomSelect_WithoutReplacement(datasetAsString.at(0).size(), 0.7);
// //vector <vector<string>> trainingData;
// vector <int> testingIdxs = splitTrainingAndTesting(trainingIdxs, datasetAsString);
// cout << "Over sampling training data " << endl; double accuracy = 0.0;
double time = 0.0;
for (int x = 0; x < 3; x++) {
vector<int> trainingIdxs = randomSelect_WithoutReplacement(encodedDatasetAsString.at(0).size(), 0.7);
//vector <vector<string>> trainingData;
vector <int> testingIdxs = splitTrainingAndTesting(trainingIdxs, encodedDatasetAsString);
// vector<int> oversampledData = oversample(datasetAsString, trainingIdxs); cout << "Over sampling training data " << endl;
// trainingIdxs.insert(trainingIdxs.end(), oversampledData.begin(), oversampledData.end()); vector<int> oversampledData = oversample(encodedDatasetAsString, trainingIdxs);
// // sort(trainingIdxs.begin(), trainingIdxs.end());
// vector <string> testData; trainingIdxs.insert(trainingIdxs.end(), oversampledData.begin(), oversampledData.end());
// string emptystring; // sort(trainingIdxs.begin(), trainingIdxs.end());
// for (int featIndex = 0; featIndex < datasetAsString.size(); featIndex++) {
// testData.push_back(emptystring);
// }
vector <string> testData;
string emptystring;
for (int featIndex = 0; featIndex < encodedDatasetAsString.size(); featIndex++) {
testData.push_back(emptystring);
}
// auto start = high_resolution_clock::now();
// RandomForest *randomForest = new RandomForest(datasetAsString, trainingIdxs, featureTypes, numTrees, auto start = high_resolution_clock::now();
// baggingWeight, featureWeight, depth);
RandomForest *randomForest = new RandomForest(encodedDatasetAsString, trainingIdxs, encodedFeatureTypes, numTrees,
baggingWeight, featureWeight, depth);
// time += (high_resolution_clock::now() - start).count() / 1000000000.0;
time += (high_resolution_clock::now() - start).count() / 1000000000.0;
// cout << endl;
// cout << "********************* Forest accuracy *****************" << endl;
// accuracyReport report = randomForest->getAccuracy(datasetAsString,testingIdxs);
// accuracy += report.accuracy; cout << endl;
// randomForest->printAccuracyReportFile(report); cout << "********************* Forest accuracy *****************" << endl;
accuracyReport report = randomForest->getAccuracy(encodedDatasetAsString,testingIdxs);
accuracy += report.accuracy;
randomForest->printAccuracyReportFile(report);
// cout << "**************** prediction with explanation ********** " << endl;
// for (int featIndex = 0; featIndex < datasetAsString.size(); featIndex++) { cout << "**************** prediction with explanation ********** " << endl;
// testData.at(featIndex) = datasetAsString.at(featIndex)[testingIdxs[0]];
// cout << datasetAsString.at(featIndex)[testingIdxs[0]] << ", ";
// }
// cout << endl;
for (int featIndex = 0; featIndex < encodedDatasetAsString.size(); featIndex++) {
testData.at(featIndex) = encodedDatasetAsString.at(featIndex)[testingIdxs[0]];
cout << encodedDatasetAsString.at(featIndex)[testingIdxs[0]] << ", ";
}
cout << endl;
// randomForest->predict("HARD",testData, randomForest, features);
// for (int i = 0; i<randomForest->trees.size(); i++){ randomForest->predict("HARD",testData, randomForest, features);
// cleanTree(randomForest->trees[i]->root); for (int i = 0; i<randomForest->trees.size(); i++){
// delete randomForest->trees[i];
// }
// delete randomForest;
cleanTree(randomForest->trees[i]->root);
delete randomForest->trees[i];
}
delete randomForest;
// }
// ofstream outfile;
// outfile.open("avg.txt", ios::app);
// outfile << "------ Report ------ " << endl;
// outfile << numTrees << "\t" << depth << "\t" << featureWeight << "\t" << baggingWeight << "\t" << accuracy / 3
// << "\t" << time / 3 << endl;
// // outfile<< numTrees<<"\t"<<10<<"\t"<<0.7<<"\t"<<baggingWeight<<"\t"<<accuracy/3<<"\t"<<time/3<<endl;
// outfile.close(); }
ofstream outfile;
outfile.open("avg.txt", ios::app);
outfile << "------ Report ------ " << endl;
outfile << numTrees << "\t" << depth << "\t" << featureWeight << "\t" << baggingWeight << "\t" << accuracy / 3
<< "\t" << time / 3 << endl;
// outfile<< numTrees<<"\t"<<10<<"\t"<<0.7<<"\t"<<baggingWeight<<"\t"<<accuracy/3<<"\t"<<time/3<<endl;
outfile.close();
return 0; return 0;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment