Commit 738f0a86 authored by ahmedaj's avatar ahmedaj
Browse files

dataset encoding added

parent 5461658f
39, State-gov, 00000, Bachelors, 13, Never-married, Adm-clerical, Not-in-family, White, Male, 2174, 0, 40, United-States, <=50K
50, Self-emp-not-inc, 00000, Bachelors, 13, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 13, United-States, <=50K
38, Private, 000000, HS-grad, 9, Divorced, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K
\ No newline at end of file
......@@ -568,6 +568,63 @@ vector<int> oversample(vector <vector<string>> &data, vector<int> &indices) {
}
// One-hot encodes the categorical features listed in featuresToEncode.
// datasetAsString is feature-major: datasetAsString[featureIdx][sampleIdx].
// For each encoded feature, its single column (and name) is removed from
// encodedDatasetAsString / encodedFeatures and replaced by one 0/1 column
// per unique value, appended at the end in sorted value order.
// encodedDatasetAsString and encodedFeatures must start as copies of the
// originals (the caller initializes them that way); they are edited in place.
void encodeData(vector <vector<string>> datasetAsString, vector <vector<string>>& encodedDatasetAsString,
                vector <string> features, vector <string>& encodedFeatures, vector<int> featuresToEncode){
    // Per-feature map: value -> one-hot slot. Kept separate per feature so a
    // string that occurs in two different categorical features cannot corrupt
    // the other feature's numbering.
    map<int, map<string, int>> featureUniqueValues;
    for (int encodeIdx : featuresToEncode) {
        map<string, int>& unique = featureUniqueValues[encodeIdx];
        for (const string& value : datasetAsString[encodeIdx]) {
            // emplace is a no-op when the value was already seen.
            unique.emplace(value, 0);
        }
        // Renumber slots in sorted (map-iteration) order so the one-hot column
        // written for a value lines up with the feature name pushed for it.
        int slot = 0;
        for (auto& valueAndSlot : unique) {
            valueAndSlot.second = slot++;
        }
    }
    // Process features from highest index to lowest: erasing a column then
    // cannot shift the position of a feature still waiting to be encoded.
    for (auto featItr = featureUniqueValues.rbegin(); featItr != featureUniqueValues.rend(); ++featItr) {
        int featIdx = featItr->first;
        const map<string, int>& unique = featItr->second;
        // Replace the categorical feature name with one name per unique value.
        encodedFeatures.erase(encodedFeatures.begin() + featIdx);
        size_t firstNewCol = encodedDatasetAsString.size();
        vector<string> emptyRow(datasetAsString[0].size());
        for (const auto& valueAndSlot : unique) {
            encodedFeatures.push_back(valueAndSlot.first);
            encodedDatasetAsString.push_back(emptyRow);
        }
        // Fill the new columns: "1" in the column matching each sample's
        // value, "0" everywhere else.
        for (size_t dataIdx = 0; dataIdx < datasetAsString.at(featIdx).size(); dataIdx++) {
            int hot = unique.at(datasetAsString.at(featIdx).at(dataIdx));
            for (size_t k = 0; k < unique.size(); k++) {
                encodedDatasetAsString.at(firstNewCol + k).at(dataIdx) = (k == (size_t)hot) ? "1" : "0";
            }
        }
        // Drop the original categorical column.
        encodedDatasetAsString.erase(encodedDatasetAsString.begin() + featIdx);
    }
}
// Builds a one-hot vector of length `size`: all zeros except a 1 at
// position `value`. Throws std::out_of_range if value is outside [0, size).
vector<int> binaryShift(int size, int value){
    vector<int> oneHot(size);
    oneHot.at(value) = 1;
    return oneHot;
}
void cleanTree(Node *node) {
if (node->isLeaf) {
delete node->question;
......
......@@ -44,6 +44,10 @@ vector<int> randomSelect_WithoutReplacement(int originalNum, float percentTraini
vector<int> oversample(vector<vector<string>> &data, vector<int> &indices);
BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, vector <vector<string>> &data,
vector <FeatureType> featureType, float featureWeight, vector<int>& nodeDatasetIndices );
void encodeData(vector <vector<string>> datasetAsString, vector <vector<string>>& encodedDatasetAsString,
vector <string> features, vector <string>& encodedFeatures, vector<int> featuresToEncode);
vector<int> binaryShift(int size, int value);
void cleanTree(Node *node);
#endif
\ No newline at end of file
......@@ -41,91 +41,112 @@ int main(int argc, char *argv[]) {
// cout << featWeight << "\n";
// cout << featWeight << "\n";
vector <vector<string>> datasetAsString;
vector <vector<string>> datasetAsString,encodedDatasetAsString;
vector <FeatureType> featureTypes;
vector <string> features;
vector <string> features,encodedfeatures;
if (argc == 4) {
cout << "Dataset: " << argv[3] << endl;
datasetAsString = parseDataToString( (string)argv[3] + ".data");
datasetAsString,encodedDatasetAsString = parseDataToString( (string)argv[3] + ".data");
featureTypes = parseFeatureTypes( (string)argv[3] + ".featureTypes");
features = parseFeatures( (string)argv[3] + ".features");
features,encodedfeatures = parseFeatures( (string)argv[3] + ".features");
}
else {
cout << "WARNING: No dataset provided as an argument!" << endl;
datasetAsString = parseDataToString("../datasets/adult.data");
datasetAsString = parseDataToString("../datasets/adult1.data");
featureTypes = parseFeatureTypes("../datasets/adult.featureTypes");
features = parseFeatures("../datasets/adult.features");
}
encodedDatasetAsString = datasetAsString;
encodedDatasetAsString.pop_back();
encodedfeatures = features;
vector<int> featuresToEncode;
featuresToEncode.push_back(5);
encodeData(datasetAsString,encodedDatasetAsString,features,encodedfeatures,featuresToEncode);
for(string feature:encodedfeatures){
cout << " " << feature << " ";
}
cout<<endl;
for(int dataIdx=0; dataIdx < encodedDatasetAsString[0].size(); dataIdx++){
for(int i = 0; i< encodedDatasetAsString.size();i++){
cout << encodedDatasetAsString[i][dataIdx]<<",";
}
cout<<endl;
}
//pick number of features to select for random sub-spacing
float featureWeight = sqrt(features.size())/features.size();
// //pick number of features to select for random sub-spacing
// float featureWeight = sqrt(features.size())/features.size();
double accuracy = 0.0;
double time = 0.0;
for (int x = 0; x < 3; x++) {
vector<int> trainingIdxs = randomSelect_WithoutReplacement(datasetAsString.at(0).size(), 0.7);
//vector <vector<string>> trainingData;
vector <int> testingIdxs = splitTrainingAndTesting(trainingIdxs, datasetAsString);
// double accuracy = 0.0;
// double time = 0.0;
// for (int x = 0; x < 3; x++) {
// vector<int> trainingIdxs = randomSelect_WithoutReplacement(datasetAsString.at(0).size(), 0.7);
// //vector <vector<string>> trainingData;
// vector <int> testingIdxs = splitTrainingAndTesting(trainingIdxs, datasetAsString);
cout << "Over sampling training data " << endl;
// cout << "Over sampling training data " << endl;
vector<int> oversampledData = oversample(datasetAsString, trainingIdxs);
// vector<int> oversampledData = oversample(datasetAsString, trainingIdxs);
trainingIdxs.insert(trainingIdxs.end(), oversampledData.begin(), oversampledData.end());
// sort(trainingIdxs.begin(), trainingIdxs.end());
// trainingIdxs.insert(trainingIdxs.end(), oversampledData.begin(), oversampledData.end());
// // sort(trainingIdxs.begin(), trainingIdxs.end());
vector <string> testData;
string emptystring;
for (int featIndex = 0; featIndex < datasetAsString.size(); featIndex++) {
testData.push_back(emptystring);
}
// vector <string> testData;
// string emptystring;
// for (int featIndex = 0; featIndex < datasetAsString.size(); featIndex++) {
// testData.push_back(emptystring);
// }
auto start = high_resolution_clock::now();
// auto start = high_resolution_clock::now();
RandomForest *randomForest = new RandomForest(datasetAsString, trainingIdxs, featureTypes, numTrees,
baggingWeight, featureWeight, depth);
// RandomForest *randomForest = new RandomForest(datasetAsString, trainingIdxs, featureTypes, numTrees,
// baggingWeight, featureWeight, depth);
time += (high_resolution_clock::now() - start).count() / 1000000000.0;
// time += (high_resolution_clock::now() - start).count() / 1000000000.0;
cout << endl;
// cout << endl;
// cout << "********************* Forest accuracy *****************" << endl;
// accuracyReport report = randomForest->getAccuracy(datasetAsString,testingIdxs);
//
// accuracy += report.accuracy;
// randomForest->printAccuracyReportFile(report);
cout << "**************** prediction with explanation ********** " << endl;
// cout << "**************** prediction with explanation ********** " << endl;
for (int featIndex = 0; featIndex < datasetAsString.size(); featIndex++) {
testData.at(featIndex) = datasetAsString.at(featIndex)[testingIdxs[0]];
cout << datasetAsString.at(featIndex)[testingIdxs[0]] << ", ";
}
cout << endl;
// for (int featIndex = 0; featIndex < datasetAsString.size(); featIndex++) {
// testData.at(featIndex) = datasetAsString.at(featIndex)[testingIdxs[0]];
// cout << datasetAsString.at(featIndex)[testingIdxs[0]] << ", ";
// }
// cout << endl;
randomForest->getForestPrediction(testData, randomForest, features);
// randomForest->getForestPrediction(testData, randomForest, features);
for (int i = 0; i<randomForest->trees.size(); i++){
// for (int i = 0; i<randomForest->trees.size(); i++){
cleanTree(randomForest->trees[i]->root);
delete randomForest->trees[i];
}
delete randomForest;
// cleanTree(randomForest->trees[i]->root);
// delete randomForest->trees[i];
// }
// delete randomForest;
}
ofstream outfile;
outfile.open("avg.txt", ios::app);
outfile << "------ Report ------ " << endl;
outfile << numTrees << "\t" << depth << "\t" << featureWeight << "\t" << baggingWeight << "\t" << accuracy / 3
<< "\t" << time / 3 << endl;
// outfile<< numTrees<<"\t"<<10<<"\t"<<0.7<<"\t"<<baggingWeight<<"\t"<<accuracy/3<<"\t"<<time/3<<endl;
outfile.close();
// }
// ofstream outfile;
// outfile.open("avg.txt", ios::app);
// outfile << "------ Report ------ " << endl;
// outfile << numTrees << "\t" << depth << "\t" << featureWeight << "\t" << baggingWeight << "\t" << accuracy / 3
// << "\t" << time / 3 << endl;
// // outfile<< numTrees<<"\t"<<10<<"\t"<<0.7<<"\t"<<baggingWeight<<"\t"<<accuracy/3<<"\t"<<time/3<<endl;
// outfile.close();
return 0;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment