Commit 738f0a86 authored by ahmedaj

dataset encoding added

parent 5461658f
39, State-gov, 00000, Bachelors, 13, Never-married, Adm-clerical, Not-in-family, White, Male, 2174, 0, 40, United-States, <=50K
50, Self-emp-not-inc, 00000, Bachelors, 13, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 13, United-States, <=50K
38, Private, 000000, HS-grad, 9, Divorced, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K
\ No newline at end of file
@@ -568,6 +568,63 @@ vector<int> oversample(vector <vector<string>> &data, vector<int> &indices) {
}
// One-hot encodes the categorical columns listed in featuresToEncode.
// datasetAsString is feature-major: datasetAsString[f][r] is the value of
// feature f for row r. The encoded copies (encodedDatasetAsString /
// encodedFeatures) lose the original column and gain one 0/1 column per
// unique value of that feature.
void encodeData(vector <vector<string>> datasetAsString, vector <vector<string>>& encodedDatasetAsString,
                vector <string> features, vector <string>& encodedFeatures, vector<int> featuresToEncode){
    map<string, int> uniqueValues;                  // value -> occurrence count (detects first sightings)
    map<int, map<string, int>> featureUniqueValues; // feature index -> (value -> integer id)
    for (int encodeidx : featuresToEncode){
        int uniqueCounter = 0;
        for(int dataidx = 0; dataidx < datasetAsString[encodeidx].size(); dataidx++){
            if(uniqueValues.count(datasetAsString[encodeidx][dataidx])){
                uniqueValues[datasetAsString[encodeidx][dataidx]] += 1;
            }
            else{
                // First time this value is seen: give it the next integer id.
                featureUniqueValues[encodeidx].insert({datasetAsString[encodeidx][dataidx], uniqueCounter});
                uniqueValues[datasetAsString[encodeidx][dataidx]] = 1;
                uniqueCounter++;
            }
        }
    }
    map<int, map<string, int>>::iterator featUniqueItr;
    for (featUniqueItr = featureUniqueValues.begin(); featUniqueItr != featureUniqueValues.end(); featUniqueItr++) {
        int featIdx = featUniqueItr->first;
        // Drop the original feature name; its one-hot replacements are appended below.
        encodedFeatures.erase(encodedFeatures.begin() + featIdx);
        map<string, int> unique = featUniqueItr->second;
        map<string, int>::iterator uniqueItr;
        vector <string> emptyRow(datasetAsString[0].size());
        for(uniqueItr = unique.begin(); uniqueItr != unique.end(); uniqueItr++){
            encodedFeatures.push_back(uniqueItr->first);   // new feature named after the value
            encodedDatasetAsString.push_back(emptyRow);    // new, initially empty column
        }
        // Fill the newly appended columns: each row gets a 1 in the column of its own value.
        for(int dataIdx = 0; dataIdx < datasetAsString.at(featIdx).size(); dataIdx++){
            int uniqueint = unique[datasetAsString.at(featIdx).at(dataIdx)];
            vector<int> binaryValues = binaryShift(unique.size(), uniqueint);
            for (int i = 0; i < binaryValues.size(); i++){
                encodedDatasetAsString.at(encodedDatasetAsString.size() - (binaryValues.size() - i)).at(dataIdx) = to_string(binaryValues.at(i));
            }
        }
        // Remove the original (un-encoded) column from the encoded dataset.
        encodedDatasetAsString.erase(encodedDatasetAsString.begin() + featIdx);
    }
}

// Returns a one-hot vector of length size with a single 1 at position value.
vector<int> binaryShift(int size, int value){
    vector<int> binaryValues(size, 0);
    binaryValues.at(value) = 1;
    return binaryValues;
}
void cleanTree(Node *node) {
    if (node->isLeaf) {
        delete node->question;
...
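As a side note for readers of this commit, the scheme binaryShift implements is plain one-hot encoding: a feature with N unique values becomes N 0/1 columns, and each row gets a 1 only in the column of its own value. Below is a minimal standalone sketch of that idea; the oneHot helper and the example category names are illustrative only, not part of the repository.

#include <iostream>
#include <string>
#include <vector>
using namespace std;

// Same idea as binaryShift: a vector of `size` zeros with a single 1 at `value`.
static vector<int> oneHot(int size, int value) {
    vector<int> bits(size, 0);
    bits.at(value) = 1;
    return bits;
}

int main() {
    // Hypothetical 3-value categorical feature and its assigned ids 0..2.
    vector<string> categories = {"State-gov", "Private", "Self-emp-not-inc"};
    for (int v = 0; v < (int)categories.size(); v++) {
        vector<int> bits = oneHot((int)categories.size(), v);
        cout << categories[v] << " -> ";
        for (int b : bits) cout << b << ' ';
        cout << '\n';   // e.g. "State-gov -> 1 0 0"
    }
    return 0;
}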
@@ -44,6 +44,10 @@ vector<int> randomSelect_WithoutReplacement(int originalNum, float percentTraini
vector<int> oversample(vector<vector<string>> &data, vector<int> &indices);
BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, vector <vector<string>> &data,
                             vector <FeatureType> featureType, float featureWeight, vector<int>& nodeDatasetIndices );
void encodeData(vector <vector<string>> datasetAsString, vector <vector<string>>& encodedDatasetAsString,
vector <string> features, vector <string>& encodedFeatures, vector<int> featuresToEncode);
vector<int> binaryShift(int size, int value);
void cleanTree(Node *node);
#endif
\ No newline at end of file
@@ -41,91 +41,112 @@ int main(int argc, char *argv[]) {
    // cout << featWeight << "\n";
    // cout << featWeight << "\n";
-   vector <vector<string>> datasetAsString;
+   vector <vector<string>> datasetAsString,encodedDatasetAsString;
    vector <FeatureType> featureTypes;
-   vector <string> features;
+   vector <string> features,encodedfeatures;
    if (argc == 4) {
        cout << "Dataset: " << argv[3] << endl;
-       datasetAsString = parseDataToString( (string)argv[3] + ".data");
+       datasetAsString,encodedDatasetAsString = parseDataToString( (string)argv[3] + ".data");
        featureTypes = parseFeatureTypes( (string)argv[3] + ".featureTypes");
-       features = parseFeatures( (string)argv[3] + ".features");
+       features,encodedfeatures = parseFeatures( (string)argv[3] + ".features");
    }
    else {
        cout << "WARNING: No dataset provided as an argument!" << endl;
-       datasetAsString = parseDataToString("../datasets/adult.data");
+       datasetAsString = parseDataToString("../datasets/adult1.data");
        featureTypes = parseFeatureTypes("../datasets/adult.featureTypes");
        features = parseFeatures("../datasets/adult.features");
    }
    // Demo of the new encoding: copy the dataset (minus its last column),
    // one-hot encode feature index 5, then print the encoded feature names
    // followed by every encoded row.
    encodedDatasetAsString = datasetAsString;
    encodedDatasetAsString.pop_back();
    encodedfeatures = features;
    vector<int> featuresToEncode;
    featuresToEncode.push_back(5);
    encodeData(datasetAsString, encodedDatasetAsString, features, encodedfeatures, featuresToEncode);
    for(string feature : encodedfeatures){
        cout << " " << feature << " ";
    }
    cout << endl;
    for(int dataIdx = 0; dataIdx < encodedDatasetAsString[0].size(); dataIdx++){
        for(int i = 0; i < encodedDatasetAsString.size(); i++){
            cout << encodedDatasetAsString[i][dataIdx] << ",";
        }
        cout << endl;
    }
    (This commit comments out the experiment loop below, which previously ran as live code: train/test split, oversampling, forest training, prediction with explanation, and the avg.txt report.)

    // //pick number of features to select for random sub-spacing
    // float featureWeight = sqrt(features.size())/features.size();
    // double accuracy = 0.0;
    // double time = 0.0;
    // for (int x = 0; x < 3; x++) {
    // vector<int> trainingIdxs = randomSelect_WithoutReplacement(datasetAsString.at(0).size(), 0.7);
    // //vector <vector<string>> trainingData;
    // vector <int> testingIdxs = splitTrainingAndTesting(trainingIdxs, datasetAsString);
    // cout << "Over sampling training data " << endl;
    // vector<int> oversampledData = oversample(datasetAsString, trainingIdxs);
    // trainingIdxs.insert(trainingIdxs.end(), oversampledData.begin(), oversampledData.end());
    // // sort(trainingIdxs.begin(), trainingIdxs.end());
    // vector <string> testData;
    // string emptystring;
    // for (int featIndex = 0; featIndex < datasetAsString.size(); featIndex++) {
    // testData.push_back(emptystring);
    // }
    // auto start = high_resolution_clock::now();
    // RandomForest *randomForest = new RandomForest(datasetAsString, trainingIdxs, featureTypes, numTrees,
    //                                               baggingWeight, featureWeight, depth);
    // time += (high_resolution_clock::now() - start).count() / 1000000000.0;
    // cout << endl;
    // cout << "********************* Forest accuracy *****************" << endl;
    // accuracyReport report = randomForest->getAccuracy(datasetAsString,testingIdxs);
    //
    // accuracy += report.accuracy;
    // randomForest->printAccuracyReportFile(report);
    // cout << "**************** prediction with explanation ********** " << endl;
    // for (int featIndex = 0; featIndex < datasetAsString.size(); featIndex++) {
    // testData.at(featIndex) = datasetAsString.at(featIndex)[testingIdxs[0]];
    // cout << datasetAsString.at(featIndex)[testingIdxs[0]] << ", ";
    // }
    // cout << endl;
    // randomForest->getForestPrediction(testData, randomForest, features);
    // for (int i = 0; i<randomForest->trees.size(); i++){
    // cleanTree(randomForest->trees[i]->root);
    // delete randomForest->trees[i];
    // }
    // delete randomForest;
    // }
    // ofstream outfile;
    // outfile.open("avg.txt", ios::app);
    // outfile << "------ Report ------ " << endl;
    // outfile << numTrees << "\t" << depth << "\t" << featureWeight << "\t" << baggingWeight << "\t" << accuracy / 3
    //         << "\t" << time / 3 << endl;
    // // outfile<< numTrees<<"\t"<<10<<"\t"<<0.7<<"\t"<<baggingWeight<<"\t"<<accuracy/3<<"\t"<<time/3<<endl;
    // outfile.close();
    return 0;
...
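One detail worth spelling out for the print loop added above: it indexes the dataset as encodedDatasetAsString[feature][row], which suggests a feature-major layout (one inner vector per column). Here is a minimal sketch under that assumption, with made-up column names and values, showing why printing a row means fixing the row index and walking over the outer (feature) index. It is an illustration only, not code from this repository.

#include <iostream>
#include <string>
#include <vector>
using namespace std;

int main() {
    // Assumed feature-major layout: data[f][r] is the value of feature f for row r.
    vector<vector<string>> data = {
        {"State-gov", "Private", "Private"},   // column 0: workclass
        {"Bachelors", "HS-grad", "Masters"}    // column 1: education
    };
    // Print row by row: the inner loop walks across columns for a fixed row.
    for (size_t r = 0; r < data[0].size(); r++) {
        for (size_t f = 0; f < data.size(); f++) {
            cout << data[f][r] << (f + 1 < data.size() ? "," : "\n");
        }
    }
    return 0;
}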