Commit bc6c7c23 authored by mccrabb's avatar mccrabb
Browse files

Generalized which features to encode; cleaned up main and the encoder.

parent 41104df3
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
......@@ -16,3 +16,41 @@
6 4 0.0980581 0.7 72.9768 5.06558
------ Report ------
5 5 0.0980581 0.7 72.9068 0.987395
------ Report ------
5 10 0.0980581 0.7 70.7003 0.259982
------ Report ------
15 10 0.0980581 0.7 72.1811 0.336416
------ Report ------
25 10 0.0980581 0.7 71.4333 0.402313
------ Report ------
35 10 0.0980581 0.7 71.448 0.571884
------ Report ------
45 10 0.0980581 0.7 71.3191 0.618753
------ Report ------
55 10 0.0980581 0.7 71.647 0.737441
------ Report ------
65 10 0.0980581 0.7 71.6948 0.747065
------ Report ------
75 10 0.0980581 0.7 71.6322 0.863602
------ Report ------
85 10 0.0980581 0.7 71.4259 0.936212
------ Report ------
95 10 0.0980581 0.7 71.9674 1.01317
------ Report ------
5 10 0.0980581 0.7 72.8257 0.27168
------ Report ------
15 10 0.0980581 0.7 71.7759 0.29845
------ Report ------
25 10 0.0980581 0.7 72.1295 0.445674
------ Report ------
35 10 0.0980581 0.7 71.6285 0.529526
------ Report ------
45 10 0.0980581 0.7 71.3486 0.565248
------ Report ------
5 10 0.0980581 0.7 69.9856 0.281512
------ Report ------
5 10 0.0980581 0.7 72.3984 0.280475
------ Report ------
5 10 0.0980581 0.7 24.7099 0.100397
------ Report ------
5 10 0.0980581 0.7 23.5017 0.108896
This diff is collapsed.
......@@ -6,10 +6,13 @@ void encodeData(vector <vector<string>> datasetAsString, vector <vector<string>>
vector <string> features, vector <string>& encodedFeatures, vector<int> featuresToEncode,vector <FeatureType> featureTypes,vector <FeatureType>& encodedFeatureTypes){
map<string, int> uniqueValues;
map<int,map<string, int>,greater <int>> featureUniqueValues;
map<int,map<string, int>,greater <int>> featureUniqueValues; // maps featureIdx --> map(string --> String encoded as a number)
// for each feature
for (int encodeidx :featuresToEncode){
int uniqueCounter = 0 ;
// for each data entry
for(int dataidx =0; dataidx<datasetAsString[encodeidx].size();dataidx++){
// if the value has been seen for this feature
if(uniqueValues.count(datasetAsString[encodeidx][dataidx])){
//TODO why do we need this??
uniqueValues[datasetAsString[encodeidx][dataidx]]+=1;
......@@ -26,7 +29,7 @@ void encodeData(vector <vector<string>> datasetAsString, vector <vector<string>>
}
map<int, map<string, int>>::iterator featUniqueItr;
// For each feature being encoded
for (featUniqueItr = featureUniqueValues.begin(); featUniqueItr != featureUniqueValues.end(); featUniqueItr++) {
int featIdx = featUniqueItr ->first;
encodedFeatures.erase(encodedFeatures.begin()+featIdx);
......@@ -34,28 +37,30 @@ void encodeData(vector <vector<string>> datasetAsString, vector <vector<string>>
map<string, int> unique = featUniqueItr->second;
map<string, int>::iterator uniqueItr;
vector <string> emptyRow(datasetAsString[0].size());
// For each category option in the feature, add as a new binary feature
for(uniqueItr = unique.begin(); uniqueItr!= unique.end(); uniqueItr++){
encodedFeatures.push_back(uniqueItr->first);
encodedFeatureTypes.push_back(CATEGORICAL);
encodedDatasetAsString.push_back(emptyRow);
}
// For each data entry
for(int dataIdx = 0; dataIdx < datasetAsString.at(featIdx).size(); dataIdx++){
int uniqueint = unique[datasetAsString.at(featIdx).at(dataIdx)];
// create one-hot encoding for each feature (if 3rd category of 4 options -> 0010)
vector<int> binaryValues = binaryShift(unique.size(),uniqueint);
// Assign values for each encoded feature (4 feature categories --> 4 new features to assign)
for (int i = 0; i<binaryValues.size(); i++){
encodedDatasetAsString.at(encodedDatasetAsString.size()-(binaryValues.size()-i)).at(dataIdx) = to_string(binaryValues.at(i));
}
}
// erase original feature
encodedDatasetAsString.erase(encodedDatasetAsString.begin()+featIdx);
}
}
// create feature-specific one-hot encoding vector
vector<int> binaryShift(int size, int value){
vector<int> binaryValues(size,0);
binaryValues.at(value) = 1;
......
No preview for this file type
No preview for this file type
......@@ -59,111 +59,80 @@ int main(int argc, char *argv[]) {
}
encodedDatasetAsString = datasetAsString;
vector<string> finalLables = encodedDatasetAsString.back();
vector<string> finalLabels = encodedDatasetAsString.back();
encodedDatasetAsString.pop_back();
encodedfeatures = features;
encodedFeatureTypes = featureTypes;
// Encode categorical data (convert from string to one-hot encoding)
vector<int> featuresToEncode;
featuresToEncode.push_back(1);
featuresToEncode.push_back(3);
featuresToEncode.push_back(5);
featuresToEncode.push_back(6);
featuresToEncode.push_back(7);
featuresToEncode.push_back(8);
featuresToEncode.push_back(9);
featuresToEncode.push_back(13);
std::sort(featuresToEncode.begin(), featuresToEncode.end(), std::greater<int>());
for(int i = 0; i < featureTypes.size(); i++) {
if (featureTypes.at(i) == CATEGORICAL) {
featuresToEncode.push_back(i);
}
}
std::sort(featuresToEncode.begin(), featuresToEncode.end(), std::greater<int>());
encodeData(datasetAsString,encodedDatasetAsString,features,encodedfeatures,featuresToEncode,featureTypes,encodedFeatureTypes);
encodedDatasetAsString.push_back(finalLables);
// for(string feature:encodedfeatures){
// cout << " " << feature << " ";
// }
// cout<<endl;
// for(FeatureType ft:encodedFeatureTypes){
// cout<< " "<<ft<< " ";
// }
// cout<<endl;
// for(int dataIdx=0; dataIdx < encodedDatasetAsString[0].size(); dataIdx++){
// for(int i = 0; i< encodedDatasetAsString.size();i++){
// cout << encodedDatasetAsString[i][dataIdx]<<",";
// }
// cout<<endl;
// }
encodedDatasetAsString.push_back(finalLabels);
//pick number of features to select for random sub-spacing
float featureWeight = sqrt(encodedfeatures.size())/encodedfeatures.size();
double accuracy = 0.0;
double time = 0.0;
for (int x = 0; x < 3; x++) {
for (int x = 0; x < 1; x++) {
// Split Training and Testing
vector<int> trainingIdxs = randomSelect_WithoutReplacement(encodedDatasetAsString.at(0).size(), 0.7);
//vector <vector<string>> trainingData;
vector <int> testingIdxs = splitTrainingAndTesting(trainingIdxs, encodedDatasetAsString);
cout << "Over sampling training data " << endl;
// Oversample training data
vector<int> oversampledData = oversample(encodedDatasetAsString, trainingIdxs);
trainingIdxs.insert(trainingIdxs.end(), oversampledData.begin(), oversampledData.end());
// sort(trainingIdxs.begin(), trainingIdxs.end());
// Prep a single test data entry slot
vector <string> testData;
string emptystring;
for (int featIndex = 0; featIndex < encodedDatasetAsString.size(); featIndex++) {
testData.push_back(emptystring);
}
// Start Timer
auto start = high_resolution_clock::now();
// Build Forest
RandomForest *randomForest = new RandomForest(encodedDatasetAsString, trainingIdxs, encodedFeatureTypes, numTrees,
baggingWeight, featureWeight, depth);
// Stop Timer
time += (high_resolution_clock::now() - start).count() / 1000000000.0;
cout << endl;
cout << "********************* Forest accuracy *****************" << endl;
accuracyReport report = randomForest->getAccuracy(encodedDatasetAsString,testingIdxs);
cout << "********************* Forest accuracy *****************" << endl;
accuracyReport report = randomForest->getAccuracy(encodedDatasetAsString,testingIdxs);
accuracy += report.accuracy;
randomForest->printAccuracyReportFile(report);
accuracy += report.accuracy;
randomForest->printAccuracyReportFile(report);
cout << "**************** prediction with explanation ********** " << endl;
// Setup one test entry for prediction & Explanation
for (int featIndex = 0; featIndex < encodedDatasetAsString.size(); featIndex++) {
testData.at(featIndex) = encodedDatasetAsString.at(featIndex)[testingIdxs[0]];
cout << encodedDatasetAsString.at(featIndex)[testingIdxs[0]] << ", ";
}
cout << endl;
randomForest->predict("HARD",testData, randomForest, encodedfeatures);
// Predict and explain entry
randomForest->predict("HARD",testData, randomForest, encodedfeatures);
// Delete Tree
for (int i = 0; i<randomForest->trees.size(); i++){
cleanTree(randomForest->trees[i]->root);
delete randomForest->trees[i];
}
delete randomForest;
}
ofstream outfile;
outfile.open("avg.txt", ios::app);
outfile << "------ Report ------ " << endl;
outfile << numTrees << "\t" << depth << "\t" << featureWeight << "\t" << baggingWeight << "\t" << accuracy / 3
<< "\t" << time / 3 << endl;
// outfile<< numTrees<<"\t"<<10<<"\t"<<0.7<<"\t"<<baggingWeight<<"\t"<<accuracy/3<<"\t"<<time/3<<endl;
outfile.close();
......
No preview for this file type
No preview for this file type
#!/bin/bash
for i in {5..100..10}
#for i in {5..100..10}
for i in {5..5..1}
do
./race $i 10
# for j in {5..7}
......@@ -14,4 +15,4 @@ done
## do
## ./race $i $j
## done
#done
\ No newline at end of file
#done
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment