Commit 468673fd authored by mccrabb
Browse files

Fixed encoding for 4-arg runs + more cleaning

parent bc6c7c23
......@@ -11,6 +11,7 @@
using namespace std::chrono;
// Return a vector (size=threads) with number of trees that thread will produce
vector<int> getParts(int trees, int cpus) {
vector<int> temp;
if (cpus > trees) {
......@@ -42,31 +43,38 @@ vector<int> getParts(int trees, int cpus) {
RandomForest::RandomForest(vector <vector<string>> &data, vector<int> &trainingIndx, vector <FeatureType> &featureTypes,
int numTrees,
float baggingWeight, float featureWeight, int maxDepth) {
// Set the basic tree properties
vector < DecisionTree * > decisionTrees;
this->featureWeight = featureWeight;
this->depth = maxDepth;
// determine # available threads
unsigned num_cpus = std::thread::hardware_concurrency() / 2;
// unsigned num_cpus = 12;
if (numTrees < num_cpus)
num_cpus = numTrees;
// A mutex ensures orderly access.
std::mutex iomutex;
std::vector <std::thread> threads(num_cpus);
cout << "Launching " << num_cpus << " jobs for training trees.\n";
// Determine how many trees per thread
vector<int> temp = getParts(numTrees, num_cpus);
vector<int> temp = getParts(numTrees, num_cpus); //determine how many trees to run in parallel
// Start each thread
for (int i = 0; i < num_cpus; i++) {
if (i < temp.size())
if (i < temp.size()) {
// begin definition of thread
threads[i] = std::thread([&iomutex, i, temp, &data, &trainingIndx, baggingWeight,
maxDepth, featureWeight, &featureTypes, &decisionTrees] {
// for each tree the thread should build
for (int j = 0; j < temp.at(i); j++) {
// bag data
vector<int> baggedData = bagData(trainingIndx, baggingWeight); //TODO fix this
// build Tree
cout << "Training tree " << j << " in thread " << i << endl;
DecisionTree *tree = new DecisionTree(data, baggedData, maxDepth, featureWeight, featureTypes);
cout << "Done training tree " << j << " in thread " << i << endl;
// Add Tree to decisionTrees (vector of the trees for the forest)
{
// Use a lexical scope and lock_guard to safely lock the mutex only for
// the duration of vector push.
......@@ -74,17 +82,14 @@ RandomForest::RandomForest(vector <vector<string>> &data, vector<int> &trainingI
decisionTrees.push_back(tree);
}
}
});
}); // End definition of thread
}
}
// end threads and assign trees to forest
for (auto &t : threads) {
t.join();
}
this->trees = decisionTrees;
}
......
No preview for this file type
......@@ -54,3 +54,33 @@
5 10 0.0980581 0.7 24.7099 0.100397
------ Report ------
5 10 0.0980581 0.7 23.5017 0.108896
------ Report ------
5 10 0.0980581 0.7 24.0984 0.105467
------ Report ------
5 10 0.0980581 0.7 23.3249 0.0787362
------ Report ------
5 10 0.0980581 0.7 22.4113 0.0799633
------ Report ------
5 10 0.0980581 0.7 21.1883 0.0906889
------ Report ------
100 20 0.0980581 0.7 23.7301 0.363827
------ Report ------
5 5 0.0980581 0.7 23.0781 0.0793603
------ Report ------
5 5 0.0764719 0.7 32.9167 0.00538968
------ Report ------
5 5 0.0764719 0.7 28.2639 0.00614722
------ Report ------
5 5 0.0980581 0.7 23.4943 0.0742166
------ Report ------
5 5 0.0980581 0.7 23.6269 0.0793303
------ Report ------
5 5 0.0764719 0.7 29.1667 0.00527632
------ Report ------
5 5 0.0467269 0.7 24.0578 0.022341
------ Report ------
5 5 0.0467269 0.7 21.8973 0.0229613
------ Report ------
5 5 0.0467269 0.7 24.4314 0.0239732
------ Report ------
100 20 0.0467269 0.7 25.4386 0.155197
This diff is collapsed.
......@@ -39,6 +39,7 @@ void encodeData(vector <vector<string>> datasetAsString, vector <vector<string>>
vector <string> emptyRow(datasetAsString[0].size());
// For each category option in the feature, add as a new binary feature
for(uniqueItr = unique.begin(); uniqueItr!= unique.end(); uniqueItr++){
cout << "Adding feature: " << uniqueItr->first << endl;
encodedFeatures.push_back(uniqueItr->first);
encodedFeatureTypes.push_back(CATEGORICAL);
encodedDatasetAsString.push_back(emptyRow);
......@@ -56,9 +57,7 @@ void encodeData(vector <vector<string>> datasetAsString, vector <vector<string>>
}
// erase original feature
encodedDatasetAsString.erase(encodedDatasetAsString.begin()+featIdx);
}
}
// create feature-specific one-hot encoding vector
vector<int> binaryShift(int size, int value){
......
No preview for this file type
......@@ -580,4 +580,4 @@ void cleanTree(Node *node) {
cleanTree(node->falseBranch);
delete node->question;
delete node;
}
\ No newline at end of file
}
......@@ -28,36 +28,36 @@ vector <string> parseFeatures(string fileName) {
int main(int argc, char *argv[]) {
// Confirm 3-4 args: race [trees] [depth] [optional: dataset]
if ((argc <= 2) || (argc >= 5)) {
cout << "Given " << to_string(argc) <<" args. Need 2 or 3." << endl;
cout << "race [numTrees] [depth] [dataset (no suffix, optional)]" << endl;
exit(1);
}
// Set #trees and depth
int numTrees = atoi(argv[1]);
float baggingWeight = 0.7;
int depth = atoi(argv[2]);
// double featWeight = numFeatures * 0.1;
// cout << featWeight << "\n";
// cout << featWeight << "\n";
// Parse data from files
vector <vector<string>> datasetAsString,encodedDatasetAsString;
vector <FeatureType> featureTypes, encodedFeatureTypes;
vector <string> features,encodedfeatures;
if (argc == 4) {
cout << "Dataset: " << argv[3] << endl;
datasetAsString,encodedDatasetAsString = parseDataToString( (string)argv[3] + ".data");
datasetAsString = parseDataToString( (string)argv[3] + ".data");
featureTypes = parseFeatureTypes( (string)argv[3] + ".featureTypes");
features,encodedfeatures = parseFeatures( (string)argv[3] + ".features");
features = parseFeatures( (string)argv[3] + ".features");
}
else {
else { // if no dataset provided...
cout << "WARNING: No dataset provided as an argument!" << endl;
datasetAsString = parseDataToString("../datasets/adult.data");
featureTypes = parseFeatureTypes("../datasets/adult.featureTypes");
features = parseFeatures("../datasets/adult.features");
}
// Prep for encoding
encodedDatasetAsString = datasetAsString;
vector<string> finalLabels = encodedDatasetAsString.back();
encodedDatasetAsString.pop_back();
......@@ -80,13 +80,18 @@ int main(int argc, char *argv[]) {
double accuracy = 0.0;
double time = 0.0;
// loop for number of identical trials to run
for (int x = 0; x < 1; x++) {
// Split Training and Testing
vector<int> trainingIdxs = randomSelect_WithoutReplacement(encodedDatasetAsString.at(0).size(), 0.7);
vector <int> testingIdxs = splitTrainingAndTesting(trainingIdxs, encodedDatasetAsString);
// Oversample training data
vector<int> oversampledData = oversample(encodedDatasetAsString, trainingIdxs);
trainingIdxs.insert(trainingIdxs.end(), oversampledData.begin(), oversampledData.end());
// Prep a single test data entry slot
vector <string> testData;
string emptystring;
......@@ -102,25 +107,23 @@ int main(int argc, char *argv[]) {
// Stop Timer
time += (high_resolution_clock::now() - start).count() / 1000000000.0;
// Get and Print overall accuracy
cout << endl;
cout << "********************* Forest accuracy *****************" << endl;
accuracyReport report = randomForest->getAccuracy(encodedDatasetAsString,testingIdxs);
accuracy += report.accuracy;
randomForest->printAccuracyReportFile(report);
// Get and Print one test entry, its prediction, and its explanation
cout << "**************** prediction with explanation ********** " << endl;
// Setup one test entry for prediction & Explanation
for (int featIndex = 0; featIndex < encodedDatasetAsString.size(); featIndex++) {
testData.at(featIndex) = encodedDatasetAsString.at(featIndex)[testingIdxs[0]];
cout << encodedDatasetAsString.at(featIndex)[testingIdxs[0]] << ", ";
cout << encodedfeatures.at(featIndex) << "=" << encodedDatasetAsString.at(featIndex)[testingIdxs[0]] << ", ";
}
cout << endl;
// Predict and explain entry
randomForest->predict("HARD",testData, randomForest, encodedfeatures);
// Delete Tree
// Delete Tree (necessary for multiple identical trials
for (int i = 0; i<randomForest->trees.size(); i++){
cleanTree(randomForest->trees[i]->root);
delete randomForest->trees[i];
......@@ -128,12 +131,13 @@ int main(int argc, char *argv[]) {
delete randomForest;
}
// Write overall report to file
ofstream outfile;
outfile.open("avg.txt", ios::app);
outfile << "------ Report ------ " << endl;
outfile << numTrees << "\t" << depth << "\t" << featureWeight << "\t" << baggingWeight << "\t" << accuracy / 3
<< "\t" << time / 3 << endl;
outfile.close();
return 0;
......
No preview for this file type
No preview for this file type
......@@ -2,7 +2,7 @@
#for i in {5..100..10}
for i in {5..5..1}
do
./race $i 10
./race $i 10 ../datasets/wine
# for j in {5..7}
# do
# ./race $i $j
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment