Commit 3b24b859 authored by sgebreeg
Browse files

code refactor: encoder code moved to encoder.cpp

parent 521708f7
#include "encoder.hpp"
// One-hot encodes selected categorical columns of a string dataset.
//
// datasetAsString is indexed as [feature][row]. For every index listed in
// featuresToEncode, the distinct values of that column are collected and one
// new column per distinct value is appended to encodedDatasetAsString; a cell
// holds "1" when the row carries that value and "0" otherwise. The value
// string is appended to encodedFeatures as the name of the new column, and the
// original column / name is erased from both outputs.
//
// Both outputs are edited in place by position (erase at the feature's index),
// so on entry they must already hold the original columns / names at the same
// indices as datasetAsString.
//
// Parameters:
//   datasetAsString         source data, one inner vector per feature.
//   encodedDatasetAsString  in/out: receives the one-hot columns, loses the
//                           encoded source columns.
//   features                original feature names; not read by this function.
//   encodedFeatures         in/out: receives one name per one-hot column,
//                           loses the names of the encoded features.
//   featuresToEncode        column indices to encode; duplicates are collapsed
//                           and indices are processed in ascending order.
//
// Throws std::out_of_range (from vector::at) when an index in featuresToEncode
// is not a valid column of datasetAsString.
void encodeData(std::vector<std::vector<std::string>> datasetAsString,
                std::vector<std::vector<std::string>>& encodedDatasetAsString,
                std::vector<std::string> features, std::vector<std::string>& encodedFeatures,
                std::vector<int> featuresToEncode) {
    // Pass 1: collect the distinct values of every requested feature. The
    // values are tracked per feature, so a string that occurs in two different
    // features is registered for both of them.
    std::map<int, std::map<std::string, std::size_t>> featureUniqueValues;
    for (int featIdx : featuresToEncode) {
        const std::vector<std::string>& column = datasetAsString.at(featIdx);
        if (column.empty()) {
            continue;  // nothing to encode; the column is left untouched
        }
        std::map<std::string, std::size_t>& codes = featureUniqueValues[featIdx];
        for (const std::string& value : column) {
            codes[value] = 0;
        }
        // Number the values in the map's (sorted) iteration order. The new
        // column names are appended in that same order below, so the name of
        // a one-hot column always matches the value it flags.
        std::size_t nextCode = 0;
        for (auto& code : codes) {
            code.second = nextCode++;
        }
    }

    // Pass 2: append the one-hot columns and drop the encoded originals.
    // Features are visited in ascending index order and every visit erases one
    // column in front of the remaining ones, so the current position of a
    // feature is its original index minus the number of columns erased so far.
    int removedSoFar = 0;
    for (const auto& featureEntry : featureUniqueValues) {
        const int featIdx = featureEntry.first;
        const std::map<std::string, std::size_t>& codes = featureEntry.second;
        const std::vector<std::string>& column = datasetAsString.at(featIdx);
        const int currentIdx = featIdx - removedSoFar;

        encodedFeatures.erase(encodedFeatures.begin() + currentIdx);

        const std::size_t firstNewColumn = encodedDatasetAsString.size();
        for (const auto& code : codes) {
            encodedFeatures.push_back(code.first);
            encodedDatasetAsString.push_back(std::vector<std::string>(column.size(), "0"));
        }
        for (std::size_t row = 0; row < column.size(); ++row) {
            encodedDatasetAsString[firstNewColumn + codes.at(column[row])][row] = "1";
        }

        encodedDatasetAsString.erase(encodedDatasetAsString.begin() + currentIdx);
        ++removedSoFar;
    }
}
// Builds a one-hot vector: `size` entries, all 0 except the entry at index
// `value`, which is 1. vector::at throws std::out_of_range when `value` is not
// a valid index for a vector of that size.
std::vector<int> binaryShift(int size, int value){
    std::vector<int> oneHot;
    oneHot.assign(size, 0);
    int& selected = oneHot.at(value);
    selected = 1;
    return oneHot;
}
#include <string>
#include <vector>
#include <map>
#include <iterator>
#ifndef CODE_ENCODER_H
#define CODE_ENCODER_H
using namespace std;
// One-hot encodes the columns of datasetAsString (indexed [feature][row])
// whose indices are listed in featuresToEncode. For each encoded feature, one
// column per distinct value is appended to encodedDatasetAsString and the
// value string is appended to encodedFeatures; the entry at the feature's
// index is erased from both. The outputs are modified in place, so they must
// already contain the original columns / names when this is called.
// `features` is accepted but not read by the definition in encoder.cpp.
void encodeData(vector <vector<string>> datasetAsString, vector <vector<string>>& encodedDatasetAsString,
vector <string> features, vector <string>& encodedFeatures, vector<int> featuresToEncode);
// Returns a vector of `size` ints that are all 0 except element `value`,
// which is 1. Uses vector::at, so an out-of-range `value` throws
// std::out_of_range.
vector<int> binaryShift(int size, int value);
#endif //CODE_ENCODER_H
......@@ -568,62 +568,6 @@ vector<int> oversample(vector <vector<string>> &data, vector<int> &indices) {
}
// One-hot encodes selected categorical columns of a string dataset.
//
// datasetAsString is indexed as [feature][row]. For every index listed in
// featuresToEncode, the distinct values of that column are collected and one
// new column per distinct value is appended to encodedDatasetAsString; a cell
// holds "1" when the row carries that value and "0" otherwise. The value
// string is appended to encodedFeatures as the name of the new column, and the
// original column / name is erased from both outputs.
//
// Both outputs are edited in place by position (erase at the feature's index),
// so on entry they must already hold the original columns / names at the same
// indices as datasetAsString.
//
// Parameters:
//   datasetAsString         source data, one inner vector per feature.
//   encodedDatasetAsString  in/out: receives the one-hot columns, loses the
//                           encoded source columns.
//   features                original feature names; not read by this function.
//   encodedFeatures         in/out: receives one name per one-hot column,
//                           loses the names of the encoded features.
//   featuresToEncode        column indices to encode; duplicates are collapsed
//                           and indices are processed in ascending order.
//
// Throws std::out_of_range (from vector::at) when an index in featuresToEncode
// is not a valid column of datasetAsString.
void encodeData(std::vector<std::vector<std::string>> datasetAsString,
                std::vector<std::vector<std::string>>& encodedDatasetAsString,
                std::vector<std::string> features, std::vector<std::string>& encodedFeatures,
                std::vector<int> featuresToEncode) {
    // Pass 1: collect the distinct values of every requested feature. The
    // values are tracked per feature, so a string that occurs in two different
    // features is registered for both of them.
    std::map<int, std::map<std::string, std::size_t>> featureUniqueValues;
    for (int featIdx : featuresToEncode) {
        const std::vector<std::string>& column = datasetAsString.at(featIdx);
        if (column.empty()) {
            continue;  // nothing to encode; the column is left untouched
        }
        std::map<std::string, std::size_t>& codes = featureUniqueValues[featIdx];
        for (const std::string& value : column) {
            codes[value] = 0;
        }
        // Number the values in the map's (sorted) iteration order. The new
        // column names are appended in that same order below, so the name of
        // a one-hot column always matches the value it flags.
        std::size_t nextCode = 0;
        for (auto& code : codes) {
            code.second = nextCode++;
        }
    }

    // Pass 2: append the one-hot columns and drop the encoded originals.
    // Features are visited in ascending index order and every visit erases one
    // column in front of the remaining ones, so the current position of a
    // feature is its original index minus the number of columns erased so far.
    int removedSoFar = 0;
    for (const auto& featureEntry : featureUniqueValues) {
        const int featIdx = featureEntry.first;
        const std::map<std::string, std::size_t>& codes = featureEntry.second;
        const std::vector<std::string>& column = datasetAsString.at(featIdx);
        const int currentIdx = featIdx - removedSoFar;

        encodedFeatures.erase(encodedFeatures.begin() + currentIdx);

        const std::size_t firstNewColumn = encodedDatasetAsString.size();
        for (const auto& code : codes) {
            encodedFeatures.push_back(code.first);
            encodedDatasetAsString.push_back(std::vector<std::string>(column.size(), "0"));
        }
        for (std::size_t row = 0; row < column.size(); ++row) {
            encodedDatasetAsString[firstNewColumn + codes.at(column[row])][row] = "1";
        }

        encodedDatasetAsString.erase(encodedDatasetAsString.begin() + currentIdx);
        ++removedSoFar;
    }
}
// Builds a one-hot vector: `size` entries, all 0 except the entry at index
// `value`, which is 1. vector::at throws std::out_of_range when `value` is not
// a valid index for a vector of that size.
std::vector<int> binaryShift(int size, int value){
    std::vector<int> oneHot;
    oneHot.assign(size, 0);
    int& selected = oneHot.at(value);
    selected = 1;
    return oneHot;
}
void cleanTree(Node *node) {
if (node->isLeaf) {
......
......@@ -44,9 +44,6 @@ vector<int> randomSelect_WithoutReplacement(int originalNum, float percentTraini
vector<int> oversample(vector<vector<string>> &data, vector<int> &indices);
BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, vector <vector<string>> &data,
vector <FeatureType> featureType, float featureWeight, vector<int>& nodeDatasetIndices );
// One-hot encodes the columns of datasetAsString (indexed [feature][row])
// whose indices are listed in featuresToEncode. For each encoded feature, one
// column per distinct value is appended to encodedDatasetAsString and the
// value string is appended to encodedFeatures; the entry at the feature's
// index is erased from both. The outputs are modified in place, so they must
// already contain the original columns / names when this is called.
// `features` is accepted but not read by the visible definition.
void encodeData(vector <vector<string>> datasetAsString, vector <vector<string>>& encodedDatasetAsString,
vector <string> features, vector <string>& encodedFeatures, vector<int> featuresToEncode);
// Returns a vector of `size` ints that are all 0 except element `value`,
// which is 1. Uses vector::at, so an out-of-range `value` throws
// std::out_of_range.
vector<int> binaryShift(int size, int value);
void cleanTree(Node *node);
......
#include "helpers.hpp"
#include "DecisionTree.hpp"
#include "RandomForest.hpp"
#include "encoder.hpp"
#include "util.hpp"
using namespace std::chrono;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment