Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
sgebreeg
RACER
Commits
c89f8366
Commit
c89f8366
authored
Sep 01, 2020
by
ahmedaj
Browse files
fixed local entropy in best splits
parent
4651fe3b
Changes
5
Hide whitespace changes
Inline
Side-by-side
src/DecisionTree.cpp
View file @
c89f8366
...
@@ -13,7 +13,7 @@ using namespace std;
...
@@ -13,7 +13,7 @@ using namespace std;
DecisionTree
::
DecisionTree
(
vector
<
vector
<
string
>>
data
,
int
maxDepth
,
float
featureWeight
,
DecisionTree
::
DecisionTree
(
vector
<
vector
<
string
>>
data
,
int
maxDepth
,
float
featureWeight
,
vector
<
FeatureType
>
featureType
)
{
vector
<
FeatureType
>
featureType
)
{
// vector<int> index = randomSelect_WithoutReplacement(
15
, featureWeight);
// vector<int> index = randomSelect_WithoutReplacement(
data.size()
, featureWeight);
// for(int i=0; i<index.size();i++){
// for(int i=0; i<index.size();i++){
// for(int j=1; j<data[index[i]].size(); j++){
// for(int j=1; j<data[index[i]].size(); j++){
// data[index[i]][j] = data[index[i]][0];
// data[index[i]][j] = data[index[i]][0];
...
@@ -68,6 +68,7 @@ Node *train(vector <vector<string>> data, vector <FeatureType> featureType,
...
@@ -68,6 +68,7 @@ Node *train(vector <vector<string>> data, vector <FeatureType> featureType,
BestSplitPoint
bestSplit
=
findBestSplit
(
parentEntropy
,
currentDepth
,
potentialSplits
,
data
,
BestSplitPoint
bestSplit
=
findBestSplit
(
parentEntropy
,
currentDepth
,
potentialSplits
,
data
,
featureType
);
featureType
);
//cout << "------ best split index "<<bestSplit.featureIdx<<endl;
if
(
bestSplit
.
featureIdx
==
-
1
){
if
(
bestSplit
.
featureIdx
==
-
1
){
Node
*
leaf
=
new
Node
(
NULL
,
NULL
,
NULL
,
true
,
classification
,
originalEntropy
,
informationGainFromParent
);
Node
*
leaf
=
new
Node
(
NULL
,
NULL
,
NULL
,
true
,
classification
,
originalEntropy
,
informationGainFromParent
);
// cout<<"No more split"<<endl;
// cout<<"No more split"<<endl;
...
@@ -171,7 +172,7 @@ string DecisionTree::predictSingle(vector <string> test, Node *treeRoot) {
...
@@ -171,7 +172,7 @@ string DecisionTree::predictSingle(vector <string> test, Node *treeRoot) {
answer
=
treeRoot
->
falseBranch
;
answer
=
treeRoot
->
falseBranch
;
}
}
}
else
{
}
else
{
if
(
test
[
splitIndex
]
>=
splitValue
)
{
if
(
stod
(
test
[
splitIndex
]
)
>=
stod
(
splitValue
)
)
{
answer
=
treeRoot
->
trueBranch
;
answer
=
treeRoot
->
trueBranch
;
}
else
{
}
else
{
answer
=
treeRoot
->
falseBranch
;
answer
=
treeRoot
->
falseBranch
;
...
...
src/RandomForest.cpp
View file @
c89f8366
...
@@ -245,7 +245,6 @@ accuracyReport RandomForest::getAccuracy(vector<vector<string>> testData){
...
@@ -245,7 +245,6 @@ accuracyReport RandomForest::getAccuracy(vector<vector<string>> testData){
}
}
accuracyReport
report
=
{
accuracy
,
correctLables
,
incorrectLables
,
correct
,
incorrect
,
total
};
accuracyReport
report
=
{
accuracy
,
correctLables
,
incorrectLables
,
correct
,
incorrect
,
total
};
cout
<<
"here"
<<
endl
;
return
report
;
return
report
;
...
...
src/helpers.cpp
View file @
c89f8366
...
@@ -202,6 +202,9 @@ float calculateEntropy(vector <vector<string>> data) {
...
@@ -202,6 +202,9 @@ float calculateEntropy(vector <vector<string>> data) {
entropy
-=
(
probability
)
*
log2
(
probability
);
entropy
-=
(
probability
)
*
log2
(
probability
);
}
}
}
}
if
(
entropy
==
1
){
//cout << "size with one entropy "<< dataCount.size()<<endl;
}
return
entropy
;
return
entropy
;
}
}
...
@@ -215,7 +218,8 @@ float calculateSplitEntropy(FeatureSplitData featsplitData) {
...
@@ -215,7 +218,8 @@ float calculateSplitEntropy(FeatureSplitData featsplitData) {
float
splitEntropy
=
(
probabilityDataTrue
*
calculateEntropy
(
splitDataTrue
))
+
float
splitEntropy
=
(
probabilityDataTrue
*
calculateEntropy
(
splitDataTrue
))
+
(
probabilityDataFalse
*
calculateEntropy
(
splitDataFalse
));
(
probabilityDataFalse
*
calculateEntropy
(
splitDataFalse
));
//cout << "Split Entropy "<<splitEntropy<<endl;
return
splitEntropy
;
return
splitEntropy
;
}
}
...
@@ -251,7 +255,7 @@ splitData(vector <vector<string>> data, int splitFeature, vector <FeatureType> f
...
@@ -251,7 +255,7 @@ splitData(vector <vector<string>> data, int splitFeature, vector <FeatureType> f
}
}
}
else
{
}
else
{
for
(
int
dataIdx
=
0
;
dataIdx
<
splitFeatureData
.
size
();
dataIdx
++
)
{
for
(
int
dataIdx
=
0
;
dataIdx
<
splitFeatureData
.
size
();
dataIdx
++
)
{
if
(
splitFeatureData
[
dataIdx
]
>=
splitValue
)
{
if
(
stod
(
splitFeatureData
[
dataIdx
]
)
>=
stod
(
splitValue
)
)
{
for
(
int
featureIdx
=
0
;
featureIdx
<
data
.
size
();
featureIdx
++
)
{
for
(
int
featureIdx
=
0
;
featureIdx
<
data
.
size
();
featureIdx
++
)
{
splitTrue
.
at
(
featureIdx
).
push_back
(
data
.
at
(
featureIdx
).
at
(
dataIdx
));
splitTrue
.
at
(
featureIdx
).
push_back
(
data
.
at
(
featureIdx
).
at
(
dataIdx
));
}
}
...
@@ -276,28 +280,48 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth,std::map<int
...
@@ -276,28 +280,48 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth,std::map<int
float
entropy
=
9999
;
float
entropy
=
9999
;
int
bestSplitFeature
;
int
bestSplitFeature
;
string
bestSplitValue
;
string
bestSplitValue
;
bool
first_iteration
=
true
;
map
<
int
,
set
<
string
>>
::
iterator
map
<
int
,
set
<
string
>>
::
iterator
itr
;
itr
;
for
(
itr
=
potentialSplits
.
begin
();
itr
!=
potentialSplits
.
end
();
++
itr
)
{
for
(
itr
=
potentialSplits
.
begin
();
itr
!=
potentialSplits
.
end
();
++
itr
)
{
//cout << itr->first << " " << itr->second.size() << endl;
if
(
itr
->
second
.
size
()
==
0
){
float
localEntropy
=
1
;
cout
<<
itr
->
first
<<
" "
<<
itr
->
second
.
size
()
<<
endl
;
cout
<<
"feature types "
<<
featureTypes
[
itr
->
first
]
<<
endl
;
}
float
localEntropy
;
bool
firstsplit
=
true
;;
set
<
string
>
splits
=
itr
->
second
;
set
<
string
>
splits
=
itr
->
second
;
if
(
splits
.
size
()
>
0
)
{
if
(
splits
.
size
()
>
0
)
{
set
<
string
>::
iterator
splitItr
;
set
<
string
>::
iterator
splitItr
;
for
(
splitItr
=
splits
.
begin
();
splitItr
!=
splits
.
end
();
splitItr
++
)
{
for
(
splitItr
=
splits
.
begin
();
splitItr
!=
splits
.
end
();
splitItr
++
)
{
// cout<<"Spliting "<<endl;
FeatureSplitData
featSplitData
=
splitData
(
data
,
itr
->
first
,
featureTypes
,
(
*
splitItr
));
FeatureSplitData
featSplitData
=
splitData
(
data
,
itr
->
first
,
featureTypes
,
(
*
splitItr
));
float
splitEntropy
=
calculateSplitEntropy
(
featSplitData
);
float
splitEntropy
=
calculateSplitEntropy
(
featSplitData
);
if
(
featureTypes
[
itr
->
first
]
==
CONTINUOUS
)
{
if
(
featureTypes
[
itr
->
first
]
==
CONTINUOUS
)
{
if
(
localEntropy
>
splitEntropy
)
{
if
(
firstsplit
){
firstsplit
=
false
;
localEntropy
=
splitEntropy
;
localEntropy
=
splitEntropy
;
}
else
{
break
;
}
}
else
{
if
(
localEntropy
>
splitEntropy
)
{
localEntropy
=
splitEntropy
;
if
(
splitEntropy
==
1
){
//cout << "It is one \n";
}
}
else
{
break
;
}
}
}
}
if
(
splitEntropy
<=
entropy
)
{
if
(
first_iteration
||
splitEntropy
<=
entropy
)
{
first_iteration
=
false
;
entropy
=
splitEntropy
;
entropy
=
splitEntropy
;
// cout<<itr->first<<" ";
// cout<<itr->first<<" ";
bestSplitFeature
=
itr
->
first
;
bestSplitFeature
=
itr
->
first
;
...
@@ -310,9 +334,21 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth,std::map<int
...
@@ -310,9 +334,21 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth,std::map<int
}
}
}
}
//cout <<"!!!!!!!!!! "<< bestSplitValue << endl;
if
(
currentDepth
!=
0
&&
parentEntropy
<
entropy
){
if
(
currentDepth
!=
0
&&
parentEntropy
<
entropy
){
BestSplitPoint
splitPoint
=
{
-
1
,
""
};
bestSplitFeature
=
-
1
;
return
splitPoint
;
bestSplitValue
=
""
;
}
if
(
bestSplitFeature
>
15
||
bestSplitFeature
<-
1
){
map
<
int
,
set
<
string
>>
::
iterator
itr
;
for
(
itr
=
potentialSplits
.
begin
();
itr
!=
potentialSplits
.
end
();
++
itr
)
{
cout
<<
"------------- "
<<
endl
;
cout
<<
itr
->
first
<<
" "
<<
itr
->
second
.
size
()
<<
endl
;
}
}
}
BestSplitPoint
splitPoint
=
{
bestSplitFeature
,
bestSplitValue
};
BestSplitPoint
splitPoint
=
{
bestSplitFeature
,
bestSplitValue
};
...
@@ -414,6 +450,11 @@ findAllSplittingPoints(vector <vector<string>> data, vector <FeatureType> featur
...
@@ -414,6 +450,11 @@ findAllSplittingPoints(vector <vector<string>> data, vector <FeatureType> featur
set
<
string
>
temp
;
set
<
string
>
temp
;
vector
<
int
>
index
=
randomSelect_WithoutReplacement
(
data
.
size
(),
featureWeight
);
vector
<
int
>
index
=
randomSelect_WithoutReplacement
(
data
.
size
(),
featureWeight
);
// vector<int> index (data.size());
// for ( int j = 0; j< data.size(); j++){
// index[j] = j;
// }
for
(
int
i
=
0
;
i
<
index
.
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
index
.
size
();
i
++
)
{
temp
.
clear
();
temp
.
clear
();
if
(
index
[
i
]
!=
data
.
size
()
-
1
)
{
//because the last entry is the label
if
(
index
[
i
]
!=
data
.
size
()
-
1
)
{
//because the last entry is the label
...
@@ -428,11 +469,13 @@ findAllSplittingPoints(vector <vector<string>> data, vector <FeatureType> featur
...
@@ -428,11 +469,13 @@ findAllSplittingPoints(vector <vector<string>> data, vector <FeatureType> featur
vector
<
string
>
continuousData
=
data
[
index
[
i
]];
vector
<
string
>
continuousData
=
data
[
index
[
i
]];
sort
(
continuousData
.
begin
(),
continuousData
.
end
());
sort
(
continuousData
.
begin
(),
continuousData
.
end
());
//Unsupervised binning for continuous data
//Unsupervised binning for continuous data
int
K
=
2
0
;
int
K
=
10
0
;
if
((
continuousData
[
continuousData
.
size
()
-
1
])
==
continuousData
[
0
]){
if
(
stod
(
continuousData
[
continuousData
.
size
()
-
1
])
==
stod
(
continuousData
[
0
])
)
{
continue
;
continue
;
}
}
double
w
=
(
stod
(
continuousData
[
continuousData
.
size
()
-
1
])
-
stod
(
continuousData
[
0
]))
/
K
;
else
{
double
w
=
(
stod
(
continuousData
[
continuousData
.
size
()
-
1
])
-
stod
(
continuousData
[
0
]))
/
K
;
if
(
w
)
{
if
(
w
)
{
...
@@ -453,8 +496,16 @@ findAllSplittingPoints(vector <vector<string>> data, vector <FeatureType> featur
...
@@ -453,8 +496,16 @@ findAllSplittingPoints(vector <vector<string>> data, vector <FeatureType> featur
// temp.insert(continuousData[j]);
// temp.insert(continuousData[j]);
// }
// }
possibleSplits
[
index
[
i
]]
=
temp
;
possibleSplits
[
index
[
i
]]
=
temp
;
if
(
temp
.
size
()
==
0
){
cout
<<
"Size is zerooooo"
<<
endl
;
}
}
// cout<<temp.size()<<" of feature Continuous "<< index[i]<<endl;
// cout<<temp.size()<<" of feature Continuous "<< index[i]<<endl;
...
...
src/test.cpp
View file @
c89f8366
...
@@ -35,7 +35,7 @@ int main(int argc, char *argv[]) {
...
@@ -35,7 +35,7 @@ int main(int argc, char *argv[]) {
int
numTrees
=
atoi
(
argv
[
1
]);
int
numTrees
=
atoi
(
argv
[
1
]);
float
depth
=
atoi
(
argv
[
2
]);
float
depth
=
atoi
(
argv
[
2
]);
float
baggingWeight
=
0.7
;
float
baggingWeight
=
0.7
;
float
featWeight
=
0.
7
;
float
featWeight
=
0.
3
;
//float baggingWeight = atoi(argv[2]) * 0.1;
//float baggingWeight = atoi(argv[2]) * 0.1;
// double featWeight = numFeatures * 0.1;
// double featWeight = numFeatures * 0.1;
...
@@ -45,9 +45,9 @@ int main(int argc, char *argv[]) {
...
@@ -45,9 +45,9 @@ int main(int argc, char *argv[]) {
vector
<
vector
<
string
>>
datasetAsString
;
vector
<
vector
<
string
>>
datasetAsString
;
vector
<
FeatureType
>
featureTypes
;
vector
<
FeatureType
>
featureTypes
;
vector
<
string
>
features
;
vector
<
string
>
features
;
datasetAsString
=
parseDataToString
(
"../datasets/
adult
.data"
);
datasetAsString
=
parseDataToString
(
"../datasets/
loan
.data"
);
featureTypes
=
parseFeatureTypes
(
"../datasets/
adult
.featureTypes"
);
featureTypes
=
parseFeatureTypes
(
"../datasets/
loan
.featureTypes"
);
features
=
parseFeatures
(
"../datasets/
adult
.features"
);
features
=
parseFeatures
(
"../datasets/
loan
.features"
);
double
accuracy
=
0.0
;
double
accuracy
=
0.0
;
...
...
src/test.sh
View file @
c89f8366
...
@@ -2,10 +2,8 @@
...
@@ -2,10 +2,8 @@
for
i
in
{
5..100..5
}
for
i
in
{
5..100..5
}
do
do
# ./race $i 10
# ./race $i 10
for
j
in
{
5..
10
}
for
j
in
{
5..
7
}
do
do
./race
$i
$j
./race
$i
$j
./race
$i
$j
./race
$i
$j
done
done
done
done
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment