sgebreeg / RACER

Commit c89f8366 authored Sep 01, 2020 by ahmedaj

fixed local entropy in best splits

parent 4651fe3b
Changes 5
src/DecisionTree.cpp
@@ -13,7 +13,7 @@ using namespace std;
 DecisionTree::DecisionTree(vector<vector<string>> data, int maxDepth, float featureWeight, vector<FeatureType> featureType) {
-//    vector<int> index = randomSelect_WithoutReplacement(15, featureWeight);
+//    vector<int> index = randomSelect_WithoutReplacement(data.size(), featureWeight);
 //    for(int i=0; i<index.size();i++){
 //        for(int j=1; j<data[index[i]].size(); j++){
 //            data[index[i]][j] = data[index[i]][0];
...
@@ -68,6 +68,7 @@ Node *train(vector <vector<string>> data, vector <FeatureType> featureType,
    BestSplitPoint bestSplit = findBestSplit(parentEntropy, currentDepth, potentialSplits, data, featureType);
    //cout << "------ best split index "<<bestSplit.featureIdx<<endl;
    if (bestSplit.featureIdx == -1){
        Node *leaf = new Node(NULL, NULL, NULL, true, classification, originalEntropy, informationGainFromParent);
        // cout<<"No more split"<<endl;
...
@@ -171,7 +172,7 @@ string DecisionTree::predictSingle(vector <string> test, Node *treeRoot) {
            answer = treeRoot->falseBranch;
        }
    } else {
-       if (test[splitIndex] >= splitValue) {
+       if (stod(test[splitIndex]) >= stod(splitValue)) {
            answer = treeRoot->trueBranch;
        } else {
            answer = treeRoot->falseBranch;
...
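The change above replaces a lexicographic string comparison with a numeric one via std::stod. A minimal standalone sketch of why that matters (the values are illustrative, not from the repository):

#include <iostream>
#include <string>

int main() {
    std::string a = "9", b = "10";
    // As strings, comparison is character by character: '9' > '1', so "9" >= "10" holds.
    std::cout << (a >= b) << "\n";                        // prints 1
    // As numbers, 9 >= 10 is false, which is the comparison a split threshold needs.
    std::cout << (std::stod(a) >= std::stod(b)) << "\n";  // prints 0
    return 0;
}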
src/RandomForest.cpp
@@ -245,7 +245,6 @@ accuracyReport RandomForest::getAccuracy(vector<vector<string>> testData){
    }
    accuracyReport report = {accuracy, correctLables, incorrectLables, correct, incorrect, total};
    cout << "here" << endl;
    return report;
...
src/helpers.cpp
@@ -202,6 +202,9 @@ float calculateEntropy(vector <vector<string>> data) {
            entropy -= (probability) * log2(probability);
        }
    }
+   if (entropy == 1){
+       //cout << "size with one entropy "<< dataCount.size()<<endl;
+   }
    return entropy;
}
...
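For reference, the loop above accumulates the standard Shannon entropy over class frequencies, H = -sum_i p_i * log2(p_i). A minimal, self-contained sketch with illustrative names (not the repository's API):

#include <cmath>
#include <map>
#include <string>
#include <vector>

// Shannon entropy of a list of class labels: H = -sum_i p_i * log2(p_i).
double labelEntropy(const std::vector<std::string> &labels) {
    if (labels.empty()) return 0.0;
    std::map<std::string, int> counts;
    for (const auto &label : labels) counts[label]++;   // class frequencies
    double entropy = 0.0;
    for (const auto &kv : counts) {
        double p = static_cast<double>(kv.second) / labels.size();
        entropy -= p * std::log2(p);
    }
    return entropy;
}

With two equally likely classes the result is exactly 1.0, which is presumably what the (commented-out) entropy == 1 debug check added above was probing.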
@@ -215,7 +218,8 @@ float calculateSplitEntropy(FeatureSplitData featsplitData) {
    float splitEntropy = (probabilityDataTrue * calculateEntropy(splitDataTrue)) + (probabilityDataFalse * calculateEntropy(splitDataFalse));
    //cout << "Split Entropy "<<splitEntropy<<endl;
    return splitEntropy;
}
...
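The expression above is the weighted split entropy H_split = p_true * H(D_true) + p_false * H(D_false), where the weights are the fractions of rows that fall on each side of the split. A hedged sketch of the same formula, reusing the labelEntropy helper from the previous example (names are illustrative):

// Weighted entropy of a binary split; the branch sizes provide the weights.
double weightedSplitEntropy(const std::vector<std::string> &trueLabels,
                            const std::vector<std::string> &falseLabels) {
    double total = static_cast<double>(trueLabels.size() + falseLabels.size());
    if (total == 0) return 0.0;
    double pTrue  = trueLabels.size() / total;
    double pFalse = falseLabels.size() / total;
    return pTrue * labelEntropy(trueLabels) + pFalse * labelEntropy(falseLabels);
}

A split is worth taking when this weighted value drops below the parent node's entropy; the difference is the information gain.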
@@ -251,7 +255,7 @@ splitData(vector <vector<string>> data, int splitFeature, vector <FeatureType> f
        }
    } else {
        for (int dataIdx = 0; dataIdx < splitFeatureData.size(); dataIdx++) {
-           if (splitFeatureData[dataIdx] >= splitValue) {
+           if (stod(splitFeatureData[dataIdx]) >= stod(splitValue)) {
                for (int featureIdx = 0; featureIdx < data.size(); featureIdx++) {
                    splitTrue.at(featureIdx).push_back(data.at(featureIdx).at(dataIdx));
                }
...
@@ -276,28 +280,48 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth,std::map<int
    float entropy = 9999;
    int bestSplitFeature;
    string bestSplitValue;
+   bool first_iteration = true;
    map<int, set<string>>::iterator itr;
    for (itr = potentialSplits.begin(); itr != potentialSplits.end(); ++itr) {
        //cout << itr->first << " " << itr->second.size() << endl;
-       float localEntropy = 1;
+       if (itr->second.size() == 0){
+           cout << itr->first << " " << itr->second.size() << endl;
+           cout << "feature types " << featureTypes[itr->first] << endl;
+       }
+       float localEntropy;
+       bool firstsplit = true;;
        set<string> splits = itr->second;
        if (splits.size() > 0) {
            set<string>::iterator splitItr;
            for (splitItr = splits.begin(); splitItr != splits.end(); splitItr++) {
                // cout<<"Spliting "<<endl;
                FeatureSplitData featSplitData = splitData(data, itr->first, featureTypes, (*splitItr));
                float splitEntropy = calculateSplitEntropy(featSplitData);
                if (featureTypes[itr->first] == CONTINUOUS) {
-                   if (localEntropy > splitEntropy) {
+                   if (firstsplit){
+                       firstsplit = false;
                        localEntropy = splitEntropy;
                    }
-                   else {
-                       break;
-                   }
+                   else {
+                       if (localEntropy > splitEntropy) {
+                           localEntropy = splitEntropy;
+                           if (splitEntropy == 1){
+                               //cout << "It is one \n";
+                           }
+                       }
+                       else {
+                           break;
+                       }
+                   }
                }
-               if (splitEntropy <= entropy) {
+               if (first_iteration || splitEntropy <= entropy) {
+                   first_iteration = false;
                    entropy = splitEntropy;
                    // cout<<itr->first<<" ";
                    bestSplitFeature = itr->first;
...
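This hunk appears to be the fix named in the commit message: localEntropy used to start at 1, so for CONTINUOUS features the break-on-non-improvement heuristic could fire on the very first candidate threshold whenever its split entropy was 1 or more. The new firstsplit flag seeds localEntropy from the first candidate instead. A minimal sketch of that best-so-far pattern, under the assumption that candidate thresholds are visited in sorted order (names are illustrative, not the repository's API):

#include <vector>

// Scan candidate split entropies in order, keep the lowest seen so far,
// and stop once a candidate is no better than the current best.
// Seeding from the first candidate avoids a hard-coded starting value such as 1.
double bestLocalEntropy(const std::vector<double> &candidateEntropies) {
    bool first = true;
    double best = 0.0;
    for (double e : candidateEntropies) {
        if (first) {
            first = false;
            best = e;       // first candidate is always accepted
        } else if (e < best) {
            best = e;       // still improving, keep scanning
        } else {
            break;          // no improvement: later thresholds are skipped
        }
    }
    return best;
}

The added first_iteration flag appears to play the same role for the across-feature minimum, making the entropy = 9999 sentinel effectively redundant.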
@@ -310,9 +334,21 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth,std::map<int
        }
    }
    //cout <<"!!!!!!!!!! "<< bestSplitValue << endl;
    if (currentDepth != 0 && parentEntropy < entropy){
+       BestSplitPoint splitPoint = {-1, ""};
+       return splitPoint;
        bestSplitFeature = -1;
        bestSplitValue = "";
    }
+   if (bestSplitFeature > 15 || bestSplitFeature < -1){
+       map<int, set<string>>::iterator itr;
+       for (itr = potentialSplits.begin(); itr != potentialSplits.end(); ++itr) {
+           cout << "------------- " << endl;
+           cout << itr->first << " " << itr->second.size() << endl;
+       }
+   }
    BestSplitPoint splitPoint = {bestSplitFeature, bestSplitValue};
...
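The {-1, ""} return value is a sentinel meaning "no split improves on the parent entropy"; the train() hunk in DecisionTree.cpp above checks featureIdx == -1 and creates a leaf. A tiny sketch of that contract with an assumed struct layout (the real BestSplitPoint is defined elsewhere in the repository and may differ):

#include <string>

// Assumed shape of the repository's BestSplitPoint (actual definition not in this diff).
struct BestSplitPointSketch {
    int featureIdx;
    std::string splitValue;
};

// Caller-side contract: featureIdx == -1 signals "stop splitting, make a leaf".
bool shouldMakeLeaf(const BestSplitPointSketch &p) {
    return p.featureIdx == -1;
}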
@@ -414,6 +450,11 @@ findAllSplittingPoints(vector <vector<string>> data, vector <FeatureType> featur
    set<string> temp;
+   vector<int> index = randomSelect_WithoutReplacement(data.size(), featureWeight);
+//    vector<int> index (data.size());
+//    for ( int j = 0; j< data.size(); j++){
+//        index[j] = j;
+//    }
    for (int i = 0; i < index.size(); i++) {
        temp.clear();
        if (index[i] != data.size() - 1) {
            //because the last entry is the label
...
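randomSelect_WithoutReplacement is a helper defined elsewhere in the repository; its exact signature is not shown in this diff. A hedged, self-contained sketch of the idea it appears to implement (sample roughly a featureWeight fraction of the indices 0..n-1 without replacement), using only the standard library and illustrative names:

#include <algorithm>
#include <numeric>
#include <random>
#include <vector>

// Illustrative stand-in: pick about n * featureWeight distinct indices at random.
std::vector<int> sampleIndicesWithoutReplacement(int n, float featureWeight) {
    std::vector<int> indices(n);
    std::iota(indices.begin(), indices.end(), 0);   // 0, 1, ..., n-1
    std::mt19937 gen(std::random_device{}());
    std::shuffle(indices.begin(), indices.end(), gen);
    int keep = static_cast<int>(n * featureWeight);
    if (keep < 1) keep = (n > 0) ? 1 : 0;           // keep at least one index if any exist
    indices.resize(keep);
    return indices;
}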
@@ -428,11 +469,13 @@ findAllSplittingPoints(vector <vector<string>> data, vector <FeatureType> featur
            vector<string> continuousData = data[index[i]];
            sort(continuousData.begin(), continuousData.end());
            //Unsupervised binning for continuous data
            int K = 20;
-           if ((continuousData[continuousData.size() - 1]) == continuousData[0]){
-               int K = 100;
+           if (stod(continuousData[continuousData.size() - 1]) == stod(continuousData[0])) {
+               continue;
            }
-           double w = (stod(continuousData[continuousData.size() - 1]) - stod(continuousData[0])) / K;
+           else {
+               double w = (stod(continuousData[continuousData.size() - 1]) - stod(continuousData[0])) / K;
                if (w) {
...
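The K and w above implement equal-width (unsupervised) binning over the sorted values of a continuous feature: w = (max - min) / K, and the candidate thresholds inserted into temp presumably sit at min + k*w, though the insertion loop itself falls outside the visible hunk. A hedged sketch of that idea with illustrative names and a numeric return type (the repository stores the thresholds as strings):

#include <set>

// Equal-width binning: up to K - 1 interior thresholds between minValue and maxValue.
std::set<double> equalWidthSplitPoints(double minValue, double maxValue, int K) {
    std::set<double> splits;
    if (K <= 0 || maxValue <= minValue) return splits;   // degenerate feature: no thresholds
    double w = (maxValue - minValue) / K;
    for (int k = 1; k < K; ++k) {
        splits.insert(minValue + k * w);
    }
    return splits;
}

For example, equalWidthSplitPoints(0.0, 100.0, 20) yields 19 evenly spaced candidate thresholds at 5, 10, ..., 95.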
@@ -453,8 +496,16 @@ findAllSplittingPoints(vector <vector<string>> data, vector <FeatureType> featur
//                    temp.insert(continuousData[j]);
//                }
                possibleSplits[index[i]] = temp;
+               if (temp.size() == 0){
+                   cout << "Size is zerooooo" << endl;
+               }
            }
            // cout<<temp.size()<<" of feature Continuous "<< index[i]<<endl;
...
src/test.cpp
@@ -35,7 +35,7 @@ int main(int argc, char *argv[]) {
    int numTrees = atoi(argv[1]);
    float depth = atoi(argv[2]);
    float baggingWeight = 0.7;
-   float featWeight = 0.7;
+   float featWeight = 0.3;
    //float baggingWeight = atoi(argv[2]) * 0.1;
    // double featWeight = numFeatures * 0.1;
...
@@ -45,9 +45,9 @@ int main(int argc, char *argv[]) {
    vector<vector<string>> datasetAsString;
    vector<FeatureType> featureTypes;
    vector<string> features;
-   datasetAsString = parseDataToString("../datasets/adult.data");
-   featureTypes = parseFeatureTypes("../datasets/adult.featureTypes");
-   features = parseFeatures("../datasets/adult.features");
+   datasetAsString = parseDataToString("../datasets/loan.data");
+   featureTypes = parseFeatureTypes("../datasets/loan.featureTypes");
+   features = parseFeatures("../datasets/loan.features");
    double accuracy = 0.0;
...
src/test.sh
@@ -2,10 +2,8 @@
for i in {5..100..5}
do
    # ./race $i 10
-   for j in {5..10}
+   for j in {5..7}
    do
-       ./race $i $j
-       ./race $i $j
        ./race $i $j
    done
done
\ No newline at end of file