sgebreeg / RACER
Commit 27425f7f authored Sep 30, 2020 by sgebreeg
revisions on finding best splitting point
parent dea77701
Changes 3
src/DecisionTree.cpp
...
@@ -44,16 +44,9 @@ Node *train(vector <vector<string>> data, vector <FeatureType> featureType,
            } else {
-               // cout<<"Finding splits"<<endl;
-               //create a random subspace
-               //find possible splits
-               std::map<int, set<string>> potentialSplits = findAllSplittingPoints(data, featureType, featureWeight);
-               // cout<<"Finding best split"<<endl;
-               //find best split
-               BestSplitPoint bestSplit = findBestSplit(parentEntropy, currentDepth, potentialSplits, data, featureType);
+               BestSplitPoint bestSplit = findBestSplitTrial(parentEntropy, currentDepth, data, featureType, featureWeight);
                //cout << "------ best split index "<<bestSplit.featureIdx<<endl;
                if (bestSplit.featureIdx == -1 || bestSplit.featureIdx > data.size() - 1) {
...
src/helpers.cpp
...
@@ -205,6 +205,37 @@ float calculateEntropy(vector <vector<string>> data) {
    return entropy;
}

+double calSplitEntropy(std::map<std::string, int> leftLabelCount, std::map<std::string, int> rightLabelCount,
+                       int leftCount, int rightCount) {
+    double totalData = leftCount + rightCount;
+    double probabilityRight = rightCount / totalData;
+    double probabilityLeft = leftCount / totalData;
+    double leftEntropy = 0.0;
+    double rightEntropy = 0.0;
+
+    map<string, int>::iterator leftitr;
+    for (leftitr = leftLabelCount.begin(); leftitr != leftLabelCount.end(); ++leftitr) {
+        double probability = (double) leftitr->second / (double) leftCount;
+        if (probability > 0) {
+            leftEntropy -= probability * log2(probability);
+        }
+    }
+
+    map<string, int>::iterator rightitr;
+    for (rightitr = rightLabelCount.begin(); rightitr != rightLabelCount.end(); ++rightitr) {
+        double probability = (double) rightitr->second / (double) rightCount;
+        if (probability > 0) {
+            rightEntropy -= probability * log2(probability);
+        }
+    }
+
+    double splitEntropy = (probabilityLeft * leftEntropy) + (probabilityRight * rightEntropy);
+    return splitEntropy;
+}
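The added calSplitEntropy helper returns the weighted average of the left- and right-partition entropies computed from per-label counts, i.e. H_split = (n_left/n)*H(left) + (n_right/n)*H(right). Below is a minimal standalone sketch of the same computation on toy counts; the name weightedSplitEntropy and the sample values are illustrative assumptions, not part of the repository.

// Standalone sketch (assumption: toy counts, hypothetical function name).
#include <cmath>
#include <iostream>
#include <map>
#include <string>

double weightedSplitEntropy(const std::map<std::string, int> &left,
                            const std::map<std::string, int> &right,
                            int leftCount, int rightCount) {
    // Shannon entropy of one side from its per-label counts.
    auto sideEntropy = [](const std::map<std::string, int> &counts, int n) {
        double h = 0.0;
        for (const auto &kv : counts) {
            double p = (double) kv.second / (double) n;
            if (p > 0) h -= p * std::log2(p);
        }
        return h;
    };
    double total = leftCount + rightCount;
    return (leftCount / total) * sideEntropy(left, leftCount) +
           (rightCount / total) * sideEntropy(right, rightCount);
}

int main() {
    std::map<std::string, int> left = {{"yes", 4}, {"no", 1}};   // 5 rows fall left
    std::map<std::string, int> right = {{"yes", 1}, {"no", 4}};  // 5 rows fall right
    std::cout << weightedSplitEntropy(left, right, 5, 5) << std::endl;  // ~0.722
    return 0;
}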
float calculateSplitEntropy(FeatureSplitData featsplitData) {
    vector <vector<string>> splitDataTrue = featsplitData.dataTrue;
    vector <vector<string>> splitDataFalse = featsplitData.dataFalse;
...
@@ -270,6 +301,93 @@ splitData(vector <vector<string>> data, int splitFeature, vector <FeatureType> f
    return featSplitData;
}

+void sortDataByFeature(int featIdx, vector <vector<string>> data, vector <pair<int, string>> &featureData) {
+    for (int dataIdx = 0; dataIdx < data[0].size(); dataIdx++) {
+        featureData.emplace_back(dataIdx, data[featIdx].at(dataIdx));
+    }
+    sort(featureData.begin(), featureData.end(), [](pair<int, string> &a, pair<int, string> &b) {
+        return a.second < b.second;
+    });
+}
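sortDataByFeature pairs every row index with its value in the chosen feature column and sorts the pairs by value; it assumes a column-major layout where data[featIdx] is the column for feature featIdx and data[0].size() is the number of rows. A small self-contained sketch of that pattern follows, using made-up data.

// Sketch of the index/value pairing and sort (assumption: toy column-major data).
#include <algorithm>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
using namespace std;

int main() {
    // Each inner vector is one column: two features plus the label column.
    vector<vector<string>> data = {
            {"3.1", "1.2", "2.7"},   // feature 0
            {"b",   "a",   "c"},     // feature 1
            {"yes", "no",  "yes"}    // labels (last column)
    };
    int featIdx = 1;
    vector<pair<int, string>> featureData;
    for (int dataIdx = 0; dataIdx < (int) data[0].size(); dataIdx++) {
        featureData.emplace_back(dataIdx, data[featIdx].at(dataIdx));
    }
    // Sort row indices by their feature value (string comparison, as in the helper).
    sort(featureData.begin(), featureData.end(),
         [](const pair<int, string> &a, const pair<int, string> &b) { return a.second < b.second; });
    for (const auto &p : featureData) {
        cout << "row " << p.first << " -> " << p.second << endl;  // prints rows 1, 0, 2
    }
    return 0;
}

Since the values are strings, the ordering in this sketch is lexicographic.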
+BestSplitPoint findBestSplitTrial(double parentEntropy, int currentDepth, vector <vector<string>> data,
+                                  vector <FeatureType> featureTypes, float featureWeight) {
+    vector<int> randomFeatures = randomSelect_WithoutReplacement(data.size(), featureWeight);
+    int bestFeatureIndex = randomFeatures[0];
+    double minEntropy = 99999;
+    string bestSplitValue = "";
+
+    for (auto featureIndex: randomFeatures) {
+        if (featureIndex != data.size() - 1) {
+            //because last column is label
+            //initialize variables
+            string threshold = "";
+            int dataIndex;
+            std::map<std::string, int> leftLabelCount;
+            std::map<std::string, int> rightLabelCount;
+
+            //count right side labels
+            for (int i = 0; i < data[data.size() - 1].size(); i++) {
+                if (rightLabelCount.count(data[data.size() - 1][i])) {
+                    rightLabelCount[data[data.size() - 1][i]] += 1;
+                } else {
+                    rightLabelCount[data[data.size() - 1][i]] = 1;
+                }
+            }
+
+            int leftSize = 0;
+            int rightSize = data.at(featureIndex).size();
+            vector<pair<int, string>> featureData;
+            featureData.reserve(data[0].size());
+            //done initializing variables
+
+            //sort data with selected feature
+            sortDataByFeature(featureIndex, data, featureData);
+
+            for (int indx = 0; indx < featureData.size();) {
+                threshold = featureData.at(indx).second;
+                dataIndex = featureData.at(indx).first;
+                while (indx < data.at(featureIndex).size() && featureData.at(indx).second <= threshold) {
+                    leftSize++;
+                    rightSize--;
+                    if (leftLabelCount.count(data[data.size() - 1][indx])) {
+                        leftLabelCount[data[data.size() - 1][indx]] += 1;
+                    } else {
+                        leftLabelCount[data[data.size() - 1][indx]] = 1;
+                    }
+                    rightLabelCount[data[data.size() - 1][indx]] -= 1;
+                    indx++;
+                    dataIndex = featureData[indx].first;
+                }
+                if (indx == data[0].size()) {
+                    continue;
+                }
+                double splitEntropy = calSplitEntropy(leftLabelCount, rightLabelCount, leftSize, rightSize);
+                if (splitEntropy < minEntropy) {
+                    // cout<<"Best split at "<< featureIndex <<" value "<<threshold<<" Entropy "<< splitEntropy<<endl;
+                    minEntropy = splitEntropy;
+                    bestFeatureIndex = featureIndex;
+                    bestSplitValue = threshold;
+                }
+            }
+        }
+    }
+    if (minEntropy >= parentEntropy && currentDepth != 0) {
+        bestFeatureIndex = -1;
+        bestSplitValue = "";
+    }
+    return {bestFeatureIndex, bestSplitValue};
+}
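findBestSplitTrial evaluates thresholds by sweeping each randomly selected feature in sorted order: rows move one at a time from the right partition to the left, the per-label counts are updated incrementally, and each distinct threshold is scored with calSplitEntropy without re-partitioning the data. The sketch below reduces that sweep to a single feature column; the data, the sideEntropy helper, and all names are illustrative assumptions rather than the repository's API.

// Reduced sketch of the incremental sorted-sweep threshold search (toy data).
#include <algorithm>
#include <cmath>
#include <iostream>
#include <map>
#include <string>
#include <vector>
using namespace std;

// Shannon entropy of one partition, given its per-label counts and size.
double sideEntropy(const map<string, int> &counts, int n) {
    double h = 0.0;
    for (const auto &kv : counts) {
        double p = (double) kv.second / (double) n;
        if (p > 0) h -= p * log2(p);
    }
    return h;
}

int main() {
    vector<pair<string, string>> rows = {  // (feature value, label)
            {"a", "no"}, {"b", "no"}, {"c", "yes"}, {"d", "yes"}};
    sort(rows.begin(), rows.end());        // sweep in feature order

    map<string, int> left, right;
    int leftSize = 0, rightSize = (int) rows.size();
    for (const auto &r : rows) right[r.second]++;  // everything starts on the right

    double minEntropy = 1e9;
    string bestThreshold;
    for (size_t i = 0; i < rows.size(); ++i) {
        // Move one row from right to left and update the counts in O(1).
        left[rows[i].second]++;
        right[rows[i].second]--;
        leftSize++;
        rightSize--;
        if (rightSize == 0) break;         // no valid split beyond the last row
        double total = leftSize + rightSize;
        double e = (leftSize / total) * sideEntropy(left, leftSize) +
                   (rightSize / total) * sideEntropy(right, rightSize);
        if (e < minEntropy) {
            minEntropy = e;
            bestThreshold = rows[i].first; // split rule: value <= threshold goes left
        }
    }
    cout << "best threshold " << bestThreshold << " entropy " << minEntropy << endl;  // "b", 0
    return 0;
}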

BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, std::map<int, set<string>> potentialSplits,
                             vector <vector<string>> data, vector <FeatureType> featureTypes) {
...
@@ -319,10 +437,9 @@ BestSplitPoint findBestSplit(double parentEntropy, int currentDepth, std::map<in
//            }
//            dataFalse = featSplitData.dataFalse;
            splitEntropy = calculateSplitEntropy({featSplitData.dataTrue, featSplitData.dataFalse});
            if (localEntropy >= splitEntropy) {
                localEntropy = splitEntropy;
            } else {
                break;
            }
        }
...
src/helpers.hpp
...
@@ -41,6 +41,7 @@ float calculateSplitEntropy (FeatureSplitData featsplitData);
vector <vector<string>> bagData(vector <vector<string>> data, float baggingWeight);
vector<int> randomSelect_WithoutReplacement(int originalNum, float percentTraining);
vector <vector<string>> oversample(vector <vector<string>> data);
+BestSplitPoint findBestSplitTrial(double parentEntropy, int currentDepth, vector <vector<string>> data,
+                                  vector <FeatureType> featureType, float featureWeight);
#endif
\ No newline at end of file