Скачиваний:
28
Добавлен:
01.05.2014
Размер:
15.41 Кб
Скачать
unit J48SplitClass;

interface

uses

Instances,
DistributionClass,
SplitClass,
EntropyCalculator,
GainRatioCrit,
Utils,
DmmTypes,
RuleItem,
ItemSet,
Attribute;

// Split on a single attribute, derived from Split.
// As implemented in this unit the split is always binary:
//  - nominal attribute: "equals one chosen value" (subset 0) versus
//    "any other value" (subset 1);
//  - numeric attribute: "<= split point" (subset 0) versus "> split point"
//    (subset 1).
// m_numSubsets and m_distribution are used throughout the implementation but
// are not declared here; presumably they are inherited from Split.
type J48Split = class (Split)

private

// Index of the attribute this split tests (set by the constructor).
m_Attribute: integer;

// Desired number of branches.
// NOTE(review): only referenced from commented-out code in this unit.
m_complexityIndex: integer;

// Minimum number of objects in a split.
m_minNoObj: integer;

// Value of split point. For a numeric attribute this is the threshold;
// for a nominal attribute it holds the index of the chosen attribute value.
// buildClassifier initialises it to MaxDouble.
m_splitPoint: double;

// InfoGain of split.
m_infoGain: double;

// GainRatio of split.
m_gainRatio: double;

// The sum of the weights of the instances.
m_sumOfWeights: double;

// Number of split points: handleNumericAttribute increments it for every
// candidate threshold that leaves enough instances on both sides.
m_index: integer;

// Searches for the best "one value versus the rest" split of a nominal attribute.
procedure handleEnumeratedAttribute(trainInstances: TDMInstances);

// Searches for the best threshold split of a numeric attribute.
procedure handleNumericAttribute(trainInstances: TDMInstances);


public

// Stores the attribute index, the minimum subset size and the sum of weights.
constructor create(attIndex: integer; minNoObj: integer; sumOfWeights: double);

// Resets the split state and evaluates the attribute on trainInstances.
procedure buildClassifier(trainInstances: TDMInstances);

// True when m_numSubsets > 0.
function checkModel(): boolean;

// Returns m_infoGain.
function infoGain(): double;

// Returns m_gainRatio.
function gainRatio(): double;

// Returns m_Attribute.
function attIndex(): integer;

// For a numeric attribute with more than one subset: replaces the split point
// by the largest non-missing attribute value in allInstances that does not
// exceed the current split point.
procedure setSplitPoint(allInstances: TDMInstances);

// For an instance whose split attribute is missing: one weight per subset,
// perBag(i) / total of m_distribution. Returns nil otherwise.
function getWeights(instance: TDMInstance): DArray;

// Subset index (0 or 1) for the instance, or -1 when the split attribute is missing.
function whichSubset(instance: TDMInstance): integer;

// Rebuilds m_distribution from the given data using this split.
procedure resetDistribution(data: TDMInstances);

// Name of the split attribute.
function leftSide(data: TDMInstances): String;

// Text of the test for the given branch index (operator plus value).
function rightSide(index: integer; data: TDMInstances): String;

// Partitions data into m_numSubsets instance sets according to this split.
function split(data: TDMInstances): TDMInstanceArray;

// Writes this split's test for branch `index` into the premise of ri and returns ri.
function ruleSeting(data: TDMInstances; ri: TDMRuleItem; index: integer): TDMRuleItem;

end;


implementation

uses Math, SysUtils;

// Creates a split descriptor for one attribute.
//   attIndex     - index of the attribute to split on
//   minNoObj     - minimum number of objects required in a subset
//   sumOfWeights - sum of the weights of the instances
// Only stores the arguments; the split itself is computed by buildClassifier.
constructor J48Split.create(attIndex: integer; minNoObj: integer; sumOfWeights: double);
begin
  Self.m_sumOfWeights := sumOfWeights;
  Self.m_minNoObj := minNoObj;
  Self.m_Attribute := attIndex;
end;

// Builds a C4.5-type split for the configured attribute on the given data.
// Assumes that none of the class values is missing.
// The split state (subset count, split point, info gain, gain ratio) is reset
// first, then the search is delegated to the nominal or the numeric handler.
// For a numeric attribute trainInstances is sorted by that attribute before
// the search, so the caller's data is re-ordered by this call.
procedure J48Split.buildClassifier(trainInstances: TDMInstances);
begin
  // Start from the "no split found" state.
  m_gainRatio := 0;
  m_infoGain := 0;
  m_splitPoint := MaxDouble;
  m_numSubsets := 0;

  if not trainInstances.attribute(m_Attribute).isNominal() then
  begin
    // Numeric attribute: the threshold search walks neighbouring instances,
    // so the data has to be ordered by the attribute first.
    trainInstances.sort(trainInstances.attribute(m_Attribute));
    handleNumericAttribute(trainInstances);
  end
  else
    // Nominal attribute: no ordering needed.
    handleEnumeratedAttribute(trainInstances);
end;

// Recomputes m_distribution for another data set using this split.
// Instances that whichSubset can place (result other than -1) are collected
// into a temporary instance set and passed to Distribution.CreateM together
// with this split; afterwards addInstWithUnknown is called with the full data
// set and the split attribute.
// NOTE(review): neither the temporary instance set nor the previous
// m_distribution is released here - confirm ownership rules of these classes.
procedure J48Split.resetDistribution(data: TDMInstances);
var
  known: TDMInstances;
  rebuilt: Distribution;
  row, rowCount: integer;
begin
  rowCount := data.numInstances();
  known := TDMInstances.Create(data, rowCount);

  for row := 0 to rowCount - 1 do
    if whichSubset(data.instance(row)) >= 0 then
      known.add(data.instance(row));

  rebuilt := Distribution.CreateM(known, Self);
  rebuilt.addInstWithUnknown(data, m_Attribute);
  m_distribution := rebuilt;
end;



//Creates split on enumerated attribute.
{ procedure J48Split.handleEnumeratedAttribute(trainInstances: TDMInstances);
var
instance: TDMInstance;
enu: TDMInstanceEnumeration;
begin

m_distribution := Distribution.Create(m_complexityIndex,
trainInstances.numClasses());

// Only Instances with known values are relevant.
enu := trainInstances.enumerateInstances();
while (enu.hasMoreElements()) do
begin
instance := TDMInstance (enu.nextElement());
if (not instance.isMissing(m_Attribute))
then
m_distribution.add( trunc(instance.value(m_Attribute)), instance);
end;

// Check if minimum number of Instances in at least two
// subsets.
if (m_distribution.check(m_minNoObj))
then
begin
m_numSubsets := m_complexityIndex;
m_infoGain := EntropyCalculator.splitCritValue(m_distribution, m_sumOfWeights);
m_gainRatio := EntropyCalculator.splitCritValueN(m_distribution, m_sumOfWeights,
m_infoGain);
end;
end;}


// Creates a binary split on a nominal (enumerated) attribute.
//
// A distribution with one bag per attribute value is built from all instances
// whose value is known. Then every attribute value i whose bag holds at least
// m_minNoObj is tried as a candidate: Distribution.CreateDI(newDistribution, i)
// builds the candidate distribution, which must pass check(m_minNoObj).
// The candidate with the highest gain ratio wins; for the winner
// m_gainRatio, m_infoGain, m_splitPoint (the value index i) and
// m_distribution are updated and m_numSubsets is set to 2.
//
// The first candidate that passes the checks is always recorded, whatever its
// value index. Previously this was keyed on "i = 0", so when value 0 was not a
// usable candidate and no later candidate had a gain ratio above 0, the split
// reported 2 subsets while m_splitPoint was still MaxDouble and
// m_distribution still had one bag per value.
//
// If no candidate passes the checks, m_numSubsets is left untouched and
// m_distribution holds the per-value distribution.
procedure J48Split.handleEnumeratedAttribute(trainInstances: TDMInstances);
var
  newDistribution, secondDistribution: Distribution;
  numAttValues: integer;
  currIG, currGR: double;
  instance: TDMInstance;
  i: integer;
  enu: TDMInstanceEnumeration;
  haveSplit: boolean;
begin
  numAttValues := trainInstances.attribute(m_Attribute).numValues();
  newDistribution := Distribution.Create(numAttValues, trainInstances.numClasses());

  // Only instances with known values are relevant.
  enu := trainInstances.enumerateInstances();
  while (enu.hasMoreElements()) do
  begin
    instance := enu.nextElement();
    if (not instance.isMissing(m_Attribute)) then
      newDistribution.add(trunc(instance.value(m_Attribute)), instance);
  end;
  m_distribution := newDistribution;

  // No candidate has been recorded yet.
  haveSplit := false;

  // For all values
  for i := 0 to numAttValues - 1 do
  begin
    if (Utils.grOrEq(newDistribution.perBag(i), m_minNoObj)) then
    begin
      secondDistribution := Distribution.CreateDI(newDistribution, i);
      // Check if minimum number of instances in the two subsets.
      if (secondDistribution.check(m_minNoObj)) then
      begin
        currIG := EntropyCalculator.splitCritValue(secondDistribution,
          m_sumOfWeights);
        currGR := GainRatioCrit.splitCritValueN(secondDistribution,
          m_sumOfWeights, currIG);
        // Take the first valid candidate unconditionally, later ones only
        // when they strictly improve the gain ratio.
        if (not haveSplit) or Utils.gr(currGR, m_gainRatio) then
        begin
          haveSplit := true;
          m_numSubsets := 2;
          m_gainRatio := currGR;
          m_infoGain := currIG;
          m_splitPoint := i;
          m_distribution := secondDistribution;
        end;
      end;
    end;
  end;
end;



// Creates a binary threshold split on a numeric attribute.
//
// Expects trainInstances to be ordered by the split attribute already
// (buildClassifier sorts before calling). Instances are read until the first
// one with a missing attribute value.
// NOTE(review): this relies on the sort placing missing values last - confirm
// against TDMInstances.sort.
//
// Steps:
//  1. Put all instances with a known value into bag 1 of a 2-bag distribution.
//  2. Derive the minimum subset size: 10% of the total weight divided by the
//     number of classes, but not below m_minNoObj and not above 25. Give up
//     if there are fewer than two such subsets' worth of known instances.
//  3. Walk neighbouring instances; wherever the value rises by more than 1e-5
//     there is a candidate threshold. Instances up to that point are shifted
//     from bag 1 to bag 0 and, if both bags are large enough, the candidate is
//     scored and counted in m_index.
//  4. If a candidate was found, reduce the best info gain by
//     log2(m_index) / m_sumOfWeights; give up if nothing positive is left.
//  5. Store the midpoint between the two neighbouring values as split point,
//     rebuild the distribution for that split and compute its gain ratio.
//
// m_index is reset at the start: it is an object field, and without the reset
// a second buildClassifier call on the same object would start from the
// previous run's candidate count.
procedure J48Split.handleNumericAttribute(trainInstances: TDMInstances);
var
  firstMiss: integer;
  next: integer;
  last: integer;
  splitIndex: integer;
  currentInfoGain: double;
  defaultEnt: double;
  minSplit: double;
  instance: TDMInstance;
  i: integer;
  enu: TDMInstanceEnumeration;
begin
  next := 1;
  last := 0;
  splitIndex := -1;
  // Candidate counter for this run only.
  m_index := 0;

  // Current attribute is a numeric attribute.
  m_distribution := Distribution.Create(2, trainInstances.numClasses());

  // Only instances with known values are relevant.
  enu := trainInstances.enumerateInstances();
  i := 0;
  while (enu.hasMoreElements()) do
  begin
    instance := TDMInstance (enu.nextElement());
    if (instance.isMissing(m_Attribute)) then
      break;
    m_distribution.add(1, instance);
    i := i + 1;
  end;

  // Index of the first instance with a missing value = number of usable ones.
  firstMiss := i;

  // Compute minimum number of instances required in each subset.
  minSplit := 0.1 * (m_distribution.total()) /
    (trunc(trainInstances.numClasses()));
  if (Utils.smOrEq(minSplit, m_minNoObj)) then
    minSplit := m_minNoObj
  else
    if (Utils.gr(minSplit, 25)) then
      minSplit := 25;

  // Enough instances with known values?
  if (Utils.sm(trunc(firstMiss), 2 * minSplit)) then
    exit;

  // Compute values of criteria for all possible split indices.
  defaultEnt := EntropyCalculator.oldEnt(m_distribution);
  while (next < firstMiss) do
  begin
    if (trainInstances.instance(next - 1).value(m_Attribute) + 1e-5 <
        trainInstances.instance(next).value(m_Attribute)) then
    begin
      // Move class values for all instances up to next possible split point.
      m_distribution.shiftRange(1, 0, trainInstances, last, next);

      // Check if enough instances in each subset and compute values for
      // criteria.
      if (Utils.grOrEq(m_distribution.perBag(0), minSplit) and
          Utils.grOrEq(m_distribution.perBag(1), minSplit)) then
      begin
        currentInfoGain := EntropyCalculator.
          splitCritValueN(m_distribution, m_sumOfWeights, defaultEnt);
        if (Utils.gr(currentInfoGain, m_infoGain)) then
        begin
          m_infoGain := currentInfoGain;
          splitIndex := next - 1;
        end;
        m_index := m_index + 1;
      end;
      last := next;
    end;
    next := next + 1;
  end;

  // Was there any useful split?
  if (m_index = 0) then
    exit;

  // Compute modified information gain for best split.
  m_infoGain := m_infoGain - (Math.log2(m_index) / m_sumOfWeights);
  if (Utils.smOrEq(m_infoGain, 0)) then
    exit;

  // Set instance variables' values to values for best split.
  m_numSubsets := 2;
  m_splitPoint :=
    (trainInstances.instance(splitIndex + 1).value(m_Attribute) +
     trainInstances.instance(splitIndex).value(m_Attribute)) / 2;

  // Restore distribution for best split.
  m_distribution := Distribution.Create(2, trainInstances.numClasses());
  m_distribution.addRange(0, trainInstances, 0, splitIndex + 1);
  m_distribution.addRange(1, trainInstances, splitIndex + 1, firstMiss);

  // Compute modified gain ratio for best split.
  m_gainRatio := GainRatioCrit.
    splitCritValueN(m_distribution, m_sumOfWeights, m_infoGain);
end;


// Reports whether a usable split was found, i.e. whether the number of
// subsets is greater than zero.
function J48Split.checkModel(): boolean;
begin
  Result := (m_numSubsets > 0);
end;


// Returns the information gain stored for this split.
function J48Split.infoGain(): double;
begin
  Result := Self.m_infoGain;
end;

// Returns the gain ratio stored for this split.
function J48Split.gainRatio(): double;
begin
  Result := Self.m_gainRatio;
end;

// Returns the index of the attribute this split tests.
function J48Split.attIndex(): integer;
begin
  Result := Self.m_Attribute;
end;


// Moves the split point onto a value that actually occurs in allInstances.
// Only applies when the split attribute is numeric and the split has more
// than one subset. The new split point is the largest non-missing attribute
// value that is still smaller than or equal to the current split point; if no
// instance qualifies, the split point becomes -MaxDouble.
procedure J48Split.setSplitPoint(allInstances: TDMInstances);
var
  best: double;
  candidate: double;
  current: TDMInstance;
  iter: TDMInstanceEnumeration;
begin
  best := -MaxDouble;

  // Nothing to do for nominal attributes or when no split exists.
  if not ((allInstances.attribute(m_Attribute).isNumeric()) and
          (m_numSubsets > 1)) then
    exit;

  iter := allInstances.enumerateInstances();
  while iter.hasMoreElements() do
  begin
    current := iter.nextElement();
    if current.isMissing(m_Attribute) then
      continue;

    candidate := current.value(m_Attribute);
    if (Utils.gr(candidate, best) and
        Utils.smOrEq(candidate, m_splitPoint)) then
      best := candidate;
  end;

  m_splitPoint := best;
end;

// Splits data into m_numSubsets instance sets according to this split.
// An instance that whichSubset can place goes into exactly that subset.
// An instance it cannot place (result -1) is added to every subset whose
// weight from getWeights is greater than 0, and the weight of the instance
// just added to that subset is set to (subset weight * instance weight).
// Every subset is compactified before the array is returned.
function J48Split.split(data: TDMInstances): TDMInstanceArray;
var
  parts: TDMInstanceArray;
  shares: DArray;
  scaled: double;
  current: TDMInstance;
  bag, row, k: integer;
begin
  // One empty instance set per subset, created from the source data set.
  SetLength(parts, m_numSubsets);
  for k := 0 to m_numSubsets - 1 do
    parts[k] := TDMInstances.Create(data, data.numInstances());

  for row := 0 to data.numInstances() - 1 do
  begin
    current := data.instance(row);
    shares := getWeights(current);
    bag := whichSubset(current);

    if bag >= 0 then
      parts[bag].add(current)
    else
      // Unknown attribute value: distribute over the subsets.
      for k := 0 to m_numSubsets - 1 do
      begin
        if not Utils.gr(shares[k], 0) then
          continue;
        scaled := shares[k] * current.weight();
        parts[k].add(current);
        parts[k].lastInstance().setWeight(scaled);
      end;
  end;

  for k := 0 to m_numSubsets - 1 do
    parts[k].compactify();

  Result := parts;
end;

// Returns the per-subset weights for an instance whose split attribute is
// missing: entry i is m_distribution.perBag(i) / m_distribution.total().
// For an instance with a known value the result is nil.
function J48Split.getWeights(instance: TDMInstance): DArray;
var
  bag: integer;
begin
  Result := nil;
  if not instance.isMissing(m_Attribute) then
    exit;

  SetLength(Result, m_numSubsets);
  for bag := 0 to m_numSubsets - 1 do
    Result[bag] := m_distribution.perBag(bag) / m_distribution.total();
end;


// Returns the subset an instance falls into:
//   -1  the split attribute is missing for this instance
//    0  nominal: the (truncated) value equals the (truncated) split point;
//       numeric: the value is smaller than or equal to the split point
//    1  otherwise
function J48Split.whichSubset(instance: TDMInstance): integer;
var
  firstBranch: boolean;
begin
  if instance.isMissing(m_Attribute) then
  begin
    Result := -1;
    exit;
  end;

  if instance.attribute(m_Attribute).isNominal() then
    firstBranch := (trunc(m_splitPoint) = trunc(instance.value(m_Attribute)))
  else
    firstBranch := Utils.smOrEq(instance.value(m_Attribute), m_splitPoint);

  if firstBranch then
    Result := 0
  else
    Result := 1;
end;


// Returns the left-hand side of the split test: the name of the split
// attribute as reported by the given data set.
function J48Split.leftSide(data: TDMInstances): String;
begin
  Result := data.attribute(Self.m_Attribute).name();
end;

// Records this split's test for branch `index` in the premise of a rule item.
//   data  - data set used to look up whether the split attribute is nominal
//   ri    - rule item whose premise is extended; it is modified and returned
//   index - branch of the split (0 = first branch, anything else = second)
//
// If ri has no premise yet, a new item set is created. Its two arrays get
// 1000 entries as before, or m_Attribute + 1 entries when the attribute index
// would not fit; previously an attribute index of 1000 or more was written
// past the end of the freshly created arrays.
// NOTE(review): a premise that already exists is used as is; its arrays are
// assumed to be large enough - confirm against the code that creates it.
//
// At position m_Attribute the premise receives the truncated split point as
// item and a condition code:
//   nominal attribute:  0 for branch 0, 10 for the other branch
//   numeric attribute: -1 for branch 0,  1 for the other branch
// NOTE(review): rightSide renders the same four cases as '=', '!=', '<=' and
// '>', so the codes presumably stand for these operators - confirm against
// the consumer of m_condidtions.
function J48Split.ruleSeting(data: TDMInstances; ri: TDMRuleItem; index: integer): TDMRuleItem;
const
  DEFAULT_SLOTS = 1000;
var
  left: TDMItemSet;
  la, ra: IArray;
  slots: integer;
  condition: integer;
begin
  left := ri.premise();
  if (left = nil) then
  begin
    // Keep the historical size unless the attribute index needs more room.
    slots := DEFAULT_SLOTS;
    if m_Attribute >= slots then
      slots := m_Attribute + 1;
    SetLength(la, slots);
    SetLength(ra, slots);
    left := TDMItemSet.create(la, ra);
  end;

  // Pick the condition code for this attribute type and branch.
  if (data.attribute(m_Attribute).isNominal()) then
  begin
    if index = 0 then
      condition := 0
    else
      condition := 10;
  end
  else
  begin
    if index = 0 then
      condition := -1
    else
      condition := 1;
  end;

  left.m_items[m_Attribute] := trunc(m_splitPoint);
  left.m_condidtions[m_Attribute] := condition;

  ri.m_premise := left;
  Result := ri;
end;

// Returns the right-hand side of the split test for one branch, i.e. the
// operator followed by the value:
//   nominal attribute: ' = <value>' for branch 0, ' != <value>' otherwise,
//                      where <value> is the attribute value whose index is
//                      the truncated split point
//   numeric attribute: ' <= <split point>' for branch 0, ' > <split point>'
//                      otherwise
// The numeric split point is printed with FloatToStr so that a fractional
// threshold is shown as it is tested by whichSubset. Previously it was
// truncated to an integer, so a test at 2.5 was printed as '<= 2'.
// Integer-valued split points print exactly as before.
function J48Split.rightSide(index: integer; data: TDMInstances): String;
begin
  if (data.attribute(m_Attribute).isNominal()) then
  begin
    if (index = 0) then
      Result := ' = ' + data.attribute(m_Attribute).value(trunc(m_splitPoint))
    else
      Result := ' != ' + data.attribute(m_Attribute).value(trunc(m_splitPoint));
  end
  else
  begin
    if (index = 0) then
      Result := ' <= ' + FloatToStr(m_splitPoint)
    else
      Result := ' > ' + FloatToStr(m_splitPoint);
  end;
end;

end.


Соседние файлы в папке j48