{ Hosting-page metadata (not part of the unit):
  Downloads: 28
  Added: 01.05.2014
  Size: 10 KB
  Download (link label) }
unit ModelSelector;

interface

uses

J48SplitClass,
NoSplitClass,
SplitClass,
DistributionClass,
Attribute,
Utils,
Debugger,
FastVector,
Classes,
SysUtils,
Instances;


type
  // Chooses the split model (or a "no split" model) for a set of instances.
  // Holds the parameters shared by selectModel and selectBinModel.
  MSelector = class
  private
    // Minimum number of objects in interval.
    m_minNoObj: integer;

    // All the training data. Stored as a reference only; may be nil
    // (selectModel tests it against nil before use).
    m_allData: TDMInstances;

  public
    // Stores allData and minNoObj in the corresponding private fields.
    constructor Create(allData: TDMInstances; minNoObj: integer);

    // Selects a C4.5-type split for data (non-binary tree variant).
    function selectModel(data: TDMInstances): Split;

    // Selects a split for data (binary tree variant).
    function selectBinModel(data: TDMInstances): Split;
  end;
implementation


// Creates a model selector.
//
// Parameters:
//   allData  - all the training data; only the reference is stored, no copy
//              is made.
//   minNoObj - minimum number of objects in interval.
constructor MSelector.Create(allData: TDMInstances; minNoObj: integer);
begin
  // Run ancestor initialisation first so the class stays correct if its
  // ancestor ever gains construction logic.
  inherited Create;
  m_allData := allData;
  m_minNoObj := minNoObj;
end;


// Selects a C4.5-type split for the given dataset (non-binary tree).
//
// Parameters:
//   data - instances to choose a split for.
//
// Returns:
//   A NoSplit model built on the class distribution of data when
//     * the early-exit test fires (not enough instances to split, or all
//       instances belong to one class),
//     * no candidate split is counted as valid, or
//     * the best gain ratio found is equal to 0 according to Utils.eq;
//   otherwise the J48Split with the greatest gain ratio among the candidates
//   whose info gain is at least the average info gain minus 1E-3.
//
// NOTE(review): the candidate J48Split objects that are not returned (and the
// NoSplit object when a J48Split is returned) are not freed here - confirm
// who owns them.
function MSelector.selectModel(data: TDMInstances): Split;
const
  LOG_FILE = 'C:\Weka-3-4\data\log.txt';
var
  minResult: double;
  currentModel: array of J48Split;
  bestModel: J48Split;
  noSplitModel: NoSplit;
  averageInfoGain: double;
  validModels: integer;
  multiVal: boolean;
  checkDistribution: Distribution;
  attribute: TDMAttribute;
  sumOfWeights: double;
  i, j: integer;
  enu: TDMAttributeEnumeration;
  v: TDMFastVector;
  atr: TDMAttribute;
begin
  bestModel := nil;
  averageInfoGain := 0;
  validModels := 0;
  multiVal := true;

  Debugger.writeln(LOG_FILE, 'Model Selection...');

  // Check if all Instances belong to one class or if not
  // enough Instances to split.
  // The distribution is built once and shared with the NoSplit model.
  checkDistribution := Distribution.createI(data);
  noSplitModel := NoSplit.create(checkDistribution);
  if (Utils.sm(checkDistribution.total(), 2*m_minNoObj) or
      Utils.eq(checkDistribution.total(),
               checkDistribution.perClass(checkDistribution.maxClass()))) then
  begin
    result := noSplitModel;
    exit;
  end;

  // Check if all attributes are nominal and have a lot of values.
  // multiVal stays true only if no attribute is numeric and no attribute
  // passes the Utils.sm test against 0.3 * m_allData.numInstances().
  if (m_allData <> nil) then
  begin
    enu := data.enumerateAttributes();
    while (enu.hasMoreElements()) do
    begin
      attribute := enu.nextElement();
      if ((attribute.isNumeric()) or
          (Utils.sm(trunc(attribute.numValues()),
                    (0.3*trunc(m_allData.numInstances()))))) then
      begin
        multiVal := false;
        break;
      end;
    end;
  end;

  SetLength(currentModel, data.numAttributes());
  sumOfWeights := data.sumOfWeights();

  // For each attribute.
  for i := 0 to data.numAttributes()-1 do
  begin
    // Apart from class attribute.
    if (i <> data.classIndex()) then
    begin
      // Get models for current attribute.
      currentModel[i] := J48Split.Create(i, m_minNoObj, sumOfWeights);
      currentModel[i].buildClassifier(data);

      // Debug trace: logs the type of every attribute of data. It is emitted
      // once per candidate attribute; kept in place so the log output is
      // unchanged.
      v := data.attributes();
      for j := 0 to v.size-1 do
      begin
        atr := v.elementAt(j) as TDMAttribute;
        Debugger.Writeln(LOG_FILE, IntToStr(j)+': '+atr.getAttrType);
      end;

      // Check if useful split for current attribute
      // exists and check for enumerated attributes with
      // a lot of values.
      if (currentModel[i].checkModel()) then
      begin
        if (m_allData <> nil) then
        begin
          if ((data.attribute(i).isNumeric()) or
              (multiVal or Utils.sm(trunc(data.attribute(i).numValues()),
                                    (0.3*trunc(m_allData.numInstances()))))) then
          begin
            averageInfoGain := averageInfoGain+currentModel[i].infoGain();
            validModels := validModels+1;
          end;
        end
        else
        begin
          // No reference data set: every model that passed checkModel counts.
          // This branch belongs to the m_allData test, not to the checkModel
          // test, so models that fail checkModel are never counted.
          averageInfoGain := averageInfoGain+currentModel[i].infoGain();
          validModels := validModels+1;
        end;
      end;
    end
    else
      currentModel[i] := nil;
  end;

  // Check if any useful split was found.
  if (validModels = 0) then
  begin
    result := noSplitModel;
    exit;
  end;
  averageInfoGain := averageInfoGain/validModels;

  // Find "best" attribute to split on.
  minResult := 0;
  for i := 0 to data.numAttributes()-1 do
  begin
    // The class-attribute slot is nil; the index test is written first so it
    // is skipped before checkModel is called on it.
    if (i <> data.classIndex()) and (currentModel[i].checkModel()) then
    begin
      // Use 1E-3 here to get a closer approximation to the original
      // implementation.
      if ((currentModel[i].infoGain() >= (averageInfoGain-1E-3)) and
          Utils.gr(currentModel[i].gainRatio(), minResult)) then
      begin
        bestModel := currentModel[i];
        minResult := currentModel[i].gainRatio();
      end;
    end;
  end;

  // Check if useful split was found.
  if (Utils.eq(minResult, 0)) then
  begin
    result := noSplitModel;
    exit;
  end;

  // Add all Instances with unknown values for the corresponding
  // attribute to the distribution for the model, so that
  // the complete distribution is stored with the model.
  bestModel.getDistribution().addInstWithUnknown(data, bestModel.attIndex());

  // Set the split point analogue to C45 if attribute numeric.
  if (m_allData <> nil) then
    bestModel.setSplitPoint(m_allData);

  result := bestModel;
  Debugger.writeln(LOG_FILE, bestModel.leftSide(data)+bestModel.rightSide(0, data));
  Debugger.writeln(LOG_FILE, 'Model Selection End');
end;



// Selects a split for the given dataset (binary tree variant).
//
// Parameters:
//   data - instances to choose a split for.
//
// Returns:
//   A NoSplit model built on the class distribution of data when
//     * the early-exit test fires (not enough instances to split, or all
//       instances belong to one class),
//     * no candidate split is counted as valid, or
//     * the best gain ratio found is equal to 0 according to Utils.eq;
//   otherwise the J48Split with the greatest gain ratio among the candidates
//   whose info gain is at least the average info gain minus 1E-3.
//
// m_allData may be nil: in that case the "lot of values" filter is skipped,
// every model that passes checkModel is counted, and no split point is set,
// which matches how selectModel treats a nil m_allData.
//
// NOTE(review): candidates are created as J48Split, the same class that
// selectModel uses - confirm whether a dedicated binary split class was
// intended.
function MSelector.selectBinModel(data: TDMInstances): Split;
var
  minResult: double;
  currentModel: array of J48Split;
  bestModel: J48Split;
  noSplitModel: NoSplit;
  averageInfoGain: double;
  validModels: integer;
  multiVal: boolean;
  checkDistribution: Distribution;
  sumOfWeights: double;
  i: integer;
  enu: TDMAttributeEnumeration;
  attribute: TDMAttribute;
begin
  bestModel := nil;
  averageInfoGain := 0;
  validModels := 0;
  multiVal := true;

  // Check if all Instances belong to one class or if not
  // enough Instances to split.
  checkDistribution := Distribution.CreateI(data);
  noSplitModel := NoSplit.create(checkDistribution);
  if (Utils.sm(checkDistribution.total(), 2*m_minNoObj) or
      Utils.eq(checkDistribution.total(),
               checkDistribution.perClass(checkDistribution.maxClass()))) then
  begin
    result := noSplitModel;
    exit;
  end;

  // Check if all attributes are nominal and have a lot of values.
  // Needs m_allData for the instance count, so it is skipped when nil.
  if (m_allData <> nil) then
  begin
    enu := data.enumerateAttributes();
    while (enu.hasMoreElements()) do
    begin
      attribute := enu.nextElement() as TDMAttribute;
      if ((attribute.isNumeric()) or
          (Utils.sm(trunc(attribute.numValues()),
                    (0.3*trunc(m_allData.numInstances()))))) then
      begin
        multiVal := false;
        break;
      end;
    end;
  end;

  SetLength(currentModel, data.numAttributes());
  sumOfWeights := data.sumOfWeights();

  // For each attribute.
  for i := 0 to data.numAttributes()-1 do
  begin
    // Apart from class attribute.
    if (i <> data.classIndex()) then
    begin
      // Get models for current attribute.
      currentModel[i] := J48Split.create(i, m_minNoObj, sumOfWeights);
      currentModel[i].buildClassifier(data);

      // Check if useful split for current attribute
      // exists and check for enumerated attributes with
      // a lot of values.
      if (currentModel[i].checkModel()) then
      begin
        if (m_allData <> nil) then
        begin
          if (data.attribute(i).isNumeric() or
              (multiVal or Utils.sm(trunc(data.attribute(i).numValues()),
                                    (0.3*trunc(m_allData.numInstances()))))) then
          begin
            averageInfoGain := averageInfoGain+currentModel[i].infoGain();
            validModels := validModels+1;
          end;
        end
        else
        begin
          // No reference data set: every model that passed checkModel counts.
          averageInfoGain := averageInfoGain+currentModel[i].infoGain();
          validModels := validModels+1;
        end;
      end;
    end
    else
      currentModel[i] := nil;
  end;

  // Check if any useful split was found.
  if (validModels = 0) then
  begin
    result := noSplitModel;
    exit;
  end;
  averageInfoGain := averageInfoGain/validModels;

  // Find "best" attribute to split on.
  minResult := 0;
  for i := 0 to data.numAttributes()-1 do
  begin
    // The class-attribute slot is nil; the index test is written first so it
    // is skipped before checkModel is called on it.
    if ((i <> data.classIndex()) and
        (currentModel[i].checkModel())) then
    begin
      // Use 1E-3 here to get a closer approximation to the original
      // implementation.
      if ((currentModel[i].infoGain() >= (averageInfoGain-1E-3)) and
          Utils.gr(currentModel[i].gainRatio(), minResult)) then
      begin
        bestModel := currentModel[i];
        minResult := currentModel[i].gainRatio();
      end;
    end;
  end;

  // Check if useful split was found.
  if (Utils.eq(minResult, 0)) then
  begin
    result := noSplitModel;
    exit;
  end;

  // Add all Instances with unknown values for the corresponding
  // attribute to the distribution for the model, so that
  // the complete distribution is stored with the model.
  bestModel.getDistribution().addInstWithUnknown(data, bestModel.attIndex());

  // Set the split point analogue to C45 if attribute numeric.
  if (m_allData <> nil) then
    bestModel.setSplitPoint(m_allData);

  result := bestModel;
end;


end.
{ Hosting-page footer (not part of the unit): neighbouring files in the j48 folder }