// File: BinC45ModelSelection.java
// (Code-viewer page residue, originally a "Font size:" label; not part of the source.)
/**
*
* AgentAcademy - an open source Data Mining framework for
* training intelligent agents
*
* Copyright (C) 2001-2003 AA Consortium.
*
* This library is open source software; you can redistribute it
* and/or modify it under the terms of the GNU Lesser General
* Public License as published by the Free Software Foundation;
* either version 2.0 of the License, or (at your option) any later
* version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*
*/
package org.agentacademy.modules.dataminer.classifiers;
/**
* <p>Title: The Data Miner prototype</p>
* <p>Description: A prototype for the DataMiner (DM), the Agent Academy (AA) module responsible for performing data mining on the contents of the Agent Use Repository (AUR). The extracted knowledge is to be sent back to the AUR in the form of a PMML document.</p>
* <p>Copyright: Copyright (c) 2002</p>
* <p>Company: CERTH</p>
* @author asymeon
* @version 0.3
*/
import java.util.*;
import org.agentacademy.modules.dataminer.core.*;
/**
* Class for selecting a C4.5-like binary (!) split for a given dataset.
*
*/
public class BinC45ModelSelection extends ModelSelection{
/** Minimum number of instances in interval. */
private int m_minNoObj;
/** The FULL training dataset. */
private Instances m_allData;
/**
* Initializes the split selection method with the given parameters.
*
* @param minNoObj minimum number of instances that have to occur in
* at least two subsets induced by split
* @param allData FULL training dataset (necessary for selection of
* split points).
*/
public BinC45ModelSelection(int minNoObj,Instances allData){
m_minNoObj = minNoObj;
m_allData = allData;
}
/**
* Sets reference to training data to null.
*/
public void cleanup() {
m_allData = null;
}
/**
* Selects C4.5-type split for the given dataset.
*/
public final ClassifierSplitModel selectModel(Instances data){
double minResult;
double currentResult;
BinC45Split [] currentModel;
BinC45Split bestModel = null;
NoSplit noSplitModel = null;
double averageInfoGain = 0;
int validModels = 0;
boolean multiVal = true;
Distribution checkDistribution;
double sumOfWeights;
int i;
try{
// Check if all Instances belong to one class or if not
// enough Instances to split.
checkDistribution = new Distribution(data);
noSplitModel = new NoSplit(checkDistribution);
if (Utils.sm(checkDistribution.total(),2*m_minNoObj) ||
Utils.eq(checkDistribution.total(),
checkDistribution.perClass(checkDistribution.maxClass())))
return noSplitModel;
// Check if all attributes are nominal and have a
// lot of values.
Enumeration enum = data.enumerateAttributes();
while (enum.hasMoreElements()) {
Attribute attribute = (Attribute) enum.nextElement();
if ((attribute.isNumeric()) ||
(Utils.sm((double)attribute.numValues(),
(0.3*(double)m_allData.numInstances())))){
multiVal = false;
break;
}
}
currentModel = new BinC45Split[data.numAttributes()];
sumOfWeights = data.sumOfWeights();
// For each attribute.
for (i = 0; i < data.numAttributes(); i++){
// Apart from class attribute.
if (i != (data).classIndex()){
// Get models for current attribute.
currentModel[i] = new BinC45Split(i,m_minNoObj,sumOfWeights);
currentModel[i].buildClassifier(data);
// Check if useful split for current attribute
// exists and check for enumerated attributes with
// a lot of values.
if (currentModel[i].checkModel())
if ((data.attribute(i).isNumeric()) ||
(multiVal || Utils.sm((double)data.attribute(i).numValues(),
(0.3*(double)m_allData.numInstances())))){
averageInfoGain = averageInfoGain+currentModel[i].infoGain();
validModels++;
}
}else
currentModel[i] = null;
}
// Check if any useful split was found.
if (validModels == 0)
return noSplitModel;
averageInfoGain = averageInfoGain/(double)validModels;
// Find "best" attribute to split on.
minResult = 0;
for (i=0;i<data.numAttributes();i++){
if ((i != (data).classIndex()) &&
(currentModel[i].checkModel()))
// Use 1E-3 here to get a closer approximation to the original
// implementation.
if ((currentModel[i].infoGain() >= (averageInfoGain-1E-3)) &&
Utils.gr(currentModel[i].gainRatio(),minResult)){
bestModel = currentModel[i];
minResult = currentModel[i].gainRatio();
}
}
// Check if useful split was found.
if (Utils.eq(minResult,0))
return noSplitModel;
// Add all Instances with unknown values for the corresponding
// attribute to the distribution for the model, so that
// the complete distribution is stored with the model.
bestModel.distribution().
addInstWithUnknown(data,bestModel.attIndex());
// Set the split point analogue to C45 if attribute numeric.
bestModel.setSplitPoint(m_allData);
return bestModel;
}catch(Exception e){
e.printStackTrace();
}
return null;
}
/**
* Selects C4.5-type split for the given dataset.
*/
public final ClassifierSplitModel selectModel(Instances train, Instances test) {
return selectModel(train);
}
}
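/*
 * Code-viewer page residue (not part of the source), translated:
 *
 * Keyboard shortcuts
 *   Copy code             Ctrl + C
 *   Search code           Ctrl + F
 *   Full-screen mode      F11
 *   Toggle theme          Ctrl + Shift + D
 *   Show shortcuts        ?
 *   Increase font size    Ctrl + =
 *   Decrease font size    Ctrl + -
 */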