// CfsSubsetEval.java
// NOTE(review): the two original lines here were web-page furniture from the
// site this file was scraped from, not Java source; replaced with this comment.
/**
*
* AgentAcademy - an open source Data Mining framework for
* training intelligent agents
*
* Copyright (C) 2001-2003 AA Consortium.
*
* This library is open source software; you can redistribute it
* and/or modify it under the terms of the GNU Lesser General
* Public License as published by the Free Software Foundation;
* either version 2.0 of the License, or (at your option) any later
* version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*
*/
package org.agentacademy.modules.dataminer.attributeSelection;
import java.util.BitSet;
import java.util.Enumeration;
import java.util.Vector;
import org.agentacademy.modules.dataminer.core.ContingencyTables;
import org.agentacademy.modules.dataminer.core.Instance;
import org.agentacademy.modules.dataminer.core.Instances;
import org.agentacademy.modules.dataminer.core.Matrix;
import org.agentacademy.modules.dataminer.core.Option;
import org.agentacademy.modules.dataminer.core.OptionHandler;
import org.agentacademy.modules.dataminer.core.Utils;
import org.agentacademy.modules.dataminer.filters.DiscretizeFilter;
import org.agentacademy.modules.dataminer.filters.Filter;
import org.apache.log4j.Logger;
/**
* CFS attribute subset evaluator.
* For more information see: <p>
*
* Hall, M. A. (1998). Correlation-based Feature Subset Selection for Machine
* Learning. Thesis submitted in partial fulfilment of the requirements of the
* degree of Doctor of Philosophy at the University of Waikato. <p>
*
* Valid options are:
*
* -M <br>
* Treat missing values as a seperate value. <p>
*
* -L <br>
* Include locally predictive attributes. <p>
*
* @author Mark Hall (mhall@cs.waikato.ac.nz)
* @version $Revision: 1.3 $
*/
public class CfsSubsetEval
extends SubsetEvaluator
implements OptionHandler
{
/** Class-wide log4j logger. NOTE(review): public and mutable — kept as-is to preserve the interface. */
public static Logger log = Logger.getLogger(CfsSubsetEval.class);
/** The training instances (replaced by a discretised copy when the class is nominal). */
private Instances m_trainInstances;
/** Filter used to discretise attributes when the class is nominal. */
private DiscretizeFilter m_disTransform;
/** Index of the class attribute in the training data. */
private int m_classIndex;
/** True when the class attribute is numeric. */
private boolean m_isNumeric;
/** Number of attributes in the training data. */
private int m_numAttribs;
/** Number of instances in the training data. */
private int m_numInstances;
/** Treat missing values as a separate value (option -M); otherwise their counts are redistributed. */
private boolean m_missingSeperate;
/** Include locally predictive attributes (option -L). */
private boolean m_locallyPredictive;
/** Lazily-filled cache of pairwise attribute correlations; -999 marks an entry as not yet computed. */
private Matrix m_corr_matrix;
/** Standard deviations of attributes (used when Pearson's correlation applies). */
private double[] m_std_devs;
/** Threshold for admitting locally predictive features.
    NOTE(review): no initialisation or setter is visible in this chunk — confirm elsewhere. */
private double m_c_Threshold;
/**
 * Describes this attribute evaluator for display in the
 * explorer/experimenter GUI.
 *
 * @return a human-readable description of the evaluator
 */
public String globalInfo() {
  return "CfsSubsetEval :\n\nEvaluates the worth of a subset of attributes "
      + "by considering the individual predictive ability of each feature "
      + "along with the degree of redundancy between them.\n\nSubsets of "
      + "features that are highly correlated with the class while having "
      + "low intercorrelation are preferred.\n";
}
/**
 * Constructor. Resets all options to their default values
 * (see resetOptions, defined elsewhere in this class).
 */
public CfsSubsetEval () {
  resetOptions();
}
/**
 * Returns an enumeration describing the available options.
 *
 * @return an enumeration of all the available options
 */
public Enumeration listOptions () {
  // Fix: capacity hint was 3 but exactly two options (-M and -L) are registered.
  Vector newVector = new Vector(2);
  newVector.addElement(new Option("\tTreat missing values as a seperate"
      + "\n\tvalue.", "M", 0, "-M"));
  newVector.addElement(new Option("\tInclude locally predictive attributes"
      + ".", "L", 0, "-L"));
  return newVector.elements();
}
/**
 * Parses and sets a given list of options. <p>
 *
 * Valid options are: <p>
 *
 * -M <br>
 * Treat missing values as a separate value. <p>
 *
 * -L <br>
 * Include locally predictive attributes. <p>
 *
 * @param options the list of options as an array of strings
 * @exception Exception if an option is not supported
 */
public void setOptions (String[] options)
  throws Exception
{
  // Fix: removed the unused local variable "String optionString" (dead code).
  resetOptions();
  setMissingSeperate(Utils.getFlag('M', options));
  setLocallyPredictive(Utils.getFlag('L', options));
}
/**
 * Tip text for the "locally predictive" property, shown in the
 * explorer/experimenter GUI.
 *
 * @return the property's tip text
 */
public String locallyPredictiveTipText() {
  return "Identify locally predictive attributes. Iteratively adds attributes "
      + "with the highest correlation with the class as long as there is not "
      + "already an attribute in the subset that has a higher correlation with "
      + "the attribute in question";
}
/**
 * Sets whether locally predictive attributes should be included.
 *
 * @param b true to include locally predictive attributes
 */
public void setLocallyPredictive (boolean b) {
  this.m_locallyPredictive = b;
}
/**
 * Reports whether locally predictive attributes are included.
 *
 * @return true if locally predictive attributes are to be used
 */
public boolean getLocallyPredictive () {
  return this.m_locallyPredictive;
}
/**
 * Tip text for the "missing separate" property, shown in the
 * explorer/experimenter GUI.
 *
 * @return the property's tip text
 */
public String missingSeperateTipText() {
  return "Treat missing as a separate value. Otherwise, counts for missing values "
      + "are distributed across other values in proportion to their frequency.";
}
/**
 * Sets whether missing values are treated as a separate value.
 *
 * @param b true to treat missing as its own value
 */
public void setMissingSeperate (boolean b) {
  this.m_missingSeperate = b;
}
/**
 * Reports whether missing values are treated as a separate value.
 *
 * @return true if missing is to be treated as a separate value
 */
public boolean getMissingSeperate () {
  return this.m_missingSeperate;
}
/**
 * Gets the current settings of CfsSubsetEval.
 *
 * @return an array of strings suitable for passing to setOptions();
 *         unused slots are padded with empty strings
 */
public String[] getOptions () {
  String[] options = new String[2];
  int pos = 0;
  if (getMissingSeperate()) {
    options[pos++] = "-M";
  }
  if (getLocallyPredictive()) {
    options[pos++] = "-L";
  }
  // Pad the remainder so the returned array always has a fixed length.
  for (; pos < options.length; pos++) {
    options[pos] = "";
  }
  return options;
}
/**
 * Generates the attribute evaluator. Stores the training data (dropping
 * instances with a missing class), discretises the attributes when the
 * class is nominal, and initialises the lazily-filled correlation matrix.
 *
 * @param data set of instances serving as training data
 * @exception Exception if the evaluator has not been generated successfully
 */
public void buildEvaluator (Instances data)
  throws Exception
{
  if (data.checkForStringAttributes()) {
    throw new Exception("Can't handle string attributes!");
  }
  m_trainInstances = data;
  m_trainInstances.deleteWithMissingClass();
  m_classIndex = m_trainInstances.classIndex();
  m_numAttribs = m_trainInstances.numAttributes();
  m_numInstances = m_trainInstances.numInstances();
  m_isNumeric = m_trainInstances.attribute(m_classIndex).isNumeric();
  if (!m_isNumeric) {
    // Nominal class: discretise the attributes before correlating them.
    m_disTransform = new DiscretizeFilter();
    m_disTransform.setUseBetterEncoding(true);
    m_disTransform.setInputFormat(m_trainInstances);
    m_trainInstances = Filter.useFilter(m_trainInstances, m_disTransform);
  }
  m_std_devs = new double[m_numAttribs];
  m_corr_matrix = new Matrix(m_numAttribs, m_numAttribs);
  // Unit diagonal and unit std devs; -999 is the "not yet computed" sentinel
  // for every off-diagonal correlation entry (filled lazily by evaluateSubset).
  for (int row = 0; row < m_numAttribs; row++) {
    m_corr_matrix.setElement(row, row, 1.0);
    m_std_devs[row] = 1.0;
    for (int col = row + 1; col < m_numAttribs; col++) {
      m_corr_matrix.setElement(row, col, -999);
      m_corr_matrix.setElement(col, row, -999);
    }
  }
}
/**
 * Evaluates a subset of attributes with the CFS merit:
 * (sum of feature-class correlations) / sqrt(sum of all pairwise
 * feature inter-correlations). Pairwise correlations are computed on
 * demand and cached symmetrically in m_corr_matrix, where the value
 * -999 marks an entry as not yet computed.
 *
 * @param subset a bitset representing the attribute subset to be evaluated
 * @return the merit of the subset (always non-negative)
 * @exception Exception if the subset could not be evaluated
 */
public double evaluateSubset (BitSet subset)
  throws Exception
{
  double num = 0.0;
  double denom = 0.0;
  // Numerator: weighted correlation of every selected attribute with the class.
  for (int i = 0; i < m_numAttribs; i++) {
    if (i == m_classIndex || !subset.get(i)) {
      continue;
    }
    if (m_corr_matrix.getElement(i, m_classIndex) == -999) {
      double corr = correlate(i, m_classIndex);
      m_corr_matrix.setElement(i, m_classIndex, corr);
      m_corr_matrix.setElement(m_classIndex, i, corr);
      num += m_std_devs[i] * corr;
    } else {
      num += m_std_devs[i] * m_corr_matrix.getElement(i, m_classIndex);
    }
  }
  // Denominator: variances plus twice each pairwise inter-correlation.
  for (int i = 0; i < m_numAttribs; i++) {
    if (i == m_classIndex || !subset.get(i)) {
      continue;
    }
    denom += 1.0 * m_std_devs[i] * m_std_devs[i];
    for (int j = i + 1; j < m_numAttribs; j++) {
      if (!subset.get(j)) {
        continue;
      }
      if (m_corr_matrix.getElement(i, j) == -999) {
        double corr = correlate(i, j);
        m_corr_matrix.setElement(i, j, corr);
        m_corr_matrix.setElement(j, i, corr);
        denom += 2.0 * m_std_devs[i] * m_std_devs[j] * corr;
      } else {
        denom += 2.0 * m_std_devs[i] * m_std_devs[j]
            * m_corr_matrix.getElement(i, j);
      }
    }
  }
  // Guard against a negative or zero denominator before taking the root.
  denom = Math.abs(denom);
  if (denom == 0.0) {
    return (0.0);
  }
  return Math.abs(num / Math.sqrt(denom));
}
/**
 * Computes the correlation between two attributes, dispatching on their
 * types. When the class is nominal the attributes have been discretised,
 * so symmetrical uncertainty is always used; otherwise the appropriate
 * numeric/nominal correlation helper is chosen.
 *
 * @param att1 index of the first attribute
 * @param att2 index of the second attribute
 * @return the correlation between the two attributes
 */
private double correlate (int att1, int att2) {
  if (!m_isNumeric) {
    return symmUncertCorr(att1, att2);
  }
  boolean firstIsNumeric = m_trainInstances.attribute(att1).isNumeric();
  boolean secondIsNumeric = m_trainInstances.attribute(att2).isNumeric();
  if (firstIsNumeric && secondIsNumeric) {
    return num_num(att1, att2);
  } else if (secondIsNumeric) {
    // att1 is nominal, att2 numeric.
    return num_nom2(att1, att2);
  } else if (firstIsNumeric) {
    // att2 is nominal, att1 numeric: swap the argument order.
    return num_nom2(att2, att1);
  }
  return nom_nom(att1, att2);
}
/**
 * Computes the symmetrical uncertainty between two nominal (or discretised)
 * attributes from their contingency table. Each attribute gets one extra
 * value slot for missing values; unless option -M is set, missing counts
 * are redistributed across the known values in proportion to their
 * observed frequencies before the measure is computed.
 *
 * NOTE(review): this method is TRUNCATED in this chunk — the final return
 * statement and closing brace were lost to web-scrape residue below.
 *
 * @param att1 index of the first attribute
 * @param att2 index of the second attribute
 */
private double symmUncertCorr (int att1, int att2) {
  int i, j, k, ii, jj;      // NOTE(review): k is never used in the visible code
  int nnj, nni, ni, nj;     // NOTE(review): nni/nnj are never used in the visible code
  double sum = 0.0;         // grand total of the contingency table
  double sumi[], sumj[];    // row totals (att1 values) and column totals (att2 values)
  double counts[][];        // contingency table: counts[att1 value][att2 value]
  Instance inst;
  double corr_measure;
  boolean flag = false;     // true when one of the attributes is the class
  double temp = 0.0;
  if (att1 == m_classIndex || att2 == m_classIndex) {
    flag = true;
  }
  // One extra row/column per attribute to collect missing-value counts.
  ni = m_trainInstances.attribute(att1).numValues() + 1;
  nj = m_trainInstances.attribute(att2).numValues() + 1;
  counts = new double[ni][nj];
  sumi = new double[ni];
  sumj = new double[nj];
  for (i = 0; i < ni; i++) {
    sumi[i] = 0.0;
    for (j = 0; j < nj; j++) {
      sumj[j] = 0.0;
      counts[i][j] = 0.0;
    }
  }
  // Fill the contingency table; missing values go in the last row/column.
  for (i = 0; i < m_numInstances; i++) {
    inst = m_trainInstances.instance(i);
    if (inst.isMissing(att1)) {
      ii = ni - 1;
    }
    else {
      ii = (int)inst.value(att1);
    }
    if (inst.isMissing(att2)) {
      jj = nj - 1;
    }
    else {
      jj = (int)inst.value(att2);
    }
    counts[ii][jj]++;
  }
  // get the row totals (and the grand total)
  for (i = 0; i < ni; i++) {
    sumi[i] = 0.0;
    for (j = 0; j < nj; j++) {
      sumi[i] += counts[i][j];
      sum += counts[i][j];
    }
  }
  // get the column totals
  for (j = 0; j < nj; j++) {
    sumj[j] = 0.0;
    for (i = 0; i < ni; i++) {
      sumj[j] += counts[i][j];
    }
  }
  // Redistribute missing counts proportionally, unless -M is set or either
  // attribute is missing on every instance.
  if (!m_missingSeperate &&
    (sumi[ni-1] < m_numInstances) &&
    (sumj[nj-1] < m_numInstances)) {
    double[] i_copy = new double[sumi.length];
    double[] j_copy = new double[sumj.length];
    double[][] counts_copy = new double[sumi.length][sumj.length];
    // Snapshot the table before redistribution, since the originals mutate below.
    for (i = 0; i < ni; i++) {
      System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length);
    }
    System.arraycopy(sumi, 0, i_copy, 0, sumi.length);
    System.arraycopy(sumj, 0, j_copy, 0, sumj.length);
    // Inclusion-exclusion: instances missing att1, att2, or both.
    double total_missing =
      (sumi[ni - 1] + sumj[nj - 1] - counts[ni - 1][nj - 1]);
    // do the missing i's (att1 missing, att2 known)
    if (sumi[ni - 1] > 0.0) {
      for (j = 0; j < nj - 1; j++) {
        if (counts[ni - 1][j] > 0.0) {
          for (i = 0; i < ni - 1; i++) {
            // Spread this cell across known att1 values, weighted by row frequency.
            temp = ((i_copy[i]/(sum - i_copy[ni - 1]))*counts[ni - 1][j]);
            counts[i][j] += temp;
            sumi[i] += temp;
          }
          counts[ni - 1][j] = 0.0;
        }
      }
    }
    sumi[ni - 1] = 0.0;
    // do the missing j's (att2 missing, att1 known)
    if (sumj[nj - 1] > 0.0) {
      for (i = 0; i < ni - 1; i++) {
        if (counts[i][nj - 1] > 0.0) {
          for (j = 0; j < nj - 1; j++) {
            // Spread this cell across known att2 values, weighted by column frequency.
            temp = ((j_copy[j]/(sum - j_copy[nj - 1]))*counts[i][nj - 1]);
            counts[i][j] += temp;
            sumj[j] += temp;
          }
          counts[i][nj - 1] = 0.0;
        }
      }
    }
    sumj[nj - 1] = 0.0;
    // do the both missing (spread by joint frequency of the known cells)
    if (counts[ni - 1][nj - 1] > 0.0 && total_missing != sum) {
      for (i = 0; i < ni - 1; i++) {
        for (j = 0; j < nj - 1; j++) {
          temp = (counts_copy[i][j]/(sum - total_missing)) *
            counts_copy[ni - 1][nj - 1];
          counts[i][j] += temp;
          sumi[i] += temp;
          sumj[j] += temp;
        }
      }
      counts[ni - 1][nj - 1] = 0.0;
    }
  }
  // corr_measure = Correlate.symm_uncert(counts,sumi,sumj,sum,ni,nj,flag);
  corr_measure = ContingencyTables.symmetricalUncertainty(counts);
  // corr_measure = ContingencyTables.gainRatio(counts);
// NOTE(review): the remainder of this file — the final return of
// symmUncertCorr and the rest of the CfsSubsetEval class — was lost to
// web-scrape residue (Chinese hotkey-help page text appeared here).
// Recover the missing code from the original distribution before compiling.