/**
*
* AgentAcademy - an open source Data Mining framework for
* training intelligent agents
*
* Copyright (C) 2001-2003 AA Consortium.
*
* This library is open source software; you can redistribute it
* and/or modify it under the terms of the GNU Lesser General
* Public License as published by the Free Software Foundation;
* either version 2.0 of the License, or (at your option) any later
* version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*
*/
package org.agentacademy.modules.dataminer.attributeSelection;
import java.util.Enumeration;
import java.util.Vector;
import org.agentacademy.modules.dataminer.core.Attribute;
import org.agentacademy.modules.dataminer.core.FastVector;
import org.agentacademy.modules.dataminer.core.Instance;
import org.agentacademy.modules.dataminer.core.Instances;
import org.agentacademy.modules.dataminer.core.Matrix;
import org.agentacademy.modules.dataminer.core.Option;
import org.agentacademy.modules.dataminer.core.OptionHandler;
import org.agentacademy.modules.dataminer.core.SparseInstance;
import org.agentacademy.modules.dataminer.core.Utils;
import org.agentacademy.modules.dataminer.filters.AttributeFilter;
import org.agentacademy.modules.dataminer.filters.Filter;
import org.agentacademy.modules.dataminer.filters.NominalToBinaryFilter;
import org.agentacademy.modules.dataminer.filters.NormalizationFilter;
import org.agentacademy.modules.dataminer.filters.ReplaceMissingValuesFilter;
import org.apache.log4j.Logger;
/**
* Class for performing principal components analysis/transformation.
*
* @author Mark Hall (mhall@cs.waikato.ac.nz)
* @author Gabi Schmidberger (gabi@cs.waikato.ac.nz)
* @version $Revision: 1.3 $
*/
public class PrincipalComponents extends AttributeEvaluator
implements AttributeTransformer, OptionHandler {
public static Logger log = Logger.getLogger(PrincipalComponents.class);
/** The data to analyse/transform */
private Instances m_trainInstances;
/** Keep a copy for the class attribute (if set) */
private Instances m_trainCopy;
/** The header for the transformed data format */
private Instances m_transformedFormat;
/** The header for data transformed back to the original space */
private Instances m_originalSpaceFormat;
/** Data has a class set */
private boolean m_hasClass;
/** Class index */
private int m_classIndex;
/** Number of attributes */
private int m_numAttribs;
/** Number of instances */
private int m_numInstances;
/** Correlation matrix for the original data */
private double [][] m_correlation;
/** Will hold the unordered linear transformations of the (normalized)
original data */
private double [][] m_eigenvectors;
/** Eigenvalues for the corresponding eigenvectors */
private double [] m_eigenvalues = null;
/** Indices of the eigenvalues, sorted in ascending order of eigenvalue */
private int [] m_sortedEigens;
/** sum of the eigenvalues */
private double m_sumOfEigenValues = 0.0;
/** Filters for original data */
private ReplaceMissingValuesFilter m_replaceMissingFilter;
private NormalizationFilter m_normalizeFilter;
private NominalToBinaryFilter m_nominalToBinFilter;
private AttributeFilter m_attributeFilter;
/** used to remove the class column if a class column is set */
private AttributeFilter m_attribFilter;
/** The number of attributes in the pc transformed data */
private int m_outputNumAtts = -1;
/** normalize the input data? */
private boolean m_normalize = true;
/** the amount of variance to cover in the original data when
retaining the best n PC's */
private double m_coverVariance = 0.95;
/** transform the data through the PC space and back to the original
space? */
private boolean m_transBackToOriginal = false;
/** holds the transposed eigenvectors for converting back to the
original space */
private double [][] m_eTranspose;
/**
* Returns a string describing this attribute transformer
* @return a description of the evaluator suitable for
* displaying in the explorer/experimenter gui
*/
public String globalInfo() {
return "Performs a principal components analysis and transformation of "
+"the data. Use in conjunction with a Ranker search. Dimensionality "
+"reduction is accomplished by choosing enough eigenvectors to "
+"account for some percentage of the variance in the original data---"
+"default 0.95 (95%). Attribute noise can be filtered by transforming "
+"to the PC space, eliminating some of the worst eigenvectors, and "
+"then transforming back to the original space.";
}
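/*
* A minimal usage sketch (added for illustration, not part of the original
* class; the Instances object "data" is assumed to be loaded elsewhere):
*
*   PrincipalComponents pca = new PrincipalComponents();
*   pca.setVarianceCovered(0.95);          // keep enough PCs for 95% of the variance
*   pca.setNormalize(true);                // standardize the input first
*   pca.setTransformBackToOriginal(false); // stay in the PC space
*   pca.buildEvaluator(data);              // runs the analysis implemented below
*/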
/**
* Returns an enumeration describing the available options. <p>
*
* -D
* Don't normalize the input data. <p>
*
* -R <proportion>
* Retain enough PC attributes to account for this proportion of
* variance in the original data. (default = 0.95) <p>
*
* -O
* Transform through the PC space and back to the original space. <p>
*
* @return an enumeration of all the available options.
**/
public Enumeration listOptions () {
Vector newVector = new Vector(3);
newVector.addElement(new Option("\tDon't normalize input data."
, "D", 0, "-D"));
newVector.addElement(new Option("\tRetain enough PC attributes to account "
+"\n\tfor this proportion of variance in "
+"the original data. (default = 0.95)",
"R",1,"-R"));
newVector.addElement(new Option("\tTransform through the PC space and "
+"\n\tback to the original space."
, "O", 0, "-O"));
return newVector.elements();
}
/**
* Parses a given list of options.
*
* Valid options are:<p>
*
* -D
* Don't normalize the input data. <p>
*
* -R <proportion>
* Retain enough PC attributes to account for this proportion of
* variance in the original data. (default = 0.95) <p>
*
* -O
* Transform through the PC space and back to the original space. <p>
*
* @param options the list of options as an array of strings
* @exception Exception if an option is not supported
*/
public void setOptions (String[] options)
throws Exception
{
resetOptions();
String optionString;
optionString = Utils.getOption('R', options);
if (optionString.length() != 0) {
Double temp;
temp = Double.valueOf(optionString);
setVarianceCovered(temp.doubleValue());
}
setNormalize(!Utils.getFlag('D', options));
setTransformBackToOriginal(Utils.getFlag('O', options));
}
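/*
* Illustrative note (added): the same configuration can be expressed as an
* option array, e.g. setOptions(new String[] {"-R", "0.95", "-O"}), which is
* equivalent to calling setVarianceCovered(0.95) and
* setTransformBackToOriginal(true); adding "-D" would switch normalization off.
*/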
/**
* Reset to defaults
*/
private void resetOptions() {
m_coverVariance = 0.95;
m_normalize = true;
m_sumOfEigenValues = 0.0;
m_transBackToOriginal = false;
}
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String normalizeTipText() {
return "Normalize input data.";
}
/**
* Set whether input data will be normalized.
* @param n true if input data is to be normalized
*/
public void setNormalize(boolean n) {
m_normalize = n;
}
/**
* Gets whether or not input data is to be normalized
* @return true if input data is to be normalized
*/
public boolean getNormalize() {
return m_normalize;
}
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String varianceCoveredTipText() {
return "Retain enough PC attributes to account for this proportion of "
+"variance.";
}
/**
* Sets the amount of variance to account for when retaining
* principal components
* @param vc the proportion of total variance to account for
*/
public void setVarianceCovered(double vc) {
m_coverVariance = vc;
}
/**
* Gets the proportion of total variance to account for when
* retaining principal components
* @return the proportion of variance to account for
*/
public double getVarianceCovered() {
return m_coverVariance;
}
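/*
* Illustrative note (added): the retained components are the top-ranked
* eigenvectors whose eigenvalues together reach at least
* m_coverVariance * m_sumOfEigenValues. For example, with eigenvalues
* {4, 3, 2, 1} (sum 10) and varianceCovered = 0.95, the first three
* components only cover 0.9 of the variance, so all four are kept.
*/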
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String transformBackToOriginalTipText() {
return "Transform through the PC space and back to the original space. "
+"If only the best n PCs are retained (by setting varianceCovered < 1) "
+"then this option will give a dataset in the original space but with "
+"less attribute noise.";
}
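/*
* Sketch of the idea (added): with the retained eigenvectors collected in a
* matrix W, a preprocessed instance x is projected into the PC space as x * W
* and mapped back via the transpose held in m_eTranspose, giving x * W * W^T;
* the directions with small eigenvalues -- mostly attribute noise -- are dropped.
*/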
/**
* Sets whether the data should be transformed back to the original
* space
* @param b true if the data should be transformed back to the
* original space
*/
public void setTransformBackToOriginal(boolean b) {
m_transBackToOriginal = b;
}
/**
* Gets whether the data is to be transformed back to the original
* space.
* @return true if the data is to be transformed back to the original space
*/
public boolean getTransformBackToOriginal() {
return m_transBackToOriginal;
}
/**
* Gets the current settings of PrincipalComponents
*
* @return an array of strings suitable for passing to setOptions()
*/
public String[] getOptions () {
String[] options = new String[4];
int current = 0;
if (!getNormalize()) {
options[current++] = "-D";
}
options[current++] = "-R"; options[current++] = ""+getVarianceCovered();
if (getTransformBackToOriginal()) {
options[current++] = "-O";
}
while (current < options.length) {
options[current++] = "";
}
return options;
}
/**
* Initializes principal components and performs the analysis
* @param data the instances to analyse/transform
* @exception Exception if analysis fails
*/
public void buildEvaluator(Instances data) throws Exception {
buildAttributeConstructor(data);
}
private void buildAttributeConstructor (Instances data) throws Exception {
m_eigenvalues = null;
m_outputNumAtts = -1;
m_attributeFilter = null;
m_nominalToBinFilter = null;
m_sumOfEigenValues = 0.0;
if (data.checkForStringAttributes()) {
throw new Exception("Can't handle string attributes!");
}
m_trainInstances = data;
// make a copy of the training data so that we can get the class
// column to append to the transformed data (if necessary)
m_trainCopy = new Instances(m_trainInstances);
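// preprocessing chain: replace missing values, optionally normalize the data,
// then convert nominal attributes to binary ones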
m_replaceMissingFilter = new ReplaceMissingValuesFilter();
m_replaceMissingFilter.setInputFormat(m_trainInstances);
m_trainInstances = Filter.useFilter(m_trainInstances,
m_replaceMissingFilter);
if (m_normalize) {
m_normalizeFilter = new NormalizationFilter();
m_normalizeFilter.setInputFormat(m_trainInstances);
m_trainInstances = Filter.useFilter(m_trainInstances, m_normalizeFilter);
}
m_nominalToBinFilter = new NominalToBinaryFilter();
m_nominalToBinFilter.setInputFormat(m_trainInstances);
m_trainInstances = Filter.useFilter(m_trainInstances,
m_nominalToBinFilter);
// delete any attributes that have only one distinct value or are all missing
Vector deleteCols = new Vector();
for (int i=0;i<m_trainInstances.numAttributes();i++) {
if (m_trainInstances.numDistinctValues(i) <=1) {
deleteCols.addElement(new Integer(i));
}
}
if (m_trainInstances.classIndex() >=0) {
// get rid of the class column
m_hasClass = true;
m_classIndex = m_trainInstances.classIndex();
deleteCols.addElement(new Integer(m_classIndex));
}
// remove columns from the data if necessary
if (deleteCols.size() > 0) {
m_attributeFilter = new AttributeFilter();
int [] todelete = new int [deleteCols.size()];
for (int i=0;i<deleteCols.size();i++) {
todelete[i] = ((Integer)(deleteCols.elementAt(i))).intValue();
}
m_attributeFilter.setAttributeIndicesArray(todelete);
m_attributeFilter.setInvertSelection(false);
m_attributeFilter.setInputFormat(m_trainInstances);
m_trainInstances = Filter.useFilter(m_trainInstances, m_attributeFilter);
}
m_numInstances = m_trainInstances.numInstances();
m_numAttribs = m_trainInstances.numAttributes();
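// compute the correlation matrix of the preprocessed data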
fillCorrelation();
double [] d = new double[m_numAttribs];
double [][] v = new double[m_numAttribs][m_numAttribs];
Matrix corr = new Matrix(m_correlation);
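// eigen-decompose the correlation matrix: v receives the eigenvectors,
// d the corresponding eigenvalues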
corr.eigenvalueDecomposition(v, d);
//if (debug) {
// Matrix V = new Matrix(v);
// boolean b = corr.testEigen(V, d, true);
// if (!b)
// System.out.println("Problem with eigenvektors!!!");
// else
// System.out.println("***** everything's fine !!!");
// }
m_eigenvectors = (double [][])v.clone();
m_eigenvalues = (double [])d.clone();
// any eigenvalues less than 0 are not worth anything --- change to 0
for (int i = 0; i < m_eigenvalues.length; i++) {
if (m_eigenvalues[i] < 0) {
m_eigenvalues[i] = 0.0;
}
}
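// Utils.sort returns an index permutation: the positions of the eigenvalues
// in ascending order of value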
m_sortedEigens = Utils.sort(m_eigenvalues);
m_sumOfEigenValues = Utils.sum(m_eigenvalues);
m_transformedFormat = setOutputFormat();
if (m_transBackToOriginal) {
m_originalSpaceFormat = setOutputFormatOriginal();
// new ordered eigenvector matrix
int numVectors = (m_transformedFormat.classIndex() < 0)
? m_transformedFormat.numAttributes()
: m_transformedFormat.numAttributes() - 1;