?? splitscore.java
字號:
package shared;
import java.lang.*;
import java.io.*;
/** A class for determining, holding, and returning the information associated
* with an attribute split. Uses methods from the Entropy class for the
* determination of the scores.
* The scores for a specific item are cached in an internal cache structure.
* This structure contains the entropy, conditional entropy, split entropy,
* mutual information, gain ratio, split distribution, label distribution, and
* the number of instances that these scores have been evaluated on. Access to
* a score that is already cached is constant time, while a score that needs to
* be refreshed takes longer because it has to be recalculated.
* @author James Louis Initial Java implementation.
* @author Eric Eros Initial revision 7/08/96
*/
public class SplitScore {
//ENUMS
// Byte codes standing in for a SplitScoreCriterion enumeration. Each code is
// also the index of its display name in splitScoreCriterionEnum.
/** Value indicating how splits are scored: code 0, named "mutualInfo".
*/
public static final byte mutualInfo = 0; /* SplitScoreCriterion enum */
/** Value indicating how splits are scored: code 1, named
* "normalizedMutualInfo". This is the initial value of
* {@link #defaultSplitScoreCriterion}.
*/
public static final byte normalizedMutualInfo = 1;/* */
/** Value indicating how splits are scored: code 2, named "gainRatio".
*/
public static final byte gainRatio = 2; /* */
/** Value indicating how splits are scored: code 3, named "mutualInfoRatio".
*/
public static final byte mutualInfoRatio = 3; /* */
/** Value indicating how splits are scored: code 4, named "externalScore".
*/
public static final byte externalScore = 4; /* */
//END ENUM
/** Default criterion for determining the score of a particular split. The
* no-argument constructor copies this value into each new SplitScore.
* The field is not final, so it can be reassigned at run time.
* @see #mutualInfo
* @see #normalizedMutualInfo
* @see #gainRatio
* @see #mutualInfoRatio
* @see #externalScore
*/
public static byte defaultSplitScoreCriterion = normalizedMutualInfo;
/** String names for each form of split criterion, indexed by the criterion
* code (index 0 is "mutualInfo", index 4 is "externalScore"). **/
public static String[] splitScoreCriterionEnum = {"mutualInfo",
"normalizedMutualInfo","gainRatio","mutualInfoRatio","externalScore"};
/***************************************************************************
*Cache structure for storing scores from sets of instances. Stores the
*information for the entropy, conditional entropy, split entropy,
*mutual information, gain ratio, split distribution, label distribution,
*and the number of instances that these scores have been evaluated on.
**************************************************************************/
private class CacheStruct {
/** Mutual Information calculation. **/
public double mutualInfo;
/** Distribution of attribute splits over the set of instances. **/
public double[] splitDist;
/** Distribution of labels over the set of instances. **/
public double[] labelDist;
/** The total weight of the set of instances. **/
public double totalWeight;
/** The entropy calculation. **/
public double entropy;
/** The gain ratio calculation. **/
public double gainRatio;
/** The entropy for the split examined. **/
public double splitEntropy;
/** The conditional entropy calculation for the split examined. **/
public double condEntropy;
/** Cache Constructor.
*/
public CacheStruct() {
totalWeight = Globals.UNDEFINED_REAL;
entropy = Globals.UNDEFINED_REAL;
mutualInfo = Globals.UNDEFINED_REAL;
condEntropy = Globals.UNDEFINED_REAL;
gainRatio = Globals.UNDEFINED_REAL;
splitEntropy = Globals.UNDEFINED_REAL;
splitDist = null;
labelDist = null;
}
/** Copies the specified CacheStruct to this CacheStruct instance.
* @param source The CacheStruct to be copied from.
*/
public void copy(CacheStruct source) {
totalWeight = source.totalWeight;
entropy = source.entropy;
mutualInfo = source.mutualInfo;
condEntropy = source.condEntropy;
gainRatio = source.gainRatio;
splitEntropy = source.splitEntropy;
splitDist = source.splitDist;
labelDist = source.labelDist;
}
/** Cache copy constructor.
* @param source The CacheStruct to be copied from.
*/
public CacheStruct(CacheStruct source) {
copy(source);
}
/** Assignment of specified CacheStruct to this CacheStruct instance.
* @param source The CacheStruct to be copied.
* @return This CacheStruct.
*/
public CacheStruct assign(CacheStruct source) {
copy(source);
return this;
}
}
/** Cache of calculated data. Allocated by both constructors. **/
private CacheStruct cache;
/** A matrix of labels (rows) by splits (columns). Null until a distribution
* has been supplied. **/
private double[][] splitAndLabelDist; //row, column
/** Indicator of whether the cache is filled with calculations. TRUE indicates
*the values are in place, FALSE otherwise. **/
private boolean validCache;
/** Type of criterion used for examining the attribute split; one of the
* byte criterion codes declared in this class. **/
private byte splitScoreCriterion;
/** Score value stored alongside the computed ones. The copy constructor
* assigns it only after calling reset().
* NOTE(review): presumably the score used with the externalScore
* criterion -- confirm against the rest of the class. **/
private double theExternalScore;
/** Logging options for this class. **/
protected LogOptions logOptions = new LogOptions();
/**
 * Sets the logging level for this object by forwarding it to the
 * LogOptions instance held in logOptions.
 * @param level The new logging level.
 */
public void set_log_level(int level) {
    this.logOptions.set_log_level(level);
}
/**
 * Returns the logging level for this object, as reported by the
 * LogOptions instance held in logOptions.
 * @return The level of logging in this class.
 */
public int get_log_level() {
    return this.logOptions.get_log_level();
}
/**
 * Sets the stream to which log output is written by forwarding it to the
 * LogOptions instance held in logOptions.
 * @param strm The stream to which logs will be written.
 */
public void set_log_stream(Writer strm) {
    this.logOptions.set_log_stream(strm);
}
/**
 * Returns the stream to which logs for this object are written, as
 * reported by the LogOptions instance held in logOptions.
 * @return The stream to which logs for this object are written.
 */
public Writer get_log_stream() {
    return this.logOptions.get_log_stream();
}
/**
 * Returns the LogOptions object for this object. The internal instance
 * itself is returned, not a copy.
 * @return The LogOptions object for this object.
 */
public LogOptions get_log_options() {
    return this.logOptions;
}
/**
 * Applies the given LogOptions to this object. The options are handed to
 * the existing LogOptions instance through its own set_log_options
 * method; the logOptions reference itself is not reassigned.
 * @param opt The new LogOptions object.
 */
public void set_log_options(LogOptions opt) {
    this.logOptions.set_log_options(opt);
}
/**
 * Sets the logging message prefix for this object by forwarding all four
 * values, unchanged and in the same order, to the LogOptions instance held
 * in logOptions.
 * @param file The file name to be displayed in the prefix of log messages.
 * @param line The line number to be displayed in the prefix of log messages.
 * @param lvl1 The log level of the statement being logged.
 * @param lvl2 The level of log messages being displayed.
 */
public void set_log_prefixes(String file, int line, int lvl1, int lvl2) {
    this.logOptions.set_log_prefixes(file, line, lvl1, lvl2);
}
/** Copy Constructor.
* @param source The SplitScore to be copied.
*/
public SplitScore(SplitScore source) {
cache = new CacheStruct();
splitAndLabelDist = null;
cache.splitDist = null;
cache.labelDist = null;
reset();
splitScoreCriterion = source.splitScoreCriterion;
copy_split_and_label_dist(source);
// Must initialize after reset
theExternalScore = source.theExternalScore;
}
/** Constructor.
*/
public SplitScore() {
cache = new CacheStruct();
splitAndLabelDist = null;
cache.splitDist = null;
cache.labelDist = null;
reset();
splitScoreCriterion = defaultSplitScoreCriterion;
}
/**
 * Returns the mutual info (information gain) score for the split this
 * SplitScore object represents, without normalization. Kept as a separate
 * method to avoid a JVM error. The value is computed with
 * Entropy.mutual_info and stored in the cache the first time it is needed;
 * later calls return the cached value.
 * @return The unnormalized mutual info for this split.
 */
public double get_unnormalized_mutual_info() {
    valid_cache(); // Percolate validCache to the cache members.

    // Already computed: hand back the cached value.
    if (cache.mutualInfo != Globals.UNDEFINED_REAL) {
        return cache.mutualInfo;
    }
    // Nothing to compute from: the cached value stays as it is.
    if (!has_distribution(true)) {
        return cache.mutualInfo;
    }

    cache.mutualInfo =
        Entropy.mutual_info(get_entropy(), get_split_and_label_dist(),
                            get_split_dist(), total_weight());
    return cache.mutualInfo;
}
/**
 * Returns the mutual info (information gain) score for the split this
 * SplitScore object represents. The raw value is computed with
 * Entropy.mutual_info and stored in the cache the first time it is needed.
 * Normalization is applied to the returned value only; the cache always
 * keeps the unnormalized score.
 * @param normalize TRUE if normalization is requested, FALSE otherwise.
 * @return The mutual info value for this split, passed through
 *         normalize_by_num_splits when normalize is TRUE.
 */
public double get_mutual_info(boolean normalize) {
    valid_cache(); // Percolate validCache to the cache members.

    final boolean notYetComputed = cache.mutualInfo == Globals.UNDEFINED_REAL;
    if (notYetComputed) {
        if (has_distribution(true)) {
            cache.mutualInfo =
                Entropy.mutual_info(get_entropy(), get_split_and_label_dist(),
                                    get_split_dist(), total_weight());
        }
    }

    if (!normalize) {
        return cache.mutualInfo;
    }
    return normalize_by_num_splits(cache.mutualInfo);
}
/**
 * Normalizes a score by the number of splits. With three or more splits
 * the score is divided by Entropy.log_bin(number of splits); with one or
 * two splits it is returned unchanged. A split count that is not greater
 * than zero is reported through Error.err. The split count comes from
 * num_splits(), which may update the cache.
 * NOTE(review): log_bin is presumably the base-2 logarithm -- confirm in
 * Entropy.
 *
 * @param score The score to be normalized.
 * @return The normalized score value.
 */
public double normalize_by_num_splits(double score) {
    final int splitCount = num_splits();
    if (splitCount <= 0) {
        Error.err("SplitScore::normalize_by_num_splits: number of splits "+
                  "not greater than 0: " + num_splits() + "-->fatal_error");
    }
    // There may be only one value, and it is still useful because of
    // unknowns, so one or two splits are treated as two: the divisor would
    // be log_2(2) == 1 and the score is left as it is.
    return (splitCount >= 3) ? score / Entropy.log_bin(splitCount) : score;
}
/**
 * Returns the number of splits, not including unknowns. The count is the
 * length of the split distribution minus one, because array index zero is
 * reserved for unknown values. Calling this method may fill the cached
 * split distribution.
 * @return The number of splits, or Globals.UNDEFINED_INT when
 *         has_distribution(true) reports that there is no distribution.
 */
public int num_splits() {
    valid_cache(); // Percolate validCache to the cache members.

    if (!has_distribution(true)) {
        return Globals.UNDEFINED_INT;
    }

    get_split_dist(); // Ensure the cached distribution is non-null.
    // Unknown values live at index zero, so only the remaining entries are
    // actual split values.
    return cache.splitDist.length - 1;
}
/**
 * Returns the split distribution, calculating it from the split and label
 * distribution on first use. The result has one entry per column of
 * splitAndLabelDist and is filled by Matrix.sum_cols, then kept in the
 * cache for later calls. If no split and label distribution has been set,
 * the problem is reported through Error.err.
 * NOTE(review): sum_cols presumably stores each column's total in the
 * matching entry -- confirm in Matrix.
 *
 * @return The split distribution (the cached array itself, not a copy).
 */
public double[] get_split_dist() {
    valid_cache(); // Percolate validCache to the cache members.

    if (cache.splitDist == null) {
        if (has_distribution(false)) {
            final int splitColumns = splitAndLabelDist[0].length;
            cache.splitDist = new double[splitColumns];
            Matrix.sum_cols(cache.splitDist, splitAndLabelDist);
        } else {
            Error.err("SplitScore::get_split_dist: splitAndLabelDist has not "+
                      "been set-->fatal_error");
        }
    }
    return cache.splitDist;
}
/** Checks if there exists a splitAndLabel distribution.
?? 快捷鍵說明
復(fù)制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -