?? splitscore.java
字號:
*
* @return TRUE if there is a splitAndLabel distribution, FALSE otherwise.
*/
public boolean has_distribution() {
    // Delegate to the one-argument overload, asking it to report an error
    // when no splitAndLabel distribution is present.
    final boolean reportIfMissing = true;
    return has_distribution(reportIfMissing);
}
/**
 * Reports whether a splitAndLabel distribution is currently held.
 *
 * @param fatalOnFalse TRUE if Error.err is to be called when there is no
 *                     splitAndLabel distribution.
 * @return TRUE if splitAndLabelDist is non-null, FALSE otherwise.
 */
public boolean has_distribution(boolean fatalOnFalse) {
    final boolean present = (splitAndLabelDist != null);
    if (!present && fatalOnFalse) {
        Error.err("SplitScore::has_distribution: no distribution-->"+
                  "fatal_error");
    }
    return present;
}
/**
 * Fetches the conditional entropy held in the cache, computing and storing
 * it first when the cached value is still undefined and a distribution is
 * available. This method updates the cache.
 *
 * @return The condEntropy stored in the cache.
 */
public double get_cond_entropy() {
    valid_cache(); // Percolate validCache to the cache members.

    // Nothing to compute: the value is already cached, or there is no
    // distribution to compute it from (has_distribution(true) reports that).
    if (cache.condEntropy != Globals.UNDEFINED_REAL || !has_distribution(true)) {
        return cache.condEntropy;
    }

    cache.condEntropy = Entropy.cond_entropy(get_split_and_label_dist(),
                                             get_split_dist(),
                                             total_weight());
    return cache.condEntropy;
}
/**
 * Brings the cache members in line with the validCache flag. If the flag is
 * set and a distribution is present, nothing is touched. Otherwise every
 * cached array is dropped, every cached number is marked undefined, and the
 * flag is set to TRUE.
 *
 * @return TRUE if the cache was already valid, FALSE if it has just been
 *         reset.
 */
private boolean valid_cache() {
    final boolean stale = !validCache || !has_distribution(false);
    if (stale) {
        // Invalidate the numeric data.
        cache.totalWeight = Globals.UNDEFINED_REAL;
        cache.entropy = Globals.UNDEFINED_REAL;
        cache.condEntropy = Globals.UNDEFINED_REAL;
        cache.mutualInfo = Globals.UNDEFINED_REAL;
        cache.gainRatio = Globals.UNDEFINED_REAL;
        cache.splitEntropy = Globals.UNDEFINED_REAL;
        // Drop the ancillary arrays.
        cache.splitDist = null;
        cache.labelDist = null;
        // The (now empty) cache is consistent again.
        validCache = true;
    }
    return !stale;
}
/**
 * Returns the total weight from the cache, computing it on first use.
 * When a split distribution is cached the weight is the sum of that array;
 * otherwise the label distribution is obtained through get_label_dist() and
 * summed instead. This method updates the cache.
 *
 * @return The total weight. If no splitAndLabel distribution is set,
 *         Error.err is called and the (still undefined) cached value is
 *         returned should that call return.
 */
public double total_weight() {
    valid_cache(); // Percolate validCache to the cache members.
    if (cache.totalWeight != Globals.UNDEFINED_REAL)
        return cache.totalWeight;

    // Query without the built-in error report so that the diagnostic below,
    // which names this method, is the one emitted (get_label_dist checks
    // the distribution the same way).
    if (!has_distribution(false)) {
        Error.err("SplitScore::total_weight: splitAndLabelDist not yet "+
                  "set-->fatal_error");
    } else if (cache.splitDist != null) {
        cache.totalWeight = MLJArray.sum(cache.splitDist);
    } else {
        // Called for its side effect of populating cache.labelDist.
        get_label_dist();
        cache.totalWeight = MLJArray.sum(cache.labelDist);
    }
    return cache.totalWeight;
}
/**
 * Returns the label distribution, deriving it from the split and label
 * distribution when it is not already cached. This method updates the cache.
 *
 * @return The label distribution held in the cache.
 */
public double[] get_label_dist() {
    valid_cache(); // Percolate validCache to the cache members.

    if (cache.labelDist == null) {
        if (has_distribution(false)) {
            // One entry per first-dimension element of splitAndLabelDist.
            // The array is stored in the cache before Matrix.sum_rows is
            // handed it together with splitAndLabelDist.
            final double[] sums = new double[splitAndLabelDist.length];
            cache.labelDist = sums;
            Matrix.sum_rows(sums, splitAndLabelDist);
        } else {
            Error.err("SplitScore::get_label_dist: splitAndLabelDist has not "+
                      "been set-->fatal_error");
        }
    }
    return cache.labelDist;
}
/**
 * Hands back the splitAndLabel distribution array itself (not a copy).
 * Error.err is called when the array has not been allocated.
 *
 * @return The splitAndLabel distribution array.
 */
public double[][] get_split_and_label_dist() {
    valid_cache();
    final boolean allocated = (splitAndLabelDist != null);
    if (!allocated) {
        Error.err("SplitScore::get_split_and_label_dist: Array has not "+
                  "been allocated-->fatal_error");
    }
    return splitAndLabelDist;
}
/**
 * Fetches the entropy held in the cache, computing it from the label
 * distribution first when the cached value is still undefined and a
 * distribution is available. This method updates the cache.
 *
 * @return The entropy stored in the cache.
 */
public double get_entropy() {
    valid_cache(); // Percolate validCache to the cache members.

    // Already cached, or no distribution to compute from.
    if (cache.entropy != Globals.UNDEFINED_REAL || !has_distribution(true)) {
        return cache.entropy;
    }

    final double[] labels = get_label_dist();
    cache.entropy = Entropy.entropy(labels);
    return cache.entropy;
}
/**
 * Verifies that a value exceeds its lower bound by more than
 * MLJ.realEpsilon, calling Error.err otherwise.
 *
 * @param lhs              The value being checked.
 * @param rhs              The lower bound the value must exceed.
 * @param additionalErrMsg Text placed at the start of the error message.
 */
private void verify_strictly_greater(double lhs, double rhs,
                                     String additionalErrMsg) {//basicCore class function
    if (lhs <= (rhs + (MLJ.realEpsilon))) {
        // Report the offending value as well as the bound, and keep the
        // words "its" and "lower" separated.
        Error.err(additionalErrMsg + "\n verify_strictly_greater(Real): "+
                  "variable (" + lhs + ") is not at least " + MLJ.realEpsilon +
                  " greater than its lower bound (" + rhs + ")-->fatal_error");
    }
}
/**
 * Computes the score selected by the current split score criterion:
 * mutualInfo and normalizedMutualInfo use get_mutual_info (called with
 * false and true respectively), gainRatio uses get_gain_ratio,
 * mutualInfoRatio uses get_mutual_info_ratio, and externalScore uses
 * get_external_score. Any other criterion value is reported through
 * Error.err. This method updates the cache.
 *
 * @return The score for the split, or 0 if the criterion is not recognised
 *         and Error.err returns.
 */
public double score() {
    final byte criterion = get_split_score_criterion();

    if (criterion == mutualInfo) {
        return get_mutual_info(false);
    }
    if (criterion == normalizedMutualInfo) {
        return get_mutual_info(true);
    }
    if (criterion == gainRatio) {
        return get_gain_ratio();
    }
    if (criterion == mutualInfoRatio) {
        return get_mutual_info_ratio();
    }
    if (criterion == externalScore) {
        return get_external_score();
    }

    Error.err("SplitScore::score: split score criterion of " +
              get_split_score_criterion() +
              " is out of range-->fatal_error");
    return 0;
}
/**
 * Scores a caller-supplied set of distributions. Intended for cases where
 * scores are computed many times for the same number of instances and
 * entropy, for instance when determining the best threshold for a split.
 * The supplied arrays are handed to this object for the duration of the
 * call and released again before returning; the previous distribution
 * reference, cache reference, validity flag and external score are then
 * put back.
 *
 * @param sAndLDist The split and label distribution.
 * @param sDist The split distribution, or null if not supplied.
 * @param lDist The label distribution, or null if not supplied.
 * @param passedEntropy The entropy value for this split; ignored when it
 *                      equals Globals.UNDEFINED_REAL.
 * @param passedWeight The weight of instances for this split.
 * @return The score for this split distribution.
 * @see Entropy#find_best_threshold
 */
public double score(double[][] sAndLDist, double[] sDist,
                    double[] lDist, double passedEntropy,
                    double passedWeight) {
    // Save both the current cache and the splitAndLabelDist reference and
    // restore them prior to returning. Note: the cache saves the
    // references to the old dists, not the dists themselves.
    // NOTE(review): oldCache refers to the same CacheStruct object whose
    // fields are overwritten below, so "cache = oldCache" does not undo
    // those writes -- confirm whether a copy of the cache was intended.
    double theOldExternalScore = theExternalScore;
    boolean oldValidCache = validCache;
    double[][] oldSplitAndLabelDist = splitAndLabelDist;
    CacheStruct oldCache = cache;

    // Detach the current distributions, then hand over the supplied ones.
    splitAndLabelDist = null;
    cache.splitDist = null;
    cache.labelDist = null;
    set_split_and_label_dist(sAndLDist);
    if (sDist != null) {
        set_split_dist(sDist);
    }
    if (lDist != null) {
        // The label distribution goes to the label setter; it was
        // previously passed to set_split_dist, overwriting sDist.
        // Assumes set_label_dist exists as the counterpart of
        // release_label_dist -- TODO confirm.
        set_label_dist(lDist);
    }
    valid_cache();

    // Seed the cache with the values the caller already knows.
    if (passedEntropy != Globals.UNDEFINED_REAL) {
        cache.entropy = passedEntropy;
    }
    cache.totalWeight = passedWeight;

    double theScore = score();

    // Release the supplied arrays back to the invoker; the returned
    // references are not needed here.
    if (sDist != null) {
        release_split_dist();
    }
    if (lDist != null) {
        release_label_dist();
    }
    release_split_and_label_dist();

    // Restore
    cache = oldCache;
    splitAndLabelDist = oldSplitAndLabelDist;
    validCache = oldValidCache;
    theExternalScore = theOldExternalScore;
    return theScore;
}
/**
 * Reports which criterion is currently used to score splits.
 *
 * @return The value of splitScoreCriterion.
 * @see #mutualInfo
 * @see #normalizedMutualInfo
 * @see #gainRatio
 * @see #mutualInfoRatio
 * @see #externalScore
 */
public byte get_split_score_criterion() {
    return this.splitScoreCriterion;
}
/**
 * Returns the score value that was set externally. Error.err is called
 * when no score has been set, or when a score is set but splitAndLabelDist
 * is null.
 *
 * @return The externally set score value.
 */
public double get_external_score() {
    if (theExternalScore == Globals.UNDEFINED_REAL) {
        Error.err("SplitScore::get_external_score: no score set-->"+
                  "fatal_error");
    } else if (splitAndLabelDist == null) {
        // A score exists but the distribution it belongs to is gone.
        Error.err("SplitScore::get_external_score: splitAndLabelDist was "+
                  "deleted without theExternalScore being invalidated-->fatal_error");
    }
    return theExternalScore;
}
/**
 * Returns the mutual information ratio: the mutual information (obtained
 * with get_mutual_info(false)) divided by the entropy. The entropy is first
 * checked with MLJ.verify_strictly_greater against 0, and the quotient is
 * passed through MLJ.clamp_to_range with bounds 0 and 1 before being
 * returned. This method updates the cache.
 *
 * @return Mutual information ratio.
 */
public double get_mutual_info_ratio() {
    final double entropyValue = get_entropy();
    MLJ.verify_strictly_greater(entropyValue, 0,
        "SplitScore::get_mutual_info_ratio: Need to divide by entropy, which "+
        "is too small");

    final double information = get_mutual_info(false);
    final DoubleRef quotient = new DoubleRef(information / entropyValue);
    MLJ.clamp_to_range(quotient, 0, 1, "SplitScore::get_mutual_info_ratio: "+
        "ratio not in required range [0, 1]");
    return quotient.value;
}
/** Determines, and returns, cache.gainRatio. This method updates the cache.
*
* @return The gainRatio stored in the cache.
*/
public double get_gain_ratio() {
valid_cache(); // Percolate validCache to the cache members.
double gain = cache.gainRatio;
if (cache.gainRatio == Globals.UNDEFINED_REAL && has_distribution(true)) {
double numerator = get_mutual_info(false);
double divisor = get_split_entropy();
// If the divisor is zero, we abort.
if (MLJ.approx_equal(divisor, 0.0))
Error.err("SplitScore::get_gain_ratio: split entropy (" + divisor +
") is too close to zero for division. Split and Label Dist is: " +
?? 快捷鍵說明
複製代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -