// entropy.java
* @param labelCount The count of each label found in the data.
* @return The entropy for the supplied label counts.
*/
public static double entropy(double[] labelCount) {
double sum = MLJArray.sum(labelCount);
if (MLJ.approx_equal(sum, 0.0))
return Globals.UNDEFINED_REAL;
return entropy(labelCount, sum);
}
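// Illustrative sketch (not part of the original source; assumes log_bin is the
// base-2 logarithm and that the enclosing class is named Entropy, as the
// assertion messages below suggest): a node split evenly between two labels has
// one bit of entropy, a pure node has zero, and a node with no weight yields
// Globals.UNDEFINED_REAL.
//
//   double h1 = Entropy.entropy(new double[]{5.0, 5.0});    // 1.0
//   double h2 = Entropy.entropy(new double[]{10.0, 0.0});   // 0.0
//   double h3 = Entropy.entropy(new double[]{0.0, 0.0});    // Globals.UNDEFINED_REAL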
/** Compute the entropy H(Y) for label Y. This overload takes explicit label
* counts and the precomputed total instance weight. If the total weight is not
* supplied, it is computed from the counts (slightly less efficient); that
* overload also behaves a bit differently, allowing nodes with zero instances
* and returning Globals.UNDEFINED_REAL for them, so the caller is not required
* to compute the sum just for this.
*
* @param labelCount The count of each label found in the data.
* @param totalInstanceWeight The total weight for all of the instances.
* @return The entropy for the supplied label counts.
*/
public static double entropy(double[] labelCount,double totalInstanceWeight) {
MLJ.verify_strictly_greater(totalInstanceWeight, 0, "entropy: totalInstanceWeight is too small");
if (Globals.DBG) {
double sum = MLJArray.sum(labelCount);
MLJ.verify_approx_equal(sum, totalInstanceWeight,
"entropy(labelCount, totalInstanceWeight): sum and totalWeight don't match");
}
double H = 0;
for(int y = 0 ; y < labelCount.length ; y++)
if (labelCount[y] != 0) {
// We know that totalInstanceWeight > 0 by the check
// on verify_strictly_greater above.
double prob_y = labelCount[y]/ totalInstanceWeight;
H -= prob_y * log_bin(prob_y);
}
return H;
}
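// Illustrative sketch (not part of the original source): when the total weight
// is already known, passing it in avoids the extra summation; this overload
// requires the total to be strictly positive rather than returning
// Globals.UNDEFINED_REAL for an empty node.
//
//   double[] counts = {6.0, 2.0};
//   double h = Entropy.entropy(counts, 8.0);   // -(0.75*log2(0.75) + 0.25*log2(0.25)) ~= 0.811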
/** Compute the entropy H(Y) for label Y. This overload takes an InstanceList and
* reads the label counts and total weight from its counters. If the total
* instance weight is not supplied, it is computed (slightly less efficient); the
* overload without the total weight also behaves a bit differently, allowing
* nodes with zero instances and returning Globals.UNDEFINED_REAL for them, so
* the caller is not required to compute the sum just for this.
*
* @param instList The supplied instances for which entropy will be calculated.
* @return The entropy value.
*/
public static double entropy(InstanceList instList) {
return entropy(instList.counters().label_counts(),instList.total_weight());
}
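// Illustrative sketch (not part of the original source): given a populated
// InstanceList (a hypothetical variable "trainList"), this overload pulls the
// label counts and total weight from the list's counters.
//
//   double h = Entropy.entropy(trainList);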
/** Search a score array to find the best score/index.
* @param totalKnownWeight Total weight of all Instances for which a value is known.
* @param scores The scores of the available Splits.
* @param minSplit The minimum value for a split.
* @param bestSplitIndex Output parameter; set to the index of the best split found.
* @return The score of the best split.
*/
public static double find_best_score(double totalKnownWeight,
Vector scores, double minSplit,
IntRef bestSplitIndex) {
double minimalDistanceFromCenter = totalKnownWeight/ 2;
double bestScore = Globals.UNDEFINED_REAL;
for(int k = 0 ; k < scores.size(); k++) {
ThresholdInfo info = (ThresholdInfo)scores.get(k);
if (totalKnownWeight - info.weight < minSplit)
break;
if (info.weight < minSplit)
continue;
double currentScore = info.score;
double currentDistance = Math.abs(totalKnownWeight / 2 - info.weight);
if (currentScore > bestScore + MLJ.realEpsilon ||
(MLJ.approx_equal(bestScore, currentScore) && currentDistance < minimalDistanceFromCenter)) {
bestSplitIndex.value = info.index;
minimalDistanceFromCenter = currentDistance;
bestScore = currentScore;
LogOptions.GLOBLOG(6, "index " + bestSplitIndex + ", entropy " + bestScore + '\n');
}
}
return bestScore;
}
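// Illustrative sketch (not part of the original source): scores is a Vector of
// ThresholdInfo entries ordered by threshold, whose "weight" field appears to
// accumulate the weight on the low side of each candidate split (hence the
// early break once the remaining weight drops below minSplit). Candidates that
// leave less than minSplit on either side are skipped; among the rest, the
// highest score wins, with ties broken toward the split closest to half the
// known weight. The IntRef(int) constructor below is assumed by analogy with
// the DoubleRef(double) used in cond_entropy.
//
//   IntRef best = new IntRef(0);
//   double bestScore = Entropy.find_best_score(100.0, scores, 5.0, best);
//   // best.value now holds the ThresholdInfo.index of the winning split.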
/** Build the splitAndLabelDist and splitDist arrays needed for calculating
* conditional entropy. All of the splitAndLabelDist arrays of the Instance Lists
* are concatenated. The unaccounted instances allow the list of nodes to be
* partial, i.e., not to contain all instances. The split will be created so that
* the unaccounted instances are in an extra split with the same label, so that the
* entropy will be decreased correctly as if they were in a pure node.
* @param currentLevel The list of instances in the current partition
* for which a split is being determined.
* @param attrNum The number of the attribute over which a split and label distribution is to be
* built.
* @return Distributions over each split and label pair.
*/
public static double[][] build_split_and_label_dist(InstanceList[] currentLevel, int attrNum) {
return build_split_and_label_dist(currentLevel,attrNum,0);
}
/** Build the splitAndLabelDist and splitDist arrays needed for calculating
* conditional entropy. All of the splitAndLabelDist arrays of the Instance Lists
* are concatenated. The unaccounted instances allow the list of nodes to be
* partial, i.e., not to contain all instances. The split will be created so that
* the unaccounted instances are in an extra split with the same label, so that the
* entropy will be decreased correctly as if they were in a pure node.
* @param currentLevel The list of instances in the current partition for which a split is being
* determined.
* @param attrNum The number of the attribute over which a split and label distribution is to be
* built.
* @param unaccountedWeight The weight for instances that are not
* accounted for in this partition.
* @return Distributions over each split and label pair.
*/
public static double[][] build_split_and_label_dist(InstanceList[] currentLevel,
int attrNum,double unaccountedWeight) {
MLJ.ASSERT(currentLevel[0] != null, "Entropy::build_split_and_label_dist: currentLevel[0] == null.");
Schema schema = currentLevel[0].get_schema();
int numInstLists = currentLevel.length;
int numLabelValues = schema.num_label_values();
int numAttrValues = schema.num_attr_values(attrNum);
MLJ.ASSERT(numInstLists > 0, "Entropy::build_split_and_label_dist: numInstLists <= 0.");
MLJ.ASSERT(numLabelValues > 0, "Entropy::build_split_and_label_dist: numLabelValues <= 0.");
MLJ.ASSERT(numAttrValues > 0, "Entropy::build_split_and_label_dist: numAttrValues <= 0.");
MLJ.ASSERT(unaccountedWeight >= 0, "Entropy::build_split_and_label_dist: unaccountedWeight < 0.");
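// One block of (numAttrValues + 1) columns per instance list (the extra column
// holds the unknown attribute value, which BagCounters stores at index 0),
// plus one final column for the unaccounted weight when it is nonzero.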
int numCols = numInstLists * (numAttrValues + 1)
+ (MLJ.approx_equal(unaccountedWeight, 0.0) ? 0 : 1);
double[][] splitAndLabelDist = new double[numLabelValues][numCols];
int countSplitAndLabelDistCol = 0;
for(int instListCount = 0 ; instListCount < numInstLists ; instListCount++) {
MLJ.ASSERT(currentLevel[instListCount] != null, "Entropy::build_split_and_label_dist: currentLevel[instListCount] == null.");
for(int attrValCount = 0 ;
attrValCount < numAttrValues +1 ; //add 1 to account for unknown being moved from -1 to 0 -JL
attrValCount++, countSplitAndLabelDistCol++) {
for(int labelValCount = 0 ; labelValCount < numLabelValues ;
labelValCount++) {
BagCounters bc = currentLevel[instListCount].counters();
splitAndLabelDist[labelValCount][countSplitAndLabelDistCol]=
bc.value_counts()[attrNum][labelValCount+1][attrValCount];
//labelValCount needs 1 added because the unknown label has been moved
//from -1 to 0. The first nominal label is now at 1, not 0, in the
//BagCounters class. -JL
}
}
}
// Assign the unaccounted to the last split with all counts
// for one label (so it looks pure).
if (unaccountedWeight > 0) {
MLJ.ASSERT(countSplitAndLabelDistCol == splitAndLabelDist[0].length - 1, "Entropy::build_split_and_label_dist: countSplitAndLabelDistCol != splitAndLabelDist[0].length - 1");
for(int labelValCount = 0 ; labelValCount < numLabelValues ; labelValCount++)
splitAndLabelDist[labelValCount][countSplitAndLabelDistCol]=(labelValCount != 0)? 0 : unaccountedWeight;
}
return splitAndLabelDist;
}
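// Illustrative sketch (not part of the original source): with 2 instance lists,
// an attribute taking 3 nominal values, 2 label values, and a nonzero
// unaccounted weight, the returned matrix has 2 rows (labels) and
// 2*(3+1) + 1 = 9 columns: one column per (list, attribute value) pair
// including the unknown value, plus a final pure column holding the
// unaccounted weight under label 0.
//
//   double[][] dist = Entropy.build_split_and_label_dist(currentLevel, attrNum, 12.5);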
/** Returns the minSplit which is used in find_best_threshold(), given
* lowerBoundMinSplit, upperBoundMinSplit, and minSplitPercent. This
* function is called by inducers.
* @param totalWeight The total weight of all instances in the list of instances for which a split
* is requested.
* @param numTotalCategories Number of possible values an instance may be categorized as.
* @param lowerBoundMinSplit Lower bound for the minimum split value.
* @param upperBoundMinSplit Upper bound for the minimum split value.
* @param minSplitPercent The percentage of total weight per category that represents the minimum value
* for a split.
* @return The minSplit which is used in find_best_threshold().
*/
public static double min_split(double totalWeight,
int numTotalCategories, double lowerBoundMinSplit,
double upperBoundMinSplit, double minSplitPercent) {
if (numTotalCategories <= 0)
Error.fatalErr("min_split: divisor (numTotalCategories) is zero or negative");
if ((Globals.DBG)&&(lowerBoundMinSplit <= 0))
Error.fatalErr("min_split: lowerBoundMinSplit ("
+ lowerBoundMinSplit + ") must be positive");
double minSplit = minSplitPercent * totalWeight/ numTotalCategories;
if (minSplit > upperBoundMinSplit)
minSplit = upperBoundMinSplit;
if (minSplit <= lowerBoundMinSplit)
minSplit = lowerBoundMinSplit;
MLJ.ASSERT(minSplit > 0, "Entropy::min_split: minSplit <= 0");
return minSplit;
}
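// Illustrative sketch (not part of the original source; treats minSplitPercent
// as a fraction, e.g. 0.1 for 10%): with totalWeight = 200, 2 categories and
// minSplitPercent = 0.1, the raw value is 0.1 * 200 / 2 = 10, which is then
// clamped into [lowerBoundMinSplit, upperBoundMinSplit]. The lower bound is
// applied last, so it wins if the two bounds ever conflict.
//
//   double ms = Entropy.min_split(200.0, 2, 2.0, 25.0, 0.1);   // 10.0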
/** Returns the minSplit which is used in find_best_threshold(), given
* lowerBoundMinSplit, upperBoundMinSplit, and minSplitPercent. This
* function is called by inducers.
* @param instList The list of instances over which a split is requested.
* @param lowerBoundMinSplit Lower bound for the minimum split value.
* @param upperBoundMinSplit Upper bound for the minimum split value.
* @param minSplitPercent The percentage of total weight per category that represents the minimum value
* for a split.
* @param ignoreNumCat Indicator that the number of values that an instance may be classified as should
* be ignored for this split computation.
* @return The minSplit which is used in find_best_threshold().
*/
public static double min_split(InstanceList instList,
double lowerBoundMinSplit, double upperBoundMinSplit,
double minSplitPercent, boolean ignoreNumCat) {
if (ignoreNumCat)
return min_split(instList.total_weight(), 1, lowerBoundMinSplit, upperBoundMinSplit, minSplitPercent);
else return min_split(instList.total_weight(), instList.num_categories(), lowerBoundMinSplit, upperBoundMinSplit, minSplitPercent);
}
/** Returns the minSplit which is used in find_best_threshold(), given
* lowerBoundMinSplit, upperBoundMinSplit, and minSplitPercent. This
* function is called by inducers.
* @param instList The list of instances over which a split is requested.
* @param lowerBoundMinSplit Lower bound for the minimum split value.
* @param upperBoundMinSplit Upper bound for the minimum split value.
* @param minSplitPercent The percentage of total weight per category that represents the minimum value
* for a split.
* @return The minSplit which is used in find_best_threshold().
*/
public static double min_split(InstanceList instList,
double lowerBoundMinSplit, double upperBoundMinSplit,
double minSplitPercent) {
return min_split(instList,lowerBoundMinSplit,upperBoundMinSplit,minSplitPercent,false);
}
/** Computes conditional entropy of the label given attribute X. From Ross,
* Conditional entropy is defined as: <BR>
* H(Y|X) = sum_x H(Y|X=x)*P(X=x). <BR>
* = sum_x (-sum_y p(Y=y|X=x) log p(Y=y|X=x)) * P(X=x) <BR>
* which yields Pagallo & Haussler's formula: <BR>
* = -sum_{x,y} p(Y=y, X=x) log p(Y=y|X=x) <BR>
* Here we estimate p(Y=y, X=x) by counting, but if we
* have priors on the probabilities of the labels, then <BR>
* p(x,y) = p(x|y)*p(y) = count(x,y)/s(y)* prior(y) <BR>
* and p(x) = sum_y prior(y) count(x,y)/s(y). <BR>
*
* By counting we get the following: <BR>
* -sum_{x,y} num(Y=y,X=x)/num-rec * log num(Y=y,X=x)/num(X=x)
*
* @param splitAndLabelDist Distributions over each split and label pair.
* @param splitDist The distribution over splits.
* @param totalWeight The total weight distributed.
* @return The conditional entropy.
*/
public static double cond_entropy(double[][] splitAndLabelDist,
double[] splitDist, double totalWeight) {//AttrAndOrigin class function
// DBG(MLC::verify_strictly_greater(totalWeight, 0,
// "cond_entropy:: totalWeight must be "
// "positive");
double sum = MLJArray.sum(splitDist);
MLJ.verify_approx_equal(totalWeight, sum,
"cond_entropy:: totalWeight does not match splitDist");
sum = Matrix.total_sum(splitAndLabelDist);
MLJ.verify_approx_equal(totalWeight, sum,
"cond_entropy:: totalWeight does not match splitAndLabelDist");
// DBGSLOW(
// Array<Real> sDist(UNKNOWN_CATEGORY_VAL,
// splitAndLabelDist.num_cols());
// splitAndLabelDist.sum_cols(sDist);
// ASSERT(MLJ.approx_equal(sDist, splitDist));
// )
// );
DoubleRef H = new DoubleRef(0);
for (int x = 0; x < splitAndLabelDist[0].length; x++) {
double num_x = splitDist[x];
if (MLJ.approx_equal(num_x, 0.0))
continue;
for (int y = 0; y < splitAndLabelDist.length; y++) {
double num_xy = splitAndLabelDist[y][x];
if (MLJ.approx_equal(num_xy, 0.0))
continue;
H.value -= num_xy * log_bin(num_xy/num_x);
}
}
H.value /= totalWeight; // We know this won't be division by zero.
MLJ.clamp_above(H, 0, "cond_entropy: negative entropy not allowed");
return H.value;
}
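// Illustrative sketch (not part of the original source): rows of
// splitAndLabelDist are labels and columns are splits. For the 2x2 distribution
// {{3, 1}, {1, 3}} with splitDist = {4, 4} and totalWeight = 8, each split is a
// 3:1 mixture, so H(Y|X) = -(1/8) * 2 * (3*log2(3/4) + 1*log2(1/4)) ~= 0.811.
//
//   double[][] sld = {{3.0, 1.0}, {1.0, 3.0}};
//   double[] sd = {4.0, 4.0};
//   double h = Entropy.cond_entropy(sld, sd, 8.0);   // ~0.811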