// entropy.java
return H.value;
}
/** Computes the conditional entropy of the label given attribute X. From Ross,
* conditional entropy is defined as: <BR>
* H(Y|X) = sum_x H(Y|X=x)*P(X=x) <BR>
* = sum_x (-sum_y p(Y=y|X=x) log p(Y=y|X=x)) * P(X=x). <BR>
* Now derive Pagallo & Haussler's formula: <BR>
* = -sum_{x,y} p(Y=y, X=x) log p(Y=y|X=x) <BR>
* Here we estimate p(Y=y, X=x) by counting, but if we
* have priors on the probabilities of the labels, then <BR>
* p(x,y) = p(x|y)*p(y) = count(x,y)/s(y)* prior(y) <BR>
* and p(x) = sum_y prior(y) count(x,y)/s(y). <BR>
*
* By counting we get the following: <BR>
* -sum_{x,y} num(Y=y,X=x)/num-rec * log num(Y=y,X=x)/num(X=x)
*
* @param instList The instance list over which conditional entropy is calculated.
* @param attrNumX The number of the attribute for which conditional entropy is requested.
* @return The conditional entropy.
*/
public static double cond_entropy(InstanceList instList, int attrNumX) {
return cond_entropy(instList.counters().value_counts()[attrNumX],
instList.counters().attr_counts()[attrNumX],
instList.total_weight());
}
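/* Illustrative sketch (not part of the original MLC++/MLJ port; the method name
 * is hypothetical): computes H(Y|X) directly from a joint count table using the
 * counting formula in the comment above, -sum_{x,y} n(x,y)/N * log2(n(x,y)/n(x)).
 * The [label][value] layout mirrors the splitAndLabelDist arrays used in this
 * class; log_bin is assumed to be the base-2 log helper used elsewhere here. */
private static double condEntropyByCountingSketch(double[][] jointCounts,
                                                  double[] xCounts,
                                                  double totalWeight) {
    double h = 0.0;
    for (int y = 0; y < jointCounts.length; y++)
        for (int x = 0; x < jointCounts[y].length; x++) {
            double nxy = jointCounts[y][x];
            if (nxy > 0)                       // skip zero cells to avoid log(0)
                h -= (nxy / totalWeight) * log_bin(nxy / xCounts[x]);
        }
    return h;
}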
/** Compute the mutual information which is defined as I(Y;X) = H(Y) - H(Y|X). Some
* researchers like Quinlan call this "gain." This is the amount of information
* gained about the category value of an instance after we test the variable X.
*
* @param ent Entropy value.
* @param splitAndLabelDist Distributions over each split and label pair.
* @param splitDist The distribution over splits.
* @param totalWeight Total weight of the Instances trained on.
* @return The mutual information value.
*/
public static double mutual_info(double ent,double[][] splitAndLabelDist,
double[] splitDist, double totalWeight) {
double condEntropy = Entropy.cond_entropy(splitAndLabelDist, splitDist,
totalWeight);
DoubleRef mi = new DoubleRef(ent - condEntropy);
// Mutual information should never be negative; the following
// accounts for possible numerical representation errors.
MLJ.clamp_above(mi, 0, "mutual_info: negative values not allowed");
return mi.value;
}
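/* Illustrative usage sketch (hypothetical method and hand-built counts): a
 * two-way split over two labels, with splitAndLabelDist indexed as
 * [label][splitValue] to match j_measure below. Real arrays coming from
 * counters() may carry an extra unknown-value slot that this toy table omits.
 * With these counts H(Y) = 1 bit and the gain should come out near 0.19 bits. */
private static double mutualInfoUsageSketch() {
    double[][] splitAndLabelDist = { {30.0, 10.0},   // label 0 weight per split value
                                     {10.0, 30.0} }; // label 1 weight per split value
    double[] splitDist = {40.0, 40.0};               // weight per split value
    double[] labelCounts = {40.0, 40.0};             // weight per label
    double totalWeight = 80.0;
    double ent = entropy(labelCounts, totalWeight);  // H(Y), via the overload used below
    return mutual_info(ent, splitAndLabelDist, splitDist, totalWeight);
}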
/** Compute the mutual information which is defined as I(Y;X) = H(Y) - H(Y|X). Some
* researchers like Quinlan call this "gain." This is the amount of information
* gained about the category value of an instance after we test the variable X.
* @param instList The instance list over which mutual information is calculated.
* @param attrNumX The number of the attribute for which mutual information is requested.
* @return The mutual information value.
*/
public static double mutual_info(InstanceList instList, int attrNumX) {
if (instList.counters().attr_counts()[attrNumX] == null)
Error.fatalErr("entropy::mutual_info: attribute "+attrNumX+
" is not nominal (counts array is NULL)");
double ent = entropy(instList.counters().label_counts(),
instList.total_weight());
return mutual_info(instList, ent, attrNumX);
}
/** Compute the mutual information which is defined as I(Y;X) = H(Y) - H(Y|X). Some
* researchers like Quinlan call this "gain." This is the amount of information
* gained about the category value of an instance after we test the variable X.
* @param instList The instance list over which mutual information is calculated.
* @param ent Entropy value.
* @param attrNumX The number of the attribute for which mutual information is requested.
* @return The mutual information value.
*/
public static double mutual_info(InstanceList instList,
double ent, int attrNumX) {
if (instList.counters().attr_counts()[attrNumX] == null)
Error.fatalErr("entropy::mutual_info: attribute "+attrNumX+
" is not nominal (counts array is NULL)");
return mutual_info(ent,
instList.counters().value_counts()[attrNumX],
instList.counters().attr_counts()[attrNumX],
instList.total_weight());
}
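/* Illustrative sketch (hypothetical helper, not in the original port): scans
 * all attributes and returns the index of the nominal attribute with the
 * highest gain, skipping attributes whose counts array is null (the same
 * non-nominal test used by mutual_info above). Returns -1 if no nominal
 * attribute is found. */
private static int bestGainAttributeSketch(InstanceList instList) {
    double ent = entropy(instList.counters().label_counts(),
                         instList.total_weight());
    int best = -1;
    double bestGain = -1.0;
    for (int a = 0; a < instList.get_schema().num_attr(); a++) {
        if (instList.counters().attr_counts()[a] == null)
            continue;                          // not nominal; no counts kept
        double gain = mutual_info(instList, ent, a);
        if (gain > bestGain) { bestGain = gain; best = a; }
    }
    return best;
}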
/** Builds the distribution arrays necessary for calculating conditional entropy for
* nominal attributes. All of the splitAndLabelDist arrays of the Instance Lists are
* concatenated. The unaccounted instances allow the list of nodes to be partial,
* i.e., not to contain all instances. The split will be created so that the
* unaccounted instances are in an extra split with the same label, so that the
* entropy will be decreased correctly as if they were in a pure node.
*
* @param currentLevel The list of instances in the current partition for which a split is being
* determined.
* @param attrNum The number of the attribute for which mutual information is requested.
* @return The distribution over splits.
*/
static public double[] build_nominal_attr_split_dist(InstanceList[] currentLevel,
int attrNum) {
return build_nominal_attr_split_dist(currentLevel,attrNum,0);
}
/** Builds the distribution arrays necessary for calculating conditional entropy for
* nominal attributes. All of the splitAndLabelDist arrays of the Instance Lists are
* concatenated. The unaccounted instances allow the list of nodes to be partial,
* i.e., not to contain all instances. The split will be created so that the
* unaccounted instances are in an extra split with the same label, so that the
* entropy will be decreased correctly as if they were in a pure node.
*
* @param currentLevel The list of instances in the current partition for which a split is being
* determined.
* @param attrNum The number of the attribute for which mutual information is requested.
* @param unaccountedWeight Weight that is not accounted for in the list of instances.
* @return The distribution over splits.
*/
static public double[] build_nominal_attr_split_dist(InstanceList[] currentLevel,
int attrNum, double unaccountedWeight) {
MLJ.ASSERT(currentLevel[0]!= null,"Entropy.build_nominal_attr_split_dist:currentLevel[0]== null.");
Schema schema = currentLevel[0].get_schema();
int numInstLists = currentLevel.length;
int numAttrValues = schema.num_attr_values(attrNum);
MLJ.ASSERT(numInstLists > 0,"Entropy.build_nominal_attr_split_dist:numInstLists <= 0");
MLJ.ASSERT(numAttrValues > 0,"Entropy.build_nominal_attr_split_dist:numAttrValues <= 0");
int unaccnt_wght_col = (unaccountedWeight > 0)? 1 : 0;
double[] splitDist = new double[numInstLists * (numAttrValues + 1) + unaccnt_wght_col];
int countSplitDist = Globals.UNKNOWN_CATEGORY_VAL;
for (int instListCount = 0; instListCount < numInstLists; instListCount++) {
MLJ.ASSERT(currentLevel[instListCount] != null,"Entropy.build_nominal_attr_split_dist:currentLevel[instListCount] == null");
// Include the unknown slot plus all declared values: numAttrValues + 1
// entries per instance list, matching the splitDist sizing above.
for (int attrCount = Globals.UNKNOWN_CATEGORY_VAL; attrCount <= numAttrValues;
attrCount++, countSplitDist++) {
BagCounters bc = currentLevel[instListCount].counters();
splitDist[countSplitDist] = bc.attr_counts()[attrNum][attrCount];
}
}
if (unaccountedWeight > 0) {
MLJ.ASSERT(countSplitDist == splitDist.length - 1,"Entropy.build_nominal_attr_split_dist:countSplitDist != splitDist.length - 1");
splitDist[countSplitDist] = unaccountedWeight;
}
return splitDist;
}
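/* Worked example of the layout produced above (illustrative, and assuming the
 * unknown-value slot occupies index UNKNOWN_CATEGORY_VAL of each counts
 * array): with 2 instance lists, an attribute with 3 values, and
 * unaccountedWeight > 0, splitDist has 2*(3+1)+1 = 9 entries laid out as
 *   [unk_0, v1_0, v2_0, v3_0, unk_1, v1_1, v2_1, v3_1, unaccounted]
 * i.e. the per-value counts of list 0, then of list 1, then one extra "pure"
 * split holding the unaccounted weight. */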
/** Compute the J-measure. See papers by Goodman and Smyth, e.g., IEEE
* Transactions on Knowledge and Data Engineering, vol. 4, no. 4,
* pp. 301-316, 1992. The J-measure summed over all values of x gives the
* info-gain. The J-measure is <BR>
* sum_y p(x,y) log(p(x,y)/(p(x)p(y))) <BR>
* = 1/n * sum_y n(x,y) log(n(x,y)*n/(n(x)n(y))) <BR>
* Used in t_entropy.java.
*
* @return The j-measure value.
* @param splitAndLabelDist Distributions over each split and label pair.
* @param splitDist The distribution over splits.
* @param labelCounts Counts of each label found in the data.
* @param x The x value for the j-measure equation.
* @param totalWeight Total weight of all data.
*/
public static double j_measure(double[][] splitAndLabelDist,
double[] splitDist, double[] labelCounts,
int x, double totalWeight) {
MLJ.verify_strictly_greater(totalWeight, 0, "j_measure: totalWeight is "+
"too small");
DoubleRef j = new DoubleRef();
for (int y = 0; y < splitAndLabelDist.length; y++) {
double num_xy = splitAndLabelDist[y][x];
double num_x = splitDist[x];
double num_y = labelCounts[y];
if (!MLJ.approx_equal(num_xy, 0.0)) { // beware of log(0)
if (Globals.DBG) MLJ.ASSERT((num_x > 0 && num_y > 0),"Entropy.j_measure: num_x <= 0 || num_y <= 0");
j.value += num_xy *
log_bin(totalWeight*(num_xy)/(num_x * num_y));
}
}
j.value /= totalWeight; // We know this won't be division by zero.
// Allow for possible numerical representation errors.
MLJ.clamp_above(j, 0, "j_measure: negative j-measure not allowed");
return j.value;
}
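/* Illustrative check (hypothetical method, hand-built counts): as stated in
 * the comment above, the j-measure summed over all values x of the split
 * equals the information gain. This sketch sums j_measure over every split
 * value of a small table; with the [label][splitValue] layout assumed here
 * the result should match mutual_info on the same counts (about 0.19 bits). */
private static double jMeasureSumSketch() {
    double[][] splitAndLabelDist = { {30.0, 10.0},   // label 0 weight per split value
                                     {10.0, 30.0} }; // label 1 weight per split value
    double[] splitDist = {40.0, 40.0};
    double[] labelCounts = {40.0, 40.0};
    double totalWeight = 80.0;
    double sum = 0.0;
    for (int x = 0; x < splitDist.length; x++)
        sum += j_measure(splitAndLabelDist, splitDist, labelCounts, x, totalWeight);
    return sum;
}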
/** Compute the J-measure. See papers by Goodman and Smyth, e.g., IEEE
* Transactions on Knowledge and Data Engineering, vol. 4, no. 4,
* pp. 301-316, 1992. The J-measure summed over all values of x gives the
* info-gain. The J-measure is <BR>
* sum_y p(x,y) log(p(x,y)/(p(x)p(y))) <BR>
* = 1/n * sum_y n(x,y) log(n(x,y)*n/(n(x)n(y))) <BR>
* Used in t_entropy.java.
*
* @param instList The list of Instances over which a j measure is to be
* calculated.
* @param attrNumX The number of the attribute for which the j-measure is
* requested.
* @param x The x value for the j-measure equation.
* @return The j-measure value.
*/
public static double j_measure(InstanceList instList, int attrNumX, int x) {
return j_measure(instList.counters().value_counts()[attrNumX],
instList.counters().attr_counts()[attrNumX],
instList.counters().label_counts(), x,
instList.total_weight());
}
/** Builds columns of real values and their associated label values. Invokes
* InstanceList's transpose function, sorts each column that was built, and
* returns the array of columns to the caller. The second calling argument, if
* set to an attribute index, results in a single column being transposed and
* sorted. When set to UNDEFINED_INT, all columns are so treated.
* @param instList The instance list containing the instance values for the attribute.
* @param attrNum The number of the attribute for which the real and label column is
* requested.
* @return The columns of real values and their associated labels, organized by attribute.
*/
public static RealAndLabelColumn[] build_real_and_label_columns(
InstanceList instList, int attrNum) {
// If a single attribute was requested, initialize the array to FALSE and
// set only that attribute's entry to TRUE; if attrNum is UNDEFINED_INT,
// initialize every entry to TRUE so that all columns are transposed.
boolean initializer = (attrNum == Globals.UNDEFINED_INT);
boolean[] transp = new boolean[instList.get_schema().num_attr()];
Arrays.fill(transp, initializer);
if (attrNum != Globals.UNDEFINED_INT)
transp[attrNum] = true;
RealAndLabelColumn[] columns = instList.transpose(transp);
// If a particular column was requested, check that it was transposed.
if (attrNum != Globals.UNDEFINED_INT)
if (columns[attrNum] != null)
columns[attrNum].sort();
else
Error.fatalErr("build_real_and_label_columns: for attribute " +attrNum
+", no column was built to sort");
else
for (int x = 0; x < instList.get_schema().num_attr(); x++)
if (columns[x] != null)
columns[x].sort();
return columns;
}
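/* Illustrative usage sketch (hypothetical helper, not in the original port):
 * transposes every attribute at once by passing UNDEFINED_INT and counts how
 * many attributes actually produced a sortable real/label column; attributes
 * for which transpose builds no column come back null, which is why the
 * callers here check before sorting. */
private static int countSortableColumnsSketch(InstanceList instList) {
    RealAndLabelColumn[] columns =
        build_real_and_label_columns(instList, Globals.UNDEFINED_INT);
    int sortable = 0;
    for (int a = 0; a < columns.length; a++)
        if (columns[a] != null)
            sortable++;                 // this column was built and already sorted
    return sortable;
}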
/** Builds the sorted column of real values and their associated label values
* for the given attribute. Invokes build_real_and_label_columns to transpose
* and sort the single requested column and returns it to the caller. Unlike
* build_real_and_label_columns, this method requires attrNum to be a valid
* attribute index; UNDEFINED_INT is not accepted here.
* @param instList The instance list containing the instance values for the attribute.
* @param attrNum The number of the attribute for which the real and label column is
* requested.
* @return The column of real values and their associated labels.
*/
public static RealAndLabelColumn build_real_and_label_column(InstanceList
instList, int attrNum) {
RealAndLabelColumn[] columns = build_real_and_label_columns(instList, attrNum);
// We want to return only the sorted column for the requested attribute.
// Save a reference to it before dropping the array. Nulling the array
// entry and the array reference is a holdover from the C++ original, where
// the array had to be deleted without deleting the column; in Java the
// unused columns simply become garbage once this method returns.
RealAndLabelColumn sortedColumn = columns[attrNum];
columns[attrNum] = null;
columns = null;
return sortedColumn;
}
}