// naivebayescat.java (excerpt; non-source website header removed)
attrImp[i] = 0;
}
else {
double condEnt = Entropy.cond_entropy(instList.counters().value_counts()[i], instList.counters().attr_counts()[i], instList.total_weight());
if(condEnt < 0 && -condEnt < MLJ.realEpsilon) {
condEnt = 0;
}
attrImp[i] = 100 - 100 * (condEnt / ent);
if(attrImp[i] < 0 && attrImp[i] >= -1000 * MLJ.realEpsilon) {
attrImp[i] = 0; // avoid small negatives
}
else if(attrImp[i] < 0) {
Error.fatalErr("compute_importance: attribute " + i +
" had importance " + attrImp[i] + "which is severly negative");
}
}
}
else {
Error.fatalErr("compute_importance: attribute " + i + " has " +
"unsupported type. Must be real or nominal.");
}
}
return attrImp;
} /** Computes the distance metrics for all attributes. * Should only be called once. */ private void compute_kl_distances() { if(unkIsVal != null) { Error.fatalErr("NaiveBayesCat.compute_kl_distances: kl distances already computed"); } unkIsVal = new boolean[get_schema().num_attr()]; for(int i=0; i<get_schema().num_attr(); i++) { if(!get_schema().attr_info(i).can_cast_to_nominal()) { Error.fatalErr("NaiveBayesCat.categorize: UNKNOWN_IS_VALUE is set and " + get_schema().attr_name(i) + " is a real value with unknowns. " + "UNKNOWN_IS_VALUE settings of " + "yes and auto are not supported for undiscretized real values " + "with unknowns."); } double dist = kl_distance(i); if(dist >= klThreshold) { logOptions.LOG(1, "k-l distance for attribute " + get_schema().attr_name(i) + " (" + dist + ") exceeds threshold" + endl);
unkIsVal[i] = true;
}
else {
unkIsVal[i] = false;
}
}
} /** Returns a copy of the attrImportance array, or null if it has not been
 * computed yet. Used when copying NaiveBayesCat objects.
 * @author James Plummer added to package for compatibility.
 * @return a fresh element-wise copy of attrImportance, or null. */
private double[] copyAttrImportance() {
    if ( this.attrImportance != null ) {
        // clone() on a double[] performs the same element-wise copy as the
        // original manual loop, without the hand-rolled indexing.
        return this.attrImportance.clone();
    }
    else {
        return null;
    }
} /** Deep-copies the two-dimensional continNorm array of NBNorm objects.
 * This function is used to copy NaiveBayesCat objects.
 * @author James Plummer added for compatiblity.
 * @return the new copy of continNorm, or null when continNorm is null. */
private NBNorm[][] copyContinNorm() {
    if ( this.continNorm == null ) {
        return null;
    }
    NBNorm[][] copy = new NBNorm[continNorm.length][];
    for (int row = 0; row < continNorm.length; row++) {
        copy[row] = new NBNorm[continNorm[row].length];
        for (int col = 0; col < continNorm[row].length; col++) {
            // Duplicate each element through the NBNorm copy constructor.
            copy[row][col] = new NBNorm(continNorm[row][col]);
        }
    }
    return copy;
} /** Prints a readable representation of the Cat to the
 * given stream. Previously a null stream was only checked before
 * set_log_stream and then dereferenced anyway, producing a caught NPE;
 * now a null stream is a no-op.
 * @param stream destination writer; also installed as the log stream.
 * @param dp display preferences (not consulted by this implementation). */
public void display_struct(BufferedWriter stream, DisplayPref dp) {
    if (stream == null) {
        return; // nothing to write to; avoids the NPE the old code swallowed
    }
    try {
        logOptions.set_log_stream(stream);
        stream.write("Simple NaiveBayes Cat " + this.description() +
            " categorizing using prevalence data in BagCounter: " + endl +
            nominCounts + endl);
        if ( continNorm != null ) {
            stream.write("Categorizing uses Normal Density to estimate probability" +
                " of continuous attributes. The mean, variance, and standard" +
                " deviation of each attribute,label combination is: " + endl);
            for (int i = 0; i < numAttributes; i++) {
                // A non-null value_counts entry marks a nominal attribute.
                if ( nominCounts.value_counts()[i] != null ) {
                    stream.write("Attribute " + i + ":" + " Nominal Attribute." + endl);
                }
                else {
                    // Continuous attribute: dump mean/variance per label.
                    stream.write("Attribute " + i + ":" + endl);
                    for (int j = 0; j < num_categories(); j++) {
                        stream.write(" Label " + j + "\t\t" + continNorm[i][j].mean +
                            "\t" + continNorm[i][j].var + endl);
                    }
                }
            }
        }
    }
    catch (Exception e) {
        e.printStackTrace();
    }
} /** findMax returns the largest value in an array of doubles.
 * Assumes the array is non-empty (d[0] is read unconditionally).
 * @author James Plummer added to match C++ functionality.
 * @param d - the array of doubles.
 * @return the maximum number.
 */ public static double findMax(double[] d) {
    double max = d[0];
    // Enhanced-for sweep; comparing d[0] against itself is harmless.
    for (double v : d) {
        if (v > max) {
            max = v;
        }
    }
    return max;
} /** findMin returns the smallest value in an array of doubles.
 * Assumes the array is non-empty (d[0] is read unconditionally).
 * @author James Plummer added to match C++ functionality.
 * @param d - the array of doubles.
 * @return the minimum number.
 */
public static double findMin(double[] d) {
    double min = d[0];
    // Enhanced-for sweep; comparing d[0] against itself is harmless.
    for (double v : d) {
        if (v < min) {
            min = v;
        }
    }
    return min;
} /** Helper function: generate a single probability using * the Laplace correction. * Evidence projection is not used if there's no data * (labelCount == 0). */ private double generate_cond_probability(double labelValCount, double labelCount,
int numAttrVals, int numAttr) {
// Estimates the conditional probability P(attribute value | label); which
// smoothing scheme applies depends on the option fields, in priority order.
// NOTE(review): the numAttr parameter is unused in this body — the -2 branch
// reads this.get_schema().num_attr() instead. Confirm whether intentional.
if(useEvidenceProjection && labelCount > 0) {
// Evidence projection, capped at log2(1 + trainWeight * evidenceFactor).
double maxEvidence = MLJ.log_bin(1.0 + trainWeight*evidenceFactor);
return CatDist.single_evidence_projection(labelValCount, labelCount, maxEvidence);
}
else if (useLaplace) {
// Laplace / m-estimate smoothing; a factor of 0 means "use 1/trainWeight".
double effectiveMEstimate = mEstimateFactor;
if(effectiveMEstimate == 0.0) {
effectiveMEstimate = 1.0 / trainWeight;
}
return (labelValCount + effectiveMEstimate) / (labelCount + numAttrVals * effectiveMEstimate);
}
else if (labelValCount == 0) {
// No (value, label) co-occurrence in training: apply the noMatchesFactor
// policy rather than returning a hard zero probability.
if(noMatchesFactor >= 0) {
return noMatchesFactor / trainWeight;
}
else if(noMatchesFactor == -1) {
// -1: scale the label's prior down by an extra factor of trainWeight.
return (double)(labelCount) / trainWeight / trainWeight;
}
else if(noMatchesFactor == -2) {
// -2: as -1, but additionally divided by the number of attributes.
return (double)(labelCount) / trainWeight
/ (trainWeight * this.get_schema().num_attr());
}
else {
// Any other negative value is a configuration error.
Error.fatalErr("NaiveBayesCat.generate_cond_probability: noMatchesFactor has illegal value of " +
noMatchesFactor);
return 0;
}
}
else {
// if labelCount == 0, then labelValCount should also be 0 and we'll
// choose the case above instead of this one.
MLJ.ASSERT( (labelCount > 0), "NaiveBayesCat.generate_cond_probability()");
return (double)(labelValCount / labelCount);
}
} /** Helper function: generate a single prior probability P(label), with
 * optional evidence projection or Laplace/m-estimate correction.
 * @param labelCount total training weight observed for this label.
 * @param numLabels number of label values, used for Laplace smoothing.
 * @return the (possibly smoothed) prior probability of the label. */
private double generate_probability_prior(double labelCount, int numLabels) {
    if(useEvidenceProjection) {
        // Cap the evidence at log2(1 + trainWeight * evidenceFactor).
        double maxEvidence = MLJ.log_bin(1.0 + trainWeight*evidenceFactor);
        return CatDist.single_evidence_projection(labelCount, trainWeight, maxEvidence);
    }
    else if(useLaplace) {
        // mEstimateFactor of 0 means "use 1/trainWeight" as the effective m-estimate.
        double effectiveMEstimate = mEstimateFactor;
        if(effectiveMEstimate == 0.0) {
            effectiveMEstimate = 1.0 / trainWeight;
        }
        return (labelCount + effectiveMEstimate) / (trainWeight + numLabels * effectiveMEstimate);
    }
    else if(labelCount == 0)
        return 0;
    else {
        // labelCount is strictly positive here; the zero case was handled above.
        // (Fixed: the old assert message had the typo "generate_probablility",
        // and the old comment was copy-pasted from generate_cond_probability
        // and referred to a nonexistent labelValCount.)
        MLJ.ASSERT( (labelCount > 0), "NaiveBayesCat.generate_probability_prior()");
        return labelCount / trainWeight;
    }
} /** Removed function *//* public void generate_viz(BufferedWriter stream, boolean[] autoDiscVector, int evivizVersion) throws IOException {
}*/ /** Functions for retrieving and setting the optional smoothing/threshold settings. */ public double get_evidence_factor() { return evidenceFactor; }
public double get_kl_threshold() { return klThreshold; }
public double get_m_estimate_factor() { return mEstimateFactor; }
public double get_no_matches_factor() { return noMatchesFactor; }
public int get_unknown_is_value() { return unknownIsValue; }
public boolean get_use_evidence_projection() { return useEvidenceProjection; }
public boolean get_use_laplace() { return useLaplace; }
public void set_evidence_factor(double f) { evidenceFactor = f; }
public void set_kl_threshold(double th) { klThreshold = th; }
/** Initialize the probabilities to be the class probabilities
* P(L = l). * @param nominCounts - the BagCounter whose label counts initialize prob. */ public static void init_class_prob(BagCounters nominCounts,
double trainWeight,
double[] prob, boolean useLaplace,
boolean useEvidenceProjection,
double evidenceFactor)
{
if (useEvidenceProjection) {
// Fill prob with raw label counts; the CatDist helper converts them in place.
for (int labelVal = 0; labelVal < prob.length; labelVal++) {
prob[labelVal] = nominCounts.label_count(labelVal);
}
CatDist.apply_evidence_projection(prob, evidenceFactor, true);
}
else if (useLaplace) {
int numLabels = prob.length - 1;
// No laplace correction for unknown label. This fixes bug #526924
MLJ.ASSERT(nominCounts.label_count(Globals.UNKNOWN_CATEGORY_VAL) == 0,"NaiveBayesCat.init_class_prob()");
prob[Globals.UNKNOWN_CATEGORY_VAL] = 0;
// NOTE(review): the loop starts at 1, which assumes
// Globals.UNKNOWN_CATEGORY_VAL == 0 — confirm against Globals.
for (int labelVal = 1; labelVal < prob.length; labelVal++) {
prob[labelVal] = (double)(nominCounts.label_count(labelVal) + 1)/(trainWeight + numLabels);
}
}
else {
// Plain maximum-likelihood estimate: label count / total training weight.
for (int labelVal = 0; labelVal < prob.length; labelVal++) {
prob[labelVal] = (double)(nominCounts.label_count(labelVal))/(trainWeight);
}
}
// Check that probabilities sum to about 1.0
double probSum = sumArray(prob);
MLJ.verify_approx_equal(probSum,1,"NaiveBayesCat.init_class_prob: prob does not sum to one");
} /** Compute the KL distance metric for a single attribute. * If we don't have minimum support, we always return 0 * @param attrNum - the number of the attribute to compute distances. * @return the distance for the attribute. */ private double kl_distance(int attrNum) { int numLabelVals = get_schema().num_label_values();
// p holds P(C|?) and q holds P(C), both Laplace-corrected below.
double[] p = new double[numLabelVals];
double[] q = new double[numLabelVals];
if(!get_schema().attr_info(attrNum).can_cast_to_nominal()) {
Error.fatalErr("NaiveBayesCat.kl_distance: this function does not work " +
"for real attributes");
} double support = nominCounts.attr_count(attrNum, Globals.UNKNOWN_CATEGORY_VAL);
// NOTE(review): the message says "negative" but the check also fires on a
// zero training weight (it requires strictly greater than 0).
MLJ.verify_strictly_greater(trainWeight,0.0,"NaiveBayesCat.kl_distance: " +
"total train weight is negative");
if (support < 5) { // @@ make this an option
// Too few unknown-valued instances to estimate P(C|?) reliably.
return 0;
}
int numLabelValues = get_schema().num_label_values();
for(int i=0; i < numLabelValues; i++) {
// Compute p(C) and p(C|?) with laplace correction so we
// avoid zeros and can do KL distance.
q[i] = (nominCounts.label_count(i) + 1)/(trainWeight + numLabelValues);
MLJ.ASSERT(support > 0,"NaiveBayesCat.kl_distance()");
p[i]=(nominCounts.val_count(i, attrNum, Globals.UNKNOWN_CATEGORY_VAL) + 1)/(support + numLabelValues);
}
// now get the distance
// NOTE(review): "p=" + p concatenates the array's identity hash
// (e.g. "[D@1a2b3c"), not its contents; Arrays.toString(p) would be
// needed to log the actual values.
logOptions.LOG(3, "p=" + p + "\nq=" + q + endl);
double dist = this.kullback_leibler_distance(p, q);
logOptions.LOG(2, "k-l distance for attribute " + this.get_schema().attr_name(attrNum) +
" (" + attrNum + "): " + dist + endl);
return dist;
} /** Removed function *//* boolean operator==(Categorizer rhs) { }*/ /** Removed function *//* public void make_persistent(PerCategorizer_ dat) {
*/ /** Compute a Kullback Leibler distance metric given an array * of p(x) and q(x) for all x.
// (end of excerpt; non-source website hotkey-help text removed)