// naivebayescat.java
package nb;

import shared.AttrInfo;
import shared.AugCategory;
import shared.BagCounters;
import shared.CatDist;
import shared.Categorizer;
import shared.DisplayPref;
import shared.Entropy;
import shared.Error;
import shared.Globals;
import shared.Instance;
import shared.InstanceList;
import shared.MLJ;
import shared.NominalAttrInfo;
import shared.Schema;
import shared.StatData;
import java.io.BufferedWriter;
import java.io.IOException;

/** This categorizer returns the category (label) that had the
 * greatest relative probability of being correct, assuming
 * independence of attributes. The relative probability of a label
 * is calculated by multiplying the relative probability for
 * each attribute. The calculation of relative probability for a
 * label on a single attribute depends on whether the attribute
 * is discrete or continuous.
 *
 * By Bayes' Theorem, P(L=l | X1=x1, X2=x2, ..., Xn=xn)
 *   = P(X1=x1, X2=x2, ..., Xn=xn | L=l) * P(L=l) / P(X)
 * where P(X) is P(X1=x1, ..., Xn=xn).
 * Since P(X) is a constant independent of the classes, we
 * can ignore it.
 * The Naive Bayesian approach assumes complete independence
 * of the attributes GIVEN the label, thus
 *   P(X1=x1, X2=x2, ..., Xn=xn | L=l) =
 *     P(X1=x1|L=l) * P(X2=x2|L=l) * ... * P(Xn=xn|L=l)
 * and P(X1=x1|L=l) = P(X1=x1 ^ L=l) / P(L=l), where this
 * quantity is approximated from the data.
 * When the computed probabilities for two labels have the same
 * value, we break the tie in favor of the most prevalent label.
 *
 * If the instance being categorized has the first attribute = 1,
 * and in the training set label A occurred 20 times, 10 of
 * which had value 1 for the first attribute, then the
 * relative probability is 10/20 = 0.5.
 *
 * For continuous (real) attributes, the relative probability
 * is based on the Normal distribution of the values of the
 * attribute on training instances with the label. The actual
 * calculation is done with the Normal density; constants,
 * which do not affect the relative probability between labels,
 * are ignored. For example, say 3 training instances have
 * label 1 and these instances have the following values for a
 * continuous attribute: 35, 50, 65. The program would use the
 * mean and variance of this "sample" along with the attribute
 * value of the instance that is being categorized in the
 * Normal density equation. The evaluation of the Normal
 * density equation, without constant factors, provides the
 * relative probability.
 *
 * Unknown attributes are skipped over.
 *
 * Assumptions : This method calculates the probability of a label as the
 *    product of the probabilities of each attribute. This assumes that
 *    the attributes are independent, a condition not likely to
 *    correspond to reality. Thus the "Naive" of the title.
 *    This method assumes that all continuous attributes have a
 *    Normal distribution for each label value.
 *
 * Comments : For nominal attributes, if a label does not have
 *    any occurrences for a given attribute value of the test
 *    instance, a probability of
 *    noMatchesFactor * ( 1 / # instances in training set )
 *    is used.
 *
 *    For nominal attributes, if an attribute value does not
 *    occur in the training set, the attribute is skipped
 *    in the categorizer, since it does not serve to
 *    differentiate the labels.
 *
 *    The code can handle dealing with unknowns as a special
 *    value by doing the is_unknown test only in the real
 *    attribute case.
 *
 *    Helper class NBNorm is a simple structure to hold the
 *    parameters needed to calculate the Normal distribution
 *    of each (Attribute, Label) pair. The NBNorms are stored in
 *    an Array2 table "continNorm" which is indexed by attribute
 *    number and label value.
 *
 *    For continuous attributes the variance must not equal 0,
 *    since it is in the denominator. If the variance is undefined
 *    for a label value (e.g. if a label has only one instance
 *    in the training set), NaiveBayesInd will declare the
 *    variance to be defaultVariance, a static variable. In
 *    cases where the variance is defined but equal to 0,
 *    NaiveBayesInd will declare the variance to be epsilon,
 *    a very small static variable.
 *
 *    For continuous attributes, if a label does not occur in
 *    the training set, a zero relative probability is
 *    assigned. If a label occurs in the training set but only
 *    has unknown values for the attribute, noMatchesFactor is
 *    used as in the nominal attribute case above.
 *
 * Complexity : categorize() is O(ln) where l = the number of categories
 *    and n = the number of attributes.
 *
 * @author James Plummer 5/15/2001 Ported to Java
 * @author Eric Bauer and Clay Kunz 5/24/1996 Added Laplace correction
 * @author Robert Allen 12/03/94 Initial revision
 */
public class NaiveBayesCat extends Categorizer {
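// Worked example of the scoring described above (illustrative, not from
// the original source): with labels A (20 training instances) and B (10),
// and a test instance whose first attribute = 1 appeared in 10 of A's
// instances and 1 of B's, the unnormalized scores are
//   P(A) * P(x1=1|A) = (20/30) * (10/20) = 1/3   versus
//   P(B) * P(x1=1|B) = (10/30) * (1/10) = 1/30,
// so label A is predicted.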
public final static String endl = "\n";
// Member data (also see public data)
private BagCounters nominCounts; // hold data on nominal attributes
private NBNorm[][] continNorm; // hold data on real attributes
private double trainWeight;
private int numAttributes;
private boolean useLaplace; // turn on to activate Laplace correction
private double mEstimateFactor; // noise in Laplace correction
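// Note: score() is outside this excerpt, so the exact smoothing formula
// is an assumption. With the Laplace correction enabled, each estimate is
// presumably smoothed as an m-estimate, roughly
//   P(X=x | L=l) ~ (count(x,l) + m*p) / (count(l) + m)
// with prior p = 1/numValues and m = mEstimateFactor, instead of the raw
// frequency ratio.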
private double[] attrImportance; // importance values per attribute
private boolean[] unkIsVal; // should unknowns be special values? (decision per attribute)
/** Ported from C++:
 * enum UnknownIsValueEnum { unknownNo, unknownYes, unknownAuto }; // C++ equivalent
 */
public static final int unknownNo = 1;
public static final int unknownYes = 2;
public static final int unknownAuto = 3;
private int unknownIsValue; // one of unknownNo, unknownYes, unknownAuto
private double klThreshold;
/** Fraction of a single occurrence to use in cases when a label
 * has no occurrences of a given nominal value in the training set: */
private double noMatchesFactor;
/** If true, evidence projection is used. */
private boolean useEvidenceProjection;
/** The scale factor to use with evidence projection. */
private double evidenceFactor;
/** Categorizer option defaults. */
public static final double defaultMEstimateFactor = 1.0;
public static final boolean defaultLaplaceCorrection = false;
public static final int defaultUnknownIsValue = unknownNo;
public static final double defaultKLThreshold = 0.1;
public static final double defaultNoMatchesFactor = 0.0;
public static final boolean defaultUseEvidenceProjection = false;
public static final double defaultEvidenceFactor = 1.0;
/** Value to use for variance when the actual variance = 0: */
public static final double epsilon = .01;
/** Value to use for variance when the actual variance is undefined
 * because there is only one occurrence. */
public static final double defaultVariance = 1.0;
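/* Illustrative sketch (not in the original port): the continuous-attribute
 * relative probability described in the class comment, i.e. the Normal
 * density with the label-independent 1/sqrt(2*pi) constant dropped. The
 * stored variance is guaranteed nonzero by the epsilon/defaultVariance
 * substitutions above. */
private static double relativeNormalDensity(double x, double mean, double var) {
    double diff = x - mean;
    return Math.exp(-diff * diff / (2.0 * var)) / Math.sqrt(var);
}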
/** Constructor.
 * @param dscr - the description of this Inducer.
 * @param instList - training data. */
public NaiveBayesCat(String dscr, InstanceList instList) {
super(instList.num_categories(), dscr, instList.get_schema());
nominCounts = instList.counters();
trainWeight = instList.total_weight();
numAttributes = instList.num_attr();
logOptions.LOG(3, "NBC . . numAttributes = "+numAttributes);
useLaplace = defaultLaplaceCorrection;
mEstimateFactor = defaultMEstimateFactor;
unkIsVal = null;
unknownIsValue = defaultUnknownIsValue;
klThreshold = defaultKLThreshold;
noMatchesFactor = defaultNoMatchesFactor;
useEvidenceProjection = defaultUseEvidenceProjection;
evidenceFactor = defaultEvidenceFactor;
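// Precompute what the categorizer needs at scoring time: per-attribute
// importances (mutual information) and, for continuous attributes, the
// per-(attribute, label) Normal parameters.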
attrImportance = this.compute_importance(instList);
continNorm = this.compute_contin_norm(instList);
}

/** Copy Constructor.
 * @param source - the NaiveBayesCat to copy. */
public NaiveBayesCat(NaiveBayesCat source) {
super(source.num_categories(), source.description(), source.get_schema());
nominCounts = new BagCounters(source.nominCounts);
continNorm = source.copyContinNorm();
attrImportance = source.copyAttrImportance();
trainWeight = source.trainWeight;
numAttributes = source.numAttributes;
useLaplace = source.useLaplace;
mEstimateFactor = source.mEstimateFactor;
unkIsVal = null;
unknownIsValue = source.unknownIsValue;
klThreshold = source.klThreshold;
noMatchesFactor = source.noMatchesFactor;
useEvidenceProjection = source.useEvidenceProjection;
evidenceFactor = source.evidenceFactor;
}

/** Categorizes a single instance based upon the training data.
 * @param instance - the instance to categorize.
 * @return the predicted category. */
public AugCategory categorize(Instance instance) {
CatDist cDist = score(instance);
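// score() (not shown in this excerpt) builds the per-label distribution;
// best_category() picks the highest-scoring label, breaking ties toward
// the more prevalent label as described in the class comment.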
AugCategory cat = cDist.best_category();
return cat;
}

/** Simple method to return an ID.
 * @return - an int representing this Categorizer.
 * @deprecated CLASS_NB_CATEGORIZER has been deprecated */
public int class_id() {return CLASS_NB_CATEGORIZER;}
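// Example usage (illustrative; assumes an InstanceList "train" built
// elsewhere):
//   NaiveBayesCat nbc = new NaiveBayesCat("naive bayes", train);
//   AugCategory predicted = nbc.categorize(someInstance);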
/** Returns a deep copy of this NaiveBayesCat.
 * @return - the copy of this Categorizer. */
public Object clone() {
    if ( !(this instanceof NaiveBayesCat) ) {
        Error.fatalErr("NaiveBayesCat.clone: invoked for improper class");
    }
    return new NaiveBayesCat(this);
}

/** Compute the norms of the continuous attributes.
 * @param instList - the instances to calculate.
 * @return the array[][] of NBNorms. */
public static NBNorm[][] compute_contin_norm(InstanceList instList) {
int contAttrCount = 0;
int numCategories = instList.num_categories();
Schema schema = instList.get_schema();
int numAttributes = schema.num_attr();
// The C++ original indexed labels from -1 (for unknown), so allot one
// extra label slot here.
NBNorm[][] normDens = new NBNorm[numAttributes][numCategories + 1]; // no initial value
for (int m=0; m<normDens.length;m++) {
for (int n=0; n<normDens[m].length;n++) {
normDens[m][n] = new NBNorm();
normDens[m][n].set_mean_and_var(0,0);
}
}
// loop through each attribute, and process all instances for each
// continuous one
for (int attrNum = 0; attrNum < numAttributes; attrNum++) {
AttrInfo attrinfo = schema.attr_info(attrNum);
if (attrinfo.can_cast_to_real()) {
// this is a continuous attribute
contAttrCount++;
// read each occurrence in the list and feed the stats for the attribute
StatData[] continStats = new StatData[numCategories + 1];
for (int j=0; j<continStats.length;j++) {
continStats[j]=new StatData();
}
// The C++ original iterated with a pix: for (ILPix pix(instList); pix; ++pix)
for (int i = 0; i < instList.num_instances(); i++) {
Instance inst = new Instance((Instance)instList.instance_list().get(i));
// For some reason the label values for the instances are one number
// higher than the actual value.
int labelVal = schema.label_info().cast_to_nominal().get_nominal_val(inst.get_label());
MLJ.ASSERT(labelVal < numCategories, " NaiveBayesCat.compute_contin_norm()");
// Ignore unknowns.
if ( !attrinfo.is_unknown(inst.get_value(attrNum))) {
double value = attrinfo.get_real_val(inst.get_value(attrNum));
continStats[labelVal].insert( value );
}
}
double mean;
double var;
// extract Normal Density parameters into normDens table
for (int label = 0; label < numCategories; label++) {
if (continStats[label].size() == 0 ) {
mean = 0;
var = defaultVariance;
}
else {
mean = continStats[label].mean();
if (continStats[label].size() == 1 )
var = defaultVariance;
else if ( (var = continStats[label].variance(0))<=0 ) // var == 0
var = epsilon;
}
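// Worked example (illustrative, assuming StatData.variance(0) is the
// usual (n-1) sample variance): the class-comment sample 35, 50, 65
// gives mean = 50 and variance ((-15)^2 + 0^2 + 15^2) / 2 = 225 for
// that (attribute, label) pair.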
normDens[attrNum][label].set_mean_and_var(mean,var);
//@@ pass in a log option?
//LOG(3, " Continuous Attribute # " << attrNum <<
//", Label " << label << ": Mean = " << mean <<
//", Variation = " << var << endl );
}
} // end of handling this continuous attribute
} // end of loop through all attributes
if (contAttrCount==0) { // no continuous attributes found
normDens = null;
}
return normDens;
}

/** Computes importance values for each nominal attribute using
 * the mutual_info (entropy).
 * Static function; used as a helper by train() below.
 * @param instList - the instances to use.
 * @return - the array[] of importance values. */
public static double[] compute_importance(InstanceList instList) {
double[] attrImp = new double[instList.num_attr()];
for (int i = 0; i < attrImp.length; i++) {
attrImp[i] = 0;
}
double ent = Entropy.entropy(instList);
if (ent == Globals.UNDEFINED_REAL) {
Error.fatalErr("compute_importance: undefined entropy");
}
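// Floating-point rounding can make the entropy come out as a tiny
// negative number; clamp such values to 0.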
if(ent < 0 && -ent < MLJ.realEpsilon) {
ent = 0;
}
for (int i=0; i<instList.num_attr(); i++) {
if(instList.get_schema().attr_info(i).can_cast_to_real()) {
attrImp[i] = 0;
}
else if(instList.get_schema().attr_info(i).can_cast_to_nominal()) {
if(ent <= 0) {