?? bayesnet.cpp
字號:
#include <string>#include <iostream>#include <sstream>#include <expat.h>#include <list>#include "BayesNet.h"#include "DecisionTree.h"// HACK -- leaving this as module data prevents us from// parsing more than one BayesNet at a time.string elementCdata = "";DecisionTree* dtree = NULL;int currVar = -1;bool inTypeVariables = false;int maxVarState = -1;int getVarIndex(const char* colname){ int ret = 0; int pow = 1; for (int i = strlen(colname) - 1; isdigit(colname[i]); i--) { ret += pow * (colname[i] - '0'); pow *= 10; } return ret - 1;}const XML_Char* getAtt(const XML_Char* attName, const XML_Char** attList){ for (int i = 0; attList[i]; i += 2) { if (!strcmp(attList[i], attName)) { return attList[i+1]; } } return NULL;}void characterDataHandler(void* userData, const XML_Char *cdata, int len){ elementCdata.append(cdata, len);}void startElementHandler(void* userData, const XML_Char *name, const XML_Char **atts){ // DEBUG //cout << "Found start element: " << name << endl; BayesNet* model = (BayesNet*)userData; if (!strcmp(name, "TypeVariables")) { inTypeVariables = true; } else if (!strcmp(name, "Variable")) { // HACK: this only works if every single variable // has a typeVariable with a name of the form: // "TypeK n-state" OR "TypeK continuous" if (!inTypeVariables) { const char* typeVarName = getAtt("typeVariable", atts); int numStates = -1; // DEBUG / HACK -- Really ought to handle general case, // when typeVariable might not be specified. if (typeVarName != NULL) { if (strstr(typeVarName, "ontinuous")) { model->addVar(-1); } else { // We expect the number of states to appear after // a space, in a name of the form: "TypeK n-state" for (unsigned int i = 0; i < strlen(typeVarName); i++) { if (isspace(typeVarName[i])) { numStates = atoi(typeVarName + i + 1); break; } } model->addVar(numStates); } } else { const char* type = getAtt("type", atts); if (type == NULL) { cout << "ERROR: no type variable or type specified!\n"; return; } // Add continuous vars now. Add categorical vars once we've // counted their states. if (!strcmp(type, "continuous")) { model->addVar(-1); } } } } else if (!strcmp(name, "State")) { if (!inTypeVariables) { maxVarState++; } } else if (!strcmp(name, "LocalModel")) { const char* varName = getAtt("variable", atts); if (!varName) { cout << "ERROR: no variable specified in LocalModel!\n"; } currVar = getVarIndex(varName); // DEBUG //cout << "Reading model for variable " << currVar << endl; } else if (!strcmp(name, "Input")) { const char* varName = getAtt("variable", atts); if (!varName) { cout << "ERROR: no variable specified in Input!\n"; } model->addChild(getVarIndex(varName), currVar); } else if (!strcmp(name, "DecisionTree")) { // dtree = new DecisionTree(currVar, model->schema.getRange(currVar)); dtree = model->getDecisionTree(currVar); } else if (!strcmp(name, "Vertex")) { const char* splitName = getAtt("split", atts); if (!splitName) { cout << "ERROR: no split variable specified at Vertex!\n"; } dtree->beginVertex(getVarIndex(splitName)); } else if (!strcmp(name, "Branch")) { dtree->beginBranch(); } else if (!strcmp(name, "Multinomial")) { dtree->beginMultinomial(); } elementCdata.clear();}void endElementHandler(void* userData, const XML_Char *name){ // DEBUG //cout << "Found end element: " << name << endl; BayesNet* model = (BayesNet*)userData; if (!strcmp(name, "TypeVariables")) { inTypeVariables = false; } else if (!strcmp(name, "Variable")) { if (!inTypeVariables) { if (maxVarState >= 0) { model->addVar(maxVarState + 1); } maxVarState = -1; } } else if (!strcmp(name, "DecisionTree")) { /* model->setDecisionTree(currVar, dtree); dtree = NULL; */ } else if (!strcmp(name, "Vertex")) { dtree->endVertex(); } else if (!strcmp(name, "Branch")) { dtree->endBranch(); } else if (!strcmp(name, "Values")) { list<Range> values; istringstream in(elementCdata); while (in) { Range r; in >> r; if (in) { values.push_back(r); } } dtree->endValues(values); } else if (!strcmp(name, "Probs")) { unsigned int base = 0; int index = 0; double *probs = new double[dtree->getRange()]; // DEBUG //cout << "Probs: " << elementCdata << " ("; for (unsigned int i = 0; i < elementCdata.length(); i++) { if (isspace(elementCdata[i])) { if (i > base) { probs[index++] = atof(elementCdata.substr(base, i-base).c_str()); // DEBUG //cout << probs[index-1] << " "; } base = i + 1; } } if (elementCdata.length() > base) { probs[index++] = atof(elementCdata.substr(base).c_str()); // DEBUG //cout << probs[index-1] << " "; } // DEBUG //cout << ")\n"; dtree->endProbs(probs); } else if (!strcmp(name, "Mean")) { dtree->endMean(atof(elementCdata.c_str())); } else if (!strcmp(name, "SD")) { dtree->endSD(atof(elementCdata.c_str())); } else if (!strcmp(name, "ProbMissing")) { dtree->endProbMissing(atof(elementCdata.c_str())); } else if (!strcmp(name, "BinGaussian")) { dtree->endBinGaussian(); } elementCdata.clear();}// DEBUG#include <stdio.h>void loadModel(FILE* in, BayesNet& model){ XML_Parser parser = XML_ParserCreate(NULL); if (!parser) { cout << "ERROR: could not allocate memory for parser!\n"; return; } XML_SetUserData(parser, &model); XML_SetElementHandler(parser, startElementHandler, endElementHandler); XML_SetCharacterDataHandler(parser, characterDataHandler); char buf[10240]; while (!feof(in)) { int bytesRead = fread(buf, 1, 1000, in); /* DEBUG ((char*)buf)[bytesRead] = '\0'; printf((char*)buf); cout << "in.eof()? " << feof(in) << endl; */ if ( !XML_Parse(parser, buf, bytesRead, feof(in)) ) { cout << "Parse error at line " << XML_GetCurrentLineNumber(parser) << ":\n" << XML_ErrorString(XML_GetErrorCode(parser)) << endl; return; } } XML_ParserFree(parser);// We have no reason at present to get the splits of a continuous variable.// This was debugging code for functionality that's no longer used.#if 0 for (int i = 0; i < model.getNumVars(); i++) { if (model.getRange(i) >= 0) { continue; } // Find all splits list<double> allsplits; for (int v = 0; v < model.getNumVars(); v++) { DecisionTree* currTree = model.getDecisionTree(v); list<double> currsplits = currTree->getSplits(i); allsplits.splice(allsplits.end(), currsplits); } allsplits.sort(); // Print out values cout << i << ":"; list<double>::iterator iter; for (iter = allsplits.begin(); iter != allsplits.end(); iter++) { cout << " " << *iter; } cout << endl; }#endif return;}vector<double> BayesNet::MBdist(int var, VarSet& allVars) const{ const Leaf* l = decisionTrees[var]->getLeaf(allVars.getArray()); int numVals = decisionTrees[var]->getRange(); vector<double> distrib(numVals); double totalProb = -1.0/0.0000000000001; // Compute probability distribution across all values, // using Markov-blanket inference. double origVal = allVars[var]; for (int i = 0; i < numVals; i++) { // Compute p(x | parents(x)) distrib[i] = l->getLogProb(i); // DEBUG if (isnan(distrib[i])) { cout << "distrib[" << i << "] = 0.0!\n"; } // Scale by p(children(x) | parents(children(x))) list<int>::const_iterator c; for (c = children[var].begin(); c != children[var].end(); c++) { allVars[var] = i; distrib[i] += getLogProb(*c, allVars.getArray()); // DEBUG if (isnan(distrib[i])) { cout << "distrib[" << i << "] = 0.0 after child " << *c << "\n"; } } // Keep track of total probability, so we can normalize if (totalProb - distrib[i] < -10) { // If distrib[i] is much larger, just use it. totalProb = distrib[i]; } else if (totalProb - distrib[i] < 10) { // If they're within the same range, add them totalProb = log(exp(totalProb - distrib[i]) + 1) + distrib[i]; } } allVars[var] = origVal; if (isnan(totalProb)) { cout << "totalProb = 0!\n"; } // Normalize for (int i = 0; i < numVals; i++) { distrib[i] = exp(distrib[i] - totalProb); } return distrib;}double BayesNet::MBsample(int var, VarSet& allVars) const{ const Leaf* l = decisionTrees[var]->getLeaf(allVars.getArray()); int numVals; vector<double> actualVals; if (decisionTrees[var]->getRange() < 0) { DT::BinGaussian* g = (DT::BinGaussian*)l; double minVal = g->mean - 4 * g->SD; double maxVal = g->mean + 4 * g->SD; int numIncs = 1000; double dx = (maxVal - minVal)/numIncs; numVals = numIncs; for (double currX = minVal; currX < maxVal; currX += dx) { actualVals.push_back(currX); } actualVals.push_back(VarSet::UNKNOWN); } else { numVals = decisionTrees[var]->getRange(); for (int i = 0; i < numVals; i++) { actualVals.push_back(i); } } vector<double> distrib(numVals); //double totalProb = 0.0; double totalProb = -1.0/0.0000000000001; // Compute probability distribution across all values, // using Markov-blanket inference. double origVal = allVars[var]; for (int i = 0; i < numVals; i++) { allVars[var] = actualVals[i]; // Compute p(x | parents(x)) distrib[i] = l->getLogProb(actualVals[i]); // Scale by p(children(x) | parents(children(x))) list<int>::const_iterator c; for (c = children[var].begin(); c != children[var].end(); c++) { distrib[i] += getLogProb(*c, allVars.getArray()); } // Keep track of total probability, so we can normalize if (totalProb - distrib[i] < -10) { // If distrib[i] is much larger, just use it. totalProb = distrib[i]; } else if (totalProb - distrib[i] < 10) { // If they're within the same range, add them totalProb = log(exp(totalProb - distrib[i]) + 1) + distrib[i]; } } allVars[var] = origVal; // Normalize for (int i = 0; i < numVals; i++) { distrib[i] = exp(distrib[i] - totalProb); } // Sample double p = (double)rand() / RAND_MAX; for (int i = 0; i < numVals; i++) { p -= distrib[i]; if (p < 0) { return actualVals[i]; } } // DEBUG cout << "Error: defaulting to last value!\n"; cout << "p = " << p << endl; cout << "var = " << var << endl; cout << "numVals = " << numVals << " (" << decisionTrees[var]->getRange() << ")\n"; for (int i = 0; i < numVals; i++) { cout << distrib[i] << " "; } cout << endl; return actualVals.back();}// Generate a complete samplevoid BayesNet::wholeSample(VarSet& allvars) const{ /* HACK -- * If you need a variable to be sampled, it must be * marked as UNTESTED in allvars before calling this function. * UNKNOWN is simply not sufficient, because UNKNOWN is a legal * value for continuous variables. (For discrete variables, * the unknown value is a separate value taken care of when building * the dataset. It must be handled specially for continuous variables.) */ int knownVars = 0; for (int i = 0; i < numVars; i++) { /* if (allvars.isObserved(i) && allvars.isTested(i)) { knownVars++; } */ if (allvars.isTested(i)) { knownVars++; } } // Sample each unknown variable whose parents have been specified. // Keep going until all variables have been sampled. while (knownVars < numVars) { for (int i = 0; i < numVars; i++) { if (/*!allvars.isObserved(i) || */ !allvars.isTested(i)) { bool unspecifiedParent = false; list<int>::const_iterator p; for (p = parents[i].begin(); p != parents[i].end(); p++) { if (/* !allvars.isObserved(*p) || */ !allvars.isTested(*p)) { unspecifiedParent = true; break; } } if (!unspecifiedParent) { allvars[i] = decisionTrees[i]->sample(allvars.getArray()); knownVars++; } } } }}
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -