?? vfem.c
字號:
#include "vfml.h"#include "vfem-engine.h"#include <stdio.h>#include <string.h>#include <sys/times.h>#include <time.h>#include <math.h>char *gFileStem = "DF";char *gSourceDirectory = ".";int gDoTests = 0;int gMessageLevel = 0;int gNumClusters = -1;float gDelta = 0.05;float gErrorTarget = 0.01;float gThisErrorTarget = 0.01;int gEstimatedNumIterations = 5;int gBatch = 0;long gMaxExamplesPerIteration = 1000000000;int gInitNumber = 1;int gFancyStop = 1;float gConvergeDelta = 0.001;int gSeed = -1;float gR = 0;float gAssignErrorScale = 1.0;int gAllowBadConverge = 0;float gSigmaSquare = .05;int gStdin = 0;int gIterativeResults = 0;int gTestOnTrain = 0; /* default test total center-matching distance */int gNormalApprox = 0;long gDBSize = 0;/* HERE add parameters for these */int gDoEarlyStop = 0;int gLoadCenters = 0;int gReassignCentersEachRound = 0;int gUseGeoffBound = 1;/* HERE some hackish globals & stuff*/VoidAListPtr gStatsList;int gD;int gIteration = 0;int gRound = 0;int gTotalExamplesSeen = 0;float gNeededDelta;float gTargetEkd;long gN;float *gIterationNs = 0;int gNumIterationNs;static void _printUsage(char *name) { printf("%s : Very Fast Expectation Maximization\n", name); printf("-f <filestem>\t Set the name of the dataset (default DF)\n"); printf("-source <dir>\t Set the source data directory (default '.')\n"); printf("-clusters <numClusters>\t Set the num clusters to find (REQUIRED)\n"); printf("-variance <value>\t The variance of the Gaussians, the current\n\t\t version of EM needs this as a parameter (default 0.05)\n"); printf("-assignErrorScale <scale>\t Loosen bound by scaling the assignment\n\t\t part of it by the scale factor (default 1.0)\n"); printf("-delta <delta>\t Find final solution with confidence (1 - <delta>)\n\t\t (default 5%%)\n"); printf("-epsilon <epsilon>\t The maximum distance between a learned\n\t\t centroid and the coresponding infinite\n\t\t data centroid (default .01)\n"); printf("-converge <distance>\t Stop when distance centroids move between\n\t\t iterations squared is less than <distance> (default .001)\n"); printf("-batch\t Do a traditional kmeans run on the whole input file. Doesn't\n\t\t make sense to combine with -stdin (default off)\n"); printf("-init <num>\t use the <num>th valid assignment of initial centroids\n\t\t (default 1)\n"); printf("-maxPerIteration <num> \t use no more than num examples per\n\t\t iteration (default 1,000,000,000)\n"); printf("-l <number>\t The estimated # of iterations to converge if you\n\t\t guess wrong and aren't using batch VFEM will fix it for you\n\t\t and do an extra round (default 10)\n"); printf("-seed <s>\t Seed to use picking initial centers (default random)\n"); printf("-progressive\t Don't use the Lagrange based optimization\n\t\t to pick the # of samples at each iteration of the\n\t\t next round (default use the optimization)\n"); printf("-tightBound\t Use a tighter assignment error bound (requires\n\t\t a second pass over the dataset) (default use a\n\t\t looser one-pass bound)\n"); printf("-allowBadConverge\t Allows VFEM to converge at a time when\n\t\t em wouldn't (default off).\n"); printf("-r <range>\t The maximum range between pairs of examples\n\t\t (default assume the range of each dimension is 0 - 1.0\n\t\t and calculate it from that)\n"); printf("-stdin \t\t Reads training examples as a stream from stdin\n\t\t instead of from <stem>.data (default off)\n"); printf("-testOnTrain \t loss is the sum of the square of the distances\n\t\t from allu training examples to their assigned\n\t\t centroid (default compare learned centroids to\n\t\t centroids in <stem>.test)\n"); printf("-normalApprox \t use a normal approximation of the Hoeffding\n\t\t bound (default use hoeffding bound)\n"); printf("-loadCenters \t load initial centroids from <stem>.centers\n\t\t (default use random based on -init and -seed)\n"); printf("-u\t\t Output results after each iteration\n"); printf("-v\t\t Can be used multiple times to increase the debugging output\n");}static void _processArgs(int argc, char *argv[]) { int i; /* HERE on the ones that use the next arg make sure it is there */ for(i = 1 ; i < argc ; i++) { if(!strcmp(argv[i], "-f")) { gFileStem = argv[i+1]; /* ignore the next argument */ i++; } else if(!strcmp(argv[i], "-source")) { gSourceDirectory = argv[i+1]; /* ignore the next argument */ i++; } else if(!strcmp(argv[i], "-u")) { gDoTests = 1; } else if(!strcmp(argv[i], "-testOnTrain")) { gTestOnTrain = 1; } else if(!strcmp(argv[i], "-clusters")) { sscanf(argv[i+1], "%d", &gNumClusters); /* ignore the next argument */ i++; } else if(!strcmp(argv[i], "-dbSize")) { sscanf(argv[i+1], "%ld", &gDBSize); /* ignore the next argument */ i++; } else if(!strcmp(argv[i], "-delta")) { sscanf(argv[i+1], "%f", &gDelta); /* ignore the next argument */ i++; } else if(!strcmp(argv[i], "-converge")) { sscanf(argv[i+1], "%f", &gConvergeDelta); /* ignore the next argument */ i++; } else if(!strcmp(argv[i], "-r")) { sscanf(argv[i+1], "%f", &gR); /* ignore the next argument */ i++; } else if(!strcmp(argv[i], "-assignErrorScale")) { sscanf(argv[i+1], "%f", &gAssignErrorScale); /* ignore the next argument */ i++; } else if(!strcmp(argv[i], "-variance")) { sscanf(argv[i+1], "%f", &gSigmaSquare); /* ignore the next argument */ i++; } else if(!strcmp(argv[i], "-epsilon")) { sscanf(argv[i+1], "%f", &gErrorTarget); gThisErrorTarget = gErrorTarget; /* ignore the next argument */ i++;// } else if(!strcmp(argv[i], "-n")) {// sscanf(argv[i+1], "%ld", &gN);// /* ignore the next argument */// i++; } else if(!strcmp(argv[i], "-maxPerIteration")) { sscanf(argv[i+1], "%ld", &gMaxExamplesPerIteration); /* ignore the next argument */ i++; } else if(!strcmp(argv[i], "-batch")) { gBatch = 1; } else if(!strcmp(argv[i], "-allowBadConverge")) { gAllowBadConverge = 1; } else if(!strcmp(argv[i], "-progressive")) { gFancyStop = 0; } else if(!strcmp(argv[i], "-tightBound")) { gUseGeoffBound = 0; } else if(!strcmp(argv[i], "-init")) { sscanf(argv[i+1], "%d", &gInitNumber); /* ignore the next argument */ i++; } else if(!strcmp(argv[i], "-l")) { sscanf(argv[i+1], "%d", &gEstimatedNumIterations); /* ignore the next argument */ i++; } else if(!strcmp(argv[i], "-seed")) { sscanf(argv[i+1], "%d", &gSeed); /* ignore the next argument */ i++; } else if(!strcmp(argv[i], "-normalApprox")) { gNormalApprox = 1; } else if(!strcmp(argv[i], "-loadCenters")) { gLoadCenters = 1; } else if(!strcmp(argv[i], "-v")) { gMessageLevel++; } else if(!strcmp(argv[i], "-h")) { _printUsage(argv[0]); exit(0); } else if(!strcmp(argv[i], "-stdin")) { gStdin = 1; } else { printf("Unknown argument: %s. use -h for help\n", argv[i]); exit(0); } } if(gNumClusters < 0) { printf("Didn't supply the required '-clusters' argument\n"); exit(0); } if(gDBSize <= 0) { printf("Didn't supply the required '-dbSize' argument\n"); exit(0); } if(gMessageLevel >= 1) { printf("Stem: %s\n", gFileStem); printf("Source: %s\n", gSourceDirectory); if(gStdin) { printf("Reading examples from standard in.\n"); } if(gDoTests) { printf("Running tests\n"); } printf("DB Size %ld\n", gDBSize); }}static float _MatchCentersGetDistanceSquare(VoidAListPtr learnedCenters, VoidAListPtr testCenters) { /* perfom a 1 to 1 matching between the learned and test centers. change the class lables of the learnedCenters to correspond to the closest testCenters. Do a greedy assignment that tries to minimize the total distance between assigned pairs */ int i, j, init; double bestDistance, thisDistance; int lBestIndex, tBestIndex; VoidAListPtr unLearned = VALNew(); VoidAListPtr unTest = VALNew(); float totalDistance = 0.0; for(i = 0 ; i < VALLength(learnedCenters) ; i++) { VALAppend(unLearned, VALIndex(learnedCenters, i)); } for(i = 0 ; i < VALLength(testCenters) ; i++) { VALAppend(unTest, VALIndex(testCenters, i)); } bestDistance = lBestIndex = tBestIndex = 0; while(VALLength(unLearned) > 0) { init = 0; for(i = 0 ; i < VALLength(unLearned) ; i++) { for(j = 0 ; j < VALLength(unTest) ; j++) { thisDistance = ExampleDistance(VALIndex(unLearned, i), VALIndex(unTest, j)); if(thisDistance < bestDistance || !init) { init = 1; bestDistance = thisDistance; lBestIndex = i; tBestIndex = j; } } } if(gMessageLevel > 0) { printf("\tMatched cluster %d distance %lf\n", ExampleGetClass(VALIndex(unTest, tBestIndex)), ExampleDistance(VALIndex(unLearned, lBestIndex), VALIndex(unTest, tBestIndex))); } totalDistance += bestDistance * bestDistance; ExampleSetClass(VALRemove(unLearned, lBestIndex), ExampleGetClass(VALRemove(unTest, tBestIndex))); } VALFree(unLearned); VALFree(unTest); return totalDistance;}static int _FindClosestCenterIndex(ExamplePtr e, VoidAListPtr centers) { int i; double bestDistance, thisDistance; int bestIndex; bestIndex = 0; bestDistance = ExampleDistance(e, VLIndex(centers, 0)); for(i = 1 ; i < VLLength(centers) ; i++) { thisDistance = ExampleDistance(e, VLIndex(centers, i)); if(thisDistance < bestDistance) { bestDistance = thisDistance; bestIndex = i; } } return bestIndex;}static ExamplePtr _FindClosestCenter(ExamplePtr e, VoidAListPtr centers) { return VALIndex(centers, _FindClosestCenterIndex(e, centers));}static float _CalculateIDKMEarlyBound(void) { int i, j, k; float maxError, thisError, bound, cError; IterationStatsPtr isL, isI; ExamplePtr eL, eI; isL = VALIndex(gStatsList, VALLength(gStatsList) - 1); maxError = 0; for(i = 0 ; i < VALLength(gStatsList) ; i++) { isI = VALIndex(gStatsList, i); if(isI->possibleIDConverge) { thisError = 0; for(j = 0 ; j < VALLength(isI->centroids) ; j++) { eL = VALIndex(isL->centroids, j); eI = VALIndex(isI->centroids, j); cError = 0; for(k = 0 ; k < ExampleGetNumAttributes(eL) ; k++) { /* HERE fix for discrete */ bound = ExampleGetContinuousAttributeValue(eL, k) - ExampleGetContinuousAttributeValue(eI, k) + IterationStatsErrorBoundDimension(isI, j, k); cError += pow(bound, 2); } thisError += cError; //printf("i%d c%d cError %f totalError %f\n", // i, j, cError, thisError); } if(thisError > maxError) { maxError = thisError; } } } return maxError;}static float _CalculateErrorBound(void) { int i; float maxError = 0; float earlyError; IterationStatsPtr isL; isL = VALIndex(gStatsList, VALLength(gStatsList) - 1); /* find the error from the final iteration */ for(i = 0 ; i < VALLength(isL->centroids) ; i++) { maxError += pow(isL->lastBound[i], 2); if(gMessageLevel > 2) { printf("max ekd %.4f sum bnd^2 %.4f bnd%d %.4f bnd^2%d %.4f\n", isL->maxEkd, maxError, i, isL->lastBound[i], i, pow(isL->lastBound[i], 2)); } } if(!gAllowBadConverge) { earlyError = _CalculateIDKMEarlyBound(); if(earlyError > maxError) { if(gMessageLevel > 1) { printf("early stop error (%.5f) greater than El (%.5f)\n", earlyError, maxError); } maxError = earlyError; } } return maxError;}static int _WhenEMConverged(void) { int i; IterationStatsPtr is; for(i = 0 ; i < VLLength(gStatsList) ; i++) { is = VLIndex(gStatsList, i); if(is->wouldEMConverge) { return i; } } return VLLength(gStatsList) - 1;}static void _doTests(ExampleSpecPtr es, VoidAListPtr learnedCenters, long learnCount, long learnTime, int finalOutput) { char fileNames[255]; FILE *exampleIn, *testCentersIn, *centersOut; ExamplePtr e, tc, lc; VoidAListPtr testCenters; long i; float loss; float bound; int foundBound, guarenteeIDConverge, lastFoundBound; long tested = 0; foundBound = 0; if(VALLength(gStatsList) > 0) { lastFoundBound = ((IterationStatsPtr)VALIndex(gStatsList, VALLength(gStatsList) - 1))->foundBound; guarenteeIDConverge = ((IterationStatsPtr)VALIndex(gStatsList, VALLength(gStatsList) - 1))->guarenteeIDConverge; if(lastFoundBound) { if(guarenteeIDConverge) { foundBound = 1; if(gMessageLevel >= 1) { printf("Have a bound and ID would converge\n"); } } else if(((IterationStatsPtr)VALIndex(gStatsList, VALLength(gStatsList) - 1))->wouldEMConverge && gAllowBadConverge) { if(gMessageLevel >= 1) { printf("Have a bound and ID may or may not converge, we converge.\n");
?? 快捷鍵說明
復(fù)制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -