?? vfem.c
字號:
printf(" c#%d: ", i); ExampleWrite(VALIndex(is->centroids, i), stdout); } fflush(stdout); } /* do the Ws */ denominator = 0; for(i = 0 ; i < VALLength(is->centroids) ; i++) { centroid = VLIndex(is->centroids, i); denominator += exp( (-1.0 / (2.0 * gSigmaSquare)) * pow(ExampleDistance(e, centroid), 2)); } for(i = 0 ; i < VALLength(is->centroids) ; i++) { centroid = VLIndex(is->centroids, i); numerator = exp( (-1.0 / (2.0 * gSigmaSquare)) * pow(ExampleDistance(e, centroid), 2));//if(i == 4) { printf("Denom: %.3f Numer: %.3f w: %.3f distance: %.3f, true: %d\n",// denominator, numerator, (numerator / denominator),// ExampleDistance(e, centroid), ExampleGetClass(e)); } is->w[i] += (numerator / denominator); for(j = 0 ; j < ExampleSpecGetNumAttributes(es) ; j++) { is->wx[i][j] += (numerator / denominator) * ExampleGetContinuousAttributeValue(e, j); } } if(gUseGeoffBound) { _RecordGeoffBoundInfo(e, is, es); } if(gMessageLevel > 3) { IterationStatsWrite(is, es, stdout); fflush(stdout); } ExampleFree(e); /* check to see if we should move to the next iteration */ if(!gBatch && gFancyStop && gIteration <= gNumIterationNs) { /* Test to see if this iteration is done */ if(seen >= gIterationNs[gIteration - 1]) { done = 1; } } else if(!gBatch && seen >= gN) { done = 1; } else if(seen > gMaxExamplesPerIteration) { done = 1; } if(!done) { /* if we didn't get stopped by the termination check get another */ e = ExampleRead(data, es); } } if(gMessageLevel > 1) { printf("Finished an iteration, n is %ld.\n", is->n); IterationStatsWrite(is, es, stdout); } newIs = IterationStatsNext(is, gNeededDelta, 1.0, gAssignErrorScale, es, !gUseGeoffBound, boundData); VALAppend(gStatsList, newIs); if(gMessageLevel > 1) { printf("exit iteration %d seen %d\n", gIteration, gTotalExamplesSeen); fflush(stdout); } if(newIs) { return _CheckConverganceUpdateStats(is, newIs); } else { /* we didn't converge, but this round will be stoped by the foundBound of 0 */ return 0; }}static ExamplePtr _PickInitalCentroid(ExampleSpecPtr es, VoidAListPtr centroids, FILE *data) { float minDistance; int done = 0; ExamplePtr e; int used; int i; minDistance = gR / ((float)gNumClusters * 4); //minDistance = gR / ((float)gNumClusters * 2); while(!done) { e = ExampleRead(data, es); DebugError(e == 0, "Unable to get enough unique initial centroids"); /* make sure this isn't too close to one in the list */ used = 0; for(i = 0 ; i < VALLength(centroids) && !used ; i++) { /* HERE make a parameter? */ if(ExampleDistance(e, VALIndex(centroids, i)) <= minDistance) { used = 1; } } /* if it's ok then use it */ if(!used) { done = 1; } else { ExampleFree(e); } } return e;}static void _PickInitialCentroids(ExampleSpecPtr es, VoidAListPtr centroids, FILE *data) { /* pick the first unique 'gNumClusters' points from the dataset to be centroids HERE should I add some randomness to this? */ int j; ExamplePtr e; char fileNames[255]; FILE *centersIn; /* burn some examples to make the selection be more random */ for(j = RandomRange(0, 10 * gNumClusters) ; j > 0 ; j--) { ExampleFree(ExampleRead(data, es)); } /* if instructed, read in the initial centroids */ if(gLoadCenters) { sprintf(fileNames, "%s.centers", gFileStem); centersIn = fopen(fileNames, "r"); if(centersIn) { if(gMessageLevel > 0) { printf("Loading inital centers from %s\n", fileNames); } e = ExampleRead(centersIn, es); while(e != 0) { VALAppend(centroids, e); e = ExampleRead(centersIn, es); } fclose(centersIn); } } while(VALLength(centroids) < gNumClusters) { VALAppend(centroids, _PickInitalCentroid(es, centroids, data)); } if(gMessageLevel > 0) { printf("loss for initial centroids:\n"); _doTests(es, centroids, 0, 0, 0); }}//static void _OutputGoodCentroids(float bound) {// int i;// char fileNames[255];// FILE *centersOut;// IterationStatsPtr thisIs, firstIs;// float totalError, errorThreshold; /* write out the centroids, if we didn't get a bound only write out the good ones, where lastBound < (100 / gNumClusers)% of the total lastBound error */// thisIs = VALIndex(gStatsList, VALLength(gStatsList) - 1);// firstIs = VALIndex(gStatsList, 0);// sprintf(fileNames, "%s.centers", gFileStem);// centersOut = fopen(fileNames, "w");// totalError = 0;// for(i = 0 ; i < VALLength(thisIs->centroids) ; i++) {// totalError += thisIs->lastBound[i];// }// errorThreshold = totalError / (float)gNumClusters; //IterationStatsWrite(thisIs, stdout);// if(gMessageLevel > 0) {// printf("output good centers, thresh: %f\n", errorThreshold);// }// for(i = 0 ; i < VALLength(thisIs->centroids) ; i++) {// /* if we finished with a bound or if the centroid's error was ok */// if((thisIs->guarenteeIDConverge && thisIs->foundBound &&// bound <= gThisErrorTarget) || // (thisIs->lastBound[i] < errorThreshold)) {// ExampleWrite(VALIndex(firstIs->centroids, i), centersOut);// }// }// fclose(centersOut);//}static void _OutputAllCentroids(float bound) { int i; char fileNames[255]; FILE *centersOut; IterationStatsPtr thisIs; thisIs = VALIndex(gStatsList, VALLength(gStatsList) - 1); sprintf(fileNames, "%s.centers", gFileStem); centersOut = fopen(fileNames, "w"); for(i = 0 ; i < VALLength(thisIs->centroids) ; i++) { ExampleWrite(VALIndex(thisIs->centroids, i), centersOut); } fclose(centersOut);}static void _OutputCentroidMovement(void) { IterationStatsPtr firstIs, lastIs; int i; float total, current, totalSquare; firstIs = VALIndex(gStatsList, 0); lastIs = VALIndex(gStatsList, VALLength(gStatsList) - 1); total = 0; totalSquare = 0; for(i = 0 ; i < VALLength(firstIs->centroids) ; i++) { current = ExampleDistance(VALIndex(firstIs->centroids, i), VALIndex(lastIs->centroids, i)); total += current; totalSquare += pow(current, 2); printf(" centroid %d moved: %f move^2: %f\n", i, current, pow(current, 2)); } printf(" total: %f total^2: %f\n", total, totalSquare);}static IterationStatsPtr _FindLastIsWithBound(void) { int i; IterationStatsPtr is; for(i = VALLength(gStatsList) - 1 ; i >= 0 ; i--) { is = VALIndex(gStatsList, i); if(is->foundBound) { return is; } } DebugWarn(1, "_FindLastIsWithBound didn't find a bound\n"); return 0;}static float _FindMedian(float *array, int len) { float *errorArray = MNewPtr(sizeof(float) * len); float tmp, median; int i, j; /* create the sorted error array which we'll need to find the median */ for(i = 0 ; i < len ; i++) { errorArray[i] = array[i]; } for(i = 0 ; i < len ; i++) { for(j = 0 ; j < len - (i + 1) ; j++) { if(errorArray[j] > errorArray[j + 1]) { tmp = errorArray[j + 1]; errorArray[j + 1] = errorArray[j]; errorArray[j] = tmp; } } } if(len % 2 == 0) { i = (len / 2) - 1; median = (errorArray[i] + errorArray[i + 1]) / 2.0; } else { i = ((len + 1) / 2) - 1; median = errorArray[i]; } MFreePtr(errorArray); return median;}VoidAListPtr _GetCentroidsForNextRound(FILE *data, ExampleSpecPtr es) { IterationStatsPtr is = _FindLastIsWithBound();// IterationStatsPtr is = VALIndex(gStatsList, VALLength(gStatsList) - 1); float median = _FindMedian(is->lastBound, VALLength(is->centroids)); IterationStatsPtr iIs = VALIndex(gStatsList, 0); VoidAListPtr newCentroids = VALNew(); int i; //IterationStatsWrite(is, stdout); for(i = 0 ; i < VALLength(is->centroids) ; i++) { if(is->lastBound[i] <= median * 5) { VALAppend(newCentroids, ExampleClone(VALIndex(iIs->centroids, i))); //VALAppend(newCentroids, ExampleClone(VALIndex(is->centroids, i))); } else if(gMessageLevel > 1) { printf(" Reassigning centroid %d bound %f median %f\n", i, is->lastBound[i], median); fflush(stdout); } } while(VALLength(newCentroids) < VALLength(is->centroids)) { VALAppend(newCentroids, _PickInitalCentroid(es, newCentroids, data)); } return newCentroids;}/* this should be static */void CalculateExamplesPerIteration(VoidAListPtr last, float **nextNiOut, int *num);int main(int argc, char *argv[]) { char fileNames[255]; FILE *exampleIn = 0, *boundDataIn = 0; ExampleSpecPtr es; ExamplePtr e; VoidListPtr centers, newCenters = 0; float iterationNSum; long learnTime; int i; int breakOut; int fileDone; long nIncrement; float lastDelta, bound; struct tms starttime; struct tms endtime; IterationStatsPtr thisIs, lastIs; _processArgs(argc, argv); if(gStdin) { /* This is a hack because when I pipe clusterdata to vfem vfem tries to read the spec before clusterdata can write it */ sleep(5); } sprintf(fileNames, "%s/%s.names", gSourceDirectory, gFileStem); es = ExampleSpecRead(fileNames); DebugError(es == 0, "Unable to open the .names file"); RandomInit(); /* seed for the concept */ if(gSeed != -1) { RandomSeed(gSeed); } else { gSeed = RandomRange(1, 30000); RandomSeed(gSeed); } if(gMessageLevel > 0) { printf("running with seed %d\n", gSeed); } /* initialize some globals */ gStatsList = VALNew(); gD = ExampleSpecGetNumAttributes(es); if(gR == 0) { gR = sqrt(gD); } if(!gAllowBadConverge) { /* use a tighter bound so we get a better convergence behavior */ gThisErrorTarget = min(gErrorTarget, gConvergeDelta / 3.0); } else { /* HERE this is a hack for now, take it out!! */ gThisErrorTarget = gErrorTarget; //gThisErrorTarget = min(gErrorTarget, gConvergeDelta); } gNeededDelta = 1.0 - pow(1.0 - gDelta, 1.0 / (float)(gD * gNumClusters * gEstimatedNumIterations)); gTargetEkd = sqrt(gThisErrorTarget / ((float)gNumClusters * (float)gD)); /* HERE fix this for the new bound */ gN = (gNumClusters / 2.0) * pow(1.0/gTargetEkd, 2) * log(2.0/gNeededDelta) * 1.1; if(gMessageLevel > 1) { printf("Target Ekd: %.3lf\n", gTargetEkd); } if(gStdin) {
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -