garff.cpp
	{
		pRow = GetVector(n);
		pRow[nAttribute] -= dInputMin;
		pRow[nAttribute] *= dScale;
		pRow[nAttribute] += dOutputMin;
	}
}

/*static*/ double GArffData::Normalize(double dVal, double dInputMin, double dInputRange, double dOutputMin, double dOutputRange)
{
	GAssert(dInputRange > 0, "divide by zero");
	dVal -= dInputMin;
	dVal /= dInputRange;
	dVal *= dOutputRange;
	dVal += dOutputMin;
	return dVal;
}

double* GArffData::MakeSetOfMostCommonOutputs(GArffRelation* pRelation)
{
	int nOutputs = pRelation->GetOutputCount();
	double* pOutputs = new double[nOutputs];
	double* pRow;
	int nVal;
	int nIndex;
	int n;
	for(n = 0; n < nOutputs; n++)
	{
		nIndex = pRelation->GetOutputIndex(n);
		GArffAttribute* pAttr = pRelation->GetAttribute(nIndex);
		if(pAttr->IsContinuous())
		{
			// Find the mean output value
			int i;
			int nCount = GetSize();
			double dSum = 0;
			for(i = 0; i < nCount; i++)
			{
				pRow = GetVector(i);
				dSum += pRow[nIndex]; // index by the attribute's column, not the output number
			}
			pOutputs[n] = dSum / nCount;
		}
		else
		{
			// Init the counts to zero
			int nCount = pAttr->GetValueCount();
			Holder<int*> hCounts(new int[nCount]);
			int* pCounts = hCounts.Get();
			memset(pCounts, '\0', sizeof(int) * nCount);

			// Count occurrences of each output value
			int i;
			nCount = GetSize();
			for(i = 0; i < nCount; i++)
			{
				pRow = GetVector(i);
				nVal = (int)pRow[nIndex];
				if(nVal < 0)
				{
					GAssert(nVal == -1, "out of range");
					continue;
				}
				pCounts[nVal]++;
			}

			// Find the most common output value
			nCount = pAttr->GetValueCount();
			int nMaxCount = pCounts[0];
			int nBestValue = 0;
			for(i = 1; i < nCount; i++)
			{
				if(pCounts[i] > nMaxCount)
				{
					nBestValue = i;
					nMaxCount = pCounts[i];
				}
			}

			// Set the value
			pOutputs[n] = (double)nBestValue;
		}
	}
	return pOutputs;
}

bool GArffData::IsOutputHomogenous(GArffRelation* pRelation)
{
	int nRowCount = GetSize();
	if(nRowCount <= 0)
		return true;
	int nOutputs = pRelation->GetOutputCount();
	int n, i, nIndex, nVal, nTmp;
	double* pRow;
	double dVal;
	for(i = 0; i < nOutputs; i++)
	{
		nIndex = pRelation->GetOutputIndex(i);
		GArffAttribute* pAttr = pRelation->GetAttribute(nIndex);
		if(pAttr->IsContinuous())
		{
			pRow = GetVector(0);
			dVal = pRow[nIndex];
			for(n = 1; n < nRowCount; n++)
			{
				pRow = GetVector(n);
				if(pRow[nIndex] != dVal)
					return false;
			}
		}
		else
		{
			// Find the first non-missing value
			for(n = 0; n < nRowCount; n++)
			{
				pRow = GetVector(n);
				nVal = (int)pRow[nIndex];
				if(nVal >= 0)
				{
					n++;
					break;
				}
			}

			// Make sure every other non-missing value matches it
			for( ; n < nRowCount; n++)
			{
				pRow = GetVector(n);
				nTmp = (int)pRow[nIndex];
				if(nTmp != nVal && nTmp >= 0)
					return false;
			}
		}
	}
	return true;
}

void GArffData::RandomlyReplaceMissingData(GArffRelation* pRelation)
{
	int n, i, j;
	int nRowCount = GetSize();
	int nAttrCount = pRelation->GetAttributeCount();
	int nMaxValues = 0;
	int nValues;
	int nVal;
	int nSum;
	int nRand;
	int* pCounts = NULL;
	double* pRow;
	GArffAttribute* pAttr;
	for(i = 0; i < nAttrCount; i++)
	{
		// Make a buffer to hold the counts
		pAttr = pRelation->GetAttribute(i);
		if(pAttr->IsContinuous())
			continue;
		nValues = pAttr->GetValueCount();
		if(nValues > nMaxValues)
		{
			delete[] pCounts;
			nMaxValues = pAttr->GetValueCount() + 3;
			pCounts = new int[nMaxValues];
		}

		// Count the number of each value
		memset(pCounts, '\0', sizeof(int) * nValues);
		for(n = 0; n < nRowCount; n++)
		{
			nVal = (int)GetVector(n)[i];
			if(nVal >= 0)
			{
				GAssert(nVal < nValues, "out of range");
				pCounts[nVal]++;
			}
			else
			{
				GAssert(nVal == -1, "out of range");
			}
		}

		// Sum the value counts
		nSum = 0;
		for(n = 0; n < nValues; n++)
			nSum += pCounts[n];
		if(nSum <= 0)
			continue; // every row is missing this attribute, so there is nothing to sample from

		// Replace the missing values
		for(n = 0; n < nRowCount; n++)
		{
			pRow = GetVector(n);
			nVal = (int)pRow[i];
			if(nVal < 0)
			{
				nRand = (int)(GBits::GetRandomUint() % nSum);
				for(j = 0; ; j++)
				{
					GAssert(j < nValues, "internal inconsistency");
					nRand -= pCounts[j];
					if(nRand < 0)
					{
						pRow[i] = (double)j;
						break;
					}
				}
			}
		}
	}
	delete[] pCounts;
}

void GArffData::ReplaceMissingAttributeWithMostCommonValue(GArffRelation* pRelation, int nAttribute)
{
	GArffAttribute* pAttr = pRelation->GetAttribute(nAttribute);
	if(pAttr->IsContinuous())
		return; // missing values are currently only supported for discrete values
	int nValues = pAttr->GetValueCount();
	GTEMPBUF(int, pCounts, nValues);
	memset(pCounts, '\0', sizeof(int) * nValues);

	// Count the occurrences of each value
	double* pRow;
	int nRowCount = GetSize();
	int n, nVal;
	for(n = 0; n < nRowCount; n++)
	{
		pRow = GetVector(n);
		nVal = (int)pRow[nAttribute];
		if(nVal < 0)
			continue;
		GAssert(nVal < nValues, "out of range");
		pCounts[nVal]++;
	}

	// Find the most common value
	int nBest = 0;
	for(n = 1; n < nValues; n++)
	{
		if(pCounts[n] > pCounts[nBest])
			nBest = n;
	}

	// Replace the missing values with it
	for(n = 0; n < nRowCount; n++)
	{
		pRow = GetVector(n);
		nVal = (int)pRow[nAttribute];
		if(nVal < 0)
			pRow[nAttribute] = (double)nBest;
	}
}

void GArffData::Print(int nAttributes)
{
	int nRows = GetSize();
	double* pRow;
	int n, i;
	for(n = 0; n < nRows; n++)
	{
		pRow = GetVector(n);
		printf("%f", pRow[0]);
		for(i = 1; i < nAttributes; i++)
			printf("\t%f", pRow[i]);
		printf("\n");
	}
}

int ComputeMinimumVariancePivotComparer(void* pThis, void* pA, void* pB)
{
	// The attribute index is passed through the void* context parameter
	int nAttr = (int)(size_t)pThis;
	double* pdA = (double*)pA;
	double* pdB = (double*)pB;
	if(pdA[nAttr] > pdB[nAttr])
		return 1;
	else
		return -1;
}

double GArffData::ComputeMinimumVariancePivot(int nAttr)
{
	// Sort (a copy of) the row pointers by the specified attribute
	int nRows = GetSize();
	GPointerArray arr(nRows);
	int n;
	for(n = 0; n < nRows; n++)
		arr.AddPointer(GetVector(n));
	arr.Sort(ComputeMinimumVariancePivotComparer, (void*)(size_t)nAttr);

	double dBestPivotScore = 1e100;
	double dBestPivot = 0;
	double dPivot, d;
	double* pRow1;
	double* pRow2;
	double dMean1, dMean2, dVar1, dVar2;
	int nCount1, nCount2, i;
	for(n = nRows - 2; n >= 0; n--)
	{
		// Try a pivot
		pRow1 = (double*)arr.GetPointer(n);
		pRow2 = (double*)arr.GetPointer(n + 1);
		dPivot = (pRow1[nAttr] + pRow2[nAttr]) / 2;

		// Compute the mean of each half
		dMean1 = 0;
		dMean2 = 0;
		nCount1 = 0;
		nCount2 = 0;
		for(i = 0; i < nRows; i++)
		{
			pRow1 = GetVector(i);
			if(pRow1[nAttr] < dPivot)
			{
				nCount1++;
				dMean1 += pRow1[nAttr];
			}
			else
			{
				nCount2++;
				dMean2 += pRow1[nAttr];
			}
		}
		if(nCount1 == 0 || nCount2 == 0)
			continue; // degenerate split (duplicate values), so skip this pivot
		dMean1 /= nCount1;
		dMean2 /= nCount2;

		// Compute the variance of each half
		dVar1 = 0;
		dVar2 = 0;
		for(i = 0; i < nRows; i++)
		{
			pRow1 = GetVector(i);
			if(pRow1[nAttr] < dPivot)
			{
				d = pRow1[nAttr] - dMean1;
				dVar1 += (d * d);
			}
			else
			{
				d = pRow1[nAttr] - dMean2;
				dVar2 += (d * d);
			}
		}
		dVar1 /= nCount1;
		dVar2 /= nCount2;
		d = dVar1 + dVar2;

		// See if we've got a new best score
		if(d < dBestPivotScore)
		{
			dBestPivotScore = d;
			dBestPivot = dPivot;
		}
	}
	return dBestPivot;
}

double GArffData::ComputeMinimumInfoPivot(GArffRelation* pRelation, int nAttr, double* pOutputInfo)
{
	// Sort (a copy of) the row pointers by the specified attribute
	int nRows = GetSize();
	GPointerArray arr(nRows);
	int n;
	for(n = 0; n < nRows; n++)
		arr.AddPointer(GetVector(n));
	arr.Sort(ComputeMinimumVariancePivotComparer, (void*)(size_t)nAttr);

	double dBestPivotScore = 1e100;
	double dBestPivot = 0;
	double dPivot, d;
	double* pRow1;
	double* pRow2;
	for(n = nRows - 2; n >= 0; n--)
	{
		// Try a pivot
		pRow1 = (double*)arr.GetPointer(n);
		pRow2 = (double*)arr.GetPointer(n + 1);
		dPivot = (pRow1[nAttr] + pRow2[nAttr]) / 2;

		// Split at the pivot and measure the sum info
		GArffData* pData2 = SplitByPivot(nAttr, dPivot);
		d = pRelation->MeasureTotalOutputInfo(this) + pRelation->MeasureTotalOutputInfo(pData2);
		Merge(pData2);
		delete(pData2);

		// See if we've got a new best score
		if(d < dBestPivotScore)
		{
			dBestPivotScore = d;
			dBestPivot = dPivot;
		}
	}
	*pOutputInfo = dBestPivotScore;
	return dBestPivot;
}

void GArffData::ComputeCovarianceMatrix(GMatrix* pOutMatrix, GArffRelation* pRelation)
{
	// Resize the matrix
	int nInputs = pRelation->GetInputCount();
	pOutMatrix->Resize(nInputs, nInputs);

	// Compute the mean of each input attribute
	Holder<double*> hMeans(new double[nInputs]);
	double* pMeans = hMeans.Get();
	int nRowCount = GetSize();
	double* pRow;
	int n, i, j, nIndex;
	for(i = 0; i < nInputs; i++)
	{
		nIndex = pRelation->GetInputIndex(i);
		double dSum = 0;
		for(n = 0; n < nRowCount; n++)
		{
			pRow = GetVector(n);
			dSum += pRow[nIndex];
		}
		pMeans[i] = dSum / nRowCount;
	}

	// Compute the covariances for half the matrix
	// (index each input by its attribute column, consistent with the means above)
	for(i = 0; i < nInputs; i++)
	{
		for(n = i; n < nInputs; n++)
		{
			double dSum = 0;
			for(j = 0; j < nRowCount; j++)
			{
				pRow = GetVector(j);
				dSum += ((pRow[pRelation->GetInputIndex(i)] - pMeans[i]) * (pRow[pRelation->GetInputIndex(n)] - pMeans[n]));
			}
			pOutMatrix->Set(i, n, dSum / (nRowCount - 1));
		}
	}

	// Fill out the other half of the matrix
	for(i = 1; i < nInputs; i++)
	{
		for(n = 0; n < i; n++)
			pOutMatrix->Set(i, n, pOutMatrix->Get(n, i));
	}
}

void GArffData::ComputeCoprobabilityMatrix(GMatrix* pOutMatrix, GArffRelation* pRelation, int nAttr, double noDataValue)
{
	// Resize the matrix
	GArffAttribute* pAttr = pRelation->GetAttribute(nAttr);
	int nRows = pAttr->GetValueCount();
	int nAttributes = pRelation->GetAttributeCount();
	int nCols = 0;
	int i;
	for(i = 0; i < nAttributes; i++)
	{
		GArffAttribute* pAttrCol = pRelation->GetAttribute(i);
		nCols += pAttrCol->GetValueCount();
	}
	pOutMatrix->Resize(nRows, nCols);

	// Compute the coprobabilities
	int nRowCount = GetSize();
	int row, col, nMatch, nTotal, nAttrCol, nVal;
	double* pRow;
	for(row = 0; row < nRows; row++)
	{
		col = 0;
		for(nAttrCol = 0; nAttrCol < nAttributes; nAttrCol++)
		{
			GArffAttribute* pAttrCol = pRelation->GetAttribute(nAttrCol);
			for(nVal = 0; nVal < pAttrCol->GetValueCount(); nVal++)
			{
				nMatch = 0;
				nTotal = 0;
				for(i = 0; i < nRowCount; i++)
				{
					pRow = GetVector(i);
					if((int)pRow[nAttrCol] == nVal)
					{
						nTotal++;
						if((int)pRow[nAttr] == row)
							nMatch++;
					}
				}
				if(nTotal == 0)
					pOutMatrix->Set(row, col, noDataValue);
				else
					pOutMatrix->Set(row, col, (double)nMatch / nTotal);
				col++;
			}
		}
		GAssert(col == nCols, "problem with columns");
	}
}

int DimensionComparer(void* pThis, void* pA, void* pB)
{
	int nDim = *(int*)pThis;
	if(((double*)pA)[nDim] < ((double*)pB)[nDim])
		return -1;
	else if(((double*)pA)[nDim] > ((double*)pB)[nDim])
		return 1;
	else
		return 0;
}

void GArffData::Sort(int nDimension)
{
	GPointerArray::Sort(DimensionComparer, &nDimension);
}
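A minimal usage sketch (not part of garff.cpp) of the static Normalize() shown above, which maps a value from one range onto another. The header name "GArff.h" and the standalone main() are assumptions made only for illustration; the call itself relies solely on the signature in the listing.

// Hypothetical standalone example; the include path is an assumption.
#include "GArff.h"
#include <cstdio>

int main()
{
	// Map 7.5 from the input range [0, 10] onto the output range [0, 1].
	// Prints 0.750000
	double d = GArffData::Normalize(7.5, 0.0, 10.0, 0.0, 1.0);
	printf("%f\n", d);
	return 0;
}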