?? garff.cpp
字號:
{ if(pRow2[nIndex] != pRow1[nIndex]) dSum += pInputScales[n]; } } return dSum;}// ------------------------------------------------------------------GArffAttribute::GArffAttribute(){ m_szName = NULL; m_nValues = 0; m_szValues = NULL; m_bIsInput = true;}GArffAttribute::GArffAttribute(bool bIsInput, int nValues, const char** szValues){ m_szName = NULL; m_nValues = nValues; if(nValues == 0) m_szValues = NULL; else { if(szValues) { m_szValues = new char*[nValues]; int n; for(n = 0; n < nValues; n++) { m_szValues[n] = new char[strlen(szValues[n]) + 1]; strcpy(m_szValues[n], szValues[n]); } } else m_szValues = NULL; } m_bIsInput = bIsInput;}GArffAttribute::~GArffAttribute(){ delete(m_szName); if(m_szValues) { int n; for(n = 0; n < m_nValues; n++) delete(m_szValues[n]); delete(m_szValues); }}GArffAttribute* GArffAttribute::NewCopy(){ return new GArffAttribute(m_bIsInput, m_nValues, (const char**)m_szValues);}/*static*/ GArffAttribute* GArffAttribute::Parse(const char* szFile, int nLen){ // Eat whitespace while(nLen > 0 && *szFile <= ' ') { if(*szFile == '\n') return NULL; szFile++; nLen--; } if(nLen < 1) return NULL; // Parse the name Holder<GArffAttribute*> hAttr(new GArffAttribute()); GArffAttribute* pAttr = hAttr.Get(); int nQuotes = 0; if(szFile[0] == '\'' || szFile[0] == '"') nQuotes = 1; int nPos = 1; for( ; nPos < nLen && (szFile[nPos] > ' ' || nQuotes > 0); nPos++) { if(szFile[nPos] == '\'' || szFile[nPos] == '"') nQuotes--; } pAttr->m_szName = new char[nPos + 1]; memcpy(pAttr->m_szName, szFile, nPos); pAttr->m_szName[nPos] = '\0'; // Eat whitespace while(nPos < nLen && szFile[nPos] <= ' ') { if(szFile[nPos] == '\n') return NULL; nPos++; } if(nPos >= nLen) return NULL; // Check for CONTINUOUS if(nLen - nPos >= 10 && strnicmp(&szFile[nPos], "CONTINUOUS", 10) == 0) return hAttr.Drop(); if(nLen - nPos >= 7 && strnicmp(&szFile[nPos], "NUMERIC", 7) == 0) return hAttr.Drop(); if(nLen - nPos >= 4 && strnicmp(&szFile[nPos], "REAL", 4) == 0) return hAttr.Drop(); // Parse the values if(szFile[nPos] != '{') return NULL; nPos++; // Count the values int nCount = 1; int n; for(n = nPos; szFile[n] != '{' && szFile[n] != '\n' && n < nLen; n++) { if(szFile[n] == ',') nCount++; } // Parse the values pAttr->m_szValues = new char*[nCount]; pAttr->m_nValues = nCount; int nValue = 0; for(n = nPos; szFile[n] != '}' && szFile[n] != '\n' && n < nLen; n++) { if(szFile[n] == ',') { int nStart = nPos; int nEnd = n; while(nStart < nEnd && szFile[nStart] <= ' ') nStart++; while(nStart < nEnd && szFile[nEnd - 1] <= ' ') nEnd--; pAttr->m_szValues[nValue] = new char[nEnd - nStart + 1]; memcpy(pAttr->m_szValues[nValue], &szFile[nStart], nEnd - nStart); (pAttr->m_szValues[nValue])[nEnd - nStart] = '\0'; nPos = n + 1; nValue++; } } int nStart = nPos; int nEnd = n; while(nStart < nEnd && szFile[nStart] <= ' ') nStart++; while(nStart < nEnd && szFile[nEnd - 1] <= ' ') nEnd--; pAttr->m_szValues[nValue] = new char[nEnd - nStart + 1]; memcpy(pAttr->m_szValues[nValue], &szFile[nStart], nEnd - nStart); (pAttr->m_szValues[nValue])[nEnd - nStart] = '\0'; if(szFile[n] != '}') return NULL; return hAttr.Drop();}void GArffAttribute::SetContinuous(){ if(m_szValues) { int n; for(n = 0; n < m_nValues; n++) delete(m_szValues[n]); delete(m_szValues); } m_szValues = NULL; m_nValues = 0;}int GArffAttribute::GetValueCount(){ return m_nValues;}const char* GArffAttribute::GetValue(int n){ if(n < 0) { GAssert(n == -1, "out of range"); return "<?>"; } GAssert(n < m_nValues, "out of range"); return m_szValues[n];}int GArffAttribute::FindEnumeratedValue(const char* szValue){ GAssert(!IsContinuous(), "Not an enumerated attribute"); int n; for(n = 0; n < m_nValues; n++) { if(strcmp(m_szValues[n], szValue) == 0) return n; } return -1;}// ------------------------------------------------------------------GArffData::GArffData(int nGrowSize): GPointerArray(nGrowSize){}GArffData::~GArffData(){ int nCount = GetSize(); int n; for(n = 0; n < nCount; n++) delete[] (double*)GetPointer(n);}void GArffData::CopyVector(double* pVector, int nAttributeCount){ double* pNewVector = new double[nAttributeCount]; memcpy(pNewVector, pVector, sizeof(double) * nAttributeCount); AddVector(pNewVector);}double* GArffData::DropVector(int nIndex){ int nCount = GetSize(); double* pVector = GetVector(nIndex); SetPointer(nIndex, GetPointer(nCount - 1)); DeleteCell(nCount - 1); return pVector;}void GArffData::DropAllVectors(){ Clear();}void GArffData::Shuffle(){ // Swap every row with a randomely selected row int nCount = GetSize(); int n, r; void* pTemp; for(n = nCount - 1; n > 0; n--) { r = rand() % n; pTemp = GetPointer(r); SetPointer(r, GetPointer(n)); SetPointer(n, pTemp); }}double GArffData::MeasureEntropy(GArffRelation* pRelation, int nColumn){ // Count the number of occurrences of each value GArffAttribute* pAttr = pRelation->GetAttribute(nColumn); GAssert(!pAttr->IsInput(), "Expected an output"); GAssert(!pAttr->IsContinuous(), "MeasureEntropy doesn't work with continuous attributes"); int nPossibleValues = pAttr->GetValueCount(); GTEMPBUF(int, pnCounts, nPossibleValues); int nTotalCount = 0; memset(pnCounts, '\0', pAttr->GetValueCount() * sizeof(int)); int n; int nRows = GetSize(); for(n = 0; n < nRows; n++) { int nValue = (int)GetVector(n)[nColumn]; if(nValue < 0) { GAssert(nValue == -1, "out of range"); continue; } GAssert(nValue < nPossibleValues, "value out of range"); pnCounts[nValue]++; nTotalCount++; } if(nTotalCount == 0) return 0; // Total up the entropy double dLog2 = log((double)2); double dEntropy = 0; double dRatio; for(n = 0; n < nPossibleValues; n++) { if(pnCounts[n] > 0) { dRatio = (double)pnCounts[n] / nTotalCount; dEntropy -= (dRatio * log(dRatio) / dLog2); } } return dEntropy;}GArffData* GArffData::SplitByPivot(int nColumn, double dPivot){ GArffData* pNewSet = new GArffData(MAX(8, GetSize())); double* pRow; int n; for(n = 0; n < GetSize(); n++) { pRow = GetVector(n); if(pRow[nColumn] <= dPivot) { pNewSet->AddVector(DropVector(n)); n--; } } return pNewSet;}int DoubleRefComparer(void* pThis, void* pA, void* pB){ if(*(double*)pA > *(double*)pB) return 1; if(*(double*)pA < *(double*)pB) return -1; return 0;}GArffData** GArffData::SplitByAttribute(GArffRelation* pRelation, int nAttribute){ GArffAttribute* pAttr = pRelation->GetAttribute(nAttribute); GAssert(pAttr->IsInput(), "Expected an input"); int nCount = pAttr->GetValueCount(); GAssert(nCount > 0, "Only discreet values are supported"); GArffData** ppParts = new GArffData*[nCount]; int n; for(n = 0; n < nCount; n++) ppParts[n] = SplitByPivot(nAttribute, (double)n); GAssert(GetSize() == 0, "some data out of range"); return ppParts;}GArffData* GArffData::SplitBySize(int nRows){ GAssert(nRows >= 0 && nRows <= GetSize(), "out of range"); GArffData* pNewSet = new GArffData(MAX(8, GetSize() - nRows)); while(GetSize() > nRows) pNewSet->AddVector(DropVector(nRows)); return pNewSet;}void GArffData::Merge(GArffData* pData){ while(pData->GetSize() > 0) AddVector(pData->DropVector(0));}void GArffData::DiscretizeNonContinuousOutputs(GArffRelation* pRelation){ int nOutputs = pRelation->GetOutputCount(); int n, nIndex, i, nValueCount, nVal; int nRowCount = GetSize(); double* pRow; for(n = 0; n < nOutputs; n++) { nIndex = pRelation->GetOutputIndex(n); GArffAttribute* pAttr = pRelation->GetAttribute(nIndex); if(pAttr->IsContinuous()) continue; nValueCount = pAttr->GetValueCount(); for(i = 0; i < nRowCount; i++) { pRow = GetVector(i); nVal = (int)(pRow[nIndex] - .5); if(nVal < 0) nVal = 0; else if(nVal >= nValueCount) nVal = nValueCount - 1; pRow[nIndex] = (double)nVal; } }}double GArffData::ComputeMean(int nAttribute){ double dMean = 0; int nRowCount = GetSize(); double* pRow; int i; for(i = 0; i < nRowCount; i++) { pRow = GetVector(i); dMean += pRow[nAttribute]; } return dMean / nRowCount;}void GArffData::GetMeans(double* pOutMeans, int nAttributes){ int n; for(n = 0; n < nAttributes; n++) pOutMeans[n] = 0; int nRowCount = GetSize(); double* pRow; int i; for(i = 0; i < nRowCount; i++) { pRow = GetVector(i); for(n = 0; n < nAttributes; n++) pOutMeans[n] += pRow[n]; } for(n = 0; n < nAttributes; n++) pOutMeans[n] /= nRowCount;}double GArffData::ComputeVariance(double dMean, int nAttribute){ double dVariance = 0; double* pRow; double d; int i; int nRowCount = GetSize(); for(i = 0; i < nRowCount; i++) { pRow = GetVector(i); d = pRow[nAttribute] - dMean; dVariance += (d * d); } return dVariance / nRowCount;}void GArffData::GetVariance(double* pOutVariance, double* pMeans, int nAttributes){ int n; for(n = 0; n < nAttributes; n++) pOutVariance[n] = 0; int nRowCount = GetSize(); double* pRow; int i; for(i = 0; i < nRowCount; i++) { pRow = GetVector(i); for(n = 0; n < nAttributes; n++) pOutVariance[n] += ((pRow[n] - pMeans[n]) * (pRow[n] - pMeans[n])); } for(n = 0; n < nAttributes; n++) pOutVariance[n] /= nRowCount;}int GArffData::RemoveOutlyers(double dStandardDeviations, int nAttributes){ int nOutlyers = 0; GTEMPBUF(double, pMeans, nAttributes); GTEMPBUF(double, pVariance, nAttributes); GetMeans(pMeans, nAttributes); GetVariance(pVariance, pMeans, nAttributes); int n, i; for(n = 0; n < nAttributes; n++) pVariance[n] = sqrt(pVariance[n]); double* pRow; int nRowCount = GetSize(); for(i = nRowCount - 1; i >= 0; i--) { pRow = GetVector(i); for(n = 0; n < nAttributes; n++) { if(ABS(pRow[n] - pMeans[n]) > dStandardDeviations * pVariance[n]) { delete(DropVector(i)); nOutlyers++; break; } } } return nOutlyers;}void GArffData::GetMinAndRange(int nAttribute, double* pMin, double* pRange){ int nCount = GetSize(); GAssert(nCount > 0, "No data"); double* pRow = GetVector(0); double dMin = pRow[nAttribute]; double dMax = dMin; int n; for(n = 1; n < nCount; n++) { pRow = GetVector(n); if(pRow[nAttribute] < dMin) dMin = pRow[nAttribute]; if(pRow[nAttribute] > dMax) dMax = pRow[nAttribute]; } *pMin = dMin; *pRange = dMax - dMin;}void GArffData::Normalize(int nAttribute, double dInputMin, double dInputRange, double dOutputMin, double dOutputRange){ GAssert(dInputRange > 0, "divide by zero"); int nCount = GetSize(); double* pRow; double dScale = dOutputRange / dInputRange; int n; for(n = 0; n < nCount; n++)
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -