?? utf8.cpp
字號:
ret = ::MultiByteToWideChar(CP_ACP,0,srcData,(int)srcCount,wstr,(int)destCount);
if(ret == 0) ret = -1;
#else
ret = ::mbstowcs(wstr, srcData,srcCount);
#endif
if(ret < 0){
throw UTF8ConvertError();
}
len = UNICODEToUTF8(wstr,ret,destData,destCount);
}catch(...){
free(wstr);
throw;
}
free(wstr);
return len;
}
/**
 * Converts UTF-8 text to the platform's ANSI / multibyte encoding.
 *
 * The input is first decoded to wide characters with UTF8ToUNICODE and then
 * narrowed with WideCharToMultiByte(CP_ACP) on MSVC builds, or with
 * wcstombs() everywhere else.
 *
 * @param srcData   UTF-8 input bytes.
 * @param srcCount  Number of bytes available at srcData.
 * @param destData  Output buffer that receives the converted bytes.
 * @param destCount Capacity of destData in bytes; the same value is used as
 *                  the capacity (in wide characters) of the scratch buffer.
 * @return Number of bytes written to destData.
 * @throws UTF8ConvertError if the scratch buffer cannot be allocated or the
 *         narrowing conversion fails (including an output buffer that is too
 *         small on the MSVC path). Exceptions raised by UTF8ToUNICODE
 *         propagate unchanged.
 */
DNC_DECLARE size_t UTF8ToANSI(custr srcData,size_t srcCount, astr destData,size_t destCount){
    // One extra slot for the terminator. Sized with sizeof(wchar_t) because
    // wchar_t is not 2 bytes wide on every platform.
    wchar_t *wstr = (wchar_t*)malloc((destCount+1)*sizeof(wchar_t));
    if(wstr == NULL){
        throw UTF8ConvertError();
    }
    size_t written = 0;
    try{
        size_t len = UTF8ToUNICODE(srcData,srcCount,wstr,destCount);
        wstr[len] = 0;
        // Nothing decoded means nothing to narrow. Skipping the call also
        // avoids WideCharToMultiByte reporting an error for a zero-length input.
        if(len != 0){
#if defined(_MSC_VER)
            // Limit the output to the caller's buffer size, not to a guess
            // derived from the number of wide characters.
            int ret = ::WideCharToMultiByte(CP_ACP,0,wstr,(int)len,destData,(int)destCount, NULL, NULL );
            // WideCharToMultiByte reports failure by returning 0.
            if(ret <= 0){
                throw UTF8ConvertError();
            }
            written = (size_t)ret;
#else
            written = ::wcstombs(destData,wstr,destCount);
            // wcstombs reports failure by returning (size_t)-1.
            if(written == (size_t)-1){
                throw UTF8ConvertError();
            }
#endif
        }
    }catch(...){
        free(wstr);
        throw;
    }
    free(wstr);
    return written;
}
/**
 * Decodes UTF-8 bytes into wide characters, writing values above 0xFFFF as a
 * UTF-16 surrogate pair.
 *
 * Decoding stops without error when the input is exhausted, when the output
 * buffer is full, when the last multi-byte sequence is cut short by the end
 * of the input, or when a surrogate pair no longer fits in the output. In
 * all of those cases the characters decoded so far are kept and their count
 * is returned. No terminator is appended.
 *
 * @param srcData   UTF-8 input bytes.
 * @param srcCount  Number of bytes available at srcData.
 * @param destData  Output buffer for the decoded characters.
 * @param destCount Capacity of destData in wide characters.
 * @return Number of wide characters written to destData.
 * @throws UTF8FormatError if a lead byte does not match its expected bit
 *         pattern, a continuation byte is not of the form 10xxxxxx, or the
 *         decoded value is greater than 0x10FFFF.
 */
DNC_DECLARE size_t UTF8ToUNICODE(custr srcData,size_t srcCount, wstr destData,size_t destCount){
    // Nothing to read, or nowhere to write.
    if (!srcCount || !destCount)
        return 0;

    // Cursors and end markers for the input and output buffers.
    custr srcPtr = srcData;
    custr srcEnd = srcPtr + srcCount;
    wstr outPtr = destData;
    wstr outEnd = outPtr + destCount;

    // Run until the input is consumed or the output is full.
    while ((srcPtr < srcEnd) && (outPtr < outEnd))
    {
        // ASCII fast path: a byte value <= 127 maps to itself.
        if (*srcPtr <= 127)
        {
            *outPtr++ = wchar_t(*srcPtr++);
            continue;
        }

        // Number of trailing bytes this lead byte announces.
        const unsigned int trailingBytes = gUTFBytes[*srcPtr];

        // Not enough input left for the whole sequence: stop here. The
        // comparison is >= because the lead byte itself is counted
        // implicitly. No cursor has moved yet, so there is nothing to undo.
        if (srcPtr + trailingBytes >= srcEnd)
            break;

        // The encoding cannot be assumed valid, so check the lead byte's bit
        // pattern before consuming it.
        if((gUTFByteIndicatorTest[trailingBytes] & *srcPtr) != gUTFByteIndicator[trailingBytes])
            throw UTF8FormatError(0,trailingBytes,*srcPtr,srcPtr-srcData);

        // Accumulate the raw bytes, shifting by 6 bits between them. The
        // marker bits are removed in one step further down via gUTFOffsets.
        unsigned long tmpVal = *srcPtr++;
        tmpVal <<= 6;
        for(unsigned int i=1; i<trailingBytes; i++)
        {
            if((*srcPtr & 0xC0) != 0x80)
                throw UTF8FormatError(i,trailingBytes,*srcPtr,srcPtr-srcData);
            tmpVal += *srcPtr++;
            tmpVal <<= 6;
        }
        // The last continuation byte is added without a following shift.
        if((*srcPtr & 0xC0) != 0x80)
            throw UTF8FormatError(trailingBytes,trailingBytes,*srcPtr,srcPtr-srcData);
        tmpVal += *srcPtr++;

        tmpVal -= gUTFOffsets[trailingBytes];

        if (!(tmpVal & 0xFFFF0000)) {
            // Fits into a single 16-bit unit.
            *outPtr++ = wchar_t(tmpVal);
        }else if (tmpVal > 0x10FFFF){
            // Beyond the Unicode range. Always report it: this function
            // does not tell the caller how many input bytes were consumed,
            // so stopping quietly would just truncate the result.
            throw UTF8FormatError(srcPtr-srcData);
        }else{
            // A surrogate pair needs two output slots. If only one is left,
            // stop and return what has been decoded so far.
            if (outPtr + 1 >= outEnd)
                break;
            tmpVal -= 0x10000;
            // Leading surrogate carries the high 10 bits ...
            *outPtr++ = wchar_t((tmpVal >> 10) + 0xD800);
            // ... trailing surrogate carries the low 10 bits.
            *outPtr++ = wchar_t((tmpVal & 0x3FF) + 0xDC00);
        }
    }

    // Number of wide characters produced.
    return outPtr - destData;
}
/**
 * Encodes wide characters as UTF-8.
 *
 * A leading surrogate (0xD800-0xDBFF) is combined with the following unit
 * into one code point before encoding. Encoding stops without error when the
 * input is exhausted, when a leading surrogate is the last input unit, or
 * when the next encoded character would not fit completely into the output
 * buffer. The bytes produced so far are kept and their count is returned.
 * No terminator is appended.
 *
 * @param srcData   Wide-character input.
 * @param srcCount  Number of wide characters available at srcData.
 * @param destData  Output buffer for the UTF-8 bytes.
 * @param destCount Capacity of destData in bytes.
 * @return Number of bytes written to destData.
 * @throws UTF8ConvertError if a leading surrogate is not followed by a
 *         trailing surrogate (0xDC00-0xDFFF), or if a value is too large to
 *         be encoded (greater than 0x7FFFFFFF).
 */
DNC_DECLARE size_t UNICODEToUTF8(cwstr srcData,size_t srcCount,ustr destData,size_t destCount){
    // Nothing to read, or nowhere to write.
    if (!srcCount || !destCount)
        return 0;

    // Cursors and end markers for the input and output buffers.
    cwstr srcPtr = srcData;
    cwstr srcEnd = srcPtr + srcCount;
    ustr outPtr = destData;
    ustr outEnd = destData + destCount;

    while (srcPtr < srcEnd)
    {
        // Read the next unit into a wider type, since a surrogate pair
        // combines into a value that needs more than 16 bits.
        unsigned long curVal = *srcPtr;
        unsigned int srcUsed = 1;

        if ((curVal >= 0xD800) && (curVal <= 0xDBFF))
        {
            // Leading surrogate with no partner available yet: stop here.
            if (srcPtr + 1 >= srcEnd)
                break;

            // The partner must really be a trailing surrogate; otherwise the
            // combined value would be meaningless.
            const unsigned long trail = *(srcPtr + 1);
            if ((trail < 0xDC00) || (trail > 0xDFFF))
                throw UTF8ConvertError();

            curVal = ((curVal - 0xD800) << 10) + (trail - 0xDC00) + 0x10000;
            srcUsed++;
        }

        // Work out how many bytes the value needs.
        unsigned int encodedBytes;
        if (curVal < 0x80)
            encodedBytes = 1;
        else if (curVal < 0x800)
            encodedBytes = 2;
        else if (curVal < 0x10000)
            encodedBytes = 3;
        else if (curVal < 0x200000)
            encodedBytes = 4;
        else if (curVal < 0x4000000)
            encodedBytes = 5;
        else if (curVal <= 0x7FFFFFFF)
            encodedBytes = 6;
        else
        {
            // Not encodable. Report it instead of looping: the source
            // cursor has not advanced, so continuing would never terminate.
            throw UTF8ConvertError();
        }

        // Never emit a partial character.
        if (outPtr + encodedBytes > outEnd)
            break;

        // Commit to this character.
        srcPtr += srcUsed;

        // Bytes are written last-to-first: jump to the end of the slot and
        // walk backwards, peeling off 6 bits per continuation byte.
        outPtr += encodedBytes;
        switch(encodedBytes)
        {
            case 6 : *--outPtr = byte((curVal | 0x80UL) & 0xBFUL);
                     curVal >>= 6;
                     // fall through
            case 5 : *--outPtr = byte((curVal | 0x80UL) & 0xBFUL);
                     curVal >>= 6;
                     // fall through
            case 4 : *--outPtr = byte((curVal | 0x80UL) & 0xBFUL);
                     curVal >>= 6;
                     // fall through
            case 3 : *--outPtr = byte((curVal | 0x80UL) & 0xBFUL);
                     curVal >>= 6;
                     // fall through
            case 2 : *--outPtr = byte((curVal | 0x80UL) & 0xBFUL);
                     curVal >>= 6;
                     // fall through
            case 1 : *--outPtr = byte
                     (
                         curVal | gFirstByteMark[encodedBytes]
                     );
        }

        // The cursor is back at the start of the character; move past it.
        outPtr += encodedBytes;
    }

    // Number of bytes produced.
    return (outPtr - destData);
}
/**
 * Convenience overload: converts a UTF-8 std::string to the ANSI / multibyte
 * encoding and returns the result as a new string.
 *
 * @param srcData UTF-8 encoded text.
 * @return The converted text, built from exactly the number of bytes the
 *         buffer-based UTF8ToANSI reports, so embedded NUL bytes are kept.
 * @throws Whatever the buffer-based UTF8ToANSI overload throws.
 */
DNC_DECLARE std::string UTF8ToANSI(const std::string &srcData){
    // NOTE(review): the buffer is sized on the assumption that the ANSI form
    // is never longer than the UTF-8 form -- confirm for the target code pages.
    const size_t capacity = srcData.size()+1;
    Array<char> destData(capacity);
    size_t len = UTF8ToANSI((uchar*)srcData.c_str(),srcData.size(),destData,capacity);
    // Build the result from the explicit length. This needs no terminator
    // write, which could land past the buffer when len equals capacity.
    return std::string(destData.data(), len);
}
/**
 * Convenience overload: converts an ANSI / multibyte std::string to UTF-8
 * and returns the result as a new string.
 *
 * @param srcData Text in the ANSI / multibyte encoding.
 * @return The UTF-8 text, built from exactly the number of bytes the
 *         buffer-based ANSIToUTF8 reports, so embedded NUL bytes are kept.
 * @throws Whatever the buffer-based ANSIToUTF8 overload throws.
 */
DNC_DECLARE std::string ANSIToUTF8(const std::string &srcData){
    // NOTE(review): three output bytes per input byte is assumed to be a
    // safe upper bound for the UTF-8 form -- confirm.
    const size_t capacity = srcData.size()*3+1;
    Array<uchar> destData(capacity);
    size_t len = ANSIToUTF8(srcData.c_str(),srcData.size(),destData,capacity);
    // Build the result from the explicit length. This needs no terminator
    // write, which could land past the buffer when len equals capacity.
    return std::string((char*)destData.data(), len);
}
}
#include "utf8.dnc"
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -