// utf8.cpp
#include "utf8.h"
#include "stdlib.h"
#include <stdio.h>
#include <wchar.h>
#include <exception>
#include <assert.h>
#include <string.h>
#if defined(WIN32)
#include <windows.h>
#pragma warning(disable : 4244)
#endif
namespace dnc{
// Exception type for a failed UTF-8 conversion. It carries no state; its
// what() text is the fixed string "UTF8ConvertError".
class UTF8ConvertError : public std::exception
{
public:
    // Returns the constant, NUL-terminated description of this error.
    virtual castr what() const throw()
    {
        static const char description[] = "UTF8ConvertError";
        return description;
    }
};
// Exception thrown by XCharToUTF8 for a character value it cannot encode.
// The message "XCharError:<value>" is formatted once, at construction time,
// into a buffer owned by the exception object.
class XCharError : public std::exception
{
public:
    // Stores the offending character and builds the what() text.
    XCharError(xchar ch)
        : m_ch(ch)
    {
        // NOTE(review): "%i" expects an int argument; this presumes xchar is
        // passed like an int through varargs -- confirm against utf8.h.
        ::sprintf(m_msg, "XCharError:%i", ch);
    }

    // Returns the message built by the constructor. The pointer refers to
    // storage inside this object.
    virtual castr what() const throw()
    {
        return m_msg;
    }

private:
    char  m_msg[100];   // formatted description returned by what()
    xchar m_ch;         // the character value that could not be encoded
};
class UTF8FormatError : public std::exception{
public:
UTF8FormatError(unsigned long pos,unsigned long len,byte byte,unsigned long index){
m_len = len;
m_pos = pos;
m_byte= byte;
m_index=index;
}
UTF8FormatError(unsigned long index){
m_len = 0;
m_pos = 0;
m_byte= 0;
m_index=index;
}
public:
virtual const char* what() const throw(){
return "UTF8FormatError";
}
private:
unsigned long m_len;
unsigned long m_pos;
byte m_byte;
unsigned long m_index;
};
// Encodes one character as UTF-8 into the buffer at `utf8`.
//
// ch    character value to encode.
// utf8  destination buffer; up to 6 bytes are written and no NUL terminator
//       is appended, so the caller must provide the room.
//
// Returns the number of bytes written (1..6).
// Throws XCharError when `ch` is larger than 0x7FFFFFFF; nothing is written
// in that case.
DNC_DECLARE unsigned int XCharToUTF8(xchar ch,astr utf8){
    // Values up to 127 are stored as a single byte, unchanged.
    if(ch <= 127){
        *utf8 = (char)ch;
        return 1;
    }

    // Work out the sequence length from the magnitude of the value.
    unsigned int encodedBytes;
    if(ch < 0x800){
        encodedBytes = 2;
    }else if(ch < 0x10000){
        encodedBytes = 3;
    }else if(ch < 0x200000){
        encodedBytes = 4;
    }else if(ch < 0x4000000){
        encodedBytes = 5;
    }else if(ch <= 0x7FFFFFFF){
        encodedBytes = 6;
    }else{
        throw XCharError(ch);
    }

    // Fill the sequence from its last byte towards the first: every trailing
    // byte takes the low six bits of the remaining value, tagged as 10xxxxxx.
    uchar *out = (ustr)utf8;
    for(unsigned int slot = encodedBytes - 1; slot > 0; --slot){
        out[slot] = byte((ch | 0x80UL) & 0xBFUL);
        ch >>= 6;
    }

    // What is left of the value goes into the lead byte, combined with the
    // length marker for this sequence size.
    out[0] = byte(ch | gFirstByteMark[encodedBytes]);
    return encodedBytes;
}
// Decodes the UTF-8 sequence that starts at `utf8` and stores the result in
// `ch`.
//
// utf8  pointer to the first byte of the sequence. The input is not
//       length-checked; a terminating NUL inside a multi-byte sequence fails
//       the continuation-byte test and therefore raises UTF8FormatError.
// ch    receives the decoded value.
//
// Returns the number of source bytes consumed: 1 for a single-byte character
// (including NUL), otherwise the trailing-byte count plus one.
// Throws UTF8FormatError when the lead byte is a bare continuation byte
// (0x80..0xBF) or when an expected trailing byte is not of the form 10xxxxxx.
DNC_DECLARE unsigned int UTF8ToXChar(custr utf8,xchar &ch){
    const unsigned char *srcPtr = (const unsigned char*)utf8;

    // Single-byte character: the byte is the value and one byte is consumed.
    if(*srcPtr <= 127){
        ch = *srcPtr;
        return 1;
    }

    const unsigned int trailingBytes = gUTFBytes[*srcPtr];
    if(trailingBytes == 0){
        // A byte in 0x80..0xBF is a continuation byte and cannot start a
        // sequence; decoding it would silently swallow the following byte.
        throw UTF8FormatError(0,0,*srcPtr,0xffffffff);
    }

    // Accumulate the lead byte and every trailing byte, six bits at a time.
    // The marker bits are left in place here and removed in one step below.
    xchar tmpVal = *srcPtr++;
    for(unsigned int i=1; i<=trailingBytes; i++){
        if((*srcPtr & 0xC0) != 0x80){
            throw UTF8FormatError(i,trailingBytes,*srcPtr,0xffffffff);
        }
        tmpVal <<= 6;
        tmpVal += *srcPtr++;
    }

    // Strip the lead-byte and continuation-byte markers that were summed in.
    tmpVal -= gUTFOffsets[trailingBytes];

    // NOTE(review): values above 0xFFFF are reduced by 0x10000. This looks
    // like the first step of a UTF-16 surrogate-pair encoding whose remaining
    // steps are absent, so the stored value is not the code point and does not
    // round-trip through XCharToUTF8. Kept as-is because callers may rebuild
    // surrogates from it -- confirm the intended contract.
    if(tmpVal & 0xFFFF0000){
        tmpVal -= 0x10000;
    }

    ch = tmpVal;
    return trailingBytes+1;
}
// Decodes and returns the value of the UTF-8 sequence that starts at `str`.
//
// str  pointer to the first byte of the sequence. The input is not
//      length-checked; a terminating NUL inside a multi-byte sequence fails
//      the continuation-byte test and therefore raises UTF8FormatError.
//
// Returns the decoded value (the byte itself for values up to 127).
// Throws UTF8FormatError when the lead byte is a bare continuation byte
// (0x80..0xBF) or when an expected trailing byte is not of the form 10xxxxxx.
DNC_DECLARE xchar utf8_value(custr str){
    const unsigned char *srcPtr = (const unsigned char*)str;

    // Single-byte character: the byte is the value.
    if(*srcPtr <= 127){
        return *srcPtr;
    }

    const unsigned int trailingBytes = gUTFBytes[*srcPtr];
    if(trailingBytes == 0){
        // A byte in 0x80..0xBF is a continuation byte and cannot start a
        // sequence; decoding it would silently fold in the following byte.
        throw UTF8FormatError(0,0,*srcPtr,0xffffffff);
    }

    // Accumulate the lead byte and every trailing byte, six bits at a time.
    // The marker bits are left in place here and removed in one step below.
    xchar tmpVal = *srcPtr++;
    for(unsigned int i=1; i<=trailingBytes; i++){
        if((*srcPtr & 0xC0) != 0x80){
            throw UTF8FormatError(i,trailingBytes,*srcPtr,0xffffffff);
        }
        tmpVal <<= 6;
        tmpVal += *srcPtr++;
    }

    // Strip the lead-byte and continuation-byte markers that were summed in.
    tmpVal -= gUTFOffsets[trailingBytes];

    // NOTE(review): values above 0xFFFF are reduced by 0x10000. This looks
    // like the first step of a UTF-16 surrogate-pair encoding whose remaining
    // steps are absent, so the returned value is not the code point. Kept
    // as-is because callers may depend on it -- confirm the intended contract.
    if(tmpVal & 0xFFFF0000){
        tmpVal -= 0x10000;
    }
    return tmpVal;
}
// Compares two NUL-terminated strings byte by byte, looking at no more than
// `count` bytes.
//
// str1, str2  strings to compare.
// count       maximum number of bytes to compare; (size_t)-1 means
//             "no limit", i.e. a plain full-string comparison.
//
// Returns a negative value, zero, or a positive value when str1 sorts before,
// equal to, or after str2 within the compared range. Only the sign of a
// non-zero result is meaningful.
//
// Bytes beyond `count` never influence the result: two strings whose first
// `count` bytes match compare equal even if their total lengths differ.
DNC_DECLARE int utf8_strcmp(castr str1,castr str2,size_t count){
    // strncmp stops at the first difference, at a terminating NUL, or after
    // `count` bytes, whichever comes first. With count == (size_t)-1 the
    // limit is never reached, which makes it equivalent to strcmp.
    return ::strncmp(str1,str2,count);
}
// Measures a NUL-terminated UTF-8 string in both characters and bytes.
//
// str      string to measure; must not be NULL (checked with assert).
// size     receives the number of characters found in the examined bytes.
// rawSize  receives the number of bytes examined.
// count    upper limit on the number of bytes to examine. Scanning stops at
//          the terminating NUL or after `count` bytes, whichever comes first,
//          so the limit may fall in the middle of a multi-byte character.
DNC_DECLARE void utf8_strlen(castr str,unsigned int &size,unsigned int &rawSize,unsigned int count){
    assert(str != NULL);
    rawSize = 0;
    size = 0;
    for(custr p=(custr)str; *p!=0 && rawSize<count; p++){
        // Every character contributes exactly one byte that is not a
        // continuation byte (10xxxxxx): either a single-byte character or the
        // lead byte of a multi-byte sequence, 2-byte sequences included.
        if((*p & 0xC0) != 0x80){
            size++;
        }
        rawSize++;
    }
}
// ---------------------------------------------------------------------------
// Local static data
//
// gUTFBytes
// A list of counts of trailing bytes for each initial byte in the input.
//
// gUTFByteIndicator
// For a UTF8 sequence of n bytes, n>=2, the first byte of the
// sequence must contain n 1's followed by precisely 1 0 with the
// rest of the byte containing arbitrary bits. This array stores
// the required bit pattern for validity checking.
// gUTFByteIndicatorTest
// When bitwise and'd with the observed value, if the observed
// value is correct then a result matching gUTFByteIndicator will
// be produced.
//
// gUTFOffsets
// A list of values to offset each result char type, according to how
// many source bytes went into making it.
//
// gFirstByteMark
// A list of values to mask onto the first byte of an encoded sequence,
// indexed by the number of bytes used to create the sequence.
// ---------------------------------------------------------------------------
// Number of trailing bytes that follow a given first byte of a UTF-8
// sequence, indexed by the value of that first byte (16 entries per row).
// Entries for 0x00..0xBF are 0; the decoders in this file handle bytes up to
// 127 before consulting the table.
cuchar gUTFBytes[256] =
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // 0xC0..0xCF
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // 0xD0..0xDF
, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 // 0xE0..0xEF
, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 // 0xF0..0xF7, 0xF8..0xFB, 0xFC..0xFF
};
// Bit pattern expected in the first byte of a sequence once it has been
// masked with the matching gUTFByteIndicatorTest entry.
// NOTE(review): not referenced in the visible part of this file; the index is
// presumably the trailing-byte count -- confirm against the user of the table.
static cuchar gUTFByteIndicator[6] =
{
0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
};
// Masks to AND with the first byte of a sequence; for a well-formed first
// byte the result equals the gUTFByteIndicator entry at the same index.
// NOTE(review): not referenced in the visible part of this file.
static cuchar gUTFByteIndicatorTest[6] =
{
0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE
};
// Correction values indexed by trailing-byte count. The decoders in this file
// add up the raw bytes of a sequence (marker bits included) and then subtract
// the entry for that sequence length to remove the markers in one step;
// e.g. 0x3080 == (0xC0 << 6) + 0x80 for one trailing byte.
const unsigned long gUTFOffsets[6] =
{
0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
};
// Marker bits OR-ed into the first byte of an encoded sequence, indexed by
// the total number of bytes in the sequence (index 0 is a placeholder).
// XCharToUTF8 reads entries 2..6.
cuchar gFirstByteMark[7] =
{
0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
};
// Converts a multibyte string to wide characters.
//
// srcData    source multibyte string.
// srcCount   number of source bytes; only used on the MSVC path. On the
//            other path mbstowcs takes no source length, so the source must
//            be NUL-terminated.
// destData   destination wide-character buffer.
// destCount  capacity of destData, in wide characters.
//
// Returns the number of wide characters written, or (size_t)-1 on failure on
// both paths.
DNC_DECLARE size_t ANSIToUNICODE(castr srcData,size_t srcCount, wstr destData,size_t destCount){
#if defined(_MSC_VER)
    const int converted = ::MultiByteToWideChar(CP_ACP,0,srcData,(int)srcCount,destData,(int)destCount);
    // MultiByteToWideChar signals failure with 0; report it as (size_t)-1.
    if(converted == 0){
        return (size_t)-1;
    }
    return (size_t)converted;
#else
    (void)srcCount;
    // Return the size_t result untouched: squeezing it through int/unsigned int
    // would turn the (size_t)-1 failure value into 0xFFFFFFFF where size_t is
    // wider than 32 bits.
    return ::mbstowcs(destData,srcData,destCount);
#endif
}
// Converts a wide-character string to a multibyte string.
//
// srcData    source wide-character string.
// srcCount   number of source characters; only used on the MSVC path. On the
//            other path wcstombs takes no source length, so the source must
//            be NUL-terminated.
// destData   destination byte buffer.
// destCount  capacity of destData, in bytes.
//
// Returns the number of bytes written. On the non-MSVC path a failed
// conversion yields (size_t)-1.
// NOTE(review): on the MSVC path a failure is returned as 0 (the raw
// WideCharToMultiByte result), unlike ANSIToUNICODE which maps 0 to -1 --
// confirm whether callers expect that asymmetry.
DNC_DECLARE size_t UNICODEToANSI(cwstr srcData,size_t srcCount,astr destData,size_t destCount){
#if defined(_MSC_VER)
    const int converted = ::WideCharToMultiByte(CP_ACP,0,srcData,(int)srcCount,destData,(int)destCount, NULL, NULL );
    return (unsigned int)converted;
#else
    (void)srcCount;
    // Return the size_t result untouched: squeezing it through int/unsigned int
    // would turn the (size_t)-1 failure value into 0xFFFFFFFF where size_t is
    // wider than 32 bits.
    return ::wcstombs(destData,srcData,destCount);
#endif
}
// ---------------------------------------------------------------------------
// XMLUTF8Transcoder: Implementation of the transcoder API
// ---------------------------------------------------------------------------
DNC_DECLARE size_t ANSIToUTF8(castr srcData,size_t srcCount,ustr destData,size_t destCount){
wchar_t *wstr = (wchar_t*)malloc(srcCount*2);
size_t len = 0;
int ret = 0;
try{
#if defined(WIN32)
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -