?? encoding.c
字號:
/*
 * encoding.c : implements the encoding conversion functions needed for XML
 *
 * Related specs:
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 characters codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
 *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
 */

#include "global.h"

#ifdef WIN32
#include "win32config.h"
#else
#include "config.h"
#endif

#include <stdio.h>
#include <string.h>

#ifdef HAVE_CTYPE_H
#include <ctype.h>
#endif
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#include <libxml/encoding.h>
#include <libxml/xmlmemory.h>

/*
 * Handlers for the two UTF-16 byte orders. Both start out as NULL; nothing
 * in this part of the file assigns them.
 */
xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;

/*
 * From rfc2044: encoding of the Unicode values on UTF-8:
 *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
 * 0000 0000-0000 007F   0xxxxxxx
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * I hope we won't use values > 0xFFFF anytime soon ! The UTF-16 converters
 * in this file nevertheless emit and accept the four octet form, which is
 * what a UTF-16 surrogate pair maps to.
 */

/**
 * xmlCheckUTF8: Check utf-8 string for legality.
 * @utf: Pointer to putative utf-8 encoded string.
 *
 * Checks @utf for being valid utf-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer utf-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid.
**/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    /* A NULL pointer cannot be a valid string; reject it instead of crashing. */
    if (utf == NULL)
        return(0);

    /*
     * Walk the string one encoded character at a time, classifying each
     * lead byte by its high bits. The checks on the trailing bytes are
     * ordered so that a 0 terminator inside a sequence fails the test for
     * its own position before any later byte is read; the scan therefore
     * never looks past the end of the string.
     */
    for (ix = 0; (c = utf[ix]) != 0;) {
        if ((c & 0x80) == 0x00) {
            /* 1-byte code: 0xxxxxxx */
            ix++;
        } else if ((c & 0xe0) == 0xc0) {
            /* 2-byte code: 110xxxxx 10xxxxxx */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            ix += 2;
        } else if ((c & 0xf0) == 0xe0) {
            /* 3-byte code: 1110xxxx 10xxxxxx 10xxxxxx */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80))
                return(0);
            ix += 3;
        } else if ((c & 0xf8) == 0xf0) {
            /* 4-byte code: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80) ||
                ((utf[ix + 3] & 0xc0) != 0x80))
                return(0);
            ix += 4;
        } else {
            /*
             * Either a continuation byte (10xxxxxx) where a lead byte is
             * required, or a lead byte announcing more than 4 bytes
             * (11111xxx). Neither can start a character.
             */
            return(0);
        }
    }
    return(1);
}

/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out in bytes
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  pointer to the length of @in in bytes; it is read but never
 *          modified by this function
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Bytes below 0x80 are copied unchanged, every other
 * byte becomes a two byte UTF-8 sequence.
 *
 * Returns the number of bytes written, or -1 by lack of space.
 */
int
isolat1ToUTF8(unsigned char* out, int outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    const unsigned char* inend = in + *inlen;
    unsigned char c;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            if (out >= outend)
                return(-1);
            *out++ = c;
        } else {
            /*
             * Make sure both bytes fit before writing the first one, so a
             * failed call never leaves half a sequence in the buffer.
             */
            if (outend - out < 2)
                return(-1);
            *out++ = 0xC0 | (c >> 6);
            *out++ = 0x80 | (0x3F & c);
        }
    }
    return(out - outstart);
}

/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out in bytes
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  pointer to the length of @in in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out.
 * TODO: UTF8Toisolat1 need a fallback mechanism ...
 *
 * Returns the number of bytes written, or -1 by lack of space, or -2
 *     if the transcoding fails (for *in is not valid utf8 string or
 *     the result of transformation can't fit into the encoding we want)
 * The value of *inlen after return is the number of octets consumed
 *     as the return value is positive, else unpredictable.
*/
int
UTF8Toisolat1(unsigned char* out, int outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    const unsigned char* inend = in + *inlen;
    unsigned char c;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            /* plain ASCII, copied as is */
            if (out >= outend)
                return(-1);
            *out++ = c;
        } else if (in == inend) {
            /*
             * The block ends right after a non-ASCII byte: leave that byte
             * unconsumed so the caller can present it again together with
             * the bytes that follow.
             */
            *inlen -= 1;
            break;
        } else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
            /*
             * A two byte sequence whose value fits in 8 bits, i.e. one
             * that can be represented in ISO Latin 1. The output space
             * has to be checked here as well, not only for ASCII.
             */
            if (out >= outend)
                return(-1);
            *out++ = ((c & 0x03) << 6) | (*in++ & 0x3F);
        } else {
            /* TODO : some should be represent as "&#x____;" */
            return(-2);
        }
    }
    return(out - outstart);
}

/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out in bytes
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  pointer to the length of @inb in bytes
 *
 * Take a block of UTF-16LE ushorts in and try to convert it to an UTF-8
 * block of chars out. The input code units are expected in little-endian
 * byte order.
 *
 * Returns the number of bytes written, or -1 by lack of space, or -2
 *     if the transcoding fails (for *in is not valid utf16 string)
 * The value of *inlenb after return is the number of octets consumed
 *     as the return value is positive, else unpredictable.
*/intUTF16LEToUTF8(unsigned char* out, int outlen, const unsigned char* inb, int *inlenb){ unsigned char* outstart= out; unsigned char* outend= out+outlen; unsigned short* in = (unsigned short*) inb; unsigned short* inend; unsigned int c, d, inlen; unsigned char *tmp; int bits; if ((*inlenb % 2) == 1) (*inlenb)--; inlen = *inlenb / 2; inend= in + inlen; while (in < inend) {#ifdef BIG_ENDIAN tmp = (unsigned char *) in; c = *tmp++; c = c | (((unsigned int)*tmp) << 8); in++;#else /* BIG_ENDIAN */ c= *in++;#endif /* BIG_ENDIAN */ if ((c & 0xFC00) == 0xD800) { /* surrogates */ if (in >= inend) { /* (in > inend) shouldn't happens */ (*inlenb) -= 2; break; }#ifdef BIG_ENDIAN tmp = (unsigned char *) in; d = *tmp++; d = d | (((unsigned int)*tmp) << 8); in++;#else /* BIG_ENDIAN */ d = *in++;#endif /* BIG_ENDIAN */ if ((d & 0xFC00) == 0xDC00) { c &= 0x03FF; c <<= 10; c |= d & 0x03FF; c += 0x10000; } else return(-2); } /* assertion: c is a single UTF-4 value */ if (out >= outend) return(-1); if (c < 0x80) { *out++= c; bits= -6; } else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) | 0xC0; bits= 0; } else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) | 0xE0; bits= 6; } else { *out++= ((c >> 18) & 0x07) | 0xF0; bits= 12; } for ( ; bits >= 0; bits-= 6) { if (out >= outend) return(-1); *out++= ((c >> bits) & 0x3F) | 0x80; } } return(out-outstart);}/** * UTF8ToUTF16LE: * @outb: a pointer to an array of bytes to store the result * @outlen: the length of @outb * @in: a pointer to an array of UTF-8 chars * @inlen: the length of @in * * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE * block of chars out. * TODO: UTF8ToUTF16LE need a fallback mechanism ... * * Returns the number of byte written, or -1 by lack of space, or -2 * if the transcoding failed. 
*/intUTF8ToUTF16LE(unsigned char* outb, int outlen, const unsigned char* in, int *inlen){ unsigned short* out = (unsigned short*) outb; unsigned short* outstart= out; unsigned short* outend; const unsigned char* inend= in+*inlen; unsigned int c, d, trailing;#ifdef BIG_ENDIAN unsigned char *tmp; unsigned short tmp1, tmp2;#endif /* BIG_ENDIAN */ outlen /= 2; /* convert in short length */ outend = out + outlen; while (in < inend) { d= *in++; if (d < 0x80) { c= d; trailing= 0; } else if (d < 0xC0) return(-2); /* trailing byte in leading position */ else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } else if (d < 0xF8) { c= d & 0x07; trailing= 3; } else return(-2); /* no chance for this in UTF-16 */ if (inend - in < trailing) { *inlen -= (inend - in); break; } for ( ; trailing; trailing--) { if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) return(-1); c <<= 6; c |= d & 0x3F; } /* assertion: c is a single UTF-4 value */ if (c < 0x10000) { if (out >= outend) return(-1);#ifdef BIG_ENDIAN tmp = (unsigned char *) out; *tmp = c ; *(tmp + 1) = c >> 8 ; out++;#else /* BIG_ENDIAN */ *out++ = c;#endif /* BIG_ENDIAN */ } else if (c < 0x110000) { if (out+1 >= outend) return(-1); c -= 0x10000;#ifdef BIG_ENDIAN tmp1 = 0xD800 | (c >> 10); tmp = (unsigned char *) out; *tmp = tmp1; *(tmp + 1) = tmp1 >> 8; out++; tmp2 = 0xDC00 | (c & 0x03FF); tmp = (unsigned char *) out; *tmp = tmp2; *(tmp + 1) = tmp2 >> 8; out++;#else /* BIG_ENDIAN */ *out++ = 0xD800 | (c >> 10); *out++ = 0xDC00 | (c & 0x03FF);#endif /* BIG_ENDIAN */ } else return(-1); } return(out-outstart);}/** * UTF16BEToUTF8: * @out: a pointer to an array of bytes to store the result * @outlen: the length of @out * @inb: a pointer to an array of UTF-16 passwd as a byte array * @inlenb: the length of @in in UTF-16 chars * * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8 * block of chars out. 
This function assume the endian properity * is the same between the native type of this machine and the * inputed one. * * Returns the number of byte written, or -1 by lack of space, or -2 * if the transcoding fails (for *in is not valid utf16 string) * The value of *inlen after return is the number of octets consumed * as the return value is positive, else unpredictiable. */intUTF16BEToUTF8(unsigned char* out, int outlen, const unsigned char* inb, int *inlenb){ unsigned char* outstart= out; unsigned char* outend= out+outlen; unsigned short* in = (unsigned short*) inb; unsigned short* inend; unsigned int c, d, inlen;#ifdef BIG_ENDIAN#else /* BIG_ENDIAN */ unsigned char *tmp;#endif /* BIG_ENDIAN */ int bits; if ((*inlenb % 2) == 1) (*inlenb)--; inlen = *inlenb / 2; inend= in + inlen; while (in < inend) {#ifdef BIG_ENDIAN c= *in++;#else tmp = (unsigned char *) in; c = *tmp++; c = c << 8; c = c | (unsigned int) *tmp; in++;#endif if ((c & 0xFC00) == 0xD800) { /* surrogates */ if (in >= inend) { /* (in > inend) shouldn't happens */ (*inlenb) -= 2; break; }#ifdef BIG_ENDIAN d= *in++;#else tmp = (unsigned char *) in; d = *tmp++; d = d << 8; d = d | (unsigned int) *tmp; in++;#endif if ((d & 0xFC00) == 0xDC00) { c &= 0x03FF; c <<= 10; c |= d & 0x03FF; c += 0x10000; } else return(-2); } /* assertion: c is a single UTF-4 value */ if (out >= outend) return(-1); if (c < 0x80) { *out++= c; bits= -6; } else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) | 0xC0; bits= 0; } else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) | 0xE0; bits= 6; } else { *out++= ((c >> 18) & 0x07) | 0xF0; bits= 12; } for ( ; bits >= 0; bits-= 6) { if (out >= outend) return(-1); *out++= ((c >> bits) & 0x3F) | 0x80; } } return(out-outstart);}
?? 快捷鍵說明
複製代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號(hào)
Ctrl + =
減小字號(hào)
Ctrl + -