?? ascmagic.c
字號:
/* * Copyright (c) Ian F. Darwin 1986-1995. * Software written by Ian F. Darwin and others; * maintained 1995-present by Christos Zoulas and others. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. *//* * ASCII magic -- file types that we know based on keywords * that can appear anywhere in the file. * * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000, * to handle character codes other than ASCII on a unified basis. * * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit * international characters, now subsumed into this file. 
*/#include "file.h"#include "magic.h"#include <stdio.h>#include <string.h>#include <memory.h>#include <ctype.h>#include <stdlib.h>#ifdef HAVE_UNISTD_H#include <unistd.h>#endif#include "names.h"#ifndef lintFILE_RCSID("@(#)$File: ascmagic.c,v 1.50 2007/03/15 14:51:00 christos Exp $")#endif /* lint */typedef unsigned long unichar;#define MAXLINELEN 300 /* longest sane line length */#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \ || (x) == 0x85 || (x) == '\f')private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);private int looks_utf8(const unsigned char *, size_t, unichar *, size_t *);private int looks_unicode(const unsigned char *, size_t, unichar *, size_t *);private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);private void from_ebcdic(const unsigned char *, size_t, unsigned char *);private int ascmatch(const unsigned char *, const unichar *, size_t);protected intfile_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes){ size_t i; unsigned char *nbuf = NULL; unichar *ubuf = NULL; size_t ulen; struct names *p; int rv = -1; const char *code = NULL; const char *code_mime = NULL; const char *type = NULL; const char *subtype = NULL; const char *subtype_mime = NULL; int has_escapes = 0; int has_backspace = 0; int seen_cr = 0; int n_crlf = 0; int n_lf = 0; int n_cr = 0; int n_nel = 0; size_t last_line_end = (size_t)-1; int has_long_lines = 0; /* * Undo the NUL-termination kindly provided by process() * but leave at least one byte to look at */ while (nbytes > 1 && buf[nbytes - 1] == '\0') nbytes--; if ((nbuf = calloc(1, (nbytes + 1) * sizeof(nbuf[0]))) == NULL) goto done; if ((ubuf = calloc(1, (nbytes + 1) * sizeof(ubuf[0]))) == NULL) goto done; /* * Then try to determine whether it's any character code we can * identify. 
Each of these tests, if it succeeds, will leave * the text converted into one-unichar-per-character Unicode in * ubuf, and the number of characters converted in ulen. */ if (looks_ascii(buf, nbytes, ubuf, &ulen)) { code = "ASCII"; code_mime = "us-ascii"; type = "text"; } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) { code = "UTF-8 Unicode"; code_mime = "utf-8"; type = "text"; } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) { if (i == 1) code = "Little-endian UTF-16 Unicode"; else code = "Big-endian UTF-16 Unicode"; type = "character data"; code_mime = "utf-16"; /* is this defined? */ } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) { code = "ISO-8859"; type = "text"; code_mime = "iso-8859-1"; } else if (looks_extended(buf, nbytes, ubuf, &ulen)) { code = "Non-ISO extended-ASCII"; type = "text"; code_mime = "unknown"; } else { from_ebcdic(buf, nbytes, nbuf); if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) { code = "EBCDIC"; type = "character data"; code_mime = "ebcdic"; } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) { code = "International EBCDIC"; type = "character data"; code_mime = "ebcdic"; } else { rv = 0; goto done; /* doesn't look like text at all */ } } if (nbytes <= 1) { rv = 0; goto done; } /* * for troff, look for . + letter + letter or .\"; * this must be done to disambiguate tar archives' ./file * and other trash from real troff input. * * I believe Plan 9 troff allows non-ASCII characters in the names * of macros, so this test might possibly fail on such a file. 
*/ if ((ms->flags & MAGIC_NO_CHECK_TROFF) == 0 && *ubuf == '.') { unichar *tp = ubuf + 1; while (ISSPC(*tp)) ++tp; /* skip leading whitespace */ if ((tp[0] == '\\' && tp[1] == '\"') || (isascii((unsigned char)tp[0]) && isalnum((unsigned char)tp[0]) && isascii((unsigned char)tp[1]) && isalnum((unsigned char)tp[1]) && ISSPC(tp[2]))) { subtype_mime = "text/troff"; subtype = "troff or preprocessor input"; goto subtype_identified; } } if ((ms->flags & MAGIC_NO_CHECK_FORTRAN) == 0 && (*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) { subtype_mime = "text/fortran"; subtype = "fortran program"; goto subtype_identified; } /* look for tokens from names.h - this is expensive! */ if ((ms->flags & MAGIC_NO_CHECK_TOKENS) != 0) goto subtype_identified; i = 0; while (i < ulen) { size_t end; /* * skip past any leading space */ while (i < ulen && ISSPC(ubuf[i])) i++; if (i >= ulen) break; /* * find the next whitespace */ for (end = i + 1; end < nbytes; end++) if (ISSPC(ubuf[end])) break; /* * compare the word thus isolated against the token list */ for (p = names; p < names + NNAMES; p++) { if (ascmatch((const unsigned char *)p->name, ubuf + i, end - i)) { subtype = types[p->type].human; subtype_mime = types[p->type].mime; goto subtype_identified; } } i = end; }subtype_identified: /* * Now try to discover other details about the file. */ for (i = 0; i < ulen; i++) { if (ubuf[i] == '\n') { if (seen_cr) n_crlf++; else n_lf++; last_line_end = i; } else if (seen_cr) n_cr++; seen_cr = (ubuf[i] == '\r'); if (seen_cr) last_line_end = i; if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */ n_nel++; last_line_end = i; } /* If this line is _longer_ than MAXLINELEN, remember it. */ if (i > last_line_end + MAXLINELEN) has_long_lines = 1; if (ubuf[i] == '\033') has_escapes = 1; if (ubuf[i] == '\b') has_backspace = 1; } /* Beware, if the data has been truncated, the final CR could have been followed by a LF. 
If we have HOWMANY bytes, it indicates that the data might have been truncated, probably even before this function was called. */ if (seen_cr && nbytes < HOWMANY) n_cr++; if ((ms->flags & MAGIC_MIME)) { if (subtype_mime) { if (file_printf(ms, subtype_mime) == -1) goto done; } else { if (file_printf(ms, "text/plain") == -1) goto done; } if (code_mime) { if (file_printf(ms, "; charset=") == -1) goto done; if (file_printf(ms, code_mime) == -1) goto done; } } else { if (file_printf(ms, code) == -1) goto done; if (subtype) { if (file_printf(ms, " ") == -1) goto done; if (file_printf(ms, subtype) == -1) goto done; } if (file_printf(ms, " ") == -1) goto done; if (file_printf(ms, type) == -1) goto done; if (has_long_lines) if (file_printf(ms, ", with very long lines") == -1) goto done; /* * Only report line terminators if we find one other than LF, * or if we find none at all. */ if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) || (n_crlf != 0 || n_cr != 0 || n_nel != 0)) { if (file_printf(ms, ", with") == -1) goto done; if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) { if (file_printf(ms, " no") == -1) goto done; } else { if (n_crlf) { if (file_printf(ms, " CRLF") == -1) goto done; if (n_cr || n_lf || n_nel) if (file_printf(ms, ",") == -1) goto done; } if (n_cr) { if (file_printf(ms, " CR") == -1) goto done; if (n_lf || n_nel) if (file_printf(ms, ",") == -1) goto done; } if (n_lf) { if (file_printf(ms, " LF") == -1) goto done; if (n_nel) if (file_printf(ms, ",") == -1) goto done; } if (n_nel) if (file_printf(ms, " NEL") == -1) goto done; } if (file_printf(ms, " line terminators") == -1) goto done; } if (has_escapes) if (file_printf(ms, ", with escape sequences") == -1) goto done; if (has_backspace) if (file_printf(ms, ", with overstriking") == -1) goto done; }
?? 快捷鍵說明
複製代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -