?? gopherenum-depth.c

?? harvest是一個下載html網頁的機器人
?? C
?? 第 1 頁 / 共 2 頁
字號:
12 下一頁
static char rcsid[] = "$Id: gopherenum-depth.c,v 2.5 2000/02/03 12:45:56 sxw Exp $";/* *  gopherenum-depth.c - Depth First RootNode URL enumerator for Gopher URLs * *  Usage: gopherenum-depth gopher-URL * *  Outputs the following format: * *      URL of tree root *      URL <tab> md5 *      ... *      URL <tab> md5 * *  DEBUG: section  43, level 1, 5, 9   Gatherer enumeration for Gopher *  AUTHOR: Harvest derived * *  Harvest Indexer http://harvest.sourceforge.net/ *  ----------------------------------------------- * *  The Harvest Indexer is a continued development of code developed by *  the Harvest Project. Development is carried out by numerous individuals *  in the Internet community, and is not officially connected with the *  original Harvest Project or its funding sources. * *  Please mail lee@arco.de if you are interested in participating *  in the development effort. * *  This program is free software; you can redistribute it and/or modify *  it under the terms of the GNU General Public License as published by *  the Free Software Foundation; either version 2 of the License, or *  (at your option) any later version. * *  This program is distributed in the hope that it will be useful, *  but WITHOUT ANY WARRANTY; without even the implied warranty of *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *  GNU General Public License for more details. * *  You should have received a copy of the GNU General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//*  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. * *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): * *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. 
Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. *          Duane Wessels of the University of Colorado at Boulder. * *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. * *  TERMS OF USE * *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. * *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. * *  DERIVATIVE WORKS * *    Users may make derivative works from the Harvest software, subject *    to the following constraints: * *      - You must include the above copyright notice and these *        accompanying paragraphs in all forms of derivative works, *        and any documentation and other materials related to such *        distribution and use acknowledge that the software was *        developed at the above institutions. 
* *      - You must notify IRTF-RD regarding your distribution of *        the derivative work. * *      - You must clearly notify users that your are distributing *        a modified version and not the original Harvest software. * *      - Any derivative product is also subject to these copyright *        and use restrictions. * *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. * *  HISTORY OF FREE SOFTWARE STATUS * *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards. 
* */#include <stdio.h>#include <string.h>#include <signal.h>#include <stdlib.h>#include <gdbm.h>#include "util.h"#include "url.h"#define PUBLIC extern#include "filter.h"/* From robots-txt.c */extern int RobotsTxtCheck _PARAMS((URL *));typedef struct _list_t {    void *ptr;    struct _list_t *next;} list_t;/* Global variables */int max_depth = 0;int start_depth = 0;/* Local variables */static int url_max = 0;static int nurls = 0;static int host_max = 0;static int nhosts = 0;static char *tree_root = NULL;static char *urldb_filename = NULL;static char *hostdb_filename = NULL;static char *md5db_filename = NULL;static GDBM_FILE urldbf = NULL;static GDBM_FILE hostdbf = NULL;static GDBM_FILE md5dbf = NULL;/* Local functions */static void process_url();static void usage();static void mark_failed();static void mark_retrieved();static void sigdie();static int url_in_db();static int md5_in_db();/* ---------------------------------------------------------------------- *//* *  mark_failed() - Mark that a URL failed to be retrieved, so that the *  enumerator doesn't try it again. This option may not be wanted by *  some users and so should be configurable. */static void mark_failed(URL *up) {    datum k,d;    Debug(43, 9, ("mark_failed: url='%s'",up->url));    k.dptr = xstrdup(up->url);    k.dsize = strlen(k.dptr) + 1;    d.dptr = xstrdup("FailedAccess");    d.dsize = strlen(d.dptr) + 1;    if (!gdbm_exists(urldbf, k) && gdbm_store(urldbf, k, d, GDBM_INSERT))        fatal("GDBM URLDB: %s: %s", k.dptr, gdbm_strerror(gdbm_errno));    xfree(k.dptr);    xfree(d.dptr);}/* *  mark_retrieved() - Mark that the given URL was successfully retrieved, *  so that the URL is not retrieved again.  This prevents cycles in the *  enumeration. 
*/static void mark_retrieved(up)     URL *up;{    datum k, d;    k.dptr = xstrdup(up->url);    k.dsize = strlen(k.dptr) + 1;    d.dptr = xstrdup(up->md5);    d.dsize = strlen(d.dptr) + 1;    if (!gdbm_exists(urldbf, k) && gdbm_store(urldbf, k, d, GDBM_INSERT))	fatal("GDBM URLDB: %s: %s", k.dptr, gdbm_strerror(gdbm_errno));    if (!gdbm_exists(md5dbf, d) && gdbm_store(md5dbf, d, k, GDBM_INSERT))	fatal("GDBM MD5DB: %s: %s", k.dptr, gdbm_strerror(gdbm_errno));    /* Print URL to stdout to enumerate; flush to keep pipe moving */    fprintf(stdout, "%s\t%s\n", up->url, up->md5);	/* URL <tab> MD5 */    fflush(stdout);    if (nurls++ >= url_max) {	Log("Truncating RootNode %s at %d LeafNode URLs\n",	    tree_root, url_max);	sigdie();    }    xfree(k.dptr);		/* Clean up */    xfree(d.dptr);}/* *  url_in_db() - check to see if the URL is in the database */static int url_in_db(url)     char *url;{    datum k;    int r;    k.dptr = xstrdup(url);    k.dsize = strlen(k.dptr) + 1;    r = gdbm_exists(urldbf, k);    xfree(k.dptr);    return (r);}/* *  md5_in_db() - check to see if the MD5 is in the database */static int md5_in_db(md5)     char *md5;{    datum k;    int r;    k.dptr = xstrdup(md5);    k.dsize = strlen(k.dptr) + 1;    r = gdbm_exists(md5dbf, k);    xfree(k.dptr);    return (r);}/* *  host_in_db() - check to see if the host is in the database */static int host_in_db(host)     char *host;{    datum k;    int r;    Host *h;    h = get_host(host);    if (h == (Host *) NULL)	return 0;    k.dptr = xstrdup(h->dotaddr);    k.dsize = strlen(k.dptr) + 1;    r = gdbm_exists(hostdbf, k);    xfree(k.dptr);    return (r);}/* *  visit_server() - Determine if we should visit the server.  Return *  zero if we should not process the URL; otherwise, return non-zero. 
*/static int visit_server(up)     URL *up;{    datum k, d;    Host *h;    if (host_in_db(up->host))	/* Host is already in the db */	return (1);    if (++nhosts > host_max)	return (0);    h = get_host(up->host);    if (h == (Host *) NULL)	return (0);    k.dptr = xstrdup(h->dotaddr);    k.dsize = strlen(k.dptr) + 1;    d.dptr = xstrdup(up->url);    d.dsize = strlen(d.dptr) + 1;    if (gdbm_store(hostdbf, k, d, GDBM_INSERT))	fatal("GDBM HOSTDB: %s: %s", k.dptr, gdbm_strerror(gdbm_errno));    xfree(k.dptr);    xfree(d.dptr);    return (1);}/* *  gopher_enum() - Returns all of the URLs.  The buffer that is returned *  has the URLs separated by \n's.  Returns NULL on error. */static list_t *gopher_enum(up)     URL *up;{    char buf[BUFSIZ];    char newurl[BUFSIZ];    list_t *head = NULL;    list_t **Tail = NULL;    list_t *l = NULL;    FILE *fp = NULL;    char *p = NULL;    char *q = NULL;    char *urlbuf = NULL;    char *gopher_name = NULL;    char *gopher_path = NULL;    char *gopher_host = NULL;
12 下一頁
?? 文件大小 7910 K
?? 上傳用戶 pc1667pc1667
?? 所屬分類網絡
??? 相關標簽

#harvest #html #頁
?? 快捷鍵說明

復制代碼 Ctrl + C
搜索代碼 Ctrl + F
全屏模式 F11
切換主題 Ctrl + Shift + D
顯示快捷鍵 ?
增大字號 Ctrl + =
減小字號 Ctrl + -
亚洲欧美第一页_禁久久精品乱码_粉嫩av一区二区三区免费野_久草精品视频

?? gopherenum-depth.c

?? 快捷鍵說明