?? gopherenum-breadth.c
字號:
static char rcsid[] = "$Id: gopherenum-breadth.c,v 2.5 2000/02/03 12:45:56 sxw Exp $";/* * gopherenum-breadth.c - RootNode URL enumerator for Gopher URLs * * Usage: gopherenum-breadth gopher-URL * * Outputs the following format: * * URL of tree root * URL <tab> md5 * ... * URL <tab> md5 * * DEBUG: section 43, level 1, 5, 9 Gatherer enumeration for Gopher * AUTHOR: Harvest derived * * Harvest Indexer http://harvest.sourceforge.net/ * ----------------------------------------------- * * The Harvest Indexer is a continued development of code developed by * the Harvest Project. Development is carried out by numerous individuals * in the Internet community, and is not officially connected with the * original Harvest Project or its funding sources. * * Please mail lee@arco.de if you are interested in participating * in the development effort. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* ---------------------------------------------------------------------- * Copyright (c) 1994, 1995. All rights reserved. * * The Harvest software was developed by the Internet Research Task * Force Research Group on Resource Discovery (IRTF-RD): * * Mic Bowman of Transarc Corporation. * Peter Danzig of the University of Southern California. * Darren R. Hardy of the University of Colorado at Boulder. * Udi Manber of the University of Arizona. * Michael F. 
Schwartz of the University of Colorado at Boulder. * Duane Wessels of the University of Colorado at Boulder. * * This copyright notice applies to software in the Harvest * ``src/'' directory only. Users should consult the individual * copyright notices in the ``components/'' subdirectories for * copyright information about other software bundled with the * Harvest source code distribution. * * TERMS OF USE * * The Harvest software may be used and re-distributed without * charge, provided that the software origin and research team are * cited in any use of the system. Most commonly this is * accomplished by including a link to the Harvest Home Page * (gopher://harvest.cs.colorado.edu/) from the query page of any * Broker you deploy, as well as in the query result pages. These * links are generated automatically by the standard Broker * software distribution. * * The Harvest software is provided ``as is'', without express or * implied warranty, and with no support nor obligation to assist * in its use, correction, modification or enhancement. We assume * no liability with respect to the infringement of copyrights, * trade secrets, or any patents, and are not responsible for * consequential damages. Proper use of the Harvest software is * entirely the responsibility of the user. * * DERIVATIVE WORKS * * Users may make derivative works from the Harvest software, subject * to the following constraints: * * - You must include the above copyright notice and these * accompanying paragraphs in all forms of derivative works, * and any documentation and other materials related to such * distribution and use acknowledge that the software was * developed at the above institutions. * * - You must notify IRTF-RD regarding your distribution of * the derivative work. * * - You must clearly notify users that your are distributing * a modified version and not the original Harvest software. * * - Any derivative product is also subject to these copyright * and use restrictions. 
* * Note that the Harvest software is NOT in the public domain. We * retain copyright, as specified above. * * HISTORY OF FREE SOFTWARE STATUS * * Originally we required sites to license the software in cases * where they were going to build commercial products/services * around Harvest. In June 1995 we changed this policy. We now * allow people to use the core Harvest software (the code found in * the Harvest ``src/'' directory) for free. We made this change * in the interest of encouraging the widest possible deployment of * the technology. The Harvest software is really a reference * implementation of a set of protocols and formats, some of which * we intend to standardize. We encourage commercial * re-implementations of code complying to this set of standards. * */#include <stdio.h>#include <stdlib.h>#include <unistd.h>#include <memory.h>#include <string.h>#include <signal.h>#include <gdbm.h>#include "util.h"#include "url.h"#define PUBLIC extern#include "filter.h"typedef struct _list_t { void *ptr; int depth; struct _list_t *next;} list_t;list_t *head = NULL;list_t **Tail = NULL;/* define HOST_COUNT_IP to 'count' visited hosts based on IP, not the *//* given hostname. 
This way aliased machines will be properly *//* enumerated */#define HOST_COUNT_IP/* Global variables */int max_depth = 0;int cur_depth = 0;int depth_hist[100];/* Local variables */static int url_max = 0;static int nurls = 0;static int host_max = 0;static int nhosts = 0;static char *tree_root = NULL;static char *urldb_filename = NULL;static char *hostdb_filename = NULL;static char *md5db_filename = NULL;static GDBM_FILE urldbf = NULL;static GDBM_FILE hostdbf = NULL;static GDBM_FILE md5dbf = NULL;static FILE *not_visited = NULL;/* Local functions */static void usage();static void mark_failed();static void mark_retrieved();static void sigdie();static int url_in_db();static int md5_in_db();static int gopher_enum();extern int RobotsTxtCheck _PARAMS((URL *));list_t *add_to_list(url, depth) char *url; int depth;{ list_t *l = NULL; l = (list_t *) xmalloc(sizeof(list_t)); l->ptr = (void *) xstrdup(url); l->next = (list_t *) NULL; l->depth = depth; *Tail = l; Tail = &(l->next); return l;}list_t *free_from_list(l) list_t *l;{ list_t *r = NULL; r = l->next; xfree(l->ptr); xfree(l); return r;}/* ---------------------------------------------------------------------- *//* * mark_failed() - Mark that a URL failed to be retrieved, so that the * enumerator doesn't try it again. This option may not be wanted by * some users and so should be configurable. */static void mark_failed(URL *up) { datum k,d; Debug(43, 9, ("mark_failed: url='%s'",up->url)); k.dptr = xstrdup(up->url); k.dsize = strlen(k.dptr) + 1; d.dptr = xstrdup("FailedAccess"); d.dsize = strlen(d.dptr) + 1; if (!gdbm_exists(urldbf, k) && gdbm_store(urldbf, k, d, GDBM_INSERT)) fatal("GDBM URLDB: %s: %s", k.dptr, gdbm_strerror(gdbm_errno)); xfree(k.dptr); xfree(d.dptr);}/* * mark_retrieved() - Mark that the given URL was successfully retrieved, * so that the URL is not retrieved again. This prevents cycles in the * enumeration. 
*/static void mark_retrieved(up) URL *up;{ datum k, d; Debug(43, 9, ("mark_retrieved: url='%s', md5='%s'\n", up->url, up->md5)); k.dptr = xstrdup(up->url); k.dsize = strlen(k.dptr) + 1; d.dptr = xstrdup(up->md5); d.dsize = strlen(d.dptr) + 1; if (!gdbm_exists(urldbf, k) && gdbm_store(urldbf, k, d, GDBM_INSERT)) fatal("GDBM URLDB: %s: %s", k.dptr, gdbm_strerror(gdbm_errno)); if (!gdbm_exists(md5dbf, d) && gdbm_store(md5dbf, d, k, GDBM_INSERT)) fatal("GDBM MD5DB: %s: %s", k.dptr, gdbm_strerror(gdbm_errno)); xfree(k.dptr); xfree(d.dptr); /* Print URL to stdout to enumerate; flush to keep pipe moving */ fprintf(stdout, "%s\t%s\n", up->url, up->md5); /* URL <tab> MD5 */ fflush(stdout); if (nurls++ >= url_max) { Log("Truncating RootNode %s at %d LeafNode URLs\n", tree_root, url_max); url_close(up); up = NULL; sigdie(0); }}/* * url_in_db() - check to see if the URL is in the database */static int url_in_db(url) char *url;{ datum k; int r; Debug(43, 9, ("url_in_db: checking for url='%s'\n", url)); k.dptr = xstrdup(url); k.dsize = strlen(k.dptr) + 1; r = gdbm_exists(urldbf, k); xfree(k.dptr); return (r);}/* * md5_in_db() - check to see if the MD5 is in the database */static int md5_in_db(md5) char *md5;{ datum k; int r; k.dptr = xstrdup(md5); k.dsize = strlen(k.dptr) + 1; r = gdbm_exists(md5dbf, k); xfree(k.dptr); return (r);}/* * host_in_db() - check to see if the host is in the database */static int host_in_db(host) char *host;{ datum k; int r;#ifdef HOST_COUNT_IP Host *h; h = get_host(host); if (!h) return 0; k.dptr = xstrdup(h->dotaddr);#else k.dptr = xstrdup(host);#endif k.dsize = strlen(k.dptr) + 1; r = gdbm_exists(hostdbf, k); xfree(k.dptr); return (r);}/* * visit_server() - Determine if we should visit the server. Return * zero if we should not process the URL; otherwise, return non-zero. 
*/
/*
 * A host already recorded in the host database is always accepted.  A
 * new host is accepted, and recorded with the URL that introduced it,
 * only while the number of new hosts stays within host_max.
 *
 * With HOST_COUNT_IP defined the host is resolved before it is counted,
 * so a host for which get_host() returns NULL is rejected without using
 * up one of the host_max slots.  A failed store is fatal.
 */
static int visit_server(URL *up)
{
    datum k, d;
#ifdef HOST_COUNT_IP
    Host *h = NULL;
#endif

    if (host_in_db(up->host))	/* Host is already in the db */
	return (1);

#ifdef HOST_COUNT_IP
    /* Resolve first: only hosts we can actually key by address count. */
    h = get_host(up->host);
    if (!h)
	return (0);
#endif

    if (++nhosts > host_max)
	return (0);

#ifdef HOST_COUNT_IP
    k.dptr = xstrdup(h->dotaddr);
#else
    k.dptr = xstrdup(up->host);
#endif
    k.dsize = strlen(k.dptr) + 1;
    d.dptr = xstrdup(up->url);
    d.dsize = strlen(d.dptr) + 1;

    if (gdbm_store(hostdbf, k, d, GDBM_INSERT))
	fatal("GDBM HOSTDB: %s: %s", k.dptr, gdbm_strerror(gdbm_errno));

    xfree(k.dptr);
    xfree(d.dptr);
    return (1);
}

int url_is_allowed(url)
     char *url;
{
    URL *tup = NULL;
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -