?? gopherenum-breadth.c
字號:
int y; if ((tup = url_open(url)) == NULL) return 0;; if (url_in_db(tup->url)) { /* Have we been here? */ Debug(43, 1, ("Already Visited URL: %s\n", tup->url)); url_close(tup); return 0; } if ((y = filter_selection(tup))) { Debug(43, 1, ("Removing Candidate: [%s] %s\n", Filter_Type_Name[y], tup->url)); if (not_visited) fprintf(not_visited, "[FILTER] %s\n", tup->url); url_close(tup); return 0; } if (!visit_server(tup)) { Debug(43, 1, ("Server count exceeded: %s\n", tup->url)); if (not_visited) fprintf(not_visited, "[SERVER] %s\n", tup->url); url_close(tup); return 0; } if (!RobotsTxtCheck(tup)) { Debug(43, 1, ("Disallowed by robots.txt: %s\n", tup->url)); if (not_visited) fprintf(not_visited, "[ROBOTS.TXT] %s\n", tup->url); url_close(tup); return 0; } return 1;}static int gopher_enum(up, depth) URL *up; int depth;{ FILE *fp = NULL; char *s = NULL; char *p = NULL; char *q = NULL; char *gopher_name = NULL; char *gopher_path = NULL; char *gopher_host = NULL; char *gopher_port = NULL; int nurls = 0; static char buf[BUFSIZ]; static char urlbuf[BUFSIZ]; static char newurl[BUFSIZ]; if (url_in_db(up->url)) { /* Have we been here? */ Debug(43, 1, ("Already Visited URL: %s\n", up->url)); return 0; } if (url_retrieve(up)) { /* Grab the URL; success? */ Debug(43, 1, ("Cannot Retrieve URL: %s\n", up->url));#ifdef DONT_RETRY_FAILS mark_failed(up);#endif return 0; } if (up->md5 && md5_in_db(up->md5)) { /* Have we been here? */ Debug(43, 1, ("Already Visited MD5: %s\n", up->url)); return 0; } /* Remember that we've been here before */ if (up->md5) mark_retrieved(up); if (up->gophertype == 0) return 0; /* * For each meny entry, convert it to a URL, and add it to * the global list of URLs to process. */ if ((fp = fopen(up->filename, "r")) == NULL) { log_errno2(__FILE__, __LINE__, up->filename); return 0; } while (fgets(buf, BUFSIZ, fp)) { if ((s = strchr(buf, '\r'))) *s = (char) '\n'; strcpy(urlbuf, buf); if ((s = strchr(buf, '\n'))) *s = (char) '\0'; Debug(43, 5, ("Input: %s\n", buf)); if (!strcmp(buf, ".")) break; p = urlbuf; if ((q = strchr(p, '\t')) == NULL) { errorlog("Illegal Gopher format: No Name: %s\n", buf); goto gopher_enum_cont; } *q = (char) '\0'; gopher_name = xstrdup(p); Debug(43, 5, ("gopher_name = '%s'\n", gopher_name)); p = q + 1; if ((q = strchr(p, '\t')) == NULL) { errorlog("Illegal Gopher format: No Path: %s\n", buf); goto gopher_enum_cont; } *q = (char) '\0'; gopher_path = xstrdup(rfc1738_escape(p)); Debug(43, 5, ("gopher_path = '%s'\n", gopher_path)); p = q + 1; if ((q = strchr(p, '\t')) == NULL) { errorlog("Illegal Gopher format: No Host: %s\n", buf); goto gopher_enum_cont; } *q = (char) '\0'; gopher_host = xstrdup(p); Debug(43, 5, ("gopher_host = '%s'\n", gopher_host)); p = q + 1; q = strchr(p, '\t'); if (q == NULL) q = strchr(p, '\n'); if (q == NULL) { errorlog("Illegal Gopher format: No Port: %s\n", buf); goto gopher_enum_cont; } *q = (char) '\0'; gopher_port = xstrdup(p); Debug(43, 5, ("gopher_port = '%s'\n", gopher_port)); /* Fix for wierd cross-site Gopher links - wessels */ if (!strncasecmp(gopher_path, "ftp%3a", 6)) goto gopher_enum_cont; if (!strncasecmp(gopher_path, "ftp:", 4)) goto gopher_enum_cont; if (!strncasecmp(gopher_path, "exec%3a", 7)) goto gopher_enum_cont; if (!strncasecmp(gopher_path, "exec:", 5)) goto gopher_enum_cont; sprintf(newurl, "gopher://%s:%d/%c%s", gopher_host, atoi(gopher_port), gopher_name[0], gopher_path); if (url_is_allowed(newurl)) { add_to_list(newurl, depth); nurls++; } gopher_enum_cont: xfree(gopher_name); gopher_name = NULL; xfree(gopher_path); gopher_path = NULL; xfree(gopher_host); gopher_host = NULL; xfree(gopher_port); gopher_port = NULL; } fclose(fp); Debug(43, 1, ("Adding %d URLs from %s to workload\n", nurls, up->url)); return 1;}/* ---------------------------------------------------------------------- *//* * initialize() - Basic init routines */static void initialize(){ char *s = NULL; extern int liburl_conform_rfc1738; FILE *logfp = NULL;#ifdef USE_HOST_CACHE host_cache_init();#endif cur_depth = max_depth = url_max = host_max = 0; if ((s = getenv("HARVEST_URL_MAX")) != NULL) url_max = atoi(s); if ((s = getenv("HARVEST_HOST_MAX")) != NULL) host_max = atoi(s); if ((s = getenv("HARVEST_DEPTH_MAX")) != NULL) max_depth = atoi(s); if ((s = getenv("HARVEST_DEPTH_CUR")) != NULL) cur_depth = atoi(s); if (url_max < 1) url_max = 250; /* hard-coded maximum */ if (host_max < 1) host_max = 1; /* hard-coded maximum */ if (max_depth < 1) max_depth = 0; /* hard-coded maximum */ host_filterfile = getenv("HARVEST_HOST_FILTER"); url_filterfile = getenv("HARVEST_URL_FILTER"); access_types = getenv("HARVEST_ACCESS_TYPES"); if ((s = getenv("HARVEST_GATHERER_LOGFILE")) != (char *) NULL) logfp = fopen(s, "a+"); if (logfp == (FILE *) NULL) logfp = stderr; init_log3("gopherenum-breadth", logfp, stderr); init_url(); liburl_conform_rfc1738 = 1; filter_initialize(); Debug(43, 5, ("access_mask: %#02X\n", access_mask)); /* Open GDBM databases to keep track of where we've been */ urldb_filename = xstrdup(tempnam(NULL, "Gurl")); urldbf = gdbm_open(urldb_filename, 0, GDBM_NEWDB, 0644, NULL); if (urldbf == NULL) { log_errno(urldb_filename); fatal("gdbm_open: %s: %s", urldb_filename, gdbm_strerror(gdbm_errno)); } hostdb_filename = xstrdup(tempnam(NULL, "Ghost")); hostdbf = gdbm_open(hostdb_filename, 0, GDBM_NEWDB, 0644, NULL); if (hostdbf == NULL) { log_errno(hostdb_filename); fatal("gdbm_open: %s: %s", hostdb_filename, gdbm_strerror(gdbm_errno)); } md5db_filename = xstrdup(tempnam(NULL, "Gmd5")); md5dbf = gdbm_open(md5db_filename, 0, GDBM_NEWDB, 0644, NULL); if (md5dbf == NULL) { log_errno(md5db_filename); fatal("gdbm_open: %s: %s", md5db_filename, gdbm_strerror(gdbm_errno)); } /* open not-visited file */ if ((s = getenv("HARVEST_NOT_VISITED_LOG")) != NULL) not_visited = fopen(s, "a+"); if (not_visited) setbuf(not_visited, NULL);}/* Die gracefully */static void sigdie(x) int x;{ int i;#ifdef USE_HOST_CACHE dump_host_cache(43, 9);#endif if (urldbf != NULL) gdbm_close(urldbf); if (hostdbf != NULL) gdbm_close(hostdbf); if (md5dbf != NULL) gdbm_close(md5dbf); if (not_visited) fclose(not_visited); /* (void) unlink(urldb_filename); */ crremove(urldb_filename); xfree(urldb_filename); /* (void) unlink(hostdb_filename); */ crremove(hostdb_filename); xfree(hostdb_filename); /* (void) unlink(md5db_filename); */ crremove(md5db_filename); xfree(md5db_filename); for (i = 0; i < 100; i++) { if (i > max_depth && depth_hist[i] == 0) break; Log("Found %8d objects at depth %d\n", depth_hist[i], i); } Debug(43, 1, ("gopherenum-breadth: exiting (signal %d)\n", x)); exit(0);}/* ---------------------------------------------------------------------- */static void usage(){ fprintf(stderr, "Usage: gopherenum-breadth gopher-URL\n"); exit(1);}int main(argc, argv) int argc; char **argv;{ URL *up = NULL; list_t *l = NULL; char *url = NULL; int depth = 0; debug_init(); /* from $HARVEST_DEBUG */ for (argc--, argv++; argc > 0 && **argv == '-'; argc--, argv++) { if (!strncmp(*argv, "-D", 2)) { debug_flag(*argv); } } if (argc != 1) usage(); for (depth = 0; depth < 100; depth++) depth_hist[depth] = 0; signal(SIGTERM, sigdie); /* Die gracefully */ signal(SIGINT, sigdie); signal(SIGPIPE, sigdie); /* Quickly clean up on broken pipe */ initialize(); /* Initialize */ Debug(43, 1, ("gopherenum-breadth: Starting...\n")); /* Grab the RootNode URL from the command line */ if ((up = url_open(*argv)) == NULL || up->type != URL_GOPHER) { usage(); } /* Mark the RootNode */ tree_root = xstrdup(up->url); Tail = &head; /* * helpdesk@ecs.soton.ac.uk -- Gatherer visits too many hosts * 6/3/96. Make sure the first URL we start with is added * to the list of servers visited. */ (void)visit_server(up); printf("%s\n", up->url); /* Print tree root */ add_to_list(up->url, cur_depth); /* start at depth = 0 */ url_close(up); for (l = head; l; l = free_from_list(l)) { url = (char *) l->ptr; depth = l->depth; if (depth < 100) depth_hist[depth]++; if (max_depth > 0 && depth > max_depth) { if (not_visited) fprintf(not_visited, "[DEPTH] %s\n", url); Debug(43, 1, ("Maximum Depth of %d Reached: %s\n", max_depth, url)); continue; } Debug(43, 1, ("Processing: [%2d] %s\n", depth, url)); if ((up = url_open(url)) == NULL) continue; /* search for more links from this one */ gopher_enum(up, depth + 1); url_close(up); } finish_url(); sigdie(0); /* NOTREACHED */}
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -