?? uri.l
字號:
/**
 * uri.l -- Routines dealing with URI, mainly parsing and merging.
 * Created: Xie Han, net lab of Peking University. <e@pku.edu.cn>
 *
 * This is the first module of the web crawler. Used widely.
 * Created: Sep 25 04:15am 2003. version 0.1.1
 * Last updated: Oct 13 04:15am 2005. version 1.6.3
 *
 * Overview of this file:
 *   - the definitions section names the RFC 2396 productions as patterns;
 *   - the rules section walks a URI-reference component by component
 *     using one start condition per component (SCHEME, REL_PATH,
 *     AUTHORITY, USERINFO, HOST, PORT, REG_NAME, ABS_PATH, OPAQUE_PART,
 *     QUERY, FRAGMENT, ACCEPT) and stores each matched component in the
 *     struct uri being filled in;
 *   - the user-code section holds lookup tables and the entry points
 *     uri_parse_string() and uri_parse_bytes().
 */

/* The following are the BNF productions generating URI-reference, taken
 * from RFC 2396. */
URI-reference	({absoluteURI}|{relativeURI})?("#"{fragment})?
absoluteURI		{scheme}":"({hier_part}|{opaque_part})
relativeURI		({net_path}|{abs_path}|{rel_path})("?"{query})?
hier_part		({net_path}|{abs_path})("?"{query})?
opaque_part		{uric_no_slash}{uric}*
uric_no_slash	{unreserved}|{escaped}|";"|"?"|":"|"@"|"&"|"="|"+"|"$"|","
net_path		"//"{authority}{abs_path}?
abs_path		"/"{path_segments}
rel_path		{rel_segment}{abs_path}?
rel_segment		({unreserved}|{escaped}|";"|"@"|"&"|"="|"+"|"$"|",")+
scheme			{alpha}({alpha}|{digit}|"+"|"-"|".")*
authority		{server}|{reg_name}
reg_name		({unreserved}|{escaped}|"$"|","|";"|":"|"@"|"&"|"="|"+")+
server			(({userinfo}"@")?{hostport})?
userinfo		({unreserved}|{escaped}|";"|":"|"&"|"="|"+"|"$"|",")*
hostport		{host}(":"{port})?
host			{hostname}|{IPv4address}
hostname		({domainlabel}".")*{toplabel}"."?
domainlabel		{alphanum}|{alphanum}({alphanum}|"-")*{alphanum}
toplabel		{alpha}|{alpha}({alphanum}|"-")*{alphanum}
IPv4address		{digit}+"."{digit}+"."{digit}+"."{digit}+
port			{digit}*
path			({abs_path}|{opaque_part})?
path_segments	{segment}("/"{segment})*
segment			{pchar}*(";"{param})*
param			{pchar}*
pchar			{unreserved}|{escaped}|":"|"@"|"&"|"="|"+"|"$"|","
query			{uric}*
fragment		{uric}*
uric			{reserved}|{unreserved}|{escaped}
reserved		";"|"/"|"?"|":"|"@"|"&"|"="|"+"|"$"|","
unreserved		{alphanum}|{mark}
mark			"-"|"_"|"."|"!"|"~"|"*"|"'"|"("|")"
escaped			"%"{hex}{hex}
hex				{digit}|[A-Fa-f]
alphanum		{alpha}|{digit}
alpha			{lowalpha}|{upalpha}
lowalpha		[a-z]
upalpha			[A-Z]
digit			[0-9]

%option stack
%s SCHEME REL_PATH AUTHORITY USERINFO HOST PORT REG_NAME ABS_PATH
%s OPAQUE_PART QUERY FRAGMENT ACCEPT

%{
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stack.h>
#include "uri.h"

/* Set every component pointer of a struct uri to NULL. */
#define URI_INIT(uri) \
do { \
	(uri)->scheme = NULL; \
	(uri)->authority = NULL; \
	(uri)->path = NULL; \
	(uri)->query = NULL; \
	(uri)->fragment = NULL; \
} while (0)

/* Record the authority type `at` in (auth)->type and NULL the members
 * that belong to that type: userinfo/host/port for AT_SERVER, reg_name
 * otherwise. */
#define AUTH_INIT(auth, at) \
do { \
	if (((auth)->type = (at)) == AT_SERVER) \
	{ \
		(auth)->userinfo = NULL; \
		(auth)->host = NULL; \
		(auth)->port = NULL; \
	} \
	else \
		(auth)->reg_name = NULL; \
} while (0)

/* free() the members that belong to the authority's current type. The
 * authority structure itself is not freed and the pointers are not
 * reset. */
#define AUTH_DESTROY(auth) \
do { \
	if ((auth)->type == AT_SERVER) \
	{ \
		free((auth)->userinfo); \
		free((auth)->host); \
		free((auth)->port); \
	} \
	else \
		free((auth)->reg_name); \
} while (0)

/* Number of input characters consumed so far by the current parse. */
static int __length;
/* The struct uri being filled in by the current parse. */
static struct uri *__uri;

/* Copy the first n bytes at s into a freshly malloc()ed buffer and
 * append a terminating '\0'. Returns the new string, or NULL when
 * malloc() fails. */
char *__memtostr(const void *s, int n)
{
	char *str = (char *)malloc((n + 1) * sizeof (char));
	if (str)
	{
		memcpy(str, s, n);
		*(str + n) = '\0';
	}
	return str;
}

%}

%%

	/* Every rule that stores a component follows the same pattern: on
	 * success add yyleng to __length and move to the next start
	 * condition; when the allocation fails, call uri_destroy() on the
	 * uri and make yylex() return -1. The catch-all ".|\n" rules push
	 * the unmatched character back with yyless(0) before moving on. */

	/* scheme ":" -- stored without the trailing colon. AUTHORITY is
	 * entered with yy_push_state(), so SCHEME stays on the start
	 * condition stack for the ABS_PATH rules to inspect later. */
<SCHEME>{scheme}":"		{
	if (__uri->scheme = __memtostr(yytext, yyleng - 1))
	{
		__length += yyleng;
		yy_push_state(AUTHORITY);
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}
<SCHEME>.|\n			{
	yyless(0);
	BEGIN REL_PATH;
}
<SCHEME><<EOF>>			BEGIN REL_PATH;

	/* No scheme was found: try a relative path first. */
<REL_PATH>{rel_path}	{
	if (__uri->path = __memtostr(yytext, yyleng))
	{
		__length += yyleng;
		BEGIN QUERY;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}
<REL_PATH>.|\n			{
	yyless(0);
	yy_push_state(AUTHORITY);
}
<REL_PATH><<EOF>>		yy_push_state(AUTHORITY);

	/* Authority and abs_path have conflict! If the following is "//",
	 * we assume that it's an authority; if the following is "/", it's
	 * an abs_path. */
<AUTHORITY>"//"			{
	/* Drop the start condition saved when AUTHORITY was pushed; the
	 * BEGIN below replaces it. */
	yy_pop_state();
	__uri->authority = (struct authority *)malloc(sizeof (struct authority));
	if (__uri->authority)
	{
		AUTH_INIT(__uri->authority, AT_SERVER);
		__length += yyleng;
		BEGIN USERINFO;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}
<AUTHORITY>.|\n			{
	/* No "//": leave AUTHORITY on the stack so the ABS_PATH rules can
	 * tell that the URI has no authority. */
	yyless(0);
	yy_push_state(ABS_PATH);
}
<AUTHORITY><<EOF>>		yy_push_state(ABS_PATH);

	/* userinfo "@" -- stored without the trailing '@'. */
<USERINFO>{userinfo}"@"	{
	if (__uri->authority->userinfo = __memtostr(yytext, yyleng - 1))
	{
		__length += yyleng;
		BEGIN HOST;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}
<USERINFO>.|\n			{
	yyless(0);
	BEGIN HOST;
}
<USERINFO><<EOF>>		BEGIN HOST;

<HOST>{host}			{
	if (__uri->authority->host = __memtostr(yytext, yyleng))
	{
		__length += yyleng;
		BEGIN PORT;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}
<HOST>.|\n				{
	yyless(0);
	BEGIN REG_NAME;
}
<HOST><<EOF>>			BEGIN REG_NAME;

	/* ":" port -- stored without the leading ':'. */
<PORT>":"{port}			{
	if (__uri->authority->port = __memtostr(yytext + 1, yyleng - 1))
	{
		__length += yyleng;
		BEGIN REG_NAME;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}
<PORT>.|\n				{
	yyless(0);
	BEGIN REG_NAME;
}
<PORT><<EOF>>			BEGIN REG_NAME;

<REG_NAME>{reg_name}	{
	/* We have assumed that the authority is a server, but it seems that
	 * we are wrong: it's a reg_name. We should join the userinfo, host
	 * and the port together with this yytext into a reg_name. */
	char *reg_name;
	int len = yyleng;
	char *curpos;

	/* First pass: compute the joined length. The "+ 1" terms account
	 * for the '@' after userinfo and the ':' before port. */
	if (__uri->authority->userinfo)
		len += strlen(__uri->authority->userinfo) + 1;
	if (__uri->authority->host)
		len += strlen(__uri->authority->host);
	if (__uri->authority->port)
		len += strlen(__uri->authority->port) + 1;

	if (reg_name = (char *)malloc((len + 1) * sizeof (char)))
	{
		/* Second pass: copy the pieces; len is reused as the length of
		 * the piece currently being copied. */
		curpos = reg_name;
		if (__uri->authority->userinfo)
		{
			len = strlen(__uri->authority->userinfo);
			memcpy(curpos, __uri->authority->userinfo, len);
			curpos += len;
			*curpos++ = '@';
		}
		if (__uri->authority->host)
		{
			len = strlen(__uri->authority->host);
			memcpy(curpos, __uri->authority->host, len);
			curpos += len;
		}
		if (__uri->authority->port)
		{
			*curpos++ = ':';
			len = strlen(__uri->authority->port);
			memcpy(curpos, __uri->authority->port, len);
			curpos += len;
		}
		len = strlen(yytext);
		memcpy(curpos, yytext, len);
		curpos += len;
		*curpos = '\0';
		/* Release the server members, then switch the authority over
		 * to the reg_name representation. */
		AUTH_DESTROY(__uri->authority);
		AUTH_INIT(__uri->authority, AT_REG_NAME);
		__uri->authority->reg_name = reg_name;
		__length += yyleng;
		yy_push_state(ABS_PATH);
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}
<REG_NAME>.|\n			{
	yyless(0);
	yy_push_state(ABS_PATH);
}
<REG_NAME><<EOF>>		yy_push_state(ABS_PATH);

	/* ABS_PATH is always entered with yy_push_state(), either from
	 * AUTHORITY (no "//" was seen) or from REG_NAME (an authority was
	 * parsed). Both rules below pop back to find out which. */
<ABS_PATH>{abs_path}	{
	yy_pop_state();
	/* Came here straight from AUTHORITY: also discard the start
	 * condition that was saved when AUTHORITY itself was pushed. */
	if (YY_START == AUTHORITY)
		yy_pop_state();
	if (__uri->path = __memtostr(yytext, yyleng))
	{
		__length += yyleng;
		BEGIN QUERY;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}
<ABS_PATH>.|\n			|
<ABS_PATH><<EOF>>		{
	/* When encountered an EOF we can not yyless. The yyleng == 1 test
	 * is what separates the one-character ".|\n" match from <<EOF>>. */
	if (yyleng == 1)
		yyless(0);
	yy_pop_state();
	/* The previous state is "AUTHORITY" indicates the URI
	 * has NO authority. */
	if (YY_START == AUTHORITY)
	{
		yy_pop_state();
		/* The previous state is "SCHEME" indicates the URI
		 * HAS a scheme. It's a little confusing. */
		if (YY_START == SCHEME)
			BEGIN OPAQUE_PART;
		else
			BEGIN FRAGMENT;
	}
	else
		BEGIN QUERY;
}

	/* A scheme followed by neither an authority nor an abs_path: the
	 * opaque part is stored in the path member. */
<OPAQUE_PART>{opaque_part}	{
	if (__uri->path = __memtostr(yytext, yyleng))
	{
		__length += yyleng;
		BEGIN FRAGMENT;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}
<OPAQUE_PART>.|\n		{
	yyless(0);
	BEGIN FRAGMENT;
}
<OPAQUE_PART><<EOF>>	BEGIN FRAGMENT;

	/* "?" query -- stored without the leading '?'. */
<QUERY>"?"{query}		{
	if (__uri->query = __memtostr(yytext + 1, yyleng - 1))
	{
		__length += yyleng;
		BEGIN FRAGMENT;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}
<QUERY>.|\n				{
	yyless(0);
	BEGIN FRAGMENT;
}
<QUERY><<EOF>>			BEGIN FRAGMENT;

	/* "#" fragment -- stored without the leading '#'. */
<FRAGMENT>"#"{fragment}	{
	if (__uri->fragment = __memtostr(yytext + 1, yyleng - 1))
	{
		__length += yyleng;
		BEGIN ACCEPT;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}
<FRAGMENT>.|\n			{
	yyless(0);
	BEGIN ACCEPT;
}
<FRAGMENT><<EOF>>		BEGIN ACCEPT;

	/* Parsing is finished: yylex() returns the number of characters
	 * consumed. */
<ACCEPT>.|\n			{
	yyless(0);
	return __length;
}
<ACCEPT><<EOF>>			return __length;

	/* Rules for the INITIAL start condition. __uri_parse() below always
	 * switches to SCHEME before calling yylex(), so these apply only
	 * when yylex() is entered while still in INITIAL. */
<INITIAL>{URI-reference}	return yyleng;
<INITIAL>.|\n			{
	yyless(0);
	return 0;
}
<INITIAL><<EOF>>		return 0;

%%

/* Always returns 1, i.e. there is no further input after end-of-file. */
int yywrap(void)
{
	return 1;
}

/* Maps an ASCII code (0x00-0x7f) to the numeric value of the hexadecimal
 * digit it denotes: '0'-'9' give 0-9, 'A'-'F' and 'a'-'f' give 10-15.
 * Every other entry is '\0', which is the same value as the entry for
 * '0', so the table alone cannot tell a non-hex character from '0'. */
char __hex2char[] = {
/* 00 nul 01 soh 02 stx 03 etx 04 eot 05 enq 06 ack 07 bel */
	'\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
/* 08 bs 09 ht 0a nl 0b vt 0c np 0d cr 0e so 0f si */
	'\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
/* 10 dle 11 dc1 12 dc2 13 dc3 14 dc4 15 nak 16 syn 17 etb */
	'\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
/* 18 can 19 em 1a sub 1b esc 1c fs 1d gs 1e rs 1f us */
	'\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
/* 20 sp 21 ! 22 " 23 # 24 $ 25 % 26 & 27 ' */
	'\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
/* 28 ( 29 ) 2a * 2b + 2c , 2d - 2e . 2f / */
	'\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
/* 30 0 31 1 32 2 33 3 34 4 35 5 36 6 37 7 */
	0, 1, 2, 3, 4, 5, 6, 7,
/* 38 8 39 9 3a : 3b ; 3c < 3d = 3e > 3f ? */
	8, 9, '\0', '\0', '\0', '\0', '\0', '\0',
/* 40 @ 41 A 42 B 43 C 44 D 45 E 46 F 47 G */
	'\0', 10, 11, 12, 13, 14, 15, '\0',
/* 48 H 49 I 4a J 4b K 4c L 4d M 4e N 4f O */
	'\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
/* 50 P 51 Q 52 R 53 S 54 T 55 U 56 V 57 W */
	'\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
/* 58 X 59 Y 5a Z 5b [ 5c \ 5d ] 5e ^ 5f _ */
	'\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
/* 60 ` 61 a 62 b 63 c 64 d 65 e 66 f 67 g */
	'\0', 10, 11, 12, 13, 14, 15, '\0',
/* 68 h 69 i 6a j 6b k 6c l 6d m 6e n 6f o */
	'\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
/* 70 p 71 q 72 r 73 s 74 t 75 u 76 v 77 w */
	'\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
/* 78 x 79 y 7a z 7b { 7c | 7d } 7e ~ 7f del */
	'\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
};

/* The sixteen upper-case hexadecimal digits, in value order. */
char __char2hex[] = "0123456789ABCDEF";

/* NOTE(review): sixteen bytes, which looks like a 128-bit bitmap with one
 * bit per ASCII code (presumably marking characters that are legal in a
 * URI); the code that reads it is not visible here -- confirm the bit
 * order and meaning against its user. */
char __uri_chr[] = {
	0x00, 0x00, 0x00, 0x00, 0x5b, 0xff, 0xff, 0xf5,
	0xff, 0xff, 0xff, 0xe1, 0x7f, 0xff, 0xff, 0xe2
};

/* Run the scanner over the current input buffer, filling in *uri.
 * Resets the consumed-length counter, NULLs every component of *uri and
 * starts in the SCHEME start condition. Returns whatever yylex()
 * returns: the consumed length from the ACCEPT rules, or -1 from a rule
 * whose allocation failed. */
static int __uri_parse(struct uri *uri)
{
	__uri = uri;
	__length = 0;
	URI_INIT(__uri);
	BEGIN SCHEME;
	return yylex();
}

/* Scan a string ('\0' terminated) and return the length of the uri.
 * Return negative number when and only when failed to allocate memory.
 * n stays -1 when yy_scan_string() returns no buffer; otherwise it is
 * the result of __uri_parse(), and the buffer is deleted afterwards. */
int uri_parse_string(const char *string, struct uri *uri)
{
	YY_BUFFER_STATE buf;
	int n = -1;

	if (buf = yy_scan_string(string))
	{
		yy_switch_to_buffer(buf);
		n = __uri_parse(uri);
		yy_delete_buffer(buf);
	}

	return n;
}

/* Scan some memory bytes: same as uri_parse_string(), but the input is
 * the len bytes at `bytes`, handed to yy_scan_bytes().
 * NOTE(review): the rest of this function is cut off in this copy of
 * the file, right after the call to __uri_parse(). */
int uri_parse_bytes(const char *bytes, int len, struct uri *uri)
{
	YY_BUFFER_STATE buf;
	int n = -1;

	if (buf = yy_scan_bytes(bytes, len))
	{
		yy_switch_to_buffer(buf);
		n = __uri_parse(uri);
?? 快捷鍵說明
複製代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -