?? rx.h
字號:
struct rexp_node{ enum rexp_node_type type; union { rx_Bitset cset; rx_side_effect side_effect; struct { struct rexp_node *left; struct rexp_node *right; } pair; void * data; } params;};/* NFA * * A syntax tree is compiled into an NFA. This page defines the structure * of that NFA. */struct rx_nfa_state{ /* These are kept in a list as the NFA is being built. */ struct rx_nfa_state *next; /* After the NFA is built, states are given integer id's. * States whose outgoing transitions are all either epsilon or * side effect edges are given ids less than 0. Other states * are given successive non-negative ids starting from 0. */ int id; /* The list of NFA edges that go from this state to some other. */ struct rx_nfa_edge *edges; /* If you land in this state, then you implicitly land * in all other states reachable by only epsilon translations. * Call the set of maximal paths to such states the epsilon closure * of this state. * * There may be other states that are reachable by a mixture of * epsilon and side effect edges. Consider the set of maximal paths * of that sort from this state. Call it the epsilon-side-effect * closure of the state. * * The epsilon closure of the state is a subset of the epsilon-side- * effect closure. It consists of all the paths that contain * no side effects -- only epsilon edges. * * The paths in the epsilon-side-effect closure can be partitioned * into equivalance sets. Two paths are equivalant if they have the * same set of side effects, in the same order. The epsilon-closure * is one of these equivalance sets. Let's call these equivalance * sets: observably equivalant path sets. That name is chosen * because equivalance of two paths means they cause the same side * effects -- so they lead to the same subsequent observations other * than that they may wind up in different target states. * * The superstate nfa, which is derived from this nfa, is based on * the observation that all of the paths in an observably equivalant * path set can be explored at the same time, provided that the * matcher keeps track not of a single nfa state, but of a set of * states. In particular, after following all the paths in an * observably equivalant set, you wind up at a set of target states. * That set of target states corresponds to one state in the * superstate NFA. * * Staticly, before matching begins, it is convenient to analyze the * nfa. Each state is labeled with a list of the observably * equivalant path sets who's union covers all the * epsilon-side-effect paths beginning in this state. This list is * called the possible futures of the state. * * A trivial example is this NFA: * s1 * A ---> B * * s2 * ---> C * * epsilon s1 * ---------> D ------> E * * * In this example, A has two possible futures. * One invokes the side effect `s1' and contains two paths, * one ending in state B, the other in state E. * The other invokes the side effect `s2' and contains only * one path, landing in state C. */ struct rx_possible_future *futures; /* There are exactly two distinguished states in every NFA: */ unsigned int is_final:1; unsigned int is_start:1; /* These are used during NFA construction... */ unsigned int eclosure_needed:1; unsigned int mark:1;};/* An edge in an NFA is typed: */enum rx_nfa_etype{ /* A cset edge is labled with a set of characters one of which * must be matched for the edge to be taken. */ ne_cset, /* An epsilon edge is taken whenever its starting state is * reached. */ ne_epsilon, /* A side effect edge is taken whenever its starting state is * reached. Side effects may cause the match to fail or the * position of the matcher to advance. */ ne_side_effect /* A special kind of epsilon. */};struct rx_nfa_edge{ struct rx_nfa_edge *next; enum rx_nfa_etype type; struct rx_nfa_state *dest; union { rx_Bitset cset; rx_side_effect side_effect; } params;};/* A possible future consists of a list of side effects * and a set of destination states. Below are their * representations. These structures are hash-consed which * means that lists with the same elements share a representation * (their addresses are ==). */struct rx_nfa_state_set{ struct rx_nfa_state * car; struct rx_nfa_state_set * cdr;};struct rx_se_list{ rx_side_effect car; struct rx_se_list * cdr;};struct rx_possible_future{ struct rx_possible_future *next; struct rx_se_list * effects; struct rx_nfa_state_set * destset;};/* This begins the description of the superstate NFA. * * The superstate NFA corresponds to the NFA in these ways: * * Every superstate NFA states SUPER correspond to sets of NFA states, * nfa_states(SUPER). * * Superstate edges correspond to NFA paths. * * The superstate has no epsilon transitions; * every edge has a character label, and a (possibly empty) side * effect label. The side effect label corresponds to a list of * side effects that occur in the NFA. These parts are referred * to as: superedge_character(EDGE) and superedge_sides(EDGE). * * For a superstate edge EDGE starting in some superstate SUPER, * the following is true (in pseudo-notation :-): * * exists DEST in nfa_states s.t. * exists nfaEDGE in nfa_edges s.t. * origin (nfaEDGE) == DEST * && origin (nfaEDGE) is a member of nfa_states(SUPER) * && exists PF in possible_futures(dest(nfaEDGE)) s.t. * sides_of_possible_future (PF) == superedge_sides (EDGE) * * also: * * let SUPER2 := superedge_destination(EDGE) * nfa_states(SUPER2) * == union of all nfa state sets S s.t. * exists PF in possible_futures(dest(nfaEDGE)) s.t. * sides_of_possible_future (PF) == superedge_sides (EDGE) * && S == dests_of_possible_future (PF) } * * Or in english, every superstate is a set of nfa states. A given * character and a superstate implies many transitions in the NFA -- * those that begin with an edge labeled with that character from a * state in the set corresponding to the superstate. * * The destinations of those transitions each have a set of possible * futures. A possible future is a list of side effects and a set of * destination NFA states. Two sets of possible futures can be * `merged' by combining all pairs of possible futures that have the * same side effects. A pair is combined by creating a new future * with the same side effect but the union of the two destination sets. * In this way, all the possible futures suggested by a superstate * and a character can be merged into a set of possible futures where * no two elements of the set have the same set of side effects. * * The destination of a possible future, being a set of NFA states, * corresponds to a supernfa state. So, the merged set of possible * futures we just created can serve as a set of edges in the * supernfa. * * The representation of the superstate nfa and the nfa is critical. * The nfa has to be compact, but has to facilitate the rapid * computation of missing superstates. The superstate nfa has to * be fast to interpret, lazilly constructed, and bounded in space. * * To facilitate interpretation, the superstate data structures are * peppered with `instruction frames'. There is an instruction set * defined below which matchers using the supernfa must be able to * interpret. * * We'd like to make it possible but not mandatory to use code * addresses to represent instructions (c.f. gcc's computed goto). * Therefore, we define an enumerated type of opcodes, and when * writing one of these instructions into a data structure, use * the opcode as an index into a table of instruction values. * * Here are the opcodes that occur in the superstate nfa: */ /* Every superstate contains a table of instruction frames indexed * by characters. A normal `move' in a matcher is to fetch the next * character and use it as an index into a superstates transition * * In the fasted case, only one edge follows from that character. * In other cases there is more work to do. * * The descriptions of the opcodes refer to data structures that are * described further below. */enum rx_opcode{ /* * BACKTRACK_POINT is invoked when a character transition in * a superstate leads to more than one edge. In that case, * the edges have to be explored independently using a backtracking * strategy. * * A BACKTRACK_POINT instruction is stored in a superstate's * transition table for some character when it is known that that * character crosses more than one edge. On encountering this * instruction, the matcher saves enough state to backtrack to this * point in the match later. */ rx_backtrack_point = 0, /* data is (struct transition_class *) */ /* * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path. * There is one occurence of this instruction per rx_distinct_future. * This instruction is skipped if a rx_distinct_future has no side effects. */ rx_do_side_effects = rx_backtrack_point + 1, /* data is (struct rx_distinct_future *) */ /* * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose * destination superstate has been reclaimed (or was never built). * It recomputes the destination superstate. * RX_CACHE_MISS is also stored in a superstate transition table before * any of its edges have been built. */ rx_cache_miss = rx_do_side_effects + 1, /* data is (struct rx_distinct_future *) */ /* * RX_NEXT_CHAR is called to consume the next character and take the * corresponding transition. This is the only instruction that uses * the DATA field of the instruction frame instead of DATA_2. * (see EXPLORE_FUTURE in regex.c). */ rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */ /* RX_BACKTRACK indicates that a transition fails. */ rx_backtrack = rx_next_char + 1, /* no data */ /* * RX_ERROR_INX is stored only in places that should never be executed. */ rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */ rx_num_instructions = rx_error_inx + 1};/* An id_instruction_table holds the values stored in instruction * frames. The table is indexed by the enums declared above. */extern void * rx_id_instruction_table[rx_num_instructions];/* The heart of the matcher is a `word-code-interpreter' * (like a byte-code interpreter, except that instructions * are a full word wide). * * Instructions are not stored in a vector of code, instead, * they are scattered throughout the data structures built * by the regexp compiler and the matcher. One word-code instruction, * together with the arguments to that instruction, constitute * an instruction frame (struct rx_inx). * * This structure type is padded by hand to a power of 2 because * in one of the dominant cases, we dispatch by indexing a table * of instruction frames. If that indexing can be accomplished * by just a shift of the index, we're happy. * * Instructions take at most one argument, but there are two * slots in an instruction frame that might hold that argument. * These are called data and data_2. The data slot is only * used for one instruction (RX_NEXT_CHAR). For all other * instructions, data should be set to 0. * * RX_NEXT_CHAR is the most important instruction by far. * By reserving the data field for its exclusive use, * instruction dispatch is sped up in that case. There is * no need to fetch both the instruction and the data, * only the data is needed. In other words, a `cycle' begins * by fetching the field data. If that is non-0, then it must * be the destination state of a next_char transition, so * make that value the current state, advance the match position * by one character, and start a new cycle. On the other hand, * if data is 0, fetch the instruction and do a more complicated * dispatch on that. */struct rx_inx { void * data; void * data_2; void * inx; void * fnord;};#ifndef RX_TAIL_ARRAY#define RX_TAIL_ARRAY 1#endif/* A superstate corresponds to a set of nfa states. Those sets are * represented by STRUCT RX_SUPERSET. The constructors * guarantee that only one (shared) structure is created for a given set. */struct rx_superset{ int refs; /* This is a reference counted structure. */ /* We keep these sets in a cache because (in an unpredictable way), * the same set is often created again and again. But that is also * problematic -- compatibility with POSIX and GNU regex requires * that we not be able to tell when a program discards a particular * NFA (thus invalidating the supersets created from it). * * But when a cache hit appears to occur, we will have in hand the * nfa for which it may have happened. That is why every nfa is given * its own sequence number. On a cache hit, the cache is validated * by comparing the nfa sequence number to this field: */ int id; struct rx_nfa_state * car; /* May or may not be a valid addr. */ struct rx_superset * cdr; /* If the corresponding superstate exists: */ struct rx_superstate * superstate; /* There is another bookkeeping problem. It is expensive to * compute the starting nfa state set for an nfa. So, once computed, * it is cached in the `struct rx'. * * But, the state set can be flushed from the superstate cache. * When that happens, we can't know if the corresponding `struct rx' * is still alive or if it has been freed or re-used by the program. * So, the cached pointer to this set in a struct rx might be invalid
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -