?? bnfa_search.c
字號:
/*** bnfa_search.c **** Basic multi-pattern search engine using Aho-Corasick NFA construction.**** Version 3.0 (based on acsmx.c and acsmx2.c)**** author: marc norton** date: started 12/21/05**** Copyright(C) 2005-2007 Sourcefire, Inc.** ** General Design** Aho-Corasick based NFA state machine. ** Compacted sparse storage mode for better performance.** Up to 16 Million states + transitions (combined) in compacted sparse mode.**** ** Compacted sparse array storage ** **** The primary data is held in one array.** The patterns themselves are stored separately.** The matching lists of patterns for each state are stored separately as well.** The compacted sparse format improves caching/performance.**** word 1 : state ( only low 24 bits are used )** word 2 : control word = cb << 24 | fs** cb: control byte ** cb = mb | fb | nt** mb : 8th bit - if set state has matching patterns bit** fb : 7th bit - if set full storage array bit (256 entries used), else sparse** nt : 0-63= number of transitions (more than 63 requires full storage)** fs: 24 bits for failure state transition index.** word 3+ : transition word = input<<24 | next-state-index** input : 8 bit character, input to state machine from search text** next-state-index : 24 bits for index of next state** (if we reallly need 16M states, we can add a state->index lookup array)** ...repeat for each state ...**** * if a state is empty it has words 1 and 2, but no transition words.** ** Construction:**** Patterns are added to a list based trie.** The list based trie is compiled into a list based NFA with failure states.** The list based NFA is converted to full or sparse format NFA. ** The Zero'th state sparse transitions may be stored in full format for performance.** Sparse transition arrays are searched using linear and binary search strategies** depending on the number of entries to search through in each state.** The state machine in sparse mode is compacted into a single vector for * better performance.** ** Notes:** ** The NFA can require twice the state transitions that a DFA uses. However,** the construction of a DFA generates many additional transitions in each** state which consumes significant additional memory. This particular ** implementation is best suited to environments where the very large memory ** requirements of a full state table implementation is not possible and/or ** the speed trade off is warranted to maintain a small memory footprint.**** Each state of an NFA usually has very few transitions but can have up to 256.** It is important to not degenerate into a linear search so we utilize a binary** search if there are more than 5 elements in the state to test for a match.** This allows us to use a simple sparse memory design with an acceptable worst case** search scenario. The binary search over 256 elements is limtied to a max of** 8 tests. The zero'th state may use a full 256 state array, so a quick index lookup** provides the next state transition. The zero'th state is generally visited much** more than other states.**** Compiling : gcc, Intel C/C++, Microsoft C/C++, each optimize differently. My studies** have shown Intel C/C++ 9,8,7 to be the fastest, Microsoft 8,7,6 is next fastest,** and gcc 4.x,3.x,2.x is the slowest of the three. My testing has been mainly on x86.** In general gcc does a poor job with optimizing this state machine for performance, ** compared to other less cache and prefetch sensitive algorithms. I've documented** this behavior in a paper 'Optimizing Pattern Matching for IDS' (www.sourcefire.com,** www.idsresearch.org).**** The code is sensitive to cache optimization and prefetching, as well as instruction ** pipelining. Aren't we all. To this end, the number of patterns, length of search text,** and cpu cache L1,L2,L3 all affect performance. The relative performance of the sparse** and full format NFA and DFA varies as you vary the pattern charactersitics,and search** text length, but strong performance trends are present and stable.****** BNFA API SUMMARY**** bnfa=bnfaNew(); create a state machine** bnfaAddPattern(bnfa,..); add a pattern to the state machine** bnfaCompile (bnfa,..) compile the state machine** bnfaPrintInfo(bnfa); print memory usage and state info** bnfaPrint(bnfa); print the state machine in total ** state=bnfaSearch(bnfa, ...,state); search a data buffer for a pattern match** bnfaFree (bnfa); free the bnfa****** Reference - Efficient String matching: An Aid to Bibliographic Search** Alfred V Aho and Margaret J Corasick** Bell Labratories ** Copyright(C) 1975 Association for Computing Machinery,Inc**** 12/4/06 - man - modified summary** 6/26/07 - man - Added last_match tracking, and accounted for nocase/case by** preseting the last match state, and reverting if we fail the ** case memcmp test for any rule in the states matching rule list.** The states in the defaul matcher represent either case or nocase** states, so they are dual mode, that makes this a bit tricky.** When we sue the pure exact match, or pure don't care matching ** routines, we just track the last state, and never need to revert.** This only tracks the single repeated states and repeated data. ****** LICENSE (GPL)**** This program is free software; you can redistribute it and/or modify** it under the terms of the GNU General Public License Version 2 as** published by the Free Software Foundation. You may not use, modify or** distribute this program under any other version of the GNU General** Public License.**** This program is distributed in the hope that it will be useful,** but WITHOUT ANY WARRANTY; without even the implied warranty of** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the** GNU General Public License for more details.**** You should have received a copy of the GNU General Public License** along with this program; if not, write to the Free Software** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.***/ #include <signal.h>#include <stdio.h>#include <stdlib.h>#include <string.h>#include <ctype.h> #include "bnfa_search.h"#include "util.h"/* * Used to initialize last state, states are limited to 0-16M * so this will not conflict. */#define LAST_STATE_INIT 0xffffffff#define printf LogMessage/** Case Translation Table - his guarantees we use * indexed lookups for case conversion*/ static unsigned char xlatcase[BNFA_MAX_ALPHABET_SIZE];staticvoid init_xlatcase(void) { int i; static int first=1; if( !first ) return; for(i=0; i<BNFA_MAX_ALPHABET_SIZE; i++) { xlatcase[i] = (unsigned char)toupper(i); } first=0;}/** Custom memory allocator*/ staticvoid * bnfa_alloc( int n, int * m ){ void * p = calloc(1,n); if( p ) { if(m) { m[0] += n; } } return p;}staticvoid bnfa_free( void *p, int n, int * m ){ if( p ) { free(p); if(m) { m[0] -= n; } }}#define BNFA_MALLOC(n,memory) bnfa_alloc(n,&(memory))#define BNFA_FREE(p,n,memory) bnfa_free(p,n,&(memory))/* queue memory traker */static int queue_memory=0;/** simple queue node*/ typedef struct _qnode{ unsigned state; struct _qnode *next; }QNODE;/** simple fifo queue structure*/ typedef struct _queue{ QNODE * head, *tail; int count; int maxcnt;}QUEUE;/** Initialize the fifo queue*/ staticvoid queue_init (QUEUE * s) { s->head = s->tail = 0; s->count= 0; s->maxcnt=0;}/** Add items to tail of queue (fifo)*/ staticint queue_add (QUEUE * s, int state) { QNODE * q; if (!s->head) { q = s->tail = s->head = (QNODE *) BNFA_MALLOC (sizeof(QNODE),queue_memory); if(!q) return -1; q->state = state; q->next = 0; } else { q = (QNODE *) BNFA_MALLOC (sizeof(QNODE),queue_memory); q->state = state; q->next = 0; s->tail->next = q; s->tail = q; } s->count++; if( s->count > s->maxcnt ) s->maxcnt = s->count; return 0;}/** Remove items from head of queue (fifo)*/ static int queue_remove (QUEUE * s) { int state = 0; QNODE * q; if (s->head) { q = s->head; state = q->state; s->head = s->head->next; s->count--; if( !s->head ) { s->tail = 0; s->count = 0; } BNFA_FREE (q,sizeof(QNODE),queue_memory); } return state;}/** Return count of items in the queue*/ static int queue_count (QUEUE * s) { return s->count;}/** Free the queue*/ staticvoid queue_free (QUEUE * s) { while (queue_count (s)) { queue_remove (s); }}/** Get next state from transition list*/static int _bnfa_list_get_next_state( bnfa_struct_t * bnfa, int state, int input ){ if ( state == 0 ) /* Full set of states always */ { bnfa_state_t * p = (bnfa_state_t*)bnfa->bnfaTransTable[0]; if(!p) { return 0; } return p[input]; } else { bnfa_trans_node_t * t = bnfa->bnfaTransTable[state]; while( t ) { if( t->key == (unsigned)input ) { return t->next_state; } t=t->next; } return BNFA_FAIL_STATE; /* Fail state */ }}/** Put next state - head insertion, and transition updates*/static int _bnfa_list_put_next_state( bnfa_struct_t * bnfa, int state, int input, int next_state ){ if( state >= bnfa->bnfaMaxStates ) { return -1; } if( input >= bnfa->bnfaAlphabetSize ) { return -1; } if( state == 0 ) { bnfa_state_t * p; p = (bnfa_state_t*)bnfa->bnfaTransTable[0]; if( !p ) { p = (bnfa_state_t*)BNFA_MALLOC(sizeof(bnfa_state_t)*bnfa->bnfaAlphabetSize,bnfa->list_memory); if( !p ) { return -1; } bnfa->bnfaTransTable[0] = (bnfa_trans_node_t*)p; } if( p[input] ) { p[input] = next_state; return 0; } p[input] = next_state; } else { bnfa_trans_node_t * p; bnfa_trans_node_t * tnew; /* Check if the transition already exists, if so just update the next_state */ p = bnfa->bnfaTransTable[state]; while( p ) { if( p->key == (unsigned)input ) /* transition already exists- reset the next state */ { p->next_state = next_state; return 0; } p=p->next; } /* Definitely not an existing transition - add it */ tnew = (bnfa_trans_node_t*)BNFA_MALLOC(sizeof(bnfa_trans_node_t),bnfa->list_memory); if( !tnew ) { return -1; } tnew->key = input; tnew->next_state = next_state; tnew->next = bnfa->bnfaTransTable[state]; bnfa->bnfaTransTable[state] = tnew; } bnfa->bnfaNumTrans++; return 0; }/** Free the entire transition list table */static int _bnfa_list_free_table( bnfa_struct_t * bnfa ){ int i; bnfa_trans_node_t * t, *p; if( !bnfa->bnfaTransTable ) return 0; if( bnfa->bnfaTransTable[0] ) { BNFA_FREE(bnfa->bnfaTransTable[0],sizeof(bnfa_state_t)*bnfa->bnfaAlphabetSize,bnfa->list_memory); } for(i=1; i<bnfa->bnfaMaxStates; i++) { t = bnfa->bnfaTransTable[i]; while( t ) { p = t; t = t->next; BNFA_FREE(p,sizeof(bnfa_trans_node_t),bnfa->list_memory); } } if( bnfa->bnfaTransTable ) { BNFA_FREE(bnfa->bnfaTransTable,sizeof(bnfa_trans_node_t*)*bnfa->bnfaMaxStates,bnfa->list_memory); bnfa->bnfaTransTable = 0; } return 0;}#ifdef ALLOW_LIST_PRINT/** Print the transition list table to stdout*/static int _bnfa_list_print_table( bnfa_struct_t * bnfa ){ int i; bnfa_trans_node_t * t; bnfa_match_node_t * mn; bnfa_pattern_t * patrn; if( !bnfa->bnfaTransTable ) { return 0; } printf("Print Transition Table- %d active states\n",bnfa->bnfaNumStates); for(i=0;i< bnfa->bnfaNumStates;i++) { printf("state %3d: ",i); if( i == 0 ) { int k; bnfa_state_t * p = (bnfa_state_t*)bnfa->bnfaTransTable[0]; if(!p) continue; for(k=0;k<bnfa->bnfaAlphabetSize;k++) { if( p[k] == 0 ) continue; if( isprint(p[k]) ) printf("%3c->%-5d\t",k,p[k]); else printf("%3d->%-5d\t",k,p[k]); } } else { t = bnfa->bnfaTransTable[i]; while( t ) { if( isprint(t->key) ) printf("%3c->%-5d\t",t->key,t->next_state); else printf("%3d->%-5d\t",t->key,t->next_state); t = t->next; } } mn =bnfa->bnfaMatchList[i]; while( mn ) { patrn =(bnfa_pattern_t *)mn->data; printf("%.*s ",patrn->n,patrn->casepatrn); mn = mn->next; } printf("\n"); } return 0;}#endif/** Converts a single row of states from list format to a full format*/ static int _bnfa_list_conv_row_to_full(bnfa_struct_t * bnfa, bnfa_state_t state, bnfa_state_t * full ){ if( (int)state >= bnfa->bnfaMaxStates ) /* protects 'full' against overflow */ { return -1; } if( state == 0 ) { if( bnfa->bnfaTransTable[0] ) memcpy(full,bnfa->bnfaTransTable[0],sizeof(bnfa_state_t)*bnfa->bnfaAlphabetSize); else memset(full,0,sizeof(bnfa_state_t)*bnfa->bnfaAlphabetSize); return bnfa->bnfaAlphabetSize; } else { int tcnt = 0; bnfa_trans_node_t * t = bnfa->bnfaTransTable[ state ]; memset(full,0,sizeof(bnfa_state_t)*bnfa->bnfaAlphabetSize);
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -