?? regexpr.pas
字號:
unit RegExpr;
interface
// ======== Determine compiler
{$IFDEF VER80} Sorry, TRegExpr is for 32-bits Delphi only. Delphi 1 is not supported (and whos really care today?!). {$ENDIF}
{$IFDEF VER90} {$DEFINE D2} {$ENDIF} // D2
{$IFDEF VER93} {$DEFINE D2} {$ENDIF} // CPPB 1
{$IFDEF VER100} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D3
{$IFDEF VER110} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // CPPB 3
{$IFDEF VER120} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D4
{$IFDEF VER130} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D5
{$IFDEF VER140} {$DEFINE D6} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D6
{$IFDEF VER150} {$DEFINE D7} {$DEFINE D6} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D7
// ======== Define base compiler options
{$BOOLEVAL OFF}
{$EXTENDEDSYNTAX ON}
{$LONGSTRINGS ON}
{$OPTIMIZATION ON}
{$IFDEF D6}
{$WARN SYMBOL_PLATFORM OFF} // Suppress .Net warnings
{$ENDIF}
{$IFDEF D7}
{$WARN UNSAFE_CAST OFF} // Suppress .Net warnings
{$WARN UNSAFE_TYPE OFF} // Suppress .Net warnings
{$WARN UNSAFE_CODE OFF} // Suppress .Net warnings
{$ENDIF}
{$IFDEF FPC}
{$MODE DELPHI} // Delphi-compatible mode in FreePascal
{$ENDIF}
// ======== Define options for TRegExpr engine
{.$DEFINE UniCode} // Unicode support
{$DEFINE RegExpPCodeDump} // p-code dumping (see Dump method)
{$IFNDEF FPC} // the option is not supported in FreePascal
{$DEFINE reRealExceptionAddr} // exceptions will point to appropriate source line, not to Error procedure
{$ENDIF}
{$DEFINE ComplexBraces} // support braces in complex cases
{$IFNDEF UniCode} // the option applicable only for non-UniCode mode
{$DEFINE UseSetOfChar} // Significant optimization by using set of char
{$ENDIF}
{$IFDEF UseSetOfChar}
{$DEFINE UseFirstCharSet} // Fast skip between matches for r.e. that starts with determined set of chars
{$ENDIF}
// ======== Define Pascal-language options
// Define 'UseAsserts' option (do not edit this definitions).
// Asserts used to catch 'strange bugs' in TRegExpr implementation (when something goes
// completely wrong). You can swith asserts on/off with help of {$C+}/{$C-} compiler options.
{$IFDEF D3} {$DEFINE UseAsserts} {$ENDIF}
{$IFDEF FPC} {$DEFINE UseAsserts} {$ENDIF}
// Define 'use subroutine parameters default values' option (do not edit this definition).
{$IFDEF D4} {$DEFINE DefParam} {$ENDIF}
// Define 'OverMeth' options, to use method overloading (do not edit this definitions).
{$IFDEF D5} {$DEFINE OverMeth} {$ENDIF}
{$IFDEF FPC} {$DEFINE OverMeth} {$ENDIF}
uses
Classes, // TStrings in Split method
SysUtils; // Exception
type
{$IFDEF UniCode}
PRegExprChar = PWideChar;
RegExprString = WideString;
REChar = WideChar;
{$ELSE}
PRegExprChar = PChar;
RegExprString = AnsiString; //###0.952 was string
REChar = Char;
{$ENDIF}
TREOp = REChar; // internal p-code type //###0.933
PREOp = ^TREOp;
TRENextOff = integer; // internal Next "pointer" (offset to current p-code) //###0.933
PRENextOff = ^TRENextOff; // used for extracting Next "pointers" from compiled r.e. //###0.933
TREBracesArg = integer; // type of {m,n} arguments
PREBracesArg = ^TREBracesArg;
const
REOpSz = SizeOf (TREOp) div SizeOf (REChar); // size of p-code in RegExprString units
RENextOffSz = SizeOf (TRENextOff) div SizeOf (REChar); // size of Next 'pointer' -"-
REBracesArgSz = SizeOf (TREBracesArg) div SizeOf (REChar); // size of BRACES arguments -"-
type
TRegExprInvertCaseFunction = function (const Ch : REChar) : REChar
of object;
const
EscChar = '\'; // 'Escape'-char ('\' in common r.e.) used for escaping metachars (\w, \d etc).
RegExprModifierI : boolean = False; // default value for ModifierI
RegExprModifierR : boolean = True; // default value for ModifierR
RegExprModifierS : boolean = True; // default value for ModifierS
RegExprModifierG : boolean = True; // default value for ModifierG
RegExprModifierM : boolean = False; // default value for ModifierM
RegExprModifierX : boolean = False; // default value for ModifierX
RegExprSpaceChars : RegExprString = // default value for SpaceChars
' '#$9#$A#$D#$C;
RegExprWordChars : RegExprString = // default value for WordChars
'0123456789' //###0.940
+ 'abcdefghijklmnopqrstuvwxyz'
+ 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_';
RegExprLineSeparators : RegExprString =// default value for LineSeparators
#$d#$a{$IFDEF UniCode}+#$b#$c#$2028#$2029#$85{$ENDIF}; //###0.947
RegExprLinePairedSeparator : RegExprString =// default value for LinePairedSeparator
#$d#$a;
{ if You need Unix-styled line separators (only \n), then use:
RegExprLineSeparators = #$a;
RegExprLinePairedSeparator = '';
}
const
NSUBEXP = 15; // max number of subexpression //###0.929
// Cannot be more than NSUBEXPMAX
// Be carefull - don't use values which overflow CLOSE opcode
// (in this case you'll get compiler erorr).
// Big NSUBEXP will cause more slow work and more stack required
NSUBEXPMAX = 255; // Max possible value for NSUBEXP. //###0.945
// Don't change it! It's defined by internal TRegExpr design.
MaxBracesArg = $7FFFFFFF - 1; // max value for {n,m} arguments //###0.933
{$IFDEF ComplexBraces}
LoopStackMax = 10; // max depth of loops stack //###0.925
{$ENDIF}
TinySetLen = 3;
// if range includes more then TinySetLen chars, //###0.934
// then use full (32 bytes) ANYOFFULL instead of ANYOF[BUT]TINYSET
// !!! Attension ! If you change TinySetLen, you must
// change code marked as "//!!!TinySet"
type
{$IFDEF UseSetOfChar}
PSetOfREChar = ^TSetOfREChar;
TSetOfREChar = set of REChar;
{$ENDIF}
TRegExpr = class;
TRegExprReplaceFunction = function (ARegExpr : TRegExpr): string
of object;
TRegExpr = class
private
startp : array [0 .. NSUBEXP - 1] of PRegExprChar; // founded expr starting points
endp : array [0 .. NSUBEXP - 1] of PRegExprChar; // founded expr end points
{$IFDEF ComplexBraces}
LoopStack : array [1 .. LoopStackMax] of integer; // state before entering loop
LoopStackIdx : integer; // 0 - out of all loops
{$ENDIF}
// The "internal use only" fields to pass info from compile
// to execute that permits the execute phase to run lots faster on
// simple cases.
regstart : REChar; // char that must begin a match; '\0' if none obvious
reganch : REChar; // is the match anchored (at beginning-of-line only)?
regmust : PRegExprChar; // string (pointer into program) that match must include, or nil
regmlen : integer; // length of regmust string
// Regstart and reganch permit very fast decisions on suitable starting points
// for a match, cutting down the work a lot. Regmust permits fast rejection
// of lines that cannot possibly match. The regmust tests are costly enough
// that regcomp() supplies a regmust only if the r.e. contains something
// potentially expensive (at present, the only such thing detected is * or +
// at the start of the r.e., which can involve a lot of backup). Regmlen is
// supplied because the test in regexec() needs it and regcomp() is computing
// it anyway.
{$IFDEF UseFirstCharSet} //###0.929
FirstCharSet : TSetOfREChar;
{$ENDIF}
// work variables for Exec's routins - save stack in recursion}
reginput : PRegExprChar; // String-input pointer.
fInputStart : PRegExprChar; // Pointer to first char of input string.
fInputEnd : PRegExprChar; // Pointer to char AFTER last char of input string
// work variables for compiler's routines
regparse : PRegExprChar; // Input-scan pointer.
regnpar : integer; // count.
regdummy : char;
regcode : PRegExprChar; // Code-emit pointer; @regdummy = don't.
regsize : integer; // Code size.
regexpbeg : PRegExprChar; // only for error handling. Contains
// pointer to beginning of r.e. while compiling
fExprIsCompiled : boolean; // true if r.e. successfully compiled
// programm is essentially a linear encoding
// of a nondeterministic finite-state machine (aka syntax charts or
// "railroad normal form" in parsing technology). Each node is an opcode
// plus a "next" pointer, possibly plus an operand. "Next" pointers of
// all nodes except BRANCH implement concatenation; a "next" pointer with
// a BRANCH on both ends of it is connecting two alternatives. (Here we
// have one of the subtle syntax dependencies: an individual BRANCH (as
// opposed to a collection of them) is never concatenated with anything
// because of operator precedence.) The operand of some types of node is
// a literal string; for others, it is a node leading into a sub-FSM. In
// particular, the operand of a BRANCH node is the first node of the branch.
// (NB this is *not* a tree structure: the tail of the branch connects
// to the thing following the set of BRANCHes.) The opcodes are:
programm : PRegExprChar; // Unwarranted chumminess with compiler.
fExpression : PRegExprChar; // source of compiled r.e.
fInputString : PRegExprChar; // input string
fLastError : integer; // see Error, LastError
fModifiers : integer; // modifiers
fCompModifiers : integer; // compiler's copy of modifiers
fProgModifiers : integer; // modifiers values from last programm compilation
fSpaceChars : RegExprString; //###0.927
fWordChars : RegExprString; //###0.929
fInvertCase : TRegExprInvertCaseFunction; //###0.927
fLineSeparators : RegExprString; //###0.941
fLinePairedSeparatorAssigned : boolean;
fLinePairedSeparatorHead,
fLinePairedSeparatorTail : REChar;
{$IFNDEF UniCode}
fLineSeparatorsSet : set of REChar;
{$ENDIF}
procedure InvalidateProgramm;
// Mark programm as have to be [re]compiled
function IsProgrammOk : boolean; //###0.941
// Check if we can use precompiled r.e. or
// [re]compile it if something changed
function GetExpression : RegExprString;
procedure SetExpression (const s : RegExprString);
function GetModifierStr : RegExprString;
class function ParseModifiersStr (const AModifiers : RegExprString;
var AModifiersInt : integer) : boolean; //###0.941 class function now
// Parse AModifiers string and return true and set AModifiersInt
// if it's in format 'ismxrg-ismxrg'.
procedure SetModifierStr (const AModifiers : RegExprString);
function GetModifier (AIndex : integer) : boolean;
procedure SetModifier (AIndex : integer; ASet : boolean);
procedure Error (AErrorID : integer); virtual; // error handler.
// Default handler raise exception ERegExpr with
// Message = ErrorMsg (AErrorID), ErrorCode = AErrorID
// and CompilerErrorPos = value of property CompilerErrorPos.
{==================== Compiler section ===================}
function CompileRegExpr (exp : PRegExprChar) : boolean;
// compile a regular expression into internal code
procedure Tail (p : PRegExprChar; val : PRegExprChar);
// set the next-pointer at the end of a node chain
procedure OpTail (p : PRegExprChar; val : PRegExprChar);
// regoptail - regtail on operand of first argument; nop if operandless
function EmitNode (op : TREOp) : PRegExprChar;
// regnode - emit a node, return location
procedure EmitC (b : REChar);
// emit (if appropriate) a byte of code
procedure InsertOperator (op : TREOp; opnd : PRegExprChar; sz : integer); //###0.90
// insert an operator in front of already-emitted operand
// Means relocating the operand.
function ParseReg (paren : integer; var flagp : integer) : PRegExprChar;
// regular expression, i.e. main body or parenthesized thing
function ParseBranch (var flagp : integer) : PRegExprChar;
// one alternative of an | operator
function ParsePiece (var flagp : integer) : PRegExprChar;
// something followed by possible [*+?]
function ParseAtom (var flagp : integer) : PRegExprChar;
// the lowest level
function GetCompilerErrorPos : integer;
// current pos in r.e. - for error hanling
{$IFDEF UseFirstCharSet} //###0.929
procedure FillFirstCharSet (prog : PRegExprChar);
{$ENDIF}
{===================== Mathing section ===================}
function regrepeat (p : PRegExprChar; AMax : integer) : integer;
// repeatedly match something simple, report how many
function regnext (p : PRegExprChar) : PRegExprChar;
// dig the "next" pointer out of a node
function MatchPrim (prog : PRegExprChar) : boolean;
// recursively matching routine
function ExecPrim (AOffset: integer) : boolean;
// Exec for stored InputString
{$IFDEF RegExpPCodeDump}
function DumpOp (op : REChar) : RegExprString;
{$ENDIF}
function GetSubExprMatchCount : integer;
function GetMatchPos (Idx : integer) : integer;
function GetMatchLen (Idx : integer) : integer;
function GetMatch (Idx : integer) : RegExprString;
function GetInputString : RegExprString;
procedure SetInputString (const AInputString : RegExprString);
{$IFNDEF UseSetOfChar}
function StrScanCI (s : PRegExprChar; ch : REChar) : PRegExprChar; //###0.928
{$ENDIF}
procedure SetLineSeparators (const AStr : RegExprString);
procedure SetLinePairedSeparator (const AStr : RegExprString);
function GetLinePairedSeparator : RegExprString;
public
constructor Create;
destructor Destroy; override;
class function VersionMajor : integer; //###0.944
class function VersionMinor : integer; //###0.944
property Expression : RegExprString read GetExpression write SetExpression;
// Regular expression.
// For optimization, TRegExpr will automatically compiles it into 'P-code'
// (You can see it with help of Dump method) and stores in internal
// structures. Real [re]compilation occures only when it really needed -
// while calling Exec[Next], Substitute, Dump, etc
// and only if Expression or other P-code affected properties was changed
// after last [re]compilation.
// If any errors while [re]compilation occures, Error method is called
// (by default Error raises exception - see below)
property ModifierStr : RegExprString read GetModifierStr write SetModifierStr;
// Set/get default values of r.e.syntax modifiers. Modifiers in
// r.e. (?ismx-ismx) will replace this default values.
// If you try to set unsupported modifier, Error will be called
// (by defaul Error raises exception ERegExpr).
property ModifierI : boolean index 1 read GetModifier write SetModifier;
// Modifier /i - caseinsensitive, initialized from RegExprModifierI
property ModifierR : boolean index 2 read GetModifier write SetModifier;
// Modifier /r - use r.e.syntax extended for russian,
// (was property ExtSyntaxEnabled in previous versions)
// If true, then
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -