Chameleon

Chameleon Svn Source Tree

Root/branches/xZenu/src/util/doxygen/qtools/qregexp.cpp

Source at commit 1322 created 12 years 8 months ago.
By meklort, Add doxygen to utils folder
1/****************************************************************************
2**
3**
4** Implementation of QRegExp class
5**
6** Created : 950126
7**
8** Copyright (C) 1992-2000 Trolltech AS. All rights reserved.
9**
10** This file is part of the tools module of the Qt GUI Toolkit.
11**
12** This file may be distributed under the terms of the Q Public License
13** as defined by Trolltech AS of Norway and appearing in the file
14** LICENSE.QPL included in the packaging of this file.
15**
16** This file may be distributed and/or modified under the terms of the
17** GNU General Public License version 2 as published by the Free Software
18** Foundation and appearing in the file LICENSE.GPL included in the
19** packaging of this file.
20**
21** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition
22** licenses may use this file in accordance with the Qt Commercial License
23** Agreement provided with the Software.
24**
25** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
26** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
27**
28** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for
29** information about Qt Commercial License Agreements.
30** See http://www.trolltech.com/qpl/ for QPL licensing information.
31** See http://www.trolltech.com/gpl/ for GPL licensing information.
32**
33** Contact info@trolltech.com if any conditions of this licensing are
34** not clear to you.
35**
36**********************************************************************/
37
38#include "qregexp.h"
39#include <ctype.h>
40#include <stdlib.h>
41
42// NOT REVISED
43/*!
44 \class QRegExp qregexp.h
45 \ingroup tools
46 \ingroup misc
47 \brief The QRegExp class provides pattern matching using regular
48 expressions or wildcards.
49
50 QRegExp knows these regexp primitives:
51 <ul plain>
52 <li><dfn>c</dfn> matches the character 'c'
53 <li><dfn>.</dfn> matches any character
54 <li><dfn>^</dfn> matches start of input
55 <li><dfn>$</dfn> matches end of input
56 <li><dfn>[]</dfn> matches a defined set of characters - see below.
57 <li><dfn>a*</dfn> matches a sequence of zero or more a's
58 <li><dfn>a+</dfn> matches a sequence of one or more a's
59 <li><dfn>a?</dfn> matches an optional a
60 <li><dfn>\c</dfn> escape code for matching special characters such
61 as \, [, *, +, . etc.
62 <li><dfn>\t</dfn> matches the TAB character (9)
63 <li><dfn>\n</dfn> matches newline (10)
64 <li><dfn>\r</dfn> matches return (13)
65 <li><dfn>\s</dfn> matches a white space (defined as any character
66 for which QChar::isSpace() returns TRUE. This includes at least
67 ASCII characters 9 (TAB), 10 (LF), 11 (VT), 12(FF), 13 (CR) and 32
68 (Space)).
69 <li><dfn>\d</dfn> matches a digit (defined as any character for
70 which QChar::isDigit() returns TRUE. This includes at least ASCII
71 characters '0'-'9').
72 <li><dfn>\x1f6b</dfn> matches the character with unicode point U1f6b
73 (hexadecimal 1f6b). \x0012 will match the ASCII/Latin1 character
74 0x12 (18 decimal, 12 hexadecimal).
75 <li><dfn>\022</dfn> matches the ASCII/Latin1 character 022 (18
76 decimal, 22 octal).
77 </ul>
78
79 In wildcard mode, it only knows four primitives:
80 <ul plain>
81 <li><dfn>c</dfn> matches the character 'c'
82 <li><dfn>?</dfn> matches any character
83 <li><dfn>*</dfn> matches any sequence of characters
84 <li><dfn>[]</dfn> matches a defined set of characters - see below.
85 </ul>
86
87 QRegExp supports Unicode both in the pattern strings and in the
88 strings to be matched.
89
90 When writing regular expressions in C++ code, remember that C++
91 processes \ characters. So in order to match e.g. a "." character,
92 you must write "\\." in C++ source, not "\.".
93
94 A character set matches a defined set of characters. For example,
95 [BSD] matches any of 'B', 'D' and 'S'. Within a character set, the
96 special characters '.', '*', '?', '^', '$', '+' and '[' lose their
97 special meanings. The following special characters apply:
98 <ul plain>
99 <li><dfn>^</dfn> When placed first in the list, changes the
100 character set to match any character \e not in the list. To include
101 the character '^' itself in the set, escape it or place it anywhere
102 but first.
103 <li><dfn>-</dfn> Defines a range of characters. To include the
104 character '-' itself in the set, escape it or place it last.
105 <li><dfn>]</dfn> Ends the character set definition. To include the
106 character ']' itself in the set, escape it or place it first (but
107 after the negation operator '^', if present)
108 </ul>
109 Thus, [a-zA-Z0-9.] matches upper and lower case ASCII letters,
110 digits and dot; and [^\s] matches everything except white space.
111
112 \bug Case insensitive matching is not supported for non-ASCII/Latin1
113 (non-8bit) characters. Any character with a non-zero QChar.row() is
114 matched case sensitively even if the QRegExp is in case insensitive
115 mode.
116
117 \note In Qt 3.0, the language of regular expressions will contain
118 five more special characters, namely '(', ')', '{', '|' and '}'. To
119 ease porting, it's a good idea to escape these characters with a
120 backslash in all the regular expressions you'll write from now on.
121*/
122
123
//
// The regexp pattern is internally represented as an array of uints,
// each element containing a 16-bit character or a 32-bit code
// (listed below). User-defined character classes (e.g. [a-zA-Z])
// are encoded like this:
//   uint no:   1          2            3            ...
//   value:     CCL | n    from | to    from | to    ...
//
// where n is the (16-bit) number of following range definitions and
// from and to define the ranges inclusive. from <= to is always true,
// otherwise it is a built-in charclass (Pxx, eg \s - PWS). Single
// characters in the class are coded as from==to. Negated classes
// (e.g. [^a-z]) use CCN instead of CCL.
137
138const uint END= 0x00000000;
139const uint PWS= 0x10010000;// predef charclass: whitespace (\s)
140const uint PDG= 0x10020000;// predef charclass: digit (\d)
141const uint CCL= 0x20010000;// character class[]
142const uint CCN= 0x20020000;// neg character class[^]
143const uint CHR= 0x40000000;// character
144const uint BOL= 0x80010000;// beginning of line^
145const uint EOL= 0x80020000;// end of line$
146const uint BOW= 0x80030000;// beginning of word\<
147const uint EOW= 0x80040000;// end of word\>
148const uint ANY= 0x80050000;// any character.
149const uint CLO= 0x80070000;// Kleene closure*
150const uint OPT= 0x80080000;// Optional closure?
151
152const uint MCC = 0x20000000;// character class bitmask
153const uint MCD = 0xffff0000;// code mask
154const uint MVL = 0x0000ffff;// value mask
155
156//
157// QRegExp::error codes (internal)
158//
159
const int PatOk       = 0;      // pattern ok
const int PatNull     = 1;      // no pattern defined
const int PatSyntax   = 2;      // pattern syntax error
const int PatOverflow = 4;      // pattern too long
164
165
166/*****************************************************************************
167 QRegExp member functions
168 *****************************************************************************/
169
170/*!
171 Constructs an empty regular expression.
172*/
173
174QRegExp::QRegExp()
175{
176 rxdata = 0;
177 cs = TRUE;
178 wc = FALSE;
179 error = PatOk;
180}
181
182/*!
183 Constructs a regular expression.
184
185 \arg \e pattern is the regular expression pattern string.
186 \arg \e caseSensitive specifies whether or not to use case sensitive
187 matching.
188 \arg \e wildcard specifies whether the pattern string should be used for
189 wildcard matching (also called globbing expression), normally used for
190 matching file names.
191
192 \sa setWildcard()
193*/
194
195QRegExp::QRegExp( const QCString &pattern, bool caseSensitive, bool wildcard )
196{
197 rxstring = pattern;
198 rxdata = 0;
199 cs = caseSensitive;
200 wc = wildcard;
201 compile();
202}
203
204/*!
205 Constructs a regular expression which is a copy of \e r.
206 \sa operator=(const QRegExp&)
207*/
208
209QRegExp::QRegExp( const QRegExp &r )
210{
211 rxstring = r.pattern();
212 rxdata = 0;
213 cs = r.caseSensitive();
214 wc = r.wildcard();
215 compile();
216}
217
218/*!
219 Destructs the regular expression and cleans up its internal data.
220*/
221
222QRegExp::~QRegExp()
223{
224 if ( rxdata ) // Avoid purify complaints
225delete [] rxdata;
226}
227
228/*!
229 Copies the regexp \e r and returns a reference to this regexp.
230 The case sensitivity and wildcard options are copied, as well.
231*/
232
233QRegExp &QRegExp::operator=( const QRegExp &r )
234{
235 rxstring = r.rxstring;
236 cs = r.cs;
237 wc = r.wc;
238 compile();
239 return *this;
240}
241
242/*!
243 \obsolete
244 Consider using setPattern() instead of this method.
245
246 Sets the pattern string to \e pattern and returns a reference to this regexp.
247 The case sensitivity or wildcard options do not change.
248*/
249
250QRegExp &QRegExp::operator=( const QCString &pattern )
251{
252 rxstring = pattern;
253 compile();
254 return *this;
255}
256
257
258/*!
259 Returns TRUE if this regexp is equal to \e r.
260
261 Two regexp objects are equal if they have equal pattern strings,
262 case sensitivity options and wildcard options.
263*/
264
265bool QRegExp::operator==( const QRegExp &r ) const
266{
267 return rxstring == r.rxstring && cs == r.cs && wc == r.wc;
268}
269
270/*!
271 \fn bool QRegExp::operator!=( const QRegExp &r ) const
272
273 Returns TRUE if this regexp is \e not equal to \e r.
274
275 \sa operator==()
276*/
277
278/*!
279 \fn bool QRegExp::isEmpty() const
280 Returns TRUE if the regexp is empty.
281*/
282
283/*!
284 \fn bool QRegExp::isValid() const
285 Returns TRUE if the regexp is valid, or FALSE if it is invalid.
286
287 The pattern "[a-z" is an example of an invalid pattern, since it lacks a
288 closing bracket.
289*/
290
291
292/*!
293 \fn bool QRegExp::wildcard() const
294 Returns TRUE if wildcard mode is on, otherwise FALSE. \sa setWildcard().
295*/
296
297/*!
  Sets the wildcard option for the regular expression. The default
  is FALSE.
300
301 Setting \e wildcard to TRUE makes it convenient to match filenames
302 instead of plain text.
303
304 For example, "qr*.cpp" matches the string "qregexp.cpp" in wildcard mode,
305 but not "qicpp" (which would be matched in normal mode).
306
307 \sa wildcard()
308*/
309
310void QRegExp::setWildcard( bool wildcard )
311{
312 if ( wildcard != wc ) {
313wc = wildcard;
314compile();
315 }
316}
317
318/*!
319 \fn bool QRegExp::caseSensitive() const
320
321 Returns TRUE if case sensitivity is enabled, otherwise FALSE. The
322 default is TRUE.
323
324 \sa setCaseSensitive()
325*/
326
327/*!
328 Enables or disables case sensitive matching.
329
330 In case sensitive mode, "a.e" matches "axe" but not "Axe".
331
332 See also: caseSensitive()
333*/
334
335void QRegExp::setCaseSensitive( bool enable )
336{
337 if ( cs != enable ) {
338cs = enable;
339compile();
340 }
341}
342
343
344/*!
345 \fn QCString QRegExp::pattern() const
346 Returns the pattern string of the regexp.
347*/
348
349
350/*!
  \fn void QRegExp::setPattern(const QCString & pattern)
  Sets the pattern string to \a pattern.
  The case sensitivity or wildcard options do not change.
354*/
355
// Returns TRUE if x is a word character: an alphanumeric character (as
// classified by isalnum()) or an underscore. Only 8-bit support.
//
// Callers pass plain char values, which are negative for bytes >= 0x80
// where char is signed. isalnum() is only defined for values
// representable as unsigned char (or EOF), so the argument is reduced
// to its byte value before classification.
static inline bool iswordchar( int x )
{
    return isalnum( (unsigned char)x ) != 0 || x == '_';
}
360
361
362/*!
363 \internal
364 Match character class
365*/
366
367static bool matchcharclass( uint *rxd, char c )
368{
369 uint *d = rxd;
370 uint clcode = *d & MCD;
371 bool neg = clcode == CCN;
372 if ( clcode != CCL && clcode != CCN)
373qWarning("QRegExp: Internal error, please report to qt-bugs@trolltech.com");
374 uint numFields = *d & MVL;
375 uint cval = (uint)c; //(((uint)(c.row())) << 8) | ((uint)c.cell());
376 bool found = FALSE;
377 for ( int i = 0; i < (int)numFields; i++ ) {
378d++;
379if ( *d == PWS && isspace(c) ) {
380 found = TRUE;
381 break;
382}
383if ( *d == PDG && isdigit(c) ) {
384 found = TRUE;
385 break;
386}
387else {
388 uint from = ( *d & MCD ) >> 16;
389 uint to = *d & MVL;
390 if ( (cval >= from) && (cval <= to) ) {
391found = TRUE;
392break;
393 }
394}
395 }
396 return neg ? !found : found;
397}
398
399
400
401/*
402 Internal: Recursively match string.
403*/
404
405static int matchstring( uint *rxd, const char *str, uint strlength,
406const char *bol, bool cs )
407{
408 const char *p = str;
409 const char *start = p;
410 uint pl = strlength;
411 uint *d = rxd;
412
413 //### in all cases here: handle pl == 0! (don't read past strlen)
414 while ( *d ) {
415if ( *d & CHR ) {// match char
416 if ( !pl )
417return -1;
418 char c = *d;
419 if ( !cs /*&& !c.row()*/ ) {// case insensitive, #Only 8bit
420if ( tolower(*p) != c )
421 return -1;
422p++;
423pl--;
424 } else {// case insensitive
425if ( *p != c )
426 return -1;
427p++;
428pl--;
429 }
430 d++;
431}
432else if ( *d & MCC ) {// match char class
433 if ( !pl )
434return -1;
435 if ( !matchcharclass( d, *p ) )
436return -1;
437 p++;
438 pl--;
439 d += (*d & MVL) + 1;
440}
441else switch ( *d++ ) {
442 case PWS:// match whitespace
443if ( !pl || !isspace(*p) )
444 return -1;
445p++;
446pl--;
447break;
448 case PDG:// match digits
449if ( !pl || !isdigit(*p) )
450 return -1;
451p++;
452pl--;
453break;
454 case ANY:// match anything
455if ( !pl )
456 return -1;
457p++;
458pl--;
459break;
460 case BOL:// match beginning of line
461if ( p != bol )
462 return -1;
463break;
464 case EOL:// match end of line
465if ( pl )
466 return -1;
467break;
468 case BOW:// match beginning of word
469if ( !iswordchar(*p) || (p > bol && iswordchar(*(p-1)) ) )
470 return -1;
471break;
472 case EOW:// match end of word
473if ( iswordchar(*p) || p == bol || !iswordchar(*(p-1)) )
474 return -1;
475break;
476 case CLO:// Kleene closure
477{
478const char *first_p = p;
479if ( *d & CHR ) {// match char
480 char c = *d;
481 if ( !cs /*&& !c.row()*/ ) {// case insensitive, #only 8bit
482while ( pl /*&& !p->row()*/ && tolower(*p)==c ) {
483 p++;
484 pl--;
485}
486 }
487 else {// case sensitive
488while ( pl && *p == c ) {
489 p++;
490 pl--;
491}
492 }
493 d++;
494}
495else if ( *d & MCC ) {// match char class
496 while( pl && matchcharclass( d, *p ) ) {
497p++;
498pl--;
499 }
500 d += (*d & MVL) + 1;
501}
502else if ( *d == PWS ) {
503 while ( pl && isspace(*p) ) {
504p++;
505pl--;
506 }
507 d++;
508}
509else if ( *d == PDG ) {
510 while ( pl && isdigit(*p) ) {
511p++;
512pl--;
513 }
514 d++;
515}
516else if ( *d == ANY ) {
517 p += pl;
518 pl = 0;
519 d++;
520}
521else {
522 return -1;// error
523}
524d++;// skip CLO's END
525while ( p >= first_p ) {// go backwards
526 int end = matchstring( d, p, pl, bol, cs );
527 if ( end >= 0 )
528return ( p - start ) + end;
529 if ( !p )
530return -1;
531 --p;
532 ++pl;
533}
534}
535return -1;
536 case OPT:// optional closure
537{
538const char *first_p = p;
539if ( *d & CHR ) {// match char
540 char c = *d;
541 if ( !cs /*&& !c.row()*/ ) {// case insensitive, #only 8bit
542if ( pl && /*!p->row() &&*/ tolower(*p) == c ) {
543 p++;
544 pl--;
545}
546 }
547 else {// case sensitive
548if ( pl && *p == c ) {
549 p++;
550 pl--;
551}
552 }
553 d++;
554}
555else if ( *d & MCC ) {// match char class
556 if ( pl && matchcharclass( d, *p ) ) {
557p++;
558pl--;
559 }
560 d += (*d & MVL) + 1;
561}
562else if ( *d == PWS ) {
563 if ( pl && isspace(*p) ) {
564p++;
565pl--;
566 }
567 d++;
568}
569else if ( *d == PDG ) {
570 if ( pl && isdigit(*p) ) {
571p++;
572pl--;
573 }
574 d++;
575}
576else if ( *d == ANY ) {
577 if ( pl ) {
578p++;
579pl--;
580 }
581 d++;
582}
583else {
584 return -1;// error
585}
586d++;// skip OPT's END
587while ( p >= first_p ) {// go backwards
588 int end = matchstring( d, p, pl, bol, cs );
589 if ( end >= 0 )
590return ( p - start ) + end;
591 if ( !p )
592return -1;
593 --p;
594 ++pl;
595}
596}
597return -1;
598
599 default:// error
600return -1;
601}
602 }
603 return p - start;
604}
605
606
607/*!
608 \internal
609 Recursively match string.
610*/
611
612// This is obsolete now, but since it is protected (not private), it
613// is still implemented on the off-chance that somebody has made a
614// class derived from QRegExp and calls this directly.
615// Qt 3.0: Remove this?
616
#if 0
// Compiled out by the surrounding #if 0. Wraps matchstring() and turns
// its length result into a pointer just past the match, or 0 when there
// is no match.
const char *QRegExp::matchstr( uint *rxd, const QChar *str, uint strlength,
                               const QChar *bol ) const
{
    const int matched = matchstring( rxd, str, strlength, bol, cs );
    return matched < 0 ? 0 : str + matched;
}
#endif
627
628/*!
629 Attempts to match in \e str, starting from position \e index.
630 Returns the position of the match, or -1 if there was no match.
631
632 If \e len is not a null pointer, the length of the match is stored in
633 \e *len.
634
635 If \e indexIsStart is TRUE (the default), the position \e index in
636 the string will match the start-of-input primitive (^) in the
637 regexp, if present. Otherwise, position 0 in \e str will match.
638
639 Example:
640 \code
641 QRegExp r("[0-9]*\\.[0-9]+");// matches floating point
642 int len;
643 r.match("pi = 3.1416", 0, &len);// returns 5, len == 6
644 \endcode
645
646 \note In Qt 3.0, this function will be replaced by find().
647*/
648
649int QRegExp::match( const QCString &str, int index, int *len,
650 bool indexIsStart ) const
651{
652 if ( !isValid() || isEmpty() )
653return -1;
654 if ( str.length() < (uint)index )
655return -1;
656 const char *start = str.data();
657 const char *p = start + index;
658 uint pl = str.length() - index;
659 uint *d = rxdata;
660 int ep = -1;
661
662 if ( *d == BOL ) {// match from beginning of line
663ep = matchstring( d, p, pl, indexIsStart ? p : start, cs );
664 } else {
665if ( *d & CHR ) {
666 char c = *d;
667 if ( !cs /*&& !c.row()*/ ) {// case sensitive, # only 8bit
668while ( pl && ( /*p->row() ||*/ tolower(*p) != c ) ) {
669 p++;
670 pl--;
671}
672 } else {// case insensitive
673while ( pl && *p != c ) {
674 p++;
675 pl--;
676}
677 }
678}
679while( 1 ) {// regular match
680 ep = matchstring( d, p, pl, indexIsStart ? start+index : start, cs );
681 if ( ep >= 0 )
682break;
683 if ( !pl )
684break;
685 p++;
686 pl--;
687}
688 }
689 if ( len )
690*len = ep >= 0 ? ep : 0; // No match -> 0, for historical reasons
691 return ep >= 0 ? (int)(p - start) : -1;// return index;
692}
693
694/*! \fn int QRegExp::find( const QCString& str, int index )
695
696 Attempts to match in \e str, starting from position \e index.
697 Returns the position of the match, or -1 if there was no match.
698
699 \sa match()
700*/
701
702//
703// Translate wildcard pattern to standard regexp pattern.
704// Ex: *.cpp==> ^.*\.cpp$
705//
706
707static QCString wc2rx( const QCString &pattern )
708{
709 int patlen = (int)pattern.length();
710 QCString wcpattern("^");
711
712 char c;
713 for( int i = 0; i < patlen; i++ ) {
714c = pattern[i];
715switch ( (char)c ) {
716case '*':// '*' ==> '.*'
717 wcpattern += '.';
718 break;
719case '?':// '?' ==> '.'
720 c = '.';
721 break;
722case '.':// quote special regexp chars
723case '+':
724case '\\':
725case '$':
726case '^':
727 wcpattern += '\\';
728 break;
729case '[':
730 if ( (char)pattern[i+1] == '^' ) { // don't quote '^' after '['
731wcpattern += '[';
732c = pattern[i+1];
733i++;
734 }
735 break;
736}
737wcpattern += c;
738
739 }
740 wcpattern += '$';
741 return wcpattern;// return new regexp pattern
742}
743
744
745//
746// Internal: Get char value and increment pointer.
747//
748
749static uint char_val( const char **str, uint *strlength ) // get char value
750{
751 const char *p = *str;
752 uint pl = *strlength;
753 uint len = 1;
754 uint v = 0;
755 if ( (char)*p == '\\' ) {// escaped code
756p++;
757pl--;
758if ( !pl ) {// it is just a '\'
759 (*str)++;
760 (*strlength)--;
761 return '\\';
762}
763len++;// length at least 2
764int i;
765char c;
766char ch = tolower((char)*p);
767switch ( ch ) {
768 case 'b': v = '\b'; break;// bell
769 case 'f': v = '\f'; break;// form feed
770 case 'n': v = '\n'; break;// newline
771 case 'r': v = '\r'; break;// return
772 case 't': v = '\t'; break;// tab
773 case 's': v = PWS; break;// whitespace charclass
774 case 'd': v = PDG; break;// digit charclass
775 case '<': v = BOW; break;// word beginning matcher
776 case '>': v = EOW; break;// word ending matcher
777
778 case 'x': {// hex code
779p++;
780pl--;
781for ( i = 0; (i < 4) && pl; i++ ) {//up to 4 hex digits
782 c = tolower((char)*p);
783 bool a = ( c >= 'a' && c <= 'f' );
784 if ( (c >= '0' && c <= '9') || a ) {
785v <<= 4;
786v += a ? 10 + c - 'a' : c - '0';
787len++;
788 }
789 else {
790break;
791 }
792 p++;
793 pl--;
794}
795 }
796 break;
797
798 default: {
799if ( ch >= '0' && ch <= '7' ) {//octal code
800 len--;
801 for ( i = 0; (i < 3) && pl; i++ ) {// up to 3 oct digits
802c = (char)*p;
803if ( c >= '0' && c <= '7' ) {
804 v <<= 3;
805 v += c - '0';
806 len++;
807}
808else {
809 break;
810}
811p++;
812pl--;
813 }
814}
815else {// not an octal number
816 v = (uint)*p; //(((uint)(p->row())) << 8) | ((uint)p->cell());
817}
818 }
819}
820 } else {
821v = (uint)*p; //(((uint)(p->row())) << 8) | ((uint)p->cell());
822 }
823 *str += len;
824 *strlength -= len;
825 return v;
826}
827
828
#if defined(DEBUG)
// Debug helper: prints the compiled pattern starting at p with qDebug(),
// one element per line, until an END element is reached. The operand of
// a CLO or OPT is printed by a recursive call. Returns a pointer to the
// element following the END.
static uint *dump( uint *p )
{
    while ( *p != END ) {
        const uint code = *p;

        if ( code & CHR ) {                     // literal character
            uchar uc = (uchar)code;
            char c = (char)uc;
            uint u = (uint)uc;
            qDebug( "\tCHR\tU%04x (%c)", u, (c ? c : ' '));
            p++;
            continue;
        }

        if ( code & MCC ) {                     // character class
            uint clcode = code & MCD;
            uint numFields = code & MVL;
            if ( clcode == CCL )
                qDebug( "\tCCL\t%i", numFields );
            else if ( clcode == CCN )
                qDebug( "\tCCN\t%i", numFields );
            else
                qDebug("coding error!");
            const uint *field = p + 1;
            for ( int i = 0; i < (int)numFields; i++ ) {
                const uint f = field[i];
                if ( f == PWS ) {
                    qDebug( "\t\tPWS" );
                } else if ( f == PDG ) {
                    qDebug( "\t\tPDG" );
                } else {
                    uint from = ( f & MCD ) >> 16;
                    uint to = f & MVL;
                    char fc = (char)from;
                    char tc = (char)to;
                    qDebug( "\t\tU%04x (%c) - U%04x (%c)", from,
                            (fc ? fc : ' '), to, (tc ? tc : ' ') );
                }
            }
            p += numFields + 1;                 // header plus its fields
            continue;
        }

        p++;                                    // plain one-element code
        if ( code == PWS ) {
            qDebug( "\tPWS" );
        } else if ( code == PDG ) {
            qDebug( "\tPDG" );
        } else if ( code == BOL ) {
            qDebug( "\tBOL" );
        } else if ( code == EOL ) {
            qDebug( "\tEOL" );
        } else if ( code == BOW ) {
            qDebug( "\tBOW" );
        } else if ( code == EOW ) {
            qDebug( "\tEOW" );
        } else if ( code == ANY ) {
            qDebug( "\tANY" );
        } else if ( code == CLO ) {
            qDebug( "\tCLO" );
            p = dump( p );
        } else if ( code == OPT ) {
            qDebug( "\tOPT" );
            p = dump( p );
        }
    }
    qDebug( "\tEND" );
    return p+1;
}
#endif // DEBUG
902
903
904static const int maxlen = 1024;// max length of regexp array
905static uint rxarray[ maxlen ];// tmp regexp array
906
907/*!
908 \internal
909 Compiles the regular expression and stores the result in rxdata.
910 The 'error' flag is set to non-zero if an error is detected.
911 NOTE! This function is not reentrant!
912*/
913
914void QRegExp::compile()
915{
916 if ( rxdata ) {// delete old data
917delete [] rxdata;
918rxdata = 0;
919 }
920 if ( rxstring.isEmpty() ) {// no regexp pattern set
921error = PatNull;
922return;
923 }
924
925 error = PatOk;// assume pattern is ok
926
927 QCString pattern;
928 if ( wc )
929pattern = wc2rx(rxstring);
930 else
931pattern = rxstring;
932 const char *start = pattern.data(); // pattern pointer
933 const char *p = start;// pattern pointer
934 uint pl = pattern.length();
935 uint *d = rxarray;// data pointer
936 uint *prev_d = 0;
937
938#define GEN(x)*d++ = (x)
939
940 while ( pl ) {
941char ch = (char)*p;
942switch ( ch ) {
943
944 case '^':// beginning of line
945prev_d = d;
946GEN( p == start ? BOL : (CHR | ch) );
947p++;
948pl--;
949break;
950
951 case '$':// end of line
952prev_d = d;
953GEN( pl == 1 ? EOL : (CHR | ch) );
954p++;
955pl--;
956break;
957
958 case '.':// any char
959prev_d = d;
960GEN( ANY );
961p++;
962pl--;
963break;
964
965 case '[':// character class
966{
967prev_d = d;
968p++;
969pl--;
970if ( !pl ) {
971 error = PatSyntax;
972 return;
973}
974bool firstIsEscaped = ( (char)*p == '\\' );
975uint cch = char_val( &p, &pl );
976if ( cch == '^' && !firstIsEscaped ) {// negate!
977 GEN( CCN );
978 if ( !pl ) {
979error = PatSyntax;
980return;
981 }
982 cch = char_val( &p, &pl );
983} else {
984 GEN( CCL );
985}
986uint numFields = 0;
987while ( pl ) {
988 if ((pl>2) && ((char)*p == '-') && ((char)*(p+1) != ']')) {
989// Found a range
990 char_val( &p, &pl ); // Read the '-'
991uint cch2 = char_val( &p, &pl ); // Read the range end
992if ( cch > cch2 ) { // swap start and stop
993 int tmp = cch;
994 cch = cch2;
995 cch2 = tmp;
996}
997GEN( (cch << 16) | cch2 );// from < to
998numFields++;
999 }
1000 else {
1001// Found a single character
1002if ( cch & MCD ) // It's a code; will not be mistaken
1003 GEN( cch ); // for a range, since from > to
1004else
1005 GEN( (cch << 16) | cch ); // from == to range
1006numFields++;
1007 }
1008 if ( d >= rxarray + maxlen ) {// pattern too long
1009error = PatOverflow;
1010return;
1011 }
1012 if ( !pl ) {// At least ']' should be left
1013error = PatSyntax;
1014return;
1015 }
1016 bool nextIsEscaped = ( (char)*p == '\\' );
1017 cch = char_val( &p, &pl );
1018 if ( cch == (uint)']' && !nextIsEscaped )
1019break;
1020 if ( !pl ) {// End, should have seen ']'
1021error = PatSyntax;
1022return;
1023 }
1024}
1025*prev_d |= numFields;// Store number of fields
1026}
1027break;
1028
1029 case '*':// Kleene closure, or
1030 case '+':// positive closure, or
1031 case '?':// optional closure
1032{
1033if ( prev_d == 0 ) {// no previous expression
1034 error = PatSyntax;// empty closure
1035 return;
1036}
1037switch ( *prev_d ) {// test if invalid closure
1038 case BOL:
1039 case BOW:
1040 case EOW:
1041 case CLO:
1042 case OPT:
1043error = PatSyntax;
1044return;
1045}
1046int ddiff = d - prev_d;
1047if ( *p == '+' ) {// convert to Kleene closure
1048 if ( d + ddiff >= rxarray + maxlen ) {
1049error = PatOverflow;// pattern too long
1050return;
1051 }
1052 memcpy( d, prev_d, ddiff*sizeof(uint) );
1053 d += ddiff;
1054 prev_d += ddiff;
1055}
1056memmove( prev_d+1, prev_d, ddiff*sizeof(uint) );
1057*prev_d = ch == '?' ? OPT : CLO;
1058d++;
1059GEN( END );
1060p++;
1061pl--;
1062}
1063break;
1064
1065 default:
1066{
1067prev_d = d;
1068uint cv = char_val( &p, &pl );
1069if ( cv & MCD ) {// It's a code
1070 GEN( cv );
1071}
1072else {
1073 if ( !cs && cv <= 0xff )// #only 8bit support
1074cv = tolower( cv );
1075 GEN( CHR | cv );
1076}
1077}
1078}
1079if ( d >= rxarray + maxlen ) {// oops!
1080 error = PatOverflow;// pattern too long
1081 return;
1082}
1083 }
1084 GEN( END );
1085 int len = d - rxarray;
1086 rxdata = new uint[ len ];// copy from rxarray to rxdata
1087 CHECK_PTR( rxdata );
1088 memcpy( rxdata, rxarray, len*sizeof(uint) );
1089#if defined(DEBUG)
1090 //dump( rxdata );// uncomment this line for debugging
1091#endif
1092}
1093

Archive Download this file

Revision: 1322