branches/xZenu/src/util/doxygen/qtools/qregexp.cpp - Chameleon Svn Source Tree - Chameleon open source boot loader project.

Root/branches/xZenu/src/util/doxygen/qtools/qregexp.cpp

Source at commit 1322 created 12 years 11 months ago. By meklort, Add doxygen to utils folder
1	/****************************************************************************␊
2	** ␊
3	**␊
4	** Implementation of QRegExp class␊
5	**␊
6	** Created : 950126␊
7	**␊
8	** Copyright (C) 1992-2000 Trolltech AS. All rights reserved.␊
9	**␊
10	** This file is part of the tools module of the Qt GUI Toolkit.␊
11	**␊
12	** This file may be distributed under the terms of the Q Public License␊
13	** as defined by Trolltech AS of Norway and appearing in the file␊
14	** LICENSE.QPL included in the packaging of this file.␊
15	**␊
16	** This file may be distributed and/or modified under the terms of the␊
17	** GNU General Public License version 2 as published by the Free Software␊
18	** Foundation and appearing in the file LICENSE.GPL included in the␊
19	** packaging of this file.␊
20	**␊
21	** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition␊
22	** licenses may use this file in accordance with the Qt Commercial License␊
23	** Agreement provided with the Software.␊
24	**␊
25	** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE␊
26	** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.␊
27	**␊
28	** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for␊
29	** information about Qt Commercial License Agreements.␊
30	** See http://www.trolltech.com/qpl/ for QPL licensing information.␊
31	** See http://www.trolltech.com/gpl/ for GPL licensing information.␊
32	**␊
33	** Contact info@trolltech.com if any conditions of this licensing are␊
34	** not clear to you.␊
35	**␊
36	**********************************************************************/␊
37	␊
38	#include "qregexp.h"␊
39	#include <ctype.h>␊
40	#include <stdlib.h>␊
41	␊
42	// NOT REVISED␊
43	/*!␊
44	\class QRegExp qregexp.h␊
45	\ingroup tools␊
46	\ingroup misc␊
47	\brief The QRegExp class provides pattern matching using regular␊
48	expressions or wildcards.␊
49	␊
50	QRegExp knows these regexp primitives:␊
51	<ul plain>␊
52	<li><dfn>c</dfn> matches the character 'c'␊
53	<li><dfn>.</dfn> matches any character␊
54	<li><dfn>^</dfn> matches start of input␊
55	<li><dfn>$</dfn> matches end of input␊
56	<li><dfn>[]</dfn> matches a defined set of characters - see below.␊
57	<li><dfn>a*</dfn> matches a sequence of zero or more a's␊
58	<li><dfn>a+</dfn> matches a sequence of one or more a's␊
59	<li><dfn>a?</dfn> matches an optional a␊
60	<li><dfn>\c</dfn> escape code for matching special characters such␊
61	as \, [, *, +, . etc.␊
62	<li><dfn>\t</dfn> matches the TAB character (9)␊
63	<li><dfn>\n</dfn> matches newline (10)␊
64	<li><dfn>\r</dfn> matches return (13)␊
65	<li><dfn>\s</dfn> matches a white space (defined as any character␊
66	for which QChar::isSpace() returns TRUE. This includes at least␊
67	ASCII characters 9 (TAB), 10 (LF), 11 (VT), 12(FF), 13 (CR) and 32␊
68	(Space)).␊
69	<li><dfn>\d</dfn> matches a digit (defined as any character for␊
70	which QChar::isDigit() returns TRUE. This includes at least ASCII␊
71	characters '0'-'9').␊
72	<li><dfn>\x1f6b</dfn> matches the character with unicode point U1f6b␊
73	(hexadecimal 1f6b). \x0012 will match the ASCII/Latin1 character␊
74	0x12 (18 decimal, 12 hexadecimal).␊
75	<li><dfn>\022</dfn> matches the ASCII/Latin1 character 022 (18␊
76	decimal, 22 octal).␊
77	</ul>␊
78	␊
79	In wildcard mode, it only knows four primitives:␊
80	<ul plain>␊
81	<li><dfn>c</dfn> matches the character 'c'␊
82	<li><dfn>?</dfn> matches any character␊
83	<li><dfn>*</dfn> matches any sequence of characters␊
84	<li><dfn>[]</dfn> matches a defined set of characters - see below.␊
85	</ul>␊
86	␊
87	QRegExp supports Unicode both in the pattern strings and in the␊
88	strings to be matched.␊
89	␊
90	When writing regular expressions in C++ code, remember that C++␊
91	processes \ characters. So in order to match e.g. a "." character,␊
92	you must write "\\." in C++ source, not "\.".␊
93	␊
94	A character set matches a defined set of characters. For example,␊
95	[BSD] matches any of 'B', 'D' and 'S'. Within a character set, the␊
96	special characters '.', '*', '?', '^', '$', '+' and '[' lose their␊
97	special meanings. The following special characters apply:␊
98	<ul plain>␊
99	<li><dfn>^</dfn> When placed first in the list, changes the␊
100	character set to match any character \e not in the list. To include␊
101	the character '^' itself in the set, escape it or place it anywhere␊
102	but first.␊
103	<li><dfn>-</dfn> Defines a range of characters. To include the␊
104	character '-' itself in the set, escape it or place it last.␊
105	<li><dfn>]</dfn> Ends the character set definition. To include the␊
106	character ']' itself in the set, escape it or place it first (but␊
107	after the negation operator '^', if present)␊
108	</ul>␊
109	Thus, [a-zA-Z0-9.] matches upper and lower case ASCII letters,␊
110	digits and dot; and [^\s] matches everything except white space.␊
111	␊
112	\bug Case insensitive matching is not supported for non-ASCII/Latin1␊
113	(non-8bit) characters. Any character with a non-zero QChar.row() is␊
114	matched case sensitively even if the QRegExp is in case insensitive␊
115	mode.␊
116	␊
117	\note In Qt 3.0, the language of regular expressions will contain␊
118	five more special characters, namely '(', ')', '{', '|' and '}'. To␊
119	ease porting, it's a good idea to escape these characters with a␊
120	backslash in all the regular expressions you'll write from now on.␊
121	*/␊
122	␊
123	␊
124	//␊
125	// The regexp pattern is internally represented as an array of uints,␊
126	// each element containing an 16-bit character or a 32-bit code␊
127	// (listed below). User-defined character classes (e.g. [a-zA-Z])␊
128	// are encoded as this:␊
129	// uint no:␉1␉␉2␉␉3␉␉...␊
130	// value:␉CCL | n␉␉from | to␉from | to␊
131	//␊
132	// where n is the (16-bit) number of following range definitions and␊
133	// from and to define the ranges inclusive. from <= to is always true,␊
134	// otherwise it is a built-in charclass (Pxx, eg \s - PWS). Single␊
135	// characters in the class are coded as from==to. Negated classes␊
136	// (e.g. [^a-z]) use CCN instead of CCL.␊
137	␊
138	const uint END␉= 0x00000000;␊
139	const uint PWS␉= 0x10010000;␉␉// predef charclass: whitespace (\s)␊
140	const uint PDG␉= 0x10020000;␉␉// predef charclass: digit (\d)␊
141	const uint CCL␉= 0x20010000;␉␉// character class␉[]␊
142	const uint CCN␉= 0x20020000;␉␉// neg character class␉[^]␊
143	const uint CHR␉= 0x40000000;␉␉// character␊
144	const uint BOL␉= 0x80010000;␉␉// beginning of line␉^␊
145	const uint EOL␉= 0x80020000;␉␉// end of line␉␉$␊
146	const uint BOW␉= 0x80030000;␉␉// beginning of word␉\<␊
147	const uint EOW␉= 0x80040000;␉␉// end of word␉␉\>␊
148	const uint ANY␉= 0x80050000;␉␉// any character␉.␊
149	const uint CLO␉= 0x80070000;␉␉// Kleene closure␉*␊
150	const uint OPT␉= 0x80080000;␉␉// Optional closure␉?␊
151	␊
152	const uint MCC = 0x20000000;␉␉// character class bitmask␊
153	const uint MCD = 0xffff0000;␉␉// code mask␊
154	const uint MVL = 0x0000ffff;␉␉// value mask␊
155	␊
156	//␊
157	// QRegExp::error codes (internal)␊
158	//␊
159	␊
// Values stored in QRegExp::error by compile().
const int PatOk       = 0;      // pattern compiled without problems
const int PatNull     = 1;      // the pattern string is empty
const int PatSyntax   = 2;      // the pattern is malformed
const int PatOverflow = 4;      // the compiled form does not fit the scratch array
164	␊
165	␊
166	/*****************************************************************************␊
167	QRegExp member functions␊
168	*****************************************************************************/␊
169	␊
170	/*!␊
171	Constructs an empty regular expression.␊
172	*/␊
173	␊
174	QRegExp::QRegExp()␊
175	{␊
176	rxdata = 0;␊
177	cs = TRUE;␊
178	wc = FALSE;␊
179	error = PatOk;␊
180	}␊
181	␊
182	/*!␊
183	Constructs a regular expression.␊
184	␊
185	\arg \e pattern is the regular expression pattern string.␊
186	\arg \e caseSensitive specifies whether or not to use case sensitive␊
187	matching.␊
188	\arg \e wildcard specifies whether the pattern string should be used for␊
189	wildcard matching (also called globbing expression), normally used for␊
190	matching file names.␊
191	␊
192	\sa setWildcard()␊
193	*/␊
194	␊
195	QRegExp::QRegExp( const QCString &pattern, bool caseSensitive, bool wildcard )␊
196	{␊
197	rxstring = pattern;␊
198	rxdata = 0;␊
199	cs = caseSensitive;␊
200	wc = wildcard;␊
201	compile();␊
202	}␊
203	␊
204	/*!␊
205	Constructs a regular expression which is a copy of \e r.␊
206	\sa operator=(const QRegExp&)␊
207	*/␊
208	␊
209	QRegExp::QRegExp( const QRegExp &r )␊
210	{␊
211	rxstring = r.pattern();␊
212	rxdata = 0;␊
213	cs = r.caseSensitive();␊
214	wc = r.wildcard();␊
215	compile();␊
216	}␊
217	␊
218	/*!␊
219	Destructs the regular expression and cleans up its internal data.␊
220	*/␊
221	␊
222	QRegExp::~QRegExp()␊
223	{␊
224	if ( rxdata ) // Avoid purify complaints␊
225	␉delete [] rxdata;␊
226	}␊
227	␊
228	/*!␊
229	Copies the regexp \e r and returns a reference to this regexp.␊
230	The case sensitivity and wildcard options are copied, as well.␊
231	*/␊
232	␊
233	QRegExp &QRegExp::operator=( const QRegExp &r )␊
234	{␊
235	rxstring = r.rxstring;␊
236	cs = r.cs;␊
237	wc = r.wc;␊
238	compile();␊
239	return *this;␊
240	}␊
241	␊
242	/*!␊
243	\obsolete␊
244	Consider using setPattern() instead of this method.␊
245	␊
246	Sets the pattern string to \e pattern and returns a reference to this regexp.␊
247	The case sensitivity or wildcard options do not change.␊
248	*/␊
249	␊
250	QRegExp &QRegExp::operator=( const QCString &pattern )␊
251	{␊
252	rxstring = pattern;␊
253	compile();␊
254	return *this;␊
255	}␊
256	␊
257	␊
258	/*!␊
259	Returns TRUE if this regexp is equal to \e r.␊
260	␊
261	Two regexp objects are equal if they have equal pattern strings,␊
262	case sensitivity options and wildcard options.␊
263	*/␊
264	␊
265	bool QRegExp::operator==( const QRegExp &r ) const␊
266	{␊
267	return rxstring == r.rxstring && cs == r.cs && wc == r.wc;␊
268	}␊
269	␊
270	/*!␊
271	\fn bool QRegExp::operator!=( const QRegExp &r ) const␊
272	␊
273	Returns TRUE if this regexp is \e not equal to \e r.␊
274	␊
275	\sa operator==()␊
276	*/␊
277	␊
278	/*!␊
279	\fn bool QRegExp::isEmpty() const␊
280	Returns TRUE if the regexp is empty.␊
281	*/␊
282	␊
283	/*!␊
284	\fn bool QRegExp::isValid() const␊
285	Returns TRUE if the regexp is valid, or FALSE if it is invalid.␊
286	␊
287	The pattern "[a-z" is an example of an invalid pattern, since it lacks a␊
288	closing bracket.␊
289	*/␊
290	␊
291	␊
292	/*!␊
293	\fn bool QRegExp::wildcard() const␊
294	Returns TRUE if wildcard mode is on, otherwise FALSE. \sa setWildcard().␊
295	*/␊
296	␊
297	/*!␊
298	Sets the wildcard option for the regular expression.␉The default␊
299	is FALSE.␊
300	␊
301	Setting \e wildcard to TRUE makes it convenient to match filenames␊
302	instead of plain text.␊
303	␊
304	For example, "qr*.cpp" matches the string "qregexp.cpp" in wildcard mode,␊
305	but not "qicpp" (which would be matched in normal mode).␊
306	␊
307	\sa wildcard()␊
308	*/␊
309	␊
310	void QRegExp::setWildcard( bool wildcard )␊
311	{␊
312	if ( wildcard != wc ) {␊
313	␉wc = wildcard;␊
314	␉compile();␊
315	}␊
316	}␊
317	␊
318	/*!␊
319	\fn bool QRegExp::caseSensitive() const␊
320	␊
321	Returns TRUE if case sensitivity is enabled, otherwise FALSE.␉ The␊
322	default is TRUE.␊
323	␊
324	\sa setCaseSensitive()␊
325	*/␊
326	␊
327	/*!␊
328	Enables or disables case sensitive matching.␊
329	␊
330	In case sensitive mode, "a.e" matches "axe" but not "Axe".␊
331	␊
332	See also: caseSensitive()␊
333	*/␊
334	␊
335	void QRegExp::setCaseSensitive( bool enable )␊
336	{␊
337	if ( cs != enable ) {␊
338	␉cs = enable;␊
339	␉compile();␊
340	}␊
341	}␊
342	␊
343	␊
344	/*!␊
345	\fn QCString QRegExp::pattern() const␊
346	Returns the pattern string of the regexp.␊
347	*/␊
348	␊
349	␊
350	/*!␊
351	\fn void QRegExp::setPattern(const QCString & pattern)␊
352	Sets the pattern string to \a pattern and returns a reference to this regexp.␊
353	The case sensitivity or wildcard options do not change.␊
354	*/␊
355	␊
// Returns true when x is a "word" character: alphanumeric according to
// isalnum(), or an underscore. Only 8-bit characters are supported.
//
// The value is reduced to unsigned char before it reaches isalnum() because
// callers pass plain char values, which may be negative, and the <ctype.h>
// functions are only defined for unsigned char values and EOF.
static inline bool iswordchar( int x )
{
    return isalnum( (unsigned char)x ) || x == '_';
}
360	␊
361	␊
362	/*!␊
363	\internal␊
364	Match character class␊
365	*/␊
366	␊
367	static bool matchcharclass( uint *rxd, char c )␊
368	{␊
369	uint *d = rxd;␊
370	uint clcode = *d & MCD;␊
371	bool neg = clcode == CCN;␊
372	if ( clcode != CCL && clcode != CCN)␊
373	␉qWarning("QRegExp: Internal error, please report to qt-bugs@trolltech.com");␊
374	uint numFields = *d & MVL;␊
375	uint cval = (uint)c; //(((uint)(c.row())) << 8) \| ((uint)c.cell());␊
376	bool found = FALSE;␊
377	for ( int i = 0; i < (int)numFields; i++ ) {␊
378	␉d++;␊
379	␉if ( *d == PWS && isspace(c) ) {␊
380	␉ found = TRUE;␊
381	␉ break;␊
382	␉}␊
383	␉if ( *d == PDG && isdigit(c) ) {␊
384	␉ found = TRUE;␊
385	␉ break;␊
386	␉}␊
387	␉else {␊
388	␉ uint from = ( *d & MCD ) >> 16;␊
389	␉ uint to = *d & MVL;␊
390	␉ if ( (cval >= from) && (cval <= to) ) {␊
391	␉␉found = TRUE;␊
392	␉␉break;␊
393	␉ }␊
394	␉}␊
395	}␊
396	return neg ? !found : found;␊
397	}␊
398	␊
399	␊
400	␊
401	/*␊
402	Internal: Recursively match string.␊
403	*/␊
404	␊
405	static int matchstring( uint rxd, const char str, uint strlength,␊
406	␉␉␉const char *bol, bool cs )␊
407	{␊
408	const char *p = str;␊
409	const char *start = p;␊
410	uint pl = strlength;␊
411	uint *d = rxd;␊
412	␊
413	//### in all cases here: handle pl == 0! (don't read past strlen)␊
414	while ( *d ) {␊
415	␉if ( *d & CHR ) {␉␉␉// match char␊
416	␉ if ( !pl )␊
417	␉␉return -1;␊
418	␉ char c = *d;␊
419	␉ if ( !cs /&& !c.row()/ ) {␉␉// case insensitive, #Only 8bit␊
420	␉␉if ( tolower(*p) != c )␊
421	␉␉ return -1;␊
422	␉␉p++;␊
423	␉␉pl--;␊
424	␉ } else {␉␉␉␉// case insensitive␊
425	␉␉if ( *p != c )␊
426	␉␉ return -1;␊
427	␉␉p++;␊
428	␉␉pl--;␊
429	␉ }␊
430	␉ d++;␊
431	␉}␊
432	␉else if ( *d & MCC ) {␉␉␉// match char class␊
433	␉ if ( !pl )␊
434	␉␉return -1;␊
435	␉ if ( !matchcharclass( d, *p ) )␊
436	␉␉return -1;␊
437	␉ p++;␊
438	␉ pl--;␊
439	␉ d += (*d & MVL) + 1;␊
440	␉}␊
441	␉else switch ( *d++ ) {␊
442	␉ case PWS:␉␉␉␉// match whitespace␊
443	␉␉if ( !pl \|\| !isspace(*p) )␊
444	␉␉ return -1;␊
445	␉␉p++;␊
446	␉␉pl--;␊
447	␉␉break;␊
448	␉ case PDG:␉␉␉␉// match digits␊
449	␉␉if ( !pl \|\| !isdigit(*p) )␊
450	␉␉ return -1;␊
451	␉␉p++;␊
452	␉␉pl--;␊
453	␉␉break;␊
454	␉ case ANY:␉␉␉␉// match anything␊
455	␉␉if ( !pl )␊
456	␉␉ return -1;␊
457	␉␉p++;␊
458	␉␉pl--;␊
459	␉␉break;␊
460	␉ case BOL:␉␉␉␉// match beginning of line␊
461	␉␉if ( p != bol )␊
462	␉␉ return -1;␊
463	␉␉break;␊
464	␉ case EOL:␉␉␉␉// match end of line␊
465	␉␉if ( pl )␊
466	␉␉ return -1;␊
467	␉␉break;␊
468	␉ case BOW:␉␉␉␉// match beginning of word␊
469	␉␉if ( !iswordchar(p) \|\| (p > bol && iswordchar((p-1)) ) )␊
470	␉␉ return -1;␊
471	␉␉break;␊
472	␉ case EOW:␉␉␉␉// match end of word␊
473	␉␉if ( iswordchar(p) \|\| p == bol \|\| !iswordchar((p-1)) )␊
474	␉␉ return -1;␊
475	␉␉break;␊
476	␉ case CLO:␉␉␉␉// Kleene closure␊
477	␉␉{␊
478	␉␉const char *first_p = p;␊
479	␉␉if ( *d & CHR ) {␉␉// match char␊
480	␉␉ char c = *d;␊
481	␉␉ if ( !cs /&& !c.row()/ ) {␉// case insensitive, #only 8bit␊
482	␉␉␉while ( pl /&& !p->row()/ && tolower(*p)==c ) {␊
483	␉␉␉ p++;␊
484	␉␉␉ pl--;␊
485	␉␉␉}␊
486	␉␉ }␊
487	␉␉ else {␉␉␉// case sensitive␊
488	␉␉␉while ( pl && *p == c ) {␊
489	␉␉␉ p++;␊
490	␉␉␉ pl--;␊
491	␉␉␉}␊
492	␉␉ }␊
493	␉␉ d++;␊
494	␉␉}␊
495	␉␉else if ( *d & MCC ) {␉␉␉// match char class␊
496	␉␉ while( pl && matchcharclass( d, *p ) ) {␊
497	␉␉␉p++;␊
498	␉␉␉pl--;␊
499	␉␉ }␊
500	␉␉ d += (*d & MVL) + 1;␊
501	␉␉}␊
502	␉␉else if ( *d == PWS ) {␊
503	␉␉ while ( pl && isspace(*p) ) {␊
504	␉␉␉p++;␊
505	␉␉␉pl--;␊
506	␉␉ }␊
507	␉␉ d++;␊
508	␉␉}␊
509	␉␉else if ( *d == PDG ) {␊
510	␉␉ while ( pl && isdigit(*p) ) {␊
511	␉␉␉p++;␊
512	␉␉␉pl--;␊
513	␉␉ }␊
514	␉␉ d++;␊
515	␉␉}␊
516	␉␉else if ( *d == ANY ) {␊
517	␉␉ p += pl;␊
518	␉␉ pl = 0;␊
519	␉␉ d++;␊
520	␉␉}␊
521	␉␉else {␊
522	␉␉ return -1;␉␉␉// error␊
523	␉␉}␊
524	␉␉d++;␉␉␉␉// skip CLO's END␊
525	␉␉while ( p >= first_p ) {␉// go backwards␊
526	␉␉ int end = matchstring( d, p, pl, bol, cs );␊
527	␉␉ if ( end >= 0 )␊
528	␉␉␉return ( p - start ) + end;␊
529	␉␉ if ( !p )␊
530	␉␉␉return -1;␊
531	␉␉ --p;␊
532	␉␉ ++pl;␊
533	␉␉}␊
534	␉␉}␊
535	␉␉return -1;␊
536	␉ case OPT:␉␉␉␉// optional closure␊
537	␉␉{␊
538	␉␉const char *first_p = p;␊
539	␉␉if ( *d & CHR ) {␉␉// match char␊
540	␉␉ char c = *d;␊
541	␉␉ if ( !cs /&& !c.row()/ ) {␉// case insensitive, #only 8bit␊
542	␉␉␉if ( pl && /!p->row() &&/ tolower(*p) == c ) {␊
543	␉␉␉ p++;␊
544	␉␉␉ pl--;␊
545	␉␉␉}␊
546	␉␉ }␊
547	␉␉ else {␉␉␉// case sensitive␊
548	␉␉␉if ( pl && *p == c ) {␊
549	␉␉␉ p++;␊
550	␉␉␉ pl--;␊
551	␉␉␉}␊
552	␉␉ }␊
553	␉␉ d++;␊
554	␉␉}␊
555	␉␉else if ( *d & MCC ) {␉␉␉// match char class␊
556	␉␉ if ( pl && matchcharclass( d, *p ) ) {␊
557	␉␉␉p++;␊
558	␉␉␉pl--;␊
559	␉␉ }␊
560	␉␉ d += (*d & MVL) + 1;␊
561	␉␉}␊
562	␉␉else if ( *d == PWS ) {␊
563	␉␉ if ( pl && isspace(*p) ) {␊
564	␉␉␉p++;␊
565	␉␉␉pl--;␊
566	␉␉ }␊
567	␉␉ d++;␊
568	␉␉}␊
569	␉␉else if ( *d == PDG ) {␊
570	␉␉ if ( pl && isdigit(*p) ) {␊
571	␉␉␉p++;␊
572	␉␉␉pl--;␊
573	␉␉ }␊
574	␉␉ d++;␊
575	␉␉}␊
576	␉␉else if ( *d == ANY ) {␊
577	␉␉ if ( pl ) {␊
578	␉␉␉p++;␊
579	␉␉␉pl--;␊
580	␉␉ }␊
581	␉␉ d++;␊
582	␉␉}␊
583	␉␉else {␊
584	␉␉ return -1;␉␉␉// error␊
585	␉␉}␊
586	␉␉d++;␉␉␉␉// skip OPT's END␊
587	␉␉while ( p >= first_p ) {␉// go backwards␊
588	␉␉ int end = matchstring( d, p, pl, bol, cs );␊
589	␉␉ if ( end >= 0 )␊
590	␉␉␉return ( p - start ) + end;␊
591	␉␉ if ( !p )␊
592	␉␉␉return -1;␊
593	␉␉ --p;␊
594	␉␉ ++pl;␊
595	␉␉}␊
596	␉␉}␊
597	␉␉return -1;␊
598	␊
599	␉ default:␉␉␉␉// error␊
600	␉␉return -1;␊
601	␉}␊
602	}␊
603	return p - start;␊
604	}␊
605	␊
606	␊
607	/*!␊
608	\internal␊
609	Recursively match string.␊
610	*/␊
611	␊
612	// This is obsolete now, but since it is protected (not private), it␊
613	// is still implemented on the off-chance that somebody has made a␊
614	// class derived from QRegExp and calls this directly.␊
615	// Qt 3.0: Remove this?␊
616	␊
#if 0
// Disabled: kept for reference only. The parameters are still QChar
// pointers while matchstring() works on const char pointers.
const char *QRegExp::matchstr( uint *rxd, const QChar *str, uint strlength,
                               const QChar *bol ) const
{
    int len = matchstring( rxd, str, strlength, bol, cs );
    if ( len < 0 )
        return 0;
    return str + len;
}
#endif
627	␊
628	/*!␊
629	Attempts to match in \e str, starting from position \e index.␊
630	Returns the position of the match, or -1 if there was no match.␊
631	␊
632	If \e len is not a null pointer, the length of the match is stored in␊
633	\e *len.␊
634	␊
635	If \e indexIsStart is TRUE (the default), the position \e index in␊
636	the string will match the start-of-input primitive (^) in the␊
637	regexp, if present. Otherwise, position 0 in \e str will match.␊
638	␊
639	Example:␊
640	\code␊
641	QRegExp r("[0-9]*\\.[0-9]+");␉␉// matches floating point␊
642	int len;␊
643	r.match("pi = 3.1416", 0, &len);␉␉// returns 5, len == 6␊
644	\endcode␊
645	␊
646	\note In Qt 3.0, this function will be replaced by find().␊
647	*/␊
648	␊
649	int QRegExp::match( const QCString &str, int index, int *len,␊
650	␉␉ bool indexIsStart ) const␊
651	{␊
652	if ( !isValid() \|\| isEmpty() )␊
653	␉return -1;␊
654	if ( str.length() < (uint)index )␊
655	␉return -1;␊
656	const char *start = str.data();␊
657	const char *p = start + index;␊
658	uint pl = str.length() - index;␊
659	uint *d = rxdata;␊
660	int ep = -1;␊
661	␊
662	if ( *d == BOL ) {␉␉␉␉// match from beginning of line␊
663	␉ep = matchstring( d, p, pl, indexIsStart ? p : start, cs );␊
664	} else {␊
665	␉if ( *d & CHR ) {␊
666	␉ char c = *d;␊
667	␉ if ( !cs /&& !c.row()/ ) {␉␉// case sensitive, # only 8bit␊
668	␉␉while ( pl && ( /p->row() \|\|/ tolower(*p) != c ) ) {␊
669	␉␉ p++;␊
670	␉␉ pl--;␊
671	␉␉}␊
672	␉ } else {␉␉␉␉// case insensitive␊
673	␉␉while ( pl && *p != c ) {␊
674	␉␉ p++;␊
675	␉␉ pl--;␊
676	␉␉}␊
677	␉ }␊
678	␉}␊
679	␉while( 1 ) {␉␉␉␉// regular match␊
680	␉ ep = matchstring( d, p, pl, indexIsStart ? start+index : start, cs );␊
681	␉ if ( ep >= 0 )␊
682	␉␉break;␊
683	␉ if ( !pl )␊
684	␉␉break;␊
685	␉ p++;␊
686	␉ pl--;␊
687	␉}␊
688	}␊
689	if ( len )␊
690	␉*len = ep >= 0 ? ep : 0; // No match -> 0, for historical reasons␊
691	return ep >= 0 ? (int)(p - start) : -1;␉␉// return index;␊
692	}␊
693	␊
694	/*! \fn int QRegExp::find( const QCString& str, int index )␊
695	␊
696	Attempts to match in \e str, starting from position \e index.␊
697	Returns the position of the match, or -1 if there was no match.␊
698	␊
699	\sa match()␊
700	*/␊
701	␊
702	//␊
703	// Translate wildcard pattern to standard regexp pattern.␊
704	// Ex:␉ .cpp␉==> ^.\.cpp$␊
705	//␊
706	␊
707	static QCString wc2rx( const QCString &pattern )␊
708	{␊
709	int patlen = (int)pattern.length();␊
710	QCString wcpattern("^");␊
711	␊
712	char c;␊
713	for( int i = 0; i < patlen; i++ ) {␊
714	␉c = pattern[i];␊
715	␉switch ( (char)c ) {␊
716	␉case '':␉␉␉␉// '' ==> '.*'␊
717	␉ wcpattern += '.';␊
718	␉ break;␊
719	␉case '?':␉␉␉␉// '?' ==> '.'␊
720	␉ c = '.';␊
721	␉ break;␊
722	␉case '.':␉␉␉␉// quote special regexp chars␊
723	␉case '+':␊
724	␉case '\\':␊
725	␉case '$':␊
726	␉case '^':␊
727	␉ wcpattern += '\\';␊
728	␉ break;␊
729	␉case '[':␊
730	␉ if ( (char)pattern[i+1] == '^' ) { // don't quote '^' after '['␊
731	␉␉wcpattern += '[';␊
732	␉␉c = pattern[i+1];␊
733	␉␉i++;␊
734	␉ }␊
735	␉ break;␊
736	␉}␊
737	␉wcpattern += c;␊
738	␊
739	}␊
740	wcpattern += '$';␊
741	return wcpattern;␉␉␉␉// return new regexp pattern␊
742	}␊
743	␊
744	␊
745	//␊
746	// Internal: Get char value and increment pointer.␊
747	//␊
748	␊
749	static uint char_val( const char *str, uint strlength ) // get char value␊
750	{␊
751	const char p = str;␊
752	uint pl = *strlength;␊
753	uint len = 1;␊
754	uint v = 0;␊
755	if ( (char)*p == '\\' ) {␉␉␉// escaped code␊
756	␉p++;␊
757	␉pl--;␊
758	␉if ( !pl ) {␉␉␉␉// it is just a '\'␊
759	␉ (*str)++;␊
760	␉ (*strlength)--;␊
761	␉ return '\\';␊
762	␉}␊
763	␉len++;␉␉␉␉␉// length at least 2␊
764	␉int i;␊
765	␉char c;␊
766	␉char ch = tolower((char)*p);␊
767	␉switch ( ch ) {␊
768	␉ case 'b': v = '\b'; break;␉// bell␊
769	␉ case 'f': v = '\f'; break;␉// form feed␊
770	␉ case 'n': v = '\n'; break;␉// newline␊
771	␉ case 'r': v = '\r'; break;␉// return␊
772	␉ case 't': v = '\t'; break;␉// tab␊
773	␉ case 's': v = PWS; break;␉␉// whitespace charclass␊
774	␉ case 'd': v = PDG; break;␉␉// digit charclass␊
775	␉ case '<': v = BOW; break;␉␉// word beginning matcher␊
776	␉ case '>': v = EOW; break;␉␉// word ending matcher␊
777	␊
778	␉ case 'x': {␉␉␉␉// hex code␊
779	␉␉p++;␊
780	␉␉pl--;␊
781	␉␉for ( i = 0; (i < 4) && pl; i++ ) {␉//up to 4 hex digits␊
782	␉␉ c = tolower((char)*p);␊
783	␉␉ bool a = ( c >= 'a' && c <= 'f' );␊
784	␉␉ if ( (c >= '0' && c <= '9') \|\| a ) {␊
785	␉␉␉v <<= 4;␊
786	␉␉␉v += a ? 10 + c - 'a' : c - '0';␊
787	␉␉␉len++;␊
788	␉␉ }␊
789	␉␉ else {␊
790	␉␉␉break;␊
791	␉␉ }␊
792	␉␉ p++;␊
793	␉␉ pl--;␊
794	␉␉}␊
795	␉ }␊
796	␉ break;␊
797	␊
798	␉ default: {␊
799	␉␉if ( ch >= '0' && ch <= '7' ) {␉//octal code␊
800	␉␉ len--;␊
801	␉␉ for ( i = 0; (i < 3) && pl; i++ ) {␉// up to 3 oct digits␊
802	␉␉␉c = (char)*p;␊
803	␉␉␉if ( c >= '0' && c <= '7' ) {␊
804	␉␉␉ v <<= 3;␊
805	␉␉␉ v += c - '0';␊
806	␉␉␉ len++;␊
807	␉␉␉}␊
808	␉␉␉else {␊
809	␉␉␉ break;␊
810	␉␉␉}␊
811	␉␉␉p++;␊
812	␉␉␉pl--;␊
813	␉␉ }␊
814	␉␉}␊
815	␉␉else {␉␉␉␉// not an octal number␊
816	␉␉ v = (uint)*p; //(((uint)(p->row())) << 8) \| ((uint)p->cell());␊
817	␉␉}␊
818	␉ }␊
819	␉}␊
820	} else {␊
821	␉v = (uint)*p; //(((uint)(p->row())) << 8) \| ((uint)p->cell());␊
822	}␊
823	*str += len;␊
824	*strlength -= len;␊
825	return v;␊
826	}␊
827	␊
828	␊
#if defined(DEBUG)
// Debugging aid: prints the compiled pattern starting at p with qDebug(),
// one word per line, up to and including its END word. The operand of a
// closure is printed by a recursive call. Returns a pointer to the word
// after the END that stopped the dump.
static uint *dump( uint *p )
{
    while ( *p != END ) {
        if ( *p & CHR ) {                       // literal character
            uchar uc = (uchar)*p;
            char c = (char)uc;
            uint u = (uint)uc; //(((uint)(uc.row())) << 8) | ((uint)uc.cell());
            qDebug( "\tCHR\tU%04x (%c)", u, (c ? c : ' '));
            p++;
        }
        else if ( *p & MCC ) {                  // character class + fields
            uint clcode = *p & MCD;
            uint numFields = *p & MVL;
            // numFields is unsigned, hence %u.
            if ( clcode == CCL )
                qDebug( "\tCCL\t%u", numFields );
            else if ( clcode == CCN )
                qDebug( "\tCCN\t%u", numFields );
            else
                qDebug("coding error!");
            for ( int i = 0; i < (int)numFields; i++ ) {
                p++;
                if ( *p == PWS )
                    qDebug( "\t\tPWS" );
                else if ( *p == PDG )
                    qDebug( "\t\tPDG" );
                else {
                    uint from = ( *p & MCD ) >> 16;
                    uint to = *p & MVL;
                    char fc = (char)from;
                    char tc = (char)to;
                    qDebug( "\t\tU%04x (%c) - U%04x (%c)", from,
                            (fc ? fc : ' '), to, (tc ? tc : ' ') );
                }
            }
            p++;
        }
        else switch ( *p++ ) {
            case PWS:
                qDebug( "\tPWS" );
                break;
            case PDG:
                qDebug( "\tPDG" );
                break;
            case BOL:
                qDebug( "\tBOL" );
                break;
            case EOL:
                qDebug( "\tEOL" );
                break;
            case BOW:
                qDebug( "\tBOW" );
                break;
            case EOW:
                qDebug( "\tEOW" );
                break;
            case ANY:
                qDebug( "\tANY" );
                break;
            case CLO:
                qDebug( "\tCLO" );
                p = dump( p );                  // operand, up to its END
                break;
            case OPT:
                qDebug( "\tOPT" );
                p = dump( p );                  // operand, up to its END
                break;
            default:                            // unknown word: print nothing
                break;
        }
    }
    qDebug( "\tEND" );
    return p+1;
}
#endif // DEBUG
902	␊
903	␊
904	static const int maxlen = 1024;␉␉␉// max length of regexp array␊
905	static uint rxarray[ maxlen ];␉␉␉// tmp regexp array␊
906	␊
907	/*!␊
908	\internal␊
909	Compiles the regular expression and stores the result in rxdata.␊
910	The 'error' flag is set to non-zero if an error is detected.␊
911	NOTE! This function is not reentrant!␊
912	*/␊
913	␊
914	void QRegExp::compile()␊
915	{␊
916	if ( rxdata ) {␉␉␉␉// delete old data␊
917	␉delete [] rxdata;␊
918	␉rxdata = 0;␊
919	}␊
920	if ( rxstring.isEmpty() ) {␉␉␉// no regexp pattern set␊
921	␉error = PatNull;␊
922	␉return;␊
923	}␊
924	␊
925	error = PatOk;␉␉␉␉// assume pattern is ok␊
926	␊
927	QCString pattern;␊
928	if ( wc )␊
929	␉pattern = wc2rx(rxstring);␊
930	else␊
931	␉pattern = rxstring;␊
932	const char *start = pattern.data();␉ // pattern pointer␊
933	const char *p = start;␉␉␉// pattern pointer␊
934	uint pl = pattern.length();␊
935	uint *d = rxarray;␉␉␉␉// data pointer␊
936	uint *prev_d = 0;␊
937	␊
938	#define GEN(x)␉*d++ = (x)␊
939	␊
940	while ( pl ) {␊
941	␉char ch = (char)*p;␊
942	␉switch ( ch ) {␊
943	␊
944	␉ case '^':␉␉␉␉// beginning of line␊
945	␉␉prev_d = d;␊
946	␉␉GEN( p == start ? BOL : (CHR \| ch) );␊
947	␉␉p++;␊
948	␉␉pl--;␊
949	␉␉break;␊
950	␊
951	␉ case '$':␉␉␉␉// end of line␊
952	␉␉prev_d = d;␊
953	␉␉GEN( pl == 1 ? EOL : (CHR \| ch) );␊
954	␉␉p++;␊
955	␉␉pl--;␊
956	␉␉break;␊
957	␊
958	␉ case '.':␉␉␉␉// any char␊
959	␉␉prev_d = d;␊
960	␉␉GEN( ANY );␊
961	␉␉p++;␊
962	␉␉pl--;␊
963	␉␉break;␊
964	␊
965	␉ case '[':␉␉␉␉// character class␊
966	␉␉{␊
967	␉␉prev_d = d;␊
968	␉␉p++;␊
969	␉␉pl--;␊
970	␉␉if ( !pl ) {␊
971	␉␉ error = PatSyntax;␊
972	␉␉ return;␊
973	␉␉}␊
974	␉␉bool firstIsEscaped = ( (char)*p == '\\' );␊
975	␉␉uint cch = char_val( &p, &pl );␊
976	␉␉if ( cch == '^' && !firstIsEscaped ) {␉// negate!␊
977	␉␉ GEN( CCN );␊
978	␉␉ if ( !pl ) {␊
979	␉␉␉error = PatSyntax;␊
980	␉␉␉return;␊
981	␉␉ }␊
982	␉␉ cch = char_val( &p, &pl );␊
983	␉␉} else {␊
984	␉␉ GEN( CCL );␊
985	␉␉}␊
986	␉␉uint numFields = 0;␊
987	␉␉while ( pl ) {␊
988	␉␉ if ((pl>2) && ((char)p == '-') && ((char)(p+1) != ']')) {␊
989	␉␉␉// Found a range␊
990	␉␉ ␉char_val( &p, &pl ); // Read the '-'␊
991	␉␉␉uint cch2 = char_val( &p, &pl ); // Read the range end␊
992	␉␉␉if ( cch > cch2 ) { ␉␉// swap start and stop␊
993	␉␉␉ int tmp = cch;␊
994	␉␉␉ cch = cch2;␊
995	␉␉␉ cch2 = tmp;␊
996	␉␉␉}␊
997	␉␉␉GEN( (cch << 16) \| cch2 );␉// from < to␊
998	␉␉␉numFields++;␊
999	␉␉ }␊
1000	␉␉ else {␊
1001	␉␉␉// Found a single character␊
1002	␉␉␉if ( cch & MCD ) // It's a code; will not be mistaken␊
1003	␉␉␉ GEN( cch );␉ // for a range, since from > to␊
1004	␉␉␉else␊
1005	␉␉␉ GEN( (cch << 16) \| cch ); // from == to range␊
1006	␉␉␉numFields++;␊
1007	␉␉ }␊
1008	␉␉ if ( d >= rxarray + maxlen ) {␉// pattern too long␊
1009	␉␉␉error = PatOverflow;␉␉␊
1010	␉␉␉return;␊
1011	␉␉ }␊
1012	␉␉ if ( !pl ) {␉␉// At least ']' should be left␊
1013	␉␉␉error = PatSyntax;␊
1014	␉␉␉return;␊
1015	␉␉ }␊
1016	␉␉ bool nextIsEscaped = ( (char)*p == '\\' );␊
1017	␉␉ cch = char_val( &p, &pl );␊
1018	␉␉ if ( cch == (uint)']' && !nextIsEscaped )␊
1019	␉␉␉break;␊
1020	␉␉ if ( !pl ) {␉␉// End, should have seen ']'␊
1021	␉␉␉error = PatSyntax;␊
1022	␉␉␉return;␊
1023	␉␉ }␊
1024	␉␉}␊
1025	␉␉*prev_d \|= numFields;␉␉// Store number of fields␊
1026	␉␉}␊
1027	␉␉break;␊
1028	␊
1029	␉ case '*':␉␉␉␉// Kleene closure, or␊
1030	␉ case '+':␉␉␉␉// positive closure, or␊
1031	␉ case '?':␉␉␉␉// optional closure␊
1032	␉␉{␊
1033	␉␉if ( prev_d == 0 ) {␉␉// no previous expression␊
1034	␉␉ error = PatSyntax;␉␉// empty closure␊
1035	␉␉ return;␊
1036	␉␉}␊
1037	␉␉switch ( *prev_d ) {␉␉// test if invalid closure␊
1038	␉␉ case BOL:␊
1039	␉␉ case BOW:␊
1040	␉␉ case EOW:␊
1041	␉␉ case CLO:␊
1042	␉␉ case OPT:␊
1043	␉␉␉error = PatSyntax;␊
1044	␉␉␉return;␊
1045	␉␉}␊
1046	␉␉int ddiff = d - prev_d;␊
1047	␉␉if ( *p == '+' ) {␉␉// convert to Kleene closure␊
1048	␉␉ if ( d + ddiff >= rxarray + maxlen ) {␊
1049	␉␉␉error = PatOverflow;␉// pattern too long␊
1050	␉␉␉return;␊
1051	␉␉ }␊
1052	␉␉ memcpy( d, prev_d, ddiff*sizeof(uint) );␊
1053	␉␉ d += ddiff;␊
1054	␉␉ prev_d += ddiff;␊
1055	␉␉}␊
1056	␉␉memmove( prev_d+1, prev_d, ddiff*sizeof(uint) );␊
1057	␉␉*prev_d = ch == '?' ? OPT : CLO;␊
1058	␉␉d++;␊
1059	␉␉GEN( END );␊
1060	␉␉p++;␊
1061	␉␉pl--;␊
1062	␉␉}␊
1063	␉␉break;␊
1064	␊
1065	␉ default:␊
1066	␉␉{␊
1067	␉␉prev_d = d;␊
1068	␉␉uint cv = char_val( &p, &pl );␊
1069	␉␉if ( cv & MCD ) {␉␉␉// It's a code␊
1070	␉␉ GEN( cv );␊
1071	␉␉}␊
1072	␉␉else {␊
1073	␉␉ if ( !cs && cv <= 0xff )␉␉// #only 8bit support␊
1074	␉␉␉cv = tolower( cv );␊
1075	␉␉ GEN( CHR \| cv );␊
1076	␉␉}␊
1077	␉␉}␊
1078	␉}␊
1079	␉if ( d >= rxarray + maxlen ) {␉␉// oops!␊
1080	␉ error = PatOverflow;␉␉// pattern too long␊
1081	␉ return;␊
1082	␉}␊
1083	}␊
1084	GEN( END );␊
1085	int len = d - rxarray;␊
1086	rxdata = new uint[ len ];␉␉␉// copy from rxarray to rxdata␊
1087	CHECK_PTR( rxdata );␊
1088	memcpy( rxdata, rxarray, len*sizeof(uint) );␊
1089	#if defined(DEBUG)␊
1090	//dump( rxdata );␉// uncomment this line for debugging␊
1091	#endif␊
1092	}␊
1093

Download this file