Chameleon

Chameleon Svn Source Tree

Root/branches/xZenu/src/modules/TinyXML/tinyxmlparser.cpp

Source at commit 1307 created 12 years 8 months ago.
By meklort, Add TinyXML as a module
1/*
2www.sourceforge.net/projects/tinyxml
3Original code by Lee Thomason (www.grinninglizard.com)
4
5This software is provided 'as-is', without any express or implied
6warranty. In no event will the authors be held liable for any
7damages arising from the use of this software.
8
9Permission is granted to anyone to use this software for any
10purpose, including commercial applications, and to alter it and
11redistribute it freely, subject to the following restrictions:
12
131. The origin of this software must not be misrepresented; you must
14not claim that you wrote the original software. If you use this
15software in a product, an acknowledgment in the product documentation
16would be appreciated but is not required.
17
182. Altered source versions must be plainly marked as such, and
19must not be misrepresented as being the original software.
20
213. This notice may not be removed or altered from any source
22distribution.
23*/
24
25#include <ctype.h>
26#include <stddef.h>
27
28#include "tinyxml.h"
29
30//#define DEBUG_PARSER
31#if defined( DEBUG_PARSER )
32#if defined( DEBUG ) && defined( _MSC_VER )
33#include <windows.h>
34#define TIXML_LOG OutputDebugString
35#else
36#define TIXML_LOG printf
37#endif
38#endif
39
40// Note tha "PutString" hardcodes the same list. This
41// is less flexible than it appears. Changing the entries
42// or order will break putstring.
43TiXmlBase::Entity TiXmlBase::entity[ TiXmlBase::NUM_ENTITY ] =
44{
45{ "&amp;", 5, '&' },
46{ "&lt;", 4, '<' },
47{ "&gt;", 4, '>' },
48{ "&quot;", 6, '\"' },
49{ "&apos;", 6, '\'' }
50};
51
// A bunch of Unicode info is available at:
//     http://www.unicode.org/faq/utf_bom.html
// including the basis of the table below, which determines the number of
// bytes in a sequence from its lead byte. 1 is placed for invalid sequences --
// although the result will be junk, pass it through as much as possible.
// Beware of the non-characters in UTF-8:
//     ef bb bf (Microsoft "lead bytes")
//     ef bf be
//     ef bf bf
61
// The three-byte sequence EF BB BF (the Microsoft UTF-8 "lead bytes").
const unsigned char TIXML_UTF_LEAD_0 = 0xEFU;
const unsigned char TIXML_UTF_LEAD_1 = 0xBBU;
const unsigned char TIXML_UTF_LEAD_2 = 0xBFU;
65
66const int TiXmlBase::utf8ByteTable[256] =
67{
68//0123456789abcdef
691,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x00
701,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x10
711,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x20
721,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x30
731,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x40
741,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x50
751,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x60
761,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x70End of ASCII range
771,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x80 0x80 to 0xc1 invalid
781,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x90
791,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0xa0
801,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0xb0
811,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xc0 0xc2 to 0xdf 2 byte
822,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xd0
833,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,// 0xe0 0xe0 to 0xef 3 byte
844,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1// 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
85};
86
87
88void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
89{
90const unsigned long BYTE_MASK = 0xBF;
91const unsigned long BYTE_MARK = 0x80;
92const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
93
94if (input < 0x80)
95*length = 1;
96else if ( input < 0x800 )
97*length = 2;
98else if ( input < 0x10000 )
99*length = 3;
100else if ( input < 0x200000 )
101*length = 4;
102else
103{ *length = 0; return; }// This code won't covert this correctly anyway.
104
105output += *length;
106
107// Scary scary fall throughs.
108switch (*length)
109{
110case 4:
111--output;
112*output = (char)((input | BYTE_MARK) & BYTE_MASK);
113input >>= 6;
114case 3:
115--output;
116*output = (char)((input | BYTE_MARK) & BYTE_MASK);
117input >>= 6;
118case 2:
119--output;
120*output = (char)((input | BYTE_MARK) & BYTE_MASK);
121input >>= 6;
122case 1:
123--output;
124*output = (char)(input | FIRST_BYTE_MARK[*length]);
125}
126}
127
128
129/*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
130{
131// This will only work for low-ascii, everything else is assumed to be a valid
132// letter. I'm not sure this is the best approach, but it is quite tricky trying
133// to figure out alhabetical vs. not across encoding. So take a very
134// conservative approach.
135
136//if ( encoding == TIXML_ENCODING_UTF8 )
137//{
138if ( anyByte < 127 )
139return isalpha( anyByte );
140else
141return 1;// What else to do? The unicode set is huge...get the english ones right.
142//}
143//else
144//{
145//return isalpha( anyByte );
146//}
147}
148
149
150/*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
151{
152// This will only work for low-ascii, everything else is assumed to be a valid
153// letter. I'm not sure this is the best approach, but it is quite tricky trying
154// to figure out alhabetical vs. not across encoding. So take a very
155// conservative approach.
156
157//if ( encoding == TIXML_ENCODING_UTF8 )
158//{
159if ( anyByte < 127 )
160return isalnum( anyByte );
161else
162return 1;// What else to do? The unicode set is huge...get the english ones right.
163//}
164//else
165//{
166//return isalnum( anyByte );
167//}
168}
169
170
171class TiXmlParsingData
172{
173friend class TiXmlDocument;
174 public:
175void Stamp( const char* now, TiXmlEncoding encoding );
176
177const TiXmlCursor& Cursor() const{ return cursor; }
178
179 private:
180// Only used by the document!
181TiXmlParsingData( const char* start, int _tabsize, int row, int col )
182{
183assert( start );
184stamp = start;
185tabsize = _tabsize;
186cursor.row = row;
187cursor.col = col;
188}
189
190TiXmlCursorcursor;
191const char*stamp;
192inttabsize;
193};
194
195
196void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
197{
198assert( now );
199
200// Do nothing if the tabsize is 0.
201if ( tabsize < 1 )
202{
203return;
204}
205
206// Get the current row, column.
207int row = cursor.row;
208int col = cursor.col;
209const char* p = stamp;
210assert( p );
211
212while ( p < now )
213{
214// Treat p as unsigned, so we have a happy compiler.
215const unsigned char* pU = (const unsigned char*)p;
216
217// Code contributed by Fletcher Dunn: (modified by lee)
218switch (*pU) {
219case 0:
220// We *should* never get here, but in case we do, don't
221// advance past the terminating null character, ever
222return;
223
224case '\r':
225// bump down to the next line
226++row;
227col = 0;
228// Eat the character
229++p;
230
231// Check for \r\n sequence, and treat this as a single character
232if (*p == '\n') {
233++p;
234}
235break;
236
237case '\n':
238// bump down to the next line
239++row;
240col = 0;
241
242// Eat the character
243++p;
244
245// Check for \n\r sequence, and treat this as a single
246// character. (Yes, this bizarre thing does occur still
247// on some arcane platforms...)
248if (*p == '\r') {
249++p;
250}
251break;
252
253case '\t':
254// Eat the character
255++p;
256
257// Skip to next tab stop
258col = (col / tabsize + 1) * tabsize;
259break;
260
261case TIXML_UTF_LEAD_0:
262if ( encoding == TIXML_ENCODING_UTF8 )
263{
264if ( *(p+1) && *(p+2) )
265{
266// In these cases, don't advance the column. These are
267// 0-width spaces.
268if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 )
269p += 3;
270else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU )
271p += 3;
272else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU )
273p += 3;
274else
275{ p +=3; ++col; }// A normal character.
276}
277}
278else
279{
280++p;
281++col;
282}
283break;
284
285default:
286if ( encoding == TIXML_ENCODING_UTF8 )
287{
288// Eat the 1 to 4 byte utf8 character.
289int step = TiXmlBase::utf8ByteTable[*((const unsigned char*)p)];
290if ( step == 0 )
291step = 1;// Error case from bad encoding, but handle gracefully.
292p += step;
293
294// Just advance one column, of course.
295++col;
296}
297else
298{
299++p;
300++col;
301}
302break;
303}
304}
305cursor.row = row;
306cursor.col = col;
307assert( cursor.row >= -1 );
308assert( cursor.col >= -1 );
309stamp = p;
310assert( stamp );
311}
312
313
314const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
315{
316if ( !p || !*p )
317{
318return 0;
319}
320if ( encoding == TIXML_ENCODING_UTF8 )
321{
322while ( *p )
323{
324const unsigned char* pU = (const unsigned char*)p;
325
326// Skip the stupid Microsoft UTF-8 Byte order marks
327if (*(pU+0)==TIXML_UTF_LEAD_0
328 && *(pU+1)==TIXML_UTF_LEAD_1
329 && *(pU+2)==TIXML_UTF_LEAD_2 )
330{
331p += 3;
332continue;
333}
334else if(*(pU+0)==TIXML_UTF_LEAD_0
335 && *(pU+1)==0xbfU
336 && *(pU+2)==0xbeU )
337{
338p += 3;
339continue;
340}
341else if(*(pU+0)==TIXML_UTF_LEAD_0
342 && *(pU+1)==0xbfU
343 && *(pU+2)==0xbfU )
344{
345p += 3;
346continue;
347}
348
349if ( IsWhiteSpace( *p ) )// Still using old rules for white space.
350++p;
351else
352break;
353}
354}
355else
356{
357while ( *p && IsWhiteSpace( *p ) )
358++p;
359}
360
361return p;
362}
363
364#ifdef TIXML_USE_STL
365/*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag )
366{
367for( ;; )
368{
369if ( !in->good() ) return false;
370
371int c = in->peek();
372// At this scope, we can't get to a document. So fail silently.
373if ( !IsWhiteSpace( c ) || c <= 0 )
374return true;
375
376*tag += (char) in->get();
377}
378}
379
380/*static*/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag )
381{
382//assert( character > 0 && character < 128 );// else it won't work in utf-8
383while ( in->good() )
384{
385int c = in->peek();
386if ( c == character )
387return true;
388if ( c <= 0 )// Silent failure: can't get document at this scope
389return false;
390
391in->get();
392*tag += (char) c;
393}
394return false;
395}
396#endif
397
398// One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The
399// "assign" optimization removes over 10% of the execution time.
400//
401const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
402{
403// Oddly, not supported on some comilers,
404//name->clear();
405// So use this:
406*name = "";
407assert( p );
408
409// Names start with letters or underscores.
410// Of course, in unicode, tinyxml has no idea what a letter *is*. The
411// algorithm is generous.
412//
413// After that, they can be letters, underscores, numbers,
414// hyphens, or colons. (Colons are valid ony for namespaces,
415// but tinyxml can't tell namespaces from names.)
416if ( p && *p
417 && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) )
418{
419const char* start = p;
420while(p && *p
421&&(IsAlphaNum( (unsigned char ) *p, encoding )
422 || *p == '_'
423 || *p == '-'
424 || *p == '.'
425 || *p == ':' ) )
426{
427//(*name) += *p; // expensive
428++p;
429}
430if ( p-start > 0 ) {
431name->assign( start, p-start );
432}
433return p;
434}
435return 0;
436}
437
438const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
439{
440// Presume an entity, and pull it out.
441 TIXML_STRING ent;
442int i;
443*length = 0;
444
445if ( *(p+1) && *(p+1) == '#' && *(p+2) )
446{
447unsigned long ucs = 0;
448ptrdiff_t delta = 0;
449unsigned mult = 1;
450
451if ( *(p+2) == 'x' )
452{
453// Hexadecimal.
454if ( !*(p+3) ) return 0;
455
456const char* q = p+3;
457q = strchr( q, ';' );
458
459if ( !q || !*q ) return 0;
460
461delta = q-p;
462--q;
463
464while ( *q != 'x' )
465{
466if ( *q >= '0' && *q <= '9' )
467ucs += mult * (*q - '0');
468else if ( *q >= 'a' && *q <= 'f' )
469ucs += mult * (*q - 'a' + 10);
470else if ( *q >= 'A' && *q <= 'F' )
471ucs += mult * (*q - 'A' + 10 );
472else
473return 0;
474mult *= 16;
475--q;
476}
477}
478else
479{
480// Decimal.
481if ( !*(p+2) ) return 0;
482
483const char* q = p+2;
484q = strchr( q, ';' );
485
486if ( !q || !*q ) return 0;
487
488delta = q-p;
489--q;
490
491while ( *q != '#' )
492{
493if ( *q >= '0' && *q <= '9' )
494ucs += mult * (*q - '0');
495else
496return 0;
497mult *= 10;
498--q;
499}
500}
501if ( encoding == TIXML_ENCODING_UTF8 )
502{
503// convert the UCS to UTF-8
504ConvertUTF32ToUTF8( ucs, value, length );
505}
506else
507{
508*value = (char)ucs;
509*length = 1;
510}
511return p + delta + 1;
512}
513
514// Now try to match it.
515for( i=0; i<NUM_ENTITY; ++i )
516{
517if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
518{
519assert( strlen( entity[i].str ) == entity[i].strLength );
520*value = entity[i].chr;
521*length = 1;
522return ( p + entity[i].strLength );
523}
524}
525
526// So it wasn't an entity, its unrecognized, or something like that.
527*value = *p;// Don't put back the last one, since we return it!
528//*length = 1;// Leave unrecognized entities - this doesn't really work.
529// Just writes strange XML.
530return p+1;
531}
532
533
534bool TiXmlBase::StringEqual( const char* p,
535 const char* tag,
536 bool ignoreCase,
537 TiXmlEncoding encoding )
538{
539assert( p );
540assert( tag );
541if ( !p || !*p )
542{
543assert( 0 );
544return false;
545}
546
547const char* q = p;
548
549if ( ignoreCase )
550{
551while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) )
552{
553++q;
554++tag;
555}
556
557if ( *tag == 0 )
558return true;
559}
560else
561{
562while ( *q && *tag && *q == *tag )
563{
564++q;
565++tag;
566}
567
568if ( *tag == 0 )// Have we found the end of the tag, and everything equal?
569return true;
570}
571return false;
572}
573
574const char* TiXmlBase::ReadText(const char* p,
575TIXML_STRING * text,
576bool trimWhiteSpace,
577const char* endTag,
578bool caseInsensitive,
579TiXmlEncoding encoding )
580{
581 *text = "";
582if ( !trimWhiteSpace// certain tags always keep whitespace
583 || !condenseWhiteSpace )// if true, whitespace is always kept
584{
585// Keep all the white space.
586while ( p && *p
587&& !StringEqual( p, endTag, caseInsensitive, encoding )
588 )
589{
590int len;
591char cArr[4] = { 0, 0, 0, 0 };
592p = GetChar( p, cArr, &len, encoding );
593text->append( cArr, len );
594}
595}
596else
597{
598bool whitespace = false;
599
600// Remove leading white space:
601p = SkipWhiteSpace( p, encoding );
602while ( p && *p
603&& !StringEqual( p, endTag, caseInsensitive, encoding ) )
604{
605if ( *p == '\r' || *p == '\n' )
606{
607whitespace = true;
608++p;
609}
610else if ( IsWhiteSpace( *p ) )
611{
612whitespace = true;
613++p;
614}
615else
616{
617// If we've found whitespace, add it before the
618// new character. Any whitespace just becomes a space.
619if ( whitespace )
620{
621(*text) += ' ';
622whitespace = false;
623}
624int len;
625char cArr[4] = { 0, 0, 0, 0 };
626p = GetChar( p, cArr, &len, encoding );
627if ( len == 1 )
628(*text) += cArr[0];// more efficient
629else
630text->append( cArr, len );
631}
632}
633}
634if ( p && *p )
635p += strlen( endTag );
636return ( p && *p ) ? p : 0;
637}
638
639#ifdef TIXML_USE_STL
640
641void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag )
642{
643// The basic issue with a document is that we don't know what we're
644// streaming. Read something presumed to be a tag (and hope), then
645// identify it, and call the appropriate stream method on the tag.
646//
647// This "pre-streaming" will never read the closing ">" so the
648// sub-tag can orient itself.
649
650if ( !StreamTo( in, '<', tag ) )
651{
652SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
653return;
654}
655
656while ( in->good() )
657{
658int tagIndex = (int) tag->length();
659while ( in->good() && in->peek() != '>' )
660{
661int c = in->get();
662if ( c <= 0 )
663{
664SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
665break;
666}
667(*tag) += (char) c;
668}
669
670if ( in->good() )
671{
672// We now have something we presume to be a node of
673// some sort. Identify it, and call the node to
674// continue streaming.
675TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
676
677if ( node )
678{
679node->StreamIn( in, tag );
680bool isElement = node->ToElement() != 0;
681delete node;
682node = 0;
683
684// If this is the root element, we're done. Parsing will be
685// done by the >> operator.
686if ( isElement )
687{
688return;
689}
690}
691else
692{
693SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
694return;
695}
696}
697}
698// We should have returned sooner.
699SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
700}
701
702#endif
703
704const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
705{
706ClearError();
707
708// Parse away, at the document level. Since a document
709// contains nothing but other tags, most of what happens
710// here is skipping white space.
711if ( !p || !*p )
712{
713SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
714return 0;
715}
716
717// Note that, for a document, this needs to come
718// before the while space skip, so that parsing
719// starts from the pointer we are given.
720location.Clear();
721if ( prevData )
722{
723location.row = prevData->cursor.row;
724location.col = prevData->cursor.col;
725}
726else
727{
728location.row = 0;
729location.col = 0;
730}
731TiXmlParsingData data( p, TabSize(), location.row, location.col );
732location = data.Cursor();
733
734if ( encoding == TIXML_ENCODING_UNKNOWN )
735{
736// Check for the Microsoft UTF-8 lead bytes.
737const unsigned char* pU = (const unsigned char*)p;
738if (*(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0
739 && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1
740 && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 )
741{
742encoding = TIXML_ENCODING_UTF8;
743useMicrosoftBOM = true;
744}
745}
746
747 p = SkipWhiteSpace( p, encoding );
748if ( !p )
749{
750SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
751return 0;
752}
753
754while ( p && *p )
755{
756TiXmlNode* node = Identify( p, encoding );
757if ( node )
758{
759p = node->Parse( p, &data, encoding );
760LinkEndChild( node );
761}
762else
763{
764break;
765}
766
767// Did we get encoding info?
768if ( encoding == TIXML_ENCODING_UNKNOWN
769 && node->ToDeclaration() )
770{
771TiXmlDeclaration* dec = node->ToDeclaration();
772const char* enc = dec->Encoding();
773assert( enc );
774
775if ( *enc == 0 )
776encoding = TIXML_ENCODING_UTF8;
777else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
778encoding = TIXML_ENCODING_UTF8;
779else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
780encoding = TIXML_ENCODING_UTF8;// incorrect, but be nice
781else
782encoding = TIXML_ENCODING_LEGACY;
783}
784
785p = SkipWhiteSpace( p, encoding );
786}
787
788// Was this empty?
789if ( !firstChild ) {
790SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
791return 0;
792}
793
794// All is well.
795return p;
796}
797
798void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
799{
800// The first error in a chain is more accurate - don't set again!
801if ( error )
802return;
803
804assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
805error = true;
806errorId = err;
807errorDesc = errorString[ errorId ];
808
809errorLocation.Clear();
810if ( pError && data )
811{
812data->Stamp( pError, encoding );
813errorLocation = data->Cursor();
814}
815}
816
817
818TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
819{
820TiXmlNode* returnNode = 0;
821
822p = SkipWhiteSpace( p, encoding );
823if( !p || !*p || *p != '<' )
824{
825return 0;
826}
827
828p = SkipWhiteSpace( p, encoding );
829
830if ( !p || !*p )
831{
832return 0;
833}
834
835// What is this thing?
836// - Elements start with a letter or underscore, but xml is reserved.
837// - Comments: <!--
838// - Decleration: <?xml
839// - Everthing else is unknown to tinyxml.
840//
841
842const char* xmlHeader = { "<?xml" };
843const char* commentHeader = { "<!--" };
844const char* dtdHeader = { "<!" };
845const char* cdataHeader = { "<![CDATA[" };
846
847if ( StringEqual( p, xmlHeader, true, encoding ) )
848{
849#ifdef DEBUG_PARSER
850TIXML_LOG( "XML parsing Declaration\n" );
851#endif
852returnNode = new TiXmlDeclaration();
853}
854else if ( StringEqual( p, commentHeader, false, encoding ) )
855{
856#ifdef DEBUG_PARSER
857TIXML_LOG( "XML parsing Comment\n" );
858#endif
859returnNode = new TiXmlComment();
860}
861else if ( StringEqual( p, cdataHeader, false, encoding ) )
862{
863#ifdef DEBUG_PARSER
864TIXML_LOG( "XML parsing CDATA\n" );
865#endif
866TiXmlText* text = new TiXmlText( "" );
867text->SetCDATA( true );
868returnNode = text;
869}
870else if ( StringEqual( p, dtdHeader, false, encoding ) )
871{
872#ifdef DEBUG_PARSER
873TIXML_LOG( "XML parsing Unknown(1)\n" );
874#endif
875returnNode = new TiXmlUnknown();
876}
877else if ( IsAlpha( *(p+1), encoding )
878 || *(p+1) == '_' )
879{
880#ifdef DEBUG_PARSER
881TIXML_LOG( "XML parsing Element\n" );
882#endif
883returnNode = new TiXmlElement( "" );
884}
885else
886{
887#ifdef DEBUG_PARSER
888TIXML_LOG( "XML parsing Unknown(2)\n" );
889#endif
890returnNode = new TiXmlUnknown();
891}
892
893if ( returnNode )
894{
895// Set the parent, so it can report errors
896returnNode->parent = this;
897}
898return returnNode;
899}
900
901#ifdef TIXML_USE_STL
902
903void TiXmlElement::StreamIn (std::istream * in, TIXML_STRING * tag)
904{
905// We're called with some amount of pre-parsing. That is, some of "this"
906// element is in "tag". Go ahead and stream to the closing ">"
907while( in->good() )
908{
909int c = in->get();
910if ( c <= 0 )
911{
912TiXmlDocument* document = GetDocument();
913if ( document )
914document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
915return;
916}
917(*tag) += (char) c ;
918
919if ( c == '>' )
920break;
921}
922
923if ( tag->length() < 3 ) return;
924
925// Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
926// If not, identify and stream.
927
928if ( tag->at( tag->length() - 1 ) == '>'
929 && tag->at( tag->length() - 2 ) == '/' )
930{
931// All good!
932return;
933}
934else if ( tag->at( tag->length() - 1 ) == '>' )
935{
936// There is more. Could be:
937//text
938//cdata text (which looks like another node)
939//closing tag
940//another node.
941for ( ;; )
942{
943StreamWhiteSpace( in, tag );
944
945// Do we have text?
946if ( in->good() && in->peek() != '<' )
947{
948// Yep, text.
949TiXmlText text( "" );
950text.StreamIn( in, tag );
951
952// What follows text is a closing tag or another node.
953// Go around again and figure it out.
954continue;
955}
956
957// We now have either a closing tag...or another node.
958// We should be at a "<", regardless.
959if ( !in->good() ) return;
960assert( in->peek() == '<' );
961int tagIndex = (int) tag->length();
962
963bool closingTag = false;
964bool firstCharFound = false;
965
966for( ;; )
967{
968if ( !in->good() )
969return;
970
971int c = in->peek();
972if ( c <= 0 )
973{
974TiXmlDocument* document = GetDocument();
975if ( document )
976document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
977return;
978}
979
980if ( c == '>' )
981break;
982
983*tag += (char) c;
984in->get();
985
986// Early out if we find the CDATA id.
987if ( c == '[' && tag->size() >= 9 )
988{
989size_t len = tag->size();
990const char* start = tag->c_str() + len - 9;
991if ( strcmp( start, "<![CDATA[" ) == 0 ) {
992assert( !closingTag );
993break;
994}
995}
996
997if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
998{
999firstCharFound = true;
1000if ( c == '/' )
1001closingTag = true;
1002}
1003}
1004// If it was a closing tag, then read in the closing '>' to clean up the input stream.
1005// If it was not, the streaming will be done by the tag.
1006if ( closingTag )
1007{
1008if ( !in->good() )
1009return;
1010
1011int c = in->get();
1012if ( c <= 0 )
1013{
1014TiXmlDocument* document = GetDocument();
1015if ( document )
1016document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1017return;
1018}
1019assert( c == '>' );
1020*tag += (char) c;
1021
1022// We are done, once we've found our closing tag.
1023return;
1024}
1025else
1026{
1027// If not a closing tag, id it, and stream.
1028const char* tagloc = tag->c_str() + tagIndex;
1029TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
1030if ( !node )
1031return;
1032node->StreamIn( in, tag );
1033delete node;
1034node = 0;
1035
1036// No return: go around from the beginning: text, closing tag, or node.
1037}
1038}
1039}
1040}
1041#endif
1042
1043const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1044{
1045p = SkipWhiteSpace( p, encoding );
1046TiXmlDocument* document = GetDocument();
1047
1048if ( !p || !*p )
1049{
1050if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
1051return 0;
1052}
1053
1054if ( data )
1055{
1056data->Stamp( p, encoding );
1057location = data->Cursor();
1058}
1059
1060if ( *p != '<' )
1061{
1062if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
1063return 0;
1064}
1065
1066p = SkipWhiteSpace( p+1, encoding );
1067
1068// Read the name.
1069const char* pErr = p;
1070
1071 p = ReadName( p, &value, encoding );
1072if ( !p || !*p )
1073{
1074if ( document )document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
1075return 0;
1076}
1077
1078 TIXML_STRING endTag ("</");
1079endTag += value;
1080
1081// Check for and read attributes. Also look for an empty
1082// tag or an end tag.
1083while ( p && *p )
1084{
1085pErr = p;
1086p = SkipWhiteSpace( p, encoding );
1087if ( !p || !*p )
1088{
1089if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1090return 0;
1091}
1092if ( *p == '/' )
1093{
1094++p;
1095// Empty tag.
1096if ( *p != '>' )
1097{
1098if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );
1099return 0;
1100}
1101return (p+1);
1102}
1103else if ( *p == '>' )
1104{
1105// Done with attributes (if there were any.)
1106// Read the value -- which can include other
1107// elements -- read the end tag, and return.
1108++p;
1109p = ReadValue( p, data, encoding );// Note this is an Element method, and will set the error if one happens.
1110if ( !p || !*p ) {
1111// We were looking for the end tag, but found nothing.
1112// Fix for [ 1663758 ] Failure to report error on bad XML
1113if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
1114return 0;
1115}
1116
1117// We should find the end tag now
1118// note that:
1119// </foo > and
1120// </foo>
1121// are both valid end tags.
1122if ( StringEqual( p, endTag.c_str(), false, encoding ) )
1123{
1124p += endTag.length();
1125p = SkipWhiteSpace( p, encoding );
1126if ( p && *p && *p == '>' ) {
1127++p;
1128return p;
1129}
1130if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
1131return 0;
1132}
1133else
1134{
1135if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
1136return 0;
1137}
1138}
1139else
1140{
1141// Try to read an attribute:
1142TiXmlAttribute* attrib = new TiXmlAttribute();
1143if ( !attrib )
1144{
1145return 0;
1146}
1147
1148attrib->SetDocument( document );
1149pErr = p;
1150p = attrib->Parse( p, data, encoding );
1151
1152if ( !p || !*p )
1153{
1154if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
1155delete attrib;
1156return 0;
1157}
1158
1159// Handle the strange case of double attributes:
1160#ifdef TIXML_USE_STL
1161TiXmlAttribute* node = attributeSet.Find( attrib->NameTStr() );
1162#else
1163TiXmlAttribute* node = attributeSet.Find( attrib->Name() );
1164#endif
1165if ( node )
1166{
1167if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
1168delete attrib;
1169return 0;
1170}
1171
1172attributeSet.Add( attrib );
1173}
1174}
1175return p;
1176}
1177
1178
1179const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1180{
1181TiXmlDocument* document = GetDocument();
1182
1183// Read in text and elements in any order.
1184const char* pWithWhiteSpace = p;
1185p = SkipWhiteSpace( p, encoding );
1186
1187while ( p && *p )
1188{
1189if ( *p != '<' )
1190{
1191// Take what we have, make a text element.
1192TiXmlText* textNode = new TiXmlText( "" );
1193
1194if ( !textNode )
1195{
1196 return 0;
1197}
1198
1199if ( TiXmlBase::IsWhiteSpaceCondensed() )
1200{
1201p = textNode->Parse( p, data, encoding );
1202}
1203else
1204{
1205// Special case: we want to keep the white space
1206// so that leading spaces aren't removed.
1207p = textNode->Parse( pWithWhiteSpace, data, encoding );
1208}
1209
1210if ( !textNode->Blank() )
1211LinkEndChild( textNode );
1212else
1213delete textNode;
1214}
1215else
1216{
1217// We hit a '<'
1218// Have we hit a new element or an end tag? This could also be
1219// a TiXmlText in the "CDATA" style.
1220if ( StringEqual( p, "</", false, encoding ) )
1221{
1222return p;
1223}
1224else
1225{
1226TiXmlNode* node = Identify( p, encoding );
1227if ( node )
1228{
1229p = node->Parse( p, data, encoding );
1230LinkEndChild( node );
1231}
1232else
1233{
1234return 0;
1235}
1236}
1237}
1238pWithWhiteSpace = p;
1239p = SkipWhiteSpace( p, encoding );
1240}
1241
1242if ( !p )
1243{
1244if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
1245}
1246return p;
1247}
1248
1249
1250#ifdef TIXML_USE_STL
1251void TiXmlUnknown::StreamIn( std::istream * in, TIXML_STRING * tag )
1252{
1253while ( in->good() )
1254{
1255int c = in->get();
1256if ( c <= 0 )
1257{
1258TiXmlDocument* document = GetDocument();
1259if ( document )
1260document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1261return;
1262}
1263(*tag) += (char) c;
1264
1265if ( c == '>' )
1266{
1267// All is well.
1268return;
1269}
1270}
1271}
1272#endif
1273
1274
1275const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1276{
1277TiXmlDocument* document = GetDocument();
1278p = SkipWhiteSpace( p, encoding );
1279
1280if ( data )
1281{
1282data->Stamp( p, encoding );
1283location = data->Cursor();
1284}
1285if ( !p || !*p || *p != '<' )
1286{
1287if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
1288return 0;
1289}
1290++p;
1291 value = "";
1292
1293while ( p && *p && *p != '>' )
1294{
1295value += *p;
1296++p;
1297}
1298
1299if ( !p )
1300{
1301if ( document )
1302document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
1303}
1304if ( p && *p == '>' )
1305return p+1;
1306return p;
1307}
1308
1309#ifdef TIXML_USE_STL
1310void TiXmlComment::StreamIn( std::istream * in, TIXML_STRING * tag )
1311{
1312while ( in->good() )
1313{
1314int c = in->get();
1315if ( c <= 0 )
1316{
1317TiXmlDocument* document = GetDocument();
1318if ( document )
1319document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1320return;
1321}
1322
1323(*tag) += (char) c;
1324
1325if ( c == '>'
1326 && tag->at( tag->length() - 2 ) == '-'
1327 && tag->at( tag->length() - 3 ) == '-' )
1328{
1329// All is well.
1330return;
1331}
1332}
1333}
1334#endif
1335
1336
1337const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1338{
1339TiXmlDocument* document = GetDocument();
1340value = "";
1341
1342p = SkipWhiteSpace( p, encoding );
1343
1344if ( data )
1345{
1346data->Stamp( p, encoding );
1347location = data->Cursor();
1348}
1349const char* startTag = "<!--";
1350const char* endTag = "-->";
1351
1352if ( !StringEqual( p, startTag, false, encoding ) )
1353{
1354if ( document )
1355document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
1356return 0;
1357}
1358p += strlen( startTag );
1359
1360// [ 1475201 ] TinyXML parses entities in comments
1361// Oops - ReadText doesn't work, because we don't want to parse the entities.
1362// p = ReadText( p, &value, false, endTag, false, encoding );
1363//
1364// from the XML spec:
1365/*
1366 [Definition: Comments may appear anywhere in a document outside other markup; in addition,
1367 they may appear within the document type declaration at places allowed by the grammar.
1368 They are not part of the document's character data; an XML processor MAY, but need not,
1369 make it possible for an application to retrieve the text of comments. For compatibility,
1370 the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity
1371 references MUST NOT be recognized within comments.
1372
1373 An example of a comment:
1374
1375 <!-- declarations for <head> & <body> -->
1376*/
1377
1378 value = "";
1379// Keep all the white space.
1380while (p && *p && !StringEqual( p, endTag, false, encoding ) )
1381{
1382value.append( p, 1 );
1383++p;
1384}
1385if ( p && *p )
1386p += strlen( endTag );
1387
1388return p;
1389}
1390
1391
1392const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1393{
1394p = SkipWhiteSpace( p, encoding );
1395if ( !p || !*p ) return 0;
1396
1397if ( data )
1398{
1399data->Stamp( p, encoding );
1400location = data->Cursor();
1401}
1402// Read the name, the '=' and the value.
1403const char* pErr = p;
1404p = ReadName( p, &name, encoding );
1405if ( !p || !*p )
1406{
1407if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1408return 0;
1409}
1410p = SkipWhiteSpace( p, encoding );
1411if ( !p || !*p || *p != '=' )
1412{
1413if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1414return 0;
1415}
1416
1417++p;// skip '='
1418p = SkipWhiteSpace( p, encoding );
1419if ( !p || !*p )
1420{
1421if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1422return 0;
1423}
1424
1425const char* end;
1426const char SINGLE_QUOTE = '\'';
1427const char DOUBLE_QUOTE = '\"';
1428
1429if ( *p == SINGLE_QUOTE )
1430{
1431++p;
1432end = "\'";// single quote in string
1433p = ReadText( p, &value, false, end, false, encoding );
1434}
1435else if ( *p == DOUBLE_QUOTE )
1436{
1437++p;
1438end = "\"";// double quote in string
1439p = ReadText( p, &value, false, end, false, encoding );
1440}
1441else
1442{
1443// All attribute values should be in single or double quotes.
1444// But this is such a common error that the parser will try
1445// its best, even without them.
1446value = "";
1447while ( p && *p// existence
1448&& !IsWhiteSpace( *p )// whitespace
1449&& *p != '/' && *p != '>' )// tag end
1450{
1451if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE ) {
1452// [ 1451649 ] Attribute values with trailing quotes not handled correctly
1453// We did not have an opening quote but seem to have a
1454// closing one. Give up and throw an error.
1455if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1456return 0;
1457}
1458value += *p;
1459++p;
1460}
1461}
1462return p;
1463}
1464
1465#ifdef TIXML_USE_STL
1466void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag )
1467{
1468while ( in->good() )
1469{
1470int c = in->peek();
1471if ( !cdata && (c == '<' ) )
1472{
1473return;
1474}
1475if ( c <= 0 )
1476{
1477TiXmlDocument* document = GetDocument();
1478if ( document )
1479document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1480return;
1481}
1482
1483(*tag) += (char) c;
1484in->get();// "commits" the peek made above
1485
1486if ( cdata && c == '>' && tag->size() >= 3 ) {
1487size_t len = tag->size();
1488if ( (*tag)[len-2] == ']' && (*tag)[len-3] == ']' ) {
1489// terminator of cdata.
1490return;
1491}
1492}
1493}
1494}
1495#endif
1496
1497const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1498{
1499value = "";
1500TiXmlDocument* document = GetDocument();
1501
1502if ( data )
1503{
1504data->Stamp( p, encoding );
1505location = data->Cursor();
1506}
1507
1508const char* const startTag = "<![CDATA[";
1509const char* const endTag = "]]>";
1510
1511if ( cdata || StringEqual( p, startTag, false, encoding ) )
1512{
1513cdata = true;
1514
1515if ( !StringEqual( p, startTag, false, encoding ) )
1516{
1517if ( document )
1518document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
1519return 0;
1520}
1521p += strlen( startTag );
1522
1523// Keep all the white space, ignore the encoding, etc.
1524while ( p && *p
1525&& !StringEqual( p, endTag, false, encoding )
1526 )
1527{
1528value += *p;
1529++p;
1530}
1531
1532TIXML_STRING dummy;
1533p = ReadText( p, &dummy, false, endTag, false, encoding );
1534return p;
1535}
1536else
1537{
1538bool ignoreWhite = true;
1539
1540const char* end = "<";
1541p = ReadText( p, &value, ignoreWhite, end, false, encoding );
1542if ( p && *p )
1543return p-1;// don't truncate the '<'
1544return 0;
1545}
1546}
1547
1548#ifdef TIXML_USE_STL
1549void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag )
1550{
1551while ( in->good() )
1552{
1553int c = in->get();
1554if ( c <= 0 )
1555{
1556TiXmlDocument* document = GetDocument();
1557if ( document )
1558document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1559return;
1560}
1561(*tag) += (char) c;
1562
1563if ( c == '>' )
1564{
1565// All is well.
1566return;
1567}
1568}
1569}
1570#endif
1571
1572const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
1573{
1574p = SkipWhiteSpace( p, _encoding );
1575// Find the beginning, find the end, and look for
1576// the stuff in-between.
1577TiXmlDocument* document = GetDocument();
1578if ( !p || !*p || !StringEqual( p, "<?xml", true, _encoding ) )
1579{
1580if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
1581return 0;
1582}
1583if ( data )
1584{
1585data->Stamp( p, _encoding );
1586location = data->Cursor();
1587}
1588p += 5;
1589
1590version = "";
1591encoding = "";
1592standalone = "";
1593
1594while ( p && *p )
1595{
1596if ( *p == '>' )
1597{
1598++p;
1599return p;
1600}
1601
1602p = SkipWhiteSpace( p, _encoding );
1603if ( StringEqual( p, "version", true, _encoding ) )
1604{
1605TiXmlAttribute attrib;
1606p = attrib.Parse( p, data, _encoding );
1607version = attrib.Value();
1608}
1609else if ( StringEqual( p, "encoding", true, _encoding ) )
1610{
1611TiXmlAttribute attrib;
1612p = attrib.Parse( p, data, _encoding );
1613encoding = attrib.Value();
1614}
1615else if ( StringEqual( p, "standalone", true, _encoding ) )
1616{
1617TiXmlAttribute attrib;
1618p = attrib.Parse( p, data, _encoding );
1619standalone = attrib.Value();
1620}
1621else
1622{
1623// Read over whatever it is.
1624while( p && *p && *p != '>' && !IsWhiteSpace( *p ) )
1625++p;
1626}
1627}
1628return 0;
1629}
1630
1631bool TiXmlText::Blank() const
1632{
1633for ( unsigned i=0; i<value.length(); i++ )
1634if ( !IsWhiteSpace( value[i] ) )
1635return false;
1636return true;
1637}
1638
1639

Archive Download this file

Revision: 1307