Chameleon

Chameleon Svn Source Tree

Root/branches/Chimera/package/bin/po4a/lib/Locale/Po4a/Xml.pm

1#!/usr/bin/perl
2
3# Po4a::Xml.pm
4#
5# extract and translate translatable strings from XML documents.
6#
7# This code extracts plain text from tags and attributes from generic
8# XML documents, and it can be used as a base to build modules for
9# XML-based documents.
10#
11# Copyright (c) 2004 by Jordi Vilalta <jvprat@gmail.com>
12# Copyright (c) 2008-2009 by Nicolas François <nicolas.francois@centraliens.net>
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc.,
27# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
28#
29########################################################################
30
31=encoding UTF-8
32
33=head1 NAME
34
35Locale::Po4a::Xml - convert XML documents and derivates from/to PO files
36
37=head1 DESCRIPTION
38
39The po4a (PO for anything) project goal is to ease translations (and more
40interestingly, the maintenance of translations) using gettext tools on
41areas where they were not expected like documentation.
42
43Locale::Po4a::Xml is a module to help the translation of XML documents into
44other [human] languages. It can also be used as a base to build modules for
45XML-based documents.
46
47=cut
48
49package Locale::Po4a::Xml;
50
51use 5.006;
52use strict;
53use warnings;
54
55require Exporter;
56use vars qw(@ISA @EXPORT);
57@ISA = qw(Locale::Po4a::TransTractor);
58@EXPORT = qw(new initialize @tag_types);
59
60use Locale::Po4a::TransTractor;
61use Locale::Po4a::Common;
62use Carp qw(croak);
63use File::Basename;
64use File::Spec;
65
66#It will mantain the path from the root tag to the current one
67my @path;
68
69#It will contain a list of external entities and their attached paths
70my %entities;
71
72my @comments;
73my %translate_options_cache;
74
75my $_shiftline_in_comment = 0;
76sub shiftline {
77 my $self = shift;
78 # call Transtractor's shiftline
79 my ($line,$ref) = $self->SUPER::shiftline();
80 return ($line,$ref) if (not defined $line);
81
82 if ($self->{options}{'includeexternal'}) {
83 my $tmp;
84
85 for my $k (keys %entities) {
86 if ($line =~ m/^(.*?)&$k;(.*)$/s) {
87 my ($before, $after) = ($1, $2);
88 my $linenum=0;
89 my @textentries;
90
91 $tmp = $before;
92 my $tmp_in_comment = 0;
93 if ($_shiftline_in_comment) {
94 if ($before =~ m/^.*?-->(.*)$/s) {
95 $tmp = $1;
96 $tmp_in_comment = 0;
97 } else {
98 $tmp_in_comment = 1;
99 }
100 }
101 if ($tmp_in_comment == 0) {
102 while ($tmp =~ m/^.*?<!--.*?-->(.*)$/s) {
103 $tmp = $1;
104 }
105 if ($tmp =~ m/<!--/s) {
106 $tmp_in_comment = 1;
107 }
108 }
109 next if ($tmp_in_comment);
110
111 open (my $in, $entities{$k})
112 or croak wrap_mod("po4a::xml",
113 dgettext("po4a", "Can't read from %s: %s"),
114 $entities{$k}, $!);
115 while (defined (my $textline = <$in>)) {
116 $linenum++;
117 my $textref=$entities{$k}.":$linenum";
118 push @textentries, ($textline,$textref);
119 }
120 close $in
121 or croak wrap_mod("po4a::xml",
122 dgettext("po4a", "Can't close %s after reading: %s"),
123 $entities{$k}, $!);
124
125 push @textentries, ($after, $ref);
126 $line = $before.(shift @textentries);
127 $ref .= " ".(shift @textentries);
128 $self->unshiftline(@textentries);
129 }
130 }
131
132 $tmp = $line;
133 if ($_shiftline_in_comment) {
134 if ($line =~ m/^.*?-->(.*)$/s) {
135 $tmp = $1;
136 $_shiftline_in_comment = 0;
137 } else {
138 $_shiftline_in_comment = 1;
139 }
140 }
141 if ($_shiftline_in_comment == 0) {
142 while ($tmp =~ m/^.*?<!--.*?-->(.*)$/s) {
143 $tmp = $1;
144 }
145 if ($tmp =~ m/<!--/s) {
146 $_shiftline_in_comment = 1;
147 }
148 }
149 }
150
151 return ($line,$ref);
152}
153
154sub read {
155my ($self,$filename)=@_;
156push @{$self->{DOCPOD}{infile}}, $filename;
157$self->Locale::Po4a::TransTractor::read($filename);
158}
159
160sub parse {
161my $self=shift;
162map {$self->parse_file($_)} @{$self->{DOCPOD}{infile}};
163}
164
165# @save_holders is a stack of references to ('paragraph', 'translation',
166# 'sub_translations', 'open', 'close', 'folded_attributes') hashes, where:
167# paragraph is a reference to an array (see paragraph in the
168# treat_content() subroutine) of strings followed by
169# references. It contains the @paragraph array as it was
170# before the processing was interrupted by a tag instroducing
171# a placeholder.
172# translation is the translation of this level up to now
173# sub_translations is a reference to an array of strings containing the
174# translations which must replace the placeholders.
175# open is the tag which opened the placeholder.
176# close is the tag which closed the placeholder.
177# folded_attributes is an hash of tags with their attributes (<tag attrs=...>
178# strings), referenced by the folded tag id, which should
179# replace the <tag po4a-id=id> strings in the current
180# translation.
181#
182# If @save_holders only has 1 holder, then we are not processing the
183# content of an holder, we are translating the document.
184my @save_holders;
185
186
187# If we are at the bottom of the stack and there is no <placeholder ...> in
188# the current translation, we can push the translation in the translated
189# document.
190# Otherwise, we keep the translation in the current holder.
191sub pushline {
192my ($self, $line) = (shift, shift);
193
194my $holder = $save_holders[$#save_holders];
195my $translation = $holder->{'translation'};
196$translation .= $line;
197
198while ( %{$holder->{folded_attributes}}
199 and $translation =~ m/^(.*)<([^>]+?)\s+po4a-id=([0-9]+)>(.*)$/s) {
200my $begin = $1;
201my $tag = $2;
202my $id = $3;
203my $end = $4;
204if (defined $holder->{folded_attributes}->{$id}) {
205# TODO: check if the tag is the same
206$translation = $begin.$holder->{folded_attributes}->{$id}.$end;
207delete $holder->{folded_attributes}->{$id};
208} else {
209# TODO: It will be hard to identify the location.
210# => find a way to retrieve the reference.
211die wrap_mod("po4a::xml", dgettext("po4a", "'po4a-id=%d' in the translation does not exist in the original string (or 'po4a-id=%d' used twice in the translation)."), $id, $id);
212}
213}
214# TODO: check that %folded_attributes is empty at some time
215# => in translate_paragraph?
216
217if ( ($#save_holders > 0)
218 or ($translation =~ m/<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>/s)) {
219$holder->{'translation'} = $translation;
220} else {
221$self->SUPER::pushline($translation);
222$holder->{'translation'} = '';
223}
224}
225
226=head1 TRANSLATING WITH PO4A::XML
227
228This module can be used directly to handle generic XML documents. This will
229extract all tag's content, and no attributes, since it's where the text is
230written in most XML based documents.
231
232There are some options (described in the next section) that can customize
233this behavior. If this doesn't fit to your document format you're encouraged
234to write your own module derived from this, to describe your format's details.
235See the section B<WRITING DERIVATE MODULES> below, for the process description.
236
237=cut
238
239#
240# Parse file and translate it
241#
242sub parse_file {
243my ($self,$filename) = @_;
244my $eof = 0;
245
246while (!$eof) {
247# We get all the text until the next breaking tag (not
248# inline) and translate it
249$eof = $self->treat_content;
250if (!$eof) {
251# And then we treat the following breaking tag
252$eof = $self->treat_tag;
253}
254}
255}
256
257=head1 OPTIONS ACCEPTED BY THIS MODULE
258
259The global debug option causes this module to show the excluded strings, in
260order to see if it skips something important.
261
262These are this module's particular options:
263
264=over 4
265
266=item B<nostrip>
267
268Prevents it to strip the spaces around the extracted strings.
269
270=item B<wrap>
271
272Canonizes the string to translate, considering that whitespaces are not
273important, and wraps the translated document. This option can be overridden
274by custom tag options. See the "tags" option below.
275
276=item B<caseinsensitive>
277
278It makes the tags and attributes searching to work in a case insensitive
279way. If it's defined, it will treat E<lt>BooKE<gt>laNG and E<lt>BOOKE<gt>Lang as E<lt>bookE<gt>lang.
280
281=item B<includeexternal>
282
283When defined, external entities are included in the generated (translated)
284document, and for the extraction of strings. If it's not defined, you
285will have to translate external entities separately as independent
286documents.
287
288=item B<ontagerror>
289
290This option defines the behavior of the module when it encounter a invalid
291XML syntax (a closing tag which does not match the last opening tag, or a
292tag's attribute without value).
293It can take the following values:
294
295=over
296
297=item I<fail>
298
299This is the default value.
300The module will exit with an error.
301
302=item I<warn>
303
304The module will continue, and will issue a warning.
305
306=item I<silent>
307
308The module will continue without any warnings.
309
310=back
311
312Be careful when using this option.
313It is generally recommended to fix the input file.
314
315=item B<tagsonly>
316
317Extracts only the specified tags in the "tags" option. Otherwise, it
318will extract all the tags except the ones specified.
319
320Note: This option is deprecated.
321
322=item B<doctype>
323
324String that will try to match with the first line of the document's doctype
325(if defined). If it doesn't, a warning will indicate that the document
326might be of a bad type.
327
328=item B<addlang>
329
330String indicating the path (e.g. E<lt>bbbE<gt>E<lt>aaaE<gt>) of a tag
331where a lang="..." attribute shall be added. The language will be defined
332as the basename of the PO file without any .po extension.
333
334=item B<tags>
335
336Space-separated list of tags you want to translate or skip. By default,
337the specified tags will be excluded, but if you use the "tagsonly" option,
338the specified tags will be the only ones included. The tags must be in the
339form E<lt>aaaE<gt>, but you can join some (E<lt>bbbE<gt>E<lt>aaaE<gt>) to say that the content of
340the tag E<lt>aaaE<gt> will only be translated when it's into a E<lt>bbbE<gt> tag.
341
342You can also specify some tag options putting some characters in front of
343the tag hierarchy. For example, you can put 'w' (wrap) or 'W' (don't wrap)
344to override the default behavior specified by the global "wrap" option.
345
346Example: WE<lt>chapterE<gt>E<lt>titleE<gt>
347
348Note: This option is deprecated.
349You should use the B<translated> and B<untranslated> options instead.
350
351=item B<attributes>
352
353Space-separated list of tag's attributes you want to translate. You can
354specify the attributes by their name (for example, "lang"), but you can
355prefix it with a tag hierarchy, to specify that this attribute will only be
356translated when it's into the specified tag. For example: E<lt>bbbE<gt>E<lt>aaaE<gt>lang
357specifies that the lang attribute will only be translated if it's into an
358E<lt>aaaE<gt> tag, and it's into a E<lt>bbbE<gt> tag.
359
360=item B<foldattributes>
361
362Do not translate attributes in inline tags.
363Instead, replace all attributes of a tag by po4a-id=<id>.
364
365This is useful when attributes shall not be translated, as this simplifies the
366strings for translators, and avoids typos.
367
368=item B<customtag>
369
370Space-separated list of tags which should not be treated as tags.
371These tags are treated as inline, and do not need to be closed.
372
373=item B<break>
374
375Space-separated list of tags which should break the sequence.
376By default, all tags break the sequence.
377
378The tags must be in the form <aaa>, but you can join some
379(<bbb><aaa>), if a tag (<aaa>) should only be considered
380when it's into another tag (<bbb>).
381
382=item B<inline>
383
384Space-separated list of tags which should be treated as inline.
385By default, all tags break the sequence.
386
387The tags must be in the form <aaa>, but you can join some
388(<bbb><aaa>), if a tag (<aaa>) should only be considered
389when it's into another tag (<bbb>).
390
391=item B<placeholder>
392
393Space-separated list of tags which should be treated as placeholders.
394Placeholders do not break the sequence, but the content of placeholders is
395translated separately.
396
397The location of the placeholder in its block will be marked with a string
398similar to:
399
400 <placeholder type=\"footnote\" id=\"0\"/>
401
402The tags must be in the form <aaa>, but you can join some
403(<bbb><aaa>), if a tag (<aaa>) should only be considered
404when it's into another tag (<bbb>).
405
406=item B<nodefault>
407
408Space separated list of tags that the module should not try to set by
409default in any category.
410
411=item B<cpp>
412
413Support C preprocessor directives.
414When this option is set, po4a will consider preprocessor directives as
415paragraph separators.
416This is important if the XML file must be preprocessed because otherwise
417the directives may be inserted in the middle of lines if po4a consider it
418belong to the current paragraph, and they won't be recognized by the
419preprocessor.
420Note: the preprocessor directives must only appear between tags
421(they must not break a tag).
422
423=item B<translated>
424
425Space-separated list of tags you want to translate.
426
427The tags must be in the form <aaa>, but you can join some
428(<bbb><aaa>), if a tag (<aaa>) should only be considered
429when it's into another tag (<bbb>).
430
431You can also specify some tag options putting some characters in front of
432the tag hierarchy. For example, you can put 'w' (wrap) or 'W' (don't wrap)
433to overide the default behavior specified by the global "wrap" option.
434
435Example: WE<lt>chapterE<gt>E<lt>titleE<gt>
436
437=item B<untranslated>
438
439Space-separated list of tags you do not want to translate.
440
441The tags must be in the form <aaa>, but you can join some
442(<bbb><aaa>), if a tag (<aaa>) should only be considered
443when it's into another tag (<bbb>).
444
445=item B<defaulttranslateoption>
446
447The default categories for tags that are not in any of the translated,
448untranslated, break, inline, or placeholder.
449
450This is a set of letters:
451
452=over
453
454=item I<w>
455
456Tags should be translated and content can be re-wrapped.
457
458=item I<W>
459
460Tags should be translated and content should not be re-wrapped.
461
462=item I<i>
463
464Tags should be translated inline.
465
466=item I<p>
467
468Tags should be translated as placeholders.
469
470=back
471
472=back
473
474=cut
475# TODO: defaulttranslateoption
476# w => indicate that it is only valid for translatable tags and do not
477# care about inline/break/placeholder?
478# ...
479
480sub initialize {
481my $self = shift;
482my %options = @_;
483
484# Reset the path
485@path = ();
486
487# Initialize the stack of holders
488my @paragraph = ();
489my @sub_translations = ();
490my %folded_attributes;
491my %holder = ('paragraph' => \@paragraph,
492 'translation' => "",
493 'sub_translations' => \@sub_translations,
494 'folded_attributes' => \%folded_attributes);
495@save_holders = (\%holder);
496
497$self->{options}{'addlang'}=0;
498$self->{options}{'nostrip'}=0;
499$self->{options}{'wrap'}=0;
500$self->{options}{'caseinsensitive'}=0;
501$self->{options}{'tagsonly'}=0;
502$self->{options}{'tags'}='';
503$self->{options}{'break'}='';
504$self->{options}{'translated'}='';
505$self->{options}{'untranslated'}='';
506$self->{options}{'defaulttranslateoption'}='';
507$self->{options}{'attributes'}='';
508$self->{options}{'foldattributes'}=0;
509$self->{options}{'inline'}='';
510$self->{options}{'placeholder'}='';
511$self->{options}{'customtag'}='';
512$self->{options}{'doctype'}='';
513$self->{options}{'nodefault'}='';
514$self->{options}{'includeexternal'}=0;
515$self->{options}{'ontagerror'}="fail";
516$self->{options}{'cpp'}=0;
517
518$self->{options}{'verbose'}='';
519$self->{options}{'debug'}='';
520
521foreach my $opt (keys %options) {
522if ($options{$opt}) {
523die wrap_mod("po4a::xml",
524dgettext("po4a", "Unknown option: %s"), $opt)
525unless exists $self->{options}{$opt};
526$self->{options}{$opt} = $options{$opt};
527}
528}
529# Default options set by modules. Forbidden for users.
530$self->{options}{'_default_translated'}='';
531$self->{options}{'_default_untranslated'}='';
532$self->{options}{'_default_break'}='';
533$self->{options}{'_default_inline'}='';
534$self->{options}{'_default_placeholder'}='';
535$self->{options}{'_default_attributes'}='';
536$self->{options}{'_default_customtag'}='';
537
538#It will maintain the list of the translatable tags
539$self->{tags}=();
540$self->{translated}=();
541$self->{untranslated}=();
542#It will maintain the list of the translatable attributes
543$self->{attributes}=();
544#It will maintain the list of the breaking tags
545$self->{break}=();
546#It will maintain the list of the inline tags
547$self->{inline}=();
548#It will maintain the list of the placeholder tags
549$self->{placeholder}=();
550#It will maintain the list of the customtag tags
551$self->{customtag}=();
552#list of the tags that must not be set in the tags or inline category
553#by this module or sub-module (unless specified in an option)
554$self->{nodefault}=();
555
556$self->treat_options;
557
558# Clear cache
559%translate_options_cache=();
560}
561
562=head1 WRITING DERIVATE MODULES
563
564=head2 DEFINE WHAT TAGS AND ATTRIBUTES TO TRANSLATE
565
566The simplest customization is to define which tags and attributes you want
567the parser to translate. This should be done in the initialize function.
568First you should call the main initialize, to get the command-line options,
569and then, append your custom definitions to the options hash. If you want
570to treat some new options from command line, you should define them before
571calling the main initialize:
572
573 $self->{options}{'new_option'}='';
574 $self->SUPER::initialize(%options);
575 $self->{options}{'_default_translated'}.=' <p> <head><title>';
576 $self->{options}{'attributes'}.=' <p>lang id';
577 $self->{options}{'_default_inline'}.=' <br>';
578 $self->treat_options;
579
580You should use the B<_default_inline>, B<_default_break>,
581B<_default_placeholder>, B<_default_translated>, B<_default_untranslated>,
582and B<_default_attributes> options in derivated modules. This allow users
583to override the default behavior defined in your module with command line
584options.
585
586=head2 OVERRIDING THE found_string FUNCTION
587
588Another simple step is to override the function "found_string", which
589receives the extracted strings from the parser, in order to translate them.
590There you can control which strings you want to translate, and perform
591transformations to them before or after the translation itself.
592
593It receives the extracted text, the reference on where it was, and a hash
594that contains extra information to control what strings to translate, how
595to translate them and to generate the comment.
596
597The content of these options depends on the kind of string it is (specified in an
598entry of this hash):
599
600=over
601
602=item type="tag"
603
604The found string is the content of a translatable tag. The entry "tag_options"
605contains the option characters in front of the tag hierarchy in the module
606"tags" option.
607
608=item type="attribute"
609
610Means that the found string is the value of a translatable attribute. The
611entry "attribute" has the name of the attribute.
612
613=back
614
615It must return the text that will replace the original in the translated
616document. Here's a basic example of this function:
617
618 sub found_string {
619 my ($self,$text,$ref,$options)=@_;
620 $text = $self->translate($text,$ref,"type ".$options->{'type'},
621 'wrap'=>$self->{options}{'wrap'});
622 return $text;
623 }
624
625There's another simple example in the new Dia module, which only filters
626some strings.
627
628=cut
629
630sub found_string {
631my ($self,$text,$ref,$options)=@_;
632
633if ($text =~ m/^\s*$/s) {
634return $text;
635}
636
637my $comment;
638my $wrap = $self->{options}{'wrap'};
639
640if ($options->{'type'} eq "tag") {
641$comment = "Content of: ".$self->get_path;
642
643if($options->{'tag_options'} =~ /w/) {
644$wrap = 1;
645}
646if($options->{'tag_options'} =~ /W/) {
647$wrap = 0;
648}
649} elsif ($options->{'type'} eq "attribute") {
650$comment = "Attribute '".$options->{'attribute'}."' of: ".$self->get_path;
651} elsif ($options->{'type'} eq "CDATA") {
652$comment = "CDATA";
653$wrap = 0;
654} else {
655die wrap_ref_mod($ref, "po4a::xml", dgettext("po4a", "Internal error: unknown type identifier '%s'."), $options->{'type'});
656}
657$text = $self->translate($text,$ref,$comment,'wrap'=>$wrap, comment => $options->{'comments'});
658return $text;
659}
660
661=head2 MODIFYING TAG TYPES (TODO)
662
663This is a more complex one, but it enables a (almost) total customization.
664It's based in a list of hashes, each one defining a tag type's behavior. The
665list should be sorted so that the most general tags are after the most
666concrete ones (sorted first by the beginning and then by the end keys). To
667define a tag type you'll have to make a hash with the following keys:
668
669=over 4
670
671=item B<beginning>
672
673Specifies the beginning of the tag, after the "E<lt>".
674
675=item B<end>
676
677Specifies the end of the tag, before the "E<gt>".
678
679=item B<breaking>
680
681It says if this is a breaking tag class. A non-breaking (inline) tag is one
682that can be taken as part of the content of another tag. It can take the
683values false (0), true (1) or undefined. If you leave this undefined, you'll
684have to define the f_breaking function that will say whether a concrete tag of
685this class is a breaking tag or not.
686
687=item B<f_breaking>
688
689It's a function that will tell if the next tag is a breaking one or not. It
690should be defined if the B<breaking> option is not.
691
692=item B<f_extract>
693
694If you leave this key undefined, the generic extraction function will have to
695extract the tag itself. It's useful for tags that can have other tags or
696special structures in them, so that the main parser doesn't get mad. This
697function receives a boolean that says if the tag should be removed from the
698input stream or not.
699
700=item B<f_translate>
701
702This function receives the tag (in the get_string_until() format) and returns
703the translated tag (translated attributes or all needed transformations) as a
704single string.
705
706=back
707
708=cut
709
710##### Generic XML tag types #####'
711
712our @tag_types = (
713{beginning=> "!--#",
714end=> "--",
715breaking=> 0,
716f_extract=> \&tag_extract_comment,
717f_translate=> \&tag_trans_comment},
718{beginning=> "!--",
719end=> "--",
720breaking=> 0,
721f_extract=> \&tag_extract_comment,
722f_translate=> \&tag_trans_comment},
723{beginning=> "?xml",
724end=> "?",
725breaking=> 1,
726f_translate=> \&tag_trans_xmlhead},
727{beginning=> "?",
728end=> "?",
729breaking=> 1,
730f_translate=> \&tag_trans_procins},
731{beginning=> "!DOCTYPE",
732end=> "",
733breaking=> 1,
734f_extract=> \&tag_extract_doctype,
735f_translate=> \&tag_trans_doctype},
736{beginning=> "![CDATA[",
737end=> "]]",
738breaking=> 1,
739f_extract=> \&CDATA_extract,
740f_translate=> \&CDATA_trans},
741{beginning=> "/",
742end=> "",
743f_breaking=> \&tag_break_close,
744f_translate=> \&tag_trans_close},
745{beginning=> "",
746end=> "/",
747f_breaking=> \&tag_break_alone,
748f_translate=> \&tag_trans_alone},
749{beginning=> "",
750end=> "",
751f_breaking=> \&tag_break_open,
752f_translate=> \&tag_trans_open}
753);
754
755sub tag_extract_comment {
756my ($self,$remove)=(shift,shift);
757my ($eof,@tag)=$self->get_string_until('-->',{include=>1,remove=>$remove});
758return ($eof,@tag);
759}
760
761sub tag_trans_comment {
762my ($self,@tag)=@_;
763return $self->join_lines(@tag);
764}
765
766sub tag_trans_xmlhead {
767my ($self,@tag)=@_;
768
769# We don't have to translate anything from here: throw away references
770my $tag = $self->join_lines(@tag);
771$tag =~ /encoding=(("|')|)(.*?)(\s|\2)/s;
772my $in_charset=$3;
773$self->detected_charset($in_charset);
774my $out_charset=$self->get_out_charset;
775
776if (defined $in_charset) {
777$tag =~ s/$in_charset/$out_charset/;
778} else {
779if ($tag =~ m/standalone/) {
780$tag =~ s/(standalone)/encoding="$out_charset" $1/;
781} else {
782$tag.= " encoding=\"$out_charset\"";
783}
784}
785
786return $tag;
787}
788
789sub tag_trans_procins {
790my ($self,@tag)=@_;
791return $self->join_lines(@tag);
792}
793
794sub tag_extract_doctype {
795my ($self,$remove)=(shift,shift);
796
797# Check if there is an internal subset (between []).
798my ($eof,@tag)=$self->get_string_until('>',{include=>1,unquoted=>1});
799my $parity = 0;
800my $paragraph = "";
801map { $parity = 1 - $parity; $paragraph.= $parity?$_:""; } @tag;
802my $found = 0;
803if ($paragraph =~ m/<.*\[.*</s) {
804$found = 1
805}
806
807if (not $found) {
808($eof,@tag)=$self->get_string_until('>',{include=>1,remove=>$remove,unquoted=>1});
809} else {
810($eof,@tag)=$self->get_string_until(']\s*>',{include=>1,remove=>$remove,unquoted=>1,regex=>1});
811}
812return ($eof,@tag);
813}
814
815sub tag_trans_doctype {
816# This check is not really reliable. There are system and public
817# identifiers. Only the public one could be checked reliably.
818my ($self,@tag)=@_;
819if (defined $self->{options}{'doctype'} ) {
820my $doctype = $self->{options}{'doctype'};
821if ( $tag[0] !~ /\Q$doctype\E/i ) {
822warn wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Bad document type. '%s' expected. You can fix this warning with a -o doctype option, or ignore this check with -o doctype=\"\"."), $doctype);
823}
824}
825my $i = 0;
826my $basedir = $tag[1];
827$basedir =~ s/:[0-9]+$//;
828$basedir = dirname($basedir);
829
830while ( $i < $#tag ) {
831my $t = $tag[$i];
832my $ref = $tag[$i+1];
833if ( $t =~ /^(\s*<!ENTITY\s+)(.*)$/is ) {
834my $part1 = $1;
835my $part2 = $2;
836my $includenow = 0;
837my $file = 0;
838my $name = "";
839if ($part2 =~ /^(%\s+)(.*)$/s ) {
840$part1.= $1;
841$part2 = $2;
842$includenow = 1;
843}
844$part2 =~ /^(\S+)(\s+)(.*)$/s;
845$name = $1;
846$part1.= $1.$2;
847$part2 = $3;
848if ( $part2 =~ /^(SYSTEM\s+)(.*)$/is ) {
849$part1.= $1;
850$part2 = $2;
851$file = 1;
852if ($self->{options}{'includeexternal'}) {
853$entities{$name} = $part2;
854$entities{$name} =~ s/^"?(.*?)".*$/$1/s;
855$entities{$name} = File::Spec->catfile($basedir, $entities{$name});
856}
857}
858if ((not $file) and (not $includenow)) {
859 if ($part2 =~ m/^\s*(["'])(.*)\1(\s*>.*)$/s) {
860my $comment = "Content of the $name entity";
861my $quote = $1;
862my $text = $2;
863$part2 = $3;
864$text = $self->translate($text,
865 $ref,
866 $comment,
867 'wrap'=>1);
868$t = $part1."$quote$text$quote$part2";
869 }
870}
871#print $part1."\n";
872#print $name."\n";
873#print $part2."\n";
874}
875$tag[$i] = $t;
876$i += 2;
877}
878return $self->join_lines(@tag);
879}
880
881sub tag_break_close {
882my ($self,@tag)=@_;
883my $struct = $self->get_path;
884my $options = $self->get_translate_options($struct);
885if ($options =~ m/[ip]/) {
886return 0;
887} else {
888return 1;
889}
890}
891
892sub tag_trans_close {
893my ($self,@tag)=@_;
894my $name = $self->get_tag_name(@tag);
895
896my $test = pop @path;
897if (!defined($test) || $test ne $name ) {
898my $ontagerror = $self->{options}{'ontagerror'};
899if ($ontagerror eq "warn") {
900warn wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong. Continuing..."), $name);
901} elsif ($ontagerror ne "silent") {
902die wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong."), $name);
903}
904}
905return $self->join_lines(@tag);
906}
907
908sub CDATA_extract {
909my ($self,$remove)=(shift,shift);
910 my ($eof, @tag) = $self->get_string_until(']]>',{include=>1,unquoted=>0,remove=>$remove});
911
912return ($eof, @tag);
913}
914
915sub CDATA_trans {
916my ($self,@tag)=@_;
917return $self->found_string($self->join_lines(@tag),
918 $tag[1],
919 {'type' => "CDATA"});
920}
921
922sub tag_break_alone {
923my ($self,@tag)=@_;
924my $struct = $self->get_path($self->get_tag_name(@tag));
925if ($self->get_translate_options($struct) =~ m/i/) {
926return 0;
927} else {
928return 1;
929}
930}
931
932sub tag_trans_alone {
933my ($self,@tag)=@_;
934my $name = $self->get_tag_name(@tag);
935push @path, $name;
936
937$name = $self->treat_attributes(@tag);
938
939pop @path;
940return $name;
941}
942
943sub tag_break_open {
944my ($self,@tag)=@_;
945my $struct = $self->get_path($self->get_tag_name(@tag));
946my $options = $self->get_translate_options($struct);
947if ($options =~ m/[ip]/) {
948return 0;
949} else {
950return 1;
951}
952}
953
954sub tag_trans_open {
955my ($self,@tag)=@_;
956my $name = $self->get_tag_name(@tag);
957push @path, $name;
958
959$name = $self->treat_attributes(@tag);
960
961if (defined $self->{options}{'addlang'}) {
962my $struct = $self->get_path();
963if ($struct eq $self->{options}{'addlang'}) {
964$name .= ' lang="'.$self->{TT}{po_in}->{lang}.'"';
965}
966}
967
968return $name;
969}
970
971##### END of Generic XML tag types #####
972
973=head1 INTERNAL FUNCTIONS used to write derivated parsers
974
975=head2 WORKING WITH TAGS
976
977=over 4
978
979=item get_path()
980
981This function returns the path to the current tag from the document's root,
982in the form E<lt>htmlE<gt>E<lt>bodyE<gt>E<lt>pE<gt>.
983
984An additional array of tags (without brackets) can be passed as argument.
985These path elements are added to the end of the current path.
986
987=cut
988
989sub get_path {
990my $self = shift;
991my @add = @_;
992if ( @path > 0 or @add > 0 ) {
993return "<".join("><",@path,@add).">";
994} else {
995return "outside any tag (error?)";
996}
997}
998
999=item tag_type()
1000
1001This function returns the index from the tag_types list that fits to the next
1002tag in the input stream, or -1 if it's at the end of the input file.
1003
1004=cut
1005
1006sub tag_type {
1007my $self = shift;
1008my ($line,$ref) = $self->shiftline();
1009my ($match1,$match2);
1010my $found = 0;
1011my $i = 0;
1012
1013if (!defined($line)) { return -1; }
1014
1015$self->unshiftline($line,$ref);
1016my ($eof,@lines) = $self->get_string_until(">",{include=>1,unquoted=>1});
1017my $line2 = $self->join_lines(@lines);
1018while (!$found && $i < @tag_types) {
1019($match1,$match2) = ($tag_types[$i]->{beginning},$tag_types[$i]->{end});
1020if ($line =~ /^<\Q$match1\E/) {
1021if (!defined($tag_types[$i]->{f_extract})) {
1022#print substr($line2,length($line2)-1-length($match2),1+length($match2))."\n";
1023if (defined($line2) and $line2 =~ /\Q$match2\E>$/) {
1024$found = 1;
1025#print "YES: <".$match1." ".$match2.">\n";
1026} else {
1027#print "NO: <".$match1." ".$match2.">\n";
1028$i++;
1029}
1030} else {
1031$found = 1;
1032}
1033} else {
1034$i++;
1035}
1036}
1037if (!$found) {
1038#It should never enter here, unless you undefine the most
1039#general tags (as <...>)
1040die "po4a::xml: Unknown tag type: ".$line."\n";
1041} else {
1042return $i;
1043}
1044}
1045
1046=item extract_tag($$)
1047
1048This function returns the next tag from the input stream without the beginning
1049and end, in an array form, to maintain the references from the input file. It
1050has two parameters: the type of the tag (as returned by tag_type) and a
1051boolean, that indicates if it should be removed from the input stream.
1052
1053=cut
1054
1055sub extract_tag {
1056my ($self,$type,$remove) = (shift,shift,shift);
1057my ($match1,$match2) = ($tag_types[$type]->{beginning},$tag_types[$type]->{end});
1058my ($eof,@tag);
1059if (defined($tag_types[$type]->{f_extract})) {
1060($eof,@tag) = &{$tag_types[$type]->{f_extract}}($self,$remove);
1061} else {
1062($eof,@tag) = $self->get_string_until($match2.">",{include=>1,remove=>$remove,unquoted=>1});
1063}
1064$tag[0] =~ /^<\Q$match1\E(.*)$/s;
1065$tag[0] = $1;
1066$tag[$#tag-1] =~ /^(.*)\Q$match2\E>$/s;
1067$tag[$#tag-1] = $1;
1068return ($eof,@tag);
1069}
1070
1071=item get_tag_name(@)
1072
1073This function returns the name of the tag passed as an argument, in the array
1074form returned by extract_tag.
1075
1076=cut
1077
1078sub get_tag_name {
1079my ($self,@tag)=@_;
1080$tag[0] =~ /^(\S*)/;
1081return $1;
1082}
1083
1084=item breaking_tag()
1085
1086This function returns a boolean that says if the next tag in the input stream
1087is a breaking tag or not (inline tag). It leaves the input stream intact.
1088
1089=cut
1090
1091sub breaking_tag {
1092my $self = shift;
1093my $break;
1094
1095my $type = $self->tag_type;
1096if ($type == -1) { return 0; }
1097
1098#print "TAG TYPE = ".$type."\n";
1099$break = $tag_types[$type]->{breaking};
1100if (!defined($break)) {
1101# This tag's breaking depends on its content
1102my ($eof,@lines) = $self->extract_tag($type,0);
1103$break = &{$tag_types[$type]->{f_breaking}}($self,@lines);
1104}
1105#print "break = ".$break."\n";
1106return $break;
1107}
1108
1109=item treat_tag()
1110
1111This function translates the next tag from the input stream. Using each
1112tag type's custom translation functions.
1113
1114=cut
1115
1116sub treat_tag {
1117my $self = shift;
1118my $type = $self->tag_type;
1119
1120my ($match1,$match2) = ($tag_types[$type]->{beginning},$tag_types[$type]->{end});
1121my ($eof,@lines) = $self->extract_tag($type,1);
1122
1123$lines[0] =~ /^(\s*)(.*)$/s;
1124my $space1 = $1;
1125$lines[0] = $2;
1126$lines[$#lines-1] =~ /^(.*?)(\s*)$/s;
1127my $space2 = $2;
1128$lines[$#lines-1] = $1;
1129
1130# Calling this tag type's specific handling (translation of
1131# attributes...)
1132my $line = &{$tag_types[$type]->{f_translate}}($self,@lines);
1133$self->pushline("<".$match1.$space1.$line.$space2.$match2.">");
1134return $eof;
1135}
1136
1137=item tag_in_list($@)
1138
1139This function returns a string value that says if the first argument (a tag
1140hierarchy) matches any of the tags from the second argument (a list of tags
1141or tag hierarchies). If it doesn't match, it returns 0. Else, it returns the
1142matched tag's options (the characters in front of the tag) or 1 (if that tag
1143doesn't have options).
1144
1145=back
1146
1147=cut
1148sub tag_in_list ($$$) {
1149my ($self,$path,$list) = @_;
1150if ($self->{options}{'caseinsensitive'}) {
1151$path = lc $path;
1152}
1153
1154while (1) {
1155if (defined $list->{$path}) {
1156if (length $list->{$path}) {
1157return $list->{$path};
1158} else {
1159return 1;
1160}
1161}
1162last unless ($path =~ m/</);
1163$path =~ s/^<.*?>//;
1164}
1165
1166return 0;
1167}
1168
1169=head2 WORKING WITH ATTRIBUTES
1170
1171=over 4
1172
1173=item treat_attributes(@)
1174
1175This function handles the translation of the tags' attributes. It receives the tag
1176without the beginning / end marks, and then it finds the attributes, and it
1177translates the translatable ones (specified by the module option "attributes").
1178This returns a plain string with the translated tag.
1179
1180=back
1181
1182=cut
1183
1184sub treat_attributes {
1185my ($self,@tag)=@_;
1186
1187$tag[0] =~ /^(\S*)(.*)/s;
1188my $text = $1;
1189$tag[0] = $2;
1190
1191while (@tag) {
1192my $complete = 1;
1193
1194$text .= $self->skip_spaces(\@tag);
1195if (@tag) {
1196# Get the attribute's name
1197$complete = 0;
1198
1199$tag[0] =~ /^([^\s=]+)(.*)/s;
1200my $name = $1;
1201my $ref = $tag[1];
1202$tag[0] = $2;
1203$text .= $name;
1204$text .= $self->skip_spaces(\@tag);
1205if (@tag) {
1206# Get the '='
1207if ($tag[0] =~ /^=(.*)/s) {
1208$tag[0] = $1;
1209$text .= "=";
1210$text .= $self->skip_spaces(\@tag);
1211if (@tag) {
1212# Get the value
1213my $value="";
1214$ref=$tag[1];
1215my $quot=substr($tag[0],0,1);
1216if ($quot ne "\"" and $quot ne "'") {
1217# Unquoted value
1218$quot="";
1219$tag[0] =~ /^(\S+)(.*)/s;
1220$value = $1;
1221$tag[0] = $2;
1222} else {
1223# Quoted value
1224$text .= $quot;
1225$tag[0] =~ /^\Q$quot\E(.*)/s;
1226$tag[0] = $1;
1227while ($tag[0] !~ /\Q$quot\E/) {
1228$value .= $tag[0];
1229shift @tag;
1230shift @tag;
1231}
1232$tag[0] =~ /^(.*?)\Q$quot\E(.*)/s;
1233$value .= $1;
1234$tag[0] = $2;
1235}
1236$complete = 1;
1237if ($self->tag_in_list($self->get_path.$name,$self->{attributes})) {
1238$text .= $self->found_string($value, $ref, { type=>"attribute", attribute=>$name });
1239} else {
1240print wrap_ref_mod($ref, "po4a::xml", dgettext("po4a", "Content of attribute %s excluded: %s"), $self->get_path.$name, $value)
1241 if $self->debug();
1242$text .= $self->recode_skipped_text($value);
1243}
1244$text .= $quot;
1245}
1246}
1247}
1248
1249unless ($complete) {
1250my $ontagerror = $self->{options}{'ontagerror'};
1251if ($ontagerror eq "warn") {
1252warn wrap_ref_mod($ref, "po4a::xml", dgettext ("po4a", "Bad attribute syntax. Continuing..."));
1253} elsif ($ontagerror ne "silent") {
1254die wrap_ref_mod($ref, "po4a::xml", dgettext ("po4a", "Bad attribute syntax"));
1255}
1256}
1257}
1258}
1259return $text;
1260}
1261
1262# Returns an empty string if the content in the $path should not be
1263# translated.
1264#
1265# Otherwise, returns the set of options for translation:
1266# w: the content shall be re-wrapped
1267# W: the content shall not be re-wrapped
1268# i: the tag shall be inlined
1269# p: a placeholder shall replace the tag (and its content)
1270# n: a custom tag
1271#
1272# A translatable inline tag in an untranslated tag is treated as a translatable breaking tag.
1273sub get_translate_options {
1274my $self = shift;
1275my $path = shift;
1276
1277if (defined $translate_options_cache{$path}) {
1278return $translate_options_cache{$path};
1279}
1280
1281my $options = "";
1282my $translate = 0;
1283my $usedefault = 1;
1284
1285my $inlist = 0;
1286my $tag = $self->get_tag_from_list($path, $self->{tags});
1287if (defined $tag) {
1288$inlist = 1;
1289}
1290if ($self->{options}{'tagsonly'} eq $inlist) {
1291$usedefault = 0;
1292if (defined $tag) {
1293$options = $tag;
1294$options =~ s/<.*$//;
1295} else {
1296if ($self->{options}{'wrap'}) {
1297$options = "w";
1298} else {
1299$options = "W";
1300}
1301}
1302$translate = 1;
1303}
1304
1305# TODO: a less precise set of tags should not override a more precise one
1306# The tags and tagsonly options are deprecated.
1307# The translated and untranslated options have an higher priority.
1308$tag = $self->get_tag_from_list($path, $self->{translated});
1309if (defined $tag) {
1310$usedefault = 0;
1311$options = $tag;
1312$options =~ s/<.*$//;
1313$translate = 1;
1314}
1315
1316if ($translate and $options !~ m/w/i) {
1317$options .= ($self->{options}{'wrap'})?"w":"W";
1318}
1319
1320if (not defined $tag) {
1321$tag = $self->get_tag_from_list($path, $self->{untranslated});
1322if (defined $tag) {
1323$usedefault = 0;
1324$options = "";
1325$translate = 0;
1326}
1327}
1328
1329$tag = $self->get_tag_from_list($path, $self->{inline});
1330if (defined $tag) {
1331$usedefault = 0;
1332$options .= "i";
1333} else {
1334$tag = $self->get_tag_from_list($path, $self->{placeholder});
1335if (defined $tag) {
1336$usedefault = 0;
1337$options .= "p";
1338}
1339}
1340
1341$tag = $self->get_tag_from_list($path, $self->{customtag});
1342if (defined $tag) {
1343$usedefault = 0;
1344$options = "in"; # This erase any other setting
1345}
1346
1347if ($usedefault) {
1348$options = $self->{options}{'defaulttranslateoption'};
1349}
1350
1351# A translatable inline tag in an untranslated tag is treated as a
1352# translatable breaking tag.
1353if ($options =~ m/i/) {
1354my $ppath = $path;
1355$ppath =~ s/<[^>]*>$//;
1356my $poptions = $self->get_translate_options ($ppath);
1357if ($poptions eq "") {
1358$options =~ s/i//;
1359}
1360}
1361
1362if ($options =~ m/i/ and $self->{options}{'foldattributes'}) {
1363$options .= "f";
1364}
1365
1366$translate_options_cache{$path} = $options;
1367return $options;
1368}
1369
1370
1371# Return the tag (or biggest set of tags) of a list which matches with the
1372# given path.
1373#
1374# The tag (or set of tags) is returned with its options.
1375#
1376# If no tags could match the path, undef is returned.
1377sub get_tag_from_list ($$$) {
1378my ($self,$path,$list) = @_;
1379if ($self->{options}{'caseinsensitive'}) {
1380$path = lc $path;
1381}
1382
1383while (1) {
1384if (defined $list->{$path}) {
1385return $list->{$path}.$path;
1386}
1387last unless ($path =~ m/</);
1388$path =~ s/^<.*?>//;
1389}
1390
1391return undef;
1392}
1393
1394
1395
1396sub treat_content {
1397my $self = shift;
1398my $blank="";
1399# Indicates if the paragraph will have to be translated
1400my $translate = "";
1401
1402my ($eof,@paragraph)=$self->get_string_until('<',{remove=>1});
1403
1404while (!$eof and !$self->breaking_tag) {
1405NEXT_TAG:
1406my @text;
1407my $type = $self->tag_type;
1408my $f_extract = $tag_types[$type]->{'f_extract'};
1409if ( defined($f_extract)
1410 and $f_extract eq \&tag_extract_comment) {
1411# Remove the content of the comments
1412($eof, @text) = $self->extract_tag($type,1);
1413$text[$#text-1] .= "\0";
1414if ($tag_types[$type]->{'beginning'} eq "!--#") {
1415$text[0] = "#".$text[0];
1416}
1417push @comments, @text;
1418} else {
1419my ($tmpeof, @tag) = $self->extract_tag($type,0);
1420# Append the found inline tag
1421($eof,@text)=$self->get_string_until('>',
1422 {include=>1,
1423 remove=>1,
1424 unquoted=>1});
1425# Append or remove the opening/closing tag from
1426# the tag path
1427if ($tag_types[$type]->{'end'} eq "") {
1428if ($tag_types[$type]->{'beginning'} eq "") {
1429# Opening inline tag
1430my $cur_tag_name = $self->get_tag_name(@tag);
1431my $t_opts = $self->get_translate_options($self->get_path($cur_tag_name));
1432if ($t_opts =~ m/p/) {
1433# We enter a new holder.
1434# Append a <placeholder ...> tag to the current
1435# paragraph, and save the @paragraph in the
1436# current holder.
1437my $last_holder = $save_holders[$#save_holders];
1438my $placeholder_str = "<placeholder type=\"".$cur_tag_name."\" id=\"".($#{$last_holder->{'sub_translations'}}+1)."\"/>";
1439push @paragraph, ($placeholder_str, $text[1]);
1440my @saved_paragraph = @paragraph;
1441
1442$last_holder->{'paragraph'} = \@saved_paragraph;
1443
1444# Then we must push a new holder
1445my @new_paragraph = ();
1446my @sub_translations = ();
1447my %folded_attributes;
1448my %new_holder = ('paragraph' => \@new_paragraph,
1449 'open' => $self->join_lines(@text),
1450 'translation' => "",
1451 'close' => undef,
1452 'sub_translations' => \@sub_translations,
1453 'folded_attributes' => \%folded_attributes);
1454push @save_holders, \%new_holder;
1455@text = ();
1456
1457# The current @paragraph
1458# (for the current holder)
1459# is empty.
1460@paragraph = ();
1461} elsif ($t_opts =~ m/f/) {
1462my $tag_full = $self->join_lines(@text);
1463my $tag_ref = $text[1];
1464if ($tag_full =~ m/^<\s*\S+\s+\S.*>$/s) {
1465my $holder = $save_holders[$#save_holders];
1466my $id = 0;
1467foreach (keys %{$holder->{folded_attributes}}) {
1468$id = $_ + 1 if ($_ >= $id);
1469}
1470$holder->{folded_attributes}->{$id} = $tag_full;
1471
1472@text = ("<$cur_tag_name po4a-id=$id>", $tag_ref);
1473}
1474}
1475unless ($t_opts =~ m/n/) {
1476push @path, $cur_tag_name;
1477}
1478} elsif ($tag_types[$type]->{'beginning'} eq "/") {
1479# Closing inline tag
1480
1481# Check if this is closing the
1482# last opening tag we detected.
1483my $test = pop @path;
1484my $name = $self->get_tag_name(@tag);
1485if (!defined($test) ||
1486 $test ne $name ) {
1487my $ontagerror = $self->{options}{'ontagerror'};
1488if ($ontagerror eq "warn") {
1489warn wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong. Continuing..."), $name);
1490} elsif ($ontagerror ne "silent") {
1491die wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong."), $name);
1492}
1493}
1494
1495if ($self->get_translate_options($self->get_path($self->get_tag_name(@tag))) =~ m/p/) {
1496# This closes the current holder.
1497
1498push @path, $self->get_tag_name(@tag);
1499# Now translate this paragraph if needed.
1500# This will call pushline and append the
1501# translation to the current holder's translation.
1502$self->translate_paragraph(@paragraph);
1503pop @path;
1504
1505# Now that this holder is closed, we can remove
1506# the holder from the stack.
1507my $holder = pop @save_holders;
1508# We need to keep the translation of this holder
1509my $translation = $holder->{'open'}.$holder->{'translation'};
1510$translation .= $self->join_lines(@text);
1511
1512@text = ();
1513
1514# Then we store the translation in the previous
1515# holder's sub_translations array
1516my $previous_holder = $save_holders[$#save_holders];
1517push @{$previous_holder->{'sub_translations'}}, $translation;
1518# We also need to restore the @paragraph array, as
1519# it was before we encountered the holder.
1520@paragraph = @{$previous_holder->{'paragraph'}};
1521}
1522}
1523}
1524push @paragraph, @text;
1525}
1526
1527# Next tag
1528($eof,@text)=$self->get_string_until('<',{remove=>1});
1529if ($#text > 0) {
1530# Check if text (extracted after the inline tag)
1531# has to be translated
1532push @paragraph, @text;
1533}
1534}
1535
1536# This strips the extracted strings
1537# (only if you don't specify the 'nostrip' option, and if the
1538# paragraph can be re-wrapped)
1539$translate = $self->get_translate_options($self->get_path);
1540if (!$self->{options}{'nostrip'} and $translate !~ m/W/) {
1541my $clean = 0;
1542# Clean the beginning
1543while (!$clean and $#paragraph > 0) {
1544$paragraph[0] =~ /^(\s*)(.*)/s;
1545my $match = $1;
1546if ($paragraph[0] eq $match) {
1547if ($match ne "") {
1548$self->pushline($match);
1549}
1550shift @paragraph;
1551shift @paragraph;
1552} else {
1553$paragraph[0] = $2;
1554if ($match ne "") {
1555$self->pushline($match);
1556}
1557$clean = 1;
1558}
1559}
1560$clean = 0;
1561# Clean the end
1562while (!$clean and $#paragraph > 0) {
1563$paragraph[$#paragraph-1] =~ /^(.*?)(\s*)$/s;
1564my $match = $2;
1565if ($paragraph[$#paragraph-1] eq $match) {
1566if ($match ne "") {
1567$blank = $match.$blank;
1568}
1569pop @paragraph;
1570pop @paragraph;
1571} else {
1572$paragraph[$#paragraph-1] = $1;
1573if ($match ne "") {
1574$blank = $match.$blank;
1575}
1576$clean = 1;
1577}
1578}
1579}
1580
1581# Translate the string when needed
1582# This will either push the translation in the translated document or
1583# in the current holder translation.
1584$self->translate_paragraph(@paragraph);
1585
1586# Push the trailing blanks
1587if ($blank ne "") {
1588$self->pushline($blank);
1589}
1590return $eof;
1591}
1592
1593# Translate a @paragraph array of (string, reference).
1594# The $translate argument indicates if the strings must be translated or
1595# just pushed
1596sub translate_paragraph {
1597my $self = shift;
1598my @paragraph = @_;
1599my $translate = $self->get_translate_options($self->get_path);
1600
1601while ( (scalar @paragraph)
1602 and ($paragraph[0] =~ m/^\s*\n/s)) {
1603$self->pushline($paragraph[0]);
1604shift @paragraph;
1605shift @paragraph;
1606}
1607
1608my $comments;
1609while (@comments) {
1610my ($comment,$eoc);
1611do {
1612my ($t,$l) = (shift @comments, shift @comments);
1613$t =~ s/\n?(\0)?$//;
1614$eoc = $1;
1615$comment .= "\n" if defined $comment;
1616$comment .= $t;
1617} until ($eoc);
1618$comments .= "\n" if defined $comments;
1619$comments .= $comment;
1620$self->pushline("<!--".$comment."-->\n") if defined $comment;
1621}
1622@comments = ();
1623
1624if ($self->{options}{'cpp'}) {
1625my @tmp = @paragraph;
1626@paragraph = ();
1627while (@tmp) {
1628my ($t,$l) = (shift @tmp, shift @tmp);
1629# #include can be followed by a filename between
1630# <> brackets. In that case, the argument won't be
1631# handled in the same call to translate_paragraph.
1632# Thus do not try to match "include ".
1633if ($t =~ m/^#[ \t]*(if |endif|undef |include|else|ifdef |ifndef |define )/si) {
1634if (@paragraph) {
1635$self->translate_paragraph(@paragraph);
1636@paragraph = ();
1637$self->pushline("\n");
1638}
1639$self->pushline($t);
1640} else {
1641push @paragraph, ($t,$l);
1642}
1643}
1644}
1645
1646my $para = $self->join_lines(@paragraph);
1647if ( length($para) > 0 ) {
1648if ($translate ne "") {
1649# This tag should be translated
1650$self->pushline($self->found_string(
1651$para,
1652$paragraph[1], {
1653type=>"tag",
1654tag_options=>$translate,
1655comments=>$comments
1656}));
1657} else {
1658# Inform that this tag isn't translated in debug mode
1659print wrap_ref_mod($paragraph[1], "po4a::xml", dgettext ("po4a", "Content of tag %s excluded: %s"), $self->get_path, $para)
1660 if $self->debug();
1661$self->pushline($self->recode_skipped_text($para));
1662}
1663}
1664# Now the paragraph is fully translated.
1665# If we have all the holders' translation, we can replace the
1666# placeholders by their translations.
1667# We must wait to have all the translations because the holders are
1668# numbered.
1669{
1670my $holder = $save_holders[$#save_holders];
1671my $translation = $holder->{'translation'};
1672
1673# Count the number of <placeholder ...> in $translation
1674my $count = 0;
1675my $str = $translation;
1676while ( (defined $str)
1677 and ($str =~ m/^.*?<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>(.*)$/s)) {
1678$count += 1;
1679$str = $2;
1680if ($holder->{'sub_translations'}->[$1] =~ m/<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>/s) {
1681$count = -1;
1682last;
1683}
1684}
1685
1686if ( (defined $translation)
1687 and (scalar(@{$holder->{'sub_translations'}}) == $count)) {
1688# OK, all the holders of the current paragraph are
1689# closed (and translated).
1690# Replace them by their translation.
1691while ($translation =~ m/^(.*?)<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>(.*)$/s) {
1692# FIXME: we could also check that
1693# * the holder exists
1694# * all the holders are used
1695$translation = $1.$holder->{'sub_translations'}->[$2].$3;
1696}
1697# We have our translation
1698$holder->{'translation'} = $translation;
1699# And there is no need for any holder in it.
1700my @sub_translations = ();
1701$holder->{'sub_translations'} = \@sub_translations;
1702}
1703}
1704
1705}
1706
1707
1708
1709=head2 WORKING WITH THE MODULE OPTIONS
1710
1711=over 4
1712
1713=item treat_options()
1714
1715This function fills the internal structures that contain the tags, attributes
1716and inline data with the options of the module (specified in the command-line
1717or in the initialize function).
1718
1719=back
1720
1721=cut
1722
1723sub treat_options {
1724my $self = shift;
1725
1726if ($self->{options}{'caseinsensitive'}) {
1727$self->{options}{'nodefault'} = lc $self->{options}{'nodefault'};
1728$self->{options}{'tags'} = lc $self->{options}{'tags'};
1729$self->{options}{'break'} = lc $self->{options}{'break'};
1730$self->{options}{'_default_break'} = lc $self->{options}{'_default_break'};
1731$self->{options}{'translated'} = lc $self->{options}{'translated'};
1732$self->{options}{'_default_translated'} = lc $self->{options}{'_default_translated'};
1733$self->{options}{'untranslated'} = lc $self->{options}{'untranslated'};
1734$self->{options}{'_default_untranslated'} = lc $self->{options}{'_default_untranslated'};
1735$self->{options}{'attributes'} = lc $self->{options}{'attributes'};
1736$self->{options}{'_default_attributes'} = lc $self->{options}{'_default_attributes'};
1737$self->{options}{'inline'} = lc $self->{options}{'inline'};
1738$self->{options}{'_default_inline'} = lc $self->{options}{'_default_inline'};
1739$self->{options}{'placeholder'} = lc $self->{options}{'placeholder'};
1740$self->{options}{'_default_placeholder'} = lc $self->{options}{'_default_placeholder'};
1741$self->{options}{'customtag'} = lc $self->{options}{'customtag'};
1742$self->{options}{'_default_customtag'} = lc $self->{options}{'_default_customtag'};
1743}
1744
1745$self->{options}{'nodefault'} =~ /^\s*(.*)\s*$/s;
1746my %list_nodefault;
1747foreach (split(/\s+/s,$1)) {
1748$list_nodefault{$_} = 1;
1749}
1750$self->{nodefault} = \%list_nodefault;
1751
1752$self->{options}{'tags'} =~ /^\s*(.*)\s*$/s;
1753if (length $self->{options}{'tags'}) {
1754warn wrap_mod("po4a::xml",
1755 dgettext("po4a",
1756 "The '%s' option is deprecated. Please use the translated/untranslated and/or break/inline/placeholder categories."), "tags");
1757}
1758foreach (split(/\s+/s,$1)) {
1759$_ =~ m/^(.*?)(<.*)$/;
1760$self->{tags}->{$2} = $1 || "";
1761}
1762
1763if ($self->{options}{'tagsonly'}) {
1764warn wrap_mod("po4a::xml",
1765 dgettext("po4a",
1766 "The '%s' option is deprecated. Please use the translated/untranslated and/or break/inline/placeholder categories."), "tagsonly");
1767}
1768
1769$self->{options}{'break'} =~ /^\s*(.*)\s*$/s;
1770foreach my $tag (split(/\s+/s,$1)) {
1771$tag =~ m/^(.*?)(<.*)$/;
1772$self->{break}->{$2} = $1 || "";
1773}
1774$self->{options}{'_default_break'} =~ /^\s*(.*)\s*$/s;
1775foreach my $tag (split(/\s+/s,$1)) {
1776$tag =~ m/^(.*?)(<.*)$/;
1777$self->{break}->{$2} = $1 || ""
1778unless $list_nodefault{$2}
1779 or defined $self->{break}->{$2};
1780}
1781
1782$self->{options}{'translated'} =~ /^\s*(.*)\s*$/s;
1783foreach my $tag (split(/\s+/s,$1)) {
1784$tag =~ m/^(.*?)(<.*)$/;
1785$self->{translated}->{$2} = $1 || "";
1786}
1787$self->{options}{'_default_translated'} =~ /^\s*(.*)\s*$/s;
1788foreach my $tag (split(/\s+/s,$1)) {
1789$tag =~ m/^(.*?)(<.*)$/;
1790$self->{translated}->{$2} = $1 || ""
1791unless $list_nodefault{$2}
1792 or defined $self->{translated}->{$2};
1793}
1794
1795$self->{options}{'untranslated'} =~ /^\s*(.*)\s*$/s;
1796foreach my $tag (split(/\s+/s,$1)) {
1797$tag =~ m/^(.*?)(<.*)$/;
1798$self->{untranslated}->{$2} = $1 || "";
1799}
1800$self->{options}{'_default_untranslated'} =~ /^\s*(.*)\s*$/s;
1801foreach my $tag (split(/\s+/s,$1)) {
1802$tag =~ m/^(.*?)(<.*)$/;
1803$self->{untranslated}->{$2} = $1 || ""
1804unless $list_nodefault{$2}
1805 or defined $self->{untranslated}->{$2};
1806}
1807
1808$self->{options}{'attributes'} =~ /^\s*(.*)\s*$/s;
1809foreach my $tag (split(/\s+/s,$1)) {
1810if ($tag =~ m/^(.*?)(<.*)$/) {
1811$self->{attributes}->{$2} = $1 || "";
1812} else {
1813$self->{attributes}->{$tag} = "";
1814}
1815}
1816$self->{options}{'_default_attributes'} =~ /^\s*(.*)\s*$/s;
1817foreach my $tag (split(/\s+/s,$1)) {
1818if ($tag =~ m/^(.*?)(<.*)$/) {
1819$self->{attributes}->{$2} = $1 || ""
1820unless $list_nodefault{$2}
1821 or defined $self->{attributes}->{$2};
1822} else {
1823$self->{attributes}->{$tag} = ""
1824unless $list_nodefault{$tag}
1825 or defined $self->{attributes}->{$tag};
1826}
1827}
1828
1829$self->{options}{'inline'} =~ /^\s*(.*)\s*$/s;
1830foreach my $tag (split(/\s+/s,$1)) {
1831$tag =~ m/^(.*?)(<.*)$/;
1832$self->{inline}->{$2} = $1 || "";
1833}
1834$self->{options}{'_default_inline'} =~ /^\s*(.*)\s*$/s;
1835foreach my $tag (split(/\s+/s,$1)) {
1836$tag =~ m/^(.*?)(<.*)$/;
1837$self->{inline}->{$2} = $1 || ""
1838unless $list_nodefault{$2}
1839 or defined $self->{inline}->{$2};
1840}
1841
1842$self->{options}{'placeholder'} =~ /^\s*(.*)\s*$/s;
1843foreach my $tag (split(/\s+/s,$1)) {
1844$tag =~ m/^(.*?)(<.*)$/;
1845$self->{placeholder}->{$2} = $1 || "";
1846}
1847$self->{options}{'_default_placeholder'} =~ /^\s*(.*)\s*$/s;
1848foreach my $tag (split(/\s+/s,$1)) {
1849$tag =~ m/^(.*?)(<.*)$/;
1850$self->{placeholder}->{$2} = $1 || ""
1851unless $list_nodefault{$2}
1852 or defined $self->{placeholder}->{$2};
1853}
1854
1855$self->{options}{'customtag'} =~ /^\s*(.*)\s*$/s;
1856foreach my $tag (split(/\s+/s,$1)) {
1857$tag =~ m/^(.*?)(<.*)$/;
1858$self->{customtag}->{$2} = $1 || "";
1859}
1860$self->{options}{'_default_customtag'} =~ /^\s*(.*)\s*$/s;
1861foreach my $tag (split(/\s+/s,$1)) {
1862$tag =~ m/^(.*?)(<.*)$/;
1863$self->{customtag}->{$2} = $1 || ""
1864unless $list_nodefault{$2}
1865 or defined $self->{customtag}->{$2};
1866}
1867
1868# There should be no translated and untranslated tags
1869foreach my $tag (keys %{$self->{translated}}) {
1870die wrap_mod("po4a::xml",
1871 dgettext("po4a",
1872 "Tag '%s' both in the %s and %s categories."), $tag, "translated", "untranslated")
1873if defined $self->{untranslated}->{$tag};
1874}
1875# There should be no inline, break, placeholder, and customtag tags
1876foreach my $tag (keys %{$self->{inline}}) {
1877die wrap_mod("po4a::xml",
1878 dgettext("po4a",
1879 "Tag '%s' both in the %s and %s categories."), $tag, "inline", "break")
1880if defined $self->{break}->{$tag};
1881die wrap_mod("po4a::xml",
1882 dgettext("po4a",
1883 "Tag '%s' both in the %s and %s categories."), $tag, "inline", "placeholder")
1884if defined $self->{placeholder}->{$tag};
1885die wrap_mod("po4a::xml",
1886 dgettext("po4a",
1887 "Tag '%s' both in the %s and %s categories."), $tag, "inline", "customtag")
1888if defined $self->{customtag}->{$tag};
1889}
1890foreach my $tag (keys %{$self->{break}}) {
1891die wrap_mod("po4a::xml",
1892 dgettext("po4a",
1893 "Tag '%s' both in the %s and %s categories."), $tag, "break", "placeholder")
1894if defined $self->{placeholder}->{$tag};
1895die wrap_mod("po4a::xml",
1896 dgettext("po4a",
1897 "Tag '%s' both in the %s and %s categories."), $tag, "break", "customtag")
1898if defined $self->{customtag}->{$tag};
1899}
1900foreach my $tag (keys %{$self->{placeholder}}) {
1901die wrap_mod("po4a::xml",
1902 dgettext("po4a",
1903 "Tag '%s' both in the %s and %s categories."), $tag, "placeholder", "customtag")
1904if defined $self->{customtag}->{$tag};
1905}
1906}
1907
1908=head2 GETTING TEXT FROM THE INPUT DOCUMENT
1909
1910=over
1911
1912=item get_string_until($%)
1913
1914This function returns an array with the lines (and references) from the input
1915document until it finds the first argument. The second argument is an options
1916hash. Value 0 means disabled (the default) and 1, enabled.
1917
1918The valid options are:
1919
1920=over 4
1921
1922=item B<include>
1923
1924This makes the returned array to contain the searched text
1925
1926=item B<remove>
1927
1928This removes the returned stream from the input
1929
1930=item B<unquoted>
1931
1932This ensures that the searched text is outside any quotes
1933
1934=back
1935
1936=cut
1937
1938sub get_string_until {
1939my ($self,$search) = (shift,shift);
1940my $options = shift;
1941my ($include,$remove,$unquoted, $regex) = (0,0,0,0);
1942
1943if (defined($options->{include})) { $include = $options->{include}; }
1944if (defined($options->{remove})) { $remove = $options->{remove}; }
1945if (defined($options->{unquoted})) { $unquoted = $options->{unquoted}; }
1946if (defined($options->{regex})) { $regex = $options->{regex}; }
1947
1948my ($line,$ref) = $self->shiftline();
1949my (@text,$paragraph);
1950my ($eof,$found) = (0,0);
1951
1952$search = "\Q$search\E" unless $regex;
1953while (defined($line) and !$found) {
1954push @text, ($line,$ref);
1955$paragraph .= $line;
1956if ($unquoted) {
1957if ( $paragraph =~ /^((\".*?\")|(\'.*?\')|[^\"\'])*$search/s ) {
1958$found = 1;
1959}
1960} else {
1961if ( $paragraph =~ /$search/s ) {
1962$found = 1;
1963}
1964}
1965if (!$found) {
1966($line,$ref)=$self->shiftline();
1967}
1968}
1969
1970if (!defined($line)) { $eof = 1; }
1971
1972if ( $found ) {
1973$line = "";
1974if($unquoted) {
1975$paragraph =~ /^(?:(?:\".*?\")|(?:\'.*?\')|[^\"\'])*?$search(.*)$/s;
1976$line = $1;
1977$text[$#text-1] =~ s/\Q$line\E$//s;
1978} else {
1979$paragraph =~ /$search(.*)$/s;
1980$line = $1;
1981$text[$#text-1] =~ s/\Q$line\E$//s;
1982}
1983if(!$include) {
1984$text[$#text-1] =~ /^(.*)($search.*)$/s;
1985$text[$#text-1] = $1;
1986$line = $2.$line;
1987}
1988if (defined($line) and ($line ne "")) {
1989$self->unshiftline ($line,$text[$#text]);
1990}
1991}
1992if (!$remove) {
1993$self->unshiftline (@text);
1994}
1995
1996#If we get to the end of the file, we return the whole paragraph
1997return ($eof,@text);
1998}
1999
2000=item skip_spaces(\@)
2001
2002This function receives as argument the reference to a paragraph (in the format
2003returned by get_string_until), skips his heading spaces and returns them as
2004a simple string.
2005
2006=cut
2007
2008sub skip_spaces {
2009my ($self,$pstring)=@_;
2010my $space="";
2011
2012while (@$pstring and (@$pstring[0] =~ /^(\s+)(.*)$/s or @$pstring[0] eq "")) {
2013if (@$pstring[0] ne "") {
2014$space .= $1;
2015@$pstring[0] = $2;
2016}
2017
2018if (@$pstring[0] eq "") {
2019shift @$pstring;
2020shift @$pstring;
2021}
2022}
2023return $space;
2024}
2025
2026=item join_lines(@)
2027
2028This function returns a simple string with the text from the argument array
2029(discarding the references).
2030
2031=cut
2032
2033sub join_lines {
2034my ($self,@lines)=@_;
2035my ($line,$ref);
2036my $text = "";
2037while ($#lines > 0) {
2038($line,$ref) = (shift @lines,shift @lines);
2039$text .= $line;
2040}
2041return $text;
2042}
2043
2044=back
2045
2046=head1 STATUS OF THIS MODULE
2047
2048This module can translate tags and attributes.
2049
2050=head1 TODO LIST
2051
2052DOCTYPE (ENTITIES)
2053
2054There is a minimal support for the translation of entities. They are
2055translated as a whole, and tags are not taken into account. Multilines
2056entities are not supported and entities are always rewrapped during the
2057translation.
2058
2059MODIFY TAG TYPES FROM INHERITED MODULES
2060(move the tag_types structure inside the $self hash?)
2061
2062=head1 SEE ALSO
2063
2064L<Locale::Po4a::TransTractor(3pm)|Locale::Po4a::TransTractor>,
2065L<po4a(7)|po4a.7>
2066
2067=head1 AUTHORS
2068
2069 Jordi Vilalta <jvprat@gmail.com>
2070 Nicolas François <nicolas.francois@centraliens.net>
2071
2072=head1 COPYRIGHT AND LICENSE
2073
2074 Copyright (c) 2004 by Jordi Vilalta <jvprat@gmail.com>
2075 Copyright (c) 2008-2009 by Nicolas François <nicolas.francois@centraliens.net>
2076
2077This program is free software; you may redistribute it and/or modify it
2078under the terms of GPL (see the COPYING file).
2079
2080=cut
2081
20821;
2083

Archive Download this file

Revision: 2225