Chameleon

Chameleon Svn Source Tree

Root/trunk/package/bin/po4a/lib/Locale/Po4a/Xml.pm

1#!/usr/bin/perl
2
3# Po4a::Xml.pm
4#
5# extract and translate translatable strings from XML documents.
6#
7# This code extracts plain text from tags and attributes from generic
8# XML documents, and it can be used as a base to build modules for
9# XML-based documents.
10#
11# Copyright (c) 2004 by Jordi Vilalta <jvprat@gmail.com>
12# Copyright (c) 2008-2009 by Nicolas François <nicolas.francois@centraliens.net>
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc.,
27# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
28#
29########################################################################
30
31=encoding UTF-8
32
33=head1 NAME
34
35Locale::Po4a::Xml - convert XML documents and derivates from/to PO files
36
37=head1 DESCRIPTION
38
39The po4a (PO for anything) project goal is to ease translations (and more
40interestingly, the maintenance of translations) using gettext tools on
41areas where they were not expected like documentation.
42
43Locale::Po4a::Xml is a module to help the translation of XML documents into
44other [human] languages. It can also be used as a base to build modules for
45XML-based documents.
46
47=cut
48
49package Locale::Po4a::Xml;
50
51use 5.006;
52use strict;
53use warnings;
54
55require Exporter;
56use vars qw(@ISA @EXPORT);
57@ISA = qw(Locale::Po4a::TransTractor);
58@EXPORT = qw(new initialize @tag_types);
59
60use Locale::Po4a::TransTractor;
61use Locale::Po4a::Common;
62use Carp qw(croak);
63use File::Basename;
64use File::Spec;
65
66#It will mantain the path from the root tag to the current one
67my @path;
68
69#It will contain a list of external entities and their attached paths
70my %entities;
71
72my @comments;
73my %translate_options_cache;
74
75my $_shiftline_in_comment = 0;
76sub shiftline {
77 my $self = shift;
78 # call Transtractor's shiftline
79 my ($line,$ref) = $self->SUPER::shiftline();
80 return ($line,$ref) if (not defined $line);
81
82 if ($self->{options}{'includeexternal'}) {
83 my $tmp;
84
85 for my $k (keys %entities) {
86 if ($line =~ m/^(.*?)&$k;(.*)$/s) {
87 my ($before, $after) = ($1, $2);
88 my $linenum=0;
89 my @textentries;
90
91 $tmp = $before;
92 my $tmp_in_comment = 0;
93 if ($_shiftline_in_comment) {
94 if ($before =~ m/^.*?-->(.*)$/s) {
95 $tmp = $1;
96 $tmp_in_comment = 0;
97 } else {
98 $tmp_in_comment = 1;
99 }
100 }
101 if ($tmp_in_comment == 0) {
102 while ($tmp =~ m/^.*?<!--.*?-->(.*)$/s) {
103 $tmp = $1;
104 }
105 if ($tmp =~ m/<!--/s) {
106 $tmp_in_comment = 1;
107 }
108 }
109 next if ($tmp_in_comment);
110
111 open (my $in, $entities{$k})
112 or croak wrap_mod("po4a::xml",
113 dgettext("po4a", "Can't read from %s: %s"),
114 $entities{$k}, $!);
115 while (defined (my $textline = <$in>)) {
116 $linenum++;
117 my $textref=$entities{$k}.":$linenum";
118 push @textentries, ($textline,$textref);
119 }
120 close $in
121 or croak wrap_mod("po4a::xml",
122 dgettext("po4a", "Can't close %s after reading: %s"),
123 $entities{$k}, $!);
124
125 push @textentries, ($after, $ref);
126 $line = $before.(shift @textentries);
127 $ref .= " ".(shift @textentries);
128 $self->unshiftline(@textentries);
129 }
130 }
131
132 $tmp = $line;
133 if ($_shiftline_in_comment) {
134 if ($line =~ m/^.*?-->(.*)$/s) {
135 $tmp = $1;
136 $_shiftline_in_comment = 0;
137 } else {
138 $_shiftline_in_comment = 1;
139 }
140 }
141 if ($_shiftline_in_comment == 0) {
142 while ($tmp =~ m/^.*?<!--.*?-->(.*)$/s) {
143 $tmp = $1;
144 }
145 if ($tmp =~ m/<!--/s) {
146 $_shiftline_in_comment = 1;
147 }
148 }
149 }
150
151 return ($line,$ref);
152}
153
154sub read {
155my ($self,$filename)=@_;
156push @{$self->{DOCPOD}{infile}}, $filename;
157$self->Locale::Po4a::TransTractor::read($filename);
158}
159
160sub parse {
161my $self=shift;
162map {$self->parse_file($_)} @{$self->{DOCPOD}{infile}};
163}
164
165# @save_holders is a stack of references to ('paragraph', 'translation',
166# 'sub_translations', 'open', 'close', 'folded_attributes') hashes, where:
167# paragraph is a reference to an array (see paragraph in the
168# treat_content() subroutine) of strings followed by
169# references. It contains the @paragraph array as it was
170# before the processing was interrupted by a tag instroducing
171# a placeholder.
172# translation is the translation of this level up to now
173# sub_translations is a reference to an array of strings containing the
174# translations which must replace the placeholders.
175# open is the tag which opened the placeholder.
176# close is the tag which closed the placeholder.
177# folded_attributes is an hash of tags with their attributes (<tag attrs=...>
178# strings), referenced by the folded tag id, which should
179# replace the <tag po4a-id=id> strings in the current
180# translation.
181#
182# If @save_holders only has 1 holder, then we are not processing the
183# content of an holder, we are translating the document.
184my @save_holders;
185
186
187# If we are at the bottom of the stack and there is no <placeholder ...> in
188# the current translation, we can push the translation in the translated
189# document.
190# Otherwise, we keep the translation in the current holder.
191sub pushline {
192my ($self, $line) = (shift, shift);
193
194my $holder = $save_holders[$#save_holders];
195my $translation = $holder->{'translation'};
196$translation .= $line;
197
198while ( %{$holder->{folded_attributes}}
199 and $translation =~ m/^(.*)<([^>]+?)\s+po4a-id=([0-9]+)>(.*)$/s) {
200my $begin = $1;
201my $tag = $2;
202my $id = $3;
203my $end = $4;
204if (defined $holder->{folded_attributes}->{$id}) {
205# TODO: check if the tag is the same
206$translation = $begin.$holder->{folded_attributes}->{$id}.$end;
207delete $holder->{folded_attributes}->{$id};
208} else {
209# TODO: It will be hard to identify the location.
210# => find a way to retrieve the reference.
211die wrap_mod("po4a::xml", dgettext("po4a", "'po4a-id=%d' in the translation does not exist in the original string (or 'po4a-id=%d' used twice in the translation)."), $id, $id);
212}
213}
214# TODO: check that %folded_attributes is empty at some time
215# => in translate_paragraph?
216
217if ( ($#save_holders > 0)
218 or ($translation =~ m/<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>/s)) {
219$holder->{'translation'} = $translation;
220} else {
221$self->SUPER::pushline($translation);
222$holder->{'translation'} = '';
223}
224}
225
226=head1 TRANSLATING WITH PO4A::XML
227
228This module can be used directly to handle generic XML documents. This will
229extract all tag's content, and no attributes, since it's where the text is
230written in most XML based documents.
231
232There are some options (described in the next section) that can customize
233this behavior. If this doesn't fit to your document format you're encouraged
234to write your own module derived from this, to describe your format's details.
235See the section B<WRITING DERIVATE MODULES> below, for the process description.
236
237=cut
238
239#
240# Parse file and translate it
241#
242sub parse_file {
243my ($self,$filename) = @_;
244my $eof = 0;
245
246while (!$eof) {
247# We get all the text until the next breaking tag (not
248# inline) and translate it
249$eof = $self->treat_content;
250if (!$eof) {
251# And then we treat the following breaking tag
252$eof = $self->treat_tag;
253}
254}
255}
256
257=head1 OPTIONS ACCEPTED BY THIS MODULE
258
259The global debug option causes this module to show the excluded strings, in
260order to see if it skips something important.
261
262These are this module's particular options:
263
264=over 4
265
266=item B<nostrip>
267
268Prevents it to strip the spaces around the extracted strings.
269
270=item B<wrap>
271
272Canonizes the string to translate, considering that whitespaces are not
273important, and wraps the translated document. This option can be overridden
274by custom tag options. See the "tags" option below.
275
276=item B<caseinsensitive>
277
278It makes the tags and attributes searching to work in a case insensitive
279way. If it's defined, it will treat E<lt>BooKE<gt>laNG and E<lt>BOOKE<gt>Lang as E<lt>bookE<gt>lang.
280
281=item B<includeexternal>
282
283When defined, external entities are included in the generated (translated)
284document, and for the extraction of strings. If it's not defined, you
285will have to translate external entities separately as independent
286documents.
287
288=item B<ontagerror>
289
290This option defines the behavior of the module when it encounter a invalid
291XML syntax (a closing tag which does not match the last opening tag, or a
292tag's attribute without value).
293It can take the following values:
294
295=over
296
297=item I<fail>
298
299This is the default value.
300The module will exit with an error.
301
302=item I<warn>
303
304The module will continue, and will issue a warning.
305
306=item I<silent>
307
308The module will continue without any warnings.
309
310=back
311
312Be careful when using this option.
313It is generally recommended to fix the input file.
314
315=item B<tagsonly>
316
317Extracts only the specified tags in the "tags" option. Otherwise, it
318will extract all the tags except the ones specified.
319
320Note: This option is deprecated.
321
322=item B<doctype>
323
324String that will try to match with the first line of the document's doctype
325(if defined). If it doesn't, a warning will indicate that the document
326might be of a bad type.
327
328=item B<addlang>
329
330String indicating the path (e.g. E<lt>bbbE<gt>E<lt>aaaE<gt>) of a tag
331where a lang="..." attribute shall be added. The language will be defined
332as the basename of the PO file without any .po extension.
333
334=item B<tags>
335
336Space-separated list of tags you want to translate or skip. By default,
337the specified tags will be excluded, but if you use the "tagsonly" option,
338the specified tags will be the only ones included. The tags must be in the
339form E<lt>aaaE<gt>, but you can join some (E<lt>bbbE<gt>E<lt>aaaE<gt>) to say that the content of
340the tag E<lt>aaaE<gt> will only be translated when it's into a E<lt>bbbE<gt> tag.
341
342You can also specify some tag options by putting some characters in front of
343the tag hierarchy. For example, you can put 'w' (wrap) or 'W' (don't wrap)
344to override the default behavior specified by the global "wrap" option.
345
346Example: WE<lt>chapterE<gt>E<lt>titleE<gt>
347
348Note: This option is deprecated.
349You should use the B<translated> and B<untranslated> options instead.
350
351=item B<attributes>
352
353Space-separated list of tag's attributes you want to translate. You can
354specify the attributes by their name (for example, "lang"), but you can
355prefix it with a tag hierarchy, to specify that this attribute will only be
356translated when it's into the specified tag. For example: E<lt>bbbE<gt>E<lt>aaaE<gt>lang
357specifies that the lang attribute will only be translated if it's into an
358E<lt>aaaE<gt> tag, and it's into a E<lt>bbbE<gt> tag.
359
360=item B<foldattributes>
361
362Do not translate attributes in inline tags.
363Instead, replace all attributes of a tag by po4a-id=<id>.
364
365This is useful when attributes shall not be translated, as this simplifies the
366strings for translators, and avoids typos.
367
368=item B<customtag>
369
370Space-separated list of tags which should not be treated as tags.
371These tags are treated as inline, and do not need to be closed.
372
373=item B<break>
374
375Space-separated list of tags which should break the sequence.
376By default, all tags break the sequence.
377
378The tags must be in the form <aaa>, but you can join some
379(<bbb><aaa>), if a tag (<aaa>) should only be considered
380when it's into another tag (<bbb>).
381
382=item B<inline>
383
384Space-separated list of tags which should be treated as inline.
385By default, all tags break the sequence.
386
387The tags must be in the form <aaa>, but you can join some
388(<bbb><aaa>), if a tag (<aaa>) should only be considered
389when it's into another tag (<bbb>).
390
391=item B<placeholder>
392
393Space-separated list of tags which should be treated as placeholders.
394Placeholders do not break the sequence, but the content of placeholders is
395translated separately.
396
397The location of the placeholder in its block will be marked with a string
398similar to:
399
400 <placeholder type=\"footnote\" id=\"0\"/>
401
402The tags must be in the form <aaa>, but you can join some
403(<bbb><aaa>), if a tag (<aaa>) should only be considered
404when it's into another tag (<bbb>).
405
406=item B<nodefault>
407
408Space separated list of tags that the module should not try to set by
409default in any category.
410
411=item B<cpp>
412
413Support C preprocessor directives.
414When this option is set, po4a will consider preprocessor directives as
415paragraph separators.
416This is important if the XML file must be preprocessed because otherwise
417the directives may be inserted in the middle of lines if po4a consider it
418belong to the current paragraph, and they won't be recognized by the
419preprocessor.
420Note: the preprocessor directives must only appear between tags
421(they must not break a tag).
422
423=item B<translated>
424
425Space-separated list of tags you want to translate.
426
427The tags must be in the form <aaa>, but you can join some
428(<bbb><aaa>), if a tag (<aaa>) should only be considered
429when it's into another tag (<bbb>).
430
431You can also specify some tag options by putting some characters in front of
432the tag hierarchy. For example, you can put 'w' (wrap) or 'W' (don't wrap)
433to override the default behavior specified by the global "wrap" option.
434
435Example: WE<lt>chapterE<gt>E<lt>titleE<gt>
436
437=item B<untranslated>
438
439Space-separated list of tags you do not want to translate.
440
441The tags must be in the form <aaa>, but you can join some
442(<bbb><aaa>), if a tag (<aaa>) should only be considered
443when it's into another tag (<bbb>).
444
445=item B<defaulttranslateoption>
446
447The default categories for tags that are not in any of the translated,
448untranslated, break, inline, or placeholder.
449
450This is a set of letters:
451
452=over
453
454=item I<w>
455
456Tags should be translated and content can be re-wrapped.
457
458=item I<W>
459
460Tags should be translated and content should not be re-wrapped.
461
462=item I<i>
463
464Tags should be translated inline.
465
466=item I<p>
467
468Tags should be translated as placeholders.
469
470=back
471
472=back
473
474=cut
475# TODO: defaulttranslateoption
476# w => indicate that it is only valid for translatable tags and do not
477# care about inline/break/placeholder?
478# ...
479
480sub initialize {
481my $self = shift;
482my %options = @_;
483
484# Reset the path
485@path = ();
486
487# Initialize the stack of holders
488my @paragraph = ();
489my @sub_translations = ();
490my %folded_attributes;
491my %holder = ('paragraph' => \@paragraph,
492 'translation' => "",
493 'sub_translations' => \@sub_translations,
494 'folded_attributes' => \%folded_attributes);
495@save_holders = (\%holder);
496
497$self->{options}{'addlang'}=0;
498$self->{options}{'nostrip'}=0;
499$self->{options}{'wrap'}=0;
500$self->{options}{'caseinsensitive'}=0;
501$self->{options}{'tagsonly'}=0;
502$self->{options}{'tags'}='';
503$self->{options}{'break'}='';
504$self->{options}{'translated'}='';
505$self->{options}{'untranslated'}='';
506$self->{options}{'defaulttranslateoption'}='';
507$self->{options}{'attributes'}='';
508$self->{options}{'foldattributes'}=0;
509$self->{options}{'inline'}='';
510$self->{options}{'placeholder'}='';
511$self->{options}{'customtag'}='';
512$self->{options}{'doctype'}='';
513$self->{options}{'nodefault'}='';
514$self->{options}{'includeexternal'}=0;
515$self->{options}{'ontagerror'}="fail";
516$self->{options}{'cpp'}=0;
517
518$self->{options}{'verbose'}='';
519$self->{options}{'debug'}='';
520
521foreach my $opt (keys %options) {
522if ($options{$opt}) {
523die wrap_mod("po4a::xml",
524dgettext("po4a", "Unknown option: %s"), $opt)
525unless exists $self->{options}{$opt};
526$self->{options}{$opt} = $options{$opt};
527}
528}
529# Default options set by modules. Forbidden for users.
530$self->{options}{'_default_translated'}='';
531$self->{options}{'_default_untranslated'}='';
532$self->{options}{'_default_break'}='';
533$self->{options}{'_default_inline'}='';
534$self->{options}{'_default_placeholder'}='';
535$self->{options}{'_default_attributes'}='';
536$self->{options}{'_default_customtag'}='';
537
538#It will maintain the list of the translatable tags
539$self->{tags}=();
540$self->{translated}=();
541$self->{untranslated}=();
542#It will maintain the list of the translatable attributes
543$self->{attributes}=();
544#It will maintain the list of the breaking tags
545$self->{break}=();
546#It will maintain the list of the inline tags
547$self->{inline}=();
548#It will maintain the list of the placeholder tags
549$self->{placeholder}=();
550#It will maintain the list of the customtag tags
551$self->{customtag}=();
552#list of the tags that must not be set in the tags or inline category
553#by this module or sub-module (unless specified in an option)
554$self->{nodefault}=();
555
556$self->treat_options;
557
558# Clear cache
559%translate_options_cache=();
560}
561
562=head1 WRITING DERIVATE MODULES
563
564=head2 DEFINE WHAT TAGS AND ATTRIBUTES TO TRANSLATE
565
566The simplest customization is to define which tags and attributes you want
567the parser to translate. This should be done in the initialize function.
568First you should call the main initialize, to get the command-line options,
569and then, append your custom definitions to the options hash. If you want
570to treat some new options from command line, you should define them before
571calling the main initialize:
572
573 $self->{options}{'new_option'}='';
574 $self->SUPER::initialize(%options);
575 $self->{options}{'_default_translated'}.=' <p> <head><title>';
576 $self->{options}{'attributes'}.=' <p>lang id';
577 $self->{options}{'_default_inline'}.=' <br>';
578 $self->treat_options;
579
580You should use the B<_default_inline>, B<_default_break>,
581B<_default_placeholder>, B<_default_translated>, B<_default_untranslated>,
582and B<_default_attributes> options in derivated modules. This allow users
583to override the default behavior defined in your module with command line
584options.
585
586=head2 OVERRIDING THE found_string FUNCTION
587
588Another simple step is to override the function "found_string", which
589receives the extracted strings from the parser, in order to translate them.
590There you can control which strings you want to translate, and perform
591transformations to them before or after the translation itself.
592
593It receives the extracted text, the reference on where it was, and a hash
594that contains extra information to control what strings to translate, how
595to translate them and to generate the comment.
596
597The content of these options depends on the kind of string it is (specified in an
598entry of this hash):
599
600=over
601
602=item type="tag"
603
604The found string is the content of a translatable tag. The entry "tag_options"
605contains the option characters in front of the tag hierarchy in the module
606"tags" option.
607
608=item type="attribute"
609
610Means that the found string is the value of a translatable attribute. The
611entry "attribute" has the name of the attribute.
612
613=back
614
615It must return the text that will replace the original in the translated
616document. Here's a basic example of this function:
617
618 sub found_string {
619 my ($self,$text,$ref,$options)=@_;
620 $text = $self->translate($text,$ref,"type ".$options->{'type'},
621 'wrap'=>$self->{options}{'wrap'});
622 return $text;
623 }
624
625There's another simple example in the new Dia module, which only filters
626some strings.
627
628=cut
629
630sub found_string {
631my ($self,$text,$ref,$options)=@_;
632
633if ($text =~ m/^\s*$/s) {
634return $text;
635}
636
637my $comment;
638my $wrap = $self->{options}{'wrap'};
639
640if ($options->{'type'} eq "tag") {
641$comment = "Content of: ".$self->get_path;
642
643if($options->{'tag_options'} =~ /w/) {
644$wrap = 1;
645}
646if($options->{'tag_options'} =~ /W/) {
647$wrap = 0;
648}
649} elsif ($options->{'type'} eq "attribute") {
650$comment = "Attribute '".$options->{'attribute'}."' of: ".$self->get_path;
651} elsif ($options->{'type'} eq "CDATA") {
652$comment = "CDATA";
653$wrap = 0;
654} else {
655die wrap_ref_mod($ref, "po4a::xml", dgettext("po4a", "Internal error: unknown type identifier '%s'."), $options->{'type'});
656}
657$text = $self->translate($text,$ref,$comment,'wrap'=>$wrap, comment => $options->{'comments'});
658return $text;
659}
660
661=head2 MODIFYING TAG TYPES (TODO)
662
663This is a more complex one, but it enables a (almost) total customization.
664It's based in a list of hashes, each one defining a tag type's behavior. The
665list should be sorted so that the most general tags are after the most
666concrete ones (sorted first by the beginning and then by the end keys). To
667define a tag type you'll have to make a hash with the following keys:
668
669=over 4
670
671=item B<beginning>
672
673Specifies the beginning of the tag, after the "E<lt>".
674
675=item B<end>
676
677Specifies the end of the tag, before the "E<gt>".
678
679=item B<breaking>
680
681It says if this is a breaking tag class. A non-breaking (inline) tag is one
682that can be taken as part of the content of another tag. It can take the
683values false (0), true (1) or undefined. If you leave this undefined, you'll
684have to define the f_breaking function that will say whether a concrete tag of
685this class is a breaking tag or not.
686
687=item B<f_breaking>
688
689It's a function that will tell if the next tag is a breaking one or not. It
690should be defined if the B<breaking> option is not.
691
692=item B<f_extract>
693
694If you leave this key undefined, the generic extraction function will have to
695extract the tag itself. It's useful for tags that can have other tags or
696special structures in them, so that the main parser doesn't get mad. This
697function receives a boolean that says if the tag should be removed from the
698input stream or not.
699
700=item B<f_translate>
701
702This function receives the tag (in the get_string_until() format) and returns
703the translated tag (translated attributes or all needed transformations) as a
704single string.
705
706=back
707
708=cut
709
710##### Generic XML tag types #####'
711
712our @tag_types = (
713{beginning=> "!--#",
714end=> "--",
715breaking=> 0,
716f_extract=> \&tag_extract_comment,
717f_translate=> \&tag_trans_comment},
718{beginning=> "!--",
719end=> "--",
720breaking=> 0,
721f_extract=> \&tag_extract_comment,
722f_translate=> \&tag_trans_comment},
723{beginning=> "?xml",
724end=> "?",
725breaking=> 1,
726f_translate=> \&tag_trans_xmlhead},
727{beginning=> "?",
728end=> "?",
729breaking=> 1,
730f_translate=> \&tag_trans_procins},
731{beginning=> "!DOCTYPE",
732end=> "",
733breaking=> 1,
734f_extract=> \&tag_extract_doctype,
735f_translate=> \&tag_trans_doctype},
736{beginning=> "![CDATA[",
737end=> "]]",
738breaking=> 1,
739f_extract=> \&CDATA_extract,
740f_translate=> \&CDATA_trans},
741{beginning=> "/",
742end=> "",
743f_breaking=> \&tag_break_close,
744f_translate=> \&tag_trans_close},
745{beginning=> "",
746end=> "/",
747f_breaking=> \&tag_break_alone,
748f_translate=> \&tag_trans_alone},
749{beginning=> "",
750end=> "",
751f_breaking=> \&tag_break_open,
752f_translate=> \&tag_trans_open}
753);
754
755sub tag_extract_comment {
756my ($self,$remove)=(shift,shift);
757my ($eof,@tag)=$self->get_string_until('-->',{include=>1,remove=>$remove});
758return ($eof,@tag);
759}
760
761sub tag_trans_comment {
762my ($self,@tag)=@_;
763return $self->join_lines(@tag);
764}
765
766sub tag_trans_xmlhead {
767my ($self,@tag)=@_;
768
769# We don't have to translate anything from here: throw away references
770my $tag = $self->join_lines(@tag);
771$tag =~ /encoding=(("|')|)(.*?)(\s|\2)/s;
772my $in_charset=$3;
773$self->detected_charset($in_charset);
774my $out_charset=$self->get_out_charset;
775
776if (defined $in_charset) {
777$tag =~ s/$in_charset/$out_charset/;
778} else {
779if ($tag =~ m/standalone/) {
780$tag =~ s/(standalone)/encoding="$out_charset" $1/;
781} else {
782$tag.= " encoding=\"$out_charset\"";
783}
784}
785
786return $tag;
787}
788
789sub tag_trans_procins {
790my ($self,@tag)=@_;
791return $self->join_lines(@tag);
792}
793
794sub tag_extract_doctype {
795my ($self,$remove)=(shift,shift);
796
797# Check if there is an internal subset (between []).
798my ($eof,@tag)=$self->get_string_until('>',{include=>1,unquoted=>1});
799my $parity = 0;
800my $paragraph = "";
801map { $parity = 1 - $parity; $paragraph.= $parity?$_:""; } @tag;
802my $found = 0;
803if ($paragraph =~ m/<.*\[.*</s) {
804$found = 1
805}
806
807if (not $found) {
808($eof,@tag)=$self->get_string_until('>',{include=>1,remove=>$remove,unquoted=>1});
809} else {
810($eof,@tag)=$self->get_string_until(']\s*>',{include=>1,remove=>$remove,unquoted=>1,regex=>1});
811}
812return ($eof,@tag);
813}
814
815sub tag_trans_doctype {
816# This check is not really reliable. There are system and public
817# identifiers. Only the public one could be checked reliably.
818my ($self,@tag)=@_;
819if (defined $self->{options}{'doctype'} ) {
820my $doctype = $self->{options}{'doctype'};
821if ( $tag[0] !~ /\Q$doctype\E/i ) {
822warn wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Bad document type. '%s' expected. You can fix this warning with a -o doctype option, or ignore this check with -o doctype=\"\"."), $doctype);
823}
824}
825my $i = 0;
826my $basedir = $tag[1];
827$basedir =~ s/:[0-9]+$//;
828$basedir = dirname($basedir);
829
830while ( $i < $#tag ) {
831my $t = $tag[$i];
832my $ref = $tag[$i+1];
833if ( $t =~ /^(\s*<!ENTITY\s+)(.*)$/is ) {
834my $part1 = $1;
835my $part2 = $2;
836my $includenow = 0;
837my $file = 0;
838my $name = "";
839if ($part2 =~ /^(%\s+)(.*)$/s ) {
840$part1.= $1;
841$part2 = $2;
842$includenow = 1;
843}
844$part2 =~ /^(\S+)(\s+)(.*)$/s;
845$name = $1;
846$part1.= $1.$2;
847$part2 = $3;
848if ( $part2 =~ /^(SYSTEM\s+)(.*)$/is ) {
849$part1.= $1;
850$part2 = $2;
851$file = 1;
852if ($self->{options}{'includeexternal'}) {
853$entities{$name} = $part2;
854$entities{$name} =~ s/^"?(.*?)".*$/$1/s;
855$entities{$name} = File::Spec->catfile($basedir, $entities{$name});
856}
857}
858if ((not $file) and (not $includenow)) {
859 if ($part2 =~ m/^\s*(["'])(.*)\1(\s*>.*)$/s) {
860my $comment = "Content of the $name entity";
861my $quote = $1;
862my $text = $2;
863$part2 = $3;
864$text = $self->translate($text,
865 $ref,
866 $comment,
867 'wrap'=>1);
868$t = $part1."$quote$text$quote$part2";
869 }
870}
871#print $part1."\n";
872#print $name."\n";
873#print $part2."\n";
874}
875$tag[$i] = $t;
876$i += 2;
877}
878return $self->join_lines(@tag);
879}
880
881sub tag_break_close {
882my ($self,@tag)=@_;
883my $struct = $self->get_path;
884my $options = $self->get_translate_options($struct);
885if ($options =~ m/[ip]/) {
886return 0;
887} else {
888return 1;
889}
890}
891
892sub tag_trans_close {
893my ($self,@tag)=@_;
894my $name = $self->get_tag_name(@tag);
895
896my $test = pop @path;
897if (!defined($test) || $test ne $name ) {
898my $ontagerror = $self->{options}{'ontagerror'};
899if ($ontagerror eq "warn") {
900warn wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong. Continuing..."), $name);
901} elsif ($ontagerror ne "silent") {
902die wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong."), $name);
903}
904}
905return $self->join_lines(@tag);
906}
907
908sub CDATA_extract {
909my ($self,$remove)=(shift,shift);
910 my ($eof, @tag) = $self->get_string_until(']]>',{include=>1,unquoted=>0,remove=>$remove});
911
912return ($eof, @tag);
913}
914
915sub CDATA_trans {
916my ($self,@tag)=@_;
917return $self->found_string($self->join_lines(@tag),
918 $tag[1],
919 {'type' => "CDATA"});
920}
921
922sub tag_break_alone {
923my ($self,@tag)=@_;
924my $struct = $self->get_path($self->get_tag_name(@tag));
925if ($self->get_translate_options($struct) =~ m/i/) {
926return 0;
927} else {
928return 1;
929}
930}
931
932sub tag_trans_alone {
933my ($self,@tag)=@_;
934my $name = $self->get_tag_name(@tag);
935push @path, $name;
936
937$name = $self->treat_attributes(@tag);
938
939pop @path;
940return $name;
941}
942
943sub tag_break_open {
944my ($self,@tag)=@_;
945my $struct = $self->get_path($self->get_tag_name(@tag));
946my $options = $self->get_translate_options($struct);
947if ($options =~ m/[ip]/) {
948return 0;
949} else {
950return 1;
951}
952}
953
954sub tag_trans_open {
955my ($self,@tag)=@_;
956my $name = $self->get_tag_name(@tag);
957push @path, $name;
958
959$name = $self->treat_attributes(@tag);
960
961if (defined $self->{options}{'addlang'}) {
962my $struct = $self->get_path();
963if ($struct eq $self->{options}{'addlang'}) {
964$name .= ' lang="'.$self->{TT}{po_in}->{lang}.'"';
965}
966}
967
968return $name;
969}
970
971##### END of Generic XML tag types #####
972
973=head1 INTERNAL FUNCTIONS used to write derivated parsers
974
975=head2 WORKING WITH TAGS
976
977=over 4
978
979=item get_path()
980
981This function returns the path to the current tag from the document's root,
982in the form E<lt>htmlE<gt>E<lt>bodyE<gt>E<lt>pE<gt>.
983
984An additional array of tags (without brackets) can be passed as argument.
985These path elements are added to the end of the current path.
986
987=cut
988
989sub get_path {
990my $self = shift;
991my @add = @_;
992if ( @path > 0 or @add > 0 ) {
993return "<".join("><",@path,@add).">";
994} else {
995return "outside any tag (error?)";
996}
997}
998
999=item tag_type()
1000
1001This function returns the index from the tag_types list that fits to the next
1002tag in the input stream, or -1 if it's at the end of the input file.
1003
1004=cut
1005
1006sub tag_type {
1007my $self = shift;
1008my ($line,$ref) = $self->shiftline();
1009my ($match1,$match2);
1010my $found = 0;
1011my $i = 0;
1012
1013if (!defined($line)) { return -1; }
1014
1015$self->unshiftline($line,$ref);
1016my ($eof,@lines) = $self->get_string_until(">",{include=>1,unquoted=>1});
1017my $line2 = $self->join_lines(@lines);
1018while (!$found && $i < @tag_types) {
1019($match1,$match2) = ($tag_types[$i]->{beginning},$tag_types[$i]->{end});
1020if ($line =~ /^<\Q$match1\E/) {
1021if (!defined($tag_types[$i]->{f_extract})) {
1022#print substr($line2,length($line2)-1-length($match2),1+length($match2))."\n";
1023if (defined($line2) and $line2 =~ /\Q$match2\E>$/) {
1024$found = 1;
1025#print "YES: <".$match1." ".$match2.">\n";
1026} else {
1027#print "NO: <".$match1." ".$match2.">\n";
1028$i++;
1029}
1030} else {
1031$found = 1;
1032}
1033} else {
1034$i++;
1035}
1036}
1037if (!$found) {
1038#It should never enter here, unless you undefine the most
1039#general tags (as <...>)
1040chomp $line;
1041die $ref.": Unknown tag type: ".$line."\n";
1042} else {
1043return $i;
1044}
1045}
1046
1047=item extract_tag($$)
1048
1049This function returns the next tag from the input stream without the beginning
1050and end, in an array form, to maintain the references from the input file. It
1051has two parameters: the type of the tag (as returned by tag_type) and a
1052boolean, that indicates if it should be removed from the input stream.
1053
1054=cut
1055
1056sub extract_tag {
1057my ($self,$type,$remove) = (shift,shift,shift);
1058my ($match1,$match2) = ($tag_types[$type]->{beginning},$tag_types[$type]->{end});
1059my ($eof,@tag);
1060if (defined($tag_types[$type]->{f_extract})) {
1061($eof,@tag) = &{$tag_types[$type]->{f_extract}}($self,$remove);
1062} else {
1063($eof,@tag) = $self->get_string_until($match2.">",{include=>1,remove=>$remove,unquoted=>1});
1064}
1065$tag[0] =~ /^<\Q$match1\E(.*)$/s;
1066$tag[0] = $1;
1067$tag[$#tag-1] =~ /^(.*)\Q$match2\E>$/s;
1068$tag[$#tag-1] = $1;
1069return ($eof,@tag);
1070}
1071
1072=item get_tag_name(@)
1073
1074This function returns the name of the tag passed as an argument, in the array
1075form returned by extract_tag.
1076
1077=cut
1078
1079sub get_tag_name {
1080my ($self,@tag)=@_;
1081$tag[0] =~ /^(\S*)/;
1082return $1;
1083}
1084
1085=item breaking_tag()
1086
1087This function returns a boolean that says if the next tag in the input stream
1088is a breaking tag or not (inline tag). It leaves the input stream intact.
1089
1090=cut
1091
1092sub breaking_tag {
1093my $self = shift;
1094my $break;
1095
1096my $type = $self->tag_type;
1097if ($type == -1) { return 0; }
1098
1099#print "TAG TYPE = ".$type."\n";
1100$break = $tag_types[$type]->{breaking};
1101if (!defined($break)) {
1102# This tag's breaking depends on its content
1103my ($eof,@lines) = $self->extract_tag($type,0);
1104$break = &{$tag_types[$type]->{f_breaking}}($self,@lines);
1105}
1106#print "break = ".$break."\n";
1107return $break;
1108}
1109
1110=item treat_tag()
1111
1112This function translates the next tag from the input stream. Using each
1113tag type's custom translation functions.
1114
1115=cut
1116
1117sub treat_tag {
1118my $self = shift;
1119my $type = $self->tag_type;
1120
1121my ($match1,$match2) = ($tag_types[$type]->{beginning},$tag_types[$type]->{end});
1122my ($eof,@lines) = $self->extract_tag($type,1);
1123
1124$lines[0] =~ /^(\s*)(.*)$/s;
1125my $space1 = $1;
1126$lines[0] = $2;
1127$lines[$#lines-1] =~ /^(.*?)(\s*)$/s;
1128my $space2 = $2;
1129$lines[$#lines-1] = $1;
1130
1131# Calling this tag type's specific handling (translation of
1132# attributes...)
1133my $line = &{$tag_types[$type]->{f_translate}}($self,@lines);
1134$self->pushline("<".$match1.$space1.$line.$space2.$match2.">");
1135return $eof;
1136}
1137
1138=item tag_in_list($@)
1139
1140This function returns a string value that says if the first argument (a tag
1141hierarchy) matches any of the tags from the second argument (a list of tags
1142or tag hierarchies). If it doesn't match, it returns 0. Else, it returns the
1143matched tag's options (the characters in front of the tag) or 1 (if that tag
1144doesn't have options).
1145
1146=back
1147
1148=cut
1149sub tag_in_list ($$$) {
1150my ($self,$path,$list) = @_;
1151if ($self->{options}{'caseinsensitive'}) {
1152$path = lc $path;
1153}
1154
1155while (1) {
1156if (defined $list->{$path}) {
1157if (length $list->{$path}) {
1158return $list->{$path};
1159} else {
1160return 1;
1161}
1162}
1163last unless ($path =~ m/</);
1164$path =~ s/^<.*?>//;
1165}
1166
1167return 0;
1168}
1169
1170=head2 WORKING WITH ATTRIBUTES
1171
1172=over 4
1173
1174=item treat_attributes(@)
1175
1176This function handles the translation of the tags' attributes. It receives the tag
1177without the beginning / end marks, and then it finds the attributes, and it
1178translates the translatable ones (specified by the module option "attributes").
1179This returns a plain string with the translated tag.
1180
1181=back
1182
1183=cut
1184
1185sub treat_attributes {
1186my ($self,@tag)=@_;
1187
1188$tag[0] =~ /^(\S*)(.*)/s;
1189my $text = $1;
1190$tag[0] = $2;
1191
1192while (@tag) {
1193my $complete = 1;
1194
1195$text .= $self->skip_spaces(\@tag);
1196if (@tag) {
1197# Get the attribute's name
1198$complete = 0;
1199
1200$tag[0] =~ /^([^\s=]+)(.*)/s;
1201my $name = $1;
1202my $ref = $tag[1];
1203$tag[0] = $2;
1204$text .= $name;
1205$text .= $self->skip_spaces(\@tag);
1206if (@tag) {
1207# Get the '='
1208if ($tag[0] =~ /^=(.*)/s) {
1209$tag[0] = $1;
1210$text .= "=";
1211$text .= $self->skip_spaces(\@tag);
1212if (@tag) {
1213# Get the value
1214my $value="";
1215$ref=$tag[1];
1216my $quot=substr($tag[0],0,1);
1217if ($quot ne "\"" and $quot ne "'") {
1218# Unquoted value
1219$quot="";
1220$tag[0] =~ /^(\S+)(.*)/s;
1221$value = $1;
1222$tag[0] = $2;
1223} else {
1224# Quoted value
1225$text .= $quot;
1226$tag[0] =~ /^\Q$quot\E(.*)/s;
1227$tag[0] = $1;
1228while ($tag[0] !~ /\Q$quot\E/) {
1229$value .= $tag[0];
1230shift @tag;
1231shift @tag;
1232}
1233$tag[0] =~ /^(.*?)\Q$quot\E(.*)/s;
1234$value .= $1;
1235$tag[0] = $2;
1236}
1237$complete = 1;
1238if ($self->tag_in_list($self->get_path.$name,$self->{attributes})) {
1239$text .= $self->found_string($value, $ref, { type=>"attribute", attribute=>$name });
1240} else {
1241print wrap_ref_mod($ref, "po4a::xml", dgettext("po4a", "Content of attribute %s excluded: %s"), $self->get_path.$name, $value)
1242 if $self->debug();
1243$text .= $self->recode_skipped_text($value);
1244}
1245$text .= $quot;
1246}
1247}
1248}
1249
1250unless ($complete) {
1251my $ontagerror = $self->{options}{'ontagerror'};
1252if ($ontagerror eq "warn") {
1253warn wrap_ref_mod($ref, "po4a::xml", dgettext ("po4a", "Bad attribute syntax. Continuing..."));
1254} elsif ($ontagerror ne "silent") {
1255die wrap_ref_mod($ref, "po4a::xml", dgettext ("po4a", "Bad attribute syntax"));
1256}
1257}
1258}
1259}
1260return $text;
1261}
1262
1263# Returns an empty string if the content in the $path should not be
1264# translated.
1265#
1266# Otherwise, returns the set of options for translation:
1267# w: the content shall be re-wrapped
1268# W: the content shall not be re-wrapped
1269# i: the tag shall be inlined
1270# p: a placeholder shall replace the tag (and its content)
1271# n: a custom tag
1272#
1273# A translatable inline tag in an untranslated tag is treated as a translatable breaking tag.
1274sub get_translate_options {
1275my $self = shift;
1276my $path = shift;
1277
1278if (defined $translate_options_cache{$path}) {
1279return $translate_options_cache{$path};
1280}
1281
1282my $options = "";
1283my $translate = 0;
1284my $usedefault = 1;
1285
1286my $inlist = 0;
1287my $tag = $self->get_tag_from_list($path, $self->{tags});
1288if (defined $tag) {
1289$inlist = 1;
1290}
1291if ($self->{options}{'tagsonly'} eq $inlist) {
1292$usedefault = 0;
1293if (defined $tag) {
1294$options = $tag;
1295$options =~ s/<.*$//;
1296} else {
1297if ($self->{options}{'wrap'}) {
1298$options = "w";
1299} else {
1300$options = "W";
1301}
1302}
1303$translate = 1;
1304}
1305
1306# TODO: a less precise set of tags should not override a more precise one
1307# The tags and tagsonly options are deprecated.
1308# The translated and untranslated options have an higher priority.
1309$tag = $self->get_tag_from_list($path, $self->{translated});
1310if (defined $tag) {
1311$usedefault = 0;
1312$options = $tag;
1313$options =~ s/<.*$//;
1314$translate = 1;
1315}
1316
1317if ($translate and $options !~ m/w/i) {
1318$options .= ($self->{options}{'wrap'})?"w":"W";
1319}
1320
1321if (not defined $tag) {
1322$tag = $self->get_tag_from_list($path, $self->{untranslated});
1323if (defined $tag) {
1324$usedefault = 0;
1325$options = "";
1326$translate = 0;
1327}
1328}
1329
1330$tag = $self->get_tag_from_list($path, $self->{inline});
1331if (defined $tag) {
1332$usedefault = 0;
1333$options .= "i";
1334} else {
1335$tag = $self->get_tag_from_list($path, $self->{placeholder});
1336if (defined $tag) {
1337$usedefault = 0;
1338$options .= "p";
1339}
1340}
1341
1342$tag = $self->get_tag_from_list($path, $self->{customtag});
1343if (defined $tag) {
1344$usedefault = 0;
1345$options = "in"; # This erase any other setting
1346}
1347
1348if ($usedefault) {
1349$options = $self->{options}{'defaulttranslateoption'};
1350}
1351
1352# A translatable inline tag in an untranslated tag is treated as a
1353# translatable breaking tag.
1354if ($options =~ m/i/) {
1355my $ppath = $path;
1356$ppath =~ s/<[^>]*>$//;
1357my $poptions = $self->get_translate_options ($ppath);
1358if ($poptions eq "") {
1359$options =~ s/i//;
1360}
1361}
1362
1363if ($options =~ m/i/ and $self->{options}{'foldattributes'}) {
1364$options .= "f";
1365}
1366
1367$translate_options_cache{$path} = $options;
1368return $options;
1369}
1370
1371
1372# Return the tag (or biggest set of tags) of a list which matches with the
1373# given path.
1374#
1375# The tag (or set of tags) is returned with its options.
1376#
1377# If no tags could match the path, undef is returned.
1378sub get_tag_from_list ($$$) {
1379my ($self,$path,$list) = @_;
1380if ($self->{options}{'caseinsensitive'}) {
1381$path = lc $path;
1382}
1383
1384while (1) {
1385if (defined $list->{$path}) {
1386return $list->{$path}.$path;
1387}
1388last unless ($path =~ m/</);
1389$path =~ s/^<.*?>//;
1390}
1391
1392return undef;
1393}
1394
1395
1396
1397sub treat_content {
1398my $self = shift;
1399my $blank="";
1400# Indicates if the paragraph will have to be translated
1401my $translate = "";
1402
1403my ($eof,@paragraph)=$self->get_string_until('<',{remove=>1});
1404
1405while (!$eof and !$self->breaking_tag) {
1406NEXT_TAG:
1407my @text;
1408my $type = $self->tag_type;
1409my $f_extract = $tag_types[$type]->{'f_extract'};
1410if ( defined($f_extract)
1411 and $f_extract eq \&tag_extract_comment) {
1412# Remove the content of the comments
1413($eof, @text) = $self->extract_tag($type,1);
1414$text[$#text-1] .= "\0";
1415if ($tag_types[$type]->{'beginning'} eq "!--#") {
1416$text[0] = "#".$text[0];
1417}
1418push @comments, @text;
1419} else {
1420my ($tmpeof, @tag) = $self->extract_tag($type,0);
1421# Append the found inline tag
1422($eof,@text)=$self->get_string_until('>',
1423 {include=>1,
1424 remove=>1,
1425 unquoted=>1});
1426# Append or remove the opening/closing tag from
1427# the tag path
1428if ($tag_types[$type]->{'end'} eq "") {
1429if ($tag_types[$type]->{'beginning'} eq "") {
1430# Opening inline tag
1431my $cur_tag_name = $self->get_tag_name(@tag);
1432my $t_opts = $self->get_translate_options($self->get_path($cur_tag_name));
1433if ($t_opts =~ m/p/) {
1434# We enter a new holder.
1435# Append a <placeholder ...> tag to the current
1436# paragraph, and save the @paragraph in the
1437# current holder.
1438my $last_holder = $save_holders[$#save_holders];
1439my $placeholder_str = "<placeholder type=\"".$cur_tag_name."\" id=\"".($#{$last_holder->{'sub_translations'}}+1)."\"/>";
1440push @paragraph, ($placeholder_str, $text[1]);
1441my @saved_paragraph = @paragraph;
1442
1443$last_holder->{'paragraph'} = \@saved_paragraph;
1444
1445# Then we must push a new holder
1446my @new_paragraph = ();
1447my @sub_translations = ();
1448my %folded_attributes;
1449my %new_holder = ('paragraph' => \@new_paragraph,
1450 'open' => $self->join_lines(@text),
1451 'translation' => "",
1452 'close' => undef,
1453 'sub_translations' => \@sub_translations,
1454 'folded_attributes' => \%folded_attributes);
1455push @save_holders, \%new_holder;
1456@text = ();
1457
1458# The current @paragraph
1459# (for the current holder)
1460# is empty.
1461@paragraph = ();
1462} elsif ($t_opts =~ m/f/) {
1463my $tag_full = $self->join_lines(@text);
1464my $tag_ref = $text[1];
1465if ($tag_full =~ m/^<\s*\S+\s+\S.*>$/s) {
1466my $holder = $save_holders[$#save_holders];
1467my $id = 0;
1468foreach (keys %{$holder->{folded_attributes}}) {
1469$id = $_ + 1 if ($_ >= $id);
1470}
1471$holder->{folded_attributes}->{$id} = $tag_full;
1472
1473@text = ("<$cur_tag_name po4a-id=$id>", $tag_ref);
1474}
1475}
1476unless ($t_opts =~ m/n/) {
1477push @path, $cur_tag_name;
1478}
1479} elsif ($tag_types[$type]->{'beginning'} eq "/") {
1480# Closing inline tag
1481
1482# Check if this is closing the
1483# last opening tag we detected.
1484my $test = pop @path;
1485my $name = $self->get_tag_name(@tag);
1486if (!defined($test) ||
1487 $test ne $name ) {
1488my $ontagerror = $self->{options}{'ontagerror'};
1489if ($ontagerror eq "warn") {
1490warn wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong. Continuing..."), $name);
1491} elsif ($ontagerror ne "silent") {
1492die wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong."), $name);
1493}
1494}
1495
1496if ($self->get_translate_options($self->get_path($self->get_tag_name(@tag))) =~ m/p/) {
1497# This closes the current holder.
1498
1499push @path, $self->get_tag_name(@tag);
1500# Now translate this paragraph if needed.
1501# This will call pushline and append the
1502# translation to the current holder's translation.
1503$self->translate_paragraph(@paragraph);
1504pop @path;
1505
1506# Now that this holder is closed, we can remove
1507# the holder from the stack.
1508my $holder = pop @save_holders;
1509# We need to keep the translation of this holder
1510my $translation = $holder->{'open'}.$holder->{'translation'};
1511$translation .= $self->join_lines(@text);
1512
1513@text = ();
1514
1515# Then we store the translation in the previous
1516# holder's sub_translations array
1517my $previous_holder = $save_holders[$#save_holders];
1518push @{$previous_holder->{'sub_translations'}}, $translation;
1519# We also need to restore the @paragraph array, as
1520# it was before we encountered the holder.
1521@paragraph = @{$previous_holder->{'paragraph'}};
1522}
1523}
1524}
1525push @paragraph, @text;
1526}
1527
1528# Next tag
1529($eof,@text)=$self->get_string_until('<',{remove=>1});
1530if ($#text > 0) {
1531# Check if text (extracted after the inline tag)
1532# has to be translated
1533push @paragraph, @text;
1534}
1535}
1536
1537# This strips the extracted strings
1538# (only if you don't specify the 'nostrip' option, and if the
1539# paragraph can be re-wrapped)
1540$translate = $self->get_translate_options($self->get_path);
1541if (!$self->{options}{'nostrip'} and $translate !~ m/W/) {
1542my $clean = 0;
1543# Clean the beginning
1544while (!$clean and $#paragraph > 0) {
1545$paragraph[0] =~ /^(\s*)(.*)/s;
1546my $match = $1;
1547if ($paragraph[0] eq $match) {
1548if ($match ne "") {
1549$self->pushline($match);
1550}
1551shift @paragraph;
1552shift @paragraph;
1553} else {
1554$paragraph[0] = $2;
1555if ($match ne "") {
1556$self->pushline($match);
1557}
1558$clean = 1;
1559}
1560}
1561$clean = 0;
1562# Clean the end
1563while (!$clean and $#paragraph > 0) {
1564$paragraph[$#paragraph-1] =~ /^(.*?)(\s*)$/s;
1565my $match = $2;
1566if ($paragraph[$#paragraph-1] eq $match) {
1567if ($match ne "") {
1568$blank = $match.$blank;
1569}
1570pop @paragraph;
1571pop @paragraph;
1572} else {
1573$paragraph[$#paragraph-1] = $1;
1574if ($match ne "") {
1575$blank = $match.$blank;
1576}
1577$clean = 1;
1578}
1579}
1580}
1581
1582# Translate the string when needed
1583# This will either push the translation in the translated document or
1584# in the current holder translation.
1585$self->translate_paragraph(@paragraph);
1586
1587# Push the trailing blanks
1588if ($blank ne "") {
1589$self->pushline($blank);
1590}
1591return $eof;
1592}
1593
1594# Translate a @paragraph array of (string, reference).
1595# The $translate argument indicates if the strings must be translated or
1596# just pushed
1597sub translate_paragraph {
1598my $self = shift;
1599my @paragraph = @_;
1600my $translate = $self->get_translate_options($self->get_path);
1601
1602while ( (scalar @paragraph)
1603 and ($paragraph[0] =~ m/^\s*\n/s)) {
1604$self->pushline($paragraph[0]);
1605shift @paragraph;
1606shift @paragraph;
1607}
1608
1609my $comments;
1610while (@comments) {
1611my ($comment,$eoc);
1612do {
1613my ($t,$l) = (shift @comments, shift @comments);
1614$t =~ s/\n?(\0)?$//;
1615$eoc = $1;
1616$comment .= "\n" if defined $comment;
1617$comment .= $t;
1618} until ($eoc);
1619$comments .= "\n" if defined $comments;
1620$comments .= $comment;
1621$self->pushline("<!--".$comment."-->\n") if defined $comment;
1622}
1623@comments = ();
1624
1625if ($self->{options}{'cpp'}) {
1626my @tmp = @paragraph;
1627@paragraph = ();
1628while (@tmp) {
1629my ($t,$l) = (shift @tmp, shift @tmp);
1630# #include can be followed by a filename between
1631# <> brackets. In that case, the argument won't be
1632# handled in the same call to translate_paragraph.
1633# Thus do not try to match "include ".
1634if ($t =~ m/^#[ \t]*(if |endif|undef |include|else|ifdef |ifndef |define )/si) {
1635if (@paragraph) {
1636$self->translate_paragraph(@paragraph);
1637@paragraph = ();
1638$self->pushline("\n");
1639}
1640$self->pushline($t);
1641} else {
1642push @paragraph, ($t,$l);
1643}
1644}
1645}
1646
1647my $para = $self->join_lines(@paragraph);
1648if ( length($para) > 0 ) {
1649if ($translate ne "") {
1650# This tag should be translated
1651$self->pushline($self->found_string(
1652$para,
1653$paragraph[1], {
1654type=>"tag",
1655tag_options=>$translate,
1656comments=>$comments
1657}));
1658} else {
1659# Inform that this tag isn't translated in debug mode
1660print wrap_ref_mod($paragraph[1], "po4a::xml", dgettext ("po4a", "Content of tag %s excluded: %s"), $self->get_path, $para)
1661 if $self->debug();
1662$self->pushline($self->recode_skipped_text($para));
1663}
1664}
1665# Now the paragraph is fully translated.
1666# If we have all the holders' translation, we can replace the
1667# placeholders by their translations.
1668# We must wait to have all the translations because the holders are
1669# numbered.
1670{
1671my $holder = $save_holders[$#save_holders];
1672my $translation = $holder->{'translation'};
1673
1674# Count the number of <placeholder ...> in $translation
1675my $count = 0;
1676my $str = $translation;
1677while ( (defined $str)
1678 and ($str =~ m/^.*?<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>(.*)$/s)) {
1679$count += 1;
1680$str = $2;
1681if ($holder->{'sub_translations'}->[$1] =~ m/<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>/s) {
1682$count = -1;
1683last;
1684}
1685}
1686
1687if ( (defined $translation)
1688 and (scalar(@{$holder->{'sub_translations'}}) == $count)) {
1689# OK, all the holders of the current paragraph are
1690# closed (and translated).
1691# Replace them by their translation.
1692while ($translation =~ m/^(.*?)<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>(.*)$/s) {
1693# FIXME: we could also check that
1694# * the holder exists
1695# * all the holders are used
1696$translation = $1.$holder->{'sub_translations'}->[$2].$3;
1697}
1698# We have our translation
1699$holder->{'translation'} = $translation;
1700# And there is no need for any holder in it.
1701my @sub_translations = ();
1702$holder->{'sub_translations'} = \@sub_translations;
1703}
1704}
1705
1706}
1707
1708
1709
1710=head2 WORKING WITH THE MODULE OPTIONS
1711
1712=over 4
1713
1714=item treat_options()
1715
1716This function fills the internal structures that contain the tags, attributes
1717and inline data with the options of the module (specified in the command-line
1718or in the initialize function).
1719
1720=back
1721
1722=cut
1723
1724sub treat_options {
1725my $self = shift;
1726
1727if ($self->{options}{'caseinsensitive'}) {
1728$self->{options}{'nodefault'} = lc $self->{options}{'nodefault'};
1729$self->{options}{'tags'} = lc $self->{options}{'tags'};
1730$self->{options}{'break'} = lc $self->{options}{'break'};
1731$self->{options}{'_default_break'} = lc $self->{options}{'_default_break'};
1732$self->{options}{'translated'} = lc $self->{options}{'translated'};
1733$self->{options}{'_default_translated'} = lc $self->{options}{'_default_translated'};
1734$self->{options}{'untranslated'} = lc $self->{options}{'untranslated'};
1735$self->{options}{'_default_untranslated'} = lc $self->{options}{'_default_untranslated'};
1736$self->{options}{'attributes'} = lc $self->{options}{'attributes'};
1737$self->{options}{'_default_attributes'} = lc $self->{options}{'_default_attributes'};
1738$self->{options}{'inline'} = lc $self->{options}{'inline'};
1739$self->{options}{'_default_inline'} = lc $self->{options}{'_default_inline'};
1740$self->{options}{'placeholder'} = lc $self->{options}{'placeholder'};
1741$self->{options}{'_default_placeholder'} = lc $self->{options}{'_default_placeholder'};
1742$self->{options}{'customtag'} = lc $self->{options}{'customtag'};
1743$self->{options}{'_default_customtag'} = lc $self->{options}{'_default_customtag'};
1744}
1745
1746$self->{options}{'nodefault'} =~ /^\s*(.*)\s*$/s;
1747my %list_nodefault;
1748foreach (split(/\s+/s,$1)) {
1749$list_nodefault{$_} = 1;
1750}
1751$self->{nodefault} = \%list_nodefault;
1752
1753$self->{options}{'tags'} =~ /^\s*(.*)\s*$/s;
1754if (length $self->{options}{'tags'}) {
1755warn wrap_mod("po4a::xml",
1756 dgettext("po4a",
1757 "The '%s' option is deprecated. Please use the translated/untranslated and/or break/inline/placeholder categories."), "tags");
1758}
1759foreach (split(/\s+/s,$1)) {
1760$_ =~ m/^(.*?)(<.*)$/;
1761$self->{tags}->{$2} = $1 || "";
1762}
1763
1764if ($self->{options}{'tagsonly'}) {
1765warn wrap_mod("po4a::xml",
1766 dgettext("po4a",
1767 "The '%s' option is deprecated. Please use the translated/untranslated and/or break/inline/placeholder categories."), "tagsonly");
1768}
1769
1770$self->{options}{'break'} =~ /^\s*(.*)\s*$/s;
1771foreach my $tag (split(/\s+/s,$1)) {
1772$tag =~ m/^(.*?)(<.*)$/;
1773$self->{break}->{$2} = $1 || "";
1774}
1775$self->{options}{'_default_break'} =~ /^\s*(.*)\s*$/s;
1776foreach my $tag (split(/\s+/s,$1)) {
1777$tag =~ m/^(.*?)(<.*)$/;
1778$self->{break}->{$2} = $1 || ""
1779unless $list_nodefault{$2}
1780 or defined $self->{break}->{$2};
1781}
1782
1783$self->{options}{'translated'} =~ /^\s*(.*)\s*$/s;
1784foreach my $tag (split(/\s+/s,$1)) {
1785$tag =~ m/^(.*?)(<.*)$/;
1786$self->{translated}->{$2} = $1 || "";
1787}
1788$self->{options}{'_default_translated'} =~ /^\s*(.*)\s*$/s;
1789foreach my $tag (split(/\s+/s,$1)) {
1790$tag =~ m/^(.*?)(<.*)$/;
1791$self->{translated}->{$2} = $1 || ""
1792unless $list_nodefault{$2}
1793 or defined $self->{translated}->{$2};
1794}
1795
1796$self->{options}{'untranslated'} =~ /^\s*(.*)\s*$/s;
1797foreach my $tag (split(/\s+/s,$1)) {
1798$tag =~ m/^(.*?)(<.*)$/;
1799$self->{untranslated}->{$2} = $1 || "";
1800}
1801$self->{options}{'_default_untranslated'} =~ /^\s*(.*)\s*$/s;
1802foreach my $tag (split(/\s+/s,$1)) {
1803$tag =~ m/^(.*?)(<.*)$/;
1804$self->{untranslated}->{$2} = $1 || ""
1805unless $list_nodefault{$2}
1806 or defined $self->{untranslated}->{$2};
1807}
1808
1809$self->{options}{'attributes'} =~ /^\s*(.*)\s*$/s;
1810foreach my $tag (split(/\s+/s,$1)) {
1811if ($tag =~ m/^(.*?)(<.*)$/) {
1812$self->{attributes}->{$2} = $1 || "";
1813} else {
1814$self->{attributes}->{$tag} = "";
1815}
1816}
1817$self->{options}{'_default_attributes'} =~ /^\s*(.*)\s*$/s;
1818foreach my $tag (split(/\s+/s,$1)) {
1819if ($tag =~ m/^(.*?)(<.*)$/) {
1820$self->{attributes}->{$2} = $1 || ""
1821unless $list_nodefault{$2}
1822 or defined $self->{attributes}->{$2};
1823} else {
1824$self->{attributes}->{$tag} = ""
1825unless $list_nodefault{$tag}
1826 or defined $self->{attributes}->{$tag};
1827}
1828}
1829
1830$self->{options}{'inline'} =~ /^\s*(.*)\s*$/s;
1831foreach my $tag (split(/\s+/s,$1)) {
1832$tag =~ m/^(.*?)(<.*)$/;
1833$self->{inline}->{$2} = $1 || "";
1834}
1835$self->{options}{'_default_inline'} =~ /^\s*(.*)\s*$/s;
1836foreach my $tag (split(/\s+/s,$1)) {
1837$tag =~ m/^(.*?)(<.*)$/;
1838$self->{inline}->{$2} = $1 || ""
1839unless $list_nodefault{$2}
1840 or defined $self->{inline}->{$2};
1841}
1842
1843$self->{options}{'placeholder'} =~ /^\s*(.*)\s*$/s;
1844foreach my $tag (split(/\s+/s,$1)) {
1845$tag =~ m/^(.*?)(<.*)$/;
1846$self->{placeholder}->{$2} = $1 || "";
1847}
1848$self->{options}{'_default_placeholder'} =~ /^\s*(.*)\s*$/s;
1849foreach my $tag (split(/\s+/s,$1)) {
1850$tag =~ m/^(.*?)(<.*)$/;
1851$self->{placeholder}->{$2} = $1 || ""
1852unless $list_nodefault{$2}
1853 or defined $self->{placeholder}->{$2};
1854}
1855
1856$self->{options}{'customtag'} =~ /^\s*(.*)\s*$/s;
1857foreach my $tag (split(/\s+/s,$1)) {
1858$tag =~ m/^(.*?)(<.*)$/;
1859$self->{customtag}->{$2} = $1 || "";
1860}
1861$self->{options}{'_default_customtag'} =~ /^\s*(.*)\s*$/s;
1862foreach my $tag (split(/\s+/s,$1)) {
1863$tag =~ m/^(.*?)(<.*)$/;
1864$self->{customtag}->{$2} = $1 || ""
1865unless $list_nodefault{$2}
1866 or defined $self->{customtag}->{$2};
1867}
1868
1869# There should be no translated and untranslated tags
1870foreach my $tag (keys %{$self->{translated}}) {
1871die wrap_mod("po4a::xml",
1872 dgettext("po4a",
1873 "Tag '%s' both in the %s and %s categories."), $tag, "translated", "untranslated")
1874if defined $self->{untranslated}->{$tag};
1875}
1876# There should be no inline, break, placeholder, and customtag tags
1877foreach my $tag (keys %{$self->{inline}}) {
1878die wrap_mod("po4a::xml",
1879 dgettext("po4a",
1880 "Tag '%s' both in the %s and %s categories."), $tag, "inline", "break")
1881if defined $self->{break}->{$tag};
1882die wrap_mod("po4a::xml",
1883 dgettext("po4a",
1884 "Tag '%s' both in the %s and %s categories."), $tag, "inline", "placeholder")
1885if defined $self->{placeholder}->{$tag};
1886die wrap_mod("po4a::xml",
1887 dgettext("po4a",
1888 "Tag '%s' both in the %s and %s categories."), $tag, "inline", "customtag")
1889if defined $self->{customtag}->{$tag};
1890}
1891foreach my $tag (keys %{$self->{break}}) {
1892die wrap_mod("po4a::xml",
1893 dgettext("po4a",
1894 "Tag '%s' both in the %s and %s categories."), $tag, "break", "placeholder")
1895if defined $self->{placeholder}->{$tag};
1896die wrap_mod("po4a::xml",
1897 dgettext("po4a",
1898 "Tag '%s' both in the %s and %s categories."), $tag, "break", "customtag")
1899if defined $self->{customtag}->{$tag};
1900}
1901foreach my $tag (keys %{$self->{placeholder}}) {
1902die wrap_mod("po4a::xml",
1903 dgettext("po4a",
1904 "Tag '%s' both in the %s and %s categories."), $tag, "placeholder", "customtag")
1905if defined $self->{customtag}->{$tag};
1906}
1907}
1908
1909=head2 GETTING TEXT FROM THE INPUT DOCUMENT
1910
1911=over
1912
1913=item get_string_until($%)
1914
1915This function returns an array with the lines (and references) from the input
1916document until it finds the first argument. The second argument is an options
1917hash. Value 0 means disabled (the default) and 1, enabled.
1918
1919The valid options are:
1920
1921=over 4
1922
1923=item B<include>
1924
1925This makes the returned array to contain the searched text
1926
1927=item B<remove>
1928
1929This removes the returned stream from the input
1930
1931=item B<unquoted>
1932
1933This ensures that the searched text is outside any quotes
1934
1935=back
1936
1937=cut
1938
1939sub get_string_until {
1940my ($self,$search) = (shift,shift);
1941my $options = shift;
1942my ($include,$remove,$unquoted, $regex) = (0,0,0,0);
1943
1944if (defined($options->{include})) { $include = $options->{include}; }
1945if (defined($options->{remove})) { $remove = $options->{remove}; }
1946if (defined($options->{unquoted})) { $unquoted = $options->{unquoted}; }
1947if (defined($options->{regex})) { $regex = $options->{regex}; }
1948
1949my ($line,$ref) = $self->shiftline();
1950my (@text,$paragraph);
1951my ($eof,$found) = (0,0);
1952
1953$search = "\Q$search\E" unless $regex;
1954while (defined($line) and !$found) {
1955push @text, ($line,$ref);
1956$paragraph .= $line;
1957if ($unquoted) {
1958if ( $paragraph =~ /^((\".*?\")|(\'.*?\')|[^\"\'])*$search/s ) {
1959$found = 1;
1960}
1961} else {
1962if ( $paragraph =~ /$search/s ) {
1963$found = 1;
1964}
1965}
1966if (!$found) {
1967($line,$ref)=$self->shiftline();
1968}
1969}
1970
1971if (!defined($line)) { $eof = 1; }
1972
1973if ( $found ) {
1974$line = "";
1975if($unquoted) {
1976$paragraph =~ /^(?:(?:\".*?\")|(?:\'.*?\')|[^\"\'])*?$search(.*)$/s;
1977$line = $1;
1978$text[$#text-1] =~ s/\Q$line\E$//s;
1979} else {
1980$paragraph =~ /$search(.*)$/s;
1981$line = $1;
1982$text[$#text-1] =~ s/\Q$line\E$//s;
1983}
1984if(!$include) {
1985$text[$#text-1] =~ /^(.*)($search.*)$/s;
1986$text[$#text-1] = $1;
1987$line = $2.$line;
1988}
1989if (defined($line) and ($line ne "")) {
1990$self->unshiftline ($line,$text[$#text]);
1991}
1992}
1993if (!$remove) {
1994$self->unshiftline (@text);
1995}
1996
1997#If we get to the end of the file, we return the whole paragraph
1998return ($eof,@text);
1999}
2000
2001=item skip_spaces(\@)
2002
2003This function receives as argument the reference to a paragraph (in the format
2004returned by get_string_until), skips his heading spaces and returns them as
2005a simple string.
2006
2007=cut
2008
2009sub skip_spaces {
2010my ($self,$pstring)=@_;
2011my $space="";
2012
2013while (@$pstring and (@$pstring[0] =~ /^(\s+)(.*)$/s or @$pstring[0] eq "")) {
2014if (@$pstring[0] ne "") {
2015$space .= $1;
2016@$pstring[0] = $2;
2017}
2018
2019if (@$pstring[0] eq "") {
2020shift @$pstring;
2021shift @$pstring;
2022}
2023}
2024return $space;
2025}
2026
2027=item join_lines(@)
2028
2029This function returns a simple string with the text from the argument array
2030(discarding the references).
2031
2032=cut
2033
2034sub join_lines {
2035my ($self,@lines)=@_;
2036my ($line,$ref);
2037my $text = "";
2038while ($#lines > 0) {
2039($line,$ref) = (shift @lines,shift @lines);
2040$text .= $line;
2041}
2042return $text;
2043}
2044
2045=back
2046
2047=head1 STATUS OF THIS MODULE
2048
2049This module can translate tags and attributes.
2050
2051=head1 TODO LIST
2052
2053DOCTYPE (ENTITIES)
2054
2055There is a minimal support for the translation of entities. They are
2056translated as a whole, and tags are not taken into account. Multilines
2057entities are not supported and entities are always rewrapped during the
2058translation.
2059
2060MODIFY TAG TYPES FROM INHERITED MODULES
2061(move the tag_types structure inside the $self hash?)
2062
2063=head1 SEE ALSO
2064
2065L<Locale::Po4a::TransTractor(3pm)|Locale::Po4a::TransTractor>,
2066L<po4a(7)|po4a.7>
2067
2068=head1 AUTHORS
2069
2070 Jordi Vilalta <jvprat@gmail.com>
2071 Nicolas François <nicolas.francois@centraliens.net>
2072
2073=head1 COPYRIGHT AND LICENSE
2074
2075 Copyright (c) 2004 by Jordi Vilalta <jvprat@gmail.com>
2076 Copyright (c) 2008-2009 by Nicolas François <nicolas.francois@centraliens.net>
2077
2078This program is free software; you may redistribute it and/or modify it
2079under the terms of GPL (see the COPYING file).
2080
2081=cut
2082
20831;
2084

Archive Download this file

Revision: 2790