1 | #!/usr/bin/perl -w␊ |
2 | ␊ |
3 | require Exporter;␊ |
4 | ␊ |
5 | package Locale::Po4a::TransTractor;␊ |
6 | use DynaLoader;␊ |
7 | ␊ |
8 | use 5.006;␊ |
9 | use strict;␊ |
10 | use warnings;␊ |
11 | ␊ |
12 | use subs qw(makespace);␊ |
13 | use vars qw($VERSION @ISA @EXPORT);␊ |
14 | $VERSION="0.44";␊ |
15 | @ISA = qw(DynaLoader);␊ |
16 | @EXPORT = qw(new process translate␊ |
17 | read write readpo writepo␊ |
18 | getpoout setpoout get_out_charset);␊ |
19 | ␊ |
20 | # Try to use a C extension if present.␊ |
21 | eval("bootstrap Locale::Po4a::TransTractor $VERSION");␊ |
22 | ␊ |
23 | use Carp qw(croak);␊ |
24 | use Locale::Po4a::Po;␊ |
25 | use Locale::Po4a::Common;␊ |
26 | ␊ |
27 | use File::Path; # mkdir before write␊ |
28 | ␊ |
29 | use Encode;␊ |
30 | use Encode::Guess;␊ |
31 | ␊ |
32 | =encoding UTF-8␊ |
33 | ␊ |
34 | =head1 NAME␊ |
35 | ␊ |
36 | Locale::Po4a::TransTractor - generic trans(lator ex)tractor.␊ |
37 | ␊ |
38 | =head1 DESCRIPTION␊ |
39 | ␊ |
40 | The po4a (PO for anything) project goal is to ease translations (and more␊ |
41 | interestingly, the maintenance of translations) using gettext tools on␊ |
42 | areas where they were not expected like documentation.␊ |
43 | ␊ |
44 | This class is the ancestor of every po4a parser used to parse a document, to␊ |
45 | search translatable strings, to extract them to a PO file and to replace them by␊ |
46 | their translation in the output document.␊ |
47 | ␊ |
48 | More formally, it takes the following arguments as input:␊ |
49 | ␊ |
50 | =over 2␊ |
51 | ␊ |
52 | =item -␊ |
53 | ␊ |
54 | a document to translate;␊ |
55 | ␊ |
56 | =item -␊ |
57 | ␊ |
58 | a PO file containing the translations to use.␊ |
59 | ␊ |
60 | =back␊ |
61 | ␊ |
62 | As output, it produces:␊ |
63 | ␊ |
64 | =over 2␊ |
65 | ␊ |
66 | =item -␊ |
67 | ␊ |
68 | another PO file, resulting of the extraction of translatable strings from␊ |
69 | the input document;␊ |
70 | ␊ |
71 | =item -␊ |
72 | ␊ |
73 | a translated document, with the same structure than the one in input, but␊ |
74 | with all translatable strings replaced with the translations found in the␊ |
75 | PO file provided in input.␊ |
76 | ␊ |
77 | =back␊ |
78 | ␊ |
79 | Here is a graphical representation of this:␊ |
80 | ␊ |
81 | Input document --\ /---> Output document␊ |
82 | \ / (translated)␊ |
83 | +-> parse() function -----+␊ |
84 | / \␊ |
85 | Input PO --------/ \---> Output PO␊ |
86 | (extracted)␊ |
87 | ␊ |
88 | =head1 FUNCTIONS YOUR PARSER SHOULD OVERRIDE␊ |
89 | ␊ |
90 | =over 4␊ |
91 | ␊ |
92 | =item parse()␊ |
93 | ␊ |
94 | This is where all the work takes place: the parsing of input documents, the␊ |
95 | generation of output, and the extraction of the translatable strings. This␊ |
96 | is pretty simple using the provided functions presented in the section␊ |
97 | B<INTERNAL FUNCTIONS> below. See also the B<SYNOPSIS>, which presents an␊ |
98 | example.␊ |
99 | ␊ |
100 | This function is called by the process() function below, but if you choose␊ |
101 | to use the new() function, and to add content manually to your document,␊ |
102 | you will have to call this function yourself.␊ |
103 | ␊ |
104 | =item docheader()␊ |
105 | ␊ |
106 | This function returns the header we should add to the produced document,␊ |
107 | quoted properly to be a comment in the target language. See the section␊ |
108 | B<Educating developers about translations>, from L<po4a(7)|po4a.7>, for what␊ |
109 | it is good for.␊ |
110 | ␊ |
111 | =back␊ |
112 | ␊ |
113 | =cut␊ |
114 | ␊ |
115 | sub docheader {}␊ |
116 | ␊ |
117 | sub parse {}␊ |
118 | ␊ |
119 | =head1 SYNOPSIS␊ |
120 | ␊ |
121 | The following example parses a list of paragraphs beginning with "<p>". For the sake␊ |
122 | of simplicity, we assume that the document is well formatted, i.e. that '<p>'␊ |
123 | tags are the only tags present, and that this tag is at the very beginning␊ |
124 | of each paragraph.␊ |
125 | ␊ |
126 | sub parse {␊ |
127 | my $self = shift;␊ |
128 | ␊ |
129 | PARAGRAPH: while (1) {␊ |
130 | my ($paragraph,$pararef)=("","");␊ |
131 | my $first=1;␊ |
132 | my ($line,$lref)=$self->shiftline();␊ |
133 | while (defined($line)) {␊ |
134 | if ($line =~ m/<p>/ && !$first--; ) {␊ |
135 | # Not the first time we see <p>.␊ |
136 | # Reput the current line in input,␊ |
137 | # and put the built paragraph to output␊ |
138 | $self->unshiftline($line,$lref);␊ |
139 | ␊ |
140 | # Now that the document is formed, translate it:␊ |
141 | # - Remove the leading tag␊ |
142 | $paragraph =~ s/^<p>//s;␊ |
143 | ␊ |
144 | # - push to output the leading tag (untranslated) and the␊ |
145 | # rest of the paragraph (translated)␊ |
146 | $self->pushline( "<p>"␊ |
147 | . $document->translate($paragraph,$pararef)␊ |
148 | );␊ |
149 | ␊ |
150 | next PARAGRAPH;␊ |
151 | } else {␊ |
152 | # Append to the paragraph␊ |
153 | $paragraph .= $line;␊ |
154 | $pararef = $lref unless(length($pararef));␊ |
155 | }␊ |
156 | ␊ |
157 | # Reinit the loop␊ |
158 | ($line,$lref)=$self->shiftline();␊ |
159 | }␊ |
160 | # Did not get a defined line? End of input file.␊ |
161 | return;␊ |
162 | }␊ |
163 | }␊ |
164 | ␊ |
165 | Once you've implemented the parse function, you can use your document␊ |
166 | class, using the public interface presented in the next section.␊ |
167 | ␊ |
168 | =head1 PUBLIC INTERFACE for scripts using your parser␊ |
169 | ␊ |
170 | =head2 Constructor␊ |
171 | ␊ |
172 | =over 4␊ |
173 | ␊ |
174 | =item process(%)␊ |
175 | ␊ |
176 | This function can do all you need to do with a po4a document in one␊ |
177 | invocation. Its arguments must be packed as a hash. ACTIONS:␊ |
178 | ␊ |
179 | =over 3␊ |
180 | ␊ |
181 | =item a.␊ |
182 | ␊ |
183 | Reads all the PO files specified in po_in_name␊ |
184 | ␊ |
185 | =item b.␊ |
186 | ␊ |
187 | Reads all original documents specified in file_in_name␊ |
188 | ␊ |
189 | =item c.␊ |
190 | ␊ |
191 | Parses the document␊ |
192 | ␊ |
193 | =item d.␊ |
194 | ␊ |
195 | Reads and applies all the addenda specified␊ |
196 | ␊ |
197 | =item e.␊ |
198 | ␊ |
199 | Writes the translated document to file_out_name (if given)␊ |
200 | ␊ |
201 | =item f.␊ |
202 | ␊ |
203 | Writes the extracted PO file to po_out_name (if given)␊ |
204 | ␊ |
205 | =back␊ |
206 | ␊ |
207 | ARGUMENTS, beside the ones accepted by new() (with expected type):␊ |
208 | ␊ |
209 | =over 4␊ |
210 | ␊ |
211 | =item file_in_name (@)␊ |
212 | ␊ |
213 | List of filenames where we should read the input document.␊ |
214 | ␊ |
215 | =item file_in_charset ($)␊ |
216 | ␊ |
217 | Charset used in the input document (if it isn't specified, it will try␊ |
218 | to detect it from the input document).␊ |
219 | ␊ |
220 | =item file_out_name ($)␊ |
221 | ␊ |
222 | Filename where we should write the output document.␊ |
223 | ␊ |
224 | =item file_out_charset ($)␊ |
225 | ␊ |
226 | Charset used in the output document (if it isn't specified, it will use␊ |
227 | the PO file charset).␊ |
228 | ␊ |
229 | =item po_in_name (@)␊ |
230 | ␊ |
231 | List of filenames where we should read the input PO files from, containing␊ |
232 | the translation which will be used to translate the document.␊ |
233 | ␊ |
234 | =item po_out_name ($)␊ |
235 | ␊ |
236 | Filename where we should write the output PO file, containing the strings␊ |
237 | extracted from the input document.␊ |
238 | ␊ |
239 | =item addendum (@)␊ |
240 | ␊ |
241 | List of filenames where we should read the addenda from.␊ |
242 | ␊ |
243 | =item addendum_charset ($)␊ |
244 | ␊ |
245 | Charset for the addenda.␊ |
246 | ␊ |
247 | =back␊ |
248 | ␊ |
249 | =item new(%)␊ |
250 | ␊ |
251 | Create a new po4a document. Accepted options (but be in a hash):␊ |
252 | ␊ |
253 | =over 4␊ |
254 | ␊ |
255 | =item verbose ($)␊ |
256 | ␊ |
257 | Sets the verbosity.␊ |
258 | ␊ |
259 | =item debug ($)␊ |
260 | ␊ |
261 | Sets the debugging.␊ |
262 | ␊ |
263 | =back␊ |
264 | ␊ |
265 | =cut␊ |
266 | ␊ |
267 | sub process {␊ |
268 | ## Determine if we were called via an object-ref or a classname␊ |
269 | my $self = shift;␊ |
270 | ␊ |
271 | ## Any remaining arguments are treated as initial values for the␊ |
272 | ## hash that is used to represent this object.␊ |
273 | my %params = @_;␊ |
274 | ␊ |
275 | # Build the args for new()␊ |
276 | my %newparams = ();␊ |
277 | foreach (keys %params) {␊ |
278 | next if ($_ eq 'po_in_name' ||␊ |
279 | $_ eq 'po_out_name' ||␊ |
280 | $_ eq 'file_in_name' ||␊ |
281 | $_ eq 'file_in_charset' ||␊ |
282 | $_ eq 'file_out_name' ||␊ |
283 | $_ eq 'file_out_charset' ||␊ |
284 | $_ eq 'addendum' ||␊ |
285 | $_ eq 'addendum_charset');␊ |
286 | $newparams{$_}=$params{$_};␊ |
287 | }␊ |
288 | ␊ |
289 | $self->detected_charset($params{'file_in_charset'});␊ |
290 | $self->{TT}{'file_out_charset'}=$params{'file_out_charset'};␊ |
291 | if (defined($self->{TT}{'file_out_charset'}) and␊ |
292 | length($self->{TT}{'file_out_charset'})) {␊ |
293 | $self->{TT}{'file_out_encoder'} = find_encoding($self->{TT}{'file_out_charset'});␊ |
294 | }␊ |
295 | $self->{TT}{'addendum_charset'}=$params{'addendum_charset'};␊ |
296 | ␊ |
297 | chdir $params{'srcdir'}␊ |
298 | if (defined $params{'srcdir'});␊ |
299 | foreach my $file (@{$params{'po_in_name'}}) {␊ |
300 | print STDERR "readpo($file)... " if $self->debug();␊ |
301 | $self->readpo($file);␊ |
302 | print STDERR "done.\n" if $self->debug()␊ |
303 | }␊ |
304 | foreach my $file (@{$params{'file_in_name'}}) {␊ |
305 | print STDERR "read($file)..." if $self->debug();␊ |
306 | $self->read($file);␊ |
307 | print STDERR "done.\n" if $self->debug();␊ |
308 | }␊ |
309 | print STDERR "parse..." if $self->debug();␊ |
310 | $self->parse();␊ |
311 | print STDERR "done.\n" if $self->debug();␊ |
312 | foreach my $file (@{$params{'addendum'}}) {␊ |
313 | print STDERR "addendum($file)..." if $self->debug();␊ |
314 | $self->addendum($file) || die "An addendum failed\n";␊ |
315 | print STDERR "done.\n" if $self->debug();␊ |
316 | }␊ |
317 | chdir $params{'destdir'}␊ |
318 | if (defined $params{'destdir'});␊ |
319 | if (defined $params{'file_out_name'}) {␊ |
320 | print STDERR "write(".$params{'file_out_name'}.")... "␊ |
321 | if $self->debug();␊ |
322 | $self->write($params{'file_out_name'});␊ |
323 | print STDERR "done.\n" if $self->debug();␊ |
324 | }␊ |
325 | chdir $params{'srcdir'}␊ |
326 | if (defined $params{'srcdir'});␊ |
327 | if (defined $params{'po_out_name'}) {␊ |
328 | print STDERR "writepo(".$params{'po_out_name'}.")... "␊ |
329 | if $self->debug();␊ |
330 | $self->writepo($params{'po_out_name'});␊ |
331 | print STDERR "done.\n" if $self->debug();␊ |
332 | }␊ |
333 | chdir $params{'calldir'}␊ |
334 | if (defined $params{'calldir'});␊ |
335 | return $self;␊ |
336 | }␊ |
337 | ␊ |
338 | sub new {␊ |
339 | ## Determine if we were called via an object-ref or a classname␊ |
340 | my $this = shift;␊ |
341 | my $class = ref($this) || $this;␊ |
342 | my $self = { };␊ |
343 | my %options=@_;␊ |
344 | ## Bless ourselves into the desired class and perform any initialization␊ |
345 | bless $self, $class;␊ |
346 | ␊ |
347 | ## initialize the plugin␊ |
348 | # prevent the plugin from croaking on the options intended for Po.pm␊ |
349 | $self->{options}{'porefs'} = '';␊ |
350 | $self->{options}{'copyright-holder'} = '';␊ |
351 | $self->{options}{'msgid-bugs-address'} = '';␊ |
352 | $self->{options}{'package-name'} = '';␊ |
353 | $self->{options}{'package-version'} = '';␊ |
354 | # let the plugin parse the options and such␊ |
355 | $self->initialize(%options);␊ |
356 | ␊ |
357 | ## Create our private data␊ |
358 | my %po_options;␊ |
359 | $po_options{'porefs'} = $self->{options}{'porefs'};␊ |
360 | $po_options{'copyright-holder'} = $options{'copyright-holder'};␊ |
361 | $po_options{'msgid-bugs-address'} = $options{'msgid-bugs-address'};␊ |
362 | $po_options{'package-name'} = $options{'package-name'};␊ |
363 | $po_options{'package-version'} = $options{'package-version'};␊ |
364 | ␊ |
365 | # private data␊ |
366 | $self->{TT}=();␊ |
367 | $self->{TT}{po_in}=Locale::Po4a::Po->new(\%po_options);␊ |
368 | $self->{TT}{po_out}=Locale::Po4a::Po->new(\%po_options);␊ |
369 | # Warning, this is an array of array:␊ |
370 | # The document is splited on lines, and for each␊ |
371 | # [0] is the line content, [1] is the reference [2] the type␊ |
372 | $self->{TT}{doc_in}=();␊ |
373 | $self->{TT}{doc_out}=();␊ |
374 | if (defined $options{'verbose'}) {␊ |
375 | $self->{TT}{verbose} = $options{'verbose'};␊ |
376 | }␊ |
377 | if (defined $options{'debug'}) {␊ |
378 | $self->{TT}{debug} = $options{'debug'};␊ |
379 | }␊ |
380 | # Input document is in ascii until we prove the opposite (in read())␊ |
381 | $self->{TT}{ascii_input}=1;␊ |
382 | # We try not to use utf unless it's forced from the outside (in case the␊ |
383 | # document isn't in ascii)␊ |
384 | $self->{TT}{utf_mode}=0;␊ |
385 | ␊ |
386 | return $self;␊ |
387 | }␊ |
388 | ␊ |
389 | =back␊ |
390 | ␊ |
391 | =head2 Manipulating document files␊ |
392 | ␊ |
393 | =over 4␊ |
394 | ␊ |
395 | =item read($)␊ |
396 | ␊ |
397 | Add another input document at the end of the existing one. The argument is␊ |
398 | the filename to read.␊ |
399 | ␊ |
400 | Please note that it does not parse anything. You should use the parse()␊ |
401 | function when you're done with packing input files into the document.␊ |
402 | ␊ |
403 | =cut␊ |
404 | ␊ |
405 | #'␊ |
406 | sub read() {␊ |
407 | my $self=shift;␊ |
408 | my $filename=shift␊ |
409 | or croak wrap_msg(dgettext("po4a", "Can't read from file without having a filename"));␊ |
410 | my $linenum=0;␊ |
411 | ␊ |
412 | open INPUT,"<$filename"␊ |
413 | or croak wrap_msg(dgettext("po4a", "Can't read from %s: %s"), $filename, $!);␊ |
414 | while (defined (my $textline = <INPUT>)) {␊ |
415 | $linenum++;␊ |
416 | my $ref="$filename:$linenum";␊ |
417 | $textline =~ s/\r$//;␊ |
418 | my @entry=($textline,$ref);␊ |
419 | push @{$self->{TT}{doc_in}}, @entry;␊ |
420 | ␊ |
421 | if (!defined($self->{TT}{'file_in_charset'})) {␊ |
422 | # Detect if this file has non-ascii characters␊ |
423 | if($self->{TT}{ascii_input}) {␊ |
424 | my $decoder = guess_encoding($textline);␊ |
425 | if (!ref($decoder) or $decoder !~ /Encode::XS=/) {␊ |
426 | # We have detected a non-ascii line␊ |
427 | $self->{TT}{ascii_input} = 0;␊ |
428 | # Save the reference for future error message␊ |
429 | $self->{TT}{non_ascii_ref} ||= $ref;␊ |
430 | }␊ |
431 | }␊ |
432 | }␊ |
433 | }␊ |
434 | close INPUT␊ |
435 | or croak wrap_msg(dgettext("po4a", "Can't close %s after reading: %s"), $filename, $!);␊ |
436 | ␊ |
437 | }␊ |
438 | ␊ |
439 | =item write($)␊ |
440 | ␊ |
441 | Write the translated document to the given filename.␊ |
442 | ␊ |
443 | =cut␊ |
444 | ␊ |
445 | sub write {␊ |
446 | my $self=shift;␊ |
447 | my $filename=shift␊ |
448 | or croak wrap_msg(dgettext("po4a", "Can't write to a file without filename"));␊ |
449 | ␊ |
450 | my $fh;␊ |
451 | if ($filename eq '-') {␊ |
452 | $fh=\*STDOUT;␊ |
453 | } else {␊ |
454 | # make sure the directory in which we should write the localized file exists␊ |
455 | my $dir = $filename;␊ |
456 | if ($dir =~ m|/|) {␊ |
457 | $dir =~ s|/[^/]*$||;␊ |
458 | ␊ |
459 | File::Path::mkpath($dir, 0, 0755) # Croaks on error␊ |
460 | if (length ($dir) && ! -e $dir);␊ |
461 | }␊ |
462 | open $fh,">$filename"␊ |
463 | or croak wrap_msg(dgettext("po4a", "Can't write to %s: %s"), $filename, $!);␊ |
464 | }␊ |
465 | ␊ |
466 | map { print $fh $_ } $self->docheader();␊ |
467 | map { print $fh $_ } @{$self->{TT}{doc_out}};␊ |
468 | ␊ |
469 | if ($filename ne '-') {␊ |
470 | close $fh or croak wrap_msg(dgettext("po4a", "Can't close %s after writing: %s"), $filename, $!);␊ |
471 | }␊ |
472 | ␊ |
473 | }␊ |
474 | ␊ |
475 | =back␊ |
476 | ␊ |
477 | =head2 Manipulating PO files␊ |
478 | ␊ |
479 | =over 4␊ |
480 | ␊ |
481 | =item readpo($)␊ |
482 | ␊ |
483 | Add the content of a file (which name is passed as argument) to the␊ |
484 | existing input PO. The old content is not discarded.␊ |
485 | ␊ |
486 | =item writepo($)␊ |
487 | ␊ |
488 | Write the extracted PO file to the given filename.␊ |
489 | ␊ |
490 | =item stats()␊ |
491 | ␊ |
492 | Returns some statistics about the translation done so far. Please note that␊ |
493 | it's not the same statistics than the one printed by msgfmt␊ |
494 | --statistic. Here, it's stats about recent usage of the PO file, while␊ |
495 | msgfmt reports the status of the file. It is a wrapper to the␊ |
496 | Locale::Po4a::Po::stats_get function applied to the input PO file. Example␊ |
497 | of use:␊ |
498 | ␊ |
499 | [normal use of the po4a document...]␊ |
500 | ␊ |
501 | ($percent,$hit,$queries) = $document->stats();␊ |
502 | print "We found translations for $percent\% ($hit from $queries) of strings.\n";␊ |
503 | ␊ |
504 | =back␊ |
505 | ␊ |
506 | =cut␊ |
507 | ␊ |
508 | sub getpoout {␊ |
509 | return $_[0]->{TT}{po_out};␊ |
510 | }␊ |
511 | sub setpoout {␊ |
512 | $_[0]->{TT}{po_out} = $_[1];␊ |
513 | }␊ |
514 | sub readpo {␊ |
515 | $_[0]->{TT}{po_in}->read($_[1]);␊ |
516 | }␊ |
517 | sub writepo {␊ |
518 | $_[0]->{TT}{po_out}->write( $_[1] );␊ |
519 | }␊ |
520 | sub stats {␊ |
521 | return $_[0]->{TT}{po_in}->stats_get();␊ |
522 | }␊ |
523 | ␊ |
524 | =head2 Manipulating addenda␊ |
525 | ␊ |
526 | =over 4␊ |
527 | ␊ |
528 | =item addendum($)␊ |
529 | ␊ |
530 | Please refer to L<po4a(7)|po4a.7> for more information on what addenda are,␊ |
531 | and how translators should write them. To apply an addendum to the translated␊ |
532 | document, simply pass its filename to this function and you are done ;)␊ |
533 | ␊ |
534 | This function returns a non-null integer on error.␊ |
535 | ␊ |
536 | =cut␊ |
537 | ␊ |
538 | # Internal function to read the header.␊ |
539 | sub addendum_parse {␊ |
540 | my ($filename,$header)=shift;␊ |
541 | ␊ |
542 | my ($errcode,$mode,$position,$boundary,$bmode,$content)=␊ |
543 | (1,"","","","","");␊ |
544 | ␊ |
545 | unless (open (INS, "<$filename")) {␊ |
546 | warn wrap_msg(dgettext("po4a", "Can't read from %s: %s"), $filename, $!);␊ |
547 | goto END_PARSE_ADDFILE;␊ |
548 | }␊ |
549 | ␊ |
550 | unless (defined ($header=<INS>) && $header) {␊ |
551 | warn wrap_msg(dgettext("po4a", "Can't read po4a header from %s."), $filename);␊ |
552 | goto END_PARSE_ADDFILE;␊ |
553 | }␊ |
554 | ␊ |
555 | unless ($header =~ s/PO4A-HEADER://i) {␊ |
556 | warn wrap_msg(dgettext("po4a", "First line of %s does not look like a po4a header."), $filename);␊ |
557 | goto END_PARSE_ADDFILE;␊ |
558 | }␊ |
559 | foreach my $part (split(/;/,$header)) {␊ |
560 | unless ($part =~ m/^\s*([^=]*)=(.*)$/) {␊ |
561 | warn wrap_msg(dgettext("po4a", "Syntax error in po4a header of %s, near \"%s\""), $filename, $part);␊ |
562 | goto END_PARSE_ADDFILE;␊ |
563 | }␊ |
564 | my ($key,$value)=($1,$2);␊ |
565 | $key=lc($key);␊ |
566 | if ($key eq 'mode') {␊ |
567 | $mode=lc($value);␊ |
568 | } elsif ($key eq 'position') {␊ |
569 | $position=$value;␊ |
570 | } elsif ($key eq 'endboundary') {␊ |
571 | $boundary=$value;␊ |
572 | $bmode='after';␊ |
573 | } elsif ($key eq 'beginboundary') {␊ |
574 | $boundary=$value;␊ |
575 | $bmode='before';␊ |
576 | } else {␊ |
577 | warn wrap_msg(dgettext("po4a", "Invalid argument in the po4a header of %s: %s"), $filename, $key);␊ |
578 | goto END_PARSE_ADDFILE;␊ |
579 | }␊ |
580 | }␊ |
581 | ␊ |
582 | unless (length($mode)) {␊ |
583 | warn wrap_msg(dgettext("po4a", "The po4a header of %s does not define the mode."), $filename);␊ |
584 | goto END_PARSE_ADDFILE;␊ |
585 | }␊ |
586 | unless ($mode eq "before" || $mode eq "after") {␊ |
587 | warn wrap_msg(dgettext("po4a", "Mode invalid in the po4a header of %s: should be 'before' or 'after' not %s."), $filename, $mode);␊ |
588 | goto END_PARSE_ADDFILE;␊ |
589 | }␊ |
590 | ␊ |
591 | unless (length($position)) {␊ |
592 | warn wrap_msg(dgettext("po4a", "The po4a header of %s does not define the position."), $filename);␊ |
593 | goto END_PARSE_ADDFILE;␊ |
594 | }␊ |
595 | unless ($mode eq "before" || length($boundary)) {␊ |
596 | warn wrap_msg(dgettext("po4a", "No ending boundary given in the po4a header, but mode=after."));␊ |
597 | goto END_PARSE_ADDFILE;␊ |
598 | }␊ |
599 | ␊ |
600 | while (defined(my $line = <INS>)) {␊ |
601 | $content .= $line;␊ |
602 | }␊ |
603 | close INS;␊ |
604 | ␊ |
605 | $errcode=0;␊ |
606 | END_PARSE_ADDFILE:␊ |
607 | return ($errcode,$mode,$position,$boundary,$bmode,$content);␊ |
608 | }␊ |
609 | ␊ |
610 | sub mychomp {␊ |
611 | my ($str) = shift;␊ |
612 | chomp($str);␊ |
613 | return $str;␊ |
614 | }␊ |
615 | ␊ |
616 | sub addendum {␊ |
617 | my ($self,$filename) = @_;␊ |
618 | ␊ |
619 | print STDERR "Apply addendum $filename..." if $self->debug();␊ |
620 | unless ($filename) {␊ |
621 | warn wrap_msg(dgettext("po4a",␊ |
622 | "Can't apply addendum when not given the filename"));␊ |
623 | return 0;␊ |
624 | }␊ |
625 | die wrap_msg(dgettext("po4a", "Addendum %s does not exist."), $filename)␊ |
626 | unless -e $filename;␊ |
627 | ␊ |
628 | my ($errcode,$mode,$position,$boundary,$bmode,$content)=␊ |
629 | addendum_parse($filename);␊ |
630 | return 0 if ($errcode);␊ |
631 | ␊ |
632 | print STDERR "mode=$mode;pos=$position;bound=$boundary;bmode=$bmode;ctn=$content\n"␊ |
633 | if $self->debug();␊ |
634 | ␊ |
635 | # We only recode the addendum if an origin charset is specified, else we␊ |
636 | # suppose it's already in the output document's charset␊ |
637 | if (defined($self->{TT}{'addendum_charset'}) &&␊ |
638 | length($self->{TT}{'addendum_charset'})) {␊ |
639 | Encode::from_to($content,$self->{TT}{'addendum_charset'},␊ |
640 | $self->get_out_charset);␊ |
641 | }␊ |
642 | ␊ |
643 | my $found = scalar grep { /$position/ } @{$self->{TT}{doc_out}};␊ |
644 | if ($found == 0) {␊ |
645 | warn wrap_msg(dgettext("po4a",␊ |
646 | "No candidate position for the addendum %s."), $filename);␊ |
647 | return 0;␊ |
648 | }␊ |
649 | if ($found > 1) {␊ |
650 | warn wrap_msg(dgettext("po4a",␊ |
651 | "More than one candidate position found for the addendum %s."), $filename);␊ |
652 | return 0;␊ |
653 | }␊ |
654 | ␊ |
655 | if ($mode eq "before") {␊ |
656 | if ($self->verbose() > 1 || $self->debug() ) {␊ |
657 | map { print STDERR wrap_msg(dgettext("po4a", "Addendum '%s' applied before this line: %s"), $filename, $_) if (/$position/);␊ |
658 | } @{$self->{TT}{doc_out}};␊ |
659 | }␊ |
660 | @{$self->{TT}{doc_out}} = map { /$position/ ? ($content,$_) : $_␊ |
661 | } @{$self->{TT}{doc_out}};␊ |
662 | } else {␊ |
663 | my @newres=();␊ |
664 | ␊ |
665 | do {␊ |
666 | # make sure it doesn't whine on empty document␊ |
667 | my $line = scalar @{$self->{TT}{doc_out}} ? shift @{$self->{TT}{doc_out}} : "";␊ |
668 | push @newres,$line;␊ |
669 | my $outline=mychomp($line);␊ |
670 | $outline =~ s/^[ \t]*//;␊ |
671 | ␊ |
672 | if ($line =~ m/$position/) {␊ |
673 | while ($line=shift @{$self->{TT}{doc_out}}) {␊ |
674 | last if ($line=~/$boundary/);␊ |
675 | push @newres,$line;␊ |
676 | }␊ |
677 | if (defined $line) {␊ |
678 | if ($bmode eq 'before') {␊ |
679 | print wrap_msg(dgettext("po4a",␊ |
680 | "Addendum '%s' applied before this line: %s"),␊ |
681 | $filename, $outline)␊ |
682 | if ($self->verbose() > 1 || $self->debug());␊ |
683 | push @newres,$content;␊ |
684 | push @newres,$line;␊ |
685 | } else {␊ |
686 | print wrap_msg(dgettext("po4a",␊ |
687 | "Addendum '%s' applied after the line: %s."),␊ |
688 | $filename, $outline)␊ |
689 | if ($self->verbose() > 1 || $self->debug());␊ |
690 | push @newres,$line;␊ |
691 | push @newres,$content;␊ |
692 | }␊ |
693 | } else {␊ |
694 | print wrap_msg(dgettext("po4a", "Addendum '%s' applied at the end of the file."), $filename)␊ |
695 | if ($self->verbose() > 1 || $self->debug());␊ |
696 | push @newres,$content;␊ |
697 | }␊ |
698 | }␊ |
699 | } while (scalar @{$self->{TT}{doc_out}});␊ |
700 | @{$self->{TT}{doc_out}} = @newres;␊ |
701 | }␊ |
702 | print STDERR "done.\n" if $self->debug();␊ |
703 | return 1;␊ |
704 | }␊ |
705 | ␊ |
706 | =back␊ |
707 | ␊ |
708 | =head1 INTERNAL FUNCTIONS used to write derivated parsers␊ |
709 | ␊ |
710 | =head2 Getting input, providing output␊ |
711 | ␊ |
712 | Four functions are provided to get input and return output. They are very␊ |
713 | similar to shift/unshift and push/pop. The first pair is about input, while␊ |
714 | the second is about output. Mnemonic: in input, you are interested in the␊ |
715 | first line, what shift gives, and in output you want to add your result at␊ |
716 | the end, like push does.␊ |
717 | ␊ |
718 | =over 4␊ |
719 | ␊ |
720 | =item shiftline()␊ |
721 | ␊ |
722 | This function returns the next line of the doc_in to be parsed and its␊ |
723 | reference (packed as an array).␊ |
724 | ␊ |
725 | =item unshiftline($$)␊ |
726 | ␊ |
727 | Unshifts a line of the input document and its reference.␊ |
728 | ␊ |
729 | =item pushline($)␊ |
730 | ␊ |
731 | Push a new line to the doc_out.␊ |
732 | ␊ |
733 | =item popline()␊ |
734 | ␊ |
735 | Pop the last pushed line from the doc_out.␊ |
736 | ␊ |
737 | =back␊ |
738 | ␊ |
739 | =cut␊ |
740 | ␊ |
741 | sub shiftline {␊ |
742 | my ($line,$ref)=(shift @{$_[0]->{TT}{doc_in}},␊ |
743 | shift @{$_[0]->{TT}{doc_in}});␊ |
744 | return ($line,$ref);␊ |
745 | }␊ |
746 | sub unshiftline {␊ |
747 | my $self = shift;␊ |
748 | unshift @{$self->{TT}{doc_in}},@_;␊ |
749 | }␊ |
750 | ␊ |
751 | sub pushline { push @{$_[0]->{TT}{doc_out}}, $_[1] if defined $_[1]; }␊ |
752 | sub popline { return pop @{$_[0]->{TT}{doc_out}}; }␊ |
753 | ␊ |
754 | =head2 Marking strings as translatable␊ |
755 | ␊ |
756 | One function is provided to handle the text which should be translated.␊ |
757 | ␊ |
758 | =over 4␊ |
759 | ␊ |
760 | =item translate($$$)␊ |
761 | ␊ |
762 | Mandatory arguments:␊ |
763 | ␊ |
764 | =over 2␊ |
765 | ␊ |
766 | =item -␊ |
767 | ␊ |
768 | A string to translate␊ |
769 | ␊ |
770 | =item -␊ |
771 | ␊ |
772 | The reference of this string (i.e. position in inputfile)␊ |
773 | ␊ |
774 | =item -␊ |
775 | ␊ |
776 | The type of this string (i.e. the textual description of its structural role;␊ |
777 | used in Locale::Po4a::Po::gettextization(); see also L<po4a(7)|po4a.7>,␊ |
778 | section B<Gettextization: how does it work?>)␊ |
779 | ␊ |
780 | =back␊ |
781 | ␊ |
782 | This function can also take some extra arguments. They must be organized as␊ |
783 | a hash. For example:␊ |
784 | ␊ |
785 | $self->translate("string","ref","type",␊ |
786 | 'wrap' => 1);␊ |
787 | ␊ |
788 | =over␊ |
789 | ␊ |
790 | =item B<wrap>␊ |
791 | ␊ |
792 | boolean indicating whether we can consider that whitespaces in string are␊ |
793 | not important. If yes, the function canonizes the string before looking for␊ |
794 | a translation or extracting it, and wraps the translation.␊ |
795 | ␊ |
796 | =item B<wrapcol>␊ |
797 | ␊ |
798 | the column at which we should wrap (default: 76).␊ |
799 | ␊ |
800 | =item B<comment>␊ |
801 | ␊ |
802 | an extra comment to add to the entry.␊ |
803 | ␊ |
804 | =back␊ |
805 | ␊ |
806 | Actions:␊ |
807 | ␊ |
808 | =over 2␊ |
809 | ␊ |
810 | =item -␊ |
811 | ␊ |
812 | Pushes the string, reference and type to po_out.␊ |
813 | ␊ |
814 | =item -␊ |
815 | ␊ |
816 | Returns the translation of the string (as found in po_in) so that the␊ |
817 | parser can build the doc_out.␊ |
818 | ␊ |
819 | =item -␊ |
820 | ␊ |
821 | Handles the charsets to recode the strings before sending them to␊ |
822 | po_out and before returning the translations.␊ |
823 | ␊ |
824 | =back␊ |
825 | ␊ |
826 | =back␊ |
827 | ␊ |
828 | =cut␊ |
829 | ␊ |
830 | sub translate {␊ |
831 | my $self=shift;␊ |
832 | my ($string,$ref,$type)=(shift,shift,shift);␊ |
833 | my (%options)=@_;␊ |
834 | ␊ |
835 | # my $validoption="wrap wrapcol";␊ |
836 | # my %validoption;␊ |
837 | ␊ |
838 | return "" unless defined($string) && length($string);␊ |
839 | ␊ |
840 | # map { $validoption{$_}=1 } (split(/ /,$validoption));␊ |
841 | # foreach (keys %options) {␊ |
842 | # Carp::confess "internal error: translate() called with unknown arg $_. Valid options: $validoption"␊ |
843 | # unless $validoption{$_};␊ |
844 | # }␊ |
845 | ␊ |
846 | my $in_charset;␊ |
847 | if ($self->{TT}{ascii_input}) {␊ |
848 | $in_charset = "ascii";␊ |
849 | } else {␊ |
850 | if (defined($self->{TT}{'file_in_charset'}) and␊ |
851 | length($self->{TT}{'file_in_charset'}) and␊ |
852 | $self->{TT}{'file_in_charset'} !~ m/ascii/i) {␊ |
853 | $in_charset=$self->{TT}{'file_in_charset'};␊ |
854 | } else {␊ |
855 | # FYI, the document charset have to be determined *before* we see the first␊ |
856 | # string to recode.␊ |
857 | die wrap_mod("po4a", dgettext("po4a", "Couldn't determine the input document's charset. Please specify it on the command line. (non-ASCII char at %s)"), $self->{TT}{non_ascii_ref})␊ |
858 | }␊ |
859 | }␊ |
860 | ␊ |
861 | if ($self->{TT}{po_in}->get_charset ne "CHARSET") {␊ |
862 | $string = encode_from_to($string,␊ |
863 | $self->{TT}{'file_in_encoder'},␊ |
864 | $self->{TT}{po_in}{encoder});␊ |
865 | }␊ |
866 | ␊ |
867 | if (defined $options{'wrapcol'} && $options{'wrapcol'} < 0) {␊ |
868 | # FIXME: should be the parameter given with --width␊ |
869 | $options{'wrapcol'} = 76 + $options{'wrapcol'};␊ |
870 | }␊ |
871 | my $transstring = $self->{TT}{po_in}->gettext($string,␊ |
872 | 'wrap' => $options{'wrap'}||0,␊ |
873 | 'wrapcol' => $options{'wrapcol'});␊ |
874 | ␊ |
875 | if ($self->{TT}{po_in}->get_charset ne "CHARSET") {␊ |
876 | my $out_encoder = $self->{TT}{'file_out_encoder'};␊ |
877 | unless (defined $out_encoder) {␊ |
878 | $out_encoder = find_encoding($self->get_out_charset)␊ |
879 | }␊ |
880 | $transstring = encode_from_to($transstring,␊ |
881 | $self->{TT}{po_in}{encoder},␊ |
882 | $out_encoder);␊ |
883 | }␊ |
884 | ␊ |
885 | # If the input document isn't completely in ascii, we should see what to␊ |
886 | # do with the current string␊ |
887 | unless ($self->{TT}{ascii_input}) {␊ |
888 | my $out_charset = $self->{TT}{po_out}->get_charset;␊ |
889 | # We set the output po charset␊ |
890 | if ($out_charset eq "CHARSET") {␊ |
891 | if ($self->{TT}{utf_mode}) {␊ |
892 | $out_charset="UTF-8";␊ |
893 | } else {␊ |
894 | $out_charset=$in_charset;␊ |
895 | }␊ |
896 | $self->{TT}{po_out}->set_charset($out_charset);␊ |
897 | }␊ |
898 | if ( $in_charset !~ /^$out_charset$/i ) {␊ |
899 | Encode::from_to($string,$in_charset,$out_charset);␊ |
900 | if (defined($options{'comment'}) and length($options{'comment'})) {␊ |
901 | Encode::from_to($options{'comment'},$in_charset,$out_charset);␊ |
902 | }␊ |
903 | }␊ |
904 | }␊ |
905 | ␊ |
906 | # the comments provided by the modules are automatic comments from the PO point of view␊ |
907 | $self->{TT}{po_out}->push('msgid' => $string,␊ |
908 | 'reference' => $ref,␊ |
909 | 'type' => $type,␊ |
910 | 'automatic' => $options{'comment'},␊ |
911 | 'wrap' => $options{'wrap'}||0,␊ |
912 | 'wrapcol' => $options{'wrapcol'});␊ |
913 | ␊ |
914 | # if ($self->{TT}{po_in}->get_charset ne "CHARSET") {␊ |
915 | # Encode::from_to($transstring,$self->{TT}{po_in}->get_charset,␊ |
916 | # $self->get_out_charset);␊ |
917 | # }␊ |
918 | ␊ |
919 | if ($options{'wrap'}||0) {␊ |
920 | $transstring =~ s/( *)$//s;␊ |
921 | my $trailing_spaces = $1||"";␊ |
922 | $transstring =~ s/(?<!\\) +$//gm;␊ |
923 | $transstring .= $trailing_spaces;␊ |
924 | }␊ |
925 | ␊ |
926 | return $transstring;␊ |
927 | }␊ |
928 | ␊ |
929 | =head2 Misc functions␊ |
930 | ␊ |
931 | =over 4␊ |
932 | ␊ |
933 | =item verbose()␊ |
934 | ␊ |
935 | Returns if the verbose option was passed during the creation of the␊ |
936 | TransTractor.␊ |
937 | ␊ |
938 | =cut␊ |
939 | ␊ |
940 | sub verbose {␊ |
941 | if (defined $_[1]) {␊ |
942 | $_[0]->{TT}{verbose} = $_[1];␊ |
943 | } else {␊ |
944 | return $_[0]->{TT}{verbose} || 0; # undef and 0 have the same meaning, but one generates warnings␊ |
945 | }␊ |
946 | }␊ |
947 | ␊ |
948 | =item debug()␊ |
949 | ␊ |
950 | Returns if the debug option was passed during the creation of the␊ |
951 | TransTractor.␊ |
952 | ␊ |
953 | =cut␊ |
954 | ␊ |
955 | sub debug {␊ |
956 | return $_[0]->{TT}{debug};␊ |
957 | }␊ |
958 | ␊ |
959 | =item detected_charset($)␊ |
960 | ␊ |
961 | This tells TransTractor that a new charset (the first argument) has been␊ |
962 | detected from the input document. It can usually be read from the document␊ |
963 | header. Only the first charset will remain, coming either from the␊ |
964 | process() arguments or detected from the document.␊ |
965 | ␊ |
966 | =cut␊ |
967 | ␊ |
968 | sub detected_charset {␊ |
969 | my ($self,$charset)=(shift,shift);␊ |
970 | unless (defined($self->{TT}{'file_in_charset'}) and␊ |
971 | length($self->{TT}{'file_in_charset'}) ) {␊ |
972 | $self->{TT}{'file_in_charset'}=$charset;␊ |
973 | if (defined $charset) {␊ |
974 | $self->{TT}{'file_in_encoder'}=find_encoding($charset);␊ |
975 | } else {␊ |
976 | $self->{TT}{ascii_input}=1;␊ |
977 | $self->{TT}{utf_mode}=0;␊ |
978 | }␊ |
979 | }␊ |
980 | ␊ |
981 | if (defined $self->{TT}{'file_in_charset'} and␊ |
982 | length $self->{TT}{'file_in_charset'} and␊ |
983 | $self->{TT}{'file_in_charset'} !~ m/ascii/i) {␊ |
984 | $self->{TT}{ascii_input}=0;␊ |
985 | }␊ |
986 | }␊ |
987 | ␊ |
988 | =item get_out_charset()␊ |
989 | ␊ |
990 | This function will return the charset that should be used in the output␊ |
991 | document (usually useful to substitute the input document's detected charset␊ |
992 | where it has been found).␊ |
993 | ␊ |
994 | It will use the output charset specified in the command line. If it wasn't␊ |
995 | specified, it will use the input PO's charset, and if the input PO has the␊ |
996 | default "CHARSET", it will return the input document's charset, so that no␊ |
997 | encoding is performed.␊ |
998 | ␊ |
999 | =cut␊ |
1000 | ␊ |
1001 | sub get_out_charset {␊ |
1002 | my $self=shift;␊ |
1003 | my $charset;␊ |
1004 | ␊ |
1005 | # Use the value specified at the command line␊ |
1006 | if (defined($self->{TT}{'file_out_charset'}) and␊ |
1007 | length($self->{TT}{'file_out_charset'})) {␊ |
1008 | $charset=$self->{TT}{'file_out_charset'};␊ |
1009 | } else {␊ |
1010 | if ($self->{TT}{utf_mode} && $self->{TT}{ascii_input}) {␊ |
1011 | $charset="UTF-8";␊ |
1012 | } else {␊ |
1013 | $charset=$self->{TT}{po_in}->get_charset;␊ |
1014 | $charset=$self->{TT}{'file_in_charset'}␊ |
1015 | if $charset eq "CHARSET" and␊ |
1016 | defined($self->{TT}{'file_in_charset'}) and␊ |
1017 | length($self->{TT}{'file_in_charset'});␊ |
1018 | $charset="ascii"␊ |
1019 | if $charset eq "CHARSET";␊ |
1020 | }␊ |
1021 | }␊ |
1022 | return $charset;␊ |
1023 | }␊ |
1024 | ␊ |
1025 | =item recode_skipped_text($)␊ |
1026 | ␊ |
1027 | This function returns the recoded text passed as argument, from the input␊ |
1028 | document's charset to the output document's one. This isn't needed when␊ |
1029 | translating a string (translate() recodes everything itself), but it is when␊ |
1030 | you skip a string from the input document and you want the output document to␊ |
1031 | be consistent with the global encoding.␊ |
1032 | ␊ |
1033 | =cut␊ |
1034 | ␊ |
1035 | sub recode_skipped_text {␊ |
1036 | my ($self,$text)=(shift,shift);␊ |
1037 | unless ($self->{TT}{'ascii_input'}) {␊ |
1038 | if(defined($self->{TT}{'file_in_charset'}) and␊ |
1039 | length($self->{TT}{'file_in_charset'}) ) {␊ |
1040 | $text = encode_from_to($text,␊ |
1041 | $self->{TT}{'file_in_encoder'},␊ |
1042 | find_encoding($self->get_out_charset));␊ |
1043 | } else {␊ |
1044 | die wrap_mod("po4a", dgettext("po4a", "Couldn't determine the input document's charset. Please specify it on the command line. (non-ASCII char at %s)"), $self->{TT}{non_ascii_ref})␊ |
1045 | }␊ |
1046 | }␊ |
1047 | return $text;␊ |
1048 | }␊ |
1049 | ␊ |
1050 | ␊ |
1051 | # encode_from_to($,$,$)␊ |
1052 | #␊ |
1053 | # Encode the given text from one encoding to another one.␊ |
1054 | # It differs from Encode::from_to because it does not take the name of the␊ |
1055 | # encoding in argument, but the encoders (as returned by the␊ |
1056 | # Encode::find_encoding(<name>) method). Thus it permits to save a bunch␊ |
1057 | # of call to find_encoding.␊ |
1058 | #␊ |
1059 | # If the "from" encoding is undefined, it is considered as UTF-8 (or␊ |
1060 | # ascii).␊ |
1061 | # If the "to" encoding is undefined, it is considered as UTF-8.␊ |
1062 | #␊ |
1063 | sub encode_from_to {␊ |
1064 | my ($text,$from,$to) = (shift,shift,shift);␊ |
1065 | ␊ |
1066 | if (not defined $from) {␊ |
1067 | # for ascii and UTF-8, no conversion needed to get an utf-8␊ |
1068 | # string.␊ |
1069 | } else {␊ |
1070 | $text = $from->decode($text, 0);␊ |
1071 | }␊ |
1072 | ␊ |
1073 | if (not defined $to) {␊ |
1074 | # Already in UTF-8, no conversion needed␊ |
1075 | } else {␊ |
1076 | $text = $to->encode($text, 0);␊ |
1077 | }␊ |
1078 | ␊ |
1079 | return $text;␊ |
1080 | }␊ |
1081 | ␊ |
1082 | =back␊ |
1083 | ␊ |
1084 | =head1 FUTURE DIRECTIONS␊ |
1085 | ␊ |
1086 | One shortcoming of the current TransTractor is that it can't handle␊ |
1087 | translated document containing all languages, like debconf templates, or␊ |
1088 | .desktop files.␊ |
1089 | ␊ |
1090 | To address this problem, the only interface changes needed are:␊ |
1091 | ␊ |
1092 | =over 2␊ |
1093 | ␊ |
1094 | =item -␊ |
1095 | ␊ |
1096 | take a hash as po_in_name (a list per language)␊ |
1097 | ␊ |
1098 | =item -␊ |
1099 | ␊ |
1100 | add an argument to translate to indicate the target language␊ |
1101 | ␊ |
1102 | =item -␊ |
1103 | ␊ |
1104 | make a pushline_all function, which would make pushline of its content for␊ |
1105 | all language, using a map-like syntax:␊ |
1106 | ␊ |
1107 | $self->pushline_all({ "Description[".$langcode."]=".␊ |
1108 | $self->translate($line,$ref,$langcode)␊ |
1109 | });␊ |
1110 | ␊ |
1111 | =back␊ |
1112 | ␊ |
1113 | Will see if it's enough ;)␊ |
1114 | ␊ |
1115 | =head1 AUTHORS␊ |
1116 | ␊ |
1117 | Denis Barbier <barbier@linuxfr.org>␊ |
1118 | Martin Quinson (mquinson#debian.org)␊ |
1119 | Jordi Vilalta <jvprat@gmail.com>␊ |
1120 | ␊ |
1121 | =cut␊ |
1122 | ␊ |
1123 | 1;␊ |
1124 | |