Chameleon

Chameleon Svn Source Tree

Root/branches/Bungo/package/bin/po4a/lib/Locale/Po4a/Html.pm

1#!/usr/bin/perl
2
3# Po4a::Html.pm
4#
5# extract and translate translatable strings from a HTML document.
6#
7# This code extracts plain text between HTML tags and some "alt" or
8# "title" attributes.
9#
10# Copyright (c) 2003 by Laurent Hausermann <laurent@hausermann.org>
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26########################################################################
27
28=encoding UTF-8
29
30=head1 NAME
31
32Locale::Po4a::Html - convert HTML documents from/to PO files
33
34=head1 DESCRIPTION
35
36The po4a (PO for anything) project goal is to ease translations (and more
37interestingly, the maintenance of translations) using gettext tools on
38areas where they were not expected like documentation.
39
40Locale::Po4a::Html is a module to help the translation of documentation in
41the HTML format into other [human] languages.
42
43Please note that this module is not distributed with the main po4a archive
44because we don't feel it mature enough for that. If you insist on using it
45anyway, check it from the CVS out.
46
47=cut
48
49package Locale::Po4a::Html;
50require Exporter;
51use vars qw(@ISA @EXPORT);
52@ISA = qw(Locale::Po4a::TransTractor);
53@EXPORT = qw(new initialize);
54
55use Locale::Po4a::TransTractor;
56
57use strict;
58use warnings;
59
60use HTML::TokeParser;
61
62use File::Temp;
63
64sub initialize {}
65
66sub read {
67 my ($self,$filename)=@_;
68 push @{$self->{DOCPOD}{infile}}, $filename;
69 $self->Locale::Po4a::TransTractor::read($filename);
70}
71
72sub parse {
73 my $self=shift;
74 map {$self->parse_file($_)} @{$self->{DOCPOD}{infile}};
75}
76
77#
78# Parse file and translate it
79#
80sub parse_file {
81 my ($self,$filename)=@_;
82 my $stream = HTML::TokeParser->new($filename)
83 || die "Couldn't read HTML file $filename : $!";
84
85 $stream->unbroken_text( [1] );
86
87 my @type=();
88 NEXT : while (my $token = $stream->get_token) {
89 if($token->[0] eq 'T') {
90 my $text = $token->[1];
91 my ($pre_spaces) = ($text =~ /^(\s*)/);
92 my ($post_spaces) = ($text =~ /(\s*)$/);
93 $text = trim($text);
94 if (notranslation($text) == 1) {
95 $self->pushline( get_tag( $token ) );
96 next NEXT;
97 }
98# FIXME : it should be useful to encode characters
99# in UTF8 in the po, but converting them in HTML::Entities
100# in the doc_out, translate acts both way
101# so we cant do that.
102# use HTML::Entities ();
103# $encoded = HTML::Entities::encode($a);
104# $decoded = HTML::Entities::decode($a);
105 #print STDERR $token->[0];
106 $self->pushline( $pre_spaces . $self->translate($text,
107 "FIXME:0",
108 (scalar @type ? $type[scalar @type-1]: "NOTYPE")
109 ) . $post_spaces,
110 'wrap' => 1
111 );
112 next NEXT;
113 } elsif ($token->[0] eq 'S') {
114 push @type,$token->[1];
115 my $text = get_tag( $token );
116 my $tag = $token->[1];
117# TODO: It would be nice to support an option to specify these
118# (e.g. a list of tag.attribute)
119 my @trans_attr = (( $tag eq 'img' ) || ( $tag eq 'input' ) ||
120 ( $tag eq 'area' ) || ( $tag eq 'applet'))
121 ? qw/title alt/ : qw/title/;
122 my %attr = %{$token->[2]};
123 for my $a (@trans_attr) {
124 my $content = $attr{$a};
125 if (defined $content) {
126 $content = trim($content);
127 my $translated = $self->translate(
128 $content,
129 "FIXME:0",
130 "${tag}_$a"
131 );
132 $attr{$a} = $translated;
133 }
134 }
135 my ($closing) = ( $text =~ /(\s*\/?>)/ );
136 # reconstruct the tag from scratch
137 delete $attr{'/'}; # Parser thinks closing / in XHTML is an attribute
138 $text = "<$tag";
139 $text .= " $_=\"$attr{$_}\"" foreach keys %attr;
140 $text .= $closing;
141 $self->pushline( $text );
142 } elsif ($token->[0] eq 'E') {
143 pop @type;
144 $self->pushline( get_tag( $token ) );
145 } else {
146 $self->pushline( get_tag( $token ) );
147 }
148 }
149}
150
151sub get_tag {
152 my $token = shift;
153 my $tag = "";
154
155 if ($token->[0] eq 'S') {
156 $tag = $token->[4];
157 }
158 if ( ($token->[0] eq 'C') ||
159 ($token->[0] eq 'D') ||
160 ($token->[0] eq 'T') ) {
161 $tag = $token->[1];
162 }
163 if ( ($token->[0] eq 'E') ||
164 ($token->[0] eq 'PI') ) {
165 $tag = $token->[2];
166 }
167
168 return $tag;
169}
170
171sub trim {
172 my $s=shift;
173 $s =~ s/\n/ /g; # remove \n in text
174 $s =~ s/\r/ /g; # remove \r in text
175 $s =~ s/\t/ /g; # remove tabulations
176 $s =~ s/\s+/ /g; # remove multiple spaces
177 $s =~ s/^\s*//g; # remove leading spaces
178 $s =~ s/\s*$//g; # remove trailing spaces
179 return $s;
180}
181
182#
183# This method says if a string must be
184# translated or not.
185# To be improved with more test or regexp
186# Maybe there is a way to do this in TransTractor
187# for example ::^ should not be translated
188sub notranslation {
189 my $s=shift;
190 return 1 if ( ($s cmp "") == 0);
191 return 1 if ( ($s cmp "-") == 0);
192 return 1 if ( ($s cmp "::") == 0);
193 return 1 if ( ($s cmp ":") == 0);
194 return 1 if ( ($s cmp ".") == 0);
195 return 1 if ( ($s cmp "|") == 0);
196 return 1 if ( ($s cmp '"') == 0);
197 return 1 if ( ($s cmp "'") == 0);
198 # don't translate entries composed of one entity
199 return 1 if ($s =~ /^&[^;]*;$/);
200
201# don't translate entries with no letters
202# (happens with e.g. <b>Hello</b>, <i>world</i> )
203# ^^
204# ", " doesn't need translation
205 return 1 unless $s =~ /\w/;
206 return 0;
207}
208
209=head1 AUTHORS
210
211 Laurent Hausermann <laurent@hausermann.org>
212
213=head1 COPYRIGHT AND LICENSE
214
215Laurent Hausermann <laurent@hausermann.org>
216
217This program is free software; you may redistribute it and/or modify it
218under the terms of GPL (see the COPYING file).
219

Archive Download this file

Revision: 2840