[ Index ] |
PHP Cross Reference of Unnamed Project |
[Summary view] [Print] [Text view]
1 package Encode::Alias; 2 use strict; 3 use warnings; 4 no warnings 'redefine'; 5 use Encode; 6 our $VERSION = do { my @r = ( q$Revision: 2.7 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; 7 sub DEBUG () { 0 } 8 9 use base qw(Exporter); 10 11 # Public, encouraged API is exported by default 12 13 our @EXPORT = 14 qw ( 15 define_alias 16 find_alias 17 ); 18 19 our @Alias; # ordered matching list 20 our %Alias; # cached known aliases 21 22 sub find_alias { 23 my $class = shift; 24 my $find = shift; 25 unless ( exists $Alias{$find} ) { 26 $Alias{$find} = undef; # Recursion guard 27 for ( my $i = 0 ; $i < @Alias ; $i += 2 ) { 28 my $alias = $Alias[$i]; 29 my $val = $Alias[ $i + 1 ]; 30 my $new; 31 if ( ref($alias) eq 'Regexp' && $find =~ $alias ) { 32 DEBUG and warn "eval $val"; 33 $new = eval $val; 34 DEBUG and $@ and warn "$val, $@"; 35 } 36 elsif ( ref($alias) eq 'CODE' ) { 37 DEBUG and warn "$alias", "->", "($find)"; 38 $new = $alias->($find); 39 } 40 elsif ( lc($find) eq lc($alias) ) { 41 $new = $val; 42 } 43 if ( defined($new) ) { 44 next if $new eq $find; # avoid (direct) recursion on bugs 45 DEBUG and warn "$alias, $new"; 46 my $enc = 47 ( ref($new) ) ? $new : Encode::find_encoding($new); 48 if ($enc) { 49 $Alias{$find} = $enc; 50 last; 51 } 52 } 53 } 54 55 # case insensitive search when canonical is not in all lowercase 56 # RT ticket #7835 57 unless ( $Alias{$find} ) { 58 my $lcfind = lc($find); 59 for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule ) 60 { 61 $lcfind eq lc($name) or next; 62 $Alias{$find} = Encode::find_encoding($name); 63 DEBUG and warn "$find => $name"; 64 } 65 } 66 } 67 if (DEBUG) { 68 my $name; 69 if ( my $e = $Alias{$find} ) { 70 $name = $e->name; 71 } 72 else { 73 $name = ""; 74 } 75 warn "find_alias($class, $find)->name = $name"; 76 } 77 return $Alias{$find}; 78 } 79 80 sub define_alias { 81 while (@_) { 82 my ( $alias, $name ) = splice( @_, 0, 2 ); 83 unshift( @Alias, $alias => $name ); # newer one has precedence 84 if ( ref($alias) ) { 85 86 # clear %Alias cache to allow overrides 87 my @a = keys %Alias; 88 for my $k (@a) { 89 if ( ref($alias) eq 'Regexp' && $k =~ $alias ) { 90 DEBUG and warn "delete \$Alias\{$k\}"; 91 delete $Alias{$k}; 92 } 93 elsif ( ref($alias) eq 'CODE' ) { 94 DEBUG and warn "delete \$Alias\{$k\}"; 95 delete $Alias{ $alias->($name) }; 96 } 97 } 98 } 99 else { 100 DEBUG and warn "delete \$Alias\{$alias\}"; 101 delete $Alias{$alias}; 102 } 103 } 104 } 105 106 # Allow latin-1 style names as well 107 # 0 1 2 3 4 5 6 7 8 9 10 108 our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 ); 109 110 # Allow winlatin1 style names as well 111 our %Winlatin2cp = ( 112 'latin1' => 1252, 113 'latin2' => 1250, 114 'cyrillic' => 1251, 115 'greek' => 1253, 116 'turkish' => 1254, 117 'hebrew' => 1255, 118 'arabic' => 1256, 119 'baltic' => 1257, 120 'vietnamese' => 1258, 121 ); 122 123 init_aliases(); 124 125 sub undef_aliases { 126 @Alias = (); 127 %Alias = (); 128 } 129 130 sub init_aliases { 131 undef_aliases(); 132 133 # Try all-lower-case version should all else fails 134 define_alias( qr/^(.*)$/ => '"\L$1"' ); 135 136 # UTF/UCS stuff 137 define_alias( qr/^UTF-?7$/i => '"UTF-7"' ); 138 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' ); 139 define_alias( 140 qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"', 141 qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")', 142 qr/^iso-10646-1$/i => '"UCS-2BE"' 143 ); 144 define_alias( 145 qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"', 146 qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"', 147 qr/^UTF-?(16|32)$/i => '"UTF-$1"', 148 ); 149 150 # ASCII 151 define_alias( qr/^(?:US-?)ascii$/i => '"ascii"' ); 152 define_alias( 'C' => 'ascii' ); 153 define_alias( qr/\bISO[-_]?646[-_]?US$/i => '"ascii"' ); 154 155 # Allow variants of iso-8859-1 etc. 156 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' ); 157 158 # At least HP-UX has these. 159 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' ); 160 161 # More HP stuff. 162 define_alias( 163 qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => 164 '"$1}8"' ); 165 166 # The Official name of ASCII. 167 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' ); 168 169 # This is a font issue, not an encoding issue. 170 # (The currency symbol of the Latin 1 upper half 171 # has been redefined as the euro symbol.) 172 define_alias( qr/^(.+)\@euro$/i => '"$1"' ); 173 174 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i => 175 'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' 176 ); 177 178 define_alias( 179 qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish| 180 hebrew|arabic|baltic|vietnamese)$/ix => 181 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' 182 ); 183 184 # Common names for non-latin preferred MIME names 185 define_alias( 186 'ascii' => 'US-ascii', 187 'cyrillic' => 'iso-8859-5', 188 'arabic' => 'iso-8859-6', 189 'greek' => 'iso-8859-7', 190 'hebrew' => 'iso-8859-8', 191 'thai' => 'iso-8859-11', 192 ); 193 # RT #20781 194 define_alias(qr/\btis-?620\b/i => '"iso-8859-11"'); 195 196 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN. 197 # And Microsoft has their own naming (again, surprisingly). 198 # And windows-* is registered in IANA! 199 define_alias( 200 qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' ); 201 202 # Sometimes seen with a leading zero. 203 # define_alias( qr/\bcp037\b/i => '"cp37"'); 204 205 # Mac Mappings 206 # predefined in *.ucm; unneeded 207 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"'); 208 define_alias( qr/^mac_(.*)$/i => '"mac$1"' ); 209 210 # Ououououou. gone. They are differente! 211 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"'); 212 213 # Standardize on the dashed versions. 214 define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' ); 215 216 unless ($Encode::ON_EBCDIC) { 217 218 # for Encode::CN 219 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' ); 220 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' ); 221 222 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' ) 223 # CP936 doesn't have vendor-addon for GBK, so they're identical. 224 define_alias( qr/^gbk$/i => '"cp936"' ); 225 226 # This fixes gb2312 vs. euc-cn confusion, practically 227 define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' ); 228 229 # for Encode::JP 230 define_alias( qr/\bjis$/i => '"7bit-jis"' ); 231 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' ); 232 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' ); 233 define_alias( qr/\bujis$/i => '"euc-jp"' ); 234 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' ); 235 define_alias( qr/\bsjis$/i => '"shiftjis"' ); 236 define_alias( qr/\bwindows-31j$/i => '"cp932"' ); 237 238 # for Encode::KR 239 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' ); 240 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' ); 241 242 # This fixes ksc5601 vs. euc-kr confusion, practically 243 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' ); 244 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' ); 245 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' ); 246 247 # for Encode::TW 248 define_alias( qr/\bbig-?5$/i => '"big5-eten"' ); 249 define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' ); 250 define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' ); 251 define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' ); 252 define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' ); 253 } 254 255 # utf8 is blessed :) 256 define_alias( qr/^UTF-8$/i => '"utf-8-strict"' ); 257 258 # At last, Map white space and _ to '-' 259 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' ); 260 } 261 262 1; 263 __END__ 264 265 # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8 266 # TODO: HP-UX '15' encodings japanese15 korean15 roi15 267 # TODO: Cyrillic encoding ISO-IR-111 (useful?) 268 # TODO: Armenian encoding ARMSCII-8 269 # TODO: Hebrew encoding ISO-8859-8-1 270 # TODO: Thai encoding TCVN 271 # TODO: Vietnamese encodings VPS 272 # TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese 273 # ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic 274 # Farsi Georgian Gujarati Gurmukhi Hebrew Japanese 275 # Kannada Khmer Korean Laotian Malayalam Mongolian 276 # Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese 277 278 =head1 NAME 279 280 Encode::Alias - alias definitions to encodings 281 282 =head1 SYNOPSIS 283 284 use Encode; 285 use Encode::Alias; 286 define_alias( newName => ENCODING); 287 288 =head1 DESCRIPTION 289 290 Allows newName to be used as an alias for ENCODING. ENCODING may be 291 either the name of an encoding or an encoding object (as described 292 in L<Encode>). 293 294 Currently I<newName> can be specified in the following ways: 295 296 =over 4 297 298 =item As a simple string. 299 300 =item As a qr// compiled regular expression, e.g.: 301 302 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' ); 303 304 In this case, if I<ENCODING> is not a reference, it is C<eval>-ed 305 in order to allow C<$1> etc. to be substituted. The example is one 306 way to alias names as used in X11 fonts to the MIME names for the 307 iso-8859-* family. Note the double quotes inside the single quotes. 308 309 (or, you don't have to do this yourself because this example is predefined) 310 311 If you are using a regex here, you have to use the quotes as shown or 312 it won't work. Also note that regex handling is tricky even for the 313 experienced. Use this feature with caution. 314 315 =item As a code reference, e.g.: 316 317 define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } ); 318 319 The same effect as the example above in a different way. The coderef 320 takes the alias name as an argument and returns a canonical name on 321 success or undef if not. Note the second argument is not required. 322 Use this with even more caution than the regex version. 323 324 =back 325 326 =head3 Changes in code reference aliasing 327 328 As of Encode 1.87, the older form 329 330 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } ); 331 332 no longer works. 333 334 Encode up to 1.86 internally used "local $_" to implement ths older 335 form. But consider the code below; 336 337 use Encode; 338 $_ = "eeeee" ; 339 while (/(e)/g) { 340 my $utf = decode('aliased-encoding-name', $1); 341 print "position:",pos,"\n"; 342 } 343 344 Prior to Encode 1.86 this fails because of "local $_". 345 346 =head2 Alias overloading 347 348 You can override predefined aliases by simply applying define_alias(). 349 The new alias is always evaluated first, and when necessary, 350 define_alias() flushes the internal cache to make the new definition 351 available. 352 353 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a 354 # superset of SHIFT_JIS 355 356 define_alias( qr/shift.*jis$/i => '"cp932"' ); 357 define_alias( qr/sjis$/i => '"cp932"' ); 358 359 If you want to zap all predefined aliases, you can use 360 361 Encode::Alias->undef_aliases; 362 363 to do so. And 364 365 Encode::Alias->init_aliases; 366 367 gets the factory settings back. 368 369 =head1 SEE ALSO 370 371 L<Encode>, L<Encode::Supported> 372 373 =cut 374
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Tue Mar 17 22:47:18 2015 | Cross-referenced by PHPXref 0.7.1 |