[ Index ] |
PHP Cross Reference of Unnamed Project |
[Summary view] [Print] [Text view]
1 package Unicode::UCD; 2 3 use strict; 4 use warnings; 5 6 our $VERSION = '0.25'; 7 8 use Storable qw(dclone); 9 10 require Exporter; 11 12 our @ISA = qw(Exporter); 13 14 our @EXPORT_OK = qw(charinfo 15 charblock charscript 16 charblocks charscripts 17 charinrange 18 general_categories bidi_types 19 compexcl 20 casefold casespec 21 namedseq); 22 23 use Carp; 24 25 =head1 NAME 26 27 Unicode::UCD - Unicode character database 28 29 =head1 SYNOPSIS 30 31 use Unicode::UCD 'charinfo'; 32 my $charinfo = charinfo($codepoint); 33 34 use Unicode::UCD 'charblock'; 35 my $charblock = charblock($codepoint); 36 37 use Unicode::UCD 'charscript'; 38 my $charscript = charscript($codepoint); 39 40 use Unicode::UCD 'charblocks'; 41 my $charblocks = charblocks(); 42 43 use Unicode::UCD 'charscripts'; 44 my $charscripts = charscripts(); 45 46 use Unicode::UCD qw(charscript charinrange); 47 my $range = charscript($script); 48 print "looks like $script\n" if charinrange($range, $codepoint); 49 50 use Unicode::UCD qw(general_categories bidi_types); 51 my $categories = general_categories(); 52 my $types = bidi_types(); 53 54 use Unicode::UCD 'compexcl'; 55 my $compexcl = compexcl($codepoint); 56 57 use Unicode::UCD 'namedseq'; 58 my $namedseq = namedseq($named_sequence_name); 59 60 my $unicode_version = Unicode::UCD::UnicodeVersion(); 61 62 =head1 DESCRIPTION 63 64 The Unicode::UCD module offers a simple interface to the Unicode 65 Character Database. 66 67 =cut 68 69 my $UNICODEFH; 70 my $BLOCKSFH; 71 my $SCRIPTSFH; 72 my $VERSIONFH; 73 my $COMPEXCLFH; 74 my $CASEFOLDFH; 75 my $CASESPECFH; 76 my $NAMEDSEQFH; 77 78 sub openunicode { 79 my ($rfh, @path) = @_; 80 my $f; 81 unless (defined $$rfh) { 82 for my $d (@INC) { 83 use File::Spec; 84 $f = File::Spec->catfile($d, "unicore", @path); 85 last if open($$rfh, $f); 86 undef $f; 87 } 88 croak __PACKAGE__, ": failed to find ", 89 File::Spec->catfile(@path), " in @INC" 90 unless defined $f; 91 } 92 return $f; 93 } 94 95 =head2 charinfo 96 97 use Unicode::UCD 'charinfo'; 98 99 my $charinfo = charinfo(0x41); 100 101 charinfo() returns a reference to a hash that has the following fields 102 as defined by the Unicode standard: 103 104 key 105 106 code code point with at least four hexdigits 107 name name of the character IN UPPER CASE 108 category general category of the character 109 combining classes used in the Canonical Ordering Algorithm 110 bidi bidirectional type 111 decomposition character decomposition mapping 112 decimal if decimal digit this is the integer numeric value 113 digit if digit this is the numeric value 114 numeric if numeric is the integer or rational numeric value 115 mirrored if mirrored in bidirectional text 116 unicode10 Unicode 1.0 name if existed and different 117 comment ISO 10646 comment field 118 upper uppercase equivalent mapping 119 lower lowercase equivalent mapping 120 title titlecase equivalent mapping 121 122 block block the character belongs to (used in \p{In...}) 123 script script the character belongs to 124 125 If no match is found, a reference to an empty hash is returned. 126 127 The C<block> property is the same as returned by charinfo(). It is 128 not defined in the Unicode Character Database proper (Chapter 4 of the 129 Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database 130 (Chapter 14 of TUS3). Similarly for the C<script> property. 131 132 Note that you cannot do (de)composition and casing based solely on the 133 above C<decomposition> and C<lower>, C<upper>, C<title>, properties, 134 you will need also the compexcl(), casefold(), and casespec() functions. 135 136 =cut 137 138 # NB: This function is duplicated in charnames.pm 139 sub _getcode { 140 my $arg = shift; 141 142 if ($arg =~ /^[1-9]\d*$/) { 143 return $arg; 144 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) { 145 return hex($1); 146 } 147 148 return; 149 } 150 151 # Lingua::KO::Hangul::Util not part of the standard distribution 152 # but it will be used if available. 153 154 eval { require Lingua::KO::Hangul::Util }; 155 my $hasHangulUtil = ! $@; 156 if ($hasHangulUtil) { 157 Lingua::KO::Hangul::Util->import(); 158 } 159 160 sub hangul_decomp { # internal: called from charinfo 161 if ($hasHangulUtil) { 162 my @tmp = decomposeHangul(shift); 163 return sprintf("%04X %04X", @tmp) if @tmp == 2; 164 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3; 165 } 166 return; 167 } 168 169 sub hangul_charname { # internal: called from charinfo 170 return sprintf("HANGUL SYLLABLE-%04X", shift); 171 } 172 173 sub han_charname { # internal: called from charinfo 174 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift); 175 } 176 177 my @CharinfoRanges = ( 178 # block name 179 # [ first, last, coderef to name, coderef to decompose ], 180 # CJK Ideographs Extension A 181 [ 0x3400, 0x4DB5, \&han_charname, undef ], 182 # CJK Ideographs 183 [ 0x4E00, 0x9FA5, \&han_charname, undef ], 184 # Hangul Syllables 185 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ], 186 # Non-Private Use High Surrogates 187 [ 0xD800, 0xDB7F, undef, undef ], 188 # Private Use High Surrogates 189 [ 0xDB80, 0xDBFF, undef, undef ], 190 # Low Surrogates 191 [ 0xDC00, 0xDFFF, undef, undef ], 192 # The Private Use Area 193 [ 0xE000, 0xF8FF, undef, undef ], 194 # CJK Ideographs Extension B 195 [ 0x20000, 0x2A6D6, \&han_charname, undef ], 196 # Plane 15 Private Use Area 197 [ 0xF0000, 0xFFFFD, undef, undef ], 198 # Plane 16 Private Use Area 199 [ 0x100000, 0x10FFFD, undef, undef ], 200 ); 201 202 sub charinfo { 203 my $arg = shift; 204 my $code = _getcode($arg); 205 croak __PACKAGE__, "::charinfo: unknown code '$arg'" 206 unless defined $code; 207 my $hexk = sprintf("%06X", $code); 208 my($rcode,$rname,$rdec); 209 foreach my $range (@CharinfoRanges){ 210 if ($range->[0] <= $code && $code <= $range->[1]) { 211 $rcode = $hexk; 212 $rcode =~ s/^0+//; 213 $rcode = sprintf("%04X", hex($rcode)); 214 $rname = $range->[2] ? $range->[2]->($code) : ''; 215 $rdec = $range->[3] ? $range->[3]->($code) : ''; 216 $hexk = sprintf("%06X", $range->[0]); # replace by the first 217 last; 218 } 219 } 220 openunicode(\$UNICODEFH, "UnicodeData.txt"); 221 if (defined $UNICODEFH) { 222 use Search::Dict 1.02; 223 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) { 224 my $line = <$UNICODEFH>; 225 return unless defined $line; 226 chomp $line; 227 my %prop; 228 @prop{qw( 229 code name category 230 combining bidi decomposition 231 decimal digit numeric 232 mirrored unicode10 comment 233 upper lower title 234 )} = split(/;/, $line, -1); 235 $hexk =~ s/^0+//; 236 $hexk = sprintf("%04X", hex($hexk)); 237 if ($prop{code} eq $hexk) { 238 $prop{block} = charblock($code); 239 $prop{script} = charscript($code); 240 if(defined $rname){ 241 $prop{code} = $rcode; 242 $prop{name} = $rname; 243 $prop{decomposition} = $rdec; 244 } 245 return \%prop; 246 } 247 } 248 } 249 return; 250 } 251 252 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table. 253 my ($table, $lo, $hi, $code) = @_; 254 255 return if $lo > $hi; 256 257 my $mid = int(($lo+$hi) / 2); 258 259 if ($table->[$mid]->[0] < $code) { 260 if ($table->[$mid]->[1] >= $code) { 261 return $table->[$mid]->[2]; 262 } else { 263 _search($table, $mid + 1, $hi, $code); 264 } 265 } elsif ($table->[$mid]->[0] > $code) { 266 _search($table, $lo, $mid - 1, $code); 267 } else { 268 return $table->[$mid]->[2]; 269 } 270 } 271 272 sub charinrange { 273 my ($range, $arg) = @_; 274 my $code = _getcode($arg); 275 croak __PACKAGE__, "::charinrange: unknown code '$arg'" 276 unless defined $code; 277 _search($range, 0, $#$range, $code); 278 } 279 280 =head2 charblock 281 282 use Unicode::UCD 'charblock'; 283 284 my $charblock = charblock(0x41); 285 my $charblock = charblock(1234); 286 my $charblock = charblock("0x263a"); 287 my $charblock = charblock("U+263a"); 288 289 my $range = charblock('Armenian'); 290 291 With a B<code point argument> charblock() returns the I<block> the character 292 belongs to, e.g. C<Basic Latin>. Note that not all the character 293 positions within all blocks are defined. 294 295 See also L</Blocks versus Scripts>. 296 297 If supplied with an argument that can't be a code point, charblock() tries 298 to do the opposite and interpret the argument as a character block. The 299 return value is a I<range>: an anonymous list of lists that contain 300 I<start-of-range>, I<end-of-range> code point pairs. You can test whether 301 a code point is in a range using the L</charinrange> function. If the 302 argument is not a known character block, C<undef> is returned. 303 304 =cut 305 306 my @BLOCKS; 307 my %BLOCKS; 308 309 sub _charblocks { 310 unless (@BLOCKS) { 311 if (openunicode(\$BLOCKSFH, "Blocks.txt")) { 312 local $_; 313 while (<$BLOCKSFH>) { 314 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) { 315 my ($lo, $hi) = (hex($1), hex($2)); 316 my $subrange = [ $lo, $hi, $3 ]; 317 push @BLOCKS, $subrange; 318 push @{$BLOCKS{$3}}, $subrange; 319 } 320 } 321 close($BLOCKSFH); 322 } 323 } 324 } 325 326 sub charblock { 327 my $arg = shift; 328 329 _charblocks() unless @BLOCKS; 330 331 my $code = _getcode($arg); 332 333 if (defined $code) { 334 _search(\@BLOCKS, 0, $#BLOCKS, $code); 335 } else { 336 if (exists $BLOCKS{$arg}) { 337 return dclone $BLOCKS{$arg}; 338 } else { 339 return; 340 } 341 } 342 } 343 344 =head2 charscript 345 346 use Unicode::UCD 'charscript'; 347 348 my $charscript = charscript(0x41); 349 my $charscript = charscript(1234); 350 my $charscript = charscript("U+263a"); 351 352 my $range = charscript('Thai'); 353 354 With a B<code point argument> charscript() returns the I<script> the 355 character belongs to, e.g. C<Latin>, C<Greek>, C<Han>. 356 357 See also L</Blocks versus Scripts>. 358 359 If supplied with an argument that can't be a code point, charscript() tries 360 to do the opposite and interpret the argument as a character script. The 361 return value is a I<range>: an anonymous list of lists that contain 362 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a 363 code point is in a range using the L</charinrange> function. If the 364 argument is not a known character script, C<undef> is returned. 365 366 =cut 367 368 my @SCRIPTS; 369 my %SCRIPTS; 370 371 sub _charscripts { 372 unless (@SCRIPTS) { 373 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) { 374 local $_; 375 while (<$SCRIPTSFH>) { 376 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) { 377 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1)); 378 my $script = lc($3); 379 $script =~ s/\b(\w)/uc($1)/ge; 380 my $subrange = [ $lo, $hi, $script ]; 381 push @SCRIPTS, $subrange; 382 push @{$SCRIPTS{$script}}, $subrange; 383 } 384 } 385 close($SCRIPTSFH); 386 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS; 387 } 388 } 389 } 390 391 sub charscript { 392 my $arg = shift; 393 394 _charscripts() unless @SCRIPTS; 395 396 my $code = _getcode($arg); 397 398 if (defined $code) { 399 _search(\@SCRIPTS, 0, $#SCRIPTS, $code); 400 } else { 401 if (exists $SCRIPTS{$arg}) { 402 return dclone $SCRIPTS{$arg}; 403 } else { 404 return; 405 } 406 } 407 } 408 409 =head2 charblocks 410 411 use Unicode::UCD 'charblocks'; 412 413 my $charblocks = charblocks(); 414 415 charblocks() returns a reference to a hash with the known block names 416 as the keys, and the code point ranges (see L</charblock>) as the values. 417 418 See also L</Blocks versus Scripts>. 419 420 =cut 421 422 sub charblocks { 423 _charblocks() unless %BLOCKS; 424 return dclone \%BLOCKS; 425 } 426 427 =head2 charscripts 428 429 use Unicode::UCD 'charscripts'; 430 431 my $charscripts = charscripts(); 432 433 charscripts() returns a reference to a hash with the known script 434 names as the keys, and the code point ranges (see L</charscript>) as 435 the values. 436 437 See also L</Blocks versus Scripts>. 438 439 =cut 440 441 sub charscripts { 442 _charscripts() unless %SCRIPTS; 443 return dclone \%SCRIPTS; 444 } 445 446 =head2 Blocks versus Scripts 447 448 The difference between a block and a script is that scripts are closer 449 to the linguistic notion of a set of characters required to present 450 languages, while block is more of an artifact of the Unicode character 451 numbering and separation into blocks of (mostly) 256 characters. 452 453 For example the Latin B<script> is spread over several B<blocks>, such 454 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and 455 C<Latin Extended-B>. On the other hand, the Latin script does not 456 contain all the characters of the C<Basic Latin> block (also known as 457 the ASCII): it includes only the letters, and not, for example, the digits 458 or the punctuation. 459 460 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt 461 462 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/ 463 464 =head2 Matching Scripts and Blocks 465 466 Scripts are matched with the regular-expression construct 467 C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script), 468 while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches 469 any of the 256 code points in the Tibetan block). 470 471 =head2 Code Point Arguments 472 473 A I<code point argument> is either a decimal or a hexadecimal scalar 474 designating a Unicode character, or C<U+> followed by hexadecimals 475 designating a Unicode character. In other words, if you want a code 476 point to be interpreted as a hexadecimal number, you must prefix it 477 with either C<0x> or C<U+>, because a string like e.g. C<123> will 478 be interpreted as a decimal code point. Also note that Unicode is 479 B<not> limited to 16 bits (the number of Unicode characters is 480 open-ended, in theory unlimited): you may have more than 4 hexdigits. 481 482 =head2 charinrange 483 484 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you 485 can also test whether a code point is in the I<range> as returned by 486 L</charblock> and L</charscript> or as the values of the hash returned 487 by L</charblocks> and L</charscripts> by using charinrange(): 488 489 use Unicode::UCD qw(charscript charinrange); 490 491 $range = charscript('Hiragana'); 492 print "looks like hiragana\n" if charinrange($range, $codepoint); 493 494 =cut 495 496 my %GENERAL_CATEGORIES = 497 ( 498 'L' => 'Letter', 499 'LC' => 'CasedLetter', 500 'Lu' => 'UppercaseLetter', 501 'Ll' => 'LowercaseLetter', 502 'Lt' => 'TitlecaseLetter', 503 'Lm' => 'ModifierLetter', 504 'Lo' => 'OtherLetter', 505 'M' => 'Mark', 506 'Mn' => 'NonspacingMark', 507 'Mc' => 'SpacingMark', 508 'Me' => 'EnclosingMark', 509 'N' => 'Number', 510 'Nd' => 'DecimalNumber', 511 'Nl' => 'LetterNumber', 512 'No' => 'OtherNumber', 513 'P' => 'Punctuation', 514 'Pc' => 'ConnectorPunctuation', 515 'Pd' => 'DashPunctuation', 516 'Ps' => 'OpenPunctuation', 517 'Pe' => 'ClosePunctuation', 518 'Pi' => 'InitialPunctuation', 519 'Pf' => 'FinalPunctuation', 520 'Po' => 'OtherPunctuation', 521 'S' => 'Symbol', 522 'Sm' => 'MathSymbol', 523 'Sc' => 'CurrencySymbol', 524 'Sk' => 'ModifierSymbol', 525 'So' => 'OtherSymbol', 526 'Z' => 'Separator', 527 'Zs' => 'SpaceSeparator', 528 'Zl' => 'LineSeparator', 529 'Zp' => 'ParagraphSeparator', 530 'C' => 'Other', 531 'Cc' => 'Control', 532 'Cf' => 'Format', 533 'Cs' => 'Surrogate', 534 'Co' => 'PrivateUse', 535 'Cn' => 'Unassigned', 536 ); 537 538 sub general_categories { 539 return dclone \%GENERAL_CATEGORIES; 540 } 541 542 =head2 general_categories 543 544 use Unicode::UCD 'general_categories'; 545 546 my $categories = general_categories(); 547 548 The general_categories() returns a reference to a hash which has short 549 general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long 550 names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>, 551 C<Symbol>) as values. The hash is reversible in case you need to go 552 from the long names to the short names. The general category is the 553 one returned from charinfo() under the C<category> key. 554 555 =cut 556 557 my %BIDI_TYPES = 558 ( 559 'L' => 'Left-to-Right', 560 'LRE' => 'Left-to-Right Embedding', 561 'LRO' => 'Left-to-Right Override', 562 'R' => 'Right-to-Left', 563 'AL' => 'Right-to-Left Arabic', 564 'RLE' => 'Right-to-Left Embedding', 565 'RLO' => 'Right-to-Left Override', 566 'PDF' => 'Pop Directional Format', 567 'EN' => 'European Number', 568 'ES' => 'European Number Separator', 569 'ET' => 'European Number Terminator', 570 'AN' => 'Arabic Number', 571 'CS' => 'Common Number Separator', 572 'NSM' => 'Non-Spacing Mark', 573 'BN' => 'Boundary Neutral', 574 'B' => 'Paragraph Separator', 575 'S' => 'Segment Separator', 576 'WS' => 'Whitespace', 577 'ON' => 'Other Neutrals', 578 ); 579 580 sub bidi_types { 581 return dclone \%BIDI_TYPES; 582 } 583 584 =head2 bidi_types 585 586 use Unicode::UCD 'bidi_types'; 587 588 my $categories = bidi_types(); 589 590 The bidi_types() returns a reference to a hash which has the short 591 bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long 592 names (such as C<Left-to-Right>, C<Right-to-Left>) as values. The 593 hash is reversible in case you need to go from the long names to the 594 short names. The bidi type is the one returned from charinfo() 595 under the C<bidi> key. For the exact meaning of the various bidi classes 596 the Unicode TR9 is recommended reading: 597 http://www.unicode.org/reports/tr9/tr9-17.html 598 (as of Unicode 5.0.0) 599 600 =cut 601 602 =head2 compexcl 603 604 use Unicode::UCD 'compexcl'; 605 606 my $compexcl = compexcl("09dc"); 607 608 The compexcl() returns the composition exclusion (that is, if the 609 character should not be produced during a precomposition) of the 610 character specified by a B<code point argument>. 611 612 If there is a composition exclusion for the character, true is 613 returned. Otherwise, false is returned. 614 615 =cut 616 617 my %COMPEXCL; 618 619 sub _compexcl { 620 unless (%COMPEXCL) { 621 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) { 622 local $_; 623 while (<$COMPEXCLFH>) { 624 if (/^([0-9A-F]+)\s+\#\s+/) { 625 my $code = hex($1); 626 $COMPEXCL{$code} = undef; 627 } 628 } 629 close($COMPEXCLFH); 630 } 631 } 632 } 633 634 sub compexcl { 635 my $arg = shift; 636 my $code = _getcode($arg); 637 croak __PACKAGE__, "::compexcl: unknown code '$arg'" 638 unless defined $code; 639 640 _compexcl() unless %COMPEXCL; 641 642 return exists $COMPEXCL{$code}; 643 } 644 645 =head2 casefold 646 647 use Unicode::UCD 'casefold'; 648 649 my $casefold = casefold("00DF"); 650 651 The casefold() returns the locale-independent case folding of the 652 character specified by a B<code point argument>. 653 654 If there is a case folding for that character, a reference to a hash 655 with the following fields is returned: 656 657 key 658 659 code code point with at least four hexdigits 660 status "C", "F", "S", or "I" 661 mapping one or more codes separated by spaces 662 663 The meaning of the I<status> is as follows: 664 665 C common case folding, common mappings shared 666 by both simple and full mappings 667 F full case folding, mappings that cause strings 668 to grow in length. Multiple characters are separated 669 by spaces 670 S simple case folding, mappings to single characters 671 where different from F 672 I special case for dotted uppercase I and 673 dotless lowercase i 674 - If this mapping is included, the result is 675 case-insensitive, but dotless and dotted I's 676 are not distinguished 677 - If this mapping is excluded, the result is not 678 fully case-insensitive, but dotless and dotted 679 I's are distinguished 680 681 If there is no case folding for that character, C<undef> is returned. 682 683 For more information about case mappings see 684 http://www.unicode.org/unicode/reports/tr21/ 685 686 =cut 687 688 my %CASEFOLD; 689 690 sub _casefold { 691 unless (%CASEFOLD) { 692 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) { 693 local $_; 694 while (<$CASEFOLDFH>) { 695 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) { 696 my $code = hex($1); 697 $CASEFOLD{$code} = { code => $1, 698 status => $2, 699 mapping => $3 }; 700 } 701 } 702 close($CASEFOLDFH); 703 } 704 } 705 } 706 707 sub casefold { 708 my $arg = shift; 709 my $code = _getcode($arg); 710 croak __PACKAGE__, "::casefold: unknown code '$arg'" 711 unless defined $code; 712 713 _casefold() unless %CASEFOLD; 714 715 return $CASEFOLD{$code}; 716 } 717 718 =head2 casespec 719 720 use Unicode::UCD 'casespec'; 721 722 my $casespec = casespec("FB00"); 723 724 The casespec() returns the potentially locale-dependent case mapping 725 of the character specified by a B<code point argument>. The mapping 726 may change the length of the string (which the basic Unicode case 727 mappings as returned by charinfo() never do). 728 729 If there is a case folding for that character, a reference to a hash 730 with the following fields is returned: 731 732 key 733 734 code code point with at least four hexdigits 735 lower lowercase 736 title titlecase 737 upper uppercase 738 condition condition list (may be undef) 739 740 The C<condition> is optional. Where present, it consists of one or 741 more I<locales> or I<contexts>, separated by spaces (other than as 742 used to separate elements, spaces are to be ignored). A condition 743 list overrides the normal behavior if all of the listed conditions are 744 true. Case distinctions in the condition list are not significant. 745 Conditions preceded by "NON_" represent the negation of the condition. 746 747 Note that when there are multiple case folding definitions for a 748 single code point because of different locales, the value returned by 749 casespec() is a hash reference which has the locales as the keys and 750 hash references as described above as the values. 751 752 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly 753 followed by a "_" and a 2-letter ISO language code (possibly followed 754 by a "_" and a variant code). You can find the lists of those codes, 755 see L<Locale::Country> and L<Locale::Language>. 756 757 A I<context> is one of the following choices: 758 759 FINAL The letter is not followed by a letter of 760 general category L (e.g. Ll, Lt, Lu, Lm, or Lo) 761 MODERN The mapping is only used for modern text 762 AFTER_i The last base character was "i" (U+0069) 763 764 For more information about case mappings see 765 http://www.unicode.org/unicode/reports/tr21/ 766 767 =cut 768 769 my %CASESPEC; 770 771 sub _casespec { 772 unless (%CASESPEC) { 773 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) { 774 local $_; 775 while (<$CASESPECFH>) { 776 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) { 777 my ($hexcode, $lower, $title, $upper, $condition) = 778 ($1, $2, $3, $4, $5); 779 my $code = hex($hexcode); 780 if (exists $CASESPEC{$code}) { 781 if (exists $CASESPEC{$code}->{code}) { 782 my ($oldlower, 783 $oldtitle, 784 $oldupper, 785 $oldcondition) = 786 @{$CASESPEC{$code}}{qw(lower 787 title 788 upper 789 condition)}; 790 if (defined $oldcondition) { 791 my ($oldlocale) = 792 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/); 793 delete $CASESPEC{$code}; 794 $CASESPEC{$code}->{$oldlocale} = 795 { code => $hexcode, 796 lower => $oldlower, 797 title => $oldtitle, 798 upper => $oldupper, 799 condition => $oldcondition }; 800 } 801 } 802 my ($locale) = 803 ($condition =~ /^([a-z][a-z](?:_\S+)?)/); 804 $CASESPEC{$code}->{$locale} = 805 { code => $hexcode, 806 lower => $lower, 807 title => $title, 808 upper => $upper, 809 condition => $condition }; 810 } else { 811 $CASESPEC{$code} = 812 { code => $hexcode, 813 lower => $lower, 814 title => $title, 815 upper => $upper, 816 condition => $condition }; 817 } 818 } 819 } 820 close($CASESPECFH); 821 } 822 } 823 } 824 825 sub casespec { 826 my $arg = shift; 827 my $code = _getcode($arg); 828 croak __PACKAGE__, "::casespec: unknown code '$arg'" 829 unless defined $code; 830 831 _casespec() unless %CASESPEC; 832 833 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code}; 834 } 835 836 =head2 namedseq() 837 838 use Unicode::UCD 'namedseq'; 839 840 my $namedseq = namedseq("KATAKANA LETTER AINU P"); 841 my @namedseq = namedseq("KATAKANA LETTER AINU P"); 842 my %namedseq = namedseq(); 843 844 If used with a single argument in a scalar context, returns the string 845 consisting of the code points of the named sequence, or C<undef> if no 846 named sequence by that name exists. If used with a single argument in 847 a list context, returns list of the code points. If used with no 848 arguments in a list context, returns a hash with the names of the 849 named sequences as the keys and the named sequences as strings as 850 the values. Otherwise, returns C<undef> or empty list depending 851 on the context. 852 853 (New from Unicode 4.1.0) 854 855 =cut 856 857 my %NAMEDSEQ; 858 859 sub _namedseq { 860 unless (%NAMEDSEQ) { 861 if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) { 862 local $_; 863 while (<$NAMEDSEQFH>) { 864 if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) { 865 my ($n, $s) = ($1, $2); 866 my @s = map { chr(hex($_)) } split(' ', $s); 867 $NAMEDSEQ{$n} = join("", @s); 868 } 869 } 870 close($NAMEDSEQFH); 871 } 872 } 873 } 874 875 sub namedseq { 876 _namedseq() unless %NAMEDSEQ; 877 my $wantarray = wantarray(); 878 if (defined $wantarray) { 879 if ($wantarray) { 880 if (@_ == 0) { 881 return %NAMEDSEQ; 882 } elsif (@_ == 1) { 883 my $s = $NAMEDSEQ{ $_[0] }; 884 return defined $s ? map { ord($_) } split('', $s) : (); 885 } 886 } elsif (@_ == 1) { 887 return $NAMEDSEQ{ $_[0] }; 888 } 889 } 890 return; 891 } 892 893 =head2 Unicode::UCD::UnicodeVersion 894 895 Unicode::UCD::UnicodeVersion() returns the version of the Unicode 896 Character Database, in other words, the version of the Unicode 897 standard the database implements. The version is a string 898 of numbers delimited by dots (C<'.'>). 899 900 =cut 901 902 my $UNICODEVERSION; 903 904 sub UnicodeVersion { 905 unless (defined $UNICODEVERSION) { 906 openunicode(\$VERSIONFH, "version"); 907 chomp($UNICODEVERSION = <$VERSIONFH>); 908 close($VERSIONFH); 909 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'" 910 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/; 911 } 912 return $UNICODEVERSION; 913 } 914 915 =head2 Implementation Note 916 917 The first use of charinfo() opens a read-only filehandle to the Unicode 918 Character Database (the database is included in the Perl distribution). 919 The filehandle is then kept open for further queries. In other words, 920 if you are wondering where one of your filehandles went, that's where. 921 922 =head1 BUGS 923 924 Does not yet support EBCDIC platforms. 925 926 =head1 AUTHOR 927 928 Jarkko Hietaniemi 929 930 =cut 931 932 1;
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Tue Mar 17 22:47:18 2015 | Cross-referenced by PHPXref 0.7.1 |