PHPXRef 0.7.1 : Unnamed Project : /se3-unattended/var/se3/unattended/install/linuxaux/opt/perl/lib/5.10.0/i586-linux-thread-multi/Unicode/Normalize.pm source

[Summary view] [Print] [Text view]
   1  package Unicode::Normalize;
   2  
   3  BEGIN {    # compile-time sanity check, executed before the rest of the module is compiled
   4      unless ("A" eq pack('U', 0x41)) {    # pack('U', 0x41) must yield the one-character string "A"
   5      die "Unicode::Normalize cannot stringify a Unicode code point\n";    # trailing "\n" keeps file/line info out of the message
   6      }
   7  }
   8  
   9  use 5.006;
  10  use strict;
  11  use warnings;
  12  use Carp;
  13  
  14  no warnings 'utf8';
  15  
  16  our $VERSION = '1.02';
  17  our $PACKAGE = __PACKAGE__;
  18  
  19  require Exporter;
  20  require DynaLoader;
  21  
  22  our @ISA = qw(Exporter DynaLoader);    # Exporter provides import(); DynaLoader provides bootstrap()
  23  our @EXPORT = qw( NFC NFD NFKC NFKD );    # names exported by default
  24  our @EXPORT_OK = qw(
  25      normalize decompose reorder compose
  26      checkNFD checkNFKD checkNFC checkNFKC check
  27      getCanon getCompat getComposite getCombinClass
  28      isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
  29      isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
  30      FCD checkFCD FCC checkFCC composeContiguous
  31      splitOnLastStarter
  32  );    # the names listed above are exported only on request
  33  our %EXPORT_TAGS = (    # named groups for import lists: ':all', ':normalize', ':check', ':fast'
  34      all       => [ @EXPORT, @EXPORT_OK ],
  35      normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
  36      check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
  37      fast      => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
  38  );
  39  
  40  ######
  41  
  42  bootstrap Unicode::Normalize $VERSION;    # load the compiled part of the module through DynaLoader; NFC, NFD, checkNFC etc. have no Perl definition in this listing and are presumably supplied by it
  43  
  44  ######
  45  
  46  ##
  47  ## utilities for tests
  48  ##
  49  
  50  sub pack_U {    # test helper: build a string from a list of code points passed as arguments
  51      return pack('U*', @_);    # 'U*' converts every argument into one Unicode character
  52  }
  53  
  54  sub unpack_U {    # test helper: return the list of code points of the string given as first argument
  55      return unpack('U*', shift(@_).pack('U*'));    # an empty pack('U*') is appended first - presumably to make sure the string is handled as characters rather than bytes; confirm
  56  }
  57  
  58  
  59  ##
  60  ## normalization forms
  61  ##
  62  
  63  sub FCD ($) {    # FCD($str): return $str untouched when checkFCD() accepts it, otherwise return NFD($str)
  64      my $str = shift;
  65      return checkFCD($str) ? $str : NFD($str);    # checkFCD() and NFD() are not defined in the Perl code of this listing
  66  }
  67  
  68  our %formNorm = (    # dispatch table: form name (long or short spelling) => normalization function
  69      NFC  => \&NFC,    C  => \&NFC,
  70      NFD  => \&NFD,    D  => \&NFD,
  71      NFKC => \&NFKC,    KC => \&NFKC,
  72      NFKD => \&NFKD,    KD => \&NFKD,
  73      FCD  => \&FCD,    FCC => \&FCC,
  74  );
  75  
  76  sub normalize($$)    # normalize($form, $str): return $str normalized by the function registered for $form
  77  {
  78      my $form = shift;
  79      my $str = shift;
  80      if (exists $formNorm{$form}) {    # known form name: dispatch through %formNorm
  81      return $formNorm{$form}->($str);
  82      }
  83      croak($PACKAGE."::normalize: invalid form name: $form");    # unknown form name: raise an error via Carp::croak
  84  }
  85  
  86  
  87  ##
  88  ## quick check
  89  ##
  90  
  91  our %formCheck = (    # dispatch table: form name (long or short spelling) => quick-check function
  92      NFC  => \&checkNFC,     C  => \&checkNFC,
  93      NFD  => \&checkNFD,     D  => \&checkNFD,
  94      NFKC => \&checkNFKC,    KC => \&checkNFKC,
  95      NFKD => \&checkNFKD,    KD => \&checkNFKD,
  96      FCD  => \&checkFCD,     FCC => \&checkFCC,
  97  );
  98  
  99  sub check($$)    # check($form, $str): return the result of the quick-check function registered for $form
 100  {
 101      my $form = shift;
 102      my $str = shift;
 103      if (exists $formCheck{$form}) {    # known form name: dispatch through %formCheck
 104      return $formCheck{$form}->($str);
 105      }
 106      croak($PACKAGE."::check: invalid form name: $form");    # unknown form name: raise an error via Carp::croak
 107  }
 108  
 109  1;
 110  __END__
 111  
 112  =head1 NAME
 113  
 114  Unicode::Normalize - Unicode Normalization Forms
 115  
 116  =head1 SYNOPSIS
 117  
 118  (1) using function names exported by default:
 119  
 120    use Unicode::Normalize;
 121  
 122    $NFD_string  = NFD($string);  # Normalization Form D
 123    $NFC_string  = NFC($string);  # Normalization Form C
 124    $NFKD_string = NFKD($string); # Normalization Form KD
 125    $NFKC_string = NFKC($string); # Normalization Form KC
 126  
 127  (2) using function names exported on request:
 128  
 129    use Unicode::Normalize 'normalize';
 130  
 131    $NFD_string  = normalize('D',  $string);  # Normalization Form D
 132    $NFC_string  = normalize('C',  $string);  # Normalization Form C
 133    $NFKD_string = normalize('KD', $string);  # Normalization Form KD
 134    $NFKC_string = normalize('KC', $string);  # Normalization Form KC
 135  
 136  =head1 DESCRIPTION
 137  
 138  Parameters:
 139  
 140  C<$string> is used as a string under character semantics (see F<perlunicode>).
 141  
 142  C<$code_point> should be an unsigned integer representing a Unicode code point.
 143  
 144  Note: Between XSUB and pure Perl, there is an incompatibility
 145  about the interpretation of C<$code_point> as a decimal number.
 146  XSUB converts C<$code_point> to an unsigned integer, but pure Perl does not.
 147  Do not use a floating point nor a negative sign in C<$code_point>.
 148  
 149  =head2 Normalization Forms
 150  
 151  =over 4
 152  
 153  =item C<$NFD_string = NFD($string)>
 154  
 155  It returns the Normalization Form D (formed by canonical decomposition).
 156  
 157  =item C<$NFC_string = NFC($string)>
 158  
 159  It returns the Normalization Form C (formed by canonical decomposition
 160  followed by canonical composition).
 161  
 162  =item C<$NFKD_string = NFKD($string)>
 163  
 164  It returns the Normalization Form KD (formed by compatibility decomposition).
 165  
 166  =item C<$NFKC_string = NFKC($string)>
 167  
 168  It returns the Normalization Form KC (formed by compatibility decomposition
 169  followed by B<canonical> composition).
 170  
 171  =item C<$FCD_string = FCD($string)>
 172  
 173  If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
 174  it returns the string without modification; otherwise it returns an FCD string.
 175  
 176  Note: FCD is not always unique, so several different forms may be equivalent
 177  to each other. C<FCD()> will return one of these equivalent forms.
 178  
 179  =item C<$FCC_string = FCC($string)>
 180  
 181  It returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
 182  
 183  Note: FCC is unique, as are the four normalization forms (NF*).
 184  
 185  =item C<$normalized_string = normalize($form_name, $string)>
 186  
 187  It returns the normalization form of C<$form_name>.
 188  
 189  As C<$form_name>, one of the following names must be given.
 190  
 191    'C'  or 'NFC'  for Normalization Form C  (UAX #15)
 192    'D'  or 'NFD'  for Normalization Form D  (UAX #15)
 193    'KC' or 'NFKC' for Normalization Form KC (UAX #15)
 194    'KD' or 'NFKD' for Normalization Form KD (UAX #15)
 195  
 196    'FCD'          for "Fast C or D" Form  (UTN #5)
 197    'FCC'          for "Fast C Contiguous" (UTN #5)
 198  
 199  =back
 200  
 201  =head2 Decomposition and Composition
 202  
 203  =over 4
 204  
 205  =item C<$decomposed_string = decompose($string [, $useCompatMapping])>
 206  
 207  It returns the concatenation of the decomposition of each character
 208  in the string.
 209  
 210  If the second parameter (a boolean) is omitted or false,
 211  the decomposition is canonical decomposition;
 212  if the second parameter (a boolean) is true,
 213  the decomposition is compatibility decomposition.
 214  
 215  The string returned is not always in NFD/NFKD. Reordering may be required.
 216  
 217      $NFD_string  = reorder(decompose($string));       # eq. to NFD()
 218      $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
 219  
 220  =item C<$reordered_string = reorder($string)>
 221  
 222  It returns the result of reordering the combining characters
 223  according to Canonical Ordering Behavior.
 224  
 225  For example, when you have a list of NFD/NFKD strings,
 226  you can get the concatenated NFD/NFKD string from them, by saying
 227  
 228      $concat_NFD  = reorder(join '', @NFD_strings);
 229      $concat_NFKD = reorder(join '', @NFKD_strings);
 230  
 231  =item C<$composed_string = compose($string)>
 232  
 233  It returns the result of canonical composition
 234  without applying any decomposition.
 235  
 236  For example, when you have a NFD/NFKD string,
 237  you can get its NFC/NFKC string, by saying
 238  
 239      $NFC_string  = compose($NFD_string);
 240      $NFKC_string = compose($NFKD_string);
 241  
 242  =back
 243  
 244  =head2 Quick Check
 245  
 246  (see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)
 247  
 248  The following functions check whether the string is in that normalization form.
 249  
 250  The result returned will be one of the following:
 251  
 252      YES     The string is in that normalization form.
 253      NO      The string is not in that normalization form.
 254      MAYBE   Dubious. Maybe yes, maybe no.
 255  
 256  =over 4
 257  
 258  =item C<$result = checkNFD($string)>
 259  
 260  It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
 261  
 262  =item C<$result = checkNFC($string)>
 263  
 264  It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 265  C<undef> if C<MAYBE>.
 266  
 267  =item C<$result = checkNFKD($string)>
 268  
 269  It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
 270  
 271  =item C<$result = checkNFKC($string)>
 272  
 273  It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 274  C<undef> if C<MAYBE>.
 275  
 276  =item C<$result = checkFCD($string)>
 277  
 278  It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
 279  
 280  =item C<$result = checkFCC($string)>
 281  
 282  It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 283  C<undef> if C<MAYBE>.
 284  
 285  Note: If a string is not in FCD, it cannot be in FCC either.
 286  So C<checkFCC($not_FCD_string)> should return C<NO>.
 287  
 288  =item C<$result = check($form_name, $string)>
 289  
 290  It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 291  C<undef> if C<MAYBE>.
 292  
 293  As C<$form_name>, one of the following names must be given.
 294  
 295    'C'  or 'NFC'  for Normalization Form C  (UAX #15)
 296    'D'  or 'NFD'  for Normalization Form D  (UAX #15)
 297    'KC' or 'NFKC' for Normalization Form KC (UAX #15)
 298    'KD' or 'NFKD' for Normalization Form KD (UAX #15)
 299  
 300    'FCD'          for "Fast C or D" Form  (UTN #5)
 301    'FCC'          for "Fast C Contiguous" (UTN #5)
 302  
 303  =back
 304  
 305  B<Note>
 306  
 307  In the cases of NFD, NFKD, and FCD, the answer must be
 308  either C<YES> or C<NO>. The answer C<MAYBE> may be returned
 309  in the cases of NFC, NFKC, and FCC.
 310  
 311  A C<MAYBE> string should contain at least one combining character
 312  or the like. For example, C<COMBINING ACUTE ACCENT> has
 313  the MAYBE_NFC/MAYBE_NFKC property.
 314  
 315  Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
 316  and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
 317  C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
 318  (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
 319  while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
 320  
 321  If you want to check exactly, compare the string with its NFC/NFKC/FCC.
 322  
 323      if ($string eq NFC($string)) {
 324      # $string is exactly normalized in NFC;
 325      } else {
 326      # $string is not normalized in NFC;
 327      }
 328  
 329      if ($string eq NFKC($string)) {
 330      # $string is exactly normalized in NFKC;
 331      } else {
 332      # $string is not normalized in NFKC;
 333      }
 334  
 335  =head2 Character Data
 336  
 337  These functions are the interface to the character data used internally.
 338  If you only want to get Unicode normalization forms, you don't need
 339  to call them yourself.
 340  
 341  =over 4
 342  
 343  =item C<$canonical_decomposition = getCanon($code_point)>
 344  
 345  If the character is canonically decomposable (including Hangul Syllables),
 346  it returns the (full) canonical decomposition as a string.
 347  Otherwise it returns C<undef>.
 348  
 349  B<Note:> According to the Unicode standard, the canonical decomposition
 350  of a character that is not canonically decomposable is the same as
 351  the character itself.
 352  
 353  =item C<$compatibility_decomposition = getCompat($code_point)>
 354  
 355  If the character is compatibility decomposable (including Hangul Syllables),
 356  it returns the (full) compatibility decomposition as a string.
 357  Otherwise it returns C<undef>.
 358  
 359  B<Note:> According to the Unicode standard, the compatibility decomposition
 360  of a character that is not compatibility decomposable is the same as
 361  the character itself.
 362  
 363  =item C<$code_point_composite = getComposite($code_point_here, $code_point_next)>
 364  
 365  If two characters here and next (as code points) are composable
 366  (including Hangul Jamo/Syllables and Composition Exclusions),
 367  it returns the code point of the composite.
 368  
 369  If they are not composable, it returns C<undef>.
 370  
 371  =item C<$combining_class = getCombinClass($code_point)>
 372  
 373  It returns the combining class (as an integer) of the character.
 374  
 375  =item C<$may_be_composed_with_prev_char = isComp2nd($code_point)>
 376  
 377  It returns a boolean whether the character of the specified codepoint
 378  may be composed with the previous one in a certain composition
 379  (including Hangul Compositions, but excluding
 380  Composition Exclusions and Non-Starter Decompositions).
 381  
 382  =item C<$is_exclusion = isExclusion($code_point)>
 383  
 384  It returns a boolean whether the code point is a composition exclusion.
 385  
 386  =item C<$is_singleton = isSingleton($code_point)>
 387  
 388  It returns a boolean indicating whether the code point is a singleton.
 389  
 390  =item C<$is_non_starter_decomposition = isNonStDecomp($code_point)>
 391  
 392  It returns a boolean whether the code point has Non-Starter Decomposition.
 393  
 394  =item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)>
 395  
 396  It returns a boolean of the derived property Comp_Ex
 397  (Full_Composition_Exclusion). This property is generated from
 398  Composition Exclusions + Singletons + Non-Starter Decompositions.
 399  
 400  =item C<$NFD_is_NO = isNFD_NO($code_point)>
 401  
 402  It returns a boolean of the derived property NFD_NO
 403  (NFD_Quick_Check=No).
 404  
 405  =item C<$NFC_is_NO = isNFC_NO($code_point)>
 406  
 407  It returns a boolean of the derived property NFC_NO
 408  (NFC_Quick_Check=No).
 409  
 410  =item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)>
 411  
 412  It returns a boolean of the derived property NFC_MAYBE
 413  (NFC_Quick_Check=Maybe).
 414  
 415  =item C<$NFKD_is_NO = isNFKD_NO($code_point)>
 416  
 417  It returns a boolean of the derived property NFKD_NO
 418  (NFKD_Quick_Check=No).
 419  
 420  =item C<$NFKC_is_NO = isNFKC_NO($code_point)>
 421  
 422  It returns a boolean of the derived property NFKC_NO
 423  (NFKC_Quick_Check=No).
 424  
 425  =item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)>
 426  
 427  It returns a boolean of the derived property NFKC_MAYBE
 428  (NFKC_Quick_Check=Maybe).
 429  
 430  =back
 431  
 432  =head1 EXPORT
 433  
 434  C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
 435  
 436  C<normalize> and some other functions: on request.
 437  
 438  =head1 CAVEATS
 439  
 440  =over 4
 441  
 442  =item Perl's version vs. Unicode version
 443  
 444  Since this module refers to perl core's Unicode database in the directory
 445  F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
 446  normalization implemented by this module depends on your perl's version.
 447  
 448      perl's version     implemented Unicode version
 449         5.6.1              3.0.1
 450         5.7.2              3.1.0
 451         5.7.3              3.1.1 (normalization is same as 3.1.0)
 452         5.8.0              3.2.0
 453       5.8.1-5.8.3          4.0.0
 454       5.8.4-5.8.6          4.0.1 (normalization is same as 4.0.0)
 455       5.8.7-5.8.8          4.1.0
 456  
 457  =item Correction of decomposition mapping
 458  
 459  In older Unicode versions, a small number of characters (all of which are
 460  CJK compatibility ideographs as far as they have been found) may have
 461  an erroneous decomposition mapping (see F<NormalizationCorrections.txt>).
 462  Anyhow, this module will neither refer to F<NormalizationCorrections.txt>
 463  nor provide any specific version of normalization. Therefore this module
 464  running on an older perl with an older Unicode database may use
 465  the erroneous decomposition mapping blindly conforming to the Unicode database.
 466  
 467  =item Revised definition of canonical composition
 468  
 469  In Unicode 4.1.0, the definition D2 of canonical composition (which
 470  affects NFC and NFKC) has been changed (see Public Review Issue #29
 471  and recent UAX #15). This module has used the newer definition
 472  since the version 0.07 (Oct 31, 2001).
 473  This module will not support the normalization according to the older
 474  definition, even if the Unicode version implemented by perl is
 475  lower than 4.1.0.
 476  
 477  =back
 478  
 479  =head1 AUTHOR
 480  
 481  SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
 482  
 483  Copyright(C) 2001-2007, SADAHIRO Tomoyuki. Japan. All rights reserved.
 484  
 485  This module is free software; you can redistribute it
 486  and/or modify it under the same terms as Perl itself.
 487  
 488  =head1 SEE ALSO
 489  
 490  =over 4
 491  
 492  =item http://www.unicode.org/reports/tr15/
 493  
 494  Unicode Normalization Forms - UAX #15
 495  
 496  =item http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
 497  
 498  Composition Exclusion Table
 499  
 500  =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
 501  
 502  Derived Normalization Properties
 503  
 504  =item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt
 505  
 506  Normalization Corrections
 507  
 508  =item http://www.unicode.org/review/pr-29.html
 509  
 510  Public Review Issue #29: Normalization Issue
 511  
 512  =item http://www.unicode.org/notes/tn5/
 513  
 514  Canonical Equivalence in Applications - UTN #5
 515  
 516  =back
 517  
 518  =cut
PHP Cross Reference of Unnamed Project

/se3-unattended/var/se3/unattended/install/linuxaux/opt/perl/lib/5.10.0/i586-linux-thread-multi/Unicode/ -> Normalize.pm (source)