PHPXRef 0.7.1 : Joomla 4.2.2 documentation : /libraries/vendor/joomla/string/src/phputf8/utils/bad.php source

[Summary view] [Print] [Text view]
   1  <?php
   2  /**
   3  * Tools for locating / replacing bad bytes in UTF-8 strings
   4  * The Original Code is Mozilla Communicator client code.
   5  * The Initial Developer of the Original Code is
   6  * Netscape Communications Corporation.
   7  * Portions created by the Initial Developer are Copyright (C) 1998
   8  * the Initial Developer. All Rights Reserved.
   9  * Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
  10  * Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
  11  * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
  12  * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
  13  * @see http://hsivonen.iki.fi/php-utf8/
  14  * @package utf8
  15  * @see utf8_is_valid
  16  */
  17  
  18  //--------------------------------------------------------------------
  19  /**
  20  * Locates the first bad byte in a UTF-8 string returning it's
  21  * byte index in the string
  22  * PCRE Pattern to locate bad bytes in a UTF-8 string
  23  * Comes from W3 FAQ: Multilingual Forms
  24  * Note: modified to include full ASCII range including control chars
  25  * @see http://www.w3.org/International/questions/qa-forms-utf-8
  26  * @param string
  27  * @return mixed integer byte index or FALSE if no bad found
  28  * @package utf8
  29  */
  30  function utf8_bad_find($str) {
  31      $UTF8_BAD =
  32      '([\x00-\x7F]'.                          # ASCII (including control chars)
  33      '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
  34      '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
  35      '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
  36      '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
  37      '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
  38      '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
  39      '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
  40      '|(.{1}))';                              # invalid byte
  41      $pos = 0;
  42      $badList = array();
  43      while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
  44          $bytes = strlen($matches[0]);
  45          if ( isset($matches[2])) {
  46              return $pos;
  47          }
  48          $pos += $bytes;
  49          $str = substr($str,$bytes);
  50      }
  51      return FALSE;
  52  }
  53  
  54  //--------------------------------------------------------------------
  55  /**
  56  * Locates all bad bytes in a UTF-8 string and returns a list of their
  57  * byte index in the string
  58  * PCRE Pattern to locate bad bytes in a UTF-8 string
  59  * Comes from W3 FAQ: Multilingual Forms
  60  * Note: modified to include full ASCII range including control chars
  61  * @see http://www.w3.org/International/questions/qa-forms-utf-8
  62  * @param string
  63  * @return mixed array of integers or FALSE if no bad found
  64  * @package utf8
  65  */
  66  function utf8_bad_findall($str) {
  67      $UTF8_BAD =
  68      '([\x00-\x7F]'.                          # ASCII (including control chars)
  69      '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
  70      '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
  71      '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
  72      '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
  73      '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
  74      '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
  75      '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
  76      '|(.{1}))';                              # invalid byte
  77      $pos = 0;
  78      $badList = array();
  79      while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
  80          $bytes = strlen($matches[0]);
  81          if ( isset($matches[2])) {
  82              $badList[] = $pos;
  83          }
  84          $pos += $bytes;
  85          $str = substr($str,$bytes);
  86      }
  87      if ( count($badList) > 0 ) {
  88          return $badList;
  89      }
  90      return FALSE;
  91  }
  92  
  93  //--------------------------------------------------------------------
  94  /**
  95  * Strips out any bad bytes from a UTF-8 string and returns the rest
  96  * PCRE Pattern to locate bad bytes in a UTF-8 string
  97  * Comes from W3 FAQ: Multilingual Forms
  98  * Note: modified to include full ASCII range including control chars
  99  * @see http://www.w3.org/International/questions/qa-forms-utf-8
 100  * @param string
 101  * @return string
 102  * @package utf8
 103  */
 104  function utf8_bad_strip($str) {
 105      $UTF8_BAD =
 106      '([\x00-\x7F]'.                          # ASCII (including control chars)
 107      '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
 108      '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
 109      '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
 110      '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
 111      '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
 112      '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
 113      '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
 114      '|(.{1}))';                              # invalid byte
 115      ob_start();
 116      while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
 117          if ( !isset($matches[2])) {
 118              echo $matches[0];
 119          }
 120          $str = substr($str,strlen($matches[0]));
 121      }
 122      $result = ob_get_contents();
 123      ob_end_clean();
 124      return $result;
 125  }
 126  
 127  //--------------------------------------------------------------------
 128  /**
 129  * Replace bad bytes with an alternative character - ASCII character
 130  * recommended is replacement char
 131  * PCRE Pattern to locate bad bytes in a UTF-8 string
 132  * Comes from W3 FAQ: Multilingual Forms
 133  * Note: modified to include full ASCII range including control chars
 134  * @see http://www.w3.org/International/questions/qa-forms-utf-8
 135  * @param string to search
 136  * @param string to replace bad bytes with (defaults to '?') - use ASCII
 137  * @return string
 138  * @package utf8
 139  */
 140  function utf8_bad_replace($str, $replace = '?') {
 141      $UTF8_BAD =
 142      '([\x00-\x7F]'.                          # ASCII (including control chars)
 143      '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
 144      '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
 145      '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
 146      '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
 147      '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
 148      '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
 149      '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
 150      '|(.{1}))';                              # invalid byte
 151      ob_start();
 152      while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
 153          if ( !isset($matches[2])) {
 154              echo $matches[0];
 155          } else {
 156              echo $replace;
 157          }
 158          $str = substr($str,strlen($matches[0]));
 159      }
 160      $result = ob_get_contents();
 161      ob_end_clean();
 162      return $result;
 163  }
 164  
 165  //--------------------------------------------------------------------
 166  /**
 167  * Return code from utf8_bad_identify() when a five octet sequence is detected.
 168  * Note: 5 octets sequences are valid UTF-8 but are not supported by Unicode so
 169  * do not represent a useful character
 170  * @see utf8_bad_identify
 171  * @package utf8
 172  */
 173  define('UTF8_BAD_5OCTET',1);
 174  
 175  /**
 176  * Return code from utf8_bad_identify() when a six octet sequence is detected.
 177  * Note: 6 octets sequences are valid UTF-8 but are not supported by Unicode so
 178  * do not represent a useful character
 179  * @see utf8_bad_identify
 180  * @package utf8
 181  */
 182  define('UTF8_BAD_6OCTET',2);
 183  
 184  /**
 185  * Return code from utf8_bad_identify().
 186  * Invalid octet for use as start of multi-byte UTF-8 sequence
 187  * @see utf8_bad_identify
 188  * @package utf8
 189  */
 190  define('UTF8_BAD_SEQID',3);
 191  
 192  /**
 193  * Return code from utf8_bad_identify().
 194  * From Unicode 3.1, non-shortest form is illegal
 195  * @see utf8_bad_identify
 196  * @package utf8
 197  */
 198  define('UTF8_BAD_NONSHORT',4);
 199  
 200  /**
 201  * Return code from utf8_bad_identify().
 202  * From Unicode 3.2, surrogate characters are illegal
 203  * @see utf8_bad_identify
 204  * @package utf8
 205  */
 206  define('UTF8_BAD_SURROGATE',5);
 207  
 208  /**
 209  * Return code from utf8_bad_identify().
 210  * Codepoints outside the Unicode range are illegal
 211  * @see utf8_bad_identify
 212  * @package utf8
 213  */
 214  define('UTF8_BAD_UNIOUTRANGE',6);
 215  
 216  /**
 217  * Return code from utf8_bad_identify().
 218  * Incomplete multi-octet sequence
 219  * Note: this is kind of a "catch-all"
 220  * @see utf8_bad_identify
 221  * @package utf8
 222  */
 223  define('UTF8_BAD_SEQINCOMPLETE',7);
 224  
 225  //--------------------------------------------------------------------
 226  /**
 227  * Reports on the type of bad byte found in a UTF-8 string. Returns a
 228  * status code on the first bad byte found
 229  *
 230  * Joomla modification - As of PHP 7.4, curly brace access has been deprecated. As a result this function has been
 231  * modified to use square brace syntax
 232  * See https://github.com/php/php-src/commit/d574df63dc375f5fc9202ce5afde23f866b6450a
 233  * for additional references
 234  *
 235  * @author <[email protected]>
 236  * @param string UTF-8 encoded string
 237  * @return mixed integer constant describing problem or FALSE if valid UTF-8
 238  * @see utf8_bad_explain
 239  * @see http://hsivonen.iki.fi/php-utf8/
 240  * @package utf8
 241  */
 242  function utf8_bad_identify($str, &$i) {
 243  
 244      $mState = 0;     // cached expected number of octets after the current octet
 245                       // until the beginning of the next UTF8 character sequence
 246      $mUcs4  = 0;     // cached Unicode character
 247      $mBytes = 1;     // cached expected number of octets in the current sequence
 248  
 249      $len = strlen($str);
 250  
 251      for($i = 0; $i < $len; $i++) {
 252  
 253          $in = ord($str[$i]);
 254  
 255          if ( $mState == 0) {
 256  
 257              // When mState is zero we expect either a US-ASCII character or a
 258              // multi-octet sequence.
 259              if (0 == (0x80 & ($in))) {
 260                  // US-ASCII, pass straight through.
 261                  $mBytes = 1;
 262  
 263              } else if (0xC0 == (0xE0 & ($in))) {
 264                  // First octet of 2 octet sequence
 265                  $mUcs4 = ($in);
 266                  $mUcs4 = ($mUcs4 & 0x1F) << 6;
 267                  $mState = 1;
 268                  $mBytes = 2;
 269  
 270              } else if (0xE0 == (0xF0 & ($in))) {
 271                  // First octet of 3 octet sequence
 272                  $mUcs4 = ($in);
 273                  $mUcs4 = ($mUcs4 & 0x0F) << 12;
 274                  $mState = 2;
 275                  $mBytes = 3;
 276  
 277              } else if (0xF0 == (0xF8 & ($in))) {
 278                  // First octet of 4 octet sequence
 279                  $mUcs4 = ($in);
 280                  $mUcs4 = ($mUcs4 & 0x07) << 18;
 281                  $mState = 3;
 282                  $mBytes = 4;
 283  
 284              } else if (0xF8 == (0xFC & ($in))) {
 285  
 286                  /* First octet of 5 octet sequence.
 287                  *
 288                  * This is illegal because the encoded codepoint must be either
 289                  * (a) not the shortest form or
 290                  * (b) outside the Unicode range of 0-0x10FFFF.
 291                  */
 292  
 293                  return UTF8_BAD_5OCTET;
 294  
 295              } else if (0xFC == (0xFE & ($in))) {
 296  
 297                  // First octet of 6 octet sequence, see comments for 5 octet sequence.
 298                  return UTF8_BAD_6OCTET;
 299  
 300              } else {
 301                  // Current octet is neither in the US-ASCII range nor a legal first
 302                  // octet of a multi-octet sequence.
 303                  return UTF8_BAD_SEQID;
 304  
 305              }
 306  
 307          } else {
 308  
 309              // When mState is non-zero, we expect a continuation of the multi-octet
 310              // sequence
 311              if (0x80 == (0xC0 & ($in))) {
 312  
 313                  // Legal continuation.
 314                  $shift = ($mState - 1) * 6;
 315                  $tmp = $in;
 316                  $tmp = ($tmp & 0x0000003F) << $shift;
 317                  $mUcs4 |= $tmp;
 318  
 319                  /**
 320                  * End of the multi-octet sequence. mUcs4 now contains the final
 321                  * Unicode codepoint to be output
 322                  */
 323                  if (0 == --$mState) {
 324  
 325                      // From Unicode 3.1, non-shortest form is illegal
 326                      if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
 327                          ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
 328                          ((4 == $mBytes) && ($mUcs4 < 0x10000)) ) {
 329                          return UTF8_BAD_NONSHORT;
 330  
 331                      // From Unicode 3.2, surrogate characters are illegal
 332                      } else if (($mUcs4 & 0xFFFFF800) == 0xD800) {
 333                          return UTF8_BAD_SURROGATE;
 334  
 335                      // Codepoints outside the Unicode range are illegal
 336                      } else if ($mUcs4 > 0x10FFFF) {
 337                          return UTF8_BAD_UNIOUTRANGE;
 338                      }
 339  
 340                      //initialize UTF8 cache
 341                      $mState = 0;
 342                      $mUcs4  = 0;
 343                      $mBytes = 1;
 344                  }
 345  
 346              } else {
 347                  // ((0xC0 & (*in) != 0x80) && (mState != 0))
 348                  // Incomplete multi-octet sequence.
 349                  $i--;
 350                  return UTF8_BAD_SEQINCOMPLETE;
 351              }
 352          }
 353      }
 354  
 355      if ( $mState != 0 ) {
 356          // Incomplete multi-octet sequence.
 357          $i--;
 358          return UTF8_BAD_SEQINCOMPLETE;
 359      }
 360  
 361      // No bad octets found
 362      $i = NULL;
 363      return FALSE;
 364  }
 365  
 366  //--------------------------------------------------------------------
 367  /**
 368  * Takes a return code from utf8_bad_identify() are returns a message
 369  * (in English) explaining what the problem is.
 370  * @param int return code from utf8_bad_identify
 371  * @return mixed string message or FALSE if return code unknown
 372  * @see utf8_bad_identify
 373  * @package utf8
 374  */
 375  function utf8_bad_explain($code) {
 376  
 377      switch ($code) {
 378  
 379          case UTF8_BAD_5OCTET:
 380              return 'Five octet sequences are valid UTF-8 but are not supported by Unicode';
 381          break;
 382  
 383          case UTF8_BAD_6OCTET:
 384              return 'Six octet sequences are valid UTF-8 but are not supported by Unicode';
 385          break;
 386  
 387          case UTF8_BAD_SEQID:
 388              return 'Invalid octet for use as start of multi-byte UTF-8 sequence';
 389          break;
 390  
 391          case UTF8_BAD_NONSHORT:
 392              return 'From Unicode 3.1, non-shortest form is illegal';
 393          break;
 394  
 395          case UTF8_BAD_SURROGATE:
 396              return 'From Unicode 3.2, surrogate characters are illegal';
 397          break;
 398  
 399          case UTF8_BAD_UNIOUTRANGE:
 400              return 'Codepoints outside the Unicode range are illegal';
 401          break;
 402  
 403          case UTF8_BAD_SEQINCOMPLETE:
 404              return 'Incomplete multi-octet sequence';
 405          break;
 406  
 407      }
 408  
 409      trigger_error('Unknown error code: '.$code,E_USER_WARNING);
 410      return FALSE;
 411  
 412  }
PHP Cross Reference of Joomla 4.2.2 documentation

/libraries/vendor/joomla/string/src/phputf8/utils/ -> bad.php (source)