[ Index ]

PHP Cross Reference of Joomla 4.2.2 documentation

title

Body

[close]

/libraries/vendor/algo26-matthias/idna-convert/src/TranscodeUnicode/ -> TranscodeUnicode.php (source)

   1  <?php
   2  /**
   3   * Converts between various flavours of Unicode representations like UCS-4 or UTF-8
   4   * Supported schemes:
   5   * - UCS-4 Little Endian / Big Endian / Array (partially)
   6   * - UTF-16 Little Endian / Big Endian (not yet)
   7   * - UTF-8
   8   * - UTF-7
   9   * - UTF-7 IMAP (modified UTF-7)
  10   *
  11   * @package IdnaConvert
  12   * @author Matthias Sommerfeld  <[email protected]>
  13   * @copyright 2003-2019 algo26 Beratungs GmbH, Berlin, https://www.algo26.de
  14   */
  15  
  16  namespace Algo26\IdnaConvert\TranscodeUnicode;
  17  
  18  use Algo26\IdnaConvert\Exception\InvalidCharacterException;
  19  use InvalidArgumentException;
  20  
  21  class TranscodeUnicode implements TranscodeUnicodeInterface
  22  {
    // Encoding identifiers accepted by convert() for its $fromEncoding / $toEncoding arguments.
    // convert() lower-cases the arguments before comparing them against these values.
    public const FORMAT_UCS4       = 'ucs4';
    public const FORMAT_UCS4_ARRAY = 'ucs4array';
    public const FORMAT_UTF8       = 'utf8';
    public const FORMAT_UTF7       = 'utf7';
    public const FORMAT_UTF7_IMAP  = 'utf7imap';

    // Whitelist used by convert() to validate the requested encodings. The values also form
    // the names of the private converter methods ("<from>_ucs4array" / "ucs4array_<to>").
    private const encodings = [
        self::FORMAT_UCS4,
        self::FORMAT_UCS4_ARRAY,
        self::FORMAT_UTF8,
        self::FORMAT_UTF7,
        self::FORMAT_UTF7_IMAP
    ];

    // Set on every convert() call; when true, the converters substitute $safeCodepoint for
    // malformed input instead of throwing.
    private $safeMode;
    // Substitute code point (defaults to U+FFFC). convert() overwrites it only when it is
    // given a non-null $safeCodepoint, so a custom value stays in effect for later calls.
    private $safeCodepoint = 0xFFFC;
  39  
  40      public function convert(
  41          $data,
  42          string $fromEncoding,
  43          string $toEncoding,
  44          bool $safeMode = false,
  45          ?int $safeCodepoint = null
  46      ) {
  47          $this->safeMode = $safeMode;
  48          if ($safeCodepoint !== null) {
  49              $this->safeCodepoint = $safeCodepoint;
  50          }
  51  
  52          $fromEncoding = strtolower($fromEncoding);
  53          $toEncoding   = strtolower($toEncoding);
  54  
  55          if ($fromEncoding === $toEncoding) {
  56              return $data;
  57          }
  58  
  59          if (!in_array($fromEncoding, self::encodings)) {
  60              throw new InvalidArgumentException(sprintf('Invalid input format %s', $fromEncoding), 300);
  61          }
  62          if (!in_array($toEncoding, self::encodings)) {
  63              throw new InvalidArgumentException(sprintf('Invalid output format %s', $toEncoding), 301);
  64          }
  65  
  66          if ($fromEncoding !== self::FORMAT_UCS4_ARRAY) {
  67              $methodName = sprintf('%s_%s', $fromEncoding, self::FORMAT_UCS4_ARRAY);
  68              $data = $this->$methodName($data);
  69          }
  70          if ($toEncoding !== self::FORMAT_UCS4_ARRAY) {
  71              $methodName = sprintf('%s_%s', self::FORMAT_UCS4_ARRAY, $toEncoding);
  72              $data = $this->$methodName($data);
  73          }
  74  
  75          return $data;
  76      }
  77  
  78      /**
  79       * This converts an UTF-8 encoded string to its UCS-4 representation
  80       *
  81       * @param string $input The UTF-8 string to convert
  82       *
  83       * @return array  Array of 32bit values representing each codepoint
  84       * @throws InvalidCharacterException
  85       * @access public
  86       */
  87      private function utf8_ucs4array($input)
  88      {
  89          $startByte = 0;
  90          $nextByte = 0;
  91  
  92          $output = [];
  93          $outputLength = 0;
  94          $inputLength = $this->byteLength($input);
  95          $mode = 'next';
  96          $test = 'none';
  97          for ($k = 0; $k < $inputLength; ++$k) {
  98              $v = ord($input[$k]); // Extract byte from input string
  99  
 100              if ($v < 128) { // We found an ASCII char - put into string as is
 101                  $output[$outputLength] = $v;
 102                  ++$outputLength;
 103                  if ('add' === $mode) {
 104                      if ($this->safeMode) {
 105                          $output[$outputLength - 2] = $this->safeCodepoint;
 106                          $mode = 'next';
 107                      } else {
 108                          throw new InvalidCharacterException(
 109                              sprintf(
 110                                  'Conversion from UTF-8 to UCS-4 failed: malformed input at byte %d',
 111                                  $k
 112                              ),
 113                              302
 114                          );
 115                      }
 116                  }
 117  
 118                  continue;
 119              }
 120  
 121              if ('next' === $mode) { // Try to find the next start byte; determine the width of the Unicode char
 122                  $startByte = $v;
 123                  $mode = 'add';
 124                  $test = 'range';
 125                  if ($v >> 5 === 6) { // &110xxxxx 10xxxxx
 126                      $nextByte = 0; // How many times subsequent bit masks must rotate 6bits to the left
 127                      $v = ($v - 192) << 6;
 128                  } elseif ($v >> 4 === 14) { // &1110xxxx 10xxxxxx 10xxxxxx
 129                      $nextByte = 1;
 130                      $v = ($v - 224) << 12;
 131                  } elseif ($v >> 3 === 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 132                      $nextByte = 2;
 133                      $v = ($v - 240) << 18;
 134                  } elseif ($this->safeMode) {
 135                      $mode = 'next';
 136                      $output[$outputLength] = $this->safeCodepoint;
 137                      ++$outputLength;
 138  
 139                      continue;
 140                  } else {
 141                      throw new InvalidCharacterException(
 142                          sprintf('This might be UTF-8, but I don\'t understand it at byte %d', $k),
 143                          303
 144                      );
 145                  }
 146                  if (($inputLength - $k - $nextByte) < 2) {
 147                      $output[$outputLength] = $this->safeCodepoint;
 148                      $mode = 'no';
 149  
 150                      continue;
 151                  }
 152  
 153                  if ('add' === $mode) {
 154                      $output[$outputLength] = (int)$v;
 155                      ++$outputLength;
 156  
 157                      continue;
 158                  }
 159              }
 160              if ('add' == $mode) {
 161                  if (!$this->safeMode && $test === 'range') {
 162                      $test = 'none';
 163                      if (($v < 0xA0 && $startByte === 0xE0)
 164                          || ($v < 0x90 && $startByte === 0xF0)
 165                          || ($v > 0x8F && $startByte === 0xF4)
 166                      ) {
 167                          throw new InvalidCharacterException(
 168                              sprintf('Bogus UTF-8 character (out of legal range) at byte %d', $k),
 169                              304
 170                          );
 171                      }
 172                  }
 173                  if ($v >> 6 === 2) { // Bit mask must be 10xxxxxx
 174                      $v = ($v - 128) << ($nextByte * 6);
 175                      $output[($outputLength - 1)] += $v;
 176                      --$nextByte;
 177                  } else {
 178                      if ($this->safeMode) {
 179                          $output[$outputLength - 1] = ord($this->safeCodepoint);
 180                          $k--;
 181                          $mode = 'next';
 182  
 183                          continue;
 184                      } else {
 185                          throw new InvalidCharacterException(
 186                              sprintf('Conversion from UTF-8 to UCS-4 failed: malformed input at byte %d', $k),
 187                              302
 188                          );
 189                      }
 190                  }
 191                  if ($nextByte < 0) {
 192                      $mode = 'next';
 193                  }
 194              }
 195          } // for
 196  
 197          return $output;
 198      }
 199  
 200      /**
 201       * Convert UCS-4 arary into UTF-8 string
 202       * See utf8_ucs4array() for details
 203       *
 204       * @param $input array Array of UCS-4 codepoints
 205       *
 206       * @return string
 207       * @access   public
 208       * @throws InvalidCharacterException
 209       */
 210      private function ucs4array_utf8($input)
 211      {
 212          $output = '';
 213          foreach ($input as $k => $v) {
 214              if ($v < 128) { // 7bit are transferred literally
 215                  $output .= chr($v);
 216              } elseif ($v < (1 << 11)) { // 2 bytes
 217                  $output .= sprintf(
 218                      '%s%s',
 219                      chr(192 + ($v >> 6)),
 220                      chr(128 + ($v & 63))
 221                  );
 222              } elseif ($v < (1 << 16)) { // 3 bytes
 223                  $output .= sprintf(
 224                      '%s%s%s',
 225                      chr(224 + ($v >> 12)),
 226                      chr(128 + (($v >> 6) & 63)),
 227                      chr(128 + ($v & 63))
 228                  );
 229              } elseif ($v < (1 << 21)) { // 4 bytes
 230                  $output .= sprintf(
 231                      '%s%s%s%s',
 232                      chr(240 + ($v >> 18)),
 233                      chr(128 + (($v >> 12) & 63)),
 234                      chr(128 + (($v >> 6) & 63)),
 235                      chr(128 + ($v & 63))
 236                  );
 237              } elseif ($this->safeMode) {
 238                  $output .= $this->safeCodepoint;
 239              } else {
 240                  throw new InvalidCharacterException(
 241                      sprintf('Conversion from UCS-4 to UTF-8 failed: malformed input at byte %d', $k),
 242                      305
 243                  );
 244              }
 245          }
 246  
 247          return $output;
 248      }
 249  
 250      private function utf7imap_ucs4array($input)
 251      {
 252          return $this->utf7_ucs4array(str_replace(',', '/', $input), '&');
 253      }
 254  
 255      private function utf7_ucs4array($input, $sc = '+')
 256      {
 257          $output = [];
 258          $outputLength = 0;
 259          $inputLength = $this->byteLength($input);
 260          $mode = 'd';
 261          $b64 = '';
 262  
 263          for ($k = 0; $k < $inputLength; ++$k) {
 264              $c = $input[$k];
 265  
 266              // Ignore zero bytes
 267              if (0 === ord($c)) {
 268                  continue;
 269              }
 270              if ('b' === $mode) {
 271                  // Sequence got terminated
 272                  if (!preg_match('![A-Za-z0-9/'.preg_quote($sc, '!').']!', $c)) {
 273                      if ('-' == $c) {
 274                          if ($b64 === '') {
 275                              $output[$outputLength] = ord($sc);
 276                              $outputLength++;
 277                              $mode = 'd';
 278  
 279                              continue;
 280                          }
 281                      }
 282                      $tmp = base64_decode($b64);
 283                      $tmp = substr($tmp, -1 * (strlen($tmp) % 2));
 284                      for ($i = 0; $i < strlen($tmp); $i++) {
 285                          if ($i % 2) {
 286                              $output[$outputLength] += ord($tmp[$i]);
 287                              $outputLength++;
 288                          } else {
 289                              $output[$outputLength] = ord($tmp[$i]) << 8;
 290                          }
 291                      }
 292                      $mode = 'd';
 293                      $b64 = '';
 294  
 295                      continue;
 296                  } else {
 297                      $b64 .= $c;
 298                  }
 299              }
 300              if ('d' === $mode) {
 301                  if ($sc === $c) {
 302                      $mode = 'b';
 303  
 304                      continue;
 305                  }
 306  
 307                  $output[$outputLength] = ord($c);
 308                  $outputLength++;
 309              }
 310          }
 311  
 312          return $output;
 313      }
 314  
 315      private function ucs4array_utf7imap($input)
 316      {
 317          return str_replace(
 318              '/',
 319              ',',
 320              $this->ucs4array_utf7($input, '&')
 321          );
 322      }
 323  
 324      private function ucs4array_utf7($input, $sc = '+')
 325      {
 326          $output = '';
 327          $mode = 'd';
 328          $b64 = '';
 329          while (true) {
 330              $v = (!empty($input)) ? array_shift($input) : false;
 331              $isDirect = (false !== $v)
 332                  ? (0x20 <= $v && $v <= 0x7e && $v !== ord($sc))
 333                  : true;
 334              if ($mode === 'b') {
 335                  if ($isDirect) {
 336                      if ($b64 === chr(0).$sc) {
 337                          $output .= $sc.'-';
 338                          $b64 = '';
 339                      } elseif ($b64) {
 340                          $output .= $sc.str_replace('=', '', base64_encode($b64)).'-';
 341                          $b64 = '';
 342                      }
 343                      $mode = 'd';
 344                  } elseif (false !== $v) {
 345                      $b64 .= chr(($v >> 8) & 255).chr($v & 255);
 346                  }
 347              }
 348              if ($mode === 'd' && false !== $v) {
 349                  if ($isDirect) {
 350                      $output .= chr($v);
 351                  } else {
 352                      $b64 = chr(($v >> 8) & 255).chr($v & 255);
 353                      $mode = 'b';
 354                  }
 355              }
 356              if (false === $v && $b64 === '') {
 357                  break;
 358              }
 359          }
 360  
 361          return $output;
 362      }
 363  
 364      /**
 365       * Convert UCS-4 array into UCS-4 string (Little Endian at the moment)
 366       * @param $input array UCS-4 code points
 367       * @return string
 368       * @access   public
 369       */
 370      private function ucs4array_ucs4($input)
 371      {
 372          $output = '';
 373          foreach ($input as $v) {
 374              $output .= sprintf(
 375                  '%s%s%s%s',
 376                  chr(($v >> 24) & 255),
 377                  chr(($v >> 16) & 255),
 378                  chr(($v >> 8) & 255),
 379                  chr($v & 255)
 380              );
 381          }
 382  
 383          return $output;
 384      }
 385  
 386      /**
 387       * Convert UCS-4 string (LE ar the moment) into UCS-4 array
 388       *
 389       * @param $input string UCS-4 LE string
 390       *
 391       * @return array
 392       * @access   public
 393       * @throws InvalidCharacterException
 394       */
 395      private function ucs4_ucs4array($input)
 396      {
 397          $output = [];
 398  
 399          $inputLength = $this->byteLength($input);
 400          // Input length must be dividable by 4
 401          if ($inputLength % 4) {
 402              throw new InvalidCharacterException('Input UCS4 string is broken', 306);
 403          }
 404          // Empty input - return empty output
 405          if (!$inputLength) {
 406              return $output;
 407          }
 408  
 409          for ($i = 0, $outputLength = -1; $i < $inputLength; ++$i) {
 410              if (!($i % 4)) { // Increment output position every 4 input bytes
 411                  $outputLength++;
 412                  $output[$outputLength] = 0;
 413              }
 414              $output[$outputLength] += ord($input[$i]) << (8 * (3 - ($i % 4)));
 415          }
 416  
 417          return $output;
 418      }
 419      
 420      /**
 421       * Gets the length of a string in bytes even if mbstring function
 422       * overloading is turned on
 423       *
 424       * @param string $string the string for which to get the length.
 425       * @return integer the length of the string in bytes.
 426       */
 427      protected function byteLength($string)
 428      {
 429          if ((extension_loaded('mbstring')
 430               && (ini_get('mbstring.func_overload') & 0x02) === 0x02)
 431          ) {
 432              return mb_strlen($string, '8bit');
 433          }
 434  
 435          return strlen((binary) $string);
 436      }    
 437  }


Generated: Wed Sep 7 05:41:13 2022 Chilli.vc Blog - For Webmaster,Blog-Writer,System Admin and Domainer