[ Index ] |
PHP Cross Reference of Joomla 4.2.2 documentation |
[Summary view] [Print] [Text view]
<?php
/**
 * Converts between various flavours of Unicode representations like UCS-4 or UTF-8
 * Supported schemes:
 * - UCS-4 Little Endian / Big Endian / Array (partially)
 * - UTF-16 Little Endian / Big Endian (not yet)
 * - UTF-8
 * - UTF-7
 * - UTF-7 IMAP (modified UTF-7)
 *
 * @package IdnaConvert
 * @author Matthias Sommerfeld <[email protected]>
 * @copyright 2003-2019 algo26 Beratungs GmbH, Berlin, https://www.algo26.de
 */

namespace Algo26\IdnaConvert\TranscodeUnicode;

use Algo26\IdnaConvert\Exception\InvalidCharacterException;
use InvalidArgumentException;

/**
 * Transcoder that uses an array of integer code points ("ucs4array") as the
 * pivot format: every conversion is "source -> ucs4array -> target".
 *
 * The per-format converters are private methods named "<from>_<to>" and are
 * looked up dynamically by convert(); their names must therefore stay in sync
 * with the FORMAT_* constant values.
 */
class TranscodeUnicode implements TranscodeUnicodeInterface
{
    public const FORMAT_UCS4       = 'ucs4';
    public const FORMAT_UCS4_ARRAY = 'ucs4array';
    public const FORMAT_UTF8       = 'utf8';
    public const FORMAT_UTF7       = 'utf7';
    public const FORMAT_UTF7_IMAP  = 'utf7imap';

    /** All format identifiers accepted by convert(). */
    private const ENCODINGS = [
        self::FORMAT_UCS4,
        self::FORMAT_UCS4_ARRAY,
        self::FORMAT_UTF8,
        self::FORMAT_UTF7,
        self::FORMAT_UTF7_IMAP,
    ];

    /** UTF-8 bytes of U+FFFC, used when the configured safe code point itself cannot be encoded. */
    private const FALLBACK_REPLACEMENT_UTF8 = "\xEF\xBF\xBC";

    /** @var bool Whether malformed input is replaced (true) or raises an exception (false) */
    private $safeMode;

    /** @var int Code point substituted for malformed input; keeps its value between convert() calls */
    private $safeCodepoint = 0xFFFC;

    /**
     * Convert $data from one supported representation into another.
     *
     * Format names are matched case-insensitively. When both names are equal the
     * data is returned untouched, before any validation of the format name.
     *
     * @param string|array $data          String for the string formats, list of integer code points for "ucs4array"
     * @param string       $fromEncoding  One of the FORMAT_* values
     * @param string       $toEncoding    One of the FORMAT_* values
     * @param bool         $safeMode      Replace malformed input with $safeCodepoint instead of throwing
     * @param int|null     $safeCodepoint Replacement code point; null keeps the currently configured one
     *
     * @return string|array Array of code points when $toEncoding is "ucs4array", otherwise a string
     * @throws InvalidArgumentException  Unknown input format (code 300) or output format (code 301)
     * @throws InvalidCharacterException Malformed input while safe mode is off
     */
    public function convert(
        $data,
        string $fromEncoding,
        string $toEncoding,
        bool $safeMode = false,
        ?int $safeCodepoint = null
    ) {
        $this->safeMode = $safeMode;
        if ($safeCodepoint !== null) {
            $this->safeCodepoint = $safeCodepoint;
        }

        $fromEncoding = strtolower($fromEncoding);
        $toEncoding = strtolower($toEncoding);

        if ($fromEncoding === $toEncoding) {
            return $data;
        }

        if (!in_array($fromEncoding, self::ENCODINGS, true)) {
            throw new InvalidArgumentException(sprintf('Invalid input format %s', $fromEncoding), 300);
        }
        if (!in_array($toEncoding, self::ENCODINGS, true)) {
            throw new InvalidArgumentException(sprintf('Invalid output format %s', $toEncoding), 301);
        }

        // Step 1: bring the input into the pivot format.
        if ($fromEncoding !== self::FORMAT_UCS4_ARRAY) {
            $methodName = sprintf('%s_%s', $fromEncoding, self::FORMAT_UCS4_ARRAY);
            $data = $this->$methodName($data);
        }
        // Step 2: render the pivot format in the requested output format.
        if ($toEncoding !== self::FORMAT_UCS4_ARRAY) {
            $methodName = sprintf('%s_%s', self::FORMAT_UCS4_ARRAY, $toEncoding);
            $data = $this->$methodName($data);
        }

        return $data;
    }

    /**
     * Decode a UTF-8 byte string into an array of code points.
     *
     * A small state machine walks the input byte by byte:
     *  - 'next': a new character is expected (ASCII byte or multi-byte start byte)
     *  - 'add' : continuation bytes of the current multi-byte character are collected
     *  - 'no'  : the input ended inside a multi-byte character; nothing more is decoded
     *
     * In safe mode malformed sequences are replaced by the safe code point,
     * otherwise an InvalidCharacterException is thrown. A sequence that is cut
     * off by the end of the input yields the safe code point in both modes.
     *
     * @param string $input The UTF-8 string to convert
     *
     * @return array Array of integer code points
     * @throws InvalidCharacterException
     */
    private function utf8_ucs4array($input): array
    {
        $startByte = 0;
        $nextByte = 0;

        $output = [];
        $outputLength = 0;
        $inputLength = $this->byteLength($input);
        $mode = 'next';
        $test = 'none';

        for ($k = 0; $k < $inputLength; ++$k) {
            $v = ord($input[$k]); // Extract byte from input string

            if ($v < 128) { // ASCII is copied as is
                $output[$outputLength] = $v;
                ++$outputLength;
                if ('add' === $mode) {
                    // ASCII byte inside an unfinished multi-byte character
                    if (!$this->safeMode) {
                        throw new InvalidCharacterException(
                            sprintf(
                                'Conversion from UTF-8 to UCS-4 failed: malformed input at byte %d',
                                $k
                            ),
                            302
                        );
                    }
                    // Replace the partial character that precedes the ASCII byte just stored
                    $output[$outputLength - 2] = $this->safeCodepoint;
                    $mode = 'next';
                }

                continue;
            }

            if ('next' === $mode) { // Start byte: determine the width of the character
                $startByte = $v;
                $mode = 'add';
                $test = 'range';
                if ($v >> 5 === 6) { // 110xxxxx 10xxxxxx
                    $nextByte = 0; // How many 6 bit shifts the next continuation byte needs
                    $v = ($v - 192) << 6;
                } elseif ($v >> 4 === 14) { // 1110xxxx 10xxxxxx 10xxxxxx
                    $nextByte = 1;
                    $v = ($v - 224) << 12;
                } elseif ($v >> 3 === 30) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                    $nextByte = 2;
                    $v = ($v - 240) << 18;
                } elseif ($this->safeMode) {
                    $mode = 'next';
                    $output[$outputLength] = $this->safeCodepoint;
                    ++$outputLength;

                    continue;
                } else {
                    throw new InvalidCharacterException(
                        sprintf('This might be UTF-8, but I don\'t understand it at byte %d', $k),
                        303
                    );
                }

                if (($inputLength - $k - $nextByte) < 2) {
                    // Fewer bytes left than this character needs. The slot has to be
                    // claimed, otherwise a trailing ASCII byte would overwrite the
                    // replacement character.
                    $output[$outputLength] = $this->safeCodepoint;
                    ++$outputLength;
                    $mode = 'no';

                    continue;
                }

                // Store the bits of the start byte; continuation bytes are added to this slot
                $output[$outputLength] = (int) $v;
                ++$outputLength;

                continue;
            }

            if ('add' === $mode) {
                if (!$this->safeMode && $test === 'range') {
                    // Only the first continuation byte is range checked against its start byte
                    $test = 'none';
                    if (($v < 0xA0 && $startByte === 0xE0)
                        || ($v < 0x90 && $startByte === 0xF0)
                        || ($v > 0x8F && $startByte === 0xF4)
                    ) {
                        throw new InvalidCharacterException(
                            sprintf('Bogus UTF-8 character (out of legal range) at byte %d', $k),
                            304
                        );
                    }
                }

                if ($v >> 6 === 2) { // Bit mask must be 10xxxxxx
                    $v = ($v - 128) << ($nextByte * 6);
                    $output[$outputLength - 1] += $v;
                    --$nextByte;
                } elseif ($this->safeMode) {
                    // The safe code point already is an integer; it must not be passed through ord()
                    $output[$outputLength - 1] = $this->safeCodepoint;
                    $k--; // Look at this byte again, now as a potential start byte
                    $mode = 'next';

                    continue;
                } else {
                    throw new InvalidCharacterException(
                        sprintf('Conversion from UTF-8 to UCS-4 failed: malformed input at byte %d', $k),
                        302
                    );
                }

                if ($nextByte < 0) {
                    $mode = 'next';
                }
            }
        }

        return $output;
    }

    /**
     * Encode an array of code points as a UTF-8 string.
     *
     * Values of 2^21 and above cannot be encoded: in safe mode they are replaced
     * by the UTF-8 encoding of the safe code point, otherwise an exception is thrown.
     *
     * @param array $input Array of integer code points
     *
     * @return string
     * @throws InvalidCharacterException
     */
    private function ucs4array_utf8($input): string
    {
        $output = '';
        foreach ($input as $k => $v) {
            $encoded = $this->encodeCodepointAsUtf8($v);
            if (null === $encoded) {
                if (!$this->safeMode) {
                    throw new InvalidCharacterException(
                        sprintf('Conversion from UCS-4 to UTF-8 failed: malformed input at byte %d', $k),
                        305
                    );
                }
                // Emit the replacement *character*, not the decimal digits of its code point
                $encoded = $this->encodeCodepointAsUtf8($this->safeCodepoint)
                    ?? self::FALLBACK_REPLACEMENT_UTF8;
            }
            $output .= $encoded;
        }

        return $output;
    }

    /**
     * Encode a single code point as UTF-8.
     *
     * @param int $v Code point
     *
     * @return string|null The 1 to 4 byte sequence, or null when $v is 2^21 or larger
     */
    private function encodeCodepointAsUtf8($v): ?string
    {
        if ($v < 128) { // 7bit are transferred literally
            return chr($v);
        }
        if ($v < (1 << 11)) { // 2 bytes
            return chr(192 + ($v >> 6))
                . chr(128 + ($v & 63));
        }
        if ($v < (1 << 16)) { // 3 bytes
            return chr(224 + ($v >> 12))
                . chr(128 + (($v >> 6) & 63))
                . chr(128 + ($v & 63));
        }
        if ($v < (1 << 21)) { // 4 bytes
            return chr(240 + ($v >> 18))
                . chr(128 + (($v >> 12) & 63))
                . chr(128 + (($v >> 6) & 63))
                . chr(128 + ($v & 63));
        }

        return null;
    }

    /**
     * Decode an IMAP modified UTF-7 string into an array of code points.
     *
     * Modified UTF-7 shifts with "&" and uses "," instead of "/" inside Base64
     * runs. The substitution is confined to the Base64 runs so that a literal
     * "," outside of a run stays a comma.
     *
     * @param string $input Modified UTF-7 string
     *
     * @return array Array of integer code points
     */
    private function utf7imap_ucs4array($input): array
    {
        return $this->utf7_ucs4array($input, '&', ',');
    }

    /**
     * Decode a UTF-7 string into an array of code points.
     *
     * Zero bytes are ignored. A Base64 run is ended by any character outside the
     * Base64 alphabet; only a terminating "-" is absorbed, every other terminator
     * is handled as a regular direct character. A run that is still open at the
     * end of the input is decoded as well.
     *
     * @param string $input  UTF-7 string
     * @param string $sc     Shift character that opens a Base64 run
     * @param string $char63 Character standing for Base64 value 63 ("/" in UTF-7, "," in IMAP)
     *
     * @return array Array of integer code points
     */
    private function utf7_ucs4array($input, $sc = '+', $char63 = '/'): array
    {
        $output = [];
        $inputLength = $this->byteLength($input);
        $inBase64 = false;
        $b64 = '';
        $alphabet = '![A-Za-z0-9+' . preg_quote($char63, '!') . ']!';

        for ($k = 0; $k < $inputLength; ++$k) {
            $c = $input[$k];

            // Ignore zero bytes
            if (0 === ord($c)) {
                continue;
            }

            if ($inBase64) {
                if (preg_match($alphabet, $c)) {
                    $b64 .= $c;

                    continue;
                }

                // Sequence got terminated
                $inBase64 = false;
                if ('-' === $c && $b64 === '') {
                    // Shift character directly followed by "-" is the escaped shift character
                    $output[] = ord($sc);

                    continue;
                }
                foreach ($this->decodeUtf7Base64Run($b64, $char63) as $codepoint) {
                    $output[] = $codepoint;
                }
                $b64 = '';
                if ('-' === $c) {
                    continue; // the explicit terminator is absorbed
                }
                // Any other terminator is a regular character: fall through
            }

            if ($sc === $c) {
                $inBase64 = true;

                continue;
            }

            $output[] = ord($c);
        }

        // The end of the input terminates an open Base64 run, too
        if ($inBase64) {
            foreach ($this->decodeUtf7Base64Run($b64, $char63) as $codepoint) {
                $output[] = $codepoint;
            }
        }

        return $output;
    }

    /**
     * Decode the Base64 payload of one UTF-7 run (big endian UTF-16) into code points.
     *
     * A dangling odd byte at the end of the decoded payload is dropped. A high
     * surrogate directly followed by a low surrogate is combined into a single
     * code point; unpaired surrogates are returned unchanged.
     *
     * @param string $b64    Base64 characters collected from the run, without padding
     * @param string $char63 Character standing for Base64 value 63
     *
     * @return array Array of integer code points
     */
    private function decodeUtf7Base64Run($b64, $char63): array
    {
        if ($b64 === '') {
            return [];
        }

        $bytes = base64_decode(strtr($b64, $char63, '/'));
        if (false === $bytes) {
            return [];
        }

        $byteCount = $this->byteLength($bytes);
        $usable = $byteCount - ($byteCount % 2); // only complete 16 bit units

        $codepoints = [];
        for ($i = 0; $i < $usable; $i += 2) {
            $unit = (ord($bytes[$i]) << 8) | ord($bytes[$i + 1]);

            if ($unit >= 0xD800 && $unit <= 0xDBFF && ($i + 3) < $usable) {
                $low = (ord($bytes[$i + 2]) << 8) | ord($bytes[$i + 3]);
                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $codepoints[] = 0x10000 + (($unit - 0xD800) << 10) + ($low - 0xDC00);
                    $i += 2; // the low surrogate has been consumed as well

                    continue;
                }
            }

            $codepoints[] = $unit;
        }

        return $codepoints;
    }

    /**
     * Encode an array of code points as IMAP modified UTF-7.
     *
     * "&" is the shift character and "," replaces "/" inside Base64 runs only,
     * so that a literal "/" (e.g. a mailbox hierarchy delimiter) is preserved.
     *
     * @param array $input Array of integer code points
     *
     * @return string
     */
    private function ucs4array_utf7imap($input): string
    {
        return $this->ucs4array_utf7($input, '&', ',');
    }

    /**
     * Encode an array of code points as UTF-7.
     *
     * Code points from 0x20 to 0x7E, except the shift character, are written
     * directly. Everything else is collected as big endian UTF-16 and written as
     * one Base64 run that is always closed with "-". A run that consists of the
     * shift character only is written in its short form (shift character + "-").
     *
     * @param array  $input  Array of integer code points
     * @param string $sc     Shift character that opens a Base64 run
     * @param string $char63 Character standing for Base64 value 63 ("/" in UTF-7, "," in IMAP)
     *
     * @return string
     */
    private function ucs4array_utf7($input, $sc = '+', $char63 = '/'): string
    {
        $output = '';
        $pending = ''; // UTF-16BE bytes waiting to be Base64 encoded
        $shiftCode = ord($sc);

        foreach ($input as $v) {
            $isDirect = (0x20 <= $v && $v <= 0x7e && $v !== $shiftCode);
            if (!$isDirect) {
                $pending .= $this->codepointToUtf16Be($v);

                continue;
            }

            $output .= $this->encodeUtf7Base64Run($pending, $sc, $char63) . chr($v);
            $pending = '';
        }

        return $output . $this->encodeUtf7Base64Run($pending, $sc, $char63);
    }

    /**
     * Render the collected UTF-16BE bytes of one run as UTF-7.
     *
     * @param string $pending UTF-16BE bytes, may be empty
     * @param string $sc      Shift character
     * @param string $char63  Character standing for Base64 value 63
     *
     * @return string Empty string when nothing is pending
     */
    private function encodeUtf7Base64Run($pending, $sc, $char63): string
    {
        if ($pending === '') {
            return '';
        }
        if ($pending === chr(0) . $sc) {
            return $sc . '-';
        }

        $encoded = str_replace('=', '', base64_encode($pending));

        return $sc . strtr($encoded, '/', $char63) . '-';
    }

    /**
     * Encode one code point as big endian UTF-16.
     *
     * Code points from U+10000 to U+10FFFF become a surrogate pair. Any other
     * value is reduced to its lower 16 bits.
     *
     * @param int $v Code point
     *
     * @return string Two or four bytes
     */
    private function codepointToUtf16Be($v): string
    {
        if ($v > 0xFFFF && $v <= 0x10FFFF) {
            $offset = $v - 0x10000;
            $high = 0xD800 | ($offset >> 10);
            $low = 0xDC00 | ($offset & 0x3FF);

            return chr($high >> 8) . chr($high & 255) . chr($low >> 8) . chr($low & 255);
        }

        return chr(($v >> 8) & 255) . chr($v & 255);
    }

    /**
     * Convert an array of code points into a UCS-4 string.
     *
     * Each code point is written as four bytes, most significant byte first
     * (big endian).
     *
     * @param array $input Array of integer code points
     *
     * @return string
     */
    private function ucs4array_ucs4($input): string
    {
        $output = '';
        foreach ($input as $v) {
            $output .= chr(($v >> 24) & 255)
                . chr(($v >> 16) & 255)
                . chr(($v >> 8) & 255)
                . chr($v & 255);
        }

        return $output;
    }

    /**
     * Convert a UCS-4 string into an array of code points.
     *
     * Every group of four bytes is read most significant byte first (big endian).
     *
     * @param string $input UCS-4 string
     *
     * @return array Array of integer code points
     * @throws InvalidCharacterException When the byte length is not a multiple of 4 (code 306)
     */
    private function ucs4_ucs4array($input): array
    {
        $output = [];

        $inputLength = $this->byteLength($input);
        // Input length must be dividable by 4
        if ($inputLength % 4) {
            throw new InvalidCharacterException('Input UCS4 string is broken', 306);
        }

        for ($i = 0; $i < $inputLength; $i += 4) {
            $output[] = (ord($input[$i]) << 24)
                + (ord($input[$i + 1]) << 16)
                + (ord($input[$i + 2]) << 8)
                + ord($input[$i + 3]);
        }

        return $output;
    }

    /**
     * Gets the length of a string in bytes even if mbstring function
     * overloading is turned on
     *
     * @param string $string the string for which to get the length.
     * @return integer the length of the string in bytes.
     */
    protected function byteLength($string)
    {
        if (extension_loaded('mbstring')
            && ((int) ini_get('mbstring.func_overload') & 0x02) === 0x02
        ) {
            return mb_strlen($string, '8bit');
        }

        // (string) is the canonical spelling of the former (binary) cast alias
        return strlen((string) $string);
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Wed Sep 7 05:41:13 2022 | Chilli.vc Blog - For Webmaster,Blog-Writer,System Admin and Domainer |