[ Index ] |
PHP Cross Reference of Joomla 4.2.2 documentation |
[Summary view] [Print] [Text view]
<?php
/**
 * Tools for locating / replacing bad bytes in UTF-8 strings
 * The Original Code is Mozilla Communicator client code.
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 1998
 * the Initial Developer. All Rights Reserved.
 * Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
 * Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
 * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
 * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
 * @see http://hsivonen.iki.fi/php-utf8/
 * @package utf8
 * @see utf8_is_valid
 */

//--------------------------------------------------------------------
/**
 * Builds the PCRE pattern shared by utf8_bad_find(), utf8_bad_findall(),
 * utf8_bad_strip() and utf8_bad_replace().
 *
 * Each match consumes exactly one token: either one well-formed UTF-8
 * sequence, or a single byte that does not start one. In the second
 * case (and only then) capture group 2 is set.
 *
 * Comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 *
 * The pattern starts with \G so it can be used with the $offset argument
 * of preg_match(): every match must begin exactly at the offset, which
 * lets callers walk the string without copying it.
 *
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @return string complete regular expression, delimiters and modifiers included
 * @package utf8
 * @access private
 */
function _utf8_bad_pattern() {
    return '/\G(' .
        '[\x00-\x7F]' .                        // ASCII (including control chars)
        '|[\xC2-\xDF][\x80-\xBF]' .            // non-overlong 2-byte
        '|\xE0[\xA0-\xBF][\x80-\xBF]' .        // excluding overlongs
        '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' . // straight 3-byte
        '|\xED[\x80-\x9F][\x80-\xBF]' .        // excluding surrogates
        '|\xF0[\x90-\xBF][\x80-\xBF]{2}' .     // planes 1-3
        '|[\xF1-\xF3][\x80-\xBF]{3}' .         // planes 4-15
        '|\xF4[\x80-\x8F][\x80-\xBF]{2}' .     // plane 16
        '|(.{1})' .                            // invalid byte (capture group 2)
        // s: the catch-all '.' must match every byte value, so the scan
        //    can never stall on a byte the other branches rejected.
        // S: kept from the original pattern (extra pattern analysis).
        ')/sS';
}

//--------------------------------------------------------------------
/**
 * Locates the first bad byte in a UTF-8 string returning its
 * byte index in the string
 *
 * The string is walked token by token using an offset into the original
 * string, so no intermediate copies of the input are made.
 *
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param string
 * @return mixed integer byte index or FALSE if no bad found
 * @package utf8
 */
function utf8_bad_find($str) {
    $pattern = _utf8_bad_pattern();
    $pos = 0;

    while (preg_match($pattern, $str, $matches, 0, $pos)) {
        // Group 2 only participates when the catch-all branch matched,
        // i.e. the byte at $pos is not part of a valid sequence.
        if (isset($matches[2])) {
            return $pos;
        }
        $pos += strlen($matches[0]);
    }

    return false;
}

//--------------------------------------------------------------------
/**
 * Locates all bad bytes in a UTF-8 string and returns a list of their
 * byte index in the string
 *
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param string
 * @return mixed array of integers or FALSE if no bad found
 * @package utf8
 */
function utf8_bad_findall($str) {
    $pattern = _utf8_bad_pattern();
    $pos = 0;
    $badList = array();

    while (preg_match($pattern, $str, $matches, 0, $pos)) {
        if (isset($matches[2])) {
            $badList[] = $pos;
        }
        $pos += strlen($matches[0]);
    }

    if (count($badList) > 0) {
        return $badList;
    }

    return false;
}

//--------------------------------------------------------------------
/**
 * Strips out any bad bytes from a UTF-8 string and returns the rest
 *
 * The result is assembled by string concatenation; no output buffering
 * is involved, so the function is safe to call from any context.
 *
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param string
 * @return string
 * @package utf8
 */
function utf8_bad_strip($str) {
    $pattern = _utf8_bad_pattern();
    $pos = 0;
    $result = '';

    while (preg_match($pattern, $str, $matches, 0, $pos)) {
        // Keep well-formed sequences, silently drop invalid bytes.
        if (!isset($matches[2])) {
            $result .= $matches[0];
        }
        $pos += strlen($matches[0]);
    }

    return $result;
}

//--------------------------------------------------------------------
/**
 * Replace bad bytes with an alternative character - ASCII character
 * recommended is replacement char
 *
 * Every invalid byte is replaced individually, so a run of N bad bytes
 * produces N copies of the replacement.
 *
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param string to search
 * @param string to replace bad bytes with (defaults to '?') - use ASCII
 * @return string
 * @package utf8
 */
function utf8_bad_replace($str, $replace = '?') {
    $pattern = _utf8_bad_pattern();
    $pos = 0;
    $result = '';

    while (preg_match($pattern, $str, $matches, 0, $pos)) {
        if (!isset($matches[2])) {
            $result .= $matches[0];
        } else {
            $result .= $replace;
        }
        $pos += strlen($matches[0]);
    }

    return $result;
}

//--------------------------------------------------------------------
/**
 * Return code from utf8_bad_identify() when a five octet sequence is detected.
 * Note: 5 octets sequences are valid UTF-8 but are not supported by Unicode so
 * do not represent a useful character
 * @see utf8_bad_identify
 * @package utf8
 */
define('UTF8_BAD_5OCTET',1);

/**
 * Return code from utf8_bad_identify() when a six octet sequence is detected.
 * Note: 6 octets sequences are valid UTF-8 but are not supported by Unicode so
 * do not represent a useful character
 * @see utf8_bad_identify
 * @package utf8
 */
define('UTF8_BAD_6OCTET',2);

/**
 * Return code from utf8_bad_identify().
 * Invalid octet for use as start of multi-byte UTF-8 sequence
 * @see utf8_bad_identify
 * @package utf8
 */
define('UTF8_BAD_SEQID',3);

/**
 * Return code from utf8_bad_identify().
 * From Unicode 3.1, non-shortest form is illegal
 * @see utf8_bad_identify
 * @package utf8
 */
define('UTF8_BAD_NONSHORT',4);

/**
 * Return code from utf8_bad_identify().
 * From Unicode 3.2, surrogate characters are illegal
 * @see utf8_bad_identify
 * @package utf8
 */
define('UTF8_BAD_SURROGATE',5);

/**
 * Return code from utf8_bad_identify().
 * Codepoints outside the Unicode range are illegal
 * @see utf8_bad_identify
 * @package utf8
 */
define('UTF8_BAD_UNIOUTRANGE',6);

/**
 * Return code from utf8_bad_identify().
 * Incomplete multi-octet sequence
 * Note: this is kind of a "catch-all"
 * @see utf8_bad_identify
 * @package utf8
 */
define('UTF8_BAD_SEQINCOMPLETE',7);

//--------------------------------------------------------------------
/**
 * Reports on the type of bad byte found in a UTF-8 string. Returns a
 * status code on the first bad byte found
 *
 * Joomla modification - As of PHP 7.4, curly brace access has been deprecated. As a result this function has been
 * modified to use square brace syntax
 * See https://github.com/php/php-src/commit/d574df63dc375f5fc9202ce5afde23f866b6450a
 * for additional references
 *
 * @author Henri Sivonen (http://hsivonen.iki.fi)
 * @param string UTF-8 encoded string
 * @param int $i (by reference, output only) byte index at which scanning
 *        stopped when a problem code is returned; for
 *        UTF8_BAD_SEQINCOMPLETE it is the index of the byte *before* the
 *        one that broke the sequence (or the last byte of the string when
 *        the input ends mid-sequence). Set to NULL when the string is valid.
 * @return mixed integer constant describing problem or FALSE if valid UTF-8
 * @see utf8_bad_explain
 * @see http://hsivonen.iki.fi/php-utf8/
 * @package utf8
 */
function utf8_bad_identify($str, &$i) {

    $mState = 0; // cached expected number of octets after the current octet
                 // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0; // cached Unicode character
    $mBytes = 1; // cached expected number of octets in the current sequence

    $len = strlen($str);

    for ($i = 0; $i < $len; $i++) {

        $in = ord($str[$i]);

        if ($mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {

                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 */

                return UTF8_BAD_5OCTET;

            } else if (0xFC == (0xFE & ($in))) {

                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                return UTF8_BAD_6OCTET;

            } else {
                // Current octet is neither in the US-ASCII range nor a legal first
                // octet of a multi-octet sequence.
                return UTF8_BAD_SEQID;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation: merge its 6 payload bits into the
                // codepoint at the position given by the octets still due.
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ) {
                        return UTF8_BAD_NONSHORT;

                    // From Unicode 3.2, surrogate characters are illegal
                    } else if (($mUcs4 & 0xFFFFF800) == 0xD800) {
                        return UTF8_BAD_SURROGATE;

                    // Codepoints outside the Unicode range are illegal
                    } else if ($mUcs4 > 0x10FFFF) {
                        return UTF8_BAD_UNIOUTRANGE;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } else {
                // ((0xC0 & (*in) != 0x80) && (mState != 0))
                // Incomplete multi-octet sequence.
                // Step back so $i refers to the unfinished sequence rather
                // than to the octet that interrupted it.
                $i--;
                return UTF8_BAD_SEQINCOMPLETE;
            }
        }
    }

    if ($mState != 0) {
        // Incomplete multi-octet sequence: the string ended while
        // continuation octets were still expected. $i equals $len here,
        // so step back to the last byte of the string.
        $i--;
        return UTF8_BAD_SEQINCOMPLETE;
    }

    // No bad octets found
    $i = null;
    return false;
}

//--------------------------------------------------------------------
/**
 * Takes a return code from utf8_bad_identify() and returns a message
 * (in English) explaining what the problem is.
 *
 * An unrecognised code raises an E_USER_WARNING and yields FALSE.
 *
 * @param int return code from utf8_bad_identify
 * @return mixed string message or FALSE if return code unknown
 * @see utf8_bad_identify
 * @package utf8
 */
function utf8_bad_explain($code) {

    switch ($code) {

        case UTF8_BAD_5OCTET:
            return 'Five octet sequences are valid UTF-8 but are not supported by Unicode';

        case UTF8_BAD_6OCTET:
            return 'Six octet sequences are valid UTF-8 but are not supported by Unicode';

        case UTF8_BAD_SEQID:
            return 'Invalid octet for use as start of multi-byte UTF-8 sequence';

        case UTF8_BAD_NONSHORT:
            return 'From Unicode 3.1, non-shortest form is illegal';

        case UTF8_BAD_SURROGATE:
            return 'From Unicode 3.2, surrogate characters are illegal';

        case UTF8_BAD_UNIOUTRANGE:
            return 'Codepoints outside the Unicode range are illegal';

        case UTF8_BAD_SEQINCOMPLETE:
            return 'Incomplete multi-octet sequence';

    }

    trigger_error('Unknown error code: '.$code,E_USER_WARNING);
    return false;

}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Wed Sep 7 05:41:13 2022 | Chilli.vc Blog - For Webmaster,Blog-Writer,System Admin and Domainer |