[ Index ] |
PHP Cross Reference of Joomla 4.2.2 documentation |
[Summary view] [Print] [Text view]
<?php
/**
 * Tools for conversion between UTF-8 and unicode
 * The Original Code is Mozilla Communicator client code.
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 1998
 * the Initial Developer. All Rights Reserved.
 * Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
 * Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
 * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
 * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
 * @see http://hsivonen.iki.fi/php-utf8/
 * @package utf8
 */

//--------------------------------------------------------------------
/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 * Returns false if the input string isn't a valid UTF-8 octet sequence
 * and raises a PHP error at level E_USER_WARNING. This includes input
 * that ends in the middle of a multi-octet sequence.
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * Joomla modification - As of PHP 7.4, curly brace access has been deprecated.
 * As a result this function has been modified to use square brace syntax
 * See https://github.com/php/php-src/commit/d574df63dc375f5fc9202ce5afde23f866b6450a
 * for additional references
 *
 * @author <[email protected]>
 * @param string UTF-8 encoded string
 * @return mixed array of unicode code points or FALSE if UTF-8 invalid
 * @see utf8_from_unicode
 * @see http://hsivonen.iki.fi/php-utf8/
 * @package utf8
 */
function utf8_to_unicode($str) {
    $mState = 0; // expected number of octets still to come in the current sequence
    $mUcs4  = 0; // code point being assembled
    $mBytes = 1; // total number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for ($i = 0; $i < $len; $i++) {

        $in = ord($str[$i]);

        if (0 === $mState) {

            // When mState is zero we expect either a US-ASCII character or
            // the first octet of a multi-octet sequence.
            if (0 === (0x80 & $in)) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } elseif (0xC0 === (0xE0 & $in)) {
                // First octet of 2 octet sequence
                $mUcs4  = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } elseif (0xE0 === (0xF0 & $in)) {
                // First octet of 3 octet sequence
                $mUcs4  = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } elseif (0xF0 === (0xF8 & $in)) {
                // First octet of 4 octet sequence
                $mUcs4  = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } elseif (0xF8 === (0xFC & $in)) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we carry on until the end
                 * of the sequence and let the later check (4 < $mBytes) reject it.
                 */
                $mUcs4  = ($in & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } elseif (0xFC === (0xFE & $in)) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4  = ($in & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } else {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                    'utf8_to_unicode: Illegal sequence identifier '.
                    'in UTF-8 at byte '.$i,
                    E_USER_WARNING
                );
                return FALSE;
            }

        } else {

            // When mState is non-zero, we expect a continuation octet (10xxxxxx)
            // of the current multi-octet sequence.
            if (0x80 !== (0xC0 & $in)) {
                // Sequence was interrupted by a non-continuation octet.
                trigger_error(
                    'utf8_to_unicode: Incomplete multi-octet '.
                    ' sequence in UTF-8 at byte '.$i,
                    E_USER_WARNING
                );

                return FALSE;
            }

            // Legal continuation: merge its 6 payload bits at the position
            // determined by how many octets are still outstanding.
            $shift  = ($mState - 1) * 6;
            $mUcs4 |= ($in & 0x3F) << $shift;

            if (0 === --$mState) {

                /*
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint; check for illegal sequences and codepoints.
                 */
                // From Unicode 3.1, non-shortest form is illegal
                if (((2 === $mBytes) && ($mUcs4 < 0x0080)) ||
                    ((3 === $mBytes) && ($mUcs4 < 0x0800)) ||
                    ((4 === $mBytes) && ($mUcs4 < 0x10000)) ||
                    (4 < $mBytes) ||
                    // From Unicode 3.2, surrogate characters are illegal.
                    // Plain range comparison (same test as utf8_from_unicode)
                    // avoids the 0xFFFFF800 mask, which is a float on 32-bit PHP.
                    (($mUcs4 >= 0xD800) && ($mUcs4 <= 0xDFFF)) ||
                    // Codepoints outside the Unicode range are illegal
                    ($mUcs4 > 0x10FFFF)) {

                    trigger_error(
                        'utf8_to_unicode: Illegal sequence or codepoint '.
                        'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                    return FALSE;
                }

                if (0xFEFF !== $mUcs4) {
                    // BOM is legal but we don't want to output it
                    $out[] = $mUcs4;
                }

                // Reset the decoder state for the next character
                $mState = 0;
                $mUcs4  = 0;
                $mBytes = 1;
            }
        }
    }

    // The input ended while continuation octets were still expected: the
    // trailing sequence is truncated, so the string is not valid UTF-8.
    if (0 !== $mState) {
        trigger_error(
            'utf8_to_unicode: Incomplete multi-octet '.
            'sequence in UTF-8 at end of input (byte '.$len.')',
            E_USER_WARNING
        );

        return FALSE;
    }

    return $out;
}

//--------------------------------------------------------------------
/**
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 * Returns false if the input array contains ints that represent
 * surrogates or are outside the Unicode range (negative or > 0x10FFFF)
 * and raises a PHP error at level E_USER_WARNING
 * Note: the UTF-8 string is built by plain concatenation rather than
 * output buffering, so no output buffer is left open (and no partial
 * bytes leak into the response) when an invalid code point aborts the
 * conversion. The array is traversed by its keys, whatever they are.
 * @param array of unicode code points representing a string
 * @return mixed UTF-8 string or FALSE if array contains invalid code points
 * @author <[email protected]>
 * @see utf8_to_unicode
 * @see http://hsivonen.iki.fi/php-utf8/
 * @package utf8
 */
function utf8_from_unicode($arr) {
    $result = '';

    foreach ($arr as $k => $cp) {

        # Outside the Unicode range. Negative values must be rejected here:
        # they would otherwise satisfy the "<= 0x07ff" test below and be
        # emitted as a bogus 2 byte sequence.
        if ($cp < 0 || $cp > 0x10ffff) {

            trigger_error(
                'utf8_from_unicode: Codepoint out of Unicode range '.
                'at index: '.$k.', value: '.$cp,
                E_USER_WARNING
            );

            return FALSE;
        }

        # ASCII range (including control chars)
        if ($cp <= 0x007f) {

            $result .= chr($cp);

        # 2 byte sequence
        } elseif ($cp <= 0x07ff) {

            $result .= chr(0xc0 | ($cp >> 6));
            $result .= chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } elseif ($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } elseif ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate
            trigger_error(
                'utf8_from_unicode: Illegal surrogate '.
                'at index: '.$k.', value: '.$cp,
                E_USER_WARNING
            );

            return FALSE;

        # 3 byte sequence
        } elseif ($cp <= 0xffff) {

            $result .= chr(0xe0 | ($cp >> 12));
            $result .= chr(0x80 | (($cp >> 6) & 0x003f));
            $result .= chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence (upper bound already checked above)
        } else {

            $result .= chr(0xf0 | ($cp >> 18));
            $result .= chr(0x80 | (($cp >> 12) & 0x3f));
            $result .= chr(0x80 | (($cp >> 6) & 0x3f));
            $result .= chr(0x80 | ($cp & 0x3f));
        }
    }

    return $result;
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Wed Sep 7 05:41:13 2022 | Chilli.vc Blog - For Webmaster,Blog-Writer,System Admin and Domainer |