[ Index ] |
PHP Cross Reference of Joomla 4.2.2 documentation |
[Summary view] [Print] [Text view]
<?php
/**
 * Tools for validating that a UTF-8 string is well formed.
 * The Original Code is Mozilla Communicator client code.
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 1998
 * the Initial Developer. All Rights Reserved.
 * Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
 * Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
 * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
 * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
 * @see http://hsivonen.iki.fi/php-utf8/
 * @package utf8
 */

//--------------------------------------------------------------------
/**
 * Tests whether a string is valid UTF-8 and supported by the Unicode
 * standard.
 *
 * The string is scanned octet by octet. A sequence is rejected when it:
 *  - starts with an octet that cannot begin a one to four octet sequence
 *    (stray continuation octets, five/six octet lead octets, 0xFE, 0xFF),
 *  - contains a non-continuation octet where a continuation is expected,
 *  - is not the shortest possible encoding of its code point,
 *  - encodes a surrogate (U+D800..U+DFFF),
 *  - encodes a code point above U+10FFFF, or
 *  - is cut short by the end of the string.
 *
 * Note: this function has been modified to simply return true or false.
 *
 * @author Henri Sivonen (http://hsivonen.iki.fi)
 * @param string $str UTF-8 encoded string
 * @return boolean true if valid
 * @see http://hsivonen.iki.fi/php-utf8/
 * @see utf8_compliant
 * @package utf8
 */
function utf8_is_valid($str) {

    $pending   = 0; // continuation octets still expected for the current sequence
    $codepoint = 0; // code point assembled so far
    $minimum   = 0; // smallest code point the current sequence length may encode

    $len = strlen($str);

    for ($i = 0; $i < $len; $i++) {

        /*
         * Joomla modification - As of PHP 7.4, curly brace access has been
         * deprecated, so square brace syntax is used to read the octet.
         * See https://github.com/php/php-src/commit/d574df63dc375f5fc9202ce5afde23f866b6450a
         * for additional references
         */
        $in = ord($str[$i]);

        if ($pending === 0) {

            // Between sequences: expect US-ASCII or the first octet of a
            // multi-octet sequence.
            if ($in < 0x80) {
                // US-ASCII, nothing further to check.
                continue;
            }

            if (0xC0 === ($in & 0xE0)) {
                // First octet of a 2 octet sequence
                $codepoint = ($in & 0x1F) << 6;
                $pending   = 1;
                $minimum   = 0x0080;

            } elseif (0xE0 === ($in & 0xF0)) {
                // First octet of a 3 octet sequence
                $codepoint = ($in & 0x0F) << 12;
                $pending   = 2;
                $minimum   = 0x0800;

            } elseif (0xF0 === ($in & 0xF8)) {
                // First octet of a 4 octet sequence
                $codepoint = ($in & 0x07) << 18;
                $pending   = 3;
                $minimum   = 0x10000;

            } else {
                /* Not a legal first octet. This covers stray continuation
                 * octets, 0xFE/0xFF, and the lead octets of five and six
                 * octet sequences. The latter are always illegal (either not
                 * the shortest form or outside 0-0x10FFFF), so there is no
                 * need to read the rest of the sequence before rejecting.
                 */
                return false;
            }

            continue;
        }

        // Inside a sequence: only a continuation octet (10xxxxxx) is legal.
        if (0x80 !== ($in & 0xC0)) {
            return false;
        }

        // Each continuation octet contributes six bits; the earlier the
        // octet, the higher its bits sit in the code point.
        $pending--;
        $codepoint |= ($in & 0x3F) << ($pending * 6);

        if ($pending === 0) {
            // End of the multi-octet sequence: $codepoint is complete.

            // From Unicode 3.1, non-shortest form is illegal
            if ($codepoint < $minimum) {
                return false;
            }

            // From Unicode 3.2, surrogate characters are illegal. Written as
            // a range check because the equivalent mask 0xFFFFF800 does not
            // fit a 32-bit signed integer.
            if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) {
                return false;
            }

            // Codepoints outside the Unicode range are illegal
            if ($codepoint > 0x10FFFF) {
                return false;
            }
        }
    }

    // A string that ends part-way through a multi-octet sequence is
    // malformed, so it is only valid when no continuation is outstanding.
    return $pending === 0;
}

//--------------------------------------------------------------------
/**
 * Tests whether a string complies as UTF-8 by asking PCRE to match its
 * first character with the /u modifier. This will be much faster than
 * utf8_is_valid. An empty string is reported as compliant without
 * consulting PCRE.
 *
 * The original documentation of this function warns that it passes five
 * and six octet UTF-8 sequences, which are not supported by Unicode, and
 * that validating user input with it may therefore let attackers inject
 * such sequences (which may or may not be a significant risk, depending
 * on what you are doing).
 * NOTE(review): the PHP manual states that five and six octet sequences
 * are regarded as invalid by the /u modifier since PHP 5.3.4 - confirm
 * against the PCRE version in use before relying on either behaviour.
 *
 * @see utf8_is_valid
 * @see http://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805
 * @param string $str UTF-8 string to check
 * @return boolean TRUE if string is valid UTF-8
 * @package utf8
 */
function utf8_compliant($str) {
    if (strlen($str) == 0) {
        return true;
    }

    // If even just the first character can be matched when the /u
    // modifier is used, then it's valid UTF-8. If the UTF-8 is somehow
    // invalid, nothing at all will match, even if the string contains
    // some valid sequences. preg_match() then yields false (or 0), never 1.
    return preg_match('/^.{1}/us', $str) === 1;
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Wed Sep 7 05:41:13 2022 | Chilli.vc Blog - For Webmaster,Blog-Writer,System Admin and Domainer |