|
TYPO3 API
SVNRelease
|
00001 <?php 00002 /*************************************************************** 00003 * Copyright notice 00004 * 00005 * (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com) 00006 * All rights reserved 00007 * 00008 * This script is part of the Typo3 project. The Typo3 project is 00009 * free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 2 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * The GNU General Public License can be found at 00015 * http://www.gnu.org/copyleft/gpl.html. 00016 * 00017 * This script is distributed in the hope that it will be useful, 00018 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00019 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00020 * GNU General Public License for more details. 00021 * 00022 * This copyright notice MUST APPEAR in all copies of the script! 00023 ***************************************************************/ 00024 /** 00025 * Class for conversion between charsets. 
00026 * 00027 * $Id: class.t3lib_cs.php 10330 2011-01-26 10:28:29Z steffenk $ 00028 * 00029 * @author Kasper Skårhøj <kasperYYYY@typo3.com> 00030 * @author Martin Kutschker <martin.t.kutschker@blackbox.net> 00031 */ 00032 /** 00033 * [CLASS/FUNCTION INDEX of SCRIPT] 00034 * 00035 * 00036 * 00037 * 136: class t3lib_cs 00038 * 488: function parse_charset($charset) 00039 * 507: function get_locale_charset($locale) 00040 * 00041 * SECTION: Charset Conversion functions 00042 * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) 00043 * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) 00044 * 617: function utf8_encode($str,$charset) 00045 * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0) 00046 * 706: function utf8_to_entities($str) 00047 * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0) 00048 * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0) 00049 * 823: function UnumberToChar($cbyte) 00050 * 868: function utf8CharToUnumber($str,$hex=0) 00051 * 00052 * SECTION: Init functions 00053 * 911: function initCharset($charset) 00054 * 973: function initUnicodeData($mode=null) 00055 * 1198: function initCaseFolding($charset) 00056 * 1260: function initToASCII($charset) 00057 * 00058 * SECTION: String operation functions 00059 * 1331: function substr($charset,$string,$start,$len=null) 00060 * 1384: function strlen($charset,$string) 00061 * 1414: function crop($charset,$string,$len,$crop='') 00062 * 1467: function strtrunc($charset,$string,$len) 00063 * 1501: function conv_case($charset,$string,$case) 00064 * 1527: function specCharsToASCII($charset,$string) 00065 * 00066 * SECTION: Internal string operation functions 00067 * 1567: function sb_char_mapping($str,$charset,$mode,$opt='') 00068 * 00069 * SECTION: Internal UTF-8 string operation functions 00070 * 1622: function utf8_substr($str,$start,$len=null) 00071 * 1655: function utf8_strlen($str) 00072 * 1676: function utf8_strtrunc($str,$len) 00073 * 1698: 
function utf8_strpos($haystack,$needle,$offset=0) 00074 * 1723: function utf8_strrpos($haystack,$needle) 00075 * 1745: function utf8_char2byte_pos($str,$pos) 00076 * 1786: function utf8_byte2char_pos($str,$pos) 00077 * 1809: function utf8_char_mapping($str,$mode,$opt='') 00078 * 00079 * SECTION: Internal EUC string operation functions 00080 * 1885: function euc_strtrunc($str,$len,$charset) 00081 * 1914: function euc_substr($str,$start,$charset,$len=null) 00082 * 1939: function euc_strlen($str,$charset) 00083 * 1966: function euc_char2byte_pos($str,$pos,$charset) 00084 * 2007: function euc_char_mapping($str,$charset,$mode,$opt='') 00085 * 00086 * TOTAL FUNCTIONS: 35 00087 * (This index is automatically created/updated by the extension "extdeveval") 00088 * 00089 */ 00090 00091 00092 /** 00093 * Notes on UTF-8 00094 * 00095 * Functions working on UTF-8 strings: 00096 * 00097 * - strchr/strstr 00098 * - strrchr 00099 * - substr_count 00100 * - implode/explode/join 00101 * 00102 * Functions nearly working on UTF-8 strings: 00103 * 00104 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen 00105 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII 00106 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos 00107 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0 00108 * - preg_*: Support compiled into PHP by default nowadays, but could be unavailable, need to use modifier 00109 * 00110 * Functions NOT working on UTF-8 strings: 00111 * 00112 * - str*cmp 00113 * - stristr 00114 * - stripos 00115 * - substr 00116 * - strrev 00117 * - split/spliti 00118 * - ... 
*
 */
/**
 * Class for conversion between charsets
 *
 * The properties below are lookup tables used by the methods of this class:
 * charset synonyms, language-to-script and script-to-charset mappings and the
 * TYPO3 specific system language tables.
 *
 * @author	Kasper Skårhøj <kasperYYYY@typo3.com>
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @package TYPO3
 * @subpackage t3lib
 */
class t3lib_cs {
	var $noCharByteVal = 63; // ASCII value ("?") emitted for chars with no equivalent.

		// This is the array where parsed conversion tables are stored (cached)
	var $parsedCharsets = array();

		// An array where case folding data will be stored (cached)
	var $caseFolding = array();

		// An array where charset-to-ASCII mappings are stored (cached)
	var $toASCII = array();

		// This tells the converter which charsets have two bytes per char:
	var $twoByteSets = array(
		'ucs-2' => 1, // 2-byte Unicode
	);

		// This tells the converter which charsets have four bytes per char:
	var $fourByteSets = array(
		'ucs-4' => 1, // 4-byte Unicode
		'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
	);

		// This tells the converter which charsets use a scheme like the Extended Unix Code:
	var $eucBasedSets = array(
		'gb2312' => 1, // Chinese, simplified.
		'big5' => 1, // Chinese, traditional.
		'euc-kr' => 1, // Korean
		'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
	);

		// Alternative charset names (lowercase) => the name used internally by this class.
		// see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
		// http://czyborra.com/charsets/iso8859.html
	var $synonyms = array(
		'us' => 'ascii',
		'us-ascii' => 'ascii',
		'cp819' => 'iso-8859-1',
		'ibm819' => 'iso-8859-1',
		'iso-ir-100' => 'iso-8859-1',
		'iso-ir-101' => 'iso-8859-2',
		'iso-ir-109' => 'iso-8859-3',
		'iso-ir-110' => 'iso-8859-4',
		'iso-ir-144' => 'iso-8859-5',
		'iso-ir-127' => 'iso-8859-6',
		'iso-ir-126' => 'iso-8859-7',
		'iso-ir-138' => 'iso-8859-8',
		'iso-ir-148' => 'iso-8859-9',
		'iso-ir-157' => 'iso-8859-10',
		'iso-ir-179' => 'iso-8859-13',
		'iso-ir-199' => 'iso-8859-14',
		'iso-ir-203' => 'iso-8859-15',
		'csisolatin1' => 'iso-8859-1',
		'csisolatin2' => 'iso-8859-2',
		'csisolatin3' => 'iso-8859-3',
		'csisolatin5' => 'iso-8859-9',
		'csisolatin8' => 'iso-8859-14',
		'csisolatin9' => 'iso-8859-15',
		'csisolatingreek' => 'iso-8859-7',
		'iso-celtic' => 'iso-8859-14',
		'latin1' => 'iso-8859-1',
		'latin2' => 'iso-8859-2',
		'latin3' => 'iso-8859-3',
		'latin5' => 'iso-8859-9',
		'latin6' => 'iso-8859-10',
		'latin8' => 'iso-8859-14',
		'latin9' => 'iso-8859-15',
		'l1' => 'iso-8859-1',
		'l2' => 'iso-8859-2',
		'l3' => 'iso-8859-3',
		'l5' => 'iso-8859-9',
		'l6' => 'iso-8859-10',
		'l8' => 'iso-8859-14',
		'l9' => 'iso-8859-15',
		'cyrillic' => 'iso-8859-5',
		'arabic' => 'iso-8859-6',
		'tis-620' => 'iso-8859-11',
		'win874' => 'windows-874',
		'win1250' => 'windows-1250',
		'win1251' => 'windows-1251',
		'win1252' => 'windows-1252',
		'win1253' => 'windows-1253',
		'win1254' => 'windows-1254',
		'win1255' => 'windows-1255',
		'win1256' => 'windows-1256',
		'win1257' => 'windows-1257',
		'win1258' => 'windows-1258',
		'cp1250' => 'windows-1250',
		'cp1251' => 'windows-1251',
		'cp1252' => 'windows-1252',
		'ms-ee' => 'windows-1250',
		'ms-ansi' => 'windows-1252',
		'ms-greek' => 'windows-1253',
		'ms-turk' => 'windows-1254',
		'winbaltrim' => 'windows-1257',
		'koi-8ru' => 'koi-8r',
		'koi8r' => 'koi-8r',
		'cp878' => 'koi-8r',
		'mac' => 'macroman',
		'macintosh' => 'macroman',
		'euc-cn' => 'gb2312',
		'x-euc-cn' => 'gb2312',
		'euccn' => 'gb2312',
		'cp936' => 'gb2312',
		'big-5' => 'big5',
		'cp950' => 'big5',
		'eucjp' => 'euc-jp',
		'sjis' => 'shift_jis',
		'shift-jis' => 'shift_jis',
		'cp932' => 'shift_jis',
		'cp949' => 'euc-kr',
		'utf7' => 'utf-7',
		'utf8' => 'utf-8',
		'utf16' => 'utf-16',
		'utf32' => 'utf-32',
		'ucs2' => 'ucs-2',
		'ucs4' => 'ucs-4',
	);

		// mapping of iso-639-1 language codes to script names
	var $lang_to_script = array(
			// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
		'ar' => 'arabic',
		'bg' => 'cyrillic', // Bulgarian
		'bs' => 'east_european', // Bosnian
		'cs' => 'east_european', // Czech
		'da' => 'west_european', // Danish
		'de' => 'west_european', // German
		'el' => 'greek', // Greek (the ISO 639-1 code; 'gr' below is kept for existing callers)
		'es' => 'west_european', // Spanish
		'et' => 'estonian',
		'eo' => 'unicode', // Esperanto
		'eu' => 'west_european', // Basque
		'fa' => 'arabic', // Persian
		'fi' => 'west_european', // Finnish
		'fo' => 'west_european', // Faroese
		'fr' => 'west_european', // French
		'ga' => 'west_european', // Irish
		'gl' => 'west_european', // Galician
		'gr' => 'greek',
		'he' => 'hebrew', // Hebrew (since 1998)
		'hi' => 'unicode', // Hindi
		'hr' => 'east_european', // Croatian
		'hu' => 'east_european', // Hungarian
		'iw' => 'hebrew', // Hebrew (til 1998)
		'is' => 'west_european', // Icelandic
		'it' => 'west_european', // Italian
		'ja' => 'japanese',
		'ka' => 'unicode', // Georgian
		'kl' => 'west_european', // Greenlandic
		'km' => 'unicode', // Khmer
		'ko' => 'korean',
		'lt' => 'lithuanian',
		'lv' => 'west_european', // Latvian/Lettish
		'nl' => 'west_european', // Dutch
		'no' => 'west_european', // Norwegian
		'nb' => 'west_european', // Norwegian Bokmal
		'nn' => 'west_european', // Norwegian Nynorsk
		'pl' => 'east_european', // Polish
		'pt' => 'west_european', // Portuguese
		'ro' => 'east_european', // Romanian
		'ru' => 'cyrillic', // Russian
		'sk' => 'east_european', // Slovak
		'sl' => 'east_european', // Slovenian
		'sr' => 'cyrillic', // Serbian
		'sv' => 'west_european', // Swedish
		'sq' => 'albanian', // Albanian
		'th' => 'thai',
		'tr' => 'turkish', // Turkish (was missing although the MS code 'trk' is mapped)
		'uk' => 'cyrillic', // Ukrainian
		'vi' => 'vietnamese',
		'zh' => 'chinese',
			// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
			// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
		'ara' => 'arabic',
		'bgr' => 'cyrillic', // Bulgarian
		'cat' => 'west_european', // Catalan
		'chs' => 'simpl_chinese',
		'cht' => 'trad_chinese',
		'csy' => 'east_european', // Czech
		'dan' => 'west_european', // Danish
		'deu' => 'west_european', // German
		'dea' => 'west_european', // German (Austrian)
		'des' => 'west_european', // German (Swiss)
		'ena' => 'west_european', // English (Australian)
		'enc' => 'west_european', // English (Canadian)
		'eng' => 'west_european', // English
		'enz' => 'west_european', // English (New Zealand)
		'enu' => 'west_european', // English (United States)
		'euq' => 'west_european', // Basque
		'fos' => 'west_european', // Faroese
		'far' => 'arabic', // Persian
		'fin' => 'west_european', // Finnish
		'fra' => 'west_european', // French
		'frb' => 'west_european', // French (Belgian)
		'frc' => 'west_european', // French (Canadian)
		'frs' => 'west_european', // French (Swiss)
		'geo' => 'unicode', // Georgian
		'glg' => 'west_european', // Galician
		'ell' => 'greek',
		'heb' => 'hebrew',
		'hin' => 'unicode', // Hindi
		'hun' => 'east_european', // Hungarian
		'isl' => 'west_european', // Icelandic (value was misspelled "west_euorpean")
		'ita' => 'west_european', // Italian
		'its' => 'west_european', // Italian (Swiss)
		'jpn' => 'japanese',
		'khm' => 'unicode', // Khmer
		'kor' => 'korean',
		'lth' => 'lithuanian',
		'lvi' => 'west_european', // Latvian/Lettish
		'msl' => 'west_european', // Malay
		'nlb' => 'west_european', // Dutch (Belgian)
		'nld' => 'west_european', // Dutch
		'nor' => 'west_european', // Norwegian (bokmal)
		'non' => 'west_european', // Norwegian (nynorsk)
		'plk' => 'east_european', // Polish
		'ptg' => 'west_european', // Portuguese
		'ptb' => 'west_european', // Portuguese (Brazil)
		'rom' => 'east_european', // Romanian
		'rus' => 'cyrillic', // Russian
		'slv' => 'east_european', // Slovenian
		'sky' => 'east_european', // Slovak
		'srl' => 'east_european', // Serbian (Latin)
		'srb' => 'cyrillic', // Serbian (Cyrillic)
		'esp' => 'west_european', // Spanish (trad. sort)
		'esm' => 'west_european', // Spanish (Mexican)
		'esn' => 'west_european', // Spanish (internat. sort)
		'sve' => 'west_european', // Swedish
		'sqi' => 'albanian', // Albanian
		'tha' => 'thai',
		'trk' => 'turkish',
		'ukr' => 'cyrillic', // Ukrainian
			// English language names
		'albanian' => 'albanian',
		'arabic' => 'arabic',
		'basque' => 'west_european',
		'bosnian' => 'east_european',
		'bulgarian' => 'cyrillic', // was 'east_european', contradicting 'bg' and 'bgr' above
		'catalan' => 'west_european',
		'croatian' => 'east_european',
		'czech' => 'east_european',
		'danish' => 'west_european',
		'dutch' => 'west_european',
		'english' => 'west_european',
		'esperanto' => 'unicode',
		'estonian' => 'estonian',
		'faroese' => 'west_european',
		'farsi' => 'arabic',
		'finnish' => 'west_european',
		'french' => 'west_european',
		'galician' => 'west_european',
		'georgian' => 'unicode',
		'german' => 'west_european',
		'greek' => 'greek',
		'greenlandic' => 'west_european',
		'hebrew' => 'hebrew',
		'hindi' => 'unicode',
		'hungarian' => 'east_european',
		'icelandic' => 'west_european',
		'italian' => 'west_european',
		'khmer' => 'unicode',
		'latvian' => 'west_european',
		'lettish' => 'west_european',
		'lithuanian' => 'lithuanian',
		'malay' => 'west_european',
		'norwegian' => 'west_european',
		'persian' => 'arabic',
		'polish' => 'east_european',
		'portuguese' => 'west_european',
		'russian' => 'cyrillic',
		'romanian' => 'east_european',
		'serbian' => 'cyrillic',
		'slovak' => 'east_european',
		'slovenian' => 'east_european',
		'spanish' => 'west_european',
		'swedish' => 'west_european',
		'svedish' => 'west_european', // misspelled key, kept only for backward compatibility
		'thai' => 'thai',
		'that' => 'thai', // misspelled key, kept only for backward compatibility
		'turkish' => 'turkish',
		'ukrainian' => 'cyrillic',
	);

		// mapping of language (family) names to charsets on Unix
	var $script_to_charset_unix = array(
		'west_european' => 'iso-8859-1',
		'estonian' => 'iso-8859-1',
		'east_european' => 'iso-8859-2',
		'baltic' => 'iso-8859-4',
		'cyrillic' => 'iso-8859-5',
		'arabic' => 'iso-8859-6',
		'greek' => 'iso-8859-7',
		'hebrew' => 'iso-8859-8',
		'turkish' => 'iso-8859-9',
		'thai' => 'iso-8859-11', // = TIS-620
		'lithuanian' => 'iso-8859-13',
		'chinese' => 'gb2312', // = euc-cn
		'japanese' => 'euc-jp',
		'korean' => 'euc-kr',
		'simpl_chinese' => 'gb2312',
		'trad_chinese' => 'big5',
		'vietnamese' => '',
		'unicode' => 'utf-8',
		'albanian' => 'utf-8'
	);

		// mapping of language (family) names to charsets on Windows
	var $script_to_charset_windows = array(
		'east_european' => 'windows-1250',
		'cyrillic' => 'windows-1251',
		'west_european' => 'windows-1252',
		'greek' => 'windows-1253',
		'turkish' => 'windows-1254',
		'hebrew' => 'windows-1255',
		'arabic' => 'windows-1256',
		'baltic' => 'windows-1257',
		'estonian' => 'windows-1257',
		'lithuanian' => 'windows-1257',
		'vietnamese' => 'windows-1258',
		'thai' => 'cp874',
		'korean' => 'cp949',
		'chinese' => 'gb2312',
		'japanese' => 'shift_jis',
		'simpl_chinese' => 'gb2312',
		'trad_chinese' => 'big5',
		'albanian' => 'windows-1250',
		'unicode' => 'utf-8'
	);

		// mapping of locale names to charsets
		// Keys must be lowercase: get_locale_charset() lowercases the locale before looking it up here.
	var $locale_to_charset = array(
		'japanese.euc' => 'euc-jp',
		'ja_jp.ujis' => 'euc-jp',
		'korean.euc' => 'euc-kr',
		'sr@latn' => 'iso-8859-2',
		'sr@Latn' => 'iso-8859-2', // mixed-case key cannot match a lowercased locale; kept only for backward compatibility
		'zh_cn' => 'gb2312',
		'zh_hk' => 'big5',
		'zh_tw' => 'big5',
	);

		// TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
		// Empty values means "iso-8859-1"
	var $charSetArray = array(
		'dk' => '',
		'de' => '',
		'no' => '',
		'it' => '',
		'fr' => '',
		'es' => '',
		'nl' => '',
		'cz' => 'windows-1250',
		'pl' => 'iso-8859-2',
		'si' => 'windows-1250',
		'fi' => '',
		'tr' => 'iso-8859-9',
		'se' => '',
		'pt' => '',
		'ru' => 'windows-1251',
		'ro' => 'iso-8859-2',
		'ch' => 'gb2312',
		'sk' => 'windows-1250',
		'lt' => 'windows-1257',
		'is' => 'utf-8',
		'hr' => 'windows-1250',
		'hu' => 'iso-8859-2',
		'gl' => '',
		'th' => 'iso-8859-11',
		'gr' => 'iso-8859-7',
		'hk' => 'big5',
		'eu' => '',
		'bg' => 'windows-1251',
		'br' => '',
		'et' => 'iso-8859-4',
		'ar' => 'iso-8859-6',
		'he' => 'utf-8',
		'ua' => 'windows-1251',
		'jp' => 'shift_jis',
		'lv' => 'utf-8',
		'vn' => 'utf-8',
		'ca' => 'iso-8859-15',
		'ba' => 'iso-8859-2',
		'kr' => 'euc-kr',
		'eo' => 'utf-8',
		'my' => '',
		'hi' => 'utf-8',
		'fo' => 'utf-8',
		'fa' => 'utf-8',
		'sr' => 'utf-8',
		'sq' => 'utf-8',
		'ge' => 'utf-8',
		'ga' => '',
		'km' => 'utf-8',
		'qc' => '',
	);

		// TYPO3 specific: Array with the iso names used for each system language in TYPO3:
		// Missing keys means: same as TYPO3
	var $isoArray = array(
		'ba' => 'bs',
		'br' => 'pt_BR',
		'ch' => 'zh_CN',
		'cz' => 'cs',
		'dk' => 'da',
		'si' => 'sl',
		'se' => 'sv',
		'gl' => 'kl',
		'gr' => 'el',
		'hk' => 'zh_HK',
		'kr' => 'ko',
		'ua' => 'uk',
		'jp' => 'ja',
		'qc' => 'fr_CA',
		'vn' => 'vi',
		'ge' => 'ka',
		'ga' => 'gl',
	);

	/**
	 * Normalize - changes input character set to lowercase letters.
*
	 * The name is lowercased and trimmed; if the result is a key of
	 * $this->synonyms the mapped name is returned instead.
	 *
	 * @param	string		Input charset
	 * @return	string		Normalized charset
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function parse_charset($charset) {
		$charset = trim(strtolower($charset));
		if (isset($this->synonyms[$charset])) {
			$charset = $this->synonyms[$charset];
		}

		return $charset;
	}

	/**
	 * Get the charset of a locale.
	 *
	 * ln			language
	 * ln_CN		 language / country
	 * ln_CN.cs	  language / country / charset
	 * ln_CN.cs@mod  language / country / charset / modifier
	 *
	 * Resolution order: exact match in $this->locale_to_charset, an explicit
	 * charset part, the "euro" modifier, and finally the language's script
	 * looked up in the Windows or Unix script table (depending on TYPO3_OS).
	 * Unknown languages fall back to 'windows-1252' / 'iso-8859-1'.
	 *
	 * @param	string		Locale string
	 * @return	string		Charset resolved for locale string
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function get_locale_charset($locale) {
		$locale = strtolower($locale);

			// exact locale specific charset?
		if (isset($this->locale_to_charset[$locale])) {
			return $this->locale_to_charset[$locale];
		}

			// get modifier; array_pad() guarantees both list() targets exist when there is no "@" part
		list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');

			// locale contains charset: use it
		list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
		if ($charset) {
			return $this->parse_charset($charset);
		}

			// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
		if ($modifier == 'euro') {
			return 'iso-8859-15';
		}

			// get language (the country part after "_" is not needed)
		$localeParts = explode('_', $locale);
		$language = $localeParts[0];

			// An unknown language yields an empty script, which selects the platform default below
		$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

		if (TYPO3_OS == 'WIN') {
			$cs = !empty($this->script_to_charset_windows[$script]) ? $this->script_to_charset_windows[$script] : 'windows-1252';
		} else {
				// empty table values (e.g. 'vietnamese') also select the default
			$cs = !empty($this->script_to_charset_unix[$script]) ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
		}

		return $cs;
	}


	/********************************************
	 *
	 * Charset Conversion functions
	 *
	 ********************************************/

	/**
	 * Convert from one charset to another charset.
	 *
	 * When the target is UTF-8 or no entity fallback is requested, the PHP
	 * library selected in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
	 * ('mbstring', 'iconv' or 'recode') is tried first. If no library is
	 * configured, or the library call returns FALSE, the string is converted
	 * by this class via UTF-8 (utf8_encode() followed by utf8_decode()).
	 *
	 * @param	string		Input string
	 * @param	string		From charset (the current charset of the string)
	 * @param	string		To charset (the output charset wanted)
	 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
	 * @return	string		Converted string
	 * @see convArray()
	 */
	function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
		if ($fromCS == $toCS) {
			return $str;
		}

			// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
		if ($toCS == 'utf-8' || !$useEntityForNoChar) {
				// Read the setting defensively: the key may not be configured at all
			$convMethod = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])
				? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
				: NULL;

			switch ($convMethod) {
				case 'mbstring':
					$conv_str = mb_convert_encoding($str, $toCS, $fromCS);
					if (FALSE !== $conv_str) {
						return $conv_str;
					} // returns false for unsupported charsets
					break;

				case 'iconv':
					$conv_str = iconv($fromCS, $toCS . '//TRANSLIT', $str);
					if (FALSE !== $conv_str) {
						return $conv_str;
					}
					break;

				case 'recode':
					$conv_str = recode_string($fromCS . '..' . $toCS, $str);
					if (FALSE !== $conv_str) {
						return $conv_str;
					}
					break;
			}
				// fallback to TYPO3 conversion
		}

		if ($fromCS != 'utf-8') {
			$str = $this->utf8_encode($str, $fromCS);
		}
		if ($toCS != 'utf-8') {
			$str = $this->utf8_decode($str, $toCS, $useEntityForNoChar);
		}
		return $str;
	}

	/**
	 * Convert all elements in ARRAY with type string from one charset to another charset.
* NOTICE: Array is passed by reference!
	 *
	 * Nested arrays are converted recursively; values that are neither arrays
	 * nor strings are left untouched.
	 *
	 * @param	array		Input array, possibly multidimensional
	 * @param	string		From charset (the current charset of the string)
	 * @param	string		To charset (the output charset wanted)
	 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
	 * @return	void
	 * @see conv()
	 */
	function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
		foreach ($array as $key => $value) {
				// Work on $array[$key] (not $value) so that the caller's array is modified
			if (is_array($array[$key])) {
				$this->convArray($array[$key], $fromCS, $toCS, $useEntityForNoChar);
			} elseif (is_string($array[$key])) {
				$array[$key] = $this->conv($array[$key], $fromCS, $toCS, $useEntityForNoChar);
			}
		}
	}

	/**
	 * Converts $str from $charset to UTF-8
	 *
	 * Bytes 0-127 are copied unchanged (except for two-byte charsets, where
	 * every pair of bytes forms one character). Characters missing from the
	 * parsed conversion table are replaced by chr($this->noCharByteVal).
	 * Nothing (NULL) is returned when initCharset() does not succeed.
	 *
	 * @param	string		String in local charset to convert to UTF-8
	 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
	 * @return	string		Output string, converted to UTF-8
	 */
	function utf8_encode($str, $charset) {

		if ($charset === 'utf-8') {
			return $str;
		}

			// Charset is case-insensitive.
		if ($this->initCharset($charset)) { // Parse conv. table if not already...
			$strLen = strlen($str);
			$outStr = '';
			$isTwoByteSet = isset($this->twoByteSets[$charset]);
			$isEucBasedSet = isset($this->eucBasedSets[$charset]);

			for ($a = 0; $a < $strLen; $a++) { // Traverse each char in string.
				$chr = substr($str, $a, 1);
				$ord = ord($chr);
				if ($isTwoByteSet) { // If the charset has two bytes per char
						// substr() instead of the removed $str{...} offset syntax; it also
						// yields 0 instead of an offset error for a dangling last byte
					$ord2 = ord(substr($str, $a + 1, 1));
					$ord = $ord << 8 | $ord2; // assume big endian

					if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that...
						$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
					} else { // ... otherwise the "no char" replacement byte
						$outStr .= chr($this->noCharByteVal);
					}
					$a++;
				} elseif ($ord > 127) { // If char has value over 127 it's a multibyte char in UTF-8
					if ($isEucBasedSet) { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
						if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) { // Shift-JIS: chars between 160 and 223 are single byte
							$a++;
							$ord2 = ord(substr($str, $a, 1));
							$ord = $ord * 256 + $ord2;
						}
					}

					if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that...
						$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
					} else { // ... otherwise the "no char" replacement byte
						$outStr .= chr($this->noCharByteVal);
					}
				} else { // ... otherwise it's just ASCII 0-127 and one byte. Transparent
					$outStr .= $chr;
				}
			}
			return $outStr;
		}
	}

	/**
	 * Converts $str from UTF-8 to $charset
	 *
	 * Characters that have no entry in the parsed conversion table become a
	 * numeric entity (if $useEntityForNoChar is set) or chr($this->noCharByteVal).
	 * Nothing (NULL) is returned when initCharset() does not succeed.
	 *
	 * @param	string		String in UTF-8 to convert to local charset
	 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
	 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
	 * @return	string		Output string, converted to local charset
	 */
	function utf8_decode($str, $charset, $useEntityForNoChar = 0) {

		if ($charset === 'utf-8') {
			return $str;
		}

			// Charset is case-insensitive.
		if ($this->initCharset($charset)) { // Parse conv. table if not already...
			$strLen = strlen($str);
			$outStr = '';
			for ($a = 0; $a < $strLen; $a++) { // Traverse each char in UTF-8 string.
				$chr = substr($str, $a, 1);
				$ord = ord($chr);

				if ($ord <= 127) { // Just ASCII 0-127 and one byte. Transparent
					$outStr .= $chr;
					continue;
				}
				if (!($ord & 64)) { // A first byte must have the 7th bit set; otherwise we are in the MIDDLE of a MB sequence
					$outStr .= chr($this->noCharByteVal);
					continue;
				}

				$buf = $chr; // Add first byte
				for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
					$ord = $ord << 1; // Shift it left and ...
					if (!($ord & 128)) { // ... and with 8th bit - if that is NOT set, the sequence is complete.
						break;
					}
					$a++; // Increase pointer...
					$buf .= substr($str, $a, 1); // ... and add the next char.
				}

				if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) { // If the UTF-8 char-sequence is found then...
					$mByte = $this->parsedCharsets[$charset]['utf8'][$buf]; // The local number
					if ($mByte > 255) { // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
						$outStr .= chr(($mByte >> 8) & 255) . chr($mByte & 255);
					} else {
						$outStr .= chr($mByte);
					}
				} elseif ($useEntityForNoChar) { // Create num entity:
					$outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
				} else { // No char exists
					$outStr .= chr($this->noCharByteVal);
				}
			}
			return $outStr;
		}
	}

	/**
	 * Converts all chars > 127 to numeric entities.
	 *
	 * A byte > 127 that cannot start a multibyte sequence is replaced by
	 * chr($this->noCharByteVal).
	 *
	 * @param	string		Input string
	 * @return	string		Output string
	 */
	function utf8_to_entities($str) {
		$strLen = strlen($str);
		$outStr = '';
		for ($a = 0; $a < $strLen; $a++) { // Traverse each char in UTF-8 string.
			$chr = substr($str, $a, 1);
			$ord = ord($chr);

			if ($ord <= 127) { // Just ASCII 0-127 and one byte. Transparent
				$outStr .= $chr;
				continue;
			}
			if (!($ord & 64)) { // No char exists (MIDDLE of MB sequence!)
				$outStr .= chr($this->noCharByteVal);
				continue;
			}

			$buf = $chr; // Add first byte
			for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
				$ord = $ord << 1; // Shift it left and ...
				if (!($ord & 128)) { // ... and with 8th bit - if that is NOT set, the sequence is complete.
					break;
				}
				$a++; // Increase pointer...
				$buf .= substr($str, $a, 1); // ... and add the next char.
			}

			$outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
		}

		return $outStr;
	}

	/**
	 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
	 *
	 * Entities whose body is not a valid decimal or hexadecimal number, and
	 * named entities that are unknown or not requested, are left unchanged.
	 *
	 * @param	string		Input string, UTF-8
	 * @param	boolean		If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
	 * @return	string		Output string
	 */
	function entities_to_utf8($str, $alsoStdHtmlEnt = 0) {
		$trans_tbl = array();
		if ($alsoStdHtmlEnt) {
				// The table must be in iso-8859-1 because the values are passed through utf8_encode() below.
				// The default table encoding became UTF-8 in PHP 5.4, so request iso-8859-1 explicitly where
				// the third parameter is available (PHP >= 5.3.4).
			if (version_compare(PHP_VERSION, '5.3.4', '>=')) {
				$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1'));
			} else {
				$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
			}
		}

			// Wrap every entity body in a unique token so that explode() puts the bodies at the odd indexes
		$token = md5(microtime());
		$parts = explode($token, preg_replace('/(&([#[:alnum:]]*);)/', $token . '${2}' . $token, $str));
		foreach ($parts as $k => $v) {
			if ($k % 2) {
				$entity = '&' . $v . ';';
				if (substr($v, 0, 1) == '#') { // Dec or hex entities:
					$matches = array();
					if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $v, $matches)) {
						$parts[$k] = $this->UnumberToChar(hexdec($matches[1]));
					} elseif (preg_match('/^#([0-9]+)$/', $v, $matches)) {
						$parts[$k] = $this->UnumberToChar((int) $matches[1]);
					} else { // Not a number: no conversion
						$parts[$k] = $entity;
					}
				} elseif ($alsoStdHtmlEnt && isset($trans_tbl[$entity])) { // Other entities:
					$parts[$k] = $this->utf8_encode($trans_tbl[$entity], 'iso-8859-1');
				} else { // No conversion:
					$parts[$k] = $entity;
				}
			}
		}

		return implode('', $parts);
	}

	/**
	 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
	 *
	 * @param	string		Input string, UTF-8
	 * @param	boolean		If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
	 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
	 * @return	array		Output array with the char numbers
	 */
	function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
			// If entities must be registered as well...:
		if ($convEntities) {
			$str = $this->entities_to_utf8($str, 1);
		}
			// Do conversion:
		$strLen = strlen($str);
		$outArr = array();
		for ($a = 0; $a < $strLen; $a++) { // Traverse each char in UTF-8 string.
			$chr = substr($str, $a, 1);
			$ord = ord($chr);

			if ($ord <= 127) { // Just ASCII 0-127 and one byte. Transparent
				$outArr[] = $retChar ? chr($ord) : $ord;
				continue;
			}
			if (!($ord & 64)) { // No char exists (MIDDLE of MB sequence!)
				$outArr[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
				continue;
			}

			$buf = $chr; // Add first byte
			for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
				$ord = $ord << 1; // Shift it left and ...
				if (!($ord & 128)) { // ... and with 8th bit - if that is NOT set, the sequence is complete.
					break;
				}
				$a++; // Increase pointer...
				$buf .= substr($str, $a, 1); // ... and add the next char.
			}

			$outArr[] = $retChar ? $buf : $this->utf8CharToUnumber($buf);
		}

		return $outArr;
	}

	/**
	 * Converts a UNICODE number to a UTF-8 multibyte character
	 * Algorithm based on script found at From: http://czyborra.com/utf/
	 * Unit-tested by Kasper
	 *
	 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
	 *
	 *  bytes | bits | representation
	 *	  1 |	7 | 0vvvvvvv
	 *	  2 |   11 | 110vvvvv 10vvvvvv
	 *	  3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
	 *	  4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
	 *	  5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
	 *	  6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
	 *
	 * Numbers of 0x80000000 and above yield chr($this->noCharByteVal).
	 *
	 * @param	integer		UNICODE integer
	 * @return	string		UTF-8 multibyte character string
	 * @see utf8CharToUnumber()
	 */
	function UnumberToChar($cbyte) {
		$str = '';

		if ($cbyte < 0x80) {
			$str .= chr($cbyte);
		} elseif ($cbyte < 0x800) {
			$str .= chr(0xC0 | ($cbyte >> 6));
			$str .= chr(0x80 | ($cbyte & 0x3F));
		} elseif ($cbyte < 0x10000) {
			$str .= chr(0xE0 | ($cbyte >> 12));
			$str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
			$str .= chr(0x80 | ($cbyte & 0x3F));
		} elseif ($cbyte < 0x200000) {
			$str .= chr(0xF0 | ($cbyte >> 18));
			$str .= chr(0x80 | (($cbyte >> 12) & 0x3F));
			$str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
			$str .= chr(0x80 | ($cbyte & 0x3F));
		} elseif ($cbyte < 0x4000000) {
			$str .= chr(0xF8 | ($cbyte >> 24));
			$str .= chr(0x80 | (($cbyte >> 18) & 0x3F));
			$str .= chr(0x80 | (($cbyte >> 12) & 0x3F));
			$str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
			$str .= chr(0x80 | ($cbyte & 0x3F));
		} elseif ($cbyte < 0x80000000) {
			$str .= chr(0xFC | ($cbyte >> 30));
			$str .= chr(0x80 | (($cbyte >> 24) & 0x3F));
			$str .= chr(0x80 | (($cbyte >> 18) & 0x3F));
			$str .= chr(0x80 | (($cbyte >> 12) & 0x3F));
			$str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
			$str .= chr(0x80 | ($cbyte & 0x3F));
		} else { // Cannot express a 32-bit character in UTF-8
			$str .= chr($this->noCharByteVal);
		}
		return $str;
	}

	/**
	 * Converts a UTF-8 Multibyte character to a UNICODE number
	 * Unit-tested by Kasper
	 *
	 * @param	string		UTF-8 multibyte character string
	 * @param	boolean		If set, then a hex. number is returned.
	 * @return	integer		UNICODE integer
	 * @see UnumberToChar()
	 */
	function utf8CharToUnumber($str, $hex = 0) {
		$ord = ord(substr($str, 0, 1)); // First char

		if (($ord & 192) == 192) { // This verifyes that it IS a multi byte string
			$binBuf = '';
			for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
				$ord = $ord << 1; // Shift it left and ...
				if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
					$binBuf .= substr('00000000' . decbin(ord(substr($str, $b + 1, 1))), -6);
				} else {
					break;
				}
			}
			$binBuf = substr('00000000' . decbin(ord(substr($str, 0, 1))), -(6 - $b)) . $binBuf;

			$int = bindec($binBuf);
		} else {
			$int = $ord;
		}

		return $hex ? 'x' .
dechex($int) : $int; 01010 } 01011 01012 01013 /******************************************** 01014 * 01015 * Init functions 01016 * 01017 ********************************************/ 01018 01019 /** 01020 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder 01021 * This function is automatically called by the conversion functions 01022 * 01023 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/ 01024 * 01025 * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl) 01026 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed. 01027 * @access private 01028 */ 01029 function initCharset($charset) { 01030 // Only process if the charset is not yet loaded: 01031 if (!is_array($this->parsedCharsets[$charset])) { 01032 01033 // Conversion table filename: 01034 $charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl'; 01035 01036 // If the conversion table is found: 01037 if ($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile)) { 01038 // Cache file for charsets: 01039 // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero. 01040 $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl'); 01041 if ($cacheFile && @is_file($cacheFile)) { 01042 $this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile)); 01043 } else { 01044 // Parse conversion table into lines: 01045 $lines = t3lib_div::trimExplode(LF, t3lib_div::getUrl($charsetConvTableFile), 1); 01046 // Initialize the internal variable holding the conv. 
table: 01047 $this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array()); 01048 // traverse the lines: 01049 $detectedType = ''; 01050 foreach ($lines as $value) { 01051 if (trim($value) && substr($value, 0, 1) != '#') { // Comment line or blanks are ignored. 01052 01053 // Detect type if not done yet: (Done on first real line) 01054 // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE" 01055 if (!$detectedType) { 01056 $detectedType = preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/', $value) ? 'whitespaced' : 'ms-token'; 01057 } 01058 01059 if ($detectedType == 'ms-token') { 01060 list($hexbyte, $utf8) = preg_split('/[=:]/', $value, 3); 01061 } elseif ($detectedType == 'whitespaced') { 01062 $regA = array(); 01063 preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/', $value, $regA); 01064 $hexbyte = $regA[1]; 01065 $utf8 = 'U+' . $regA[2]; 01066 } 01067 $decval = hexdec(trim($hexbyte)); 01068 if ($decval > 127) { 01069 $utf8decval = hexdec(substr(trim($utf8), 2)); 01070 $this->parsedCharsets[$charset]['local'][$decval] = $this->UnumberToChar($utf8decval); 01071 $this->parsedCharsets[$charset]['utf8'][$this->parsedCharsets[$charset]['local'][$decval]] = $decval; 01072 } 01073 } 01074 } 01075 if ($cacheFile) { 01076 t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset])); 01077 } 01078 } 01079 return 2; 01080 } else { 01081 return FALSE; 01082 } 01083 } else { 01084 return 1; 01085 } 01086 } 01087 01088 /** 01089 * This function initializes all UTF-8 character data tables. 01090 * 01091 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/ 01092 * 01093 * @param string Mode ("case", "ascii", ...) 01094 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached). 
* @access private
 */
function initUnicodeData($mode = NULL) {
    // cache files
    $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
    $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

    // Only process if the tables are not yet loaded
    switch ($mode) {
        case 'case':
            if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
                return 1;
            }

            // Use cached version if possible
            if ($cacheFileCase && @is_file($cacheFileCase)) {
                $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
                return 2;
            }
            break;

        case 'ascii':
            if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
                return 1;
            }

            // Use cached version if possible
            if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
                return 2;
            }
            break;
    }

    // process main Unicode data file
    $unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
    if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
        return FALSE;
    }

    $fh = fopen($unicodeDataFile, 'rb');
    if (!$fh) {
        return FALSE;
    }

    // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
    // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
    $this->caseFolding['utf-8'] = array();
    $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
    $utf8CaseFolding['toUpper'] = array();
    $utf8CaseFolding['toLower'] = array();
    $utf8CaseFolding['toTitle'] = array();

    $decomposition = array(); // array of temp. decompositions
    $mark = array(); // array of chars that are marks (eg. composing accents)
    $number = array(); // array of chars that are numbers (eg. digits)
    $omit = array(); // array of chars to be omitted (eg. Russian hard sign)

    // Loop on the fgets() result instead of feof(): this neither processes a bogus FALSE "line" at the end
    // of the file nor loops forever if reading fails.
    while (($line = fgets($fh, 4096)) !== FALSE) {
        if (trim($line) === '') {
            continue; // a blank line carries no character data
        }

        // has a lot of info
        list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title) = explode(';', rtrim($line));

        $ord = hexdec($char);
        if ($ord > 0xFFFF) {
            break;
        } // only process the BMP

        $utf8_char = $this->UnumberToChar($ord);

        if ($upper) {
            $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
        }
        if ($lower) {
            $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
        }
        // store "title" only when different from "upper" (only a few)
        if ($title && $title != $upper) {
            $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
        }

        // First letter of the general category; substr() replaces the curly-brace offset syntax removed in PHP 8
        switch (substr($cat, 0, 1)) {
            case 'M': // mark (accent, umlaut, ...)
                $mark["U+$char"] = 1;
                break;

            case 'N': // numeric value
                if ($ord > 0x80 && $num != '') {
                    $number["U+$char"] = $num;
                }
        }

        // accented Latin letters without "official" decomposition
        $match = array();
        if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
            $c = ord($match[2]);
            if ($match[1] == 'SMALL') {
                $c += 32;
            }

            $decomposition["U+$char"] = array(dechex($c));
            continue;
        }

        $match = array();
        if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
            switch ($match[1]) {
                case '<circle>': // add parenthesis as circle replacement, eg (1)
                    $match[2] = '0028 ' . $match[2] . ' 0029';
                    break;

                case '<square>': // add square brackets as square replacement, eg [1]
                    $match[2] = '005B ' . $match[2] . ' 005D';
                    break;

                case '<compat>': // ignore multi char decompositions that start with a space
                    if (preg_match('/^0020 /', $match[2])) {
                        continue 2;
                    }
                    break;

                // ignore Arabic and vertical layout presentation decomposition
                case '<initial>':
                case '<medial>':
                case '<final>':
                case '<isolated>':
                case '<vertical>':
                    continue 2;
            }
            $decomposition["U+$char"] = explode(' ', $match[2]);
        }
    }
    fclose($fh);

    // process additional Unicode data for casing (allow folded characters to expand into a sequence)
    $specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
    if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
        $fh = fopen($specialCasingFile, 'rb');
        if ($fh) {
            while (($line = fgets($fh, 4096)) !== FALSE) {
                if (substr($line, 0, 1) != '#' && trim($line) != '') {

                    list($char, $lower, $title, $upper, $cond) = t3lib_div::trimExplode(';', $line);
                    // Only unconditional mappings are used: the 5th field is empty or already the trailing comment
                    if ($cond == '' || substr($cond, 0, 1) == '#') {
                        $utf8_char = $this->UnumberToChar(hexdec($char));
                        if ($char != $lower) {
                            $arr = explode(' ', $lower);
                            for ($i = 0; isset($arr[$i]); $i++) {
                                $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                            }
                            $utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
                        }
                        if ($char != $title && $title != $upper) {
                            $arr = explode(' ', $title);
                            for ($i = 0; isset($arr[$i]); $i++) {
                                $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                            }
                            $utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
                        }
                        if ($char != $upper) {
                            $arr = explode(' ', $upper);
                            for ($i = 0; isset($arr[$i]); $i++) {
                                $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                            }
                            $utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
                        }
                    }
                }
            }
            fclose($fh);
        }
    }

    // process custom decompositions
    $customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
    if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
        $fh = fopen($customTranslitFile, 'rb');
        if ($fh) {
            while (($line = fgets($fh, 4096)) !== FALSE) {
                if (substr($line, 0, 1) != '#' && trim($line) != '') {
                    list($char, $translit) = t3lib_div::trimExplode(';', $line);
                    if (!$translit) {
                        $omit["U+$char"] = 1;
                    }
                    $decomposition["U+$char"] = explode(' ', $translit);

                }
            }
            fclose($fh);
        }
    }

    // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
    foreach ($decomposition as $from => $to) {
        $code_decomp = array();

        while ($code_value = array_shift($to)) {
            if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
                foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
                    array_unshift($to, $cv);
                }
            } elseif (!isset($mark["U+$code_value"])) { // remove mark
                array_push($code_decomp, $code_value);
            }
        }
        if (count($code_decomp) || isset($omit[$from])) {
            $decomposition[$from] = $code_decomp;
        } else {
            unset($decomposition[$from]);
        }
    }

    // create ascii only mapping
    $this->toASCII['utf-8'] = array();
    $ascii =& $this->toASCII['utf-8'];

    foreach ($decomposition as $from => $to) {
        $code_decomp = array();
        while ($code_value = array_shift($to)) {
            $ord = hexdec($code_value);
            if ($ord > 127) {
                continue 2; // skip decompositions containing non-ASCII chars
            }
            array_push($code_decomp, chr($ord));
        }
        // Keys look like "U+00C0": strip the prefix so hexdec() only sees hex digits
        $ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = join('', $code_decomp);
    }

    // add numeric decompositions
    foreach ($number as $from => $to) {
        $utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
        if (!isset($ascii[$utf8_char])) {
            $ascii[$utf8_char] = $to;
        }
    }

    if ($cacheFileCase) {
        t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
    }

    if ($cacheFileASCII) {
        t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
    }

    return 3;
}

/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * @param string Charset for which to initialize case folding.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
    // Only process if the case table is not yet loaded:
    if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
        return 1;
    }

    // Use cached version if possible
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
        return 2;
    }

    // init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return FALSE;
    }

    // UTF-8 case folding is used as the base conversion table
    if (!$this->initUnicodeData('case')) {
        return FALSE;
    }

    $nochar = chr($this->noCharByteVal);
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);

        foreach (array('toUpper', 'toLower', 'toTitle') as $direction) {
            // Most characters have no mapping in a given direction; skip them instead of reading an undefined index
            if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
                continue;
            }

            $cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
            if ($cc != '' && $cc != $nochar) {
                $this->caseFolding[$charset][$direction][$c] = $cc;
            }
        }
    }

    // add the ASCII case table
    for ($i = ord('a'); $i <= ord('z'); $i++) {
        $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
    }
    for ($i = ord('A'); $i <= ord('Z'); $i++) {
        $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
    }

    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
    }

    return 3;
}

/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * @param string Charset for which to initialize conversion.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
    // Only process if the to-ASCII table is not yet loaded:
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }

    // Use cached version if possible
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
        return 2;
    }

    // init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return FALSE;
    }

    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData('ascii')) {
        return FALSE;
    }

    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);

        if (isset($this->toASCII['utf-8'][$utf8])) {
            $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
        }
    }

    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
    }

    return 3;
}


/********************************************
 *
 * String operation functions
 *
 ********************************************/

/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Delegates to mbstring or iconv when $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] selects one of
 * them; otherwise the charset decides which internal implementation is used.
 *
 * @param string The character set
 * @param string Character string
 * @param integer Start position (character position)
 * @param integer Length (in characters); if omitted (NULL) the rest of the string is returned
 * @return string The substring
 * @see substr(), mb_substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset, $string, $start, $len = NULL) {
    if ($len === 0 || $string === '') {
        return '';
    }

    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        // cannot omit $len, when specifying charset
        if ($len == NULL) {
            $enc = mb_internal_encoding(); // save internal encoding
            mb_internal_encoding($charset);
            $str = mb_substr($string, $start);
            mb_internal_encoding($enc); // restore internal encoding

            return $str;
        }
        else {
            return mb_substr($string, $start, $len, $charset);
        }
    } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
        // cannot omit $len, when specifying charset
        if ($len == NULL) {
            $enc = iconv_get_encoding('internal_encoding'); // save internal encoding
            iconv_set_encoding('internal_encoding', $charset);
            $str = iconv_substr($string, $start);
            iconv_set_encoding('internal_encoding', $enc); // restore internal encoding

            return $str;
        }
        else {
            return iconv_substr($string, $start, $len, $charset);
        }
    } elseif ($charset == 'utf-8') {
        return $this->utf8_substr($string, $start, $len);
    } elseif ($this->eucBasedSets[$charset]) {
        return $this->euc_substr($string, $start, $charset, $len);
    } elseif ($this->twoByteSets[$charset]) {
        // An omitted length must not be multiplied: NULL * 2 is 0 and would yield an empty string
        return $len === NULL ? substr($string, $start * 2) : substr($string, $start * 2, $len * 2);
    } elseif ($this->fourByteSets[$charset]) {
        return $len === NULL ? substr($string, $start * 4) : substr($string, $start * 4, $len * 4);
    }

    // treat everything else as single-byte encoding
    return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
}

/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param string The character set
 * @param string Character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset, $string) {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return mb_strlen($string, $charset);
    } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
        return iconv_strlen($string, $charset);
    } elseif ($charset == 'utf-8') {
        return $this->utf8_strlen($string);
    } elseif ($this->eucBasedSets[$charset]) {
        return $this->euc_strlen($string, $charset);
    } elseif ($this->twoByteSets[$charset]) {
        return strlen($string) / 2;
    } elseif ($this->fourByteSets[$charset]) {
        return strlen($string) / 4;
    }
    // treat everything else as single-byte encoding
    return strlen($string);
}

/**
 * Method to crop strings using the mb_substr function.
 *
 * @param string The character set
 * @param string String to be cropped
 * @param integer Crop length (in characters)
 * @param string Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
    if (intval($len) === 0 || mb_strlen($string, $charset) <= abs($len)) {
        return $string;
    }

    if ($len > 0) {
        $string = mb_substr($string, 0, $len, $charset) . $crop;
    } else {
        $string = $crop . mb_substr($string, $len, mb_strlen($string, $charset), $charset);
    }

    return $string;
}

/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive length keeps the first characters and appends the crop signifier, a negative length keeps
 * the last characters and prepends it. The string is returned unchanged if it is not longer than the
 * requested length.
 *
 * @param string The character set
 * @param string Character string
 * @param integer Length (in characters)
 * @param string Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset, $string, $len, $crop = '') {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return $this->cropMbstring($charset, $string, $len, $crop);
    }

    if (intval($len) == 0) {
        return $string;
    }

    // Translate the character position into a byte position
    if ($charset == 'utf-8') {
        $i = $this->utf8_char2byte_pos($string, $len);
    } elseif ($this->eucBasedSets[$charset]) {
        $i = $this->euc_char2byte_pos($string, $len, $charset);
    } else {
        if ($len > 0) {
            $i = $len;
        } else {
            $i = strlen($string) + $len;
            if ($i <= 0) {
                $i = FALSE;
            }
        }
    }

    if ($i === FALSE) { // $len outside actual string length
        return $string;
    }

    // Only crop if there really is a byte at the cut position, i.e. something would be cut off.
    // isset() on the offset replaces the former strlen($string{$i}) test, which used the curly-brace
    // offset syntax (removed in PHP 8) and read past the end of the string.
    if ($len > 0) {
        if (isset($string[$i])) {
            return substr($string, 0, $i) . $crop;
        }
    } else {
        if ($i > 0 && isset($string[$i - 1])) {
            return $crop . substr($string, $i);
        }
    }

    return $string;
}

/**
 * Cuts a string short at a given byte length.
*
 * The cut never splits a character: multi-byte charsets are handed to a charset-aware helper and
 * fixed-width charsets are realigned to a character boundary first.
 *
 * @param string The character set
 * @param string Character string
 * @param integer The byte length
 * @return string The shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset, $string, $len) {
    if ($len <= 0) {
        return '';
    }

    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return mb_strcut($string, 0, $len, $charset);
    }
    if ($charset == 'utf-8') {
        return $this->utf8_strtrunc($string, $len);
    }
    if ($this->eucBasedSets[$charset]) {
        return $this->euc_strtrunc($string, $len, $charset);
    }

    if ($this->twoByteSets[$charset]) {
        // don't cut at odd positions
        $len -= $len % 2;
    } elseif ($this->fourByteSets[$charset]) {
        // realign to position dividable by four
        $len -= $len % 4;
    }

    // treat everything else as single-byte encoding
    return substr($string, 0, $len);
}

/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
*
 * @param string Character set of string
 * @param string Input string to convert case for
 * @param string Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return string The converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset, $string, $case) {
    // mbstring handles every charset itself
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return $case == 'toLower'
            ? mb_strtolower($string, $charset)
            : mb_strtoupper($string, $charset);
    }

    if ($charset == 'utf-8') {
        return $this->utf8_char_mapping($string, 'case', $case);
    }

    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'case', $case);
    }

    // treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'case', $case);
}

/**
 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
*
 * @param string Character set of string
 * @param string Input string to convert
 * @return string The converted string
 */
function specCharsToASCII($charset, $string) {
    if ($charset == 'utf-8') {
        $string = $this->utf8_char_mapping($string, 'ascii');
    } elseif (isset($this->eucBasedSets[$charset])) {
        $string = $this->euc_char_mapping($string, $charset, 'ascii');
    } else {
        // treat everything else as single-byte encoding
        $string = $this->sb_char_mapping($string, $charset, 'ascii');
    }

    return $string;
}


/**
 * converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code
 *
 * Languages are tried by descending quality ("q" value); languages with the same quality are tried in
 * the order in which they appear in the list. For each language the full code (eg. "en-us") is tried
 * first, then the code without its country part (eg. "en").
 *
 * @param $languageCodesList list of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 *             see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
 * @return string a preferred language that TYPO3 supports, or "default" if none found
 * @author Benjamin Mack (benni.typo3.org)
 */
public function getPreferredClientLanguage($languageCodesList) {
    $allLanguageCodes = array();
    $selectedLanguage = 'default';

    // get all languages where TYPO3 code is the same as the ISO code
    foreach ($this->charSetArray as $typo3Lang => $charSet) {
        $allLanguageCodes[$typo3Lang] = $typo3Lang;
    }

    // get all languages where TYPO3 code differs from ISO code
    // or needs the country part
    // the iso codes will here overwrite the default typo3 language in the key
    foreach ($this->isoArray as $typo3Lang => $isoLang) {
        $isoLang = join('-', explode('_', $isoLang));
        $allLanguageCodes[$typo3Lang] = $isoLang;
    }

    // move the iso codes to the keys (because we're comparing the keys with "isset" later on)
    $allLanguageCodes = array_flip($allLanguageCodes);


    $preferredLanguages = t3lib_div::trimExplode(',', $languageCodesList);
    // collect the quality of each preferred language, keyed by language code
    $sortedPreferredLanguages = array();
    foreach ($preferredLanguages as $preferredLanguage) {
        $quality = 1.0;
        if (strpos($preferredLanguage, ';q=') !== FALSE) {
            list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
        }
        $sortedPreferredLanguages[$preferredLanguage] = $quality;
    }

    // Order by quality (highest first), using the position in the list as tie-breaker. A plain arsort()
    // is not a stable sort before PHP 8 and could shuffle languages that share the same quality.
    $languages = array_keys($sortedPreferredLanguages);
    $qualities = array_map('floatval', array_values($sortedPreferredLanguages));
    $positions = array_keys($languages);
    array_multisort($qualities, SORT_DESC, SORT_NUMERIC, $positions, SORT_ASC, SORT_NUMERIC, $languages);

    // loop through the languages, with the highest priority first
    foreach ($languages as $preferredLanguage) {
        $preferredLanguage = (string) $preferredLanguage;
        if (isset($allLanguageCodes[$preferredLanguage])) {
            $selectedLanguage = $allLanguageCodes[$preferredLanguage];
            break;
        }

        // strip the country code from the end (codes without a country part are left as they are)
        $languageParts = explode('-', $preferredLanguage);
        $preferredLanguage = $languageParts[0];
        if (isset($allLanguageCodes[$preferredLanguage])) {
            $selectedLanguage = $allLanguageCodes[$preferredLanguage];
            break;
        }
    }
    if (!$selectedLanguage || $selectedLanguage == 'en') {
        $selectedLanguage = 'default';
    }
    return $selectedLanguage;
}


/********************************************
 *
 * Internal string operation functions
 *
 ********************************************/

/**
 * Maps all characters of a string in a single byte charset.
*
	 * Every byte that is a key of the selected map is replaced by the mapped
	 * value; all other bytes are copied unchanged. The input is returned
	 * untouched when the mode is unknown or the map could not be initialised.
	 *
	 * @param	string		the string
	 * @param	string		the charset
	 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
	 * @param	string		'case': conversion 'toLower' or 'toUpper'
	 * @return	string		the converted string
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function sb_char_mapping($str, $charset, $mode, $opt = '') {
		switch ($mode) {
			case 'case':
				if (!$this->initCaseFolding($charset)) {
					return $str; // do nothing
				}
				$map =& $this->caseFolding[$charset][$opt];
				break;

			case 'ascii':
				if (!$this->initToASCII($charset)) {
					return $str; // do nothing
				}
				$map =& $this->toASCII[$charset];
				break;

			default:
				return $str;
		}

		$str = (string) $str;
		$length = strlen($str);
		$out = '';
			// bounded loop with bracket offsets: the curly-brace offset syntax is
			// gone in PHP 8 and reading one byte past the end raises a warning
		for ($i = 0; $i < $length; $i++) {
			$c = $str[$i];
			$out .= isset($map[$c]) ? $map[$c] : $c;
		}

		return $out;
	}


	/********************************************
	 *
	 * Internal UTF-8 string operation functions
	 *
	 ********************************************/

	/**
	 * Returns a part of a UTF-8 string.
* Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
	 *
	 * Character positions are translated to byte positions with
	 * utf8_char2byte_pos(); the actual cutting is done with substr().
	 *
	 * @param	string		UTF-8 string
	 * @param	integer		Start position (character position)
	 * @param	integer		Length (in characters)
	 * @return	string		The substring; FALSE if a positive $start lies outside the string
	 * @see substr()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_substr($str, $start, $len = NULL) {
			// an explicit length of zero (0 or '0') always yields an empty string
		if ($len !== NULL && (string) $len === '0') {
			return '';
		}

		$byte_start = $this->utf8_char2byte_pos($str, $start);
		if ($byte_start === FALSE) {
			if ($start > 0) {
				return FALSE; // $start outside string length
			}
				// zero/negative start that could not be resolved: begin at the first byte
			$byte_start = 0;
		}

		$str = substr($str, $byte_start);

			// no (or empty) length given: return everything from the start position
		if ($len == NULL) {
			return $str;
		}

		$byte_end = $this->utf8_char2byte_pos($str, $len);
		if ($byte_end === FALSE) {
				// $len outside actual string length:
				// when length is less than zero and exceeds, then we return blank string.
			return $len < 0 ? '' : $str;
		}

		return substr($str, 0, $byte_end);
	}

	/**
	 * Counts the number of characters of a string in UTF-8.
	 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
	 *
	 * Every byte that is not a continuation byte (10xxxxxx) is counted as the
	 * start of one character.
	 *
	 * @param	string		UTF-8 multibyte character string
	 * @return	integer		The number of characters
	 * @see strlen()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_strlen($str) {
		$str = (string) $str;
		$length = strlen($str);
		$n = 0;
		for ($i = 0; $i < $length; $i++) {
				// single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) count
			if ((ord($str[$i]) & 0xC0) != 0x80) {
				$n++;
			}
		}
		return $n;
	}

	/**
	 * Truncates a string in UTF-8 short at a given byte length.
*
	 * If the cut would split a multi-byte character, the string is cut before
	 * that character instead. A character that ends exactly at the byte limit
	 * is kept, also when it is the very first character of the string.
	 *
	 * @param	string		UTF-8 multibyte character string
	 * @param	integer		the byte length
	 * @return	string		the shortened string
	 * @see mb_strcut()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_strtrunc($str, $len) {
		$str = (string) $str;
		$i = $len - 1;

			// the byte at the cut position does not exist: nothing can be split
		if ($i < 0 || $i >= strlen($str)) {
			return substr($str, 0, $len);
		}

		if (ord($str[$i]) & 0x80) { // part of a multibyte sequence
				// find the first byte
			while ($i > 0 && !(ord($str[$i]) & 0x40)) {
				$i--;
			}
				// sanity check: reached the start of the string without finding a
				// starting byte (11xxxxxx)
			if ($i <= 0 && (ord($str[0]) & 0xC0) != 0xC0) {
				return '';
			}
				// calculate number of bytes from the leading 1-bits of the first byte
			for ($bc = 0, $mbs = ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1) {
				$bc++;
			}
			if ($bc + $i > $len) {
				return substr($str, 0, $i);
			}
				// fallthru: multibyte char fits into length
		}
		return substr($str, 0, $len);
	}

	/**
	 * Find position of first occurrence of a string, both arguments are in UTF-8.
	 *
	 * Delegates to mb_strpos() or iconv_strpos() when
	 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] is 'mbstring' or 'iconv';
	 * otherwise strpos() is used on byte positions, which are translated from
	 * and to character positions.
	 *
	 * @param	string		UTF-8 string to search in
	 * @param	string		UTF-8 string to search for
	 * @param	integer		Position to start the search
	 * @return	integer		The character position; FALSE if the offset is beyond the string or the needle was not found
	 * @see strpos()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_strpos($haystack, $needle, $offset = 0) {
			// the setting may be missing altogether; treat that as "no extension"
		$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
			? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
			: '';

		if ($utils == 'mbstring') {
			return mb_strpos($haystack, $needle, $offset, 'utf-8');
		} elseif ($utils == 'iconv') {
			return iconv_strpos($haystack, $needle, $offset, 'utf-8');
		}

		$byte_offset = $this->utf8_char2byte_pos($haystack, $offset);
		if ($byte_offset === FALSE) {
			return FALSE; // offset beyond string length
		}

		$byte_pos = strpos($haystack, $needle, $byte_offset);
		if ($byte_pos === FALSE) {
			return FALSE; // needle not found
		}

		return $this->utf8_byte2char_pos($haystack, $byte_pos);
	}

	/**
	 * Find position of
last occurrence of a char in a string, both arguments are in UTF-8.
	 *
	 * Delegates to mb_strrpos() or iconv_strrpos() when
	 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] is 'mbstring' or 'iconv';
	 * otherwise strrpos() is used and the byte position is translated into a
	 * character position.
	 *
	 * @param	string		UTF-8 string to search in
	 * @param	string		UTF-8 character to search for (single character)
	 * @return	integer		The character position; FALSE if the needle was not found
	 * @see strrpos()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_strrpos($haystack, $needle) {
			// the setting may be missing altogether; treat that as "no extension"
		$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
			? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
			: '';

		if ($utils == 'mbstring') {
				// mb_strrpos() takes the offset as third and the encoding as fourth
				// argument; passing the encoding third is no longer accepted by PHP 8
			return mb_strrpos($haystack, $needle, 0, 'utf-8');
		} elseif ($utils == 'iconv') {
			return iconv_strrpos($haystack, $needle, 'utf-8');
		}

		$byte_pos = strrpos($haystack, $needle);
		if ($byte_pos === FALSE) {
			return FALSE; // needle not found
		}

		return $this->utf8_byte2char_pos($haystack, $byte_pos);
	}

	/**
	 * Translates a character position into an 'absolute' byte position.
	 * Unit tested by Kasper.
*
	 * For $pos >= 0 the string is scanned forwards from the first byte, for
	 * $pos < 0 backwards from the last byte. FALSE is returned when the scan
	 * runs off either end of the string (this includes an empty string and a
	 * negative position that reaches back to the first character, which callers
	 * such as utf8_substr() treat as "start at byte 0").
	 *
	 * @param	string		UTF-8 string
	 * @param	integer		Character position (negative values start from the end)
	 * @return	integer		Byte position; FALSE if the position is outside the string
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_char2byte_pos($str, $pos) {
		$str = (string) $str;
		$length = strlen($str);
		$n = 0; // number of characters found
		$p = abs($pos); // number of characters wanted

		if ($pos >= 0) {
			$i = 0;
			$d = 1;
		} else {
			$i = $length - 1;
			$d = -1;
		}

			// explicit bounds on both sides: negative string offsets are valid since
			// PHP 7.1, so an "empty byte" test no longer stops the backward scan
		for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
				// single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) count
			if ((ord($str[$i]) & 0xC0) != 0x80) {
				$n++;
			}
		}
		if ($i < 0 || $i >= $length) {
			return FALSE; // offset beyond string length
		}

		if ($pos >= 0) {
				// skip trailing multi-byte data bytes
			while ($i < $length && (ord($str[$i]) & 0xC0) == 0x80) {
				$i++;
			}
		} else {
				// correct offset
			$i++;
		}

		return $i;
	}

	/**
	 * Translates an 'absolute' byte position into a character position.
	 * Unit tested by Kasper.
	 *
	 * Counts the character starts found at byte positions 1 to $pos.
	 *
	 * @param	string		UTF-8 string
	 * @param	integer		byte position
	 * @return	integer		character position; FALSE if the byte position is outside the string
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_byte2char_pos($str, $pos) {
		$str = (string) $str;
		if ($pos < 0 || $pos >= strlen($str)) {
			return FALSE; // offset beyond string length
		}

		$n = 0; // number of characters
		for ($i = $pos; $i > 0; $i--) {
				// single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) count
			if ((ord($str[$i]) & 0xC0) != 0x80) {
				$n++;
			}
		}

		return $n;
	}

	/**
	 * Maps all characters of an UTF-8 string.
*
	 * The string is split into characters by looking at the starting bytes;
	 * every character that is a key of the selected map is replaced by the
	 * mapped value. Bytes that are neither single-byte characters nor starting
	 * bytes (stray continuation bytes) are copied through unchanged.
	 *
	 * @param	string		UTF-8 string
	 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
	 * @param	string		'case': conversion 'toLower' or 'toUpper'
	 * @return	string		the converted string
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_char_mapping($str, $mode, $opt = '') {
		if (!$this->initUnicodeData($mode)) {
			return $str; // do nothing
		}

		switch ($mode) {
			case 'case':
				$map =& $this->caseFolding['utf-8'][$opt];
				break;

			case 'ascii':
				$map =& $this->toASCII['utf-8'];
				break;

			default:
				return $str;
		}

		$str = (string) $str;
		$length = strlen($str);
		$out = '';
		for ($i = 0; $i < $length; $i++) {
			$c = ord($str[$i]);
			if (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
					// calculate number of bytes from the leading 1-bits
				for ($bc = 0; $c & 0x80; $c = $c << 1) {
					$bc++;
				}
				$mbc = substr($str, $i, $bc);
				$i += $bc - 1;
			} else {
					// single-byte (0xxxxxxx), or a stray continuation byte which must
					// not re-use the character of the previous iteration
				$mbc = $str[$i];
			}

			$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
		}

		return $out;
	}


	/********************************************
	 *
	 * Internal EUC string operation functions
	 *
	 * Extended Unix Code:
	 *  ASCII compatible 7bit single bytes chars
	 *  8bit two byte chars
	 *
	 * Shift-JIS is treated as a special case.
	 *
	 ********************************************/

	/**
	 * Cuts a string in the EUC charset family short at a given byte length.
*
	 * A double-byte character that would be split by the cut is dropped
	 * completely, so the result never exceeds the given byte length.
	 *
	 * @param	string		EUC multibyte character string
	 * @param	integer		the byte length
	 * @param	string		the charset
	 * @return	string		the shortened string
	 * @see mb_strcut()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function euc_strtrunc($str, $len, $charset) {
		$str = (string) $str;
		$length = strlen($str);
		if ($length == 0) {
			return $str;
		}

		$sjis = ($charset == 'shift_jis');
		for ($i = 0; $i < $length && $i < $len; $i++) {
			$c = ord($str[$i]);
			if ($sjis) {
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
					$i++; // advance a double-byte char
				}
			} else {
				if ($c >= 0x80) {
					$i++; // advance a double-byte char
				}
			}
		}

			// This test has to come before the end-of-string test: a double-byte
			// char straddling the cut may also be the last char of the string.
		if ($i > $len) {
			return substr($str, 0, $len - 1); // we ended on a first byte
		}
		if ($i >= $length) {
			return $str; // string shorter than supplied length
		}

		return substr($str, 0, $len);
	}

	/**
	 * Returns a part of a string in the EUC charset family.
*
	 * Character positions are translated to byte positions with
	 * euc_char2byte_pos(); the actual cutting is done with substr().
	 * Zero length, unresolvable zero/negative start and overshooting negative
	 * length are handled the same way as in utf8_substr().
	 *
	 * @param	string		EUC multibyte character string
	 * @param	integer		start position (character position)
	 * @param	string		the charset
	 * @param	integer		length (in characters)
	 * @return	string		the substring; FALSE if a positive $start lies outside the string
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function euc_substr($str, $start, $charset, $len = NULL) {
			// an explicit length of zero (0 or '0') always yields an empty string
		if ($len !== NULL && (string) $len === '0') {
			return '';
		}

		$byte_start = $this->euc_char2byte_pos($str, $start, $charset);
		if ($byte_start === FALSE) {
			if ($start > 0) {
				return FALSE; // $start outside string length
			}
				// zero/negative start that could not be resolved: begin at the first byte
			$byte_start = 0;
		}

		$str = substr($str, $byte_start);

			// no (or empty) length given: return everything from the start position
		if ($len == NULL) {
			return $str;
		}

		$byte_end = $this->euc_char2byte_pos($str, $len, $charset);
		if ($byte_end === FALSE) {
				// $len outside actual string length: a negative length that exceeds
				// the string leaves nothing, a positive one leaves everything
			return $len < 0 ? '' : $str;
		}

		return substr($str, 0, $byte_end);
	}

	/**
	 * Counts the number of characters of a string in the EUC charset family.
	 *
	 * A byte recognised as the first byte of a double-byte character is counted
	 * together with the byte that follows it.
	 *
	 * @param	string		EUC multibyte character string
	 * @param	string		the charset
	 * @return	integer		the number of characters
	 * @see strlen()
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function euc_strlen($str, $charset) {
		$str = (string) $str;
		$length = strlen($str);
		$sjis = ($charset == 'shift_jis');
		$n = 0;
		for ($i = 0; $i < $length; $i++) {
			$c = ord($str[$i]);
			if ($sjis) {
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
					$i++; // advance a double-byte char
				}
			} else {
				if ($c >= 0x80) {
					$i++; // advance a double-byte char
				}
			}

			$n++;
		}

		return $n;
	}

	/**
	 * Translates a character position into an 'absolute' byte position.
*
	 * For $pos >= 0 the string is scanned forwards from the first byte, for
	 * $pos < 0 backwards from the last byte. FALSE is returned when the scan
	 * runs off either end of the string.
	 *
	 * @param	string		EUC multibyte character string
	 * @param	integer		character position (negative values start from the end)
	 * @param	string		the charset
	 * @return	integer		byte position; FALSE if the position is outside the string
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function euc_char2byte_pos($str, $pos, $charset) {
		$str = (string) $str;
		$length = strlen($str);
		$sjis = ($charset == 'shift_jis');
		$n = 0; // number of characters seen
		$p = abs($pos); // number of characters wanted

		if ($pos >= 0) {
			$i = 0;
			$d = 1;
		} else {
			$i = $length - 1;
			$d = -1;
		}

			// explicit bounds on both sides: negative string offsets are valid since
			// PHP 7.1, so an "empty byte" test no longer stops the backward scan
		for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
			$c = ord($str[$i]);
			if ($sjis) {
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
					$i += $d; // advance a double-byte char
				}
			} else {
				if ($c >= 0x80) {
					$i += $d; // advance a double-byte char
				}
			}

			$n++;
		}
		if ($i < 0 || $i >= $length) {
			return FALSE; // offset beyond string length
		}

		if ($pos < 0) {
			$i++; // correct offset
		}

		return $i;
	}

	/**
	 * Maps all characters of a string in the EUC charset family.
*
	 * The string is split into single-byte and double-byte characters; every
	 * character that is a key of the selected map is replaced by the mapped
	 * value. The input is returned untouched when the mode is unknown or the
	 * map could not be initialised.
	 *
	 * @param	string		EUC multibyte character string
	 * @param	string		the charset
	 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
	 * @param	string		'case': conversion 'toLower' or 'toUpper'
	 * @return	string		the converted string
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function euc_char_mapping($str, $charset, $mode, $opt = '') {
		switch ($mode) {
			case 'case':
				if (!$this->initCaseFolding($charset)) {
					return $str; // do nothing
				}
				$map =& $this->caseFolding[$charset][$opt];
				break;

			case 'ascii':
				if (!$this->initToASCII($charset)) {
					return $str; // do nothing
				}
				$map =& $this->toASCII[$charset];
				break;

			default:
				return $str;
		}

		$str = (string) $str;
		$length = strlen($str);
		$sjis = ($charset == 'shift_jis');
		$out = '';
			// bounded loop with bracket offsets: the curly-brace offset syntax is
			// gone in PHP 8 and reading one byte past the end raises a warning
		for ($i = 0; $i < $length; $i++) {
			$mbc = $str[$i];
			$c = ord($mbc);

			if ($sjis) {
				$isDoubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
			} else {
				$isDoubleByte = ($c >= 0x80);
			}

			if ($isDoubleByte) { // a double-byte char
				$mbc = substr($str, $i, 2);
				$i++;
			}

			$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
		}

		return $out;
	}

}

if (defined('TYPO3_MODE') && isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
	include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}

?>
1.8.0