TYPO3 API  SVNRelease
class.t3lib_cs.php
Go to the documentation of this file.
00001 <?php
00002 /***************************************************************
00003  *  Copyright notice
00004  *
00005  *  (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
00006  *  All rights reserved
00007  *
00008  *  This script is part of the Typo3 project. The Typo3 project is
00009  *  free software; you can redistribute it and/or modify
00010  *  it under the terms of the GNU General Public License as published by
00011  *  the Free Software Foundation; either version 2 of the License, or
00012  *  (at your option) any later version.
00013  *
00014  *  The GNU General Public License can be found at
00015  *  http://www.gnu.org/copyleft/gpl.html.
00016  *
00017  *  This script is distributed in the hope that it will be useful,
00018  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00019  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00020  *  GNU General Public License for more details.
00021  *
00022  *  This copyright notice MUST APPEAR in all copies of the script!
00023  ***************************************************************/
00024 /**
00025  * Class for conversion between charsets.
00026  *
00027  * $Id: class.t3lib_cs.php 10330 2011-01-26 10:28:29Z steffenk $
00028  *
00029  * @author  Kasper Skårhøj <kasperYYYY@typo3.com>
00030  * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
00031  */
00032 /**
00033  * [CLASS/FUNCTION INDEX of SCRIPT]
00034  *
00035  *
00036  *
00037  *  136: class t3lib_cs
00038  *  488:     function parse_charset($charset)
00039  *  507:     function get_locale_charset($locale)
00040  *
00041  *            SECTION: Charset Conversion functions
00042  *  560:     function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
00043  *  600:     function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
00044  *  617:     function utf8_encode($str,$charset)
00045  *  663:     function utf8_decode($str,$charset,$useEntityForNoChar=0)
00046  *  706:     function utf8_to_entities($str)
00047  *  739:     function entities_to_utf8($str,$alsoStdHtmlEnt=0)
00048  *  773:     function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
00049  *  823:     function UnumberToChar($cbyte)
00050  *  868:     function utf8CharToUnumber($str,$hex=0)
00051  *
00052  *            SECTION: Init functions
00053  *  911:     function initCharset($charset)
00054  *  973:     function initUnicodeData($mode=null)
00055  * 1198:     function initCaseFolding($charset)
00056  * 1260:     function initToASCII($charset)
00057  *
00058  *            SECTION: String operation functions
00059  * 1331:     function substr($charset,$string,$start,$len=null)
00060  * 1384:     function strlen($charset,$string)
00061  * 1414:     function crop($charset,$string,$len,$crop='')
00062  * 1467:     function strtrunc($charset,$string,$len)
00063  * 1501:     function conv_case($charset,$string,$case)
00064  * 1527:     function specCharsToASCII($charset,$string)
00065  *
00066  *            SECTION: Internal string operation functions
00067  * 1567:     function sb_char_mapping($str,$charset,$mode,$opt='')
00068  *
00069  *            SECTION: Internal UTF-8 string operation functions
00070  * 1622:     function utf8_substr($str,$start,$len=null)
00071  * 1655:     function utf8_strlen($str)
00072  * 1676:     function utf8_strtrunc($str,$len)
00073  * 1698:     function utf8_strpos($haystack,$needle,$offset=0)
00074  * 1723:     function utf8_strrpos($haystack,$needle)
00075  * 1745:     function utf8_char2byte_pos($str,$pos)
00076  * 1786:     function utf8_byte2char_pos($str,$pos)
00077  * 1809:     function utf8_char_mapping($str,$mode,$opt='')
00078  *
00079  *            SECTION: Internal EUC string operation functions
00080  * 1885:     function euc_strtrunc($str,$len,$charset)
00081  * 1914:     function euc_substr($str,$start,$charset,$len=null)
00082  * 1939:     function euc_strlen($str,$charset)
00083  * 1966:     function euc_char2byte_pos($str,$pos,$charset)
00084  * 2007:     function euc_char_mapping($str,$charset,$mode,$opt='')
00085  *
00086  * TOTAL FUNCTIONS: 35
00087  * (This index is automatically created/updated by the extension "extdeveval")
00088  *
00089  */
00090 
00091 
00092 /**
00093  * Notes on UTF-8
00094  *
00095  * Functions working on UTF-8 strings:
00096  *
00097  * - strchr/strstr
00098  * - strrchr
00099  * - substr_count
00100  * - implode/explode/join
00101  *
00102  * Functions nearly working on UTF-8 strings:
00103  *
00104  * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
00105  * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
00106  * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
00107  * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
00108  * - preg_*: PCRE support is compiled into PHP by default nowadays but may be unavailable; patterns need the "u" (UTF-8) modifier
00109  *
00110  * Functions NOT working on UTF-8 strings:
00111  *
00112  * - str*cmp
00113  * - stristr
00114  * - stripos
00115  * - substr
00116  * - strrev
00117  * - split/spliti
00118  * - ...
00119  *
00120  */
00121 /**
00122  * Class for conversion between charsets
00123  *
00124  * @author  Kasper Skårhøj <kasperYYYY@typo3.com>
00125  * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
00126  * @package TYPO3
00127  * @subpackage t3lib
00128  */
00129 class t3lib_cs {
00130     var $noCharByteVal = 63; // ASCII Value for chars with no equivalent.
00131 
00132         // This is the array where parsed conversion tables are stored (cached)
00133     var $parsedCharsets = array();
00134 
00135         // An array where case folding data will be stored (cached)
00136     var $caseFolding = array();
00137 
00138         // An array where charset-to-ASCII mappings are stored (cached)
00139     var $toASCII = array();
00140 
00141         // This tells the converter which charsets have two bytes per char:
00142     var $twoByteSets = array(
00143         'ucs-2' => 1, // 2-byte Unicode
00144     );
00145 
00146         // This tells the converter which charsets have four bytes per char:
00147     var $fourByteSets = array(
00148         'ucs-4' => 1, // 4-byte Unicode
00149         'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
00150     );
00151 
00152         // This tells the converter which charsets use a scheme like the Extended Unix Code:
00153     var $eucBasedSets = array(
00154         'gb2312' => 1, // Chinese, simplified.
00155         'big5' => 1, // Chinese, traditional.
00156         'euc-kr' => 1, // Korean
00157         'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
00158     );
00159 
00160         // see  http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
00161         // http://czyborra.com/charsets/iso8859.html
00162     var $synonyms = array(
00163         'us' => 'ascii',
00164         'us-ascii' => 'ascii',
00165         'cp819' => 'iso-8859-1',
00166         'ibm819' => 'iso-8859-1',
00167         'iso-ir-100' => 'iso-8859-1',
00168         'iso-ir-101' => 'iso-8859-2',
00169         'iso-ir-109' => 'iso-8859-3',
00170         'iso-ir-110' => 'iso-8859-4',
00171         'iso-ir-144' => 'iso-8859-5',
00172         'iso-ir-127' => 'iso-8859-6',
00173         'iso-ir-126' => 'iso-8859-7',
00174         'iso-ir-138' => 'iso-8859-8',
00175         'iso-ir-148' => 'iso-8859-9',
00176         'iso-ir-157' => 'iso-8859-10',
00177         'iso-ir-179' => 'iso-8859-13',
00178         'iso-ir-199' => 'iso-8859-14',
00179         'iso-ir-203' => 'iso-8859-15',
00180         'csisolatin1' => 'iso-8859-1',
00181         'csisolatin2' => 'iso-8859-2',
00182         'csisolatin3' => 'iso-8859-3',
00183         'csisolatin5' => 'iso-8859-9',
00184         'csisolatin8' => 'iso-8859-14',
00185         'csisolatin9' => 'iso-8859-15',
00186         'csisolatingreek' => 'iso-8859-7',
00187         'iso-celtic' => 'iso-8859-14',
00188         'latin1' => 'iso-8859-1',
00189         'latin2' => 'iso-8859-2',
00190         'latin3' => 'iso-8859-3',
00191         'latin5' => 'iso-8859-9',
00192         'latin6' => 'iso-8859-10',
00193         'latin8' => 'iso-8859-14',
00194         'latin9' => 'iso-8859-15',
00195         'l1' => 'iso-8859-1',
00196         'l2' => 'iso-8859-2',
00197         'l3' => 'iso-8859-3',
00198         'l5' => 'iso-8859-9',
00199         'l6' => 'iso-8859-10',
00200         'l8' => 'iso-8859-14',
00201         'l9' => 'iso-8859-15',
00202         'cyrillic' => 'iso-8859-5',
00203         'arabic' => 'iso-8859-6',
00204         'tis-620' => 'iso-8859-11',
00205         'win874' => 'windows-874',
00206         'win1250' => 'windows-1250',
00207         'win1251' => 'windows-1251',
00208         'win1252' => 'windows-1252',
00209         'win1253' => 'windows-1253',
00210         'win1254' => 'windows-1254',
00211         'win1255' => 'windows-1255',
00212         'win1256' => 'windows-1256',
00213         'win1257' => 'windows-1257',
00214         'win1258' => 'windows-1258',
00215         'cp1250' => 'windows-1250',
00216         'cp1251' => 'windows-1251',
00217         'cp1252' => 'windows-1252',
00218         'ms-ee' => 'windows-1250',
00219         'ms-ansi' => 'windows-1252',
00220         'ms-greek' => 'windows-1253',
00221         'ms-turk' => 'windows-1254',
00222         'winbaltrim' => 'windows-1257',
00223         'koi-8ru' => 'koi-8r',
00224         'koi8r' => 'koi-8r',
00225         'cp878' => 'koi-8r',
00226         'mac' => 'macroman',
00227         'macintosh' => 'macroman',
00228         'euc-cn' => 'gb2312',
00229         'x-euc-cn' => 'gb2312',
00230         'euccn' => 'gb2312',
00231         'cp936' => 'gb2312',
00232         'big-5' => 'big5',
00233         'cp950' => 'big5',
00234         'eucjp' => 'euc-jp',
00235         'sjis' => 'shift_jis',
00236         'shift-jis' => 'shift_jis',
00237         'cp932' => 'shift_jis',
00238         'cp949' => 'euc-kr',
00239         'utf7' => 'utf-7',
00240         'utf8' => 'utf-8',
00241         'utf16' => 'utf-16',
00242         'utf32' => 'utf-32',
00243         'utf8' => 'utf-8',
00244         'ucs2' => 'ucs-2',
00245         'ucs4' => 'ucs-4',
00246     );
00247 
00248         // mapping of iso-639-1 language codes to script names
00249     var $lang_to_script = array(
00250             // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
00251         'ar' => 'arabic',
00252         'bg' => 'cyrillic', // Bulgarian
00253         'bs' => 'east_european', // Bosnian
00254         'cs' => 'east_european', // Czech
00255         'da' => 'west_european', // Danish
00256         'de' => 'west_european', // German
00257         'es' => 'west_european', // Spanish
00258         'et' => 'estonian',
00259         'eo' => 'unicode', // Esperanto
00260         'eu' => 'west_european', // Basque
00261         'fa' => 'arabic', // Persian
00262         'fi' => 'west_european', // Finish
00263         'fo' => 'west_european', // Faroese
00264         'fr' => 'west_european', // French
00265         'ga' => 'west_european', // Irish
00266         'gl' => 'west_european', // Galician
00267         'gr' => 'greek',
00268         'he' => 'hebrew', // Hebrew (since 1998)
00269         'hi' => 'unicode', // Hindi
00270         'hr' => 'east_european', // Croatian
00271         'hu' => 'east_european', // Hungarian
00272         'iw' => 'hebrew', // Hebrew (til 1998)
00273         'is' => 'west_european', // Icelandic
00274         'it' => 'west_european', // Italian
00275         'ja' => 'japanese',
00276         'ka' => 'unicode', // Georgian
00277         'kl' => 'west_european', // Greenlandic
00278         'km' => 'unicode', // Khmer
00279         'ko' => 'korean',
00280         'lt' => 'lithuanian',
00281         'lv' => 'west_european', // Latvian/Lettish
00282         'nl' => 'west_european', // Dutch
00283         'no' => 'west_european', // Norwegian
00284         'nb' => 'west_european', // Norwegian Bokmal
00285         'nn' => 'west_european', // Norwegian Nynorsk
00286         'pl' => 'east_european', // Polish
00287         'pt' => 'west_european', // Portuguese
00288         'ro' => 'east_european', // Romanian
00289         'ru' => 'cyrillic', // Russian
00290         'sk' => 'east_european', // Slovak
00291         'sl' => 'east_european', // Slovenian
00292         'sr' => 'cyrillic', // Serbian
00293         'sv' => 'west_european', // Swedish
00294         'sq' => 'albanian', // Albanian
00295         'th' => 'thai',
00296         'uk' => 'cyrillic', // Ukranian
00297         'vi' => 'vietnamese',
00298         'zh' => 'chinese',
00299             // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
00300             // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
00301         'ara' => 'arabic',
00302         'bgr' => 'cyrillic', // Bulgarian
00303         'cat' => 'west_european', // Catalan
00304         'chs' => 'simpl_chinese',
00305         'cht' => 'trad_chinese',
00306         'csy' => 'east_european', // Czech
00307         'dan' => 'west_european', // Danisch
00308         'deu' => 'west_european', // German
00309         'dea' => 'west_european', // German (Austrian)
00310         'des' => 'west_european', // German (Swiss)
00311         'ena' => 'west_european', // English (Australian)
00312         'enc' => 'west_european', // English (Canadian)
00313         'eng' => 'west_european', // English
00314         'enz' => 'west_european', // English (New Zealand)
00315         'enu' => 'west_european', // English (United States)
00316         'euq' => 'west_european', // Basque
00317         'fos' => 'west_european', // Faroese
00318         'far' => 'arabic', // Persian
00319         'fin' => 'west_european', // Finish
00320         'fra' => 'west_european', // French
00321         'frb' => 'west_european', // French (Belgian)
00322         'frc' => 'west_european', // French (Canadian)
00323         'frs' => 'west_european', // French (Swiss)
00324         'geo' => 'unicode', // Georgian
00325         'glg' => 'west_european', // Galician
00326         'ell' => 'greek',
00327         'heb' => 'hebrew',
00328         'hin' => 'unicode', // Hindi
00329         'hun' => 'east_european', // Hungarian
00330         'isl' => 'west_euorpean', // Icelandic
00331         'ita' => 'west_european', // Italian
00332         'its' => 'west_european', // Italian (Swiss)
00333         'jpn' => 'japanese',
00334         'khm' => 'unicode', // Khmer
00335         'kor' => 'korean',
00336         'lth' => 'lithuanian',
00337         'lvi' => 'west_european', // Latvian/Lettish
00338         'msl' => 'west_european', // Malay
00339         'nlb' => 'west_european', // Dutch (Belgian)
00340         'nld' => 'west_european', // Dutch
00341         'nor' => 'west_european', // Norwegian (bokmal)
00342         'non' => 'west_european', // Norwegian (nynorsk)
00343         'plk' => 'east_european', // Polish
00344         'ptg' => 'west_european', // Portuguese
00345         'ptb' => 'west_european', // Portuguese (Brazil)
00346         'rom' => 'east_european', // Romanian
00347         'rus' => 'cyrillic', // Russian
00348         'slv' => 'east_european', // Slovenian
00349         'sky' => 'east_european', // Slovak
00350         'srl' => 'east_european', // Serbian (Latin)
00351         'srb' => 'cyrillic', // Serbian (Cyrillic)
00352         'esp' => 'west_european', // Spanish (trad. sort)
00353         'esm' => 'west_european', // Spanish (Mexican)
00354         'esn' => 'west_european', // Spanish (internat. sort)
00355         'sve' => 'west_european', // Swedish
00356         'sqi' => 'albanian', // Albanian
00357         'tha' => 'thai',
00358         'trk' => 'turkish',
00359         'ukr' => 'cyrillic', // Ukrainian
00360             // English language names
00361         'albanian' => 'albanian',
00362         'arabic' => 'arabic',
00363         'basque' => 'west_european',
00364         'bosnian' => 'east_european',
00365         'bulgarian' => 'east_european',
00366         'catalan' => 'west_european',
00367         'croatian' => 'east_european',
00368         'czech' => 'east_european',
00369         'danish' => 'west_european',
00370         'dutch' => 'west_european',
00371         'english' => 'west_european',
00372         'esperanto' => 'unicode',
00373         'estonian' => 'estonian',
00374         'faroese' => 'west_european',
00375         'farsi' => 'arabic',
00376         'finnish' => 'west_european',
00377         'french' => 'west_european',
00378         'galician' => 'west_european',
00379         'georgian' => 'unicode',
00380         'german' => 'west_european',
00381         'greek' => 'greek',
00382         'greenlandic' => 'west_european',
00383         'hebrew' => 'hebrew',
00384         'hindi' => 'unicode',
00385         'hungarian' => 'east_european',
00386         'icelandic' => 'west_european',
00387         'italian' => 'west_european',
00388         'khmer' => 'unicode',
00389         'latvian' => 'west_european',
00390         'lettish' => 'west_european',
00391         'lithuanian' => 'lithuanian',
00392         'malay' => 'west_european',
00393         'norwegian' => 'west_european',
00394         'persian' => 'arabic',
00395         'polish' => 'east_european',
00396         'portuguese' => 'west_european',
00397         'russian' => 'cyrillic',
00398         'romanian' => 'east_european',
00399         'serbian' => 'cyrillic',
00400         'slovak' => 'east_european',
00401         'slovenian' => 'east_european',
00402         'spanish' => 'west_european',
00403         'svedish' => 'west_european',
00404         'that' => 'thai',
00405         'turkish' => 'turkish',
00406         'ukrainian' => 'cyrillic',
00407     );
00408 
00409         // mapping of language (family) names to charsets on Unix
00410     var $script_to_charset_unix = array(
00411         'west_european' => 'iso-8859-1',
00412         'estonian' => 'iso-8859-1',
00413         'east_european' => 'iso-8859-2',
00414         'baltic' => 'iso-8859-4',
00415         'cyrillic' => 'iso-8859-5',
00416         'arabic' => 'iso-8859-6',
00417         'greek' => 'iso-8859-7',
00418         'hebrew' => 'iso-8859-8',
00419         'turkish' => 'iso-8859-9',
00420         'thai' => 'iso-8859-11', // = TIS-620
00421         'lithuanian' => 'iso-8859-13',
00422         'chinese' => 'gb2312', // = euc-cn
00423         'japanese' => 'euc-jp',
00424         'korean' => 'euc-kr',
00425         'simpl_chinese' => 'gb2312',
00426         'trad_chinese' => 'big5',
00427         'vietnamese' => '',
00428         'unicode' => 'utf-8',
00429         'albanian' => 'utf-8'
00430     );
00431 
00432         // mapping of language (family) names to charsets on Windows
00433     var $script_to_charset_windows = array(
00434         'east_european' => 'windows-1250',
00435         'cyrillic' => 'windows-1251',
00436         'west_european' => 'windows-1252',
00437         'greek' => 'windows-1253',
00438         'turkish' => 'windows-1254',
00439         'hebrew' => 'windows-1255',
00440         'arabic' => 'windows-1256',
00441         'baltic' => 'windows-1257',
00442         'estonian' => 'windows-1257',
00443         'lithuanian' => 'windows-1257',
00444         'vietnamese' => 'windows-1258',
00445         'thai' => 'cp874',
00446         'korean' => 'cp949',
00447         'chinese' => 'gb2312',
00448         'japanese' => 'shift_jis',
00449         'simpl_chinese' => 'gb2312',
00450         'trad_chinese' => 'big5',
00451         'albanian' => 'windows-1250',
00452         'unicode' => 'utf-8'
00453     );
00454 
00455         // mapping of locale names to charsets
00456     var $locale_to_charset = array(
00457         'japanese.euc' => 'euc-jp',
00458         'ja_jp.ujis' => 'euc-jp',
00459         'korean.euc' => 'euc-kr',
00460         'sr@Latn' => 'iso-8859-2',
00461         'zh_cn' => 'gb2312',
00462         'zh_hk' => 'big5',
00463         'zh_tw' => 'big5',
00464     );
00465 
00466         // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
00467         // Empty values means "iso-8859-1"
00468     var $charSetArray = array(
00469         'dk' => '',
00470         'de' => '',
00471         'no' => '',
00472         'it' => '',
00473         'fr' => '',
00474         'es' => '',
00475         'nl' => '',
00476         'cz' => 'windows-1250',
00477         'pl' => 'iso-8859-2',
00478         'si' => 'windows-1250',
00479         'fi' => '',
00480         'tr' => 'iso-8859-9',
00481         'se' => '',
00482         'pt' => '',
00483         'ru' => 'windows-1251',
00484         'ro' => 'iso-8859-2',
00485         'ch' => 'gb2312',
00486         'sk' => 'windows-1250',
00487         'lt' => 'windows-1257',
00488         'is' => 'utf-8',
00489         'hr' => 'windows-1250',
00490         'hu' => 'iso-8859-2',
00491         'gl' => '',
00492         'th' => 'iso-8859-11',
00493         'gr' => 'iso-8859-7',
00494         'hk' => 'big5',
00495         'eu' => '',
00496         'bg' => 'windows-1251',
00497         'br' => '',
00498         'et' => 'iso-8859-4',
00499         'ar' => 'iso-8859-6',
00500         'he' => 'utf-8',
00501         'ua' => 'windows-1251',
00502         'jp' => 'shift_jis',
00503         'lv' => 'utf-8',
00504         'vn' => 'utf-8',
00505         'ca' => 'iso-8859-15',
00506         'ba' => 'iso-8859-2',
00507         'kr' => 'euc-kr',
00508         'eo' => 'utf-8',
00509         'my' => '',
00510         'hi' => 'utf-8',
00511         'fo' => 'utf-8',
00512         'fa' => 'utf-8',
00513         'sr' => 'utf-8',
00514         'sq' => 'utf-8',
00515         'ge' => 'utf-8',
00516         'ga' => '',
00517         'km' => 'utf-8',
00518         'qc' => '',
00519     );
00520 
00521         // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
00522         // Missing keys means: same as Typo3
00523     var $isoArray = array(
00524         'ba' => 'bs',
00525         'br' => 'pt_BR',
00526         'ch' => 'zh_CN',
00527         'cz' => 'cs',
00528         'dk' => 'da',
00529         'si' => 'sl',
00530         'se' => 'sv',
00531         'gl' => 'kl',
00532         'gr' => 'el',
00533         'hk' => 'zh_HK',
00534         'kr' => 'ko',
00535         'ua' => 'uk',
00536         'jp' => 'ja',
00537         'qc' => 'fr_CA',
00538         'vn' => 'vi',
00539         'ge' => 'ka',
00540         'ga' => 'gl',
00541     );
00542 
00543     /**
00544      * Normalize - changes input character set to lowercase letters.
00545      *
00546      * @param   string      Input charset
00547      * @return  string      Normalized charset
00548      * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
00549      */
00550     function parse_charset($charset) {
00551         $charset = trim(strtolower($charset));
00552         if (isset($this->synonyms[$charset])) {
00553             $charset = $this->synonyms[$charset];
00554         }
00555 
00556         return $charset;
00557     }
00558 
00559     /**
00560      * Get the charset of a locale.
00561      *
00562      * ln           language
00563      * ln_CN         language / country
00564      * ln_CN.cs   language / country / charset
00565      * ln_CN.cs@mod  language / country / charset / modifier
00566      *
00567      * @param   string      Locale string
00568      * @return  string      Charset resolved for locale string
00569      * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
00570      */
00571     function get_locale_charset($locale) {
00572         $locale = strtolower($locale);
00573 
00574             // exact locale specific charset?
00575         if (isset($this->locale_to_charset[$locale])) {
00576             return $this->locale_to_charset[$locale];
00577         }
00578 
00579             // get modifier
00580         list($locale, $modifier) = explode('@', $locale);
00581 
00582             // locale contains charset: use it
00583         list($locale, $charset) = explode('.', $locale);
00584         if ($charset) {
00585             return $this->parse_charset($charset);
00586         }
00587 
00588             // modifier is 'euro' (after charset check, because of xx.utf-8@euro)
00589         if ($modifier == 'euro') {
00590             return 'iso-8859-15';
00591         }
00592 
00593             // get language
00594         list($language, $country) = explode('_', $locale);
00595         if (isset($this->lang_to_script[$language])) {
00596             $script = $this->lang_to_script[$language];
00597         }
00598 
00599         if (TYPO3_OS == 'WIN') {
00600             $cs = $this->script_to_charset_windows[$script] ? $this->script_to_charset_windows[$script] : 'windows-1252';
00601         } else {
00602             $cs = $this->script_to_charset_unix[$script] ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
00603         }
00604 
00605         return $cs;
00606     }
00607 
00608 
00609     /********************************************
00610      *
00611      * Charset Conversion functions
00612      *
00613      ********************************************/
00614 
00615     /**
00616      * Convert from one charset to another charset.
00617      *
00618      * @param   string      Input string
00619      * @param   string      From charset (the current charset of the string)
00620      * @param   string      To charset (the output charset wanted)
00621      * @param   boolean     If set, then characters that are not available in the destination character set will be encoded as numeric entities
00622      * @return  string      Converted string
00623      * @see convArray()
00624      */
00625     function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
00626         if ($fromCS == $toCS) {
00627             return $str;
00628         }
00629 
00630             // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
00631         if ($toCS == 'utf-8' || !$useEntityForNoChar) {
00632             switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) {
00633                 case 'mbstring':
00634                     $conv_str = mb_convert_encoding($str, $toCS, $fromCS);
00635                     if (FALSE !== $conv_str) {
00636                         return $conv_str;
00637                     } // returns false for unsupported charsets
00638                     break;
00639 
00640                 case 'iconv':
00641                     $conv_str = iconv($fromCS, $toCS . '//TRANSLIT', $str);
00642                     if (FALSE !== $conv_str) {
00643                         return $conv_str;
00644                     }
00645                     break;
00646 
00647                 case 'recode':
00648                     $conv_str = recode_string($fromCS . '..' . $toCS, $str);
00649                     if (FALSE !== $conv_str) {
00650                         return $conv_str;
00651                     }
00652                     break;
00653             }
00654             // fallback to TYPO3 conversion
00655         }
00656 
00657         if ($fromCS != 'utf-8') {
00658             $str = $this->utf8_encode($str, $fromCS);
00659         }
00660         if ($toCS != 'utf-8') {
00661             $str = $this->utf8_decode($str, $toCS, $useEntityForNoChar);
00662         }
00663         return $str;
00664     }
00665 
00666     /**
00667      * Convert all elements in ARRAY with type string from one charset to another charset.
00668      * NOTICE: Array is passed by reference!
00669      *
00670      * @param   string      Input array, possibly multidimensional
00671      * @param   string      From charset (the current charset of the string)
00672      * @param   string      To charset (the output charset wanted)
00673      * @param   boolean     If set, then characters that are not available in the destination character set will be encoded as numeric entities
00674      * @return  void
00675      * @see conv()
00676      */
00677     function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
00678         foreach ($array as $key => $value) {
00679             if (is_array($array[$key])) {
00680                 $this->convArray($array[$key], $fromCS, $toCS, $useEntityForNoChar);
00681             } elseif (is_string($array[$key])) {
00682                 $array[$key] = $this->conv($array[$key], $fromCS, $toCS, $useEntityForNoChar);
00683             }
00684         }
00685     }
00686 
00687     /**
00688      * Converts $str from $charset to UTF-8
00689      *
00690      * @param   string      String in local charset to convert to UTF-8
00691      * @param   string      Charset, lowercase. Must be found in csconvtbl/ folder.
00692      * @return  string      Output string, converted to UTF-8
00693      */
00694     function utf8_encode($str, $charset) {
00695 
00696         if ($charset === 'utf-8') {
00697             return $str;
00698         }
00699 
00700             // Charset is case-insensitive.
00701         if ($this->initCharset($charset)) { // Parse conv. table if not already...
00702             $strLen = strlen($str);
00703             $outStr = '';
00704 
00705             for ($a = 0; $a < $strLen; $a++) { // Traverse each char in string.
00706                 $chr = substr($str, $a, 1);
00707                 $ord = ord($chr);
00708                 if (isset($this->twoByteSets[$charset])) { // If the charset has two bytes per char
00709                     $ord2 = ord($str{$a + 1});
00710                     $ord = $ord << 8 | $ord2; // assume big endian
00711 
00712                     if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
00713                         $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
00714                     } else {
00715                         $outStr .= chr($this->noCharByteVal);
00716                     } // No char exists
00717                     $a++;
00718                 } elseif ($ord > 127) { // If char has value over 127 it's a multibyte char in UTF-8
00719                     if (isset($this->eucBasedSets[$charset])) { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
00720                         if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) { // Shift-JIS: chars between 160 and 223 are single byte
00721                             $a++;
00722                             $ord2 = ord(substr($str, $a, 1));
00723                             $ord = $ord * 256 + $ord2;
00724                         }
00725                     }
00726 
00727                     if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
00728                         $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
00729                     } else {
00730                         $outStr .= chr($this->noCharByteVal);
00731                     } // No char exists
00732                 } else {
00733                     $outStr .= $chr;
00734                 } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
00735             }
00736             return $outStr;
00737         }
00738     }
00739 
00740     /**
00741      * Converts $str from UTF-8 to $charset
00742      *
00743      * @param   string      String in UTF-8 to convert to local charset
00744      * @param   string      Charset, lowercase. Must be found in csconvtbl/ folder.
00745      * @param   boolean     If set, then characters that are not available in the destination character set will be encoded as numeric entities
00746      * @return  string      Output string, converted to local charset
00747      */
00748     function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
00749 
00750         if ($charset === 'utf-8') {
00751             return $str;
00752         }
00753 
00754             // Charset is case-insensitive.
00755         if ($this->initCharset($charset)) { // Parse conv. table if not already...
00756             $strLen = strlen($str);
00757             $outStr = '';
00758             $buf = '';
00759             for ($a = 0, $i = 0; $a < $strLen; $a++, $i++) { // Traverse each char in UTF-8 string.
00760                 $chr = substr($str, $a, 1);
00761                 $ord = ord($chr);
00762                 if ($ord > 127) { // This means multibyte! (first byte!)
00763                     if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
00764 
00765                         $buf = $chr; // Add first byte
00766                         for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
00767                             $ord = $ord << 1; // Shift it left and ...
00768                             if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
00769                                 $a++; // Increase pointer...
00770                                 $buf .= substr($str, $a, 1); // ... and add the next char.
00771                             } else {
00772                                 break;
00773                             }
00774                         }
00775 
00776                         if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) { // If the UTF-8 char-sequence is found then...
00777                             $mByte = $this->parsedCharsets[$charset]['utf8'][$buf]; // The local number
00778                             if ($mByte > 255) { // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
00779                                 $outStr .= chr(($mByte >> 8) & 255) . chr($mByte & 255);
00780                             } else {
00781                                 $outStr .= chr($mByte);
00782                             }
00783                         } elseif ($useEntityForNoChar) { // Create num entity:
00784                             $outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
00785                         } else {
00786                             $outStr .= chr($this->noCharByteVal);
00787                         } // No char exists
00788                     } else {
00789                         $outStr .= chr($this->noCharByteVal);
00790                     } // No char exists (MIDDLE of MB sequence!)
00791                 } else {
00792                     $outStr .= $chr;
00793                 } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
00794             }
00795             return $outStr;
00796         }
00797     }
00798 
00799     /**
00800      * Converts all chars > 127 to numeric entities.
00801      *
00802      * @param   string      Input string
00803      * @return  string      Output string
00804      */
00805     function utf8_to_entities($str) {
00806         $strLen = strlen($str);
00807         $outStr = '';
00808         $buf = '';
00809         for ($a = 0; $a < $strLen; $a++) { // Traverse each char in UTF-8 string.
00810             $chr = substr($str, $a, 1);
00811             $ord = ord($chr);
00812             if ($ord > 127) { // This means multibyte! (first byte!)
00813                 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
00814                     $buf = $chr; // Add first byte
00815                     for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
00816                         $ord = $ord << 1; // Shift it left and ...
00817                         if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
00818                             $a++; // Increase pointer...
00819                             $buf .= substr($str, $a, 1); // ... and add the next char.
00820                         } else {
00821                             break;
00822                         }
00823                     }
00824 
00825                     $outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
00826                 } else {
00827                     $outStr .= chr($this->noCharByteVal);
00828                 } // No char exists (MIDDLE of MB sequence!)
00829             } else {
00830                 $outStr .= $chr;
00831             } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
00832         }
00833 
00834         return $outStr;
00835     }
00836 
00837     /**
00838      * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
00839      *
00840      * @param   string      Input string, UTF-8
00841      * @param   boolean     If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
00842      * @return  string      Output string
00843      */
00844     function entities_to_utf8($str, $alsoStdHtmlEnt = 0) {
00845         if ($alsoStdHtmlEnt) {
00846             $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES)); // Getting them in iso-8859-1 - but thats ok since this is observed below.
00847         }
00848 
00849         $token = md5(microtime());
00850         $parts = explode($token, preg_replace('/(&([#[:alnum:]]*);)/', $token . '${2}' . $token, $str));
00851         foreach ($parts as $k => $v) {
00852             if ($k % 2) {
00853                 if (substr($v, 0, 1) == '#') { // Dec or hex entities:
00854                     if (substr($v, 1, 1) == 'x') {
00855                         $parts[$k] = $this->UnumberToChar(hexdec(substr($v, 2)));
00856                     } else {
00857                         $parts[$k] = $this->UnumberToChar(substr($v, 1));
00858                     }
00859                 } elseif ($alsoStdHtmlEnt && $trans_tbl['&' . $v . ';']) { // Other entities:
00860                     $parts[$k] = $this->utf8_encode($trans_tbl['&' . $v . ';'], 'iso-8859-1');
00861                 } else { // No conversion:
00862                     $parts[$k] = '&' . $v . ';';
00863                 }
00864             }
00865         }
00866 
00867         return implode('', $parts);
00868     }
00869 
00870     /**
00871      * Converts all chars in the input UTF-8 string into integer numbers returned in an array
00872      *
00873      * @param   string      Input string, UTF-8
00874      * @param   boolean     If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
00875      * @param   boolean     If set, then instead of integer numbers the real UTF-8 char is returned.
00876      * @return  array       Output array with the char numbers
00877      */
00878     function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
00879             // If entities must be registered as well...:
00880         if ($convEntities) {
00881             $str = $this->entities_to_utf8($str, 1);
00882         }
00883             // Do conversion:
00884         $strLen = strlen($str);
00885         $outArr = array();
00886         $buf = '';
00887         for ($a = 0; $a < $strLen; $a++) { // Traverse each char in UTF-8 string.
00888             $chr = substr($str, $a, 1);
00889             $ord = ord($chr);
00890             if ($ord > 127) { // This means multibyte! (first byte!)
00891                 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
00892                     $buf = $chr; // Add first byte
00893                     for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
00894                         $ord = $ord << 1; // Shift it left and ...
00895                         if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
00896                             $a++; // Increase pointer...
00897                             $buf .= substr($str, $a, 1); // ... and add the next char.
00898                         } else {
00899                             break;
00900                         }
00901                     }
00902 
00903                     $outArr[] = $retChar ? $buf : $this->utf8CharToUnumber($buf);
00904                 } else {
00905                     $outArr[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
00906                 } // No char exists (MIDDLE of MB sequence!)
00907             } else {
00908                 $outArr[] = $retChar ? chr($ord) : $ord;
00909             } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
00910         }
00911 
00912         return $outArr;
00913     }
00914 
00915     /**
00916      * Converts a UNICODE number to a UTF-8 multibyte character
00917      * Algorithm based on script found at From: http://czyborra.com/utf/
00918      * Unit-tested by Kasper
00919      *
00920      * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
00921      *
00922      *  bytes | bits | representation
00923      *    1 |   7 | 0vvvvvvv
00924      *    2 |   11 | 110vvvvv 10vvvvvv
00925      *    3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
00926      *    4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
00927      *    5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
00928      *    6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
00929      *
00930      * @param   integer     UNICODE integer
00931      * @return  string      UTF-8 multibyte character string
00932      * @see utf8CharToUnumber()
00933      */
00934     function UnumberToChar($cbyte) {
00935         $str = '';
00936 
00937         if ($cbyte < 0x80) {
00938             $str .= chr($cbyte);
00939         } else {
00940             if ($cbyte < 0x800) {
00941                 $str .= chr(0xC0 | ($cbyte >> 6));
00942                 $str .= chr(0x80 | ($cbyte & 0x3F));
00943             } else {
00944                 if ($cbyte < 0x10000) {
00945                     $str .= chr(0xE0 | ($cbyte >> 12));
00946                     $str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
00947                     $str .= chr(0x80 | ($cbyte & 0x3F));
00948                 } else {
00949                     if ($cbyte < 0x200000) {
00950                         $str .= chr(0xF0 | ($cbyte >> 18));
00951                         $str .= chr(0x80 | (($cbyte >> 12) & 0x3F));
00952                         $str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
00953                         $str .= chr(0x80 | ($cbyte & 0x3F));
00954                     } else {
00955                         if ($cbyte < 0x4000000) {
00956                             $str .= chr(0xF8 | ($cbyte >> 24));
00957                             $str .= chr(0x80 | (($cbyte >> 18) & 0x3F));
00958                             $str .= chr(0x80 | (($cbyte >> 12) & 0x3F));
00959                             $str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
00960                             $str .= chr(0x80 | ($cbyte & 0x3F));
00961                         } else {
00962                             if ($cbyte < 0x80000000) {
00963                                 $str .= chr(0xFC | ($cbyte >> 30));
00964                                 $str .= chr(0x80 | (($cbyte >> 24) & 0x3F));
00965                                 $str .= chr(0x80 | (($cbyte >> 18) & 0x3F));
00966                                 $str .= chr(0x80 | (($cbyte >> 12) & 0x3F));
00967                                 $str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
00968                                 $str .= chr(0x80 | ($cbyte & 0x3F));
00969                             } else { // Cannot express a 32-bit character in UTF-8
00970                                 $str .= chr($this->noCharByteVal);
00971                             }
00972                         }
00973                     }
00974                 }
00975             }
00976         }
00977         return $str;
00978     }
00979 
00980     /**
00981      * Converts a UTF-8 Multibyte character to a UNICODE number
00982      * Unit-tested by Kasper
00983      *
00984      * @param   string      UTF-8 multibyte character string
00985      * @param   boolean     If set, then a hex. number is returned.
00986      * @return  integer     UNICODE integer
00987      * @see UnumberToChar()
00988      */
00989     function utf8CharToUnumber($str, $hex = 0) {
00990         $ord = ord(substr($str, 0, 1)); // First char
00991 
00992         if (($ord & 192) == 192) { // This verifyes that it IS a multi byte string
00993             $binBuf = '';
00994             for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
00995                 $ord = $ord << 1; // Shift it left and ...
00996                 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
00997                     $binBuf .= substr('00000000' . decbin(ord(substr($str, $b + 1, 1))), -6);
00998                 } else {
00999                     break;
01000                 }
01001             }
01002             $binBuf = substr('00000000' . decbin(ord(substr($str, 0, 1))), -(6 - $b)) . $binBuf;
01003 
01004             $int = bindec($binBuf);
01005         } else {
01006             $int = $ord;
01007         }
01008 
01009         return $hex ? 'x' . dechex($int) : $int;
01010     }
01011 
01012 
01013     /********************************************
01014      *
01015      * Init functions
01016      *
01017      ********************************************/
01018 
01019     /**
01020      * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
01021      * This function is automatically called by the conversion functions
01022      *
01023      * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
01024      *
01025      * @param   string      The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
01026      * @return  integer     Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
01027      * @access private
01028      */
01029     function initCharset($charset) {
01030             // Only process if the charset is not yet loaded:
01031         if (!is_array($this->parsedCharsets[$charset])) {
01032 
01033                 // Conversion table filename:
01034             $charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
01035 
01036                 // If the conversion table is found:
01037             if ($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile)) {
01038                     // Cache file for charsets:
01039                     // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
01040                 $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
01041                 if ($cacheFile && @is_file($cacheFile)) {
01042                     $this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
01043                 } else {
01044                         // Parse conversion table into lines:
01045                     $lines = t3lib_div::trimExplode(LF, t3lib_div::getUrl($charsetConvTableFile), 1);
01046                         // Initialize the internal variable holding the conv. table:
01047                     $this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());
01048                         // traverse the lines:
01049                     $detectedType = '';
01050                     foreach ($lines as $value) {
01051                         if (trim($value) && substr($value, 0, 1) != '#') { // Comment line or blanks are ignored.
01052 
01053                                 // Detect type if not done yet: (Done on first real line)
01054                                 // The "whitespaced" type is on the syntax  "0x0A   0x000A  #LINE FEED"     while   "ms-token" is like      "B9 = U+00B9 : SUPERSCRIPT ONE"
01055                             if (!$detectedType) {
01056                                 $detectedType = preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/', $value) ? 'whitespaced' : 'ms-token';
01057                             }
01058 
01059                             if ($detectedType == 'ms-token') {
01060                                 list($hexbyte, $utf8) = preg_split('/[=:]/', $value, 3);
01061                             } elseif ($detectedType == 'whitespaced') {
01062                                 $regA = array();
01063                                 preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/', $value, $regA);
01064                                 $hexbyte = $regA[1];
01065                                 $utf8 = 'U+' . $regA[2];
01066                             }
01067                             $decval = hexdec(trim($hexbyte));
01068                             if ($decval > 127) {
01069                                 $utf8decval = hexdec(substr(trim($utf8), 2));
01070                                 $this->parsedCharsets[$charset]['local'][$decval] = $this->UnumberToChar($utf8decval);
01071                                 $this->parsedCharsets[$charset]['utf8'][$this->parsedCharsets[$charset]['local'][$decval]] = $decval;
01072                             }
01073                         }
01074                     }
01075                     if ($cacheFile) {
01076                         t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
01077                     }
01078                 }
01079                 return 2;
01080             } else {
01081                 return FALSE;
01082             }
01083         } else {
01084             return 1;
01085         }
01086     }
01087 
01088     /**
01089      * This function initializes all UTF-8 character data tables.
01090      *
01091      * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
01092      *
01093      * @param   string      Mode ("case", "ascii", ...)
01094      * @return  integer     Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
01095      * @access private
01096      */
01097     function initUnicodeData($mode = NULL) {
01098             // cache files
01099         $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
01100         $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
01101 
01102             // Only process if the tables are not yet loaded
01103         switch ($mode) {
01104             case 'case':
01105                 if (is_array($this->caseFolding['utf-8'])) {
01106                     return 1;
01107                 }
01108 
01109                     // Use cached version if possible
01110                 if ($cacheFileCase && @is_file($cacheFileCase)) {
01111                     $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
01112                     return 2;
01113                 }
01114                 break;
01115 
01116             case 'ascii':
01117                 if (is_array($this->toASCII['utf-8'])) {
01118                     return 1;
01119                 }
01120 
01121                     // Use cached version if possible
01122                 if ($cacheFileASCII && @is_file($cacheFileASCII)) {
01123                     $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
01124                     return 2;
01125                 }
01126                 break;
01127         }
01128 
01129             // process main Unicode data file
01130         $unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
01131         if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
01132             return FALSE;
01133         }
01134 
01135         $fh = fopen($unicodeDataFile, 'rb');
01136         if (!$fh) {
01137             return FALSE;
01138         }
01139 
01140             // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
01141             // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
01142         $this->caseFolding['utf-8'] = array();
01143         $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
01144         $utf8CaseFolding['toUpper'] = array();
01145         $utf8CaseFolding['toLower'] = array();
01146         $utf8CaseFolding['toTitle'] = array();
01147 
01148         $decomposition = array(); // array of temp. decompositions
01149         $mark = array(); // array of chars that are marks (eg. composing accents)
01150         $number = array(); // array of chars that are numbers (eg. digits)
01151         $omit = array(); // array of chars to be omitted (eg. Russian hard sign)
01152 
01153         while (!feof($fh)) {
01154             $line = fgets($fh, 4096);
01155                 // has a lot of info
01156             list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title,) = explode(';', rtrim($line));
01157 
01158             $ord = hexdec($char);
01159             if ($ord > 0xFFFF) {
01160                 break;
01161             } // only process the BMP
01162 
01163             $utf8_char = $this->UnumberToChar($ord);
01164 
01165             if ($upper) {
01166                 $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
01167             }
01168             if ($lower) {
01169                 $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
01170             }
01171                 // store "title" only when different from "upper" (only a few)
01172             if ($title && $title != $upper) {
01173                 $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
01174             }
01175 
01176             switch ($cat{0}) {
01177                 case 'M': // mark (accent, umlaut, ...)
01178                     $mark["U+$char"] = 1;
01179                     break;
01180 
01181                 case 'N': // numeric value
01182                     if ($ord > 0x80 && $num != '') {
01183                         $number["U+$char"] = $num;
01184                     }
01185             }
01186 
01187                 // accented Latin letters without "official" decomposition
01188             $match = array();
01189             if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
01190                 $c = ord($match[2]);
01191                 if ($match[1] == 'SMALL') {
01192                     $c += 32;
01193                 }
01194 
01195                 $decomposition["U+$char"] = array(dechex($c));
01196                 continue;
01197             }
01198 
01199             $match = array();
01200             if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
01201                 switch ($match[1]) {
01202                     case '<circle>': // add parenthesis as circle replacement, eg (1)
01203                         $match[2] = '0028 ' . $match[2] . ' 0029';
01204                         break;
01205 
01206                     case '<square>': // add square brackets as square replacement, eg [1]
01207                         $match[2] = '005B ' . $match[2] . ' 005D';
01208                         break;
01209 
01210                     case '<compat>': // ignore multi char decompositions that start with a space
01211                         if (preg_match('/^0020 /', $match[2])) {
01212                             continue 2;
01213                         }
01214                         break;
01215 
01216                         // ignore Arabic and vertical layout presentation decomposition
01217                     case '<initial>':
01218                     case '<medial>':
01219                     case '<final>':
01220                     case '<isolated>':
01221                     case '<vertical>':
01222                         continue 2;
01223                 }
01224                 $decomposition["U+$char"] = explode(' ', $match[2]);
01225             }
01226         }
01227         fclose($fh);
01228 
01229             // process additional Unicode data for casing (allow folded characters to expand into a sequence)
01230         $specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
01231         if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
01232             $fh = fopen($specialCasingFile, 'rb');
01233             if ($fh) {
01234                 while (!feof($fh)) {
01235                     $line = fgets($fh, 4096);
01236                     if ($line{0} != '#' && trim($line) != '') {
01237 
01238                         list($char, $lower, $title, $upper, $cond) = t3lib_div::trimExplode(';', $line);
01239                         if ($cond == '' || $cond{0} == '#') {
01240                             $utf8_char = $this->UnumberToChar(hexdec($char));
01241                             if ($char != $lower) {
01242                                 $arr = explode(' ', $lower);
01243                                 for ($i = 0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
01244                                 $utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
01245                             }
01246                             if ($char != $title && $title != $upper) {
01247                                 $arr = explode(' ', $title);
01248                                 for ($i = 0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
01249                                 $utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
01250                             }
01251                             if ($char != $upper) {
01252                                 $arr = explode(' ', $upper);
01253                                 for ($i = 0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
01254                                 $utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
01255                             }
01256                         }
01257                     }
01258                 }
01259                 fclose($fh);
01260             }
01261         }
01262 
01263             // process custom decompositions
01264         $customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
01265         if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
01266             $fh = fopen($customTranslitFile, 'rb');
01267             if ($fh) {
01268                 while (!feof($fh)) {
01269                     $line = fgets($fh, 4096);
01270                     if ($line{0} != '#' && trim($line) != '') {
01271                         list($char, $translit) = t3lib_div::trimExplode(';', $line);
01272                         if (!$translit) {
01273                             $omit["U+$char"] = 1;
01274                         }
01275                         $decomposition["U+$char"] = explode(' ', $translit);
01276 
01277                     }
01278                 }
01279                 fclose($fh);
01280             }
01281         }
01282 
01283             // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
01284         foreach ($decomposition as $from => $to) {
01285             $code_decomp = array();
01286 
01287             while ($code_value = array_shift($to)) {
01288                 if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
01289                     foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
01290                         array_unshift($to, $cv);
01291                     }
01292                 } elseif (!isset($mark["U+$code_value"])) { // remove mark
01293                     array_push($code_decomp, $code_value);
01294                 }
01295             }
01296             if (count($code_decomp) || isset($omit[$from])) {
01297                 $decomposition[$from] = $code_decomp;
01298             } else {
01299                 unset($decomposition[$from]);
01300             }
01301         }
01302 
01303             // create ascii only mapping
01304         $this->toASCII['utf-8'] = array();
01305         $ascii =& $this->toASCII['utf-8'];
01306 
01307         foreach ($decomposition as $from => $to) {
01308             $code_decomp = array();
01309             while ($code_value = array_shift($to)) {
01310                 $ord = hexdec($code_value);
01311                 if ($ord > 127) {
01312                     continue 2;
01313                 } // skip decompositions containing non-ASCII chars
01314                 else
01315                 {
01316                     array_push($code_decomp, chr($ord));
01317                 }
01318             }
01319             $ascii[$this->UnumberToChar(hexdec($from))] = join('', $code_decomp);
01320         }
01321 
01322             // add numeric decompositions
01323         foreach ($number as $from => $to) {
01324             $utf8_char = $this->UnumberToChar(hexdec($from));
01325             if (!isset($ascii[$utf8_char])) {
01326                 $ascii[$utf8_char] = $to;
01327             }
01328         }
01329 
01330         if ($cacheFileCase) {
01331             t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
01332         }
01333 
01334         if ($cacheFileASCII) {
01335             t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
01336         }
01337 
01338         return 3;
01339     }
01340 
01341     /**
01342      * This function initializes the folding table for a charset other than UTF-8.
01343      * This function is automatically called by the case folding functions.
01344      *
01345      * @param   string      Charset for which to initialize case folding.
01346      * @return  integer     Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
01347      * @access private
01348      */
01349     function initCaseFolding($charset) {
01350             // Only process if the case table is not yet loaded:
01351         if (is_array($this->caseFolding[$charset])) {
01352             return 1;
01353         }
01354 
01355             // Use cached version if possible
01356         $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
01357         if ($cacheFile && @is_file($cacheFile)) {
01358             $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
01359             return 2;
01360         }
01361 
01362             // init UTF-8 conversion for this charset
01363         if (!$this->initCharset($charset)) {
01364             return FALSE;
01365         }
01366 
01367             // UTF-8 case folding is used as the base conversion table
01368         if (!$this->initUnicodeData('case')) {
01369             return FALSE;
01370         }
01371 
01372         $nochar = chr($this->noCharByteVal);
01373         foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8) {
01374                 // reconvert to charset (don't use chr() of numeric value, might be muli-byte)
01375             $c = $this->utf8_decode($utf8, $charset);
01376 
01377                 // $cc = $this->conv($this->caseFolding['utf-8']['toUpper'][$utf8], 'utf-8', $charset);
01378             $cc = $this->utf8_decode($this->caseFolding['utf-8']['toUpper'][$utf8], $charset);
01379             if ($cc != '' && $cc != $nochar) {
01380                 $this->caseFolding[$charset]['toUpper'][$c] = $cc;
01381             }
01382 
01383                 // $cc = $this->conv($this->caseFolding['utf-8']['toLower'][$utf8], 'utf-8', $charset);
01384             $cc = $this->utf8_decode($this->caseFolding['utf-8']['toLower'][$utf8], $charset);
01385             if ($cc != '' && $cc != $nochar) {
01386                 $this->caseFolding[$charset]['toLower'][$c] = $cc;
01387             }
01388 
01389                 // $cc = $this->conv($this->caseFolding['utf-8']['toTitle'][$utf8], 'utf-8', $charset);
01390             $cc = $this->utf8_decode($this->caseFolding['utf-8']['toTitle'][$utf8], $charset);
01391             if ($cc != '' && $cc != $nochar) {
01392                 $this->caseFolding[$charset]['toTitle'][$c] = $cc;
01393             }
01394         }
01395 
01396             // add the ASCII case table
01397         for ($i = ord('a'); $i <= ord('z'); $i++) {
01398             $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
01399         }
01400         for ($i = ord('A'); $i <= ord('Z'); $i++) {
01401             $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
01402         }
01403 
01404         if ($cacheFile) {
01405             t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
01406         }
01407 
01408         return 3;
01409     }
01410 
01411     /**
01412      * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
01413      * This function is automatically called by the ASCII transliteration functions.
01414      *
01415      * @param   string      Charset for which to initialize conversion.
01416      * @return  integer     Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
01417      * @access private
01418      */
01419     function initToASCII($charset) {
01420             // Only process if the case table is not yet loaded:
01421         if (is_array($this->toASCII[$charset])) {
01422             return 1;
01423         }
01424 
01425             // Use cached version if possible
01426         $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
01427         if ($cacheFile && @is_file($cacheFile)) {
01428             $this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
01429             return 2;
01430         }
01431 
01432             // init UTF-8 conversion for this charset
01433         if (!$this->initCharset($charset)) {
01434             return FALSE;
01435         }
01436 
01437             // UTF-8/ASCII transliteration is used as the base conversion table
01438         if (!$this->initUnicodeData('ascii')) {
01439             return FALSE;
01440         }
01441 
01442         $nochar = chr($this->noCharByteVal);
01443         foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8) {
01444                 // reconvert to charset (don't use chr() of numeric value, might be muli-byte)
01445             $c = $this->utf8_decode($utf8, $charset);
01446 
01447             if (isset($this->toASCII['utf-8'][$utf8])) {
01448                 $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
01449             }
01450         }
01451 
01452         if ($cacheFile) {
01453             t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
01454         }
01455 
01456         return 3;
01457     }
01458 
01459 
01460     /********************************************
01461      *
01462      * String operation functions
01463      *
01464      ********************************************/
01465 
01466     /**
01467      * Returns a part of a string.
01468      * Unit-tested by Kasper (single byte charsets only)
01469      *
01470      * @param   string      The character set
01471      * @param   string      Character string
01472      * @param   integer     Start position (character position)
01473      * @param   integer     Length (in characters)
01474      * @return  string      The substring
01475      * @see substr(), mb_substr()
01476      * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
01477      */
01478     function substr($charset, $string, $start, $len = NULL) {
01479         if ($len === 0 || $string === '') {
01480             return '';
01481         }
01482 
01483         if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
01484                 // cannot omit $len, when specifying charset
01485             if ($len == NULL) {
01486                 $enc = mb_internal_encoding(); // save internal encoding
01487                 mb_internal_encoding($charset);
01488                 $str = mb_substr($string, $start);
01489                 mb_internal_encoding($enc); // restore internal encoding
01490 
01491                 return $str;
01492             }
01493             else {
01494                 return mb_substr($string, $start, $len, $charset);
01495             }
01496         } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
01497                 // cannot omit $len, when specifying charset
01498             if ($len == NULL) {
01499                 $enc = iconv_get_encoding('internal_encoding'); // save internal encoding
01500                 iconv_set_encoding('internal_encoding', $charset);
01501                 $str = iconv_substr($string, $start);
01502                 iconv_set_encoding('internal_encoding', $enc); // restore internal encoding
01503 
01504                 return $str;
01505             }
01506             else {
01507                 return iconv_substr($string, $start, $len, $charset);
01508             }
01509         } elseif ($charset == 'utf-8') {
01510             return $this->utf8_substr($string, $start, $len);
01511         } elseif ($this->eucBasedSets[$charset]) {
01512             return $this->euc_substr($string, $start, $charset, $len);
01513         } elseif ($this->twoByteSets[$charset]) {
01514             return substr($string, $start * 2, $len * 2);
01515         } elseif ($this->fourByteSets[$charset]) {
01516             return substr($string, $start * 4, $len * 4);
01517         }
01518 
01519             // treat everything else as single-byte encoding
01520         return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
01521     }
01522 
01523     /**
01524      * Counts the number of characters.
01525      * Unit-tested by Kasper (single byte charsets only)
01526      *
01527      * @param   string      The character set
01528      * @param   string      Character string
01529      * @return  integer     The number of characters
01530      * @see strlen()
01531      * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
01532      */
01533     function strlen($charset, $string) {
01534         if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
01535             return mb_strlen($string, $charset);
01536         } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
01537             return iconv_strlen($string, $charset);
01538         } elseif ($charset == 'utf-8') {
01539             return $this->utf8_strlen($string);
01540         } elseif ($this->eucBasedSets[$charset]) {
01541             return $this->euc_strlen($string, $charset);
01542         } elseif ($this->twoByteSets[$charset]) {
01543             return strlen($string) / 2;
01544         } elseif ($this->fourByteSets[$charset]) {
01545             return strlen($string) / 4;
01546         }
01547             // treat everything else as single-byte encoding
01548         return strlen($string);
01549     }
01550 
01551     /**
01552      * Method to crop strings using the mb_substr function.
01553      *
01554      * @param  string       The character set
01555      * @param  string       String to be cropped
01556      * @param  integer      Crop length (in characters)
01557      * @param  string       Crop signifier
01558      * @return string       The shortened string
01559      * @see mb_strlen(), mb_substr()
01560      */
01561     protected function cropMbstring($charset, $string, $len, $crop = '') {
01562         if (intval($len) === 0 || mb_strlen($string, $charset) <= abs($len)) {
01563             return $string;
01564         }
01565 
01566         if ($len > 0) {
01567             $string = mb_substr($string, 0, $len, $charset) . $crop;
01568         } else {
01569             $string = $crop . mb_substr($string, $len, mb_strlen($string, $charset), $charset);
01570         }
01571 
01572         return $string;
01573     }
01574 
01575     /**
01576      * Truncates a string and pre-/appends a string.
01577      * Unit tested by Kasper
01578      *
01579      * @param   string      The character set
01580      * @param   string      Character string
01581      * @param   integer     Length (in characters)
01582      * @param   string      Crop signifier
01583      * @return  string      The shortened string
01584      * @see substr(), mb_strimwidth()
01585      * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
01586      */
01587     function crop($charset, $string, $len, $crop = '') {
01588         if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
01589             return $this->cropMbstring($charset, $string, $len, $crop);
01590         }
01591 
01592         if (intval($len) == 0) {
01593             return $string;
01594         }
01595 
01596         if ($charset == 'utf-8') {
01597             $i = $this->utf8_char2byte_pos($string, $len);
01598         } elseif ($this->eucBasedSets[$charset]) {
01599             $i = $this->euc_char2byte_pos($string, $len, $charset);
01600         } else {
01601             if ($len > 0) {
01602                 $i = $len;
01603             } else {
01604                 $i = strlen($string) + $len;
01605                 if ($i <= 0) {
01606                     $i = FALSE;
01607                 }
01608             }
01609         }
01610 
01611         if ($i === FALSE) { // $len outside actual string length
01612             return $string;
01613         } else {
01614             if ($len > 0) {
01615                 if (strlen($string{$i})) {
01616                     return substr($string, 0, $i) . $crop;
01617 
01618                 }
01619             } else {
01620                 if (strlen($string{$i - 1})) {
01621                     return $crop . substr($string, $i);
01622                 }
01623             }
01624 
01625             /*
01626                if (abs($len)<$this->strlen($charset,$string))   {   // Has to use ->strlen() - otherwise multibyte strings ending with a multibyte char will return true here (which is not a catastrophe, but...)
01627                    if ($len > 0)    {
01628                        return substr($string,0,$i).$crop;
01629                    } else {
01630                        return $crop.substr($string,$i);
01631                    }
01632                }
01633    */
01634         }
01635         return $string;
01636     }
01637 
01638     /**
01639      * Cuts a string short at a given byte length.
01640      *
01641      * @param   string      The character set
01642      * @param   string      Character string
01643      * @param   integer     The byte length
01644      * @return  string      The shortened string
01645      * @see mb_strcut()
01646      * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
01647      */
01648     function strtrunc($charset, $string, $len) {
01649         if ($len <= 0) {
01650             return '';
01651         }
01652 
01653         if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
01654             return mb_strcut($string, 0, $len, $charset);
01655         } elseif ($charset == 'utf-8') {
01656             return $this->utf8_strtrunc($string, $len);
01657         } elseif ($this->eucBasedSets[$charset]) {
01658             return $this->euc_strtrunc($string, $len, $charset);
01659         } elseif ($this->twoByteSets[$charset]) {
01660             if ($len % 2) {
01661                 $len--;
01662             } // don't cut at odd positions
01663         } elseif ($this->fourByteSets[$charset]) {
01664             $x = $len % 4;
01665             $len -= $x; // realign to position dividable by four
01666         }
01667             // treat everything else as single-byte encoding
01668         return substr($string, 0, $len);
01669     }
01670 
01671     /**
01672      * Translates all characters of a string into their respective case values.
01673      * Unlike strtolower() and strtoupper() this method is locale independent.
01674      * Note that the string length may change!
01675      * eg. lower case German "ß" (sharp S) becomes upper case "SS"
01676      * Unit-tested by Kasper
01677      * Real case folding is language dependent, this method ignores this fact.
01678      *
01679      * @param   string      Character set of string
01680      * @param   string      Input string to convert case for
01681      * @param   string      Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
01682      * @return  string      The converted string
01683      * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
01684      * @see strtolower(), strtoupper()
01685      */
01686     function conv_case($charset, $string, $case) {
01687         if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
01688             if ($case == 'toLower') {
01689                 $string = mb_strtolower($string, $charset);
01690             } else {
01691                 $string = mb_strtoupper($string, $charset);
01692             }
01693         } elseif ($charset == 'utf-8') {
01694             $string = $this->utf8_char_mapping($string, 'case', $case);
01695         } elseif (isset($this->eucBasedSets[$charset])) {
01696             $string = $this->euc_char_mapping($string, $charset, 'case', $case);
01697         } else {
01698                 // treat everything else as single-byte encoding
01699             $string = $this->sb_char_mapping($string, $charset, 'case', $case);
01700         }
01701 
01702         return $string;
01703     }
01704 
01705     /**
01706      * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
01707      *
01708      * @param   string      Character set of string
01709      * @param   string      Input string to convert
01710      * @return  string      The converted string
01711      */
01712     function specCharsToASCII($charset, $string) {
01713         if ($charset == 'utf-8') {
01714             $string = $this->utf8_char_mapping($string, 'ascii');
01715         } elseif (isset($this->eucBasedSets[$charset])) {
01716             $string = $this->euc_char_mapping($string, $charset, 'ascii');
01717         } else {
01718                 // treat everything else as single-byte encoding
01719             $string = $this->sb_char_mapping($string, $charset, 'ascii');
01720         }
01721 
01722         return $string;
01723     }
01724 
01725 
01726     /**
01727      * converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
01728      * into a TYPO3-readable language code
01729      * @param   $languageCodesList  list of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
01730      *           see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
01731      * @return  string  a preferred language that TYPO3 supports, or "default" if none found
01732      * @author  Benjamin Mack (benni.typo3.org)
01733      */
01734     public function getPreferredClientLanguage($languageCodesList) {
01735         $allLanguageCodes = array();
01736         $selectedLanguage = 'default';
01737 
01738             // get all languages where TYPO3 code is the same as the ISO code
01739         foreach ($this->charSetArray as $typo3Lang => $charSet) {
01740             $allLanguageCodes[$typo3Lang] = $typo3Lang;
01741         }
01742 
01743             // get all languages where TYPO3 code differs from ISO code
01744             // or needs the country part
01745             // the iso codes will here overwrite the default typo3 language in the key
01746         foreach ($this->isoArray as $typo3Lang => $isoLang) {
01747             $isoLang = join('-', explode('_', $isoLang));
01748             $allLanguageCodes[$typo3Lang] = $isoLang;
01749         }
01750 
01751             // move the iso codes to the (because we're comparing the keys with "isset" later on)
01752         $allLanguageCodes = array_flip($allLanguageCodes);
01753 
01754 
01755         $preferredLanguages = t3lib_div::trimExplode(',', $languageCodesList);
01756             // order the preferred languages after they key
01757         $sortedPreferredLanguages = array();
01758         foreach ($preferredLanguages as $preferredLanguage) {
01759             $quality = 1.0;
01760             if (strpos($preferredLanguage, ';q=') !== FALSE) {
01761                 list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
01762             }
01763             $sortedPreferredLanguages[$preferredLanguage] = $quality;
01764         }
01765 
01766             // loop through the languages, with the highest priority first
01767         arsort($sortedPreferredLanguages, SORT_NUMERIC);
01768         foreach ($sortedPreferredLanguages as $preferredLanguage => $quality) {
01769             if (isset($allLanguageCodes[$preferredLanguage])) {
01770                 $selectedLanguage = $allLanguageCodes[$preferredLanguage];
01771                 break;
01772             }
01773 
01774                 // strip the country code from the end
01775             list($preferredLanguage, $preferredCountry) = explode('-', $preferredLanguage);
01776             if (isset($allLanguageCodes[$preferredLanguage])) {
01777                 $selectedLanguage = $allLanguageCodes[$preferredLanguage];
01778                 break;
01779             }
01780         }
01781         if (!$selectedLanguage || $selectedLanguage == 'en') {
01782             $selectedLanguage = 'default';
01783         }
01784         return $selectedLanguage;
01785     }
01786 
01787 
01788     /********************************************
01789      *
01790      * Internal string operation functions
01791      *
01792      ********************************************/
01793 
01794     /**
01795      * Maps all characters of a string in a single byte charset.
01796      *
01797      * @param   string      the string
01798      * @param   string      the charset
01799      * @param   string      mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
01800      * @param   string      'case': conversion 'toLower' or 'toUpper'
01801      * @return  string      the converted string
01802      * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
01803      */
01804     function sb_char_mapping($str, $charset, $mode, $opt = '') {
01805         switch ($mode) {
01806             case 'case':
01807                 if (!$this->initCaseFolding($charset)) {
01808                     return $str;
01809                 } // do nothing
01810                 $map =& $this->caseFolding[$charset][$opt];
01811                 break;
01812 
01813             case 'ascii':
01814                 if (!$this->initToASCII($charset)) {
01815                     return $str;
01816                 } // do nothing
01817                 $map =& $this->toASCII[$charset];
01818                 break;
01819 
01820             default:
01821                 return $str;
01822         }
01823 
01824         $out = '';
01825         for ($i = 0; strlen($str{$i}); $i++) {
01826             $c = $str{$i};
01827             if (isset($map[$c])) {
01828                 $out .= $map[$c];
01829             } else {
01830                 $out .= $c;
01831             }
01832         }
01833 
01834         return $out;
01835     }
01836 
01837 
01838     /********************************************
01839      *
01840      * Internal UTF-8 string operation functions
01841      *
01842      ********************************************/
01843 
01844     /**
01845      * Returns a part of a UTF-8 string.
01846      * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
01847      *
01848      * @param   string      UTF-8 string
01849      * @param   integer     Start position (character position)
01850      * @param   integer     Length (in characters)
01851      * @return  string      The substring
01852      * @see substr()
01853      * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
01854      */
01855     function utf8_substr($str, $start, $len = NULL) {
01856         if (!strcmp($len, '0')) {
01857             return '';
01858         }
01859 
01860         $byte_start = $this->utf8_char2byte_pos($str, $start);
01861         if ($byte_start === FALSE) {
01862             if ($start > 0) {
01863                 return FALSE; // $start outside string length
01864             } else {
01865                 $start = 0;
01866             }
01867         }
01868 
01869         $str = substr($str, $byte_start);
01870 
01871         if ($len != NULL) {
01872             $byte_end = $this->utf8_char2byte_pos($str, $len);
01873             if ($byte_end === FALSE) // $len outside actual string length
01874             {
01875                 return $len < 0 ? '' : $str;
01876             } // When length is less than zero and exceeds, then we return blank string.
01877             else
01878             {
01879                 return substr($str, 0, $byte_end);
01880             }
01881         }
01882         else    {
01883             return $str;
01884         }
01885     }
01886 
01887     /**
01888      * Counts the number of characters of a string in UTF-8.
01889      * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
01890      *
01891      * @param   string      UTF-8 multibyte character string
01892      * @return  integer     The number of characters
01893      * @see strlen()
01894      * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
01895      */
01896     function utf8_strlen($str) {
01897         $n = 0;
01898         for ($i = 0; strlen($str{$i}); $i++) {
01899             $c = ord($str{$i});
01900             if (!($c & 0x80)) // single-byte (0xxxxxx)
01901             {
01902                 $n++;
01903             }
01904             elseif (($c & 0xC0) == 0xC0) // multi-byte starting byte (11xxxxxx)
01905             {
01906                 $n++;
01907             }
01908         }
01909         return $n;
01910     }
01911 
01912     /**
01913      * Truncates a string in UTF-8 short at a given byte length.
01914      *
01915      * @param   string      UTF-8 multibyte character string
01916      * @param   integer     the byte length
01917      * @return  string      the shortened string
01918      * @see mb_strcut()
01919      * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
01920      */
01921     function utf8_strtrunc($str, $len) {
01922         $i = $len - 1;
01923         if (ord($str{$i}) & 0x80) { // part of a multibyte sequence
01924             for (; $i > 0 && !(ord($str{$i}) & 0x40); $i--) ; // find the first byte
01925             if ($i <= 0) {
01926                 return '';
01927             } // sanity check
01928             for ($bc = 0, $mbs = ord($str{$i}); $mbs & 0x80; $mbs = $mbs << 1) $bc++; // calculate number of bytes
01929             if ($bc + $i > $len) {
01930                 return substr($str, 0, $i);
01931             }
01932             // fallthru: multibyte char fits into length
01933         }
01934         return substr($str, 0, $len);
01935     }
01936 
01937     /**
01938      * Find position of first occurrence of a string, both arguments are in UTF-8.
01939      *
01940      * @param   string      UTF-8 string to search in
01941      * @param   string      UTF-8 string to search for
01942      * @param   integer     Position to start the search
01943      * @return  integer     The character position
01944      * @see strpos()
01945      * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
01946      */
01947     function utf8_strpos($haystack, $needle, $offset = 0) {
01948         if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
01949             return mb_strpos($haystack, $needle, $offset, 'utf-8');
01950         } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
01951             return iconv_strpos($haystack, $needle, $offset, 'utf-8');
01952         }
01953 
01954         $byte_offset = $this->utf8_char2byte_pos($haystack, $offset);
01955         if ($byte_offset === FALSE) {
01956             return FALSE;
01957         } // offset beyond string length
01958 
01959         $byte_pos = strpos($haystack, $needle, $byte_offset);
01960         if ($byte_pos === FALSE) {
01961             return FALSE;
01962         } // needle not found
01963 
01964         return $this->utf8_byte2char_pos($haystack, $byte_pos);
01965     }
01966 
01967     /**
01968      * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
01969      *
01970      * @param   string      UTF-8 string to search in
01971      * @param   string      UTF-8 character to search for (single character)
01972      * @return  integer     The character position
01973      * @see strrpos()
01974      * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
01975      */
01976     function utf8_strrpos($haystack, $needle) {
01977         if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
01978             return mb_strrpos($haystack, $needle, 'utf-8');
01979         } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
01980             return iconv_strrpos($haystack, $needle, 'utf-8');
01981         }
01982 
01983         $byte_pos = strrpos($haystack, $needle);
01984         if ($byte_pos === FALSE) {
01985             return FALSE;
01986         } // needle not found
01987 
01988         return $this->utf8_byte2char_pos($haystack, $byte_pos);
01989     }
01990 
01991     /**
01992      * Translates a character position into an 'absolute' byte position.
01993      * Unit tested by Kasper.
01994      *
01995      * @param   string      UTF-8 string
01996      * @param   integer     Character position (negative values start from the end)
01997      * @return  integer     Byte position
01998      * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
01999      */
02000     function utf8_char2byte_pos($str, $pos) {
02001         $n = 0; // number of characters found
02002         $p = abs($pos); // number of characters wanted
02003 
02004         if ($pos >= 0) {
02005             $i = 0;
02006             $d = 1;
02007         } else {
02008             $i = strlen($str) - 1;
02009             $d = -1;
02010         }
02011 
02012         for (; strlen($str{$i}) && $n < $p; $i += $d) {
02013             $c = (int) ord($str{$i});
02014             if (!($c & 0x80)) // single-byte (0xxxxxx)
02015             {
02016                 $n++;
02017             }
02018             elseif (($c & 0xC0) == 0xC0) // multi-byte starting byte (11xxxxxx)
02019             {
02020                 $n++;
02021             }
02022         }
02023         if (!strlen($str{$i})) {
02024             return FALSE;
02025         } // offset beyond string length
02026 
02027         if ($pos >= 0) {
02028                 // skip trailing multi-byte data bytes
02029             while ((ord($str{$i}) & 0x80) && !(ord($str{$i}) & 0x40)) {
02030                 $i++;
02031             }
02032         } else {
02033                 // correct offset
02034             $i++;
02035         }
02036 
02037         return $i;
02038     }
02039 
02040     /**
02041      * Translates an 'absolute' byte position into a character position.
02042      * Unit tested by Kasper.
02043      *
02044      * @param   string      UTF-8 string
02045      * @param   integer     byte position
02046      * @return  integer     character position
02047      * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
02048      */
02049     function utf8_byte2char_pos($str, $pos) {
02050         $n = 0; // number of characters
02051         for ($i = $pos; $i > 0; $i--) {
02052             $c = (int) ord($str{$i});
02053             if (!($c & 0x80)) // single-byte (0xxxxxx)
02054             {
02055                 $n++;
02056             }
02057             elseif (($c & 0xC0) == 0xC0) // multi-byte starting byte (11xxxxxx)
02058             {
02059                 $n++;
02060             }
02061         }
02062         if (!strlen($str{$i})) {
02063             return FALSE;
02064         } // offset beyond string length
02065 
02066         return $n;
02067     }
02068 
02069     /**
02070      * Maps all characters of an UTF-8 string.
02071      *
02072      * @param   string      UTF-8 string
02073      * @param   string      mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
02074      * @param   string      'case': conversion 'toLower' or 'toUpper'
02075      * @return  string      the converted string
02076      * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
02077      */
02078     function utf8_char_mapping($str, $mode, $opt = '') {
02079         if (!$this->initUnicodeData($mode)) {
02080             return $str;
02081         } // do nothing
02082 
02083         $out = '';
02084         switch ($mode) {
02085             case 'case':
02086                 $map =& $this->caseFolding['utf-8'][$opt];
02087                 break;
02088 
02089             case 'ascii':
02090                 $map =& $this->toASCII['utf-8'];
02091                 break;
02092 
02093             default:
02094                 return $str;
02095         }
02096 
02097         for ($i = 0; strlen($str{$i}); $i++) {
02098             $c = ord($str{$i});
02099             if (!($c & 0x80)) // single-byte (0xxxxxx)
02100             {
02101                 $mbc = $str{$i};
02102             }
02103             elseif (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
02104                 for ($bc = 0; $c & 0x80; $c = $c << 1) {
02105                     $bc++;
02106                 } // calculate number of bytes
02107                 $mbc = substr($str, $i, $bc);
02108                 $i += $bc - 1;
02109             }
02110 
02111             if (isset($map[$mbc])) {
02112                 $out .= $map[$mbc];
02113             } else {
02114                 $out .= $mbc;
02115             }
02116         }
02117 
02118         return $out;
02119     }
02120 
02121 
02122     /********************************************
02123      *
02124      * Internal EUC string operation functions
02125      *
02126      * Extended Unix Code:
02127      *  ASCII compatible 7bit single bytes chars
02128      *  8bit two byte chars
02129      *
02130      * Shift-JIS is treated as a special case.
02131      *
02132      ********************************************/
02133 
02134     /**
02135      * Cuts a string in the EUC charset family short at a given byte length.
02136      *
02137      * @param   string      EUC multibyte character string
02138      * @param   integer     the byte length
02139      * @param   string      the charset
02140      * @return  string      the shortened string
02141      * @see mb_strcut()
02142      * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
02143      */
02144     function euc_strtrunc($str, $len, $charset) {
02145         $sjis = ($charset == 'shift_jis');
02146         for ($i = 0; strlen($str{$i}) && $i < $len; $i++) {
02147             $c = ord($str{$i});
02148             if ($sjis) {
02149                 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
02150                     $i++;
02151                 } // advance a double-byte char
02152             }
02153             else {
02154                 if ($c >= 0x80) {
02155                     $i++;
02156                 } // advance a double-byte char
02157             }
02158         }
02159         if (!strlen($str{$i})) {
02160             return $str;
02161         } // string shorter than supplied length
02162 
02163         if ($i > $len) {
02164             return substr($str, 0, $len - 1); // we ended on a first byte
02165         } else {
02166             return substr($str, 0, $len);
02167         }
02168     }
02169 
02170     /**
02171      * Returns a part of a string in the EUC charset family.
02172      *
02173      * @param   string      EUC multibyte character string
02174      * @param   integer     start position (character position)
02175      * @param   string      the charset
02176      * @param   integer     length (in characters)
02177      * @return  string      the substring
02178      * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
02179      */
02180     function euc_substr($str, $start, $charset, $len = NULL) {
02181         $byte_start = $this->euc_char2byte_pos($str, $start, $charset);
02182         if ($byte_start === FALSE) {
02183             return FALSE;
02184         } // $start outside string length
02185 
02186         $str = substr($str, $byte_start);
02187 
02188         if ($len != NULL) {
02189             $byte_end = $this->euc_char2byte_pos($str, $len, $charset);
02190             if ($byte_end === FALSE) // $len outside actual string length
02191             {
02192                 return $str;
02193             }
02194             else
02195             {
02196                 return substr($str, 0, $byte_end);
02197             }
02198         }
02199         else    {
02200             return $str;
02201         }
02202     }
02203 
02204     /**
02205      * Counts the number of characters of a string in the EUC charset family.
02206      *
02207      * @param   string      EUC multibyte character string
02208      * @param   string      the charset
02209      * @return  integer     the number of characters
02210      * @see strlen()
02211      * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
02212      */
02213     function euc_strlen($str, $charset) {
02214         $sjis = ($charset == 'shift_jis');
02215         $n = 0;
02216         for ($i = 0; strlen($str{$i}); $i++) {
02217             $c = ord($str{$i});
02218             if ($sjis) {
02219                 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
02220                     $i++;
02221                 } // advance a double-byte char
02222             }
02223             else {
02224                 if ($c >= 0x80) {
02225                     $i++;
02226                 } // advance a double-byte char
02227             }
02228 
02229             $n++;
02230         }
02231 
02232         return $n;
02233     }
02234 
02235     /**
02236      * Translates a character position into an 'absolute' byte position.
02237      *
02238      * @param   string      EUC multibyte character string
02239      * @param   integer     character position (negative values start from the end)
02240      * @param   string      the charset
02241      * @return  integer     byte position
02242      * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
02243      */
02244     function euc_char2byte_pos($str, $pos, $charset) {
02245         $sjis = ($charset == 'shift_jis');
02246         $n = 0; // number of characters seen
02247         $p = abs($pos); // number of characters wanted
02248 
02249         if ($pos >= 0) {
02250             $i = 0;
02251             $d = 1;
02252         } else {
02253             $i = strlen($str) - 1;
02254             $d = -1;
02255         }
02256 
02257         for (; strlen($str{$i}) && $n < $p; $i += $d) {
02258             $c = ord($str{$i});
02259             if ($sjis) {
02260                 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
02261                     $i += $d;
02262                 } // advance a double-byte char
02263             }
02264             else {
02265                 if ($c >= 0x80) {
02266                     $i += $d;
02267                 } // advance a double-byte char
02268             }
02269 
02270             $n++;
02271         }
02272         if (!strlen($str{$i})) {
02273             return FALSE;
02274         } // offset beyond string length
02275 
02276         if ($pos < 0) {
02277             $i++;
02278         } // correct offset
02279 
02280         return $i;
02281     }
02282 
02283     /**
02284      * Maps all characters of a string in the EUC charset family.
02285      *
02286      * @param   string      EUC multibyte character string
02287      * @param   string      the charset
02288      * @param   string      mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
02289      * @param   string      'case': conversion 'toLower' or 'toUpper'
02290      * @return  string      the converted string
02291      * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
02292      */
02293     function euc_char_mapping($str, $charset, $mode, $opt = '') {
02294         switch ($mode) {
02295             case 'case':
02296                 if (!$this->initCaseFolding($charset)) {
02297                     return $str;
02298                 } // do nothing
02299                 $map =& $this->caseFolding[$charset][$opt];
02300                 break;
02301 
02302             case 'ascii':
02303                 if (!$this->initToASCII($charset)) {
02304                     return $str;
02305                 } // do nothing
02306                 $map =& $this->toASCII[$charset];
02307                 break;
02308 
02309             default:
02310                 return $str;
02311         }
02312 
02313         $sjis = ($charset == 'shift_jis');
02314         $out = '';
02315         for ($i = 0; strlen($str{$i}); $i++) {
02316             $mbc = $str{$i};
02317             $c = ord($mbc);
02318 
02319             if ($sjis) {
02320                 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) { // a double-byte char
02321                     $mbc = substr($str, $i, 2);
02322                     $i++;
02323                 }
02324             }
02325             else {
02326                 if ($c >= 0x80) { // a double-byte char
02327                     $mbc = substr($str, $i, 2);
02328                     $i++;
02329                 }
02330             }
02331 
02332             if (isset($map[$mbc])) {
02333                 $out .= $map[$mbc];
02334             } else {
02335                 $out .= $mbc;
02336             }
02337         }
02338 
02339         return $out;
02340     }
02341 
02342 }
02343 
// XCLASS hook: when TYPO3_MODE is defined and an extension class file is
// registered for this script in TYPO3_CONF_VARS, that file is included once.
if (defined('TYPO3_MODE')) {
    if (isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
        include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
    }
}
02347 
02348 ?>