class.t3lib_cs.php

Go to the documentation of this file.
00001 <?php
00002 /***************************************************************
00003 *  Copyright notice
00004 *
00005 *  (c) 2003-2010 Kasper Skaarhoj (kasperYYYY@typo3.com)
00006 *  All rights reserved
00007 *
00008 *  This script is part of the Typo3 project. The Typo3 project is
00009 *  free software; you can redistribute it and/or modify
00010 *  it under the terms of the GNU General Public License as published by
00011 *  the Free Software Foundation; either version 2 of the License, or
00012 *  (at your option) any later version.
00013 *
00014 *  The GNU General Public License can be found at
00015 *  http://www.gnu.org/copyleft/gpl.html.
00016 *
00017 *  This script is distributed in the hope that it will be useful,
00018 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00019 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00020 *  GNU General Public License for more details.
00021 *
00022 *  This copyright notice MUST APPEAR in all copies of the script!
00023 ***************************************************************/
00024 /**
00025  * Class for conversion between charsets.
00026  *
00027  * $Id: class.t3lib_cs.php 7905 2010-06-13 14:42:33Z ohader $
00028  *
00029  * @author  Kasper Skaarhoj <kasperYYYY@typo3.com>
00030  * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
00031  */
00032 /**
00033  * [CLASS/FUNCTION INDEX of SCRIPT]
00034  *
00035  *
00036  *
00037  *  136: class t3lib_cs
00038  *  488:     function parse_charset($charset)
00039  *  507:     function get_locale_charset($locale)
00040  *
00041  *              SECTION: Charset Conversion functions
00042  *  560:     function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
00043  *  600:     function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
00044  *  617:     function utf8_encode($str,$charset)
00045  *  663:     function utf8_decode($str,$charset,$useEntityForNoChar=0)
00046  *  706:     function utf8_to_entities($str)
00047  *  739:     function entities_to_utf8($str,$alsoStdHtmlEnt=0)
00048  *  773:     function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
00049  *  823:     function UnumberToChar($cbyte)
00050  *  868:     function utf8CharToUnumber($str,$hex=0)
00051  *
00052  *              SECTION: Init functions
00053  *  911:     function initCharset($charset)
00054  *  973:     function initUnicodeData($mode=null)
00055  * 1198:     function initCaseFolding($charset)
00056  * 1260:     function initToASCII($charset)
00057  *
00058  *              SECTION: String operation functions
00059  * 1331:     function substr($charset,$string,$start,$len=null)
00060  * 1384:     function strlen($charset,$string)
00061  * 1414:     function crop($charset,$string,$len,$crop='')
00062  * 1467:     function strtrunc($charset,$string,$len)
00063  * 1501:     function conv_case($charset,$string,$case)
00064  * 1527:     function specCharsToASCII($charset,$string)
00065  *
00066  *              SECTION: Internal string operation functions
00067  * 1567:     function sb_char_mapping($str,$charset,$mode,$opt='')
00068  *
00069  *              SECTION: Internal UTF-8 string operation functions
00070  * 1622:     function utf8_substr($str,$start,$len=null)
00071  * 1655:     function utf8_strlen($str)
00072  * 1676:     function utf8_strtrunc($str,$len)
00073  * 1698:     function utf8_strpos($haystack,$needle,$offset=0)
00074  * 1723:     function utf8_strrpos($haystack,$needle)
00075  * 1745:     function utf8_char2byte_pos($str,$pos)
00076  * 1786:     function utf8_byte2char_pos($str,$pos)
00077  * 1809:     function utf8_char_mapping($str,$mode,$opt='')
00078  *
00079  *              SECTION: Internal EUC string operation functions
00080  * 1885:     function euc_strtrunc($str,$len,$charset)
00081  * 1914:     function euc_substr($str,$start,$charset,$len=null)
00082  * 1939:     function euc_strlen($str,$charset)
00083  * 1966:     function euc_char2byte_pos($str,$pos,$charset)
00084  * 2007:     function euc_char_mapping($str,$charset,$mode,$opt='')
00085  *
00086  * TOTAL FUNCTIONS: 35
00087  * (This index is automatically created/updated by the extension "extdeveval")
00088  *
00089  */
00090 
00091 
00092 
00093 
00094 
00095 
00096 
00097 
00098 /**
00099  * Notes on UTF-8
00100  *
00101  * Functions working on UTF-8 strings:
00102  *
00103  * - strchr/strstr
00104  * - strrchr
00105  * - substr_count
00106  * - implode/explode/join
00107  *
00108  * Functions nearly working on UTF-8 strings:
00109  *
00110  * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
00111  * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
00112  * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
00113  * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
00114  * - preg_*: UTF-8 support is compiled into PCRE by default nowadays, but could be unavailable; patterns need the "u" modifier
00115  *
00116  * Functions NOT working on UTF-8 strings:
00117  *
00118  * - str*cmp
00119  * - stristr
00120  * - stripos
00121  * - substr
00122  * - strrev
00123  * - split/spliti
00124  * - ...
00125  *
00126  */
00127 /**
00128  * Class for conversion between charsets
00129  *
00130  * @author  Kasper Skaarhoj <kasperYYYY@typo3.com>
00131  * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
00132  * @package TYPO3
00133  * @subpackage t3lib
00134  */
00135 class t3lib_cs {
00136     var $noCharByteVal=63;      // ASCII Value for chars with no equivalent.
00137 
00138         // This is the array where parsed conversion tables are stored (cached)
00139     var $parsedCharsets=array();
00140 
00141         // An array where case folding data will be stored (cached)
00142     var $caseFolding=array();
00143 
00144         // An array where charset-to-ASCII mappings are stored (cached)
00145     var $toASCII=array();
00146 
00147         // This tells the converter which charsets have two bytes per char:
00148     var $twoByteSets=array(
00149         'ucs-2'=>1, // 2-byte Unicode
00150     );
00151 
00152         // This tells the converter which charsets have four bytes per char:
00153     var $fourByteSets=array(
00154         'ucs-4'=>1, // 4-byte Unicode
00155         'utf-32'=>1,    // 4-byte Unicode (limited to the 21-bits of UTF-16)
00156     );
00157 
00158         // This tells the converter which charsets use a scheme like the Extended Unix Code:
00159     var $eucBasedSets=array(
00160         'gb2312'=>1,        // Chinese, simplified.
00161         'big5'=>1,      // Chinese, traditional.
00162         'euc-kr'=>1,        // Korean
00163         'shift_jis'=>1,     // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
00164     );
00165 
00166         // see  http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
00167         // http://czyborra.com/charsets/iso8859.html
00168     var $synonyms=array(
00169         'us' => 'ascii',
00170         'us-ascii'=> 'ascii',
00171         'cp819' => 'iso-8859-1',
00172         'ibm819' => 'iso-8859-1',
00173         'iso-ir-100' => 'iso-8859-1',
00174         'iso-ir-101' => 'iso-8859-2',
00175         'iso-ir-109' => 'iso-8859-3',
00176         'iso-ir-110' => 'iso-8859-4',
00177         'iso-ir-144' => 'iso-8859-5',
00178         'iso-ir-127' => 'iso-8859-6',
00179         'iso-ir-126' => 'iso-8859-7',
00180         'iso-ir-138' => 'iso-8859-8',
00181         'iso-ir-148' => 'iso-8859-9',
00182         'iso-ir-157' => 'iso-8859-10',
00183         'iso-ir-179' => 'iso-8859-13',
00184         'iso-ir-199' => 'iso-8859-14',
00185         'iso-ir-203' => 'iso-8859-15',
00186         'csisolatin1' => 'iso-8859-1',
00187         'csisolatin2' => 'iso-8859-2',
00188         'csisolatin3' => 'iso-8859-3',
00189         'csisolatin5' => 'iso-8859-9',
00190         'csisolatin8' => 'iso-8859-14',
00191         'csisolatin9' => 'iso-8859-15',
00192         'csisolatingreek' => 'iso-8859-7',
00193         'iso-celtic' => 'iso-8859-14',
00194         'latin1' => 'iso-8859-1',
00195         'latin2' => 'iso-8859-2',
00196         'latin3' => 'iso-8859-3',
00197         'latin5' => 'iso-8859-9',
00198         'latin6' => 'iso-8859-10',
00199         'latin8' => 'iso-8859-14',
00200         'latin9' => 'iso-8859-15',
00201         'l1' => 'iso-8859-1',
00202         'l2' => 'iso-8859-2',
00203         'l3' => 'iso-8859-3',
00204         'l5' => 'iso-8859-9',
00205         'l6' => 'iso-8859-10',
00206         'l8' => 'iso-8859-14',
00207         'l9' => 'iso-8859-15',
00208         'cyrillic' => 'iso-8859-5',
00209         'arabic' => 'iso-8859-6',
00210         'tis-620' => 'iso-8859-11',
00211         'win874' => 'windows-874',
00212         'win1250' => 'windows-1250',
00213         'win1251' => 'windows-1251',
00214         'win1252' => 'windows-1252',
00215         'win1253' => 'windows-1253',
00216         'win1254' => 'windows-1254',
00217         'win1255' => 'windows-1255',
00218         'win1256' => 'windows-1256',
00219         'win1257' => 'windows-1257',
00220         'win1258' => 'windows-1258',
00221         'cp1250' => 'windows-1250',
00222         'cp1251' => 'windows-1251',
00223         'cp1252' => 'windows-1252',
00224         'ms-ee' => 'windows-1250',
00225         'ms-ansi' => 'windows-1252',
00226         'ms-greek' => 'windows-1253',
00227         'ms-turk' => 'windows-1254',
00228         'winbaltrim' => 'windows-1257',
00229         'koi-8ru' => 'koi-8r',
00230         'koi8r' => 'koi-8r',
00231         'cp878' => 'koi-8r',
00232         'mac' => 'macroman',
00233         'macintosh' => 'macroman',
00234         'euc-cn' => 'gb2312',
00235         'x-euc-cn' => 'gb2312',
00236         'euccn' => 'gb2312',
00237         'cp936' => 'gb2312',
00238         'big-5' => 'big5',
00239         'cp950' => 'big5',
00240         'eucjp' => 'euc-jp',
00241         'sjis' => 'shift_jis',
00242         'shift-jis' => 'shift_jis',
00243         'cp932' => 'shift_jis',
00244         'cp949' => 'euc-kr',
00245         'utf7' => 'utf-7',
00246         'utf8' => 'utf-8',
00247         'utf16' => 'utf-16',
00248         'utf32' => 'utf-32',
00249         'utf8' => 'utf-8',
00250         'ucs2' => 'ucs-2',
00251         'ucs4' => 'ucs-4',
00252     );
00253 
00254         // mapping of iso-639-1 language codes to script names
var $lang_to_script=array(
        // Keys are looked up by get_locale_charset() AFTER the locale has been lowercased,
        // so every key in this table must be lowercase.
        // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
    'ar' => 'arabic',
    'bg' => 'cyrillic',     // Bulgarian
    'bs' => 'east_european',    // Bosnian
    'cs' => 'east_european',    // Czech
    'da' => 'west_european',    // Danish
    'de' => 'west_european',    // German
    'es' => 'west_european',    // Spanish
    'et' => 'estonian',
    'eo' => 'unicode',      // Esperanto
    'eu' => 'west_european',    // Basque
    'fa' => 'arabic',   // Persian
    'fi' => 'west_european',    // Finnish
    'fo' => 'west_european',    // Faroese
    'fr' => 'west_european',    // French
    'ga' => 'west_european',    // Galician
    'ge' => 'unicode',          // Georgian
    'gr' => 'greek',
    'el' => 'greek',        // Greek (the actual iso-639-1 code; 'gr' above is kept for compatibility)
    'he' => 'hebrew',       // Hebrew (since 1998)
    'hi' => 'unicode',      // Hindi
    'hr' => 'east_european',    // Croatian
    'hu' => 'east_european',    // Hungarian
    'iw' => 'hebrew',       // Hebrew (until 1998)
    'is' => 'west_european',    // Icelandic
    'it' => 'west_european',    // Italian
    'ja' => 'japanese',
    'kl' => 'west_european',    // Greenlandic
    'ko' => 'korean',
    'lt' => 'lithuanian',
    'lv' => 'west_european',    // Latvian/Lettish
    'nl' => 'west_european',    // Dutch
    'no' => 'west_european',    // Norwegian
    'nb' => 'west_european',    // Norwegian Bokmal
    'nn' => 'west_european',    // Norwegian Nynorsk
    'pl' => 'east_european',    // Polish
    'pt' => 'west_european',    // Portuguese
    'ro' => 'east_european',    // Romanian
    'ru' => 'cyrillic',     // Russian
    'sk' => 'east_european',    // Slovak
    'sl' => 'east_european',    // Slovenian
    'sr' => 'cyrillic',     // Serbian
    'sv' => 'west_european',    // Swedish
    'sq' => 'albanian',     // Albanian
    'th' => 'thai',
    'tr' => 'turkish',      // Turkish
    'uk' => 'cyrillic',     // Ukrainian
    'vi' => 'vietnamese',
    'zh' => 'chinese',
        // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
        // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
    'ara' => 'arabic',
    'bgr' => 'cyrillic',        // Bulgarian
    'cat' => 'west_european',   // Catalan
    'chs' => 'simpl_chinese',
    'cht' => 'trad_chinese',
    'csy' => 'east_european',   // Czech
    'dan' => 'west_european',   // Danish
    'deu' => 'west_european',   // German
    'dea' => 'west_european',   // German (Austrian)
    'des' => 'west_european',   // German (Swiss)
    'ena' => 'west_european',   // English (Australian)
    'enc' => 'west_european',   // English (Canadian)
    'eng' => 'west_european',   // English
    'enz' => 'west_european',   // English (New Zealand)
    'enu' => 'west_european',   // English (United States)
    'euq' => 'west_european',   // Basque
    'fos' => 'west_european',   // Faroese
    'far' => 'arabic',  // Persian
    'fin' => 'west_european',   // Finnish
    'fra' => 'west_european',   // French
    'frb' => 'west_european',   // French (Belgian)
    'frc' => 'west_european',   // French (Canadian)
    'frs' => 'west_european',   // French (Swiss)
    'geo' => 'unicode',         // Georgian
    'glg' => 'west_european',   // Galician
    'ell' => 'greek',
    'heb' => 'hebrew',
    'hin' => 'unicode', // Hindi
    'hun' => 'east_european',   // Hungarian
    'isl' => 'west_european',   // Icelandic
    'ita' => 'west_european',   // Italian
    'its' => 'west_european',   // Italian (Swiss)
    'jpn' => 'japanese',
    'kor' => 'korean',
    'lth' => 'lithuanian',
    'lvi' => 'west_european',   // Latvian/Lettish
    'msl' => 'west_european',   // Malay
    'nlb' => 'west_european',   // Dutch (Belgian)
    'nld' => 'west_european',   // Dutch
    'nor' => 'west_european',   // Norwegian (bokmal)
    'non' => 'west_european',   // Norwegian (nynorsk)
    'plk' => 'east_european',   // Polish
    'ptg' => 'west_european',   // Portuguese
    'ptb' => 'west_european',   // Portuguese (Brazil)
    'rom' => 'east_european',   // Romanian
    'rus' => 'cyrillic',        // Russian
    'slv' => 'east_european',   // Slovenian
    'sky' => 'east_european',   // Slovak
    'srl' => 'east_european',   // Serbian (Latin)
    'srb' => 'cyrillic',        // Serbian (Cyrillic)
    'esp' => 'west_european',   // Spanish (trad. sort)
    'esm' => 'west_european',   // Spanish (Mexican)
    'esn' => 'west_european',   // Spanish (internat. sort)
    'sve' => 'west_european',   // Swedish
    'sqi' => 'albanian',        // Albanian
    'tha' => 'thai',
    'trk' => 'turkish',
    'ukr' => 'cyrillic',    // Ukrainian
        // English language names
    'albanian' => 'albanian',
    'arabic' => 'arabic',
    'basque' => 'west_european',
    'bosnian' => 'east_european',
    'bulgarian' => 'cyrillic',  // consistent with 'bg' and 'bgr' above
    'catalan' => 'west_european',
    'croatian' => 'east_european',
    'czech' => 'east_european',
    'danish' => 'west_european',
    'dutch' => 'west_european',
    'english' => 'west_european',
    'esperanto' => 'unicode',
    'estonian' => 'estonian',
    'faroese' => 'west_european',
    'farsi' => 'arabic',
    'finnish' => 'west_european',
    'french' => 'west_european',
    'galician' => 'west_european',
    'georgian' => 'unicode',
    'german' => 'west_european',
    'greek' => 'greek',
    'greenlandic' => 'west_european',
    'hebrew' => 'hebrew',
    'hindi' => 'unicode',
    'hungarian' => 'east_european',
    'icelandic' => 'west_european',
    'italian' => 'west_european',
    'latvian' => 'west_european',
    'lettish' => 'west_european',
    'lithuanian' => 'lithuanian',
    'malay' => 'west_european',
    'norwegian' => 'west_european',
    'persian' => 'arabic',
    'polish' => 'east_european',
    'portuguese' => 'west_european',
    'russian' => 'cyrillic',
    'romanian' => 'east_european',
    'serbian' => 'cyrillic',
    'slovak' => 'east_european',
    'slovenian' => 'east_european',
    'spanish' => 'west_european',
    'swedish' => 'west_european',
    'svedish' => 'west_european',   // legacy misspelling of 'swedish', kept for backward compatibility
    'thai' => 'thai',
    'that' => 'thai',               // legacy misspelling of 'thai', kept for backward compatibility
    'turkish' => 'turkish',
    'ukrainian' => 'cyrillic',
);
00410 
00411         // mapping of language (family) names to charsets on Unix
00412     var $script_to_charset_unix=array(
00413         'west_european' => 'iso-8859-1',
00414         'estonian' => 'iso-8859-1',
00415         'east_european' => 'iso-8859-2',
00416         'baltic' => 'iso-8859-4',
00417         'cyrillic' => 'iso-8859-5',
00418         'arabic' => 'iso-8859-6',
00419         'greek' => 'iso-8859-7',
00420         'hebrew' => 'iso-8859-8',
00421         'turkish' => 'iso-8859-9',
00422         'thai' => 'iso-8859-11', // = TIS-620
00423         'lithuanian' => 'iso-8859-13',
00424         'chinese' => 'gb2312', // = euc-cn
00425         'japanese' => 'euc-jp',
00426         'korean' => 'euc-kr',
00427         'simpl_chinese' => 'gb2312',
00428         'trad_chinese' => 'big5',
00429         'vietnamese' => '',
00430         'unicode' => 'utf-8',
00431         'albanian' => 'utf-8'
00432     );
00433 
00434         // mapping of language (family) names to charsets on Windows
00435     var $script_to_charset_windows=array(
00436         'east_european' => 'windows-1250',
00437         'cyrillic' => 'windows-1251',
00438         'west_european' => 'windows-1252',
00439         'greek' => 'windows-1253',
00440         'turkish' => 'windows-1254',
00441         'hebrew' => 'windows-1255',
00442         'arabic' => 'windows-1256',
00443         'baltic' => 'windows-1257',
00444         'estonian' => 'windows-1257',
00445         'lithuanian' => 'windows-1257',
00446         'vietnamese' => 'windows-1258',
00447         'thai' => 'cp874',
00448         'korean' => 'cp949',
00449         'chinese' => 'gb2312',
00450         'japanese' => 'shift_jis',
00451         'simpl_chinese' => 'gb2312',
00452         'trad_chinese' => 'big5',
00453         'albanian' => 'windows-1250',
00454         'unicode' => 'utf-8'
00455     );
00456 
00457         // mapping of locale names to charsets
var $locale_to_charset=array(
        // get_locale_charset() lowercases the locale before looking it up here,
        // so keys MUST be all-lowercase or they can never match.
    'japanese.euc' => 'euc-jp',
    'ja_jp.ujis' => 'euc-jp',
    'korean.euc' => 'euc-kr',
    'sr@latn' => 'iso-8859-2',
    'zh_cn' => 'gb2312',
    'zh_hk' => 'big5',
    'zh_tw' => 'big5',
);
00467 
00468         // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
00469         // An empty value means "iso-8859-1"
00470     var $charSetArray = array(
00471         'dk' => '',
00472         'de' => '',
00473         'no' => '',
00474         'it' => '',
00475         'fr' => '',
00476         'es' => '',
00477         'nl' => '',
00478         'cz' => 'windows-1250',
00479         'pl' => 'iso-8859-2',
00480         'si' => 'windows-1250',
00481         'fi' => '',
00482         'tr' => 'iso-8859-9',
00483         'se' => '',
00484         'pt' => '',
00485         'ru' => 'windows-1251',
00486         'ro' => 'iso-8859-2',
00487         'ch' => 'gb2312',
00488         'sk' => 'windows-1250',
00489         'lt' => 'windows-1257',
00490         'is' => 'utf-8',
00491         'hr' => 'windows-1250',
00492         'hu' => 'iso-8859-2',
00493         'gl' => '',
00494         'th' => 'iso-8859-11',
00495         'gr' => 'iso-8859-7',
00496         'hk' => 'big5',
00497         'eu' => '',
00498         'bg' => 'windows-1251',
00499         'br' => '',
00500         'et' => 'iso-8859-4',
00501         'ar' => 'iso-8859-6',
00502         'he' => 'utf-8',
00503         'ua' => 'windows-1251',
00504         'jp' => 'shift_jis',
00505         'lv' => 'utf-8',
00506         'vn' => 'utf-8',
00507         'ca' => 'iso-8859-15',
00508         'ba' => 'iso-8859-2',
00509         'kr' => 'euc-kr',
00510         'eo' => 'utf-8',
00511         'my' => '',
00512         'hi' => 'utf-8',
00513         'fo' => 'utf-8',
00514         'fa' => 'utf-8',
00515         'sr' => 'utf-8',
00516         'sq' => 'utf-8',
00517         'ge' => 'utf-8',
00518         'ga' => '',
00519     );
00520 
00521         // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
00522         // A missing key means: the iso name is the same as the TYPO3 language key
00523     var $isoArray = array(
00524         'ba' => 'bs',
00525         'br' => 'pt_BR',
00526         'ch' => 'zh_CN',
00527         'cz' => 'cs',
00528         'dk' => 'da',
00529         'si' => 'sl',
00530         'se' => 'sv',
00531         'gl' => 'kl',
00532         'gr' => 'el',
00533         'hk' => 'zh_HK',
00534         'kr' => 'ko',
00535         'ua' => 'uk',
00536         'jp' => 'ja',
00537         'vn' => 'vi',
00538     );
00539 
/**
 * Normalizes a charset name: lowercases it, strips surrounding whitespace
 * and, if the result is a known alias in $this->synonyms, replaces it with
 * the name that alias maps to.
 *
 * @param   string      Input charset
 * @return  string      Normalized charset
 * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset)    {
    $normalized = trim(strtolower($charset));

        // Known alias: hand back the name it is mapped to
    if (isset($this->synonyms[$normalized]))    {
        return $this->synonyms[$normalized];
    }

        // Not an alias: the lowercased, trimmed name is the result
    return $normalized;
}
00553 
/**
 * Get the charset of a locale.
 *
 * Accepted locale forms:
 *
 * ln            language
 * ln_CN         language / country
 * ln_CN.cs      language / country / charset
 * ln_CN.cs@mod  language / country / charset / modifier
 *
 * Resolution order:
 *  1. exact match of the lowercased locale in $this->locale_to_charset
 *  2. an explicit charset part (".cs"), normalized through parse_charset()
 *  3. the modifier "euro", which yields iso-8859-15
 *  4. language => script => charset, using the Windows table when the
 *     constant TYPO3_OS equals 'WIN' and the Unix table otherwise
 *  5. the default ('windows-1252' on Windows, 'iso-8859-1' otherwise) when the
 *     language or script is unknown or the table entry is empty
 *
 * @param   string      Locale string
 * @return  string      Charset resolved for locale string
 * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale)    {
    $locale = strtolower($locale);

        // exact locale specific charset?
    if (isset($this->locale_to_charset[$locale]))   return $this->locale_to_charset[$locale];

        // get modifier (array_pad guarantees both list() targets exist when there is no '@')
    list($locale,$modifier) = array_pad(explode('@',$locale), 2, '');

        // locale contains charset: use it
    list($locale,$charset) = array_pad(explode('.',$locale), 2, '');
    if ($charset)   return $this->parse_charset($charset);

        // modifier is 'euro' (after charset check, because of xx.utf-8@euro)
    if ($modifier == 'euro')    return 'iso-8859-15';

        // get language; the country part is not needed for the lookup
    list($language) = explode('_',$locale);
    $script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

        // An undefined TYPO3_OS is treated as "not Windows"
    if (defined('TYPO3_OS') && TYPO3_OS == 'WIN')   {
        $scriptToCharset = $this->script_to_charset_windows;
        $defaultCharset = 'windows-1252';
    } else {
        $scriptToCharset = $this->script_to_charset_unix;
        $defaultCharset = 'iso-8859-1';
    }

        // Unknown script or empty table entry (e.g. 'vietnamese' on Unix): use the default
    return empty($scriptToCharset[$script]) ? $defaultCharset : $scriptToCharset[$script];
}
00594 
00595 
00596 
00597 
00598 
00599 
00600 
00601 
00602 
00603     /********************************************
00604      *
00605      * Charset Conversion functions
00606      *
00607      ********************************************/
00608 
/**
 * Convert from one charset to another charset.
 *
 * If $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] is set to
 * 'mbstring', 'iconv' or 'recode', that PHP extension is tried first. This
 * only happens when the target is UTF-8 or no entity fallback was requested.
 * If the configured extension is not loaded or the conversion fails, the
 * built-in conversion via utf8_encode()/utf8_decode() is used.
 *
 * @param   string      Input string
 * @param   string      From charset (the current charset of the string)
 * @param   string      To charset (the output charset wanted)
 * @param   boolean     If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return  string      Converted string
 * @see convArray()
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
    if ($fromCS==$toCS) return $str;

        // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
    if ($toCS=='utf-8' || !$useEntityForNoChar) {
            // The setting is optional; read it without raising a notice when it is absent
        $convMethod = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])
            ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
            : '';

            // Each branch checks that the extension is really loaded, so a misconfigured
            // method falls through to the built-in conversion instead of a fatal error
        switch($convMethod) {
        case 'mbstring':
            if (function_exists('mb_convert_encoding')) {
                $conv_str = mb_convert_encoding($str,$toCS,$fromCS);
                if (false !== $conv_str)    return $conv_str; // returns false for unsupported charsets
            }
            break;

        case 'iconv':
            if (function_exists('iconv'))   {
                $conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
                if (false !== $conv_str)    return $conv_str;
            }
            break;

        case 'recode':
            if (function_exists('recode_string'))   {
                $conv_str = recode_string($fromCS.'..'.$toCS,$str);
                if (false !== $conv_str)    return $conv_str;
            }
            break;
        }
        // fallback to TYPO3 conversion
    }

        // Built-in conversion always goes through UTF-8 as the intermediate charset
    if ($fromCS!='utf-8')   $str=$this->utf8_encode($str,$fromCS);
    if ($toCS!='utf-8') $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
    return $str;
}
00647 
/**
 * Converts every string element of an array from one charset to another,
 * descending into nested arrays. Elements that are neither strings nor
 * arrays are left untouched.
 * NOTICE: The array is passed by reference and modified in place!
 *
 * @param   array       Input array, possibly multidimensional
 * @param   string      From charset (the current charset of the strings)
 * @param   string      To charset (the output charset wanted)
 * @param   boolean     If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return  void
 * @see conv()
 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
    foreach ($array as $idx => $unused) {
            // Nested array: recurse, the sub-array is converted in place
        if (is_array($array[$idx])) {
            $this->convArray($array[$idx],$fromCS,$toCS,$useEntityForNoChar);
            continue;
        }

            // Only strings are converted; other scalars, null and objects are skipped
        if (!is_string($array[$idx]))   {
            continue;
        }

        $array[$idx] = $this->conv($array[$idx],$fromCS,$toCS,$useEntityForNoChar);
    }
}
00668 
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 are copied unchanged (except in two-byte charsets, where every
 * character is a 16-bit big-endian code). Characters that have no entry in
 * the parsed conversion table are replaced by chr($this->noCharByteVal).
 *
 * @param   string      String in local charset to convert to UTF-8
 * @param   string      Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return  string      Output string, converted to UTF-8. Nothing (NULL) is returned if initCharset() reports failure for $charset.
 */
function utf8_encode($str,$charset) {

    if ($charset === 'utf-8')   return $str;

        // Charset is case-insensitive.
    if ($this->initCharset($charset))   {   // Parse conv. table if not already...
        $strLen = strlen($str);
        $outStr='';

            // These lookups do not change while traversing the string
        $isTwoByteSet = isset($this->twoByteSets[$charset]);
        $isEucBasedSet = isset($this->eucBasedSets[$charset]);

        for ($a=0;$a<$strLen;$a++)  {   // Traverse each char in string.
            $chr=substr($str,$a,1);
            $ord=ord($chr);
            if ($isTwoByteSet)  {   // If the charset has two bytes per char
                    // substr() instead of a string offset: a missing trailing byte (odd-length input)
                    // yields 0 without reading past the end of the string
                $ord2 = ord(substr($str,$a+1,1));
                $ord = $ord<<8 | $ord2; // assume big endian

                if (isset($this->parsedCharsets[$charset]['local'][$ord]))  {   // If the local char-number was found in parsed conv. table then we use that, otherwise the "no char" byte
                    $outStr.=$this->parsedCharsets[$charset]['local'][$ord];
                } else $outStr.=chr($this->noCharByteVal);  // No char exists
                $a++;   // Skip the second byte of this character
            } elseif ($ord>127) {   // Byte values above 127 are not ASCII and have to be looked up in the conversion table
                if ($isEucBasedSet) {   // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
                    if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF))    {   // Shift-JIS: chars between 160 and 223 are single byte
                        $a++;
                        $ord2=ord(substr($str,$a,1));
                        $ord = $ord*256+$ord2;
                    }
                }

                if (isset($this->parsedCharsets[$charset]['local'][$ord]))  {   // If the local char-number was found in parsed conv. table then we use that, otherwise the "no char" byte
                    $outStr.= $this->parsedCharsets[$charset]['local'][$ord];
                } else $outStr.= chr($this->noCharByteVal); // No char exists
            } else $outStr.= $chr;  // ... otherwise it's just ASCII 0-127 and one byte. Transparent
        }
        return $outStr;
    }
}
00713 
00714     /**
00715      * Converts $str from UTF-8 to $charset
00716      *
00717      * @param   string      String in UTF-8 to convert to local charset
00718      * @param   string      Charset, lowercase. Must be found in csconvtbl/ folder.
00719      * @param   boolean     If set, then characters that are not available in the destination character set will be encoded as numeric entities
00720      * @return  string      Output string, converted to local charset
00721      */
function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
    // UTF-8 to UTF-8 needs no conversion at all.
    if ($charset === 'utf-8') {
        return $str;
    }

    // Make sure the conversion table is parsed; without a table nothing (NULL) is returned.
    if (!$this->initCharset($charset)) {
        return;
    }

    $missing = chr($this->noCharByteVal);   // Placeholder byte for anything that cannot be converted
    $length = strlen($str);
    $result = '';
    $pos = 0;

    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        $pos++;

        // Plain ASCII (0-127) is identical in every supported charset.
        if ($code <= 127) {
            $result .= $byte;
            continue;
        }

        // A byte >127 without bit 7 set (10xxxxxx) is a continuation byte: we are in the middle of a sequence.
        if (!($code & 64)) {
            $result .= $missing;
            continue;
        }

        // Collect the whole multibyte sequence: each additional leading 1-bit of the first byte announces one more byte.
        $sequence = $byte;
        for ($n = 0; $n < 8; $n++) {
            $code = $code << 1;
            if (!($code & 128)) {
                break;
            }
            $sequence .= substr($str, $pos, 1);
            $pos++;
        }

        if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
            // Known character: emit the local number, split into two bytes if it does not fit into one (16 bit word assumed).
            $local = $this->parsedCharsets[$charset]['utf8'][$sequence];
            if ($local > 255) {
                $result .= chr(($local >> 8) & 255) . chr($local & 255);
            } else {
                $result .= chr($local);
            }
        } elseif ($useEntityForNoChar) {
            // Unknown character, but the caller asked for a (hexadecimal) numeric entity instead.
            $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
        } else {
            $result .= $missing;
        }
    }

    return $result;
}
00762 
00763     /**
00764      * Converts all chars > 127 to numeric entities.
00765      *
00766      * @param   string      Input string
00767      * @return  string      Output string
00768      */
function utf8_to_entities($str) {
    $length = strlen($str);
    $result = '';
    $pos = 0;

    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        $pos++;

        // ASCII 0-127 is passed through untouched.
        if ($code <= 127) {
            $result .= $byte;
            continue;
        }

        // A continuation byte (10xxxxxx) without a preceding lead byte cannot be converted.
        if (!($code & 64)) {
            $result .= chr($this->noCharByteVal);
            continue;
        }

        // Gather the full sequence: every further leading 1-bit in the first byte means one more byte follows.
        $sequence = $byte;
        for ($n = 0; $n < 8; $n++) {
            $code = $code << 1;
            if (!($code & 128)) {
                break;
            }
            $sequence .= substr($str, $pos, 1);
            $pos++;
        }

        // utf8CharToUnumber(..., 1) yields 'x<hex>', so the entity is written in hexadecimal form.
        $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
    }

    return $result;
}
00794 
00795     /**
00796      * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
00797      *
00798      * @param   string      Input string, UTF-8
00799      * @param   boolean     If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
00800      * @return  string      Output string
00801      */
function entities_to_utf8($str, $alsoStdHtmlEnt = 0) {
    // Lookup table "&name;" => ISO-8859-1 byte; stays empty unless named entities were requested.
    $namedEntities = array();
    if ($alsoStdHtmlEnt) {
        // The table must be in ISO-8859-1 because its values are run through utf8_encode(..., 'iso-8859-1') below.
        // Newer PHP versions default to UTF-8 here, so the charset is requested explicitly where the
        // parameter exists (PHP >= 5.3.4); older versions always returned ISO-8859-1.
        if (version_compare(PHP_VERSION, '5.3.4', '>=')) {
            $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1');
        } else {
            $table = get_html_translation_table(HTML_ENTITIES);
        }
        $namedEntities = array_flip($table);
    }

    // Split on "&...;" while keeping the part between "&" and ";":
    // even indexes are plain text, odd indexes are entity bodies (without "&" and ";").
    $parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
    if (!is_array($parts)) {
        return $str;    // PCRE failure: hand back the input unchanged
    }

    $count = count($parts);
    for ($k = 1; $k < $count; $k += 2) {
        $entity = $parts[$k];
        $original = '&' . $entity . ';';

        if (substr($entity, 0, 1) == '#') {
            // Numeric entity, decimal (&#1234;) or hexadecimal (&#x1b; / &#X1B;).
            $marker = substr($entity, 1, 1);
            if ($marker == 'x' || $marker == 'X') {
                $digits = (string) substr($entity, 2);
                $valid = ctype_xdigit($digits);
                $number = $valid ? hexdec($digits) : 0;
            } else {
                $digits = (string) substr($entity, 1);
                $valid = ctype_digit($digits);
                $number = $valid ? $digits + 0 : 0;
            }
            // Malformed numbers (e.g. "&#;" or "&#zz;") are left untouched instead of becoming a NUL byte.
            $parts[$k] = $valid ? $this->UnumberToChar($number) : $original;
        } elseif (isset($namedEntities[$original])) {
            // Named entity known to PHP: convert its ISO-8859-1 byte to UTF-8.
            $parts[$k] = $this->utf8_encode($namedEntities[$original], 'iso-8859-1');
        } else {
            // Unknown entity, or named entities not requested: no conversion.
            $parts[$k] = $original;
        }
    }

    return implode('', $parts);
}
00827 
00828     /**
00829      * Converts all chars in the input UTF-8 string into integer numbers returned in an array
00830      *
00831      * @param   string      Input string, UTF-8
00832      * @param   boolean     If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
00833      * @param   boolean     If set, then instead of integer numbers the real UTF-8 char is returned.
00834      * @return  array       Output array with the char numbers
00835      */
function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
    // Optionally resolve HTML entities (numeric and named) into real UTF-8 characters first.
    if ($convEntities) {
        $str = $this->entities_to_utf8($str, 1);
    }

    $length = strlen($str);
    $result = array();
    $pos = 0;

    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        $pos++;

        if ($code <= 127) {
            // Single byte ASCII character
            $result[] = $retChar ? chr($code) : $code;
        } elseif (!($code & 64)) {
            // Continuation byte without lead byte (middle of a multibyte sequence): no character exists
            $result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
        } else {
            // Lead byte: every further leading 1-bit announces one more byte of the sequence
            $sequence = $byte;
            for ($n = 0; $n < 8; $n++) {
                $code = $code << 1;
                if (!($code & 128)) {
                    break;
                }
                $sequence .= substr($str, $pos, 1);
                $pos++;
            }
            $result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
        }
    }

    return $result;
}
00866 
00867     /**
00868      * Converts a UNICODE number to a UTF-8 multibyte character
00869      * Algorithm based on script found at From: http://czyborra.com/utf/
00870      * Unit-tested by Kasper
00871      *
00872      * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
00873      *
00874      *  bytes | bits | representation
00875      *      1 |    7 | 0vvvvvvv
00876      *      2 |   11 | 110vvvvv 10vvvvvv
00877      *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
00878      *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
00879      *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
00880      *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
00881      *
00882      * @param   integer     UNICODE integer
00883      * @return  string      UTF-8 multibyte character string
00884      * @see utf8CharToUnumber()
00885      */
function UnumberToChar($cbyte) {
    // 1 byte: 0vvvvvvv
    if ($cbyte < 0x80) {
        return chr($cbyte);
    }

    // 2 bytes: 110vvvvv 10vvvvvv
    if ($cbyte < 0x800) {
        return chr(0xC0 | ($cbyte >> 6))
            . chr(0x80 | ($cbyte & 0x3F));
    }

    // 3 bytes: 1110vvvv 10vvvvvv 10vvvvvv
    if ($cbyte < 0x10000) {
        return chr(0xE0 | ($cbyte >> 12))
            . chr(0x80 | (($cbyte >> 6) & 0x3F))
            . chr(0x80 | ($cbyte & 0x3F));
    }

    // 4 bytes: 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
    if ($cbyte < 0x200000) {
        return chr(0xF0 | ($cbyte >> 18))
            . chr(0x80 | (($cbyte >> 12) & 0x3F))
            . chr(0x80 | (($cbyte >> 6) & 0x3F))
            . chr(0x80 | ($cbyte & 0x3F));
    }

    // 5 bytes: 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
    if ($cbyte < 0x4000000) {
        return chr(0xF8 | ($cbyte >> 24))
            . chr(0x80 | (($cbyte >> 18) & 0x3F))
            . chr(0x80 | (($cbyte >> 12) & 0x3F))
            . chr(0x80 | (($cbyte >> 6) & 0x3F))
            . chr(0x80 | ($cbyte & 0x3F));
    }

    // 6 bytes: 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
    if ($cbyte < 0x80000000) {
        return chr(0xFC | ($cbyte >> 30))
            . chr(0x80 | (($cbyte >> 24) & 0x3F))
            . chr(0x80 | (($cbyte >> 18) & 0x3F))
            . chr(0x80 | (($cbyte >> 12) & 0x3F))
            . chr(0x80 | (($cbyte >> 6) & 0x3F))
            . chr(0x80 | ($cbyte & 0x3F));
    }

    // A full 32 bit value cannot be expressed in UTF-8: return the "no character" byte.
    return chr($this->noCharByteVal);
}
00921 
00922     /**
00923      * Converts a UTF-8 Multibyte character to a UNICODE number
00924      * Unit-tested by Kasper
00925      *
00926      * @param   string      UTF-8 multibyte character string
00927      * @param   boolean     If set, then a hex. number is returned.
00928      * @return  integer     UNICODE integer
00929      * @see UnumberToChar()
00930      */
function utf8CharToUnumber($str, $hex = 0) {
    $firstByte = ord(substr($str, 0, 1));

    if (($firstByte & 192) != 192) {
        // Not a lead byte of a multibyte sequence: the byte value itself is the number.
        $codePoint = $firstByte;
    } else {
        // Walk the leading 1-bits of the first byte; each one (after the first) stands for a
        // continuation byte that contributes its lower six bits.
        $shifted = $firstByte;
        $tailBits = '';
        $count = 0;
        while ($count < 8) {
            $shifted = $shifted << 1;
            if (!($shifted & 128)) {
                break;
            }
            $tailBits .= sprintf('%06b', ord(substr($str, $count + 1, 1)) & 63);
            $count++;
        }

        // The lead byte contributes its lowest (6 - number of continuation bytes) bits.
        $leadBits = substr('00000000' . decbin($firstByte), -(6 - $count));
        $codePoint = bindec($leadBits . $tailBits);
    }

    if ($hex) {
        return 'x' . dechex($codePoint);
    }
    return $codePoint;
}
00949 
00950 
00951 
00952 
00953 
00954 
00955 
00956 
00957 
00958     /********************************************
00959      *
00960      * Init functions
00961      *
00962      ********************************************/
00963 
00964     /**
00965      * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
00966      * This function is automatically called by the conversion functions
00967      *
00968      * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
00969      *
00970      * @param   string      The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
00971      * @return  integer     Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
00972      * @access private
00973      */
function initCharset($charset) {
    // Already loaded during this request: nothing to do.
    // (isset() first, so an unknown charset does not raise an "undefined index" notice.)
    if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }

    // Conversion table filename:
    $charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';

    // No (valid) conversion table for this charset:
    if (!$charset || !t3lib_div::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
        return false;
    }

    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
    $table = false;
    if ($cacheFile && @is_file($cacheFile)) {
        $table = unserialize(t3lib_div::getUrl($cacheFile));
    }

    // No cache, or the cache file was unreadable/corrupt: parse the conversion table (again).
    if (!is_array($table)) {
        $table = array('local' => array(), 'utf8' => array());

        // The "whitespaced" type is on the syntax  "0x0A   0x000A  #LINE FEED"  while  "ms-token" is like  "B9 = U+00B9 : SUPERSCRIPT ONE"
        $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
        $detectedType = '';

        $lines = t3lib_div::trimExplode(LF, t3lib_div::getUrl($charsetConvTableFile), 1);
        foreach ($lines as $value) {
            // Comment lines and blanks are ignored.
            if (!trim($value) || substr($value, 0, 1) == '#') {
                continue;
            }

            // Detect the file format once, on the first real line.
            if (!$detectedType) {
                $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
            }

            if ($detectedType == 'ms-token') {
                $tokens = preg_split('/[=:]/', $value, 3);
                if (!is_array($tokens) || count($tokens) < 2) {
                    continue;   // Line carries no mapping
                }
                $hexbyte = $tokens[0];
                $utf8 = $tokens[1];
            } else {
                $regA = array();
                if (!preg_match($whitespacedPattern, $value, $regA)) {
                    continue;   // E.g. a byte listed without a Unicode value
                }
                $hexbyte = $regA[1];
                $utf8 = 'U+' . $regA[2];
            }

            // Only bytes above ASCII need a mapping; 0-127 is passed through by the converters.
            $decval = hexdec(trim($hexbyte));
            if ($decval > 127) {
                $utf8Char = $this->UnumberToChar(hexdec(substr(trim($utf8), 2)));
                $table['local'][$decval] = $utf8Char;
                $table['utf8'][$utf8Char] = $decval;
            }
        }

        if ($cacheFile) {
            t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($table));
        }
    }

    $this->parsedCharsets[$charset] = $table;
    return 2;
}
01026 
01027     /**
01028      * This function initializes all UTF-8 character data tables.
01029      *
01030      * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
01031      *
01032      * @param   string      Mode ("case", "ascii", ...)
01033      * @return  integer     Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
01034      * @access private
01035      */
function initUnicodeData($mode = null) {
    // cache files
    $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
    $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

    // Only process if the requested table is not yet loaded.
    // A cache file that does not unserialize to an array is ignored and the data is parsed again.
    switch ($mode) {
        case 'case':
            if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
                return 1;
            }
            if ($cacheFileCase && @is_file($cacheFileCase)) {
                $cached = unserialize(t3lib_div::getUrl($cacheFileCase));
                if (is_array($cached)) {
                    $this->caseFolding['utf-8'] = $cached;
                    return 2;
                }
            }
            break;

        case 'ascii':
            if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
                return 1;
            }
            if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                $cached = unserialize(t3lib_div::getUrl($cacheFileASCII));
                if (is_array($cached)) {
                    $this->toASCII['utf-8'] = $cached;
                    return 2;
                }
            }
            break;
    }

    // process main Unicode data file
    $unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
    if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
        return false;
    }

    $fh = fopen($unicodeDataFile, 'rb');
    if (!$fh) {
        return false;
    }

    // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
    // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
    $this->caseFolding['utf-8'] = array();
    $utf8CaseFolding =& $this->caseFolding['utf-8'];    // a shorthand
    $utf8CaseFolding['toUpper'] = array();
    $utf8CaseFolding['toLower'] = array();
    $utf8CaseFolding['toTitle'] = array();

    $decomposition = array();   // array of temp. decompositions
    $mark = array();            // array of chars that are marks (eg. composing accents)
    $number = array();          // array of chars that are numbers (eg. digits)
    $omit = array();            // array of chars to be omitted (eg. Russian hard sign)

    // Reading until fgets() fails (instead of testing feof() first) avoids processing a bogus
    // empty record at the end of the file and cannot loop forever on a read error.
    while (($line = fgets($fh, 4096)) !== false) {
        $line = rtrim($line);
        if ($line === '') {
            continue;
        }

        // The record has a lot of info; pad it so short lines do not raise "undefined offset" notices.
        $fields = array_pad(explode(';', $line), 15, '');
        $char = $fields[0];     // code point (hex)
        $name = $fields[1];     // character name
        $cat = $fields[2];      // general category
        $decomp = $fields[5];   // decomposition mapping
        $num = $fields[8];      // numeric value
        $upper = $fields[12];   // simple uppercase mapping
        $lower = $fields[13];   // simple lowercase mapping
        $title = $fields[14];   // simple titlecase mapping

        $ord = hexdec($char);
        if ($ord > 0xFFFF) {
            break;  // only process the BMP
        }

        $utf8_char = $this->UnumberToChar($ord);

        if ($upper) {
            $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
        }
        if ($lower) {
            $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
        }
        // store "title" only when different from "upper" (only a few);
        // strict comparison, because hex strings such as "1E00" would otherwise be compared as numbers
        if ($title && $title !== $upper) {
            $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
        }

        // substr() instead of the {0} offset syntax, which no longer parses on current PHP versions
        switch (substr($cat, 0, 1)) {
            case 'M':   // mark (accent, umlaut, ...)
                $mark["U+$char"] = 1;
                break;

            case 'N':   // numeric value
                if ($ord > 0x80 && $num != '') {
                    $number["U+$char"] = $num;
                }
                break;
        }

        // accented Latin letters without "official" decomposition
        $match = array();
        if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
            $c = ord($match[2]);
            if ($match[1] == 'SMALL') {
                $c += 32;
            }

            $decomposition["U+$char"] = array(dechex($c));
            continue;
        }

        $match = array();
        if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
            // Note: inside a switch "continue 2" is needed to proceed with the next line of the file.
            switch ($match[1]) {
                case '<circle>':    // add parenthesis as circle replacement, eg (1)
                    $match[2] = '0028 ' . $match[2] . ' 0029';
                    break;

                case '<square>':    // add square brackets as square replacement, eg [1]
                    $match[2] = '005B ' . $match[2] . ' 005D';
                    break;

                case '<compat>':    // ignore multi char decompositions that start with a space
                    if (preg_match('/^0020 /', $match[2])) {
                        continue 2;
                    }
                    break;

                    // ignore Arabic and vertical layout presentation decomposition
                case '<initial>':
                case '<medial>':
                case '<final>':
                case '<isolated>':
                case '<vertical>':
                    continue 2;
            }
            $decomposition["U+$char"] = explode(' ', $match[2]);
        }
    }
    fclose($fh);

    // process additional Unicode data for casing (allow folded characters to expand into a sequence)
    $specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
    if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
        $fh = fopen($specialCasingFile, 'rb');
        if ($fh) {
            while (($line = fgets($fh, 4096)) !== false) {
                // skip comments and blank lines
                if (substr($line, 0, 1) == '#' || trim($line) == '') {
                    continue;
                }

                list($char, $lower, $title, $upper, $cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');

                // only unconditional mappings are used (5th field empty or already the trailing comment)
                if ($cond != '' && substr($cond, 0, 1) != '#') {
                    continue;
                }

                $utf8_char = $this->UnumberToChar(hexdec($char));
                $targets = array('toLower' => $lower, 'toTitle' => $title, 'toUpper' => $upper);
                foreach ($targets as $case => $sequence) {
                    // a mapping onto the character itself is no mapping
                    if ($char === $sequence) {
                        continue;
                    }
                    // "title" is stored only when different from "upper"
                    if ($case == 'toTitle' && $title === $upper) {
                        continue;
                    }

                    // convert the space separated code point list into one UTF-8 string
                    $folded = '';
                    foreach (explode(' ', $sequence) as $codePoint) {
                        $folded .= $this->UnumberToChar(hexdec($codePoint));
                    }
                    $utf8CaseFolding[$case][$utf8_char] = $folded;
                }
            }
            fclose($fh);
        }
    }

    // process custom decompositions
    $customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
    if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
        $fh = fopen($customTranslitFile, 'rb');
        if ($fh) {
            while (($line = fgets($fh, 4096)) !== false) {
                // skip comments and blank lines
                if (substr($line, 0, 1) == '#' || trim($line) == '') {
                    continue;
                }

                list($char, $translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
                // an empty transliteration means: drop this character
                if (!$translit) {
                    $omit["U+$char"] = 1;
                }
                $decomposition["U+$char"] = explode(' ', $translit);
            }
            fclose($fh);
        }
    }

    // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
    foreach ($decomposition as $from => $to) {
        $code_decomp = array();

        while ($code_value = array_shift($to)) {
            if (isset($decomposition["U+$code_value"])) {   // do recursive decomposition
                foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
                    array_unshift($to, $cv);
                }
            } elseif (!isset($mark["U+$code_value"])) { // remove mark
                array_push($code_decomp, $code_value);
            }
        }
        if (count($code_decomp) || isset($omit[$from])) {
            $decomposition[$from] = $code_decomp;
        } else {
            unset($decomposition[$from]);
        }
    }

    // create ascii only mapping
    $this->toASCII['utf-8'] = array();
    $ascii =& $this->toASCII['utf-8'];

    foreach ($decomposition as $from => $to) {
        $code_decomp = array();
        while ($code_value = array_shift($to)) {
            $ord = hexdec($code_value);
            if ($ord > 127) {
                continue 2; // skip decompositions containing non-ASCII chars
            }
            array_push($code_decomp, chr($ord));
        }
        // hexdec() ignores the "U+" prefix of the key
        $ascii[$this->UnumberToChar(hexdec($from))] = join('', $code_decomp);
    }

    // add numeric decompositions
    foreach ($number as $from => $to) {
        $utf8_char = $this->UnumberToChar(hexdec($from));
        if (!isset($ascii[$utf8_char])) {
            $ascii[$utf8_char] = $to;
        }
    }

    if ($cacheFileCase) {
        t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
    }

    if ($cacheFileASCII) {
        t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
    }

    return 3;
}
01252 
01253     /**
01254      * This function initializes the folding table for a charset other than UTF-8.
01255      * This function is automatically called by the case folding functions.
01256      *
01257      * @param   string      Charset for which to initialize case folding.
01258      * @return  integer     Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
01259      * @access private
01260      */
function initCaseFolding($charset) {
    // Only process if the case table is not yet loaded
    // (isset() first, so an unknown charset does not raise an "undefined index" notice):
    if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
        return 1;
    }

    // Use cached version if possible; a cache file that does not unserialize to an array is ignored.
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cached = unserialize(t3lib_div::getUrl($cacheFile));
        if (is_array($cached)) {
            $this->caseFolding[$charset] = $cached;
            return 2;
        }
    }

    // init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }

    // UTF-8 case folding is used as the base conversion table
    if (!$this->initUnicodeData('case')) {
        return false;
    }

    $nochar = chr($this->noCharByteVal);
    $folding = array();
    $cases = array('toUpper', 'toLower', 'toTitle');

    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);

        foreach ($cases as $case) {
            // most characters have no mapping for a given case; skip them without touching the table
            if (!isset($this->caseFolding['utf-8'][$case][$utf8])) {
                continue;
            }
            // keep the mapping only if the folded character exists in this charset as well
            $cc = $this->utf8_decode($this->caseFolding['utf-8'][$case][$utf8], $charset);
            if ($cc != '' && $cc != $nochar) {
                $folding[$case][$c] = $cc;
            }
        }
    }

    // add the ASCII case table
    for ($i = ord('a'); $i <= ord('z'); $i++) {
        $folding['toUpper'][chr($i)] = chr($i - 32);
    }
    for ($i = ord('A'); $i <= ord('Z'); $i++) {
        $folding['toLower'][chr($i)] = chr($i + 32);
    }

    $this->caseFolding[$charset] = $folding;

    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
    }

    return 3;
}
01314 
01315     /**
01316      * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
01317      * This function is automatically called by the ASCII transliteration functions.
01318      *
01319      * @param   string      Charset for which to initialize conversion.
01320      * @return  integer     Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
01321      * @access private
01322      */
function initToASCII($charset) {
    // Only process if the to-ASCII table is not yet loaded
    // (isset() first, so an unknown charset does not raise an "undefined index" notice):
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }

    // Use cached version if possible; a cache file that does not unserialize to an array is ignored.
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cached = unserialize(t3lib_div::getUrl($cacheFile));
        if (is_array($cached)) {
            $this->toASCII[$charset] = $cached;
            return 2;
        }
    }

    // init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }

    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData('ascii')) {
        return false;
    }

    // Start from an empty table, so a charset without any mapping is still stored (and cached)
    // as an array and is not rebuilt on every call.
    $mapping = array();
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        if (isset($this->toASCII['utf-8'][$utf8])) {
            // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
            $c = $this->utf8_decode($utf8, $charset);
            $mapping[$c] = $this->toASCII['utf-8'][$utf8];
        }
    }
    $this->toASCII[$charset] = $mapping;

    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
    }

    return 3;
}
01360 
01361 
01362 
01363 
01364 
01365 
01366 
01367 
01368 
01369 
01370 
01371 
01372 
01373 
01374 
01375 
01376     /********************************************
01377      *
01378      * String operation functions
01379      *
01380      ********************************************/
01381 
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Dispatches to mbstring or iconv when configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'], otherwise to the
 * internal UTF-8 / EUC implementations or to plain byte arithmetic for
 * fixed-width and single-byte charsets.
 *
 * @param   string      The character set
 * @param   string      Character string
 * @param   integer     Start position (character position)
 * @param   integer     Length (in characters); omit or pass NULL to read up to the end of the string
 * @return  string      The substring
 * @see substr(), mb_substr()
 * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null)  {
    if ($len === 0 || $string === '') {
        return '';
    }

        // Same loose test the original used to detect an omitted length
    $lengthOmitted = ($len == null);
    $utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

    if ($utils == 'mbstring')   {
            // The length cannot be omitted when a charset is given, so the full
            // character count is passed instead of switching the internal encoding.
        if ($lengthOmitted) {
            return mb_substr($string,$start,mb_strlen($string,$charset),$charset);
        }
        return mb_substr($string,$start,$len,$charset);
    } elseif ($utils == 'iconv')    {
            // Same approach as for mbstring (avoids iconv_set_encoding())
        if ($lengthOmitted) {
            return iconv_substr($string,$start,iconv_strlen($string,$charset),$charset);
        }
        return iconv_substr($string,$start,$len,$charset);
    } elseif ($charset == 'utf-8')  {
        return $this->utf8_substr($string,$start,$len);
    } elseif (!empty($this->eucBasedSets[$charset]))    {
        return $this->euc_substr($string,$start,$charset,$len);
    } elseif (!empty($this->twoByteSets[$charset])) {
            // An omitted length must not be multiplied: NULL*2 is 0 and would yield an empty string
        return $lengthOmitted ? substr($string,$start*2) : substr($string,$start*2,$len*2);
    } elseif (!empty($this->fourByteSets[$charset]))    {
        return $lengthOmitted ? substr($string,$start*4) : substr($string,$start*4,$len*4);
    }

        // treat everything else as single-byte encoding
    return $len === NULL ? substr($string,$start) : substr($string,$start,$len);
}
01438 
/**
 * Returns the length of a string measured in characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * The backend named in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
 * (mbstring or iconv) is preferred; otherwise the charset family decides
 * which internal counting routine is used.
 *
 * @param   string      The character set
 * @param   string      Character string
 * @return  integer     The number of characters
 * @see strlen()
 * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string)   {
    $backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

    if ($backend == 'mbstring') {
        return mb_strlen($string,$charset);
    }
    if ($backend == 'iconv')    {
        return iconv_strlen($string,$charset);
    }
    if ($charset == 'utf-8')    {
        return $this->utf8_strlen($string);
    }
    if ($this->eucBasedSets[$charset])  {
        return $this->euc_strlen($string,$charset);
    }

    $byteLength = strlen($string);
    if ($this->twoByteSets[$charset])   {
            // fixed width: two bytes per character
        return $byteLength/2;
    }
    if ($this->fourByteSets[$charset])  {
            // fixed width: four bytes per character
        return $byteLength/4;
    }

        // everything else is handled as a single-byte encoding
    return $byteLength;
}
01466 
/**
 * Crops a string with the help of the mbstring functions.
 *
 * A positive length keeps the beginning of the string and appends the crop
 * signifier; a negative length keeps the end and prepends it. The string is
 * returned untouched if the length is zero or the string is not longer than
 * the absolute length.
 *
 * @param  string       The character set
 * @param  string       String to be cropped
 * @param  integer      Crop length (in characters)
 * @param  string       Crop signifier
 * @return string       The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
    if (intval($len) === 0) {
        return $string;
    }

    $charCount = mb_strlen($string, $charset);
    if ($charCount <= abs($len)) {
            // nothing to cut off
        return $string;
    }

    return $len > 0
        ? mb_substr($string, 0, $len, $charset) . $crop
        : $crop . mb_substr($string, $len, $charCount, $charset);
}
01490 
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive length keeps the first $len characters and appends $crop, a
 * negative length keeps the last abs($len) characters and prepends $crop.
 * The string is returned unchanged when nothing has to be cut off.
 *
 * @param   string      The character set
 * @param   string      Character string
 * @param   integer     Length (in characters)
 * @param   string      Crop signifier
 * @return  string      The shortened string
 * @see substr(), mb_strimwidth()
 * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='')   {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return $this->cropMbstring($charset, $string, $len, $crop);
    }

    if (intval($len) == 0)  return $string;

        // Translate the character length into a byte position
    if ($charset == 'utf-8')    {
        $i = $this->utf8_char2byte_pos($string,$len);
    } elseif ($this->eucBasedSets[$charset])    {
        $i = $this->euc_char2byte_pos($string,$len,$charset);
    } else {
        if ($len > 0)   {
            $i = intval($len);
        } else {
            $i = strlen($string)+intval($len);
            if ($i<=0)  $i = false;
        }
    }

    if ($i === false)   {   // $len outside actual string length
        return $string;
    }

    $byteLength = strlen($string);
    if ($len > 0)   {
            // Only crop if there is at least one byte behind the cut position
        if ($i >= 0 && $i < $byteLength)    {
            return substr($string,0,$i).$crop;
        }
    } else {
            // Only crop if there is at least one byte in front of the cut position
        if ($i >= 1 && $i <= $byteLength)   {
            return $crop.substr($string,$i);
        }
    }

    return $string;
}
01549 
/**
 * Cuts a string short at a given byte length.
 *
 * For multi-byte charsets the cut position is moved back where needed so that
 * it does not fall into the middle of a character.
 *
 * @param   string      The character set
 * @param   string      Character string
 * @param   integer     The byte length
 * @return  string      The shortened string
 * @see mb_strcut()
 * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len)    {
    if ($len <= 0)  return '';

    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return mb_strcut($string,0,$len,$charset);
    } elseif ($charset == 'utf-8')  {
        return $this->utf8_strtrunc($string,$len);
    } elseif ($this->eucBasedSets[$charset])    {
            // euc_strtrunc() expects ($str, $len, $charset); the length must not be left out
        return $this->euc_strtrunc($string,$len,$charset);
    } elseif ($this->twoByteSets[$charset]) {
        $len -= $len % 2;   // don't cut at odd positions
    } elseif ($this->fourByteSets[$charset])    {
        $len -= $len % 4;   // realign to position dividable by four
    }

        // treat everything else as single-byte encoding
    return substr($string,0,$len);
}
01578 
/**
 * Converts every character of a string to lower or upper case.
 * In contrast to strtolower() and strtoupper() the result does not depend on the locale.
 * Be aware that the string length may change,
 * eg. lower case German ß (sharp S) becomes upper case "SS".
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param   string      Character set of string
 * @param   string      Input string to convert case for
 * @param   string      Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return  string      The converted string
 * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case)  {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return ($case == 'toLower')
            ? mb_strtolower($string,$charset)
            : mb_strtoupper($string,$charset);
    }

    if ($charset == 'utf-8')    {
        return $this->utf8_char_mapping($string,'case',$case);
    }

    if (isset($this->eucBasedSets[$charset]))   {
        return $this->euc_char_mapping($string,$charset,'case',$case);
    }

        // any other charset is handled as a single-byte encoding
    return $this->sb_char_mapping($string,$charset,'case',$case);
}
01612 
/**
 * Replaces special characters (accented letters, umlauts etc.) by their ASCII equivalents
 * (often two letters, like "ae" for a-umlaut).
 *
 * @param   string      Character set of string
 * @param   string      Input string to convert
 * @return  string      The converted string
 */
function specCharsToASCII($charset,$string) {
    if ($charset == 'utf-8')    {
        return $this->utf8_char_mapping($string,'ascii');
    }

    if (isset($this->eucBasedSets[$charset]))   {
        return $this->euc_char_mapping($string,$charset,'ascii');
    }

        // any other charset is handled as a single-byte encoding
    return $this->sb_char_mapping($string,$charset,'ascii');
}
01632 
01633 
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code.
 *
 * Languages are tried in order of descending quality value; languages with the
 * same quality keep the order in which the client listed them.
 *
 * @param   string      $languageCodesList  list of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 *          see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
 * @return  string      a preferred language that TYPO3 supports, or "default" if none found
 * @author  Benjamin Mack (benni.typo3.org)
 */
public function getPreferredClientLanguage($languageCodesList) {
    $allLanguageCodes = array();
    $selectedLanguage = 'default';

        // get all languages where TYPO3 code is the same as the ISO code
    foreach ($this->charSetArray as $typo3Lang => $charSet) {
        $allLanguageCodes[$typo3Lang] = $typo3Lang;
    }

        // get all languages where TYPO3 code differs from ISO code
        // or needs the country part
        // the iso codes will here overwrite the default typo3 language in the key
    foreach ($this->isoArray as $typo3Lang => $isoLang) {
        $allLanguageCodes[$typo3Lang] = str_replace('_', '-', $isoLang);
    }

        // move the iso codes to the keys (because we're comparing the keys with "isset" later on)
    $allLanguageCodes = array_flip($allLanguageCodes);

        // collect the quality value of every language sent by the client
    $qualityByLanguage = array();
    foreach (t3lib_div::trimExplode(',', $languageCodesList) as $preferredLanguage) {
        $quality = 1.0;
        if (strpos($preferredLanguage, ';q=') !== false) {
            list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
        }
        $qualityByLanguage[$preferredLanguage] = (float)$quality;
    }

        // order by quality (highest first); the original position is the tie-breaker,
        // because arsort() does not guarantee the order of equal elements
    $languages = array_keys($qualityByLanguage);
    $qualities = array_values($qualityByLanguage);
    $positions = array_keys($languages);
    if (count($languages) > 1) {
        array_multisort($qualities, SORT_DESC, SORT_NUMERIC, $positions, SORT_ASC, SORT_NUMERIC, $languages);
    }

        // loop through the languages, with the highest priority first
    foreach ($languages as $preferredLanguage) {
        $preferredLanguage = (string)$preferredLanguage;
        if (isset($allLanguageCodes[$preferredLanguage])) {
            $selectedLanguage = $allLanguageCodes[$preferredLanguage];
            break;
        }

            // strip the country code from the end
        $codeParts = explode('-', $preferredLanguage);
        $languageOnly = $codeParts[0];
        if (isset($allLanguageCodes[$languageOnly])) {
            $selectedLanguage = $allLanguageCodes[$languageOnly];
            break;
        }
    }

    if (!$selectedLanguage || $selectedLanguage == 'en') {
        $selectedLanguage = 'default';
    }
    return $selectedLanguage;
}
01694 
01695 
01696 
01697 
01698 
01699 
01700 
01701 
01702 
01703 
01704     /********************************************
01705      *
01706      * Internal string operation functions
01707      *
01708      ********************************************/
01709 
/**
 * Maps all characters of a string in a single byte charset.
 *
 * Every byte that has an entry in the selected table is replaced by that
 * entry, all other bytes are copied unchanged.
 *
 * @param   string      the string
 * @param   string      the charset
 * @param   string      mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param   string      'case': conversion 'toLower' or 'toUpper'
 * @return  string      the converted string
 * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str,$charset,$mode,$opt='')   {
    switch($mode)   {
        case 'case':
            if (!$this->initCaseFolding($charset))  return $str;    // do nothing
            $map =& $this->caseFolding[$charset][$opt];
            break;

        case 'ascii':
            if (!$this->initToASCII($charset))  return $str;    // do nothing
            $map =& $this->toASCII[$charset];
            break;

        default:
            return $str;
    }

    $str = (string)$str;
    $out = '';
        // Iterate by explicit length instead of probing offsets beyond the end of the string
    $byteCount = strlen($str);
    for ($i = 0; $i < $byteCount; $i++) {
        $c = $str[$i];
        $out .= isset($map[$c]) ? $map[$c] : $c;
    }

    return $out;
}
01748 
01749 
01750 
01751 
01752 
01753 
01754 
01755 
01756 
01757 
01758     /********************************************
01759      *
01760      * Internal UTF-8 string operation functions
01761      *
01762      ********************************************/
01763 
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param   string      UTF-8 string
 * @param   integer     Start position (character position)
 * @param   integer     Length (in characters)
 * @return  string      The substring; FALSE if a positive $start lies outside the string
 * @see substr()
 * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null) {
        // A length of 0 or '0' always results in an empty string
    if ($len !== null && (string)$len === '0')  return '';

    $byte_start = $this->utf8_char2byte_pos($str,$start);
    if ($byte_start === false)  {
        if ($start > 0) {
            return false;   // $start outside string length
        }
            // A negative start reaching beyond the beginning means "from the first byte"
        $byte_start = 0;
    }

    $str = substr($str,$byte_start);

    if ($len!=null) {
        $byte_end = $this->utf8_char2byte_pos($str,$len);
        if ($byte_end === false)    {   // $len outside actual string length
            return $len<0 ? '' : $str;  // When length is less than zero and exceeds, then we return blank string.
        }
        return substr($str,0,$byte_end);
    }

    return $str;
}
01798 
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * Every byte that is not a continuation byte (10xxxxxx) starts a character,
 * so counting those bytes yields the number of characters.
 *
 * @param   string      UTF-8 multibyte character string
 * @return  integer     The number of characters
 * @see strlen()
 * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str)  {
    $str = (string)$str;
    $byteCount = strlen($str);
    $n = 0;
    for ($i = 0; $i < $byteCount; $i++) {
            // single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
        if ((ord($str[$i]) & 0xC0) != 0x80) {
            $n++;
        }
    }
    return $n;
}
01819 
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * If the cut would split a multi-byte character, the string is cut in front
 * of that character instead. A character that ends exactly at the byte
 * length is kept.
 *
 * @param   string      UTF-8 multibyte character string
 * @param   integer     the byte length
 * @return  string      the shortened string
 * @see mb_strcut()
 * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len)   {
    $str = (string)$str;
    $len = intval($len);

        // Nothing to inspect: the cut is outside the string (or the length is not positive)
    if ($len <= 0 || $len >= strlen($str))  {
        return substr($str,0,$len);
    }

    $i = $len-1;
    if (ord($str[$i]) & 0x80)   {   // part of a multibyte sequence
            // walk back over continuation bytes (10xxxxxx) to the first byte of the sequence
        while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80)    {
            $i--;
        }

        $lead = ord($str[$i]);
        if (($lead & 0xC0) == 0xC0) {
                // the number of leading 1-bits is the number of bytes of the sequence
            for ($bc = 0; $lead & 0x80; $lead = $lead << 1) {
                $bc++;
            }
            if ($i+$bc > $len)  {
                return substr($str,0,$i);   // character does not fit, cut in front of it
            }
        }
        // fallthru: multibyte char fits into length
    }

    return substr($str,0,$len);
}
01840 
/**
 * Returns the character position of the first occurrence of a string inside
 * another string; both strings are expected to be UTF-8.
 *
 * @param   string      UTF-8 string to search in
 * @param   string      UTF-8 string to search for
 * @param   integer     Position (in characters) to start the search
 * @return  integer     The character position
 * @see strpos()
 * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0)   {
    $backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
    if ($backend == 'mbstring') {
        return mb_strpos($haystack,$needle,$offset,'utf-8');
    }
    if ($backend == 'iconv')    {
        return iconv_strpos($haystack,$needle,$offset,'utf-8');
    }

        // character offset -> byte offset
    $startByte = $this->utf8_char2byte_pos($haystack,$offset);
    if ($startByte === false)   {
        return false;   // offset beyond string length
    }

    $foundByte = strpos($haystack,$needle,$startByte);
    if ($foundByte === false)   {
        return false;   // needle not found
    }

        // byte position -> character position
    return $this->utf8_byte2char_pos($haystack,$foundByte);
}
01866 
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param   string      UTF-8 string to search in
 * @param   string      UTF-8 character to search for (single character)
 * @return  integer     The character position
 * @see strrpos()
 * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle)    {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
            // The third argument of mb_strrpos() is the offset, the encoding is the fourth
        return mb_strrpos($haystack,$needle,0,'utf-8');
    } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')  {
        return iconv_strrpos($haystack,$needle,'utf-8');
    }

    $byte_pos = strrpos($haystack,$needle);
    if ($byte_pos === false)    return false; // needle not found

    return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
01888 
/**
 * Translates a character position into an 'absolute' byte position.
 * Unit tested by Kasper.
 *
 * For a position >= 0 the string is scanned from the start, for a negative
 * position from the end. FALSE is returned when the scan leaves the string
 * before reaching the position; this includes a position equal to the number
 * of characters when the string ends with a single-byte character. If the
 * string ends with a multi-byte character, that position yields the byte
 * length of the string instead.
 *
 * @param   string      UTF-8 string
 * @param   integer     Character position (negative values start from the end)
 * @return  integer     Byte position, or FALSE if the position is outside the string
 * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos)  {
    $str = (string)$str;
    $byteCount = strlen($str);
    $n = 0;             // number of characters found
    $p = abs($pos);     // number of characters wanted

    if ($pos >= 0)  {
        $i = 0;
        $d = 1;
    } else {
        $i = $byteCount-1;
        $d = -1;
    }

        // Bounds are checked explicitly: negative string offsets are valid in
        // newer PHP versions and must not be used to detect the start of the string.
    for ( ; $i >= 0 && $i < $byteCount && $n < $p; $i += $d)    {
            // single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
        if ((ord($str[$i]) & 0xC0) != 0x80) {
            $n++;
        }
    }
    if ($i < 0 || $i >= $byteCount) return false; // offset beyond string length

    if ($pos >= 0)  {
            // skip trailing multi-byte data bytes
        while ($i < $byteCount && (ord($str[$i]) & 0xC0) == 0x80)   {
            $i++;
        }
    } else {
            // correct offset
        $i++;
    }

    return $i;
}
01929 
/**
 * Translates an 'absolute' byte position into a character position.
 * Unit tested by Kasper.
 *
 * The character position equals the number of character start bytes found
 * between byte 1 and the given byte position (both included).
 *
 * @param   string      UTF-8 string
 * @param   integer     byte position
 * @return  integer     character position, or FALSE if the byte position is outside the string
 * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos)  {
    $str = (string)$str;

        // Validate first: counting beyond the end would treat missing bytes as characters
    if ($pos < 0 || $pos >= strlen($str))   return false; // offset beyond string length

    $n = 0; // number of characters
    for ($i = $pos; $i > 0; $i--)   {
            // single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
        if ((ord($str[$i]) & 0xC0) != 0x80) {
            $n++;
        }
    }

    return $n;
}
01952 
/**
 * Maps all characters of an UTF-8 string.
 *
 * Each character that has an entry in the selected table is replaced by that
 * entry, all other characters are copied unchanged. A continuation byte that
 * does not belong to a preceding start byte is copied as it is.
 *
 * @param   string      UTF-8 string
 * @param   string      mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param   string      'case': conversion 'toLower' or 'toUpper'
 * @return  string      the converted string
 * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str,$mode,$opt='')  {
    if (!$this->initUnicodeData($mode)) return $str;    // do nothing

    switch($mode)   {
        case 'case':
            $map =& $this->caseFolding['utf-8'][$opt];
            break;

        case 'ascii':
            $map =& $this->toASCII['utf-8'];
            break;

        default:
            return $str;
    }

    $str = (string)$str;
    $out = '';
    $byteCount = strlen($str);
    for ($i = 0; $i < $byteCount; $i++) {
        $c = ord($str[$i]);
        if (($c & 0xC0) == 0xC0)    {   // multi-byte starting byte (11xxxxxx)
            for ($bc = 0; $c & 0x80; $c = $c << 1)  { $bc++; }  // calculate number of bytes
            $mbc = substr($str,$i,$bc);
            $i += $bc-1;
        } else {
                // single-byte (0xxxxxxx), or a stray continuation byte which is passed through
            $mbc = $str[$i];
        }

        $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
    }

    return $out;
}
01998 
01999 
02000 
02001 
02002 
02003 
02004 
02005 
02006 
02007 
02008 
02009 
02010 
02011 
02012 
02013 
02014 
02015 
02016     /********************************************
02017      *
02018      * Internal EUC string operation functions
02019      *
02020      * Extended Unix Code:
02021      *  ASCII compatible 7bit single bytes chars
02022      *  8bit two byte chars
02023      *
02024      * Shift-JIS is treated as a special case.
02025      *
02026      ********************************************/
02027 
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * If the cut would split a double-byte character, the string is cut in front
 * of that character instead.
 *
 * @param   string      EUC multibyte character string
 * @param   integer     the byte length
 * @param   string      the charset
 * @return  string      the shortened string
 * @see mb_strcut()
 * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset)    {
    $str = (string)$str;
    $byteCount = strlen($str);

        // Decide by the real byte length; the scan position below may run past
        // the end of the string when the last character is a double-byte one.
    if ($byteCount == 0 || $byteCount <= $len)  return $str;    // string not longer than supplied length

    $sjis = ($charset == 'shift_jis');
    for ($i = 0; $i < $len; $i++)   {
        $c = ord($str[$i]);
        if ($sjis)  {
            if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i++;   // advance a double-byte char
        } elseif ($c >= 0x80)   {
            $i++;   // advance a double-byte char
        }
    }

    if ($i > $len)  {
        return substr($str,0,$len-1);   // we ended on a first byte
    }
    return substr($str,0,$len);
}
02057 
/**
 * Returns a part of a string in the EUC charset family.
 *
 * Out-of-range arguments are handled like in utf8_substr(): a negative start
 * reaching beyond the beginning starts at the first character, and a negative
 * length that exceeds the remaining string yields an empty string.
 *
 * @param   string      EUC multibyte character string
 * @param   integer     start position (character position)
 * @param   string      the charset
 * @param   integer     length (in characters)
 * @return  string      the substring; FALSE if a positive $start lies outside the string
 * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null) {
        // A length of 0 or '0' always results in an empty string
    if ($len !== null && (string)$len === '0')  return '';

    $byte_start = $this->euc_char2byte_pos($str,$start,$charset);
    if ($byte_start === false)  {
        if ($start > 0) {
            return false;   // $start outside string length
        }
            // A negative start reaching beyond the beginning means "from the first byte"
        $byte_start = 0;
    }

    $str = substr($str,$byte_start);

    if ($len!=null) {
        $byte_end = $this->euc_char2byte_pos($str,$len,$charset);
        if ($byte_end === false)    {   // $len outside actual string length
            return $len<0 ? '' : $str;
        }
        return substr($str,0,$byte_end);
    }

    return $str;
}
02083 
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * A byte that starts a double-byte character consumes the following byte as
 * well; for 'shift_jis' only the bytes 0x80-0x9F and 0xE0-0xFF are treated as
 * such start bytes, for all other charsets every byte >= 0x80.
 *
 * @param   string      EUC multibyte character string
 * @param   string      the charset
 * @return  integer     the number of characters
 * @see strlen()
 * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset)   {
    $str = (string)$str;
    $sjis = ($charset == 'shift_jis');
    $byteCount = strlen($str);
    $n = 0;
    for ($i = 0; $i < $byteCount; $i++) {
        $c = ord($str[$i]);
        if ($sjis)  {
            if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i++;   // advance a double-byte char
        } elseif ($c >= 0x80)   {
            $i++;   // advance a double-byte char
        }

        $n++;
    }

    return $n;
}
02110 
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For a position >= 0 the string is scanned from the start, for a negative
 * position from the end. FALSE is returned when the scan leaves the string
 * before reaching the position.
 *
 * @param   string      EUC multibyte character string
 * @param   integer     character position (negative values start from the end)
 * @param   string      the charset
 * @return  integer     byte position, or FALSE if the position is outside the string
 * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset)  {
    $str = (string)$str;
    $sjis = ($charset == 'shift_jis');
    $byteCount = strlen($str);
    $n = 0; // number of characters seen
    $p = abs($pos); // number of characters wanted

    if ($pos >= 0)  {
        $i = 0;
        $d = 1;
    } else {
        $i = $byteCount-1;
        $d = -1;
    }

        // Bounds are checked explicitly: negative string offsets are valid in
        // newer PHP versions and must not be used to detect the start of the string.
    for ( ; $i >= 0 && $i < $byteCount && $n < $p; $i += $d)    {
        $c = ord($str[$i]);
        if ($sjis)  {
            if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i += $d;   // advance a double-byte char
        } elseif ($c >= 0x80)   {
            $i += $d;   // advance a double-byte char
        }

        $n++;
    }
    if ($i < 0 || $i >= $byteCount) return false; // offset beyond string length

    if ($pos < 0)   $i++;   // correct offset

    return $i;
}
02150 
02151     /**
02152      * Maps all characters of a string in the EUC charset family.
02153      *
02154      * @param   string      EUC multibyte character string
02155      * @param   string      the charset
02156      * @param   string      mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
02157      * @param   string      'case': conversion 'toLower' or 'toUpper'
02158      * @return  string      the converted string
02159      * @author  Martin Kutschker <martin.t.kutschker@blackbox.net>
02160      */
02161     function euc_char_mapping($str,$charset,$mode,$opt='')  {
02162         switch($mode)   {
02163             case 'case':
02164                 if (!$this->initCaseFolding($charset))  return $str;    // do nothing
02165                 $map =& $this->caseFolding[$charset][$opt];
02166                 break;
02167 
02168             case 'ascii':
02169                 if (!$this->initToASCII($charset))  return $str;    // do nothing
02170                 $map =& $this->toASCII[$charset];
02171                 break;
02172 
02173             default:
02174                 return $str;
02175         }
02176 
02177         $sjis = ($charset == 'shift_jis');
02178         $out = '';
02179         for($i=0; strlen($str{$i}); $i++)   {
02180             $mbc = $str{$i};
02181             $c = ord($mbc);
02182 
02183             if ($sjis)  {
02184                 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  {   // a double-byte char
02185                     $mbc = substr($str,$i,2);
02186                     $i++;
02187                 }
02188             }
02189             else    {
02190                 if ($c >= 0x80) {   // a double-byte char
02191                     $mbc = substr($str,$i,2);
02192                     $i++;
02193                 }
02194             }
02195 
02196             if (isset($map[$mbc]))  {
02197                 $out .= $map[$mbc];
02198             } else {
02199                 $out .= $mbc;
02200             }
02201         }
02202 
02203         return $out;
02204     }
02205 
02206 }
02207 
// Load the file registered for this script under the 'XCLASS' key of
// $TYPO3_CONF_VARS, if any. !empty() keeps the original truthiness test but
// does not raise an "undefined index" notice when nothing is registered.
if (defined('TYPO3_MODE') && !empty($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']))   {
    include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}
02211 
02212 ?>

Generated on Sat Jul 24 04:17:16 2010 for TYPO3 API by doxygen 1.4.7