|
TYPO3 API
SVNRelease
|
<?php
/***************************************************************
 *  Copyright notice
 *
 *  (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
 *  All rights reserved
 *
 *  This script is part of the TYPO3 project. The TYPO3 project is
 *  free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  The GNU General Public License can be found at
 *  http://www.gnu.org/copyleft/gpl.html.
 *  A copy is found in the textfile GPL.txt and important notices to the license
 *  from the author is found in LICENSE.txt distributed with these scripts.
 *
 *
 *  This script is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  This copyright notice MUST APPEAR in all copies of the script!
 ***************************************************************/
/**
 * Lexer for indexed_search
 *
 * @author	Kasper Skårhøj <kasperYYYY@typo3.com>
 * Parts provided by Martin Kutschker <Martin.T.Kutschker@blackbox.net>
 */
/**
 * [CLASS/FUNCTION INDEX of SCRIPT]
 *
 *   class tx_indexedsearch_lexer
 *     function __construct()
 *     function tx_indexedsearch_lexer()
 *     function split2Words($wordString)
 *
 *     SECTION: Helper functions
 *     function addWords(&$words, &$wordString, $start, $len)
 *     function get_word(&$str, $pos=0)
 *     function utf8_is_letter(&$str, &$len, $pos=0)
 *     function charType($cp)
 *     function utf8_ord(&$str, &$len, $pos=0, $hex=false)
 *
 * TOTAL FUNCTIONS: 8
 * (This index is automatically created/updated by the extension "extdeveval")
 *
 */


/**
 * Lexer class for indexed_search
 * A lexer splits the text into words
 *
 * @author	Kasper Skårhøj <kasperYYYY@typo3.com>
 * @package TYPO3
 * @subpackage tx_indexedsearch
 */
class tx_indexedsearch_lexer {

		// Debugging options:
	var $debug = FALSE;		// If set, the debugString is filled with HTML output highlighting search / non-search words (for backend display)
	var $debugString = '';

	/**
	 * Charset class object
	 *
	 * @var t3lib_cs
	 */
	var $csObj;


		// Configuration of the lexer:
	var $lexerConf = array(
		'printjoins' => array(	// This is the Unicode numbers of chars that are allowed INSIDE a sequence of letter chars (alphanum + CJK)
			0x2e,	// "."
			0x2d,	// "-"
			0x5f,	// "_"
			0x3a,	// ":"
			0x2f,	// "/"
			0x27,	// "'"
			// 0x615,	// ARABIC SMALL HIGH TAH
		),
		'casesensitive' => FALSE,	// Set, if case sensitive indexing is wanted.
		'removeChars' => array(		// List of unicode numbers of chars that will be removed before words are returned (eg. "-")
			0x2d	// "-"
		)
	);


	/**
	 * Constructor: Initializes the charset class, t3lib_cs
	 *
	 * Declared before the legacy same-name constructor so that PHP versions
	 * which no longer treat a method named like the class as a constructor
	 * still initialise $csObj.
	 *
	 * @return	void
	 */
	function __construct() {
		$this->csObj = t3lib_div::makeInstance('t3lib_cs');
	}

	/**
	 * Legacy (PHP 4 style) constructor, kept so that existing code calling
	 * parent::tx_indexedsearch_lexer() keeps working.
	 * Performs the same initialisation as __construct() without delegating
	 * to it, so an overridden __construct() in a subclass cannot recurse.
	 *
	 * @return	void
	 */
	function tx_indexedsearch_lexer() {
		$this->csObj = t3lib_div::makeInstance('t3lib_cs');
	}

	/**
	 * Splitting string into words.
	 * Used for indexing, can also be used to find words in query.
	 *
	 * Unless $lexerConf['casesensitive'] is set, the input is lowercased via
	 * $csObj->conv_case() first. If $debug is set, $debugString is filled with
	 * an HTML rendering where skipped (non-word) stretches are wrapped in a red
	 * <span>; text following the last word is not added to it.
	 *
	 * @param	string		String with UTF-8 content to process.
	 * @return	array		Array of words in utf-8
	 */
	function split2Words($wordString) {

			// Reset debug string:
		$this->debugString = '';

			// Then convert the string to lowercase:
		if (!$this->lexerConf['casesensitive']) {
			$wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');
		}

			// Now, splitting words:
		$pos = 0;
		$words = array();

		while (TRUE) {
				// get_word() returns FALSE when nothing but non-word chars (or nothing at all) is left:
			$found = $this->get_word($wordString, $pos);
			if (!is_array($found)) {
				break;
			}

			list($start, $len) = $found;
			if (!$len) {
				break;
			}

			$this->addWords($words, $wordString, $start, $len);

			if ($this->debug) {
				$this->debugString .= '<span style="color:red">' . htmlspecialchars(substr($wordString, $pos, $start - $pos)) . '</span>' .
					htmlspecialchars(substr($wordString, $start, $len));
			}

				// Continue right behind the word just found:
			$pos = $start + $len;
		}

		return $words;
	}


	/**********************************
	 *
	 * Helper functions
	 *
	 ********************************/


	/**
	 * Add word to word-array
	 * This function should be used to make sure CJK sequences are split up in the right way
	 *
	 * The type of the FIRST character of the word decides which branch is taken:
	 * a word starting with a CJK character is split into overlapping pairs,
	 * anything else is added as one word after the chars listed in
	 * $lexerConf['removeChars'] have been stripped from it.
	 *
	 * @param	array		Array of accumulated words (reference, appended to)
	 * @param	string		Complete Input string from where to extract word (reference, not modified)
	 * @param	integer		Start position of word in input string (byte offset)
	 * @param	integer		The Length of the word string from start position (in bytes)
	 * @return	void
	 */
	function addWords(&$words, &$wordString, $start, $len) {

			// Get word out of string:
		$theWord = substr($wordString, $start, $len);

			// Get the unicode number of the first char and find its type:
		$bc = 0;
		$cp = $this->utf8_ord($theWord, $bc);
		list($cType) = $this->charType($cp);

			// If string is a CJK sequence we follow this algorithm:
			/*
				DESCRIPTION OF (CJK) ALGORITHM

				Continuous letters and numbers make up words. Spaces and symbols
				separate letters and numbers into words. This is sufficient for
				all western text.

				CJK doesn't use spaces or separators to separate words, so the only
				way to really find out what constitutes a word would be to have a
				dictionary and advanced heuristics. Instead, we form pairs from
				consecutive characters, in such a way that searches will find only
				characters that appear more-or-less the right sequence. For example:

					ABCDE => AB BC CD DE

				This works okay since both the index and the search query is split
				in the same manner, and since the set of characters is huge so the
				extra matches are not significant.

				(Hint taken from ZOPEs chinese user group)

				[Kasper: As far as I can see this will only work well with or-searches!]
			*/
		if ($cType == 'cjk') {
				// Find total string length (in characters):
			$strlen = $this->csObj->utf8_strlen($theWord);

				// Traverse string length and add words as pairs of two chars.
				// A single-character word is added as it is; for longer words the
				// last character only appears as the second half of the final pair.
			for ($a = 0; $a < $strlen; $a++) {
				if ($strlen == 1 || $a < $strlen - 1) {
					$words[] = $this->csObj->utf8_substr($theWord, $a, 2);
				}
			}
		} else {	// Normal "single-byte" chars:
				// Remove chars:
			foreach ($this->lexerConf['removeChars'] as $skipJoin) {
				$theWord = str_replace($this->csObj->UnumberToChar($skipJoin), '', $theWord);
			}
				// Add word:
			$words[] = $theWord;
		}
	}

	/**
	 * Get the first word in a given utf-8 string (initial non-letters will be skipped)
	 *
	 * @param	string		Input string (reference, not modified)
	 * @param	integer		Starting position in input string (byte offset)
	 * @return	array		0: start, 1: len or false if no word has been found
	 */
	function get_word(&$str, $pos=0) {

		$len = 0;

			// If return is true, a word was found starting at this position, so returning position and length:
		if ($this->utf8_is_letter($str, $len, $pos)) {
			return array($pos, $len);
		}

			// If the return value was false it means a sequence of non-word chars were found (or blank string) - so we will start another search for the word:
		$pos += $len;
		if (!isset($str[$pos])) {
				// check end of string before looking for word of course.
			return FALSE;
		}

		$this->utf8_is_letter($str, $len, $pos);
		return array($pos, $len);
	}

	/**
	 * See if a character is a letter (or a string of letters or non-letters).
	 *
	 * Starting at $pos the function consumes either one word or one stretch of
	 * non-word characters, whichever the first character belongs to, and
	 * reports the consumed byte length in $len. Characters are classified one
	 * step behind being read: each loop round first evaluates the character
	 * fetched in the previous round and only then adds its byte length to $len.
	 *
	 * Within a word, chars listed in $lexerConf['printjoins'] are tolerated;
	 * when the word is ended by a non-letter, print-join chars directly before
	 * that terminator are cut off again.
	 * NOTE(review): when the word is ended by end-of-string instead, $len is
	 * returned as accumulated, so print-join chars directly before the end of
	 * the string stay part of the word ("foo." yields length 4) - confirm
	 * whether that is intended before changing it, since it affects what gets
	 * indexed.
	 *
	 * @param	string		Input string (reference, not modified)
	 * @param	integer		Byte-length of character sequence (reference, return value)
	 * @param	integer		Starting position in input string (byte offset)
	 * @return	boolean		letter (or word) found
	 */
	function utf8_is_letter(&$str, &$len, $pos=0) {

		$len = 0;
		$bc = 0;						// Byte length of the most recently read char
		$cp = 0;						// Unicode number of the most recently read char
		$printJoinLgd = 0;				// Length of the word before a trailing run of print-join chars (0 = none pending)
		$cType = $cType_prev = FALSE;	// Letter type
		$letter = TRUE;					// looking for a letter?

		if (!isset($str[$pos])) {
				// Return false on end-of-string at this stage
			return FALSE;
		}

		while (TRUE) {

				// If characters has been obtained we will know whether the string starts as a sequence of letters or not:
			if ($len) {
				if ($letter) {	// We are in a sequence of words
					if (!$cType 	// The char was NOT a letter
							|| ($cType_prev == 'cjk' && t3lib_div::inList('num,alpha', $cType)) || ($cType == 'cjk' && t3lib_div::inList('num,alpha', $cType_prev))	// ... or the previous and current char are from single-byte sets vs. asian CJK sets
							) {
							// Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
						if (!in_array($cp, $this->lexerConf['printjoins'])) {
								// If a printjoin start length has been record, set that back now so the length is right (filtering out multiple end chars)
							if ($printJoinLgd) {
								$len = $printJoinLgd;
							}
							return TRUE;
						} else {	// If a printJoin char is found, record the length if it has not been recorded already:
							if (!$printJoinLgd) {
								$printJoinLgd = $len;
							}
						}
					} else {	// When a true letter is found, reset printJoinLgd counter:
						$printJoinLgd = 0;
					}
				}
				elseif (!$letter && $cType) {	// end of non-word reached
					return FALSE;
				}
			}
			$len += $bc;	// add byte-length of last found character

			if (!isset($str[$pos])) {
					// end of string; return status of string till now
				return $letter;
			}

				// Get next chars unicode number:
			$cp = $this->utf8_ord($str, $bc, $pos);
			$pos += $bc;

				// Determine the type:
			$cType_prev = $cType;
			list($cType) = $this->charType($cp);
			if ($cType) {
				continue;
			}

				// Setting letter to false if the first char was not a letter!
			if (!$len) {
				$letter = FALSE;
			}
		}

		return FALSE;
	}

	/**
	 * Determine the type of character
	 *
	 * @param	integer		Unicode number to evaluate
	 * @return	array		Type of char; index-0: the main type: num, alpha or CJK (Chinese / Japanese / Korean). NULL (not an array) if the char belongs to none of these.
	 */
	function charType($cp) {

			// Numeric?
		if (
				($cp >= 0x30 && $cp <= 0x39)		// Arabic
				/*
				($cp >= 0x660 && $cp <= 0x669) ||	// Arabic-Indic
				($cp >= 0x6F0 && $cp <= 0x6F9) ||	// Arabic-Indic (Iran, Pakistan, and India)
				($cp >= 0x3021 && $cp <= 0x3029) ||	// Hangzhou
				*/
			) {
			return array('num');
		}

			// LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
		if (
				($cp >= 0x41 && $cp <= 0x5A) ||		// Basic Latin: capital letters
				($cp >= 0x61 && $cp <= 0x7A) ||		// Basic Latin: small letters
				($cp >= 0xC0 && $cp <= 0xFF && $cp != 0xD7 && $cp != 0xF7) || 			// Latin-1 Supplement (0x80-0xFF) excluding multiplication and division sign
				($cp >= 0x100 && $cp < 0x280) ||	// Latin Extended-A and -B
				($cp == 0x386 || ($cp >= 0x388 && $cp < 0x400)) || // Greek and Coptic excluding non-letters
				(($cp >= 0x400 && $cp < 0x482) || ($cp >= 0x48A && $cp < 0x530)) ||		// Cyrillic and Cyrillic Supplement excluding historic miscellaneous
				(($cp >= 0x590 && $cp < 0x5B0) || ($cp >= 0x5D0 && $cp < 0x5F3)) ||		// Hebrew: only accents and letters
				(($cp >= 0x621 && $cp <= 0x658) || ($cp >= 0x66E &&  $cp <= 0x6D3)) ||	// Arabic: only letters (there are more letters in the range, we can add them if there is a demand)
				($cp >= 0x1E00 && $cp < 0x2000)		// Latin Extended Additional and Greek Extended
			) {
			return array('alpha');
		}

			// Looking for CJK (Chinese / Japanese / Korean)
			// Ranges are not certain - deducted from the translation tables in t3lib/csconvtbl/
			// Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
		if (
				($cp >= 0x3040 && $cp <= 0x30FF) ||		// HIRAGANA and KATAKANA letters
				($cp >= 0x3130 && $cp <= 0x318F) ||		// Hangul Compatibility Jamo
				($cp >= 0x3400 && $cp <= 0x4DBF) ||		// CJK Unified Ideographs Extension A
				($cp >= 0x4E00 && $cp <= 0x9FAF) ||		// CJK Unified Ideographs
				($cp >= 0xAC00 && $cp <= 0xD7AF) ||		// Hangul Syllables
				($cp >= 0x20000 && $cp <= 0x2FA1F)		// CJK Unified Ideographs Extension B and CJK Compatibility Ideographs Supplement
				// also include CJK and Kangxi radicals or Bopomofo letter?
			) {
			return array('cjk');
		}

			// Not a word character. Callers unpack the result with list(), which yields NULL for the type here.
		return NULL;
	}

	/**
	 * Converts a UTF-8 multibyte character to a UNICODE codepoint
	 *
	 * The number of leading 1-bits in the first byte determines how many
	 * following bytes are merged in; the input is not validated. Bytes up to
	 * and including 0x80 are returned as they are with a length of 1.
	 * Positions outside the string are read as byte value 0, so a sequence cut
	 * off by the end of the string still reports the length announced by its
	 * lead byte.
	 *
	 * @param	string		UTF-8 multibyte character string (reference, not modified)
	 * @param	integer		The length of the character in bytes (reference, return value)
	 * @param	integer		Starting position in input string (byte offset)
	 * @param	boolean		If set, then a hex. number is returned (as string prefixed with "x")
	 * @return	integer		UNICODE codepoint
	 */
	function utf8_ord(&$str, &$len, $pos=0, $hex=false) {
		$ord = isset($str[$pos]) ? ord($str[$pos]) : 0;
		$len = 1;

		if ($ord > 0x80) {
				// calculate number of extra bytes
			for ($bc = -1, $mbs = $ord; $mbs & 0x80; $mbs = $mbs << 1) {
				$bc++;
			}
			$len += $bc;

				// mask utf-8 lead-in bytes; lead byte 0xFF gives $bc = 7, which must not become a negative shift
			$payloadBits = 6 - $bc;
			$ord = $payloadBits > 0 ? $ord & ((1 << $payloadBits) - 1) : 0;

				// "bring in" data bytes
			for ($i = $pos + 1; $bc; $bc--, $i++) {
				$ord = ($ord << 6) | (isset($str[$i]) ? ord($str[$i]) & 0x3F : 0);
			}
		}

		return $hex ? 'x' . dechex($ord) : $ord;
	}
}


if (defined('TYPO3_MODE') && isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php'])) {
	include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php']);
}
?>
1.8.0