TYPO3 API  SVNRelease
class.lexer.php
Go to the documentation of this file.
00001 <?php
00002 /***************************************************************
00003 *  Copyright notice
00004 *
00005 *  (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
00006 *  All rights reserved
00007 *
00008 *  This script is part of the TYPO3 project. The TYPO3 project is
00009 *  free software; you can redistribute it and/or modify
00010 *  it under the terms of the GNU General Public License as published by
00011 *  the Free Software Foundation; either version 2 of the License, or
00012 *  (at your option) any later version.
00013 *
00014 *  The GNU General Public License can be found at
00015 *  http://www.gnu.org/copyleft/gpl.html.
00016 *  A copy is found in the textfile GPL.txt and important notices to the license
00017 *  from the author is found in LICENSE.txt distributed with these scripts.
00018 *
00019 *
00020 *  This script is distributed in the hope that it will be useful,
00021 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00022 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00023 *  GNU General Public License for more details.
00024 *
00025 *  This copyright notice MUST APPEAR in all copies of the script!
00026 ***************************************************************/
00027 /**
00028  * Lexer for indexed_search
00029  *
00030  * @author  Kasper Skårhøj <kasperYYYY@typo3.com>
00031  * Parts provided by Martin Kutschker <Martin.T.Kutschker@blackbox.net>
00032  */
00033 /**
00034  * [CLASS/FUNCTION INDEX of SCRIPT]
00035  *
00036  *
00037  *
00038  *   73: class tx_indexedsearch_lexer
00039  *  105:     function tx_indexedsearch_lexer()
00040  *  116:     function split2Words($wordString)
00041  *
00042  *              SECTION: Helper functions
00043  *  178:     function addWords(&$words, &$wordString, $start, $len)
00044  *  239:     function get_word(&$str, $pos=0)
00045  *  264:     function utf8_is_letter(&$str, &$len, $pos=0)
00046  *  329:     function charType($cp)
00047  *  383:     function utf8_ord(&$str, &$len, $pos=0, $hex=false)
00048  *
00049  * TOTAL FUNCTIONS: 7
00050  * (This index is automatically created/updated by the extension "extdeveval")
00051  *
00052  */
00053 
00054 
00055 
00056 
00057 
00058 
00059 
00060 
00061 
00062 
00063 
00064 
00065 /**
00066  * Lexer class for indexed_search
00067  * A lexer splits the text into words
00068  *
00069  * @author  Kasper Skårhøj <kasperYYYY@typo3.com>
00070  * @package TYPO3
00071  * @subpackage tx_indexedsearch
00072  */
class tx_indexedsearch_lexer {

    /**
     * If set, $debugString is filled with HTML output highlighting
     * search / non-search words (for backend display).
     *
     * @var boolean
     */
    public $debug = FALSE;

    /**
     * HTML debug output collected by split2Words() when $debug is set.
     *
     * @var string
     */
    public $debugString = '';

    /**
     * Charset class object
     *
     * @var t3lib_cs
     */
    public $csObj;

    /**
     * Configuration of the lexer:
     * - printjoins: Unicode numbers of chars that are allowed INSIDE a sequence of letter chars (alphanum + CJK)
     * - casesensitive: Set, if case sensitive indexing is wanted.
     * - removeChars: Unicode numbers of chars that will be removed before words are returned (eg. "-")
     *
     * @var array
     */
    public $lexerConf = array(
        'printjoins' => array(
            0x2e,   // "."
            0x2d,   // "-"
            0x5f,   // "_"
            0x3a,   // ":"
            0x2f,   // "/"
            0x27,   // "'"
            // 0x615,   // ARABIC SMALL HIGH TAH
        ),
        'casesensitive' => FALSE,
        'removeChars' => array(
            0x2d    // "-"
        )
    );


    /**
     * Legacy (PHP 4 style) constructor name.
     * Kept as a regular method so subclasses that call
     * parent::tx_indexedsearch_lexer() keep working; it simply delegates
     * to __construct(). It is declared before __construct() on purpose.
     *
     * @return  void
     */
    public function tx_indexedsearch_lexer() {
        $this->__construct();
    }

    /**
     * Constructor: Initializes the charset class, t3lib_cs
     *
     * @return  void
     */
    public function __construct() {
        $this->csObj = t3lib_div::makeInstance('t3lib_cs');
    }

    /**
     * Splitting string into words.
     * Used for indexing, can also be used to find words in query.
     *
     * @param   string      String with UTF-8 content to process.
     * @return  array       Array of words in utf-8
     */
    public function split2Words($wordString) {

            // Reset debug string:
        $this->debugString = '';

            // Convert the string to lowercase unless case sensitive indexing is configured:
        if (!$this->lexerConf['casesensitive']) {
            $wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');
        }

            // Now, splitting words:
        $pos = 0;
        $words = array();

        while (TRUE) {
            $found = $this->get_word($wordString, $pos);
                // get_word() returns FALSE when no further word exists:
            if (!is_array($found)) {
                break;
            }
            list($start, $len) = $found;
                // A zero length would never advance $pos, so stop here as well:
            if (!$len) {
                break;
            }

            $this->addWords($words, $wordString, $start, $len);

            if ($this->debug) {
                    // Skipped (non-word) bytes are shown in red, the word itself unstyled:
                $this->debugString .= '<span style="color:red">' . htmlspecialchars(substr($wordString, $pos, $start - $pos)) . '</span>' .
                                    htmlspecialchars(substr($wordString, $start, $len));
            }

            $pos = $start + $len;
        }

        return $words;
    }


    /**********************************
     *
     * Helper functions
     *
     ********************************/


    /**
     * Add word to word-array
     * This function should be used to make sure CJK sequences are split up in the right way
     *
     * DESCRIPTION OF (CJK) ALGORITHM
     *
     * Continuous letters and numbers make up words. Spaces and symbols
     * separate letters and numbers into words. This is sufficient for
     * all western text.
     *
     * CJK doesn't use spaces or separators to separate words, so the only
     * way to really find out what constitutes a word would be to have a
     * dictionary and advanced heuristics. Instead, we form pairs from
     * consecutive characters, in such a way that searches will find only
     * characters that appear more-or-less the right sequence. For example:
     *
     *     ABCDE => AB BC CD DE
     *
     * This works okay since both the index and the search query is split
     * in the same manner, and since the set of characters is huge so the
     * extra matches are not significant.
     *
     * (Hint taken from ZOPEs chinese user group)
     *
     * [Kasper: As far as I can see this will only work well with or-searches!]
     *
     * @param   array       Array of accumulated words
     * @param   string      Complete Input string from where to extract word
     * @param   integer     Start position of word in input string
     * @param   integer     The Length of the word string from start position
     * @return  void
     */
    public function addWords(&$words, &$wordString, $start, $len) {

            // Get word out of string:
        $theWord = substr($wordString, $start, $len);

            // The type of the first character decides how the whole word is handled:
        $bc = 0;
        $cp = $this->utf8_ord($theWord, $bc);
        list($cType) = $this->charType($cp);

        if ($cType == 'cjk') {
                // Find total string length (in characters):
            $strlen = $this->csObj->utf8_strlen($theWord);

                // Traverse string and add overlapping pairs of two chars;
                // a single character is added on its own:
            for ($a = 0; $a < $strlen; $a++) {
                if ($strlen == 1 || $a < $strlen - 1) {
                    $words[] = $this->csObj->utf8_substr($theWord, $a, 2);
                }
            }
        } else {
                // Normal "single-byte" chars - strip the configured chars:
            foreach ($this->lexerConf['removeChars'] as $skipJoin) {
                $theWord = str_replace($this->csObj->UnumberToChar($skipJoin), '', $theWord);
            }
                // Add word:
            $words[] = $theWord;
        }
    }

    /**
     * Get the first word in a given utf-8 string (initial non-letters will be skipped)
     *
     * @param   string      Input string (reference)
     * @param   integer     Starting position (byte offset) in input string
     * @return  array       0: start, 1: len or false if no word has been found
     */
    public function get_word(&$str, $pos = 0) {

        $len = 0;

            // If return is true, a word was found starting at this position, so returning position and length:
        if ($this->utf8_is_letter($str, $len, $pos)) {
            return array($pos, $len);
        }

            // Otherwise $len bytes of non-word chars were found (or the string was blank) - skip them:
        $pos += $len;
            // Check end of string before looking for the word:
        if (!isset($str[$pos])) {
            return FALSE;
        }

            // A word starts exactly here; only its length is still needed:
        $this->utf8_is_letter($str, $len, $pos);
        return array($pos, $len);
    }

    /**
     * See if a character is a letter (or a string of letters or non-letters).
     *
     * Starting at $pos the function consumes either a run of word characters
     * (returning TRUE) or a run of non-word characters (returning FALSE) and
     * reports the byte length of that run in $len.
     *
     * Print-join characters inside a word are kept; a run of print-join
     * characters that is followed by another non-letter is cut off again.
     * NOTE: a print-join character directly at the very end of the string is
     * not cut off and stays part of the reported length.
     *
     * @param   string      Input string (reference)
     * @param   integer     Byte-length of character sequence (reference, return value)
     * @param   integer     Starting position in input string
     * @return  boolean     letter (or word) found
     */
    public function utf8_is_letter(&$str, &$len, $pos = 0) {

        $len = 0;
        $bc = 0;                        // Byte count of the character read last
        $cp = 0;                        // Code point of the character read last
        $cType = $cType_prev = FALSE;   // Letter type of the last / the previous character
        $letter = TRUE;                 // Looking for a letter?
        $printJoinLgd = 0;              // Length of the word before a pending run of print-join chars

            // Return false on end-of-string at this stage
        if (!isset($str[$pos])) {
            return FALSE;
        }

        while (TRUE) {

                // The character read in the previous round is evaluated here, before its
                // bytes are added to $len - so $len is the length of everything before it:
            if ($len) {
                if ($letter) {  // We are in a sequence of word chars
                    $isWestern = ($cType === 'num' || $cType === 'alpha');
                    $wasWestern = ($cType_prev === 'num' || $cType_prev === 'alpha');
                        // ... the previous and current char are from single-byte sets vs. asian CJK sets:
                    $scriptChanged = ($cType_prev === 'cjk' && $isWestern) || ($cType === 'cjk' && $wasWestern);

                    if (!$cType || $scriptChanged) {
                            // Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
                        if (!in_array($cp, $this->lexerConf['printjoins'])) {
                                // If a printjoin start length has been recorded, set that back now so the length is right (filtering out multiple end chars)
                            if ($printJoinLgd) {
                                $len = $printJoinLgd;
                            }
                            return TRUE;
                        }
                            // A printJoin char was found, record the length if it has not been recorded already:
                        if (!$printJoinLgd) {
                            $printJoinLgd = $len;
                        }
                    } else {
                            // When a true letter is found, reset printJoinLgd counter:
                        $printJoinLgd = 0;
                    }
                } elseif ($cType) {
                        // End of non-word reached
                    return FALSE;
                }
            }
            $len += $bc;    // Add byte-length of last found character

                // End of string; return status of string till now
            if (!isset($str[$pos])) {
                return $letter;
            }

                // Get next chars unicode number:
            $cp = $this->utf8_ord($str, $bc, $pos);
            $pos += $bc;

                // Determine the type:
            $cType_prev = $cType;
            list($cType) = $this->charType($cp);
            if ($cType) {
                continue;
            }

                // Setting letter to false if the first char was not a letter!
            if (!$len) {
                $letter = FALSE;
            }
        }
    }

    /**
     * Determine the type of character
     *
     * @param   integer     Unicode number to evaluate
     * @return  array       Type of char; index-0: the main type: num, alpha or cjk (Chinese / Japanese / Korean). NULL is returned (instead of an array) if the char is none of these.
     */
    public function charType($cp) {

            // Numeric? Only the ASCII digits count. Arabic-Indic (0x660-0x669),
            // Arabic-Indic for Iran, Pakistan and India (0x6F0-0x6F9) and
            // Hangzhou (0x3021-0x3029) numerals are deliberately not enabled.
        if ($cp >= 0x30 && $cp <= 0x39) {
            return array('num');
        }

            // LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
        if (
                ($cp >= 0x41 && $cp <= 0x5A) ||     // Basic Latin: capital letters
                ($cp >= 0x61 && $cp <= 0x7A) ||     // Basic Latin: small letters
                ($cp >= 0xC0 && $cp <= 0xFF && $cp != 0xD7 && $cp != 0xF7) ||           // Latin-1 Supplement (0x80-0xFF) excluding multiplication and division sign
                ($cp >= 0x100 && $cp < 0x280) ||    // Latin Extended-A and -B
                ($cp == 0x386 || ($cp >= 0x388 && $cp < 0x400)) || // Greek and Coptic excluding non-letters
                (($cp >= 0x400 && $cp < 0x482) || ($cp >= 0x48A && $cp < 0x530)) ||     // Cyrillic and Cyrillic Supplement excluding historic miscellaneous
                (($cp >= 0x590 && $cp < 0x5B0) || ($cp >= 0x5D0 && $cp < 0x5F3)) ||     // Hebrew: only accents and letters
                (($cp >= 0x621 && $cp <= 0x658) || ($cp >= 0x66E && $cp <= 0x6D3)) ||   // Arabic: only letters (there are more letters in the range, we can add them if there is a demand)
                ($cp >= 0x1E00 && $cp < 0x2000)     // Latin Extended Additional and Greek Extended
            ) {
            return array('alpha');
        }

            // Looking for CJK (Chinese / Japanese / Korean)
            // Ranges are not certain - deducted from the translation tables in t3lib/csconvtbl/
            // Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
        if (
                ($cp >= 0x3040 && $cp <= 0x30FF) ||     // HIRAGANA and KATAKANA letters
                ($cp >= 0x3130 && $cp <= 0x318F) ||     // Hangul Compatibility Jamo
                ($cp >= 0x3400 && $cp <= 0x4DBF) ||     // CJK Unified Ideographs Extension A
                ($cp >= 0x4E00 && $cp <= 0x9FAF) ||     // CJK Unified Ideographs
                ($cp >= 0xAC00 && $cp <= 0xD7AF) ||     // Hangul Syllables
                ($cp >= 0x20000 && $cp <= 0x2FA1F)      // CJK Unified Ideographs Extension B and CJK Compatibility Ideographs Supplement
                                                        // also include CJK and Kangxi radicals or Bopomofo letter?
            ) {
            return array('cjk');
        }

            // Not a word character:
        return NULL;
    }

    /**
     * Converts a UTF-8 multibyte character to a UNICODE codepoint
     *
     * Bytes that cannot start a UTF-8 sequence (a stray continuation byte
     * 0x80-0xBF, or 0xFE / 0xFF) are reported as U+FFFD with a length of 1,
     * so they are never mistaken for a letter, digit or print-join char.
     * If the string ends in the middle of a sequence, the missing data bytes
     * count as 0 and $len still is the length announced by the lead byte.
     *
     * @param   string      UTF-8 multibyte character string (reference)
     * @param   integer     The length of the character (reference, return value)
     * @param   integer     Starting position in input string
     * @param   boolean     If set, then a hex. number is returned
     * @return  integer     UNICODE codepoint
     */
    public function utf8_ord(&$str, &$len, $pos = 0, $hex = FALSE) {
        $len = 1;
            // A position beyond the end of the string yields 0:
        $ord = isset($str[$pos]) ? ord($str[$pos]) : 0;

        if ($ord >= 0x80) {
                // Calculate number of extra bytes: count of leading 1-bits minus one
            for ($bc = -1, $mbs = $ord; $mbs & 0x80; $mbs = $mbs << 1) {
                $bc++;
            }

            if ($bc < 1 || $bc > 5) {
                    // Not a valid lead byte; a mask shift by (6 - $bc) would be meaningless or negative
                $ord = 0xFFFD;
            } else {
                $len += $bc;

                    // Mask utf-8 lead-in bits
                $ord = $ord & ((1 << (6 - $bc)) - 1);
                    // "Bring in" data bytes
                for ($i = $pos + 1; $bc; $bc--, $i++) {
                    $dataByte = isset($str[$i]) ? ord($str[$i]) : 0;
                    $ord = ($ord << 6) | ($dataByte & 0x3F);
                }
            }
        }

        return $hex ? 'x' . dechex($ord) : $ord;
    }
}
00404 
00405 
// Load a registered XCLASS extension of this lexer, if one is configured
// for the TYPO3 mode this script is running in:
if (defined('TYPO3_MODE')) {
    if (isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php'])) {
        include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php']);
    }
}
00409 ?>