class.indexer.php

Go to the documentation of this file.
00001 <?php
00002 /***************************************************************
00003 *  Copyright notice
00004 *
00005 *  (c) 2001-2010 Kasper Skårhøj (kasperYYYY@typo3.com)
00006 *  All rights reserved
00007 *
00008 *  This script is part of the TYPO3 project. The TYPO3 project is
00009 *  free software; you can redistribute it and/or modify
00010 *  it under the terms of the GNU General Public License as published by
00011 *  the Free Software Foundation; either version 2 of the License, or
00012 *  (at your option) any later version.
00013 *
00014 *  The GNU General Public License can be found at
00015 *  http://www.gnu.org/copyleft/gpl.html.
00016 *  A copy is found in the textfile GPL.txt and important notices to the license
00017 *  from the author is found in LICENSE.txt distributed with these scripts.
00018 *
00019 *
00020 *  This script is distributed in the hope that it will be useful,
00021 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00022 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00023 *  GNU General Public License for more details.
00024 *
00025 *  This copyright notice MUST APPEAR in all copies of the script!
00026 ***************************************************************/
00027 /**
00028  * This class is a search indexer for TYPO3
00029  *
00030  * @author  Kasper Skårhøj <kasperYYYY@typo3.com>
00031  * Originally Christian Jul Jensen <christian@jul.net> helped as well.
00032  */
00033 /**
00034  * [CLASS/FUNCTION INDEX of SCRIPT]
00035  *
00036  *
00037  *
00038  *  141: class tx_indexedsearch_indexer
00039  *  207:     function hook_indexContent(&$pObj)
00040  *
00041  *              SECTION: Backend API
00042  *  308:     function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE)
00043  *  347:     function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0)
00044  *  365:     function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0)
00045  *
00046  *              SECTION: Initialization
00047  *  416:     function init()
00048  *  468:     function initializeExternalParsers()
00049  *
00050  *              SECTION: Indexing; TYPO3 pages (HTML content)
00051  *  509:     function indexTypo3PageContent()
00052  *  596:     function splitHTMLContent($content)
00053  *  642:     function getHTMLcharset($content)
00054  *  657:     function convertHTMLToUtf8($content,$charset='')
00055  *  685:     function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList)
00056  *  712:     function typoSearchTags(&$body)
00057  *  741:     function extractLinks($content)
00058  *  812:     function extractHyperLinks($string)
00059  *
00060  *              SECTION: Indexing; external URL
00061  *  871:     function indexExternalUrl($externalUrl)
00062  *  902:     function getUrlHeaders($url)
00063  *
00064  *              SECTION: Indexing; external files (PDF, DOC, etc)
00065  *  948:     function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='')
00066  * 1054:     function readFileContent($ext,$absFile,$cPKey)
00067  * 1071:     function fileContentParts($ext,$absFile)
00068  * 1089:     function splitRegularContent($content)
00069  *
00070  *              SECTION: Analysing content, Extracting words
00071  * 1122:     function charsetEntity2utf8(&$contentArr, $charset)
00072  * 1145:     function processWordsInArrays($contentArr)
00073  * 1170:     function procesWordsInArrays($contentArr)
00074  * 1180:     function bodyDescription($contentArr)
00075  * 1202:     function indexAnalyze($content)
00076  * 1223:     function analyzeHeaderinfo(&$retArr,$content,$key,$offset)
00077  * 1242:     function analyzeBody(&$retArr,$content)
00078  * 1262:     function metaphone($word,$retRaw=FALSE)
00079  *
00080  *              SECTION: SQL; TYPO3 Pages
00081  * 1304:     function submitPage()
00082  * 1378:     function submit_grlist($hash,$phash_x)
00083  * 1398:     function submit_section($hash,$hash_t3)
00084  * 1416:     function removeOldIndexedPages($phash)
00085  *
00086  *              SECTION: SQL; External media
00087  * 1459:     function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts)
00088  * 1525:     function submitFile_grlist($hash)
00089  * 1539:     function submitFile_section($hash)
00090  * 1553:     function removeOldIndexedFiles($phash)
00091  *
00092  *              SECTION: SQL Helper functions
00093  * 1589:     function checkMtimeTstamp($mtime,$phash)
00094  * 1625:     function checkContentHash()
00095  * 1642:     function checkExternalDocContentHash($hashGr,$content_md5h)
00096  * 1656:     function is_grlist_set($phash_x)
00097  * 1669:     function update_grlist($phash,$phash_x)
00098  * 1684:     function updateTstamp($phash,$mtime=0)
00099  * 1699:     function updateSetId($phash)
00100  * 1714:     function updateParsetime($phash,$parsetime)
00101  * 1727:     function updateRootline()
00102  * 1742:     function getRootLineFields(&$fieldArr)
00103  * 1761:     function removeLoginpagesWithContentHash()
00104  * 1778:     function includeCrawlerClass()
00105  *
00106  *              SECTION: SQL; Submitting words
00107  * 1805:     function checkWordList($wl)
00108  * 1842:     function submitWords($wl,$phash)
00109  * 1866:     function freqMap($freq)
00110  *
00111  *              SECTION: Hashing
00112  * 1899:     function setT3Hashes()
00113  * 1925:     function setExtHashes($file,$subinfo=array())
00114  * 1949:     function md5inthash($str)
00115  * 1959:     function makeCHash($paramArray)
00116  *
00117  *              SECTION: Internal logging functions
00118  * 1991:     function log_push($msg,$key)
00119  * 2000:     function log_pull()
00120  * 2011:     function log_setTSlogMessage($msg, $errorNum=0)
00121  *
00122  *              SECTION: tslib_fe hooks:
00123  * 2036:     function fe_headerNoCache(&$params, $ref)
00124  *
00125  * TOTAL FUNCTIONS: 59
00126  * (This index is automatically created/updated by the extension "extdeveval")
00127  *
00128  */
00129 /**
00130  * Indexing class for TYPO3 frontend
00131  *
00132  * @author  Kasper Skårhøj <kasperYYYY@typo3.com>
00133  * @package TYPO3
00134  * @subpackage tx_indexedsearch
00135  */
00136 class tx_indexedsearch_indexer {
00137 
00138         // Messages:
00139     var $reasons = array(
00140         -1 => 'mtime matched the document, so no changes detected and no content updated',
00141         -2 => 'The minimum age was not exceeded',
00142         1 => "The configured max-age was exceeded for the document and thus it's indexed.",
00143         2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
00144         3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
00145         4 => 'Page has never been indexed (is not represented in the index_phash table).'
00146     );
00147 
00148         // HTML code blocks to exclude from indexing:
00149     var $excludeSections = 'script,style';
00150 
00151         // Supported Extensions for external files:
00152     var $external_parsers = array();        // External parser objects, keys are file extension names. Values are objects with certain methods.
00153 
00154         // Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!)
00155     var $defaultGrList = '0,-1';
00156 
00157         // Min/Max times:
00158     var $tstamp_maxAge = 0;     // If set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded.
00159     var $tstamp_minAge = 0;     // If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
00160     var $maxExternalFiles = 0;  // Max number of external files to index.
00161 
00162     var $forceIndexing = FALSE;     // If true, indexing is forced despite of hashes etc.
00163     var $crawlerActive = FALSE;     // Set when crawler is detected (internal)
00164 
00165         // INTERNALS:
00166     var $defaultContentArray=array(
00167         'title' => '',
00168         'description' => '',
00169         'keywords' => '',
00170         'body' => '',
00171     );
00172     var $wordcount = 0;
00173     var $externalFileCounter = 0;
00174 
00175     var $conf = array();        // Configuration set internally (see init functions for required keys and their meaning)
00176     var $indexerConfig = array();   // Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
00177     var $hash = array();        // Hash array, contains phash and phash_grouping
00178     var $file_phash_arr = array();  // Hash array for files
00179     var $contentParts = array();    // Content of TYPO3 page
00180     var $content_md5h = '';
00181     var $internal_log = array();    // Internal log
00182     var $indexExternalUrl_content = '';
00183 
00184     var $cHashParams = array(); // cHashparams array
00185 
00186     var $freqRange = 32000;
00187     var $freqMax = 0.1;
00188 
00189         // Objects:
00190     /**
00191      * Charset class object
00192      *
00193      * @var t3lib_cs
00194      */
00195     var $csObj;
00196 
00197     /**
00198      * Metaphone object, if any
00199      *
00200      * @var user_DoubleMetaPhone
00201      */
00202     var $metaphoneObj;
00203 
00204     /**
00205      * Lexer object for word splitting
00206      *
00207      * @var tx_indexedsearch_lexer
00208      */
00209     var $lexerObj;
00210 
00211 
00212 
/**
 * Frontend hook: decides whether the page just rendered by the frontend object should be
 * indexed and, if it should, configures the indexer from that object and indexes the page.
 *
 * Indexing is forced (timestamps and hashes ignored) when the "crawler" extension is loaded,
 * a crawler session is running and it carries the processing instruction
 * "tx_indexedsearch_reindex". Each reason for NOT indexing the page is reported through
 * log_setTSlogMessage().
 *
 * @param   object      Parent Object (frontend TSFE object), passed by reference
 * @return  void
 */
function hook_indexContent(&$pObj)  {

        // Indexer configuration from Extension Manager interface.
        // unserialize() returns FALSE for an empty or corrupt value, so fall back to an empty array:
    $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
    if (!is_array($indexerConfig)) {
        $indexerConfig = array();
    }

        // Crawler activation:
        // Requirements are that the crawler is loaded, a crawler session is running and re-indexing is requested as processing instruction.
    if (t3lib_extMgm::isLoaded('crawler') && $pObj->applicationData['tx_crawler']['running'])   {
        $procInstructions = $pObj->applicationData['tx_crawler']['parameters']['procInstructions'];

            // in_array() must only be given an array; a missing/odd instruction list simply means "no re-indexing requested":
        if (is_array($procInstructions) && in_array('tx_indexedsearch_reindex', $procInstructions)) {

                // Setting simple log message:
            $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';

            $this->crawlerActive = TRUE;    // Crawler active flag
            $this->forceIndexing = TRUE;    // Force indexing despite timestamps etc.
        }
    }

        // Nothing to do (and nothing to log) unless indexing is enabled for this page:
    if (!$pObj->config['config']['index_enable'])   {
        return;
    }

    $this->log_push('Index page','');

        // The checks are evaluated in this fixed order; the first one that fails determines the log message.
    if (!empty($indexerConfig['disableFrontendIndexing']) && !$this->crawlerActive) {
        $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
    } elseif ($pObj->page['no_search']) {
        $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
    } elseif ($pObj->no_cache)  {
        $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
    } elseif (strcmp($pObj->sys_language_uid, $pObj->sys_language_content)) {
        $this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
    } else {

            // Root line uids (same keys as the frontend root line):
        $rootlineUids = array();
        if (is_array($pObj->config['rootLine']))    {
            foreach ($pObj->config['rootLine'] as $rlkey => $rldat) {
                $rootlineUids[$rlkey] = $rldat['uid'];
            }
        }

            // Setting up internal configuration from the frontend object:
        $this->conf = array();

            // Information about page for which the indexing takes place
        $this->conf['id'] = $pObj->id;                              // Page id
        $this->conf['type'] = $pObj->type;                          // Page type
        $this->conf['sys_language_uid'] = $pObj->sys_language_uid;  // sys_language UID of the language of the indexing.
        $this->conf['MP'] = $pObj->MP;                              // MP variable, if any (Mount Points)
        $this->conf['gr_list'] = $pObj->gr_list;                    // Group list

        $this->conf['cHash'] = $pObj->cHash;                        // cHash string for additional parameters
        $this->conf['cHash_array'] = $pObj->cHash_array;            // Array of the additional parameters

        $this->conf['crdate'] = $pObj->page['crdate'];              // The creation date of the TYPO3 page
        $this->conf['page_cache_reg1'] = $pObj->page_cache_reg1;    // reg1 of the caching table.

        $this->conf['rootline_uids'] = $rootlineUids;

            // Content of page:
        $this->conf['content'] = $pObj->content;                    // Content string (HTML of TYPO3 page)
        $this->conf['indexedDocTitle'] = $pObj->convOutputCharset($pObj->indexedDocTitle);  // Alternative title for indexing
        $this->conf['metaCharset'] = $pObj->metaCharset;            // Character set of content (will be converted to utf-8 during indexing)
        $this->conf['mtime'] = $pObj->register['SYS_LASTCHANGED'];  // Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.

            // Configuration of behavior:
        $this->conf['index_externals'] = $pObj->config['config']['index_externals'];    // Whether to index external documents like PDF, DOC etc. (if possible)
        $this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd'];      // Length of description text (max 250, default 200)
        $this->conf['index_metatags'] = isset($pObj->config['config']['index_metatags']) ? $pObj->config['config']['index_metatags'] : true;   // Indexing of meta tags defaults to "on" when not configured

            // Set to zero:
        $this->conf['recordUid'] = 0;
        $this->conf['freeIndexUid'] = 0;
        $this->conf['freeIndexSetId'] = 0;

            // Init and start indexing:
        $this->init();
        $this->indexTypo3PageContent();
    }

    $this->log_pull();
}
00295 
00296 
00297 
00298 
00299 
00300 
00301 
00302 
00303     /****************************
00304      *
00305      * Backend API
00306      *
00307      ****************************/
00308 
/**
 * Backend API: sets up $this->conf for the page identified by the arguments and initializes
 * the indexer through init(). The group list is fixed to '0,-1', indexing of external
 * documents and meta tags is switched on and the description length is set to 200.
 *
 * @param   integer     The page uid, &id=
 * @param   integer     The page type, &type=
 * @param   integer     sys_language uid, typically &L=
 * @param   string      The MP variable (Mount Points), &MP=
 * @param   array       Rootline array of only UIDs.
 * @param   array       Array of GET variables to register with this indexing
 * @param   boolean     If set, a cHash value is calculated from $cHash_array; otherwise the cHash is left empty.
 * @return  void
 */
function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE)  {

        // cHash string for the additional parameters; only calculated on request:
    $cHashString = '';
    if ($createCHash)   {
        $cHashString = t3lib_div::generateCHash(t3lib_div::implodeArrayForUrl('', $cHash_array));
    }

        // Complete internal configuration, replacing whatever was configured before:
    $this->conf = array(
            // Information about page for which the indexing takes place
        'id' => $id,                                // Page id (integer)
        'type' => $type,                            // Page type (integer)
        'sys_language_uid' => $sys_language_uid,    // sys_language UID of the language of the indexing (integer)
        'MP' => $MP,                                // MP variable, if any (Mount Points) (string)
        'gr_list' => '0,-1',                        // Group list (hardcoded for now...)

            // cHash values:
        'cHash' => $cHashString,
        'cHash_array' => $cHash_array,              // Array of the additional parameters

            // Defaults:
        'freeIndexUid' => 0,
        'freeIndexSetId' => 0,
        'page_cache_reg1' => '',

            // Root line uids
        'rootline_uids' => $uidRL,

            // Configuration of behavior:
        'index_externals' => 1,                     // Whether to index external documents like PDF, DOC etc. (if possible)
        'index_descrLgd' => 200,                    // Length of description text (max 250, default 200)
        'index_metatags' => true,                   // Whether to index document keywords and description (if present)
    );

        // Initialize the indexer with this configuration:
    $this->init();
}
00353 
/**
 * Registers the free-index uid and the indexing "set" id in the internal configuration.
 * Can be called right after backend_initIndexer()
 *
 * @param   integer     Free index UID
 * @param   integer     Set id - an integer identifying the "set" of indexing operations.
 * @return  void
 */
function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0)  {
    $freeIndexSettings = array(
        'freeIndexUid' => $freeIndexUid,
        'freeIndexSetId' => $freeIndexSetId,
    );

        // Write both values into $this->conf, leaving all other configuration keys untouched:
    foreach ($freeIndexSettings as $confKey => $confValue)  {
        $this->conf[$confKey] = $confValue;
    }
}
00365 
/**
 * Indexing records as the content of a TYPO3 page.
 *
 * The given texts are wrapped in a minimal HTML document which is then indexed by
 * indexTypo3PageContent() as if it were a rendered page.
 *
 * @param   string      Title equivalent
 * @param   string      Keywords equivalent
 * @param   string      Description equivalent
 * @param   string      The main content to index
 * @param   string      The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed!
 * @param   integer     Last modification time, in seconds
 * @param   integer     The creation date of the content, in seconds
 * @param   integer     The record UID that the content comes from (for registration with the indexed rows)
 * @return  void
 */
function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) {

        // Content of page:
    $this->conf['mtime'] = $mtime;          // Most recent modification time (seconds) of the content
    $this->conf['crdate'] = $crdate;        // The creation date of the TYPO3 content
    $this->conf['recordUid'] = $recordUid;  // UID of the record, if applicable

        // The texts are in $charset, which is not necessarily the charset htmlspecialchars() assumes.
        // htmlspecialchars() without an explicit encoding validates its input against the PHP default
        // encoding and, depending on the PHP version, returns an empty string for byte sequences that
        // are invalid in it - the text would silently be lost. Escaping the four HTML special
        // characters on byte level gives the same result as htmlspecialchars() with ENT_COMPAT, but
        // independently of the charset of the text:
    $htmlEscapeMap = array(
        '&' => '&amp;',
        '"' => '&quot;',
        '<' => '&lt;',
        '>' => '&gt;',
    );

        // Construct fake HTML for parsing:
    $this->conf['content'] = '
        <html>
            <head>
                <title>'.strtr((string)$title, $htmlEscapeMap).'</title>
                <meta name="keywords" content="'.strtr((string)$keywords, $htmlEscapeMap).'" />
                <meta name="description" content="'.strtr((string)$description, $htmlEscapeMap).'" />
            </head>
            <body>
                '.strtr((string)$content, $htmlEscapeMap).'
            </body>
        </html>';                   // Content string (HTML of TYPO3 page)

        // Initializing charset:
    $this->conf['metaCharset'] = $charset;  // Character set of content (will be converted to utf-8 during indexing)
    $this->conf['indexedDocTitle'] = '';    // Alternative title for indexing (none; the <title> above is used)

        // Index content as if it was a TYPO3 page:
    $this->indexTypo3PageContent();
}
00406 
00407 
00408 
00409 
00410 
00411 
00412 
00413 
00414 
00415 
00416 
00417 
00418 
00419     /********************************
00420      *
00421      * Initialization
00422      *
00423      *******************************/
00424 
/**
 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!!
 *
 * Sets up the cHash parameters and the page hashes, reads the indexer configuration from the
 * Extension Manager, and creates the external parsers (only if $this->conf['index_externals']
 * is set), the lexer, the optional metaphone object and the charset object.
 *
 * @return  void
 */
function init() {
    global $TYPO3_CONF_VARS;

        // Initializing:
    $this->cHashParams = $this->conf['cHash_array'];
    if (is_array($this->cHashParams) && count($this->cHashParams))  {
        if ($this->conf['cHash'])   {
            $this->cHashParams['cHash'] = $this->conf['cHash']; // Add this so that URL's come out right...
        }
            // encryptionKey is added inside TSFE in order to calculate the cHash value and it should NOT be a part of this array!!! If it is it will be exposed in links!!!
        unset($this->cHashParams['encryptionKey']);
    }

        // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();

        // Indexer configuration from Extension Manager interface.
        // unserialize() returns FALSE for an empty or corrupt value; keep $this->indexerConfig an array in that case as well:
    $extensionConfig = unserialize($TYPO3_CONF_VARS['EXT']['extConf']['indexed_search']);
    $this->indexerConfig = is_array($extensionConfig) ? $extensionConfig : array();

        // minAge / maxAge are multiplied by 3600 before being stored in the tstamp_* properties:
    $this->tstamp_minAge = t3lib_div::intInRange($this->indexerConfig['minAge']*3600,0);
    $this->tstamp_maxAge = t3lib_div::intInRange($this->indexerConfig['maxAge']*3600,0);
    $this->maxExternalFiles = t3lib_div::intInRange($this->indexerConfig['maxExternalFiles'],0,1000,5);
        // NOTE(review): $flagBitMask is not among the properties declared at the top of the class - confirm it is declared elsewhere.
    $this->flagBitMask = t3lib_div::intInRange($this->indexerConfig['flagBitMask'],0,255);

        // Initialize external document parsers:
        // Example configuration, see ext_localconf.php of this file!
    if ($this->conf['index_externals']) {
        $this->initializeExternalParsers();
    }

        // Initialize lexer (class that deconstructs the text into words):
        // Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
    $lexerObjRef = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] ?
                    $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] :
                    'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
    $this->lexerObj = t3lib_div::getUserObj($lexerObjRef);
    $this->lexerObj->debug = $this->indexerConfig['debugMode'];

        // Initialize metaphone hook:
        // Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone';
    if ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']) {
        $this->metaphoneObj = t3lib_div::getUserObj($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']);
            // The hook is optional: only hand over the back-reference if the reference could actually be resolved to an object.
        if (is_object($this->metaphoneObj)) {
            $this->metaphoneObj->pObj = $this;
        }
    }

        // Init charset class:
    $this->csObj = t3lib_div::makeInstance('t3lib_cs');
}
00474 
/**
 * Initialize external parsers
 *
 * Creates one parser object per file extension configured in
 * $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] and registers it in
 * $this->external_parsers. A parser is only kept if its reference resolves to an object and
 * its initParser() method returns a true value.
 *
 * @return  void
 * @access private
 * @see init()
 */
function initializeExternalParsers()    {
    global $TYPO3_CONF_VARS;

    $parserReferences = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'];
    if (!is_array($parserReferences))   {
        return;
    }

    foreach ($parserReferences as $extension => $_objRef)   {
        $parserObj = t3lib_div::getUserObj($_objRef);

            // A reference that does not resolve to an object must not abort indexing; that extension simply gets no parser:
        if (!is_object($parserObj)) {
            unset($this->external_parsers[$extension]);
            continue;
        }

            // Register the parser before initializing it, and give it access to this indexer:
        $this->external_parsers[$extension] = $parserObj;
        $parserObj->pObj = $this;

            // Init parser and if it returns false, unset its entry again:
        if (!$parserObj->initParser($extension))    {
            unset($this->external_parsers[$extension]);
        }
    }
}
00497 
00498 
00499 
00500 
00501 
00502 
00503 
00504 
00505 
00506 
00507 
00508 
00509 
00510 
00511 
00512     /********************************
00513      *
00514      * Indexing; TYPO3 pages (HTML content)
00515      *
00516      *******************************/
00517 
/**
 * Start indexing of the TYPO3 page
 *
 * Uses $this->conf (content, mtime, charset, ...) and $this->hash, so init() must have run.
 * The page is (re-)indexed when checkMtimeTstamp() returns a positive reason code, when no
 * gr_list entry exists for the phash, or when indexing is forced. If the content hash is
 * already known, only timestamp, set id, gr_list and rootline are updated - unless the
 * reason code is 1 (max-age exceeded), which always triggers a full re-index.
 *
 * @return  void
 */
function indexTypo3PageContent()    {

    $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $is_grlist = $this->is_grlist_set($this->hash['phash']);

        // Nothing to do if the index is up to date, the gr_list is registered and indexing is not forced:
    if (!($check > 0 || !$is_grlist || $this->forceIndexing))   {
        $this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
        return;
    }

        // Setting message:
    if ($this->forceIndexing)   {
        $this->log_setTSlogMessage('Indexing needed, reason: Forced',1);
    } elseif ($check > 0)   {
        $this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
    } else {
        $this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
    }

        // Divide into title,keywords,description and body:
    $this->log_push('Split content','');
        $this->contentParts = $this->splitHTMLContent($this->conf['content']);
        if ($this->conf['indexedDocTitle']) {
            $this->contentParts['title'] = $this->conf['indexedDocTitle'];
        }
    $this->log_pull();

        // Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so dont!)
        // implode() is called as (separator, array): the reversed legacy argument order is not accepted by PHP 8 any more.
    $this->content_md5h = $this->md5inthash(implode('', $this->contentParts));

        // This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
        // If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
        // This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
    $checkCHash = $this->checkContentHash();

        // Content already indexed under another phash and max-age not exceeded: only update the bookkeeping.
    if (is_array($checkCHash) && $check!==1)    {
        $this->updateTstamp($this->hash['phash'],$this->conf['mtime']); // Update the timestamp
        $this->updateSetId($this->hash['phash']);
        $this->update_grlist($checkCHash['phash'],$this->hash['phash']);    // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }

        // Full indexing; the parse time is measured from here until the words have been submitted:
    $Pstart = t3lib_div::milliseconds();

    $this->log_push('Converting charset of content ('.$this->conf['metaCharset'].') to utf-8','');
        $this->charsetEntity2utf8($this->contentParts,$this->conf['metaCharset']);
    $this->log_pull();

        // Splitting words
    $this->log_push('Extract words from content','');
        $splitInWords = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();

        // Analyse the indexed words.
    $this->log_push('Analyse the extracted words','');
        $indexArr = $this->indexAnalyze($splitInWords);
    $this->log_pull();

        // Submitting page (phash) record
    $this->log_push('Submitting page','');
        $this->submitPage();
    $this->log_pull();

        // Check words and submit to word list if not there
    $this->log_push('Check word list and submit words','');
        $this->checkWordList($indexArr);
        $this->submitWords($indexArr,$this->hash['phash']);
    $this->log_pull();

        // Set parsetime
    $this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$Pstart);

        // Checking external files if configured for.
    $this->log_push('Checking external files','');
    if ($this->conf['index_externals']) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
00602 
00603     /**
00604      * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
00605      *
00606      * @param   string      HTML content to index. To some degree expected to be made by TYPO3 (i.e. splitting the header by ":")
00607      * @return  array       Array of content, having keys "title", "body", "keywords" and "description" set.
00608      * @see splitRegularContent()
00609      */
00610     function splitHTMLContent($content) {
00611 
00612             // divide head from body ( u-ouh :) )
00613         $contentArr = $this->defaultContentArray;
00614         $contentArr['body'] = stristr($content,'<body');
00615         $headPart = substr($content,0,-strlen($contentArr['body']));
00616 
00617             // get title
00618         $this->embracingTags($headPart,'TITLE',$contentArr['title'],$dummy2,$dummy);
00619         $titleParts = explode(':',$contentArr['title'],2);
00620         $contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);
00621 
00622             // get keywords and description metatags
00623         if($this->conf['index_metatags']) {
00624             for($i=0;$this->embracingTags($headPart,'meta',$dummy,$headPart,$meta[$i]);$i++) { /*nothing*/ }
00625             for($i=0;isset($meta[$i]);$i++) {
00626                 $meta[$i] = t3lib_div::get_tag_attributes($meta[$i]);
00627                 if (stristr($meta[$i]['name'], 'keywords')) {
00628                     $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($meta[$i]['content']);
00629                 }
00630                 if (stristr($meta[$i]['name'], 'description')) {
00631                     $contentArr['description'] .= ',' . $meta[$i]['content'];
00632                 }
00633             }
00634         }
00635 
00636             // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
00637         $this->typoSearchTags($contentArr['body']);
00638 
00639             // Get rid of unwanted sections (ie. scripting and style stuff) in body
00640         $tagList = explode(',',$this->excludeSections);
00641         foreach($tagList as $tag)   {
00642             while($this->embracingTags($contentArr['body'],$tag,$dummy,$contentArr['body'],$dummy2));
00643         }
00644 
00645             // remove tags, but first make sure we don't concatenate words by doing it
00646         $contentArr['body'] = str_replace('<',' <',$contentArr['body']);
00647         $contentArr['body'] = trim(strip_tags($contentArr['body']));
00648 
00649         $contentArr['keywords'] = trim($contentArr['keywords']);
00650         $contentArr['description'] = trim($contentArr['description']);
00651 
00652             // Return array
00653         return $contentArr;
00654     }
00655 
00656     /**
00657      * Extract the charset value from an HTML meta tag: first from <meta http-equiv="Content-Type" content="...; charset=X">,
00658      * then, if that yields nothing, from any meta tag carrying "charset=X" (e.g. the short form <meta charset="X">).
00659      * @param   string      HTML content
00660      * @return  string      The charset value if found, otherwise an empty string.
00661      */
00662     function getHTMLcharset($content)   {
00663         if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i',$content,$reg) && preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]_.:-]+)/i',$reg[0],$reg2))   {
00664             return $reg2[1];    // "_", "." and ":" are accepted as well, they occur in charset names such as Shift_JIS
00665         }
00666             // Fallback for other meta tags; the explicit '' replaces the former implicit NULL return:
00667         return preg_match('/<meta[[:space:]]+[^>]*charset[[:space:]]*=[[:space:]]*["\']?([[:alnum:]_.:-]+)/i',$content,$reg3) ? $reg3[1] : '';
00668     }
00669 
00670     /**
00671      * Returns the given HTML document as utf-8 with entities passed through $this->csObj->entities_to_utf8().
00672      *
00673      * @param   string      HTML content, any charset
00674      * @param   string      Optional charset; when empty it is looked up in the HTML with getHTMLcharset()
00675      * @return  string      Converted HTML
00676      */
00677     function convertHTMLToUtf8($content,$charset='')    {
00678             // Fall back to the charset declared in the document when none was passed in:
00679         if (!$charset) {
00680             $charset = $this->getHTMLcharset($content);
00681         }
00682         $sourceCharset = $this->csObj->parse_charset($charset);
00683 
00684             // Convert only when a charset is known and it is something else than utf-8:
00685         $isForeignCharset = ($sourceCharset && $sourceCharset!=='utf-8');
00686         if ($isForeignCharset) {
00687             $content = $this->csObj->utf8_encode($content, $sourceCharset);
00688         }
00689             // Convert entities, assuming the document is utf-8 by now:
00690         return $this->csObj->entities_to_utf8($content, TRUE);
00691     }
00692 
00693     /**
00694      * Finds the first occurrence of the embracing tags <tagName ...>...</tagName> (case-insensitive) and returns the embraced
00695      * content and the original string with the tag removed in the passed variables. Returns false if no match is found,
00696      * in which case the passed variables are left untouched. Useful for finding the <title> of a document or removing <script>-sections.
00697      *
00698      * @param   string      String to search in
00699      * @param   string      Tag name, eg. "script"
00700      * @param   string      Passed by reference: Content inside found tag (blank if there is no end tag)
00701      * @param   string      Passed by reference: String with the tag and its content removed; without an end tag only what follows the start tag
00702      * @param   string      Passed by reference: Attributes of the found tag (everything between the tag name and the first ">").
00703      * @return  boolean     Returns false if tag was not found, otherwise true.
00704      */
00705     function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) {
00706         $endTag = '</'.$tagName.'>';
00707         $startTag = '<'.$tagName;
00708 
00709         $fromStartTag = stristr($string,$startTag);     // stristr used because we want a case-insensitive search for the tag.
00710         if($fromStartTag === FALSE) return false;   // if the tag was not found, return false
00711             // Split off the attributes at the first ">". array_pad() covers a start tag that is never closed by ">" (explode() then returns one element only).
00712         list($paramList,$afterStartTag) = array_pad(explode('>',substr($fromStartTag,strlen($startTag)),2),2,'');
00713         $fromEndTag = stristr($afterStartTag,$endTag);
00714         if ($fromEndTag !== FALSE)  {
00715             $stringBefore = substr($string,0,strlen($string)-strlen($fromStartTag));   // offset derived from the stristr() match above, no second search needed
00716             $tagContent = substr($afterStartTag,0,strlen($afterStartTag)-strlen($fromEndTag));
00717             $stringAfter = $stringBefore.substr($fromEndTag,strlen($endTag));
00718         } else {    // If there was no ending tag, the tagContent is blank and anything after the tag it self is returned.
00719             $tagContent='';
00720             $stringAfter = $afterStartTag;
00721         }
00722 
00723         return true;
00724     }
00725 
00726     /**
00727      * Removes content that shouldn't be indexed according to the <!--TYPO3SEARCH_begin--> / <!--TYPO3SEARCH_end--> markers.
00728      * Text after a "begin" marker is kept. At an "end" marker the pending text not introduced by "begin" (typically the text before the first marker) is kept and the text after the marker is dropped.
00729      * @param   string      HTML Content, passed by reference
00730      * @return  boolean     Returns true if a TYPO3SEARCH_ tag was found, otherwise false.
00731      */
00732     function typoSearchTags(&$body) {
00733         $expBody = preg_split('/\<\!\-\-[\s]?TYPO3SEARCH_/',$body);
00734         if(count($expBody) < 2) {
00735             return false;   // no marker at all, $body is left as it is
00736         }
00737 
00738         $body = '';
00739         $prev = '';     // pending text that is only added when an "end" marker follows; initialised so it is never read undefined
00740         foreach($expBody as $val)   {
00741             $part = explode('-->',$val,2);
00742             $marker = trim($part[0]);
00743             if($marker === 'begin') {
00744                 $body.= isset($part[1]) ? $part[1] : '';
00745                 $prev = '';
00746             } elseif($marker === 'end') {
00747                 $body.= $prev;
00748             } else {
00749                 $prev = $val;
00750             }
00751         }
00752         return true;
00753     }
00754 
00755     /**
00756      * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
00757      * Links with a scheme go to indexExternalUrl() (if enabled); local files go to indexRegularDocument() or, if the crawler is used, into its queue.
00758      * @param   string      HTML content
00759      * @return  void
00760      */
00761     function extractLinks($content) {
00762 
00763             // Get links:
00764         $list = $this->extractHyperLinks($content);
00765 
00766             // $crawler stays NULL (instead of undefined) unless files are to be handed over to the "crawler" extension:
00767         $crawler = NULL;
00768         if ($this->indexerConfig['useCrawlerForExternalFiles'] && t3lib_extMgm::isLoaded('crawler'))    {
00769             $this->includeCrawlerClass();
00770             $crawler = t3lib_div::makeInstance('tx_crawler_lib');
00771         }
00772 
00773             // Traverse links:
00774         foreach($list as $linkInfo) {
00775 
00776                 // Decode entities:
00777             if ($linkInfo['localPath']) {   // localPath means: This file is sent by a download script. While the indexed URL has to point to $linkInfo['href'], the absolute path to the file is specified here!
00778                 $linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['localPath']);
00779             } else {
00780                 $linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['href']);
00781             }
00782 
00783                 // Parse URL. parse_url() returns FALSE for seriously malformed URLs and leaves out components that are absent:
00784             $qParts = parse_url($linkSource);
00785             $query = (is_array($qParts) && isset($qParts['query'])) ? $qParts['query'] : '';
00786 
00787                 // Check for jumpurl (TYPO3 specific thing...)
00788             if ($query && strstr($query,'jumpurl='))    {
00789                 parse_str($query,$getP);
00790                     // strstr() also matches e.g. "myjumpurl=", so the parameter itself may be missing:
00791                 $linkSource = isset($getP['jumpurl']) ? $getP['jumpurl'] : '';
00792                 $qParts = parse_url($linkSource);   // parse again due to new linkSource!
00793                 $query = (is_array($qParts) && isset($qParts['query'])) ? $qParts['query'] : '';
00794             }
00795 
00796             $scheme = (is_array($qParts) && isset($qParts['scheme'])) ? $qParts['scheme'] : '';
00797 
00798                 // A link with a scheme that has no local path is external; otherwise only links without query string are looked up as files:
00799             if (!$linkInfo['localPath'] && $scheme) {
00800                 if ($this->indexerConfig['indexExternalURLs'])  {
00801                         // Index external URL (http or otherwise)
00802                     $this->indexExternalUrl($linkSource);
00803                 }
00804             } elseif (!$query) {
00805                     // Find the absolute file name, either given directly or relative to PATH_site:
00806                 $linkSource = urldecode($linkSource);
00807                 if (t3lib_div::isAllowedAbsPath($linkSource))   {
00808                     $localFile = $linkSource;
00809                 } else {
00810                     $localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
00811                 }
00812 
00813                 if ($localFile && @is_file($localFile)) {
00814 
00815                     if (is_object($crawler))    {
00816                             // Queue the file for the crawler; "alturl" (the href) is only passed for files with a localPath. Key order as before: document, alturl, conf.
00817                         $params = array('document' => $linkSource);
00818                         if ($linkInfo['localPath']) {
00819                             $params['alturl'] = $linkInfo['href'];
00820                         }
00821                         $params['conf'] = $this->conf;
00822                             // The page content is not put into the queue entry:
00823                         unset($params['conf']['content']);
00824 
00825                         $crawler->addQueueEntry_callBack(0,$params,'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files',$this->conf['id']);
00826                         $this->log_setTSlogMessage('media "'.$params['document'].'" added to "crawler" queue.',1);
00827                     } elseif ($linkInfo['localPath']) {
00828                             // Index local file under its href, reading the content from the local path:
00829                         $fI = pathinfo($linkSource);
00830                         $ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
00831                         $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
00832                     } else {
00833                             // Index local file:
00834                         $this->indexRegularDocument($linkSource);
00835                     }
00836                 }
00837             }
00838         }
00839     }
00840 
00841     /**
00842      * Extracts all links (<a> tags with a href that does not start with "#") from the HTML content string
00843      *
00844      * @param string $html HTML content to search for links
00845      * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
00846      * @see extractLinks()
00847      */
00848     function extractHyperLinks($html)   {
00849         $htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
00850         $htmlParts = $htmlParser->splitTags('a', $html);
00851         $hyperLinksData = array();
00852         foreach ($htmlParts as $index => $tagData) {
00853             if (($index % 2) !== 0) {
00854                 $tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
00855                 $firstTagName = $htmlParser->getFirstTagName($tagData);
00856                 $href = isset($tagAttributes[0]['href']) ? $tagAttributes[0]['href'] : '';
00857                     // Skip anchors without href and pure fragment links. substr() replaces the {0} offset syntax, which PHP 8 no longer parses.
00858                 if (strtolower($firstTagName) == 'a') {
00859                     if ($href && substr($href, 0, 1) != '#') {
00860                         $hyperLinksData[] = array(
00861                             'tag' => $tagData,
00862                             'href' => $href,
00863                             'localPath' => $this->createLocalPath($href)
00864                         );
00865                     }
00866                 }
00867             }
00868         }
00869         return $hyperLinksData;
00870     }
00871 
00872     /**
00873      * Extracts the "base href" from content string: the href of the first <base> tag that has a non-empty one.
00874      *
00875      * @param   string      Content to analyze
00876      * @return  string      The base href or an empty string if not found
00877      */
00878     public function extractBaseHref($html) {
00879         $href = '';
00880         $htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
00881         $htmlParts = $htmlParser->splitTags('base', $html);
00882         foreach ($htmlParts as $index => $tagData) {
00883             if (($index % 2) !== 0) {
00884                 $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
00885                 $firstTagName = $htmlParser->getFirstTagName($tagData);
00886                 if (strtolower($firstTagName) == 'base' && isset($tagAttributes[0]['href'])) {
00887                         // The isset() check keeps a <base> tag without href (e.g. only "target") from replacing the '' default with NULL
00888                     $href = $tagAttributes[0]['href'];
00889                     if ($href) {
00890                         break;
00891                     }
00892                 }
00893             }
00894         }
00895         return $href;
00896     }
00897 
00898     /******************************************
00899      *
00900      * Indexing; external URL
00901      *
00902      ******************************************/
00903 
00904     /**
00905      * Index External URLs HTML content. Only URLs whose "Content-Type" header contains "text/html" are fetched and indexed.
00906      *
00907      * @param   string      URL, eg. "http://typo3.org/"
00908      * @return  void
00909      * @see indexRegularDocument()
00910      */
00911     function indexExternalUrl($externalUrl) {
00912             // Get headers and look up the content type. HTTP header names are case-insensitive, so e.g. "content-type" has to be found as well:
00913         $urlHeaders = $this->getUrlHeaders($externalUrl);
00914         $contentType = '';
00915         if (is_array($urlHeaders)) {
00916             foreach ($urlHeaders as $headKey => $headValue) {
00917                 if (strtolower(trim($headKey)) === 'content-type') {
00918                     $contentType = $headValue;
00919                 }
00920             }
00921         }
00922         if (stristr($contentType,'text/html'))  {
00923             $content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
00924             if (strlen($content))   {
00925                     // Create temporary file:
00926                 $tmpFile = t3lib_div::tempnam('EXTERNAL_URL');
00927                 if ($tmpFile) {
00928                     t3lib_div::writeFile($tmpFile, $content);
00929                         // Index that file:
00930                     $this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');  // Using "TRUE" for second parameter to force indexing of external URLs (mtime doesn't make sense, does it?)
00931                     unlink($tmpFile);
00932                 }
00933             }
00934         }
00935     }
00936 
00937     /**
00938      * Getting the HTTP headers returned for a URL
00939      *
00940      * @param   string      The URL
00941      * @return  mixed       If no answer, returns false. Otherwise an array where the HTTP header names are the keys (spelled as received)
00942      *                      and the trimmed header values are the values. Lines without ":" get an empty string as value.
00943      */
00944     function getUrlHeaders($url)    {
00945         $content = t3lib_div::getURL($url,2);   // Try to get the headers only
00946         if (!strlen($content))  {
00947             return FALSE;   // explicit, as documented (was an implicit NULL before)
00948         }
00949             // Compile headers:
00950         $headers = t3lib_div::trimExplode(LF,$content,1);
00951         $retVal = array();
00952         foreach($headers as $line)  {
00953             if (!strlen(trim($line)))   {
00954                 break;  // Stop at the first empty line (= end of header)
00955             }
00956                 // array_pad(): a line without ":" makes explode() return one element only
00957             list($headKey, $headValue) = array_pad(explode(':', $line, 2), 2, '');
00958             $retVal[$headKey] = trim($headValue);
00959         }
00960         return $retVal;
00961     }
00962 
00963 
00964 
00965     /**
00966      * Tries to map a link target to a local file by asking a fixed list of createLocalPath* methods in turn.
00967      * The first non-empty answer is returned.
00968      * @param string $sourcePath Link target to resolve
00969      * @return string Absolute path to file if file is local, else empty string
00970      */
00971     protected function createLocalPath($sourcePath) {
00972             // Methods of this class that are asked, in this order:
00973         static $resolverMethods = array(
00974             'createLocalPathFromT3vars',
00975             'createLocalPathUsingAbsRefPrefix',
00976             'createLocalPathUsingDomainURL',
00977             'createLocalPathFromAbsoluteURL',
00978             'createLocalPathFromRelativeURL'
00979             );
00980         $resolvedPath = '';
00981         for ($i = 0; $i < count($resolverMethods) && $resolvedPath == ''; $i++) {
00982             $methodName = $resolverMethods[$i];
00983             $resolvedPath = $this->$methodName($sourcePath);
00984         }
00985 
00986         return $resolvedPath;
00987     }
00988 
00989     /**
00990      * Attempts to create a local file path from T3VARs: $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles']
00991      * is searched for the short MD5 hash of the source path. This is useful for various download extensions
00992      * that hide the actual file name but still want the file to be indexed.
00993      *
00994      * @param string $sourcePath
00995      * @return string The registered file if it exists, otherwise an empty string
00996      */
00997     protected function createLocalPathFromT3vars($sourcePath) {
00998         $registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
00999         if (!is_array($registeredFiles)) {
01000             return '';
01001         }
01002 
01003         $hashKey = t3lib_div::shortMD5($sourcePath);
01004             // Note: not using self::isAllowedLocalFile here because this method
01005             // is allowed to index files outside of the web site (for example,
01006             // protected downloads)
01007         $isRegisteredFile = (isset($registeredFiles[$hashKey]) && is_file($registeredFiles[$hashKey]));
01008 
01009         return $isRegisteredFile ? $registeredFiles[$hashKey] : '';
01010     }
01011 
01012     /**
01013      * Attempts to create a local file path for source paths that start with the site URL (TYPO3_SITE_URL) of the current request.
01014      *
01015      * @param string $sourcePath
01016      * @return string Path below PATH_site if it passes self::isAllowedLocalFile(), otherwise an empty string
01017      */
01018     protected function createLocalPathUsingDomainURL($sourcePath) {
01019         $localPath = '';
01020         $baseURL = t3lib_div::getIndpEnv('TYPO3_SITE_URL');
01021         $baseURLLength = strlen($baseURL);
01022             // An empty site URL would "match" every path; require a real prefix, like createLocalPathUsingAbsRefPrefix() does
01023         if ($baseURLLength > 0 && substr($sourcePath, 0, $baseURLLength) == $baseURL) {
01024             $localPath = PATH_site . substr($sourcePath, $baseURLLength);
01025             if (!self::isAllowedLocalFile($localPath)) {
01026                 $localPath = '';
01027             }
01028         }
01029         return $localPath;
01030     }
01031 
01032     /**
01033      * Attempts to create a local file path by stripping the configured absRefPrefix from the source path.
01034      * This requires TSFE. If TSFE is missing, this function returns an empty string.
01035      *
01036      * @param string $sourcePath
01037      * @return string Path below PATH_site if it passes self::isAllowedLocalFile(), otherwise an empty string
01038      */
01039     protected function createLocalPathUsingAbsRefPrefix($sourcePath) {
01040         if (!($GLOBALS['TSFE'] instanceof tslib_fe)) {
01041             return '';
01042         }
01043 
01044         $prefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
01045         $prefixLength = strlen($prefix);
01046         $startsWithPrefix = ($prefixLength > 0 && substr($sourcePath, 0, $prefixLength) == $prefix);
01047         if (!$startsWithPrefix) {
01048             return '';
01049         }
01050 
01051         $candidate = PATH_site . substr($sourcePath, $prefixLength);
01052         return self::isAllowedLocalFile($candidate) ? $candidate : '';
01053     }
01054 
01055     /**
01056      * Attempts to create a local file path from the absolute URL without
01057      * schema, i.e. a source path starting with "/".
01058      *
01059      * @param string $sourcePath
01060      * @return string Path below PATH_site if it passes self::isAllowedLocalFile(), otherwise an empty string
01061      */
01062     protected function createLocalPathFromAbsoluteURL($sourcePath) {
01063         $localPath = '';
01064             // substr() instead of the $sourcePath{0} offset syntax: no notice for an empty string and it still parses on PHP 8
01065         if (substr($sourcePath, 0, 1) == '/') {
01066             $localPath = PATH_site . substr($sourcePath, 1);
01067             if (!self::isAllowedLocalFile($localPath)) {
01068                 $localPath = '';
01069             }
01070         }
01071         return $localPath;
01072     }
01073 
01074     /**
01075      * Attempts to create a local file path from the relative URL by prepending PATH_site.
01076      *
01077      * @param string $sourcePath
01078      * @return string Path below PATH_site if it passes self::isAllowedLocalFile(), otherwise an empty string
01079      */
01080     protected function createLocalPathFromRelativeURL($sourcePath) {
01081         if (!self::isRelativeURL($sourcePath)) {
01082             return '';
01083         }
01084 
01085         $candidate = PATH_site . $sourcePath;
01086         $isIndexable = self::isAllowedLocalFile($candidate);
01087 
01088         return $isIndexable ? $candidate : '';
01089     }
01090 
01091     /**
01092      * Checks if URL is relative, i.e. it has no scheme and its path does not start with "/".
01093      *
01094      * @param string $url
01095      * @return boolean
01096      */
01097     static protected function isRelativeURL($url) {
01098         $urlParts = @parse_url($url);   // FALSE for seriously malformed URLs; absent components are not set, hence the isset() checks
01099         return ((!isset($urlParts['scheme']) || $urlParts['scheme'] == '') && !(isset($urlParts['path']) && substr($urlParts['path'], 0, 1) == '/'));
01100     }
01101 
01102     /**
01103      * Checks if the path, after t3lib_div::resolveBackPath(), starts with PATH_site and points to an existing file
01104      *
01105      * @param string $filePath
01106      * @return boolean
01107      */
01108     static protected function isAllowedLocalFile($filePath) {
01109         $resolvedPath = t3lib_div::resolveBackPath($filePath);
01110             // Compare the leading part of the resolved path with the site path:
01111         $leadingPart = substr($resolvedPath, 0, strlen(PATH_site));
01112         return ($leadingPart == PATH_site) && is_file($resolvedPath);
01113     }
01114 
01115     /******************************************
01116      *
01117      * Indexing; external files (PDF, DOC, etc)
01118      *
01119      ******************************************/
01120 
01121     /**
01122      * Indexing a regular document given as $file (relative to PATH_site, local file)
01123      *
01124      * @param   string      Relative Filename, relative to PATH_site. It can also be an absolute path as long as it is accepted by t3lib_div::isAllowedAbsPath(). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
01125      * @param   boolean     If set, indexing is forced (despite content hashes, mtime etc).
01126      * @param   string      Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
01127      * @param   string      File extension for temporary file. If set it is used instead of the extension of $file.
01128      * @return  void
01129      */
01130     function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='')    {
01131 
01132             // Init (pathinfo() has no "extension" key for names without extension)
01133         $fI = pathinfo($file);
01134         $ext = $altExtension ? $altExtension : (isset($fI['extension']) ? strtolower($fI['extension']) : '');
01135 
01136             // Create abs-path:
01137         if (!$contentTmpFile)   {
01138             if (!t3lib_div::isAbsPath($file))   {   // Relative, prepend PATH_site:
01139                 $absFile = t3lib_div::getFileAbsFileName(PATH_site.$file);
01140             } else {    // Absolute, pass-through:
01141                 $absFile = $file;
01142             }
01143             $absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
01144         } else {
01145             $absFile = $contentTmpFile;
01146         }
01147 
01148             // Indexing the document:
01149         if ($absFile && @is_file($absFile)) {
01150             if (!empty($this->external_parsers[$ext]))  {   // empty() instead of a plain read: no notice for extensions without parser
01151                 $mtime = filemtime($absFile);
01152                 $cParts = $this->fileContentParts($ext,$absFile);
01153 
01154                 foreach($cParts as $cPKey)  {
01155                     $this->internal_log = array();
01156                     $this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
01157                     $Pstart = t3lib_div::milliseconds();
01158                     $subinfo = array('key' => $cPKey);  // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
01159                     $phash_arr = $this->file_phash_arr = $this->setExtHashes($file,$subinfo);
01160                     $check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);
01161                     if ($check > 0 || $force)   {
01162                         if ($check > 0) {
01163                             $this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
01164                         } else {
01165                             $this->log_setTSlogMessage('Indexing forced by flag',1);
01166                         }
01167 
01168                             // Check external file counter:
01169                         if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
01170 
01171                                     // Divide into title,keywords,description and body:
01172                             $this->log_push('Split content','');
01173                                 $contentParts = $this->readFileContent($ext,$absFile,$cPKey);
01174                             $this->log_pull();
01175 
01176                             if (is_array($contentParts))    {
01177                                     // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
01178                                 $content_md5h = $this->md5inthash(implode('',$contentParts));   // glue first: the legacy (array, glue) argument order is rejected by PHP 8
01179 
01180                                 if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force)  {
01181 
01182                                         // Increment counter:
01183                                     $this->externalFileCounter++;
01184 
01185                                         // Splitting words
01186                                     $this->log_push('Extract words from content','');
01187                                         $splitInWords = $this->processWordsInArrays($contentParts);
01188                                     $this->log_pull();
01189 
01190                                         // Analyse the indexed words.
01191                                     $this->log_push('Analyse the extracted words','');
01192                                         $indexArr = $this->indexAnalyze($splitInWords);
01193                                     $this->log_pull();
01194 
01195                                         // Submitting page (phash) record
01196                                     $this->log_push('Submitting page','');
01197                                         $size = filesize($absFile);
01198                                         $ctime = filemtime($absFile);   // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
01199                                         $this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
01200                                     $this->log_pull();
01201 
01202                                         // Check words and submit to word list if not there
01203                                     $this->log_push('Check word list and submit words','');
01204                                         $this->checkWordList($indexArr);
01205                                         $this->submitWords($indexArr,$phash_arr['phash']);
01206                                     $this->log_pull();
01207 
01208                                         // Set parsetime
01209                                     $this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
01210                                 } else {
01211                                     $this->updateTstamp($phash_arr['phash'],$mtime);    // Update the timestamp
01212                                     $this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
01213                                 }
01214                             } else $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
01215                         } else $this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
01216                     } else $this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
01217 
01218                         // Checking and setting sections:
01219         #           $this->submitFile_grlist($phash_arr['phash']);  // Setting a gr_list record if there is none already (set for default fe_group)
01220                     $this->submitFile_section($phash_arr['phash']);     // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
01221                     $this->log_pull();
01222                 }
01223             } else $this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
01224         } else $this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
01225     }
01226 
01227     /**
01228      * Reads the content of an external file being indexed by asking the parser object registered for the extension.
01229      * The content from the external parser MUST be returned in utf-8!
01230      *
01231      * @param   string      File extension, eg. "pdf", "doc" etc.
01232      * @param   string      Absolute filename of file (must exist and be validated OK before calling function)
01233      * @param   string      Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
01234      * @return  array       Standard content array (title, description, keywords, body keys) as returned by the parser; NULL if no parser object is registered for the extension
01235      */
01236     function readFileContent($ext,$absFile,$cPKey)  {
01237         $contentArr = NULL;     // explicit result for "no parser object"; the variable was undefined in that case before
01238             // Consult relevant external document parser:
01239         if (isset($this->external_parsers[$ext]) && is_object($this->external_parsers[$ext]))   {
01240             $contentArr = $this->external_parsers[$ext]->readFileContent($ext,$absFile,$cPKey);
01241         }
01242 
01243         return $contentArr;
01244     }
01245 
01246     /**
01247      * Creates an array with pointers to divisions of document, as decided by the parser object registered for the extension.
01248      *
01249      * @param   string      File extension
01250      * @param   string      Absolute filename (must exist and be validated OK before calling function)
01251      * @return  array       Array of pointers to sections that the document should be divided into; array(0) if there is no parser object
01252      */
01253     function fileContentParts($ext,$absFile)    {
01254         $parser = $this->external_parsers[$ext];
01255 
01256             // Let the external document parser decide on the divisions, if there is one:
01257         if (is_object($parser)) {
01258             return $parser->fileContentParts($ext,$absFile);
01259         }
01260 
01261         return array(0);
01262     }
01263 
01264     /**
01265      * Wraps non-HTML content (from external files for instance) into a content array
01266      *
01267      * @param   string      Input content (non-HTML) to index.
01268      * @return  array       Copy of $this->defaultContentArray with the key "body" set to the input content
01269      * @see splitHTMLContent()
01270      */
01271     function splitRegularContent($content) {
01272             // Start from the default content array and put the whole input into "body":
01273         $parts = $this->defaultContentArray;
01274         $parts['body'] = $content;
01275         return $parts;
01276     }
01277 
01278 
01279 
01280 
01281 
01282 
01283 
01284 
01285 
01286 
01287 
01288 
01289 
01290 
01291     /**********************************
01292      *
01293      * Analysing content, Extracting words
01294      *
01295      **********************************/
01296 
/**
 * Converts every non-empty value of the content array to utf-8 and decodes
 * numeric / HTML entities in it. The array is modified in place.
 *
 * @param   array       $contentArr Standard content array, passed by reference
 * @param   string      $charset Charset of the input content (converted to utf-8 unless it is exactly "utf-8")
 * @return  void
 */
function charsetEntity2utf8(&$contentArr, $charset) {
    foreach ($contentArr as $field => $unused) {
        // Empty values need no conversion at all.
        if (!strlen($contentArr[$field])) {
            continue;
        }

        $text = $contentArr[$field];

        // Convert charset if necessary
        if ($charset!=='utf-8') {
            $text = $this->csObj->utf8_encode($text, $charset);
        }

        // Decode all numeric / html-entities in the string to real characters:
        $contentArr[$field] = $this->csObj->entities_to_utf8($text,TRUE);
    }
}
01319 
/**
 * Splits every part of the content array into words using the lexer object
 * and removes duplicate words from title, keywords and description.
 *
 * @param   array       $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return  array       Content input array modified so each key is now an array of words (unique for title, keywords and description)
 */
function processWordsInArrays($contentArr) {
    // Split all parts to words
    foreach ($contentArr as $part => $unused) {
        $contentArr[$part] = $this->lexerObj->split2Words($contentArr[$part]);
    }

    // For title, keywords, and description we don't want duplicates:
    foreach (array('title', 'keywords', 'description') as $headerPart) {
        $contentArr[$headerPart] = array_unique($contentArr[$headerPart]);
    }

    return $contentArr;
}
01341 
/**
 * Deprecated, misspelled alias of processWordsInArrays().
 * Logs the deprecated call and forwards to the correctly named method.
 *
 * @param   array       $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return  array       Result of processWordsInArrays() for the same input
 * @deprecated since TYPO3 4.0, this function will be removed in TYPO3 4.5.
 */
function procesWordsInArrays($contentArr) {
    t3lib_div::logDeprecatedFunction();

    $processed = $this->processWordsInArrays($contentArr);
    return $processed;
}
01355 
/**
 * Extracts the sample description text from the body of the content array.
 *
 * @param   array       $contentArr Content array (only the "body" key is read)
 * @return  string      Description string; empty string if the configured maximum length evaluates to zero
 */
function bodyDescription($contentArr) {
    // Defined up front so the return value is always a string
    // (it used to be an undefined variable when $maxL was 0).
    $bodyDescription = '';

    // Maximum description length from configuration "index_descrLgd",
    // passed through t3lib_div::intInRange() with the arguments 0, 255, 200.
    $maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
    if ($maxL) {
        // Turn tabs and line breaks into plain spaces:
        $bodyDescription = str_replace(array(' ',TAB,CR,LF),' ',$contentArr['body']);

        // Shorten the string:
        $bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
    }

    return $bodyDescription;
}
01377 
/**
 * Analyzes content to use for indexing.
 * Header parts are registered with a type priority bit (title 7, keywords 6,
 * description 5) through analyzeHeaderinfo(); afterwards the body words are
 * added through analyzeBody().
 *
 * @param   array       $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return  array       Index array keyed by word (cut to 60 bytes); each entry holds "count", "hash" and "metaphone", plus "cmp" for header words and "first" for words first met in the body
 */
function indexAnalyze($content) {
    $indexArr = array();

    $this->analyzeHeaderinfo($indexArr,$content,'title',7);
    $this->analyzeHeaderinfo($indexArr,$content,'keywords',6);
    $this->analyzeHeaderinfo($indexArr,$content,'description',5);
    $this->analyzeBody($indexArr,$content);

    return $indexArr;
}
01395 
/**
 * Calculates relevant information for header content (title, keywords, description).
 * For every word the type bit is OR'ed into "cmp", "count" is increased and
 * "hash" / "metaphone" are (re)calculated. $this->wordcount is increased per word.
 *
 * @param   array       $retArr Index array, passed by reference
 * @param   array       $content Standard content array
 * @param   string      $key Key from standard content array
 * @param   integer     $offset Bit position of the type priority; 2^$offset is OR'ed into "cmp"
 * @return  void
 */
function analyzeHeaderinfo(&$retArr,$content,$key,$offset) {
    // The bit mask is the same for every word of this header part.
    $typeBit = pow(2,$offset);

    foreach ($content[$key] as $val) {
        $val = substr($val,0,60);   // Max 60 - because the baseword varchar IS 60. This MUST be the same.

        // Initialise the counters on first sight of the word so they are
        // never read while undefined.
        if (!isset($retArr[$val]['cmp'])) {
            $retArr[$val]['cmp'] = 0;
        }
        if (!isset($retArr[$val]['count'])) {
            $retArr[$val]['count'] = 0;
        }

        $retArr[$val]['cmp'] = $retArr[$val]['cmp']|$typeBit;
        $retArr[$val]['count'] = $retArr[$val]['count']+1;
        $retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
        $retArr[$val]['metaphone'] = $this->metaphone($val);
        $this->wordcount++;
    }
}
01415 
/**
 * Calculates relevant information for body content.
 * Words not yet present in the index array get "first" (key of the word in
 * the body array), "hash" and "metaphone"; every occurrence increases "count"
 * and $this->wordcount.
 *
 * @param   array       $retArr Index array, passed by reference
 * @param   array       $content Standard content array (only the "body" key is read)
 * @return  void
 */
function analyzeBody(&$retArr,$content) {
    foreach ($content['body'] as $key => $val) {
        $val = substr($val,0,60);   // Max 60 - because the baseword varchar IS 60. This MUST be the same.

        if (!isset($retArr[$val])) {
            $retArr[$val]['first'] = $key;
            $retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
            $retArr[$val]['metaphone'] = $this->metaphone($val);
        }

        // Start counting at zero instead of reading an undefined index:
        if (!isset($retArr[$val]['count'])) {
            $retArr[$val]['count'] = 0;
        }

        $retArr[$val]['count'] = $retArr[$val]['count']+1;
        $this->wordcount++;
    }
}
01435 
/**
 * Creates a metaphone value for the input word, either raw or hashed to an integer.
 * Uses $this->metaphoneObj if it is an object, otherwise PHP's metaphone().
 *
 * @param   string      $word Word to convert
 * @param   boolean     $retRaw If set, returns the raw metaphone value (not hashed)
 * @return  mixed       Metaphone hash integer (0 for an empty metaphone value) or the raw value
 */
function metaphone($word,$retRaw=FALSE) {
    $raw = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        : metaphone($word);

    // Return raw value?
    if ($retRaw) {
        return $raw;
    }

    // Otherwise create hash (first 7 hex digits of the md5) and return integer
    return $raw=='' ? 0 : hexdec(substr(md5($raw),0,7));
}
01458 
01459 
01460 
01461 
01462 
01463 
01464 
01465 
01466 
01467 
01468 
01469 
01470 
01471 
01472 
01473 
01474     /********************************
01475      *
01476      * SQL; TYPO3 Pages
01477      *
01478      *******************************/
01479 
/**
 * Updates db with information about the page (TYPO3 page, not external media).
 * Old records for the current phash are removed first; then index_phash,
 * index_section, index_grlist and index_fulltext rows are written, plus an
 * index_debug row when debug mode is enabled.
 *
 * @return  void
 */
function submitPage() {
    $phash = $this->hash['phash'];
    $now = $GLOBALS['EXEC_TIME'];

    // Remove any current data for this phash:
    $this->removeOldIndexedPages($phash);

    // Setting new phash_row
    $phashRow = array(
        'phash' => $phash,
        'phash_grouping' => $this->hash['phash_grouping'],
        'cHashParams' => serialize($this->cHashParams),
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_reg1' => $this->conf['page_cache_reg1'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        'item_type' => 0,   // TYPO3 page
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => $this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $now,
        'crdate' => $now,
        'item_crdate' => $this->conf['crdate'], // Creation date of page
        'sys_language_uid' => $this->conf['sys_language_uid'],  // Sys language uid of the page. Should reflect which language it DOES actually display!
        'externalUrl' => 0,
        'recordUid' => intval($this->conf['recordUid']),
        'freeIndexUid' => intval($this->conf['freeIndexUid']),
        'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
    );
    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);

    // PROCESSING index_section (for TYPO3 pages both hashes are the same)
    $this->submit_section($phash,$phash);

    // PROCESSING index_grlist
    $this->submit_grlist($phash,$phash);

    // PROCESSING index_fulltext; optionally cut to the configured length
    $fullText = implode(' ', $this->contentParts);
    $maxFullTextLength = $this->indexerConfig['fullTextDataLength'];
    if ($maxFullTextLength>0) {
        $fullText = substr($fullText,0,$maxFullTextLength);
    }
    $fullTextRow = array(
        'phash' => $phash,
        'fulltextdata' => $fullText
    );
    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fullTextRow);

    // PROCESSING index_debug (only in debug mode)
    if (!$this->indexerConfig['debugMode']) {
        return;
    }

    // "content" and "body" are cut to 1000 bytes to keep the debug record small.
    $debugInfo = array(
        'cHashParams' => $this->cHashParams,
        'external_parsers initialized' => array_keys($this->external_parsers),
        'conf' => array_merge($this->conf,array('content'=>substr($this->conf['content'],0,1000))),
        'contentParts' => array_merge($this->contentParts,array('body' => substr($this->contentParts['body'],0,1000))),
        'logs' => $this->internal_log,
        'lexer' => $this->lexerObj->debugString,
    );
    $debugRow = array(
        'phash' => $phash,
        'debuginfo' => serialize($debugInfo)
    );
    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
}
01550 
/**
 * Stores a gr_list record for the current group list ($this->conf['gr_list'])
 * in the table index_grlist.
 *
 * @param   integer     $hash Search result record phash
 * @param   integer     $phash_x Actual phash of current content
 * @return  void
 * @see update_grlist()
 */
function submit_grlist($hash,$phash_x) {
    $grList = $this->conf['gr_list'];

    // Setting the gr_list record
    $record = array(
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => $this->md5inthash($grList),
        'gr_list' => $grList
    );

    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $record);
}
01570 
/**
 * Stores a section record in the table index_section.
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is
 * external files: submitFile_section() passes the file phash as $hash and
 * the phash of the current page ($this->hash['phash']) as $hash_t3.
 *
 * @param   integer     $hash phash written to the "phash" field
 * @param   integer     $hash_t3 phash written to the "phash_t3" field
 * @return  void
 */
function submit_section($hash,$hash_t3) {
    $record = array(
        'phash' => $hash,
        'phash_t3' => $hash_t3,
        'page_id' => intval($this->conf['id'])
    );

    // Add the rootline fields (rl0, rl1, rl2 and any configured extras):
    $this->getRootLineFields($record);

    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $record);
}
01590 
/**
 * Removes all index records of the indexed page with the given phash.
 *
 * @param   integer     $phash phash value to flush
 * @return  void
 */
function removeOldIndexedPages($phash) {
    $phash = intval($phash);

    // Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
    $tables = array('index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');
    foreach ($tables as $table) {
        $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash='.$phash);
    }

    // Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
    $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3='.$phash);
}
01606 
01607 
01608 
01609 
01610 
01611 
01612 
01613 
01614 
01615 
01616 
01617 
01618 
01619     /********************************
01620      *
01621      * SQL; External media
01622      *
01623      *******************************/
01624 
01625 
/**
 * Updates db with information about the file.
 * Old records for the file phash are removed first; then index_phash and
 * index_fulltext rows are written, plus an index_debug row in debug mode.
 *
 * @param   array       $hash Array with phash and phash_grouping keys for file
 * @param   string      $file File name
 * @param   array       $subinfo Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param   string      $ext File extension determining the type of media.
 * @param   integer     $mtime Modification time of file.
 * @param   integer     $ctime Creation time of file.
 * @param   integer     $size Size of file in bytes
 * @param   integer     $content_md5h Content HASH value.
 * @param   array       $contentParts Standard content array (using only title and body for a file)
 * @return  void
 */
function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts) {

    // Find item Type: the parser may map the extension to another item type,
    // otherwise the extension itself is stored.
    $storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
    $storeItemType = $storeItemType ? $storeItemType : $ext;

    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);

    // Split filename. parse_url() returns FALSE for seriously malformed input
    // and omits the "scheme" key for plain paths, so check before reading it.
    $fileParts = parse_url($file);
    $isExternalUrl = (is_array($fileParts) && !empty($fileParts['scheme'])) ? 1 : 0;

    // Setting new
    $fields = array(
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'cHashParams' => serialize($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $storeItemType,
        'item_title' => trim($contentParts['title']) ? $contentParts['title'] : basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        'externalUrl' => $isExternalUrl,
        'recordUid' => intval($this->conf['recordUid']),
        'freeIndexUid' => intval($this->conf['freeIndexUid']),
        'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
    );
    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);

    // PROCESSING index_fulltext; optionally cut to the configured length
    $fields = array(
        'phash' => $hash['phash'],
        'fulltextdata' => implode(' ', $contentParts)
    );
    if ($this->indexerConfig['fullTextDataLength']>0) {
        $fields['fulltextdata'] = substr($fields['fulltextdata'],0,$this->indexerConfig['fullTextDataLength']);
    }
    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);

    // PROCESSING index_debug (only in debug mode; body cut to 1000 bytes)
    if ($this->indexerConfig['debugMode']) {
        $fields = array(
            'phash' => $hash['phash'],
            'debuginfo' => serialize(array(
                    'cHashParams' => $subinfo,
                    'contentParts' => array_merge($contentParts,array('body' => substr($contentParts['body'],0,1000))),
                    'logs' => $this->internal_log,
                    'lexer' => $this->lexerObj->debugString,
                ))
        );
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields);
    }
}
01699 
/**
 * Stores a gr_list record for a file IF none exists already for either the
 * default group list or the current group list.
 *
 * @param   integer     $hash phash value of file
 * @return  void
 */
function submitFile_grlist($hash) {
    // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
    $where = 'phash=' . intval($hash) .
        ' AND (hash_gr_list=' . $this->md5inthash($this->defaultGrList) .
        ' OR hash_gr_list=' . $this->md5inthash($this->conf['gr_list']) . ')';

    $existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
    if ($existing) {
        return;
    }

    $this->submit_grlist($hash,$hash);
}
01719 
/**
 * Stores a section record for a file IF none exists yet for the file phash
 * on the current page ($this->conf['id']).
 *
 * @param   integer     $hash phash value of file
 * @return  void
 */
function submitFile_section($hash) {
    // Testing if there is a section. Counting rows (as submitFile_grlist()
    // does) avoids keeping an unfreed result set around just for a row count.
    $count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows(
        'phash',
        'index_section',
        'phash=' . intval($hash) . ' AND page_id=' . intval($this->conf['id'])
    );
    if (!$count) {
        // The section record links the file phash to the phash of the current page.
        $this->submit_section($hash,$this->hash['phash']);
    }
}
01733 
/**
 * Removes the index records of the indexed file with the given phash.
 * Unlike removeOldIndexedPages() the table index_section is left untouched.
 *
 * @param   integer     $phash phash value to flush
 * @return  void
 */
function removeOldIndexedFiles($phash) {
    $where = 'phash='.intval($phash);

    // Removing old registrations for tables.
    $tables = array('index_phash', 'index_grlist', 'index_fulltext', 'index_debug');
    foreach ($tables as $table) {
        $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, $where);
    }
}
01748 
01749 
01750 
01751 
01752 
01753 
01754 
01755 
01756 
01757 
01758 
01759 
01760 
01761 
01762     /********************************
01763      *
01764      * SQL Helper functions
01765      *
01766      *******************************/
01767 
/**
 * Check the mtime / tstamp of the currently indexed page/file (based on phash)
 * Return positive integer if the page needs to be indexed
 *
 * @param   integer     $mtime mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
 * @param   integer     $phash "phash" used to select any already indexed page to see what its mtime is.
 * @return  integer     Result integer: Generally: <0 = No indexing, >0 = Do indexing (see $this->reasons): -2) Min age was NOT exceeded and so indexing cannot occur.  -1) mtime matched so no need to reindex page. 0) N/A   1) Max age exceeded, page must be indexed again.   2) mtime of indexed page doesn't match mtime given for current content and we must index page.  3) No mtime was set, so we will index...  4) No indexed page found, so of course we will index.
 */
function checkMtimeTstamp($mtime,$phash) {
    $now = $GLOBALS['EXEC_TIME'];

    // Select indexed page:
    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('item_mtime,tstamp', 'index_phash', 'phash='.intval($phash));
    $row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);

    // Page has never been indexed (is not represented in the index_phash table).
    if (!$row) {
        return 4;
    }

    // The configured max-age was exceeded for the document and thus it's indexed.
    if ($this->tstamp_maxAge && ($row['tstamp'] + $this->tstamp_maxAge) < $now) {
        return 1;
    }

    // A minimum age is set and it was not exceeded yet.
    if ($this->tstamp_minAge && ($row['tstamp'] + $this->tstamp_minAge) >= $now) {
        return -2;
    }

    // The minimum age was exceeded (or none is set), but mtime was not set, so the page is indexed.
    if (!$mtime) {
        return 3;
    }

    // mtime is set and differs from the index_phash mtime: it's about time to re-index.
    if ($row['item_mtime'] != $mtime) {
        return 2;
    }

    // mtime matched the document, so no changes detected and no content updated
    if ($this->tstamp_maxAge) {
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($row['tstamp'] + $this->tstamp_maxAge - $now) . ' seconds to expire time).', 1);
    } else {
        $this->updateTstamp($phash);    // Update the timestamp
        $this->log_setTSlogMessage('mtime matched, timestamp updated.',1);
    }
    return -1;
}
01806 
/**
 * Check content hash in phash table
 *
 * @return  mixed       Returns 1 if the page needs to be indexed (that is, there was no result), otherwise the row (an array with the key "phash") of the phash record to which the grlist_record should be related!
 */
function checkContentHash() {
    // With this query the page will only be indexed if it's content is different from the same "phash_grouping" -page.
    $where = 'A.phash_grouping='.intval($this->hash['phash_grouping']).' AND A.contentHash='.intval($this->content_md5h);
    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_phash A', $where);

    $row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);
    return $row ? $row : 1;
}
01820 
/**
 * Check content hash for external documents
 *
 * @param   integer     $hashGr phash value to check (phash_grouping)
 * @param   integer     $content_md5h Content hash to check
 * @return  integer     1 if the document needs to be indexed (no index_phash row with this grouping and content hash was found), otherwise 0
 */
function checkExternalDocContentHash($hashGr,$content_md5h) {
    $where = 'A.phash_grouping='.intval($hashGr).' AND A.contentHash='.intval($content_md5h);
    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A', $where);

    // Any matching row means the same content is indexed already.
    return $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res) ? 0 : 1;
}
01836 
/**
 * Checks if a grlist record has been set for the phash value input (looking at the "real" phash of the current content, not the linked-to phash of the common search result page)
 *
 * @param   integer     $phash_x Phash integer to test.
 * @return  mixed       Result of exec_SELECTcountRows() for index_grlist records with this phash_x
 */
function is_grlist_set($phash_x) {
    $where = 'phash_x=' . intval($phash_x);

    return $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash_x', 'index_grlist', $where);
}
01850 
/**
 * Check if an grlist-entry for this hash and the current group list exists
 * and if not so, write one and log it.
 *
 * @param   integer     $phash phash of the search result that should be found
 * @param   integer     $phash_x The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; This is the whole reason for the grlist table in fact...
 * @return  void
 * @see submit_grlist()
 */
function update_grlist($phash,$phash_x) {
    // Only the number of matching rows matters, so count them directly
    // (as is_grlist_set() does) instead of leaving an unfreed result set.
    $count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows(
        'phash',
        'index_grlist',
        'phash=' . intval($phash) . ' AND hash_gr_list=' . $this->md5inthash($this->conf['gr_list'])
    );
    if (!$count) {
        $this->submit_grlist($phash,$phash_x);
        $this->log_setTSlogMessage("Inserted gr_list '".$this->conf['gr_list']."' for phash '".$phash."'",1);
    }
}
01866 
/**
 * Sets tstamp of a phash row to the current execution time and optionally
 * updates its item_mtime.
 *
 * @param   integer     $phash phash value
 * @param   integer     $mtime If set, update the mtime field to this value.
 * @return  void
 */
function updateTstamp($phash,$mtime=0) {
    $changes = array('tstamp' => $GLOBALS['EXEC_TIME']);

    // item_mtime is only touched when a (non-zero) mtime was passed.
    if ($mtime) {
        $changes['item_mtime'] = intval($mtime);
    }

    $where = 'phash='.intval($phash);
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $changes);
}
01882 
/**
 * Writes the configured freeIndexSetId to the index_phash record.
 *
 * @param   integer     $phash phash value
 * @return  void
 */
function updateSetId($phash) {
    $where = 'phash='.intval($phash);
    $changes = array('freeIndexSetId' => intval($this->conf['freeIndexSetId']));

    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $changes);
}
01896 
/**
 * Stores the parsetime in the index_phash record.
 *
 * @param   integer     $phash phash value.
 * @param   integer     $parsetime Parsetime value to set.
 * @return  void
 */
function updateParsetime($phash,$parsetime) {
    $where = 'phash='.intval($phash);
    $changes = array('parsetime' => intval($parsetime));

    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $changes);
}
01911 
/**
 * Updates the rootline fields of all index_section records of the current page.
 *
 * @return  void
 */
function updateRootline() {
    // Collect the current rootline values (rl0, rl1, rl2 and configured extras):
    $rootLineFields = array();
    $this->getRootLineFields($rootLineFields);

    $where = 'page_id='.intval($this->conf['id']);
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', $where, $rootLineFields);
}
01924 
/**
 * Adding values for root-line fields.
 * rl0, rl1 and rl2 are standard. More fields can be configured in
 * $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['addRootLineFields'] as
 * field name => rootline level.
 *
 * @param   array       $fieldArr Field array, passed by reference
 * @return  void
 */
function getRootLineFields(&$fieldArr) {
    $rootLineUids = $this->conf['rootline_uids'];

    // Standard fields: rl0, rl1, rl2
    for ($level = 0; $level < 3; $level++) {
        $fieldArr['rl'.$level] = intval($rootLineUids[$level]);
    }

    // Additionally configured fields:
    $extraFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'];
    if (!is_array($extraFields)) {
        return;
    }
    foreach ($extraFields as $fieldName => $rootLineLevel) {
        $fieldArr[$fieldName] = intval($rootLineUids[$rootLineLevel]);
    }
}
01944 
/**
 * Removes any indexed pages with userlogins which has the same contentHash
 * as the current content within the same phash_grouping.
 * NOT USED anywhere inside this class!
 *
 * @return  void
 */
function removeLoginpagesWithContentHash() {
    // Pages of the same grouping and content hash, indexed under a group
    // list other than the default one:
    $where = '
                    A.phash=B.phash
                    AND A.phash_grouping='.intval($this->hash['phash_grouping']).'
                    AND B.hash_gr_list!='.$this->md5inthash($this->defaultGrList).'
                    AND A.contentHash='.intval($this->content_md5h);
    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A,index_grlist B', $where);

    while ($loginPage = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
        $this->log_setTSlogMessage("The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash='".$loginPage['phash']."' are now removed.",1);
        $this->removeOldIndexedPages($loginPage['phash']);
    }
}
01962 
/**
 * Includes the crawler library class from the "crawler" extension.
 *
 * @return  void
 */
function includeCrawlerClass() {
    // NOTE(review): not used in this method itself; presumably kept so that
    // the included file, which runs in this function's scope, can see
    // $TYPO3_CONF_VARS - confirm before removing.
    global $TYPO3_CONF_VARS;

    require_once t3lib_extMgm::extPath('crawler').'class.tx_crawler_lib.php';
}
01973 
01974 
01975 
01976 
01977 
01978 
01979 
01980 
01981 
01982 
01983     /********************************
01984      *
01985      * SQL; Submitting words
01986      *
01987      *******************************/
01988 
/**
 * Adds new words to db: every word of the list whose baseword is not found
 * in index_words (looked up by the word hashes) is inserted.
 *
 * @param   array       $wl Word List array (where each word has information about position etc).
 * @return  void
 */
function checkWordList($wl) {
    // Collect the hashes (wid) of all words:
    $wordHashes = array();
    foreach ($wl as $wordInfo) {
        $wordHashes[] = $wordInfo['hash'];
    }
    if (!count($wordHashes)) {
        return;
    }

    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', 'wid IN ('.implode(',',$wordHashes).')');
    $knownCount = $GLOBALS['TYPO3_DB']->sql_num_rows($res);

    // All words are registered already:
    if ($knownCount==count($wl)) {
        return;
    }

    $this->log_setTSlogMessage('Inserting words: '.(count($wl)-$knownCount),1);

    // Drop the words that exist in the table already ...
    while ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
        unset($wl[$row['baseword']]);
    }

    // ... and insert the remaining ones.
    foreach ($wl as $word => $wordInfo) {
        $newWord = array(
            'wid' => $wordInfo['hash'],
            'baseword' => $word,
            'metaphone' => $wordInfo['metaphone']
        );
        // A duplicate-key error will occur here if a word is NOT unset in the unset() line. However as long as the words in $wl are NOT longer as 60 chars (the baseword varchar is 60 characters...) this is not a problem.
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', $newWord);
    }
}
02022 
02023     /**
02024      * Submits RELATIONS between words and phash
02025      *
02026      * @param   array       Word list array
02027      * @param   integer     phash value
02028      * @return  void
02029      */
02030     function submitWords($wl,$phash) {
02031         $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash='.intval($phash));
02032 
02033         foreach($wl as $val)    {
02034             $insertFields = array(
02035                 'phash' => $phash,
02036                 'wid' => $val['hash'],
02037                 'count' => $val['count'],
02038                 'first' => $val['first'],
02039                 'freq' => $this->freqMap(($val['count']/$this->wordcount)),
02040                 'flags' => ($val['cmp'] & $this->flagBitMask)
02041             );
02042 
02043             $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', $insertFields);
02044         }
02045     }
02046 
02047     /**
02048      * maps frequency from a real number in [0;1] to an integer in [0;$this->freqRange] with anything above $this->freqMax as 1
02049      * and back.
02050      *
02051      * @param   double      Frequency
02052      * @return  integer     Frequency in range.
02053      */
02054     function freqMap($freq) {
02055         $mapFactor = $this->freqMax*100*$this->freqRange;
02056         if($freq<1) {
02057             $newFreq = $freq*$mapFactor;
02058             $newFreq = $newFreq>$this->freqRange?$this->freqRange:$newFreq;
02059         } else {
02060             $newFreq = $freq/$mapFactor;
02061         }
02062         return $newFreq;
02063 
02064     }
02065 
02066 
02067 
02068 
02069 
02070 
02071 
02072 
02073 
02074 
02075 
02076     /********************************
02077      *
02078      * Hashing
02079      *
02080      *******************************/
02081 
02082     /**
02083      * Get search hash, T3 pages
02084      *
02085      * @return  void
02086      */
02087     function setT3Hashes()  {
02088 
02089             //  Set main array:
02090         $hArray = array(
02091             'id' => (integer)$this->conf['id'],
02092             'type' => (integer)$this->conf['type'],
02093             'sys_lang' => (integer)$this->conf['sys_language_uid'],
02094             'MP' => (string)$this->conf['MP'],
02095             'cHash' => $this->cHashParams
02096         );
02097 
02098             // Set grouping hash (Identifies a "page" combined of id, type, language, mountpoint and cHash parameters):
02099         $this->hash['phash_grouping'] = $this->md5inthash(serialize($hArray));
02100 
02101             // Add gr_list and set plain phash (Subdivision where special page composition based on login is taken into account as well. It is expected that such pages are normally similar regardless of the login.)
02102         $hArray['gr_list'] = (string)$this->conf['gr_list'];
02103         $this->hash['phash'] = $this->md5inthash(serialize($hArray));
02104     }
02105 
02106     /**
02107      * Get search hash, external files
02108      *
02109      * @param   string      File name / path which identifies it on the server
02110      * @param   array       Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
02111      * @return  array       Array with "phash_grouping" and "phash" inside.
02112      */
02113     function setExtHashes($file,$subinfo=array())   {
02114             //  Set main array:
02115         $hash = array();
02116         $hArray = array(
02117             'file' => $file,
02118         );
02119 
02120             // Set grouping hash:
02121         $hash['phash_grouping'] = $this->md5inthash(serialize($hArray));
02122 
02123             // Add subinfo
02124         $hArray['subinfo'] = $subinfo;
02125         $hash['phash'] = $this->md5inthash(serialize($hArray));
02126 
02127         return $hash;
02128     }
02129 
02130     /**
02131      * md5 integer hash
02132      * Using 7 instead of 8 just because that makes the integers lower than 32 bit (28 bit) and so they do not interfere with UNSIGNED integers or PHP-versions which has varying output from the hexdec function.
02133      *
02134      * @param   string      String to hash
02135      * @return  integer     Integer intepretation of the md5 hash of input string.
02136      */
02137     function md5inthash($str)   {
02138         return hexdec(substr(md5($str),0,7));
02139     }
02140 
02141     /**
02142      * Calculates the cHash value of input GET array (for constructing cHash values if needed)
02143      *
02144      * @param   array       Array of GET parameters to encode
02145      * @return  void
02146      * @deprecated since TYPO3 4.3, this function will be removed in TYPO3 4.5, use directly t3lib_div::calculateCHash()
02147      */
02148     function makeCHash($paramArray) {
02149         t3lib_div::logDeprecatedFunction();
02150 
02151         $addQueryParams = t3lib_div::implodeArrayForUrl('', $paramArray);
02152 
02153         $pA = t3lib_div::cHashParams($addQueryParams);
02154 
02155         return t3lib_div::shortMD5(serialize($pA));
02156     }
02157 
02158 
02159 
02160 
02161 
02162 
02163 
02164 
02165 
02166 
02167 
02168 
02169     /*********************************
02170      *
02171      * Internal logging functions
02172      *
02173      *********************************/
02174 
02175     /**
02176      * Push function wrapper for TT logging
02177      *
02178      * @param   string      Title to set
02179      * @param   string      Key (?)
02180      * @return  void
02181      */
02182     function log_push($msg,$key)    {
02183         if (is_object($GLOBALS['TT']))      $GLOBALS['TT']->push($msg,$key);
02184     }
02185 
02186     /**
02187      * Pull function wrapper for TT logging
02188      *
02189      * @return  void
02190      */
02191     function log_pull() {
02192         if (is_object($GLOBALS['TT']))      $GLOBALS['TT']->pull();
02193     }
02194 
02195     /**
02196      * Set log message function wrapper for TT logging
02197      *
02198      * @param   string      Message to set
02199      * @param   integer     Error number
02200      * @return  void
02201      */
02202     function log_setTSlogMessage($msg, $errorNum=0) {
02203         if (is_object($GLOBALS['TT']))      $GLOBALS['TT']->setTSlogMessage($msg,$errorNum);
02204         $this->internal_log[] = $msg;
02205     }
02206 
02207 
02208 
02209 
02210 
02211 
02212 
02213 
02214     /**************************
02215      *
02216      * tslib_fe hooks:
02217      *
02218      **************************/
02219 
02220     /**
02221      * Frontend hook: If the page is not being re-generated this is our chance to force it to be (because re-generation of the page is required in order to have the indexer called!)
02222      *
02223      * @param   array       Parameters from frontend
02224      * @param   object      TSFE object (reference under PHP5)
02225      * @return  void
02226      * @deprecated since TYPO3 4.3, this function will be removed in TYPO3 4.5, the method was extracted to hooks/class.tx_indexedsearch_tslib_fe_hook.php
02227      */
02228     function fe_headerNoCache(&$params, $ref)   {
02229         t3lib_div::logDeprecatedFunction();
02230 
02231         require_once t3lib_extMgm::extPath('indexed_search') . 'hooks/class.tx_indexedsearch_tslib_fe_hook.php';
02232         t3lib_div::makeInstance('tx_indexedsearch_tslib_fe_hook')->headerNoCache($params, $ref);
02233     }
02234 
02235     /**
02236      * Makes sure that keywords are space-separated. This is impotant for their
02237      * proper displaying as a part of fulltext index.
02238      *
02239      * @param string $keywordList
02240      * @return string
02241      * @see http://bugs.typo3.org/view.php?id=1436
02242      */
02243     protected function addSpacesToKeywordList($keywordList) {
02244         $keywords = t3lib_div::trimExplode(',', $keywordList);
02245         return ' ' . implode(', ', $keywords) . ' ';
02246     }
02247 }
02248 
02249 
// Include the XCLASS file registered for this script in the current TYPO3_MODE, if any.
if (defined('TYPO3_MODE')) {
    if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']) {
        include_once $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php'];
    }
}
02253 ?>

Generated on Sat Sep 4 04:17:20 2010 for TYPO3 API by  doxygen 1.4.7