|
TYPO3 API
SVNRelease
|
00001 <?php 00002 /*************************************************************** 00003 * Copyright notice 00004 * 00005 * (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com) 00006 * All rights reserved 00007 * 00008 * This script is part of the TYPO3 project. The TYPO3 project is 00009 * free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 2 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * The GNU General Public License can be found at 00015 * http://www.gnu.org/copyleft/gpl.html. 00016 * A copy is found in the textfile GPL.txt and important notices to the license 00017 * from the author is found in LICENSE.txt distributed with these scripts. 00018 * 00019 * 00020 * This script is distributed in the hope that it will be useful, 00021 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00022 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00023 * GNU General Public License for more details. 00024 * 00025 * This copyright notice MUST APPEAR in all copies of the script! 00026 ***************************************************************/ 00027 /** 00028 * This class is a search indexer for TYPO3 00029 * 00030 * @author Kasper Skårhøj <kasperYYYY@typo3.com> 00031 * Originally Christian Jul Jensen <christian@jul.net> helped as well. 
00032 */ 00033 /** 00034 * [CLASS/FUNCTION INDEX of SCRIPT] 00035 * 00036 * 00037 * 00038 * 141: class tx_indexedsearch_indexer 00039 * 207: function hook_indexContent(&$pObj) 00040 * 00041 * SECTION: Backend API 00042 * 308: function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE) 00043 * 347: function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0) 00044 * 365: function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) 00045 * 00046 * SECTION: Initialization 00047 * 416: function init() 00048 * 468: function initializeExternalParsers() 00049 * 00050 * SECTION: Indexing; TYPO3 pages (HTML content) 00051 * 509: function indexTypo3PageContent() 00052 * 596: function splitHTMLContent($content) 00053 * 642: function getHTMLcharset($content) 00054 * 657: function convertHTMLToUtf8($content,$charset='') 00055 * 685: function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) 00056 * 712: function typoSearchTags(&$body) 00057 * 741: function extractLinks($content) 00058 * 812: function extractHyperLinks($string) 00059 * 00060 * SECTION: Indexing; external URL 00061 * 871: function indexExternalUrl($externalUrl) 00062 * 902: function getUrlHeaders($url) 00063 * 00064 * SECTION: Indexing; external files (PDF, DOC, etc) 00065 * 948: function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='') 00066 * 1054: function readFileContent($ext,$absFile,$cPKey) 00067 * 1071: function fileContentParts($ext,$absFile) 00068 * 1089: function splitRegularContent($content) 00069 * 00070 * SECTION: Analysing content, Extracting words 00071 * 1122: function charsetEntity2utf8(&$contentArr, $charset) 00072 * 1145: function processWordsInArrays($contentArr) 00073 * 1170: function procesWordsInArrays($contentArr) 00074 * 1180: function bodyDescription($contentArr) 00075 * 1202: function indexAnalyze($content) 00076 * 
1223: function analyzeHeaderinfo(&$retArr,$content,$key,$offset) 00077 * 1242: function analyzeBody(&$retArr,$content) 00078 * 1262: function metaphone($word,$retRaw=FALSE) 00079 * 00080 * SECTION: SQL; TYPO3 Pages 00081 * 1304: function submitPage() 00082 * 1378: function submit_grlist($hash,$phash_x) 00083 * 1398: function submit_section($hash,$hash_t3) 00084 * 1416: function removeOldIndexedPages($phash) 00085 * 00086 * SECTION: SQL; External media 00087 * 1459: function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts) 00088 * 1525: function submitFile_grlist($hash) 00089 * 1539: function submitFile_section($hash) 00090 * 1553: function removeOldIndexedFiles($phash) 00091 * 00092 * SECTION: SQL Helper functions 00093 * 1589: function checkMtimeTstamp($mtime,$phash) 00094 * 1625: function checkContentHash() 00095 * 1642: function checkExternalDocContentHash($hashGr,$content_md5h) 00096 * 1656: function is_grlist_set($phash_x) 00097 * 1669: function update_grlist($phash,$phash_x) 00098 * 1684: function updateTstamp($phash,$mtime=0) 00099 * 1699: function updateSetId($phash) 00100 * 1714: function updateParsetime($phash,$parsetime) 00101 * 1727: function updateRootline() 00102 * 1742: function getRootLineFields(&$fieldArr) 00103 * 1761: function removeLoginpagesWithContentHash() 00104 * 1778: function includeCrawlerClass() 00105 * 00106 * SECTION: SQL; Submitting words 00107 * 1805: function checkWordList($wl) 00108 * 1842: function submitWords($wl,$phash) 00109 * 1866: function freqMap($freq) 00110 * 00111 * SECTION: Hashing 00112 * 1899: function setT3Hashes() 00113 * 1925: function setExtHashes($file,$subinfo=array()) 00114 * 1949: function md5inthash($str) 00115 * 1959: function makeCHash($paramArray) 00116 * 00117 * SECTION: Internal logging functions 00118 * 1991: function log_push($msg,$key) 00119 * 2000: function log_pull() 00120 * 2011: function log_setTSlogMessage($msg, $errorNum=0) 00121 * 00122 * SECTION: 
tslib_fe hooks: 00123 * 2036: function fe_headerNoCache(&$params, $ref) 00124 * 00125 * TOTAL FUNCTIONS: 59 00126 * (This index is automatically created/updated by the extension "extdeveval") 00127 * 00128 */ 00129 /** 00130 * Indexing class for TYPO3 frontend 00131 * 00132 * @author Kasper Skårhøj <kasperYYYY@typo3.com> 00133 * @package TYPO3 00134 * @subpackage tx_indexedsearch 00135 */ 00136 class tx_indexedsearch_indexer { 00137 00138 // Messages: 00139 var $reasons = array( 00140 -1 => 'mtime matched the document, so no changes detected and no content updated', 00141 -2 => 'The minimum age was not exceeded', 00142 1 => "The configured max-age was exceeded for the document and thus it's indexed.", 00143 2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.', 00144 3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.', 00145 4 => 'Page has never been indexed (is not represented in the index_phash table).' 00146 ); 00147 00148 // HTML code blocks to exclude from indexing: 00149 var $excludeSections = 'script,style'; 00150 00151 // Supported Extensions for external files: 00152 var $external_parsers = array(); // External parser objects, keys are file extension names. Values are objects with certain methods. 00153 00154 // Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!) 00155 var $defaultGrList = '0,-1'; 00156 00157 // Min/Max times: 00158 var $tstamp_maxAge = 0; // If set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded. 00159 var $tstamp_minAge = 0; // If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime. 00160 var $maxExternalFiles = 0; // Max number of external files to index. 
00161 00162 var $forceIndexing = FALSE; // If true, indexing is forced despite of hashes etc. 00163 var $crawlerActive = FALSE; // Set when crawler is detected (internal) 00164 00165 // INTERNALS: 00166 var $defaultContentArray=array( 00167 'title' => '', 00168 'description' => '', 00169 'keywords' => '', 00170 'body' => '', 00171 ); 00172 var $wordcount = 0; 00173 var $externalFileCounter = 0; 00174 00175 var $conf = array(); // Configuration set internally (see init functions for required keys and their meaning) 00176 var $indexerConfig = array(); // Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'] 00177 var $hash = array(); // Hash array, contains phash and phash_grouping 00178 var $file_phash_arr = array(); // Hash array for files 00179 var $contentParts = array(); // Content of TYPO3 page 00180 var $content_md5h = ''; 00181 var $internal_log = array(); // Internal log 00182 var $indexExternalUrl_content = ''; 00183 00184 var $cHashParams = array(); // cHashparams array 00185 00186 var $freqRange = 32000; 00187 var $freqMax = 0.1; 00188 00189 // Objects: 00190 /** 00191 * Charset class object 00192 * 00193 * @var t3lib_cs 00194 */ 00195 var $csObj; 00196 00197 /** 00198 * Metaphone object, if any 00199 * 00200 * @var user_DoubleMetaPhone 00201 */ 00202 var $metaphoneObj; 00203 00204 /** 00205 * Lexer object for word splitting 00206 * 00207 * @var tx_indexedsearch_lexer 00208 */ 00209 var $lexerObj; 00210 00211 00212 00213 /** 00214 * Parent Object (TSFE) Initialization 00215 * 00216 * @param object Parent Object (frontend TSFE object), passed by reference 00217 * @return void 00218 */ 00219 function hook_indexContent(&$pObj) { 00220 00221 // Indexer configuration from Extension Manager interface: 00222 $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']); 00223 00224 // Crawler activation: 00225 // Requirements are that the crawler is loaded, a crawler session is running 
and re-indexing requested as processing instruction: 00226 if (t3lib_extMgm::isLoaded('crawler') 00227 && $pObj->applicationData['tx_crawler']['running'] 00228 && in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions'])) { 00229 00230 // Setting simple log message: 00231 $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled'; 00232 00233 // Setting variables: 00234 $this->crawlerActive = TRUE; // Crawler active flag 00235 $this->forceIndexing = TRUE; // Force indexing despite timestamps etc. 00236 } 00237 00238 // Determine if page should be indexed, and if so, configure and initialize indexer 00239 if ($pObj->config['config']['index_enable']) { 00240 $this->log_push('Index page',''); 00241 00242 if (!$indexerConfig['disableFrontendIndexing'] || $this->crawlerActive) { 00243 if (!$pObj->page['no_search']) { 00244 if (!$pObj->no_cache) { 00245 if (!strcmp($pObj->sys_language_uid,$pObj->sys_language_content)) { 00246 00247 // Setting up internal configuration from config array: 00248 $this->conf = array(); 00249 00250 // Information about page for which the indexing takes place 00251 $this->conf['id'] = $pObj->id; // Page id 00252 $this->conf['type'] = $pObj->type; // Page type 00253 $this->conf['sys_language_uid'] = $pObj->sys_language_uid; // sys_language UID of the language of the indexing. 00254 $this->conf['MP'] = $pObj->MP; // MP variable, if any (Mount Points) 00255 $this->conf['gr_list'] = $pObj->gr_list; // Group list 00256 00257 $this->conf['cHash'] = $pObj->cHash; // cHash string for additional parameters 00258 $this->conf['cHash_array'] = $pObj->cHash_array; // Array of the additional parameters 00259 00260 $this->conf['crdate'] = $pObj->page['crdate']; // The creation date of the TYPO3 page 00261 $this->conf['page_cache_reg1'] = $pObj->page_cache_reg1; // reg1 of the caching table. Not known what practical use this has. 
00262 00263 // Root line uids 00264 $this->conf['rootline_uids'] = array(); 00265 foreach($pObj->config['rootLine'] as $rlkey => $rldat) { 00266 $this->conf['rootline_uids'][$rlkey] = $rldat['uid']; 00267 } 00268 00269 // Content of page: 00270 $this->conf['content'] = $pObj->content; // Content string (HTML of TYPO3 page) 00271 $this->conf['indexedDocTitle'] = $pObj->convOutputCharset($pObj->indexedDocTitle); // Alternative title for indexing 00272 $this->conf['metaCharset'] = $pObj->metaCharset; // Character set of content (will be converted to utf-8 during indexing) 00273 $this->conf['mtime'] = $pObj->register['SYS_LASTCHANGED']; // Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed. 00274 00275 // Configuration of behavior: 00276 $this->conf['index_externals'] = $pObj->config['config']['index_externals']; // Whether to index external documents like PDF, DOC etc. (if possible) 00277 $this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd']; // Length of description text (max 250, default 200) 00278 $this->conf['index_metatags'] = isset($pObj->config['config']['index_metatags']) ? $pObj->config['config']['index_metatags'] : true; 00279 00280 // Set to zero: 00281 $this->conf['recordUid'] = 0; 00282 $this->conf['freeIndexUid'] = 0; 00283 $this->conf['freeIndexSetId'] = 0; 00284 00285 // Init and start indexing: 00286 $this->init(); 00287 $this->indexTypo3PageContent(); 00288 } else $this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.'); 00289 } else $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.'); 00290 } else $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!'); 00291 } else $this->log_setTSlogMessage('Index page? 
No, Ordinary Frontend indexing during rendering is disabled.'); 00292 $this->log_pull(); 00293 } 00294 } 00295 00296 00297 00298 00299 00300 00301 00302 00303 /**************************** 00304 * 00305 * Backend API 00306 * 00307 ****************************/ 00308 00309 /** 00310 * Initializing the "combined ID" of the page (phash) being indexed (or for which external media is attached) 00311 * 00312 * @param integer The page uid, &id= 00313 * @param integer The page type, &type= 00314 * @param integer sys_language uid, typically &L= 00315 * @param string The MP variable (Mount Points), &MP= 00316 * @param array Rootline array of only UIDs. 00317 * @param array Array of GET variables to register with this indexing 00318 * @param boolean If set, calculates a cHash value from the $cHash_array. Probably you will not do that since such cases are indexed through the frontend and the idea of this interface is to index non-cachable pages from the backend! 00319 * @return void 00320 */ 00321 function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE) { 00322 00323 // Setting up internal configuration from config array: 00324 $this->conf = array(); 00325 00326 // Information about page for which the indexing takes place 00327 $this->conf['id'] = $id; // Page id (integer) 00328 $this->conf['type'] = $type; // Page type (integer) 00329 $this->conf['sys_language_uid'] = $sys_language_uid; // sys_language UID of the language of the indexing (integer) 00330 $this->conf['MP'] = $MP; // MP variable, if any (Mount Points) (string) 00331 $this->conf['gr_list'] = '0,-1'; // Group list (hardcoded for now...) 00332 00333 // cHash values: 00334 $this->conf['cHash'] = $createCHash ? 
t3lib_div::generateCHash(t3lib_div::implodeArrayForUrl('', $cHash_array)) : ''; // cHash string for additional parameters 00335 $this->conf['cHash_array'] = $cHash_array; // Array of the additional parameters 00336 00337 // Set to defaults 00338 $this->conf['freeIndexUid'] = 0; 00339 $this->conf['freeIndexSetId'] = 0; 00340 $this->conf['page_cache_reg1'] = ''; 00341 00342 // Root line uids 00343 $this->conf['rootline_uids'] = $uidRL; 00344 00345 // Configuration of behavior: 00346 $this->conf['index_externals'] = 1; // Whether to index external documents like PDF, DOC etc. (if possible) 00347 $this->conf['index_descrLgd'] = 200; // Length of description text (max 250, default 200) 00348 $this->conf['index_metatags'] = true; // Whether to index document keywords and description (if present) 00349 00350 // Init and start indexing: 00351 $this->init(); 00352 } 00353 00354 /** 00355 * Sets the free-index uid. Can be called right after backend_initIndexer() 00356 * 00357 * @param integer Free index UID 00358 * @param integer Set id - an integer identifying the "set" of indexing operations. 00359 * @return void 00360 */ 00361 function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0) { 00362 $this->conf['freeIndexUid'] = $freeIndexUid; 00363 $this->conf['freeIndexSetId'] = $freeIndexSetId; 00364 } 00365 00366 /** 00367 * Indexing records as the content of a TYPO3 page. 00368 * 00369 * @param string Title equivalent 00370 * @param string Keywords equivalent 00371 * @param string Description equivalent 00372 * @param string The main content to index 00373 * @param string The charset of the title, keyword, description and body-content. MUST BE VALID, otherwise nothing is indexed! 
00374 * @param integer Last modification time, in seconds 00375 * @param integer The creation date of the content, in seconds 00376 * @param integer The record UID that the content comes from (for registration with the indexed rows) 00377 * @return void 00378 */ 00379 function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) { 00380 00381 // Content of page: 00382 $this->conf['mtime'] = $mtime; // Most recent modification time (seconds) of the content 00383 $this->conf['crdate'] = $crdate; // The creation date of the TYPO3 content 00384 $this->conf['recordUid'] = $recordUid; // UID of the record, if applicable 00385 00386 // Construct fake HTML for parsing: 00387 $this->conf['content'] = ' 00388 <html> 00389 <head> 00390 <title>'.htmlspecialchars($title).'</title> 00391 <meta name="keywords" content="'.htmlspecialchars($keywords).'" /> 00392 <meta name="description" content="'.htmlspecialchars($description).'" /> 00393 </head> 00394 <body> 00395 '.htmlspecialchars($content).' 00396 </body> 00397 </html>'; // Content string (HTML of TYPO3 page) 00398 00399 // Initializing charset: 00400 $this->conf['metaCharset'] = $charset; // Character set of content (will be converted to utf-8 during indexing) 00401 $this->conf['indexedDocTitle'] = ''; // Alternative title for indexing 00402 00403 // Index content as if it was a TYPO3 page: 00404 $this->indexTypo3PageContent(); 00405 } 00406 00407 00408 00409 00410 00411 00412 00413 00414 00415 00416 00417 00418 00419 /******************************** 00420 * 00421 * Initialization 00422 * 00423 *******************************/ 00424 00425 /** 00426 * Initializes the object. $this->conf MUST be set with proper values prior to this call!!! 
00427 * 00428 * @return void 00429 */ 00430 function init() { 00431 global $TYPO3_CONF_VARS; 00432 00433 // Initializing: 00434 $this->cHashParams = $this->conf['cHash_array']; 00435 if (is_array($this->cHashParams) && count($this->cHashParams)) { 00436 if ($this->conf['cHash']) $this->cHashParams['cHash'] = $this->conf['cHash']; // Add this so that URL's come out right... 00437 unset($this->cHashParams['encryptionKey']); // encryptionKey is added inside TSFE in order to calculate the cHash value and it should NOT be a part of this array!!! If it is it will be exposed in links!!! 00438 } 00439 00440 // Setting phash / phash_grouping which identifies the indexed page based on some of these variables: 00441 $this->setT3Hashes(); 00442 00443 // Indexer configuration from Extension Manager interface: 00444 $this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']); 00445 $this->tstamp_minAge = t3lib_div::intInRange($this->indexerConfig['minAge']*3600,0); 00446 $this->tstamp_maxAge = t3lib_div::intInRange($this->indexerConfig['maxAge']*3600,0); 00447 $this->maxExternalFiles = t3lib_div::intInRange($this->indexerConfig['maxExternalFiles'],0,1000,5); 00448 $this->flagBitMask = t3lib_div::intInRange($this->indexerConfig['flagBitMask'],0,255); 00449 00450 // Initialize external document parsers: 00451 // Example configuration, see ext_localconf.php of this file! 00452 if ($this->conf['index_externals']) { 00453 $this->initializeExternalParsers(); 00454 } 00455 00456 // Initialize lexer (class that deconstructs the text into words): 00457 // Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer'; 00458 $lexerObjRef = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] ? 
00459 $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] : 00460 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer'; 00461 $this->lexerObj = t3lib_div::getUserObj($lexerObjRef); 00462 $this->lexerObj->debug = $this->indexerConfig['debugMode']; 00463 00464 // Initialize metaphone hook: 00465 // Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone'; 00466 if ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']) { 00467 $this->metaphoneObj = t3lib_div::getUserObj($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']); 00468 $this->metaphoneObj->pObj = $this; 00469 } 00470 00471 // Init charset class: 00472 $this->csObj = t3lib_div::makeInstance('t3lib_cs'); 00473 } 00474 00475 /** 00476 * Initialize external parsers 00477 * 00478 * @return void 00479 * @access private 00480 * @see init() 00481 */ 00482 function initializeExternalParsers() { 00483 global $TYPO3_CONF_VARS; 00484 00485 if (is_array($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'])) { 00486 foreach($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef) { 00487 $this->external_parsers[$extension] = t3lib_div::getUserObj($_objRef); 00488 $this->external_parsers[$extension]->pObj = $this; 00489 00490 // Init parser and if it returns false, unset its entry again: 00491 if (!$this->external_parsers[$extension]->initParser($extension)) { 00492 unset($this->external_parsers[$extension]); 00493 } 00494 } 00495 } 00496 } 00497 00498 00499 00500 00501 00502 00503 00504 00505 00506 00507 00508 00509 00510 00511 00512 /******************************** 00513 * 00514 * Indexing; TYPO3 pages (HTML content) 00515 * 00516 *******************************/ 00517 00518 /** 00519 * Start indexing of the TYPO3 page 00520 * 00521 * @return void 00522 */ 00523 function indexTypo3PageContent() { 00524 00525 $check = 
$this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']); 00526 $is_grlist = $this->is_grlist_set($this->hash['phash']); 00527 00528 if ($check > 0 || !$is_grlist || $this->forceIndexing) { 00529 00530 // Setting message: 00531 if ($this->forceIndexing) { 00532 $this->log_setTSlogMessage('Indexing needed, reason: Forced',1); 00533 } elseif ($check > 0) { 00534 $this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1); 00535 } else { 00536 $this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!',1); 00537 } 00538 00539 // Divide into title,keywords,description and body: 00540 $this->log_push('Split content',''); 00541 $this->contentParts = $this->splitHTMLContent($this->conf['content']); 00542 if ($this->conf['indexedDocTitle']) { 00543 $this->contentParts['title'] = $this->conf['indexedDocTitle']; 00544 } 00545 $this->log_pull(); 00546 00547 // Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so dont!) 00548 $this->content_md5h = $this->md5inthash(implode($this->contentParts,'')); 00549 00550 // This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash. 00551 // If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more. 00552 // This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem. 
00553 $checkCHash = $this->checkContentHash(); 00554 if (!is_array($checkCHash) || $check===1) { 00555 $Pstart=t3lib_div::milliseconds(); 00556 00557 $this->log_push('Converting charset of content ('.$this->conf['metaCharset'].') to utf-8',''); 00558 $this->charsetEntity2utf8($this->contentParts,$this->conf['metaCharset']); 00559 $this->log_pull(); 00560 00561 // Splitting words 00562 $this->log_push('Extract words from content',''); 00563 $splitInWords = $this->processWordsInArrays($this->contentParts); 00564 $this->log_pull(); 00565 00566 // Analyse the indexed words. 00567 $this->log_push('Analyse the extracted words',''); 00568 $indexArr = $this->indexAnalyze($splitInWords); 00569 $this->log_pull(); 00570 00571 // Submitting page (phash) record 00572 $this->log_push('Submitting page',''); 00573 $this->submitPage(); 00574 $this->log_pull(); 00575 00576 // Check words and submit to word list if not there 00577 $this->log_push('Check word list and submit words',''); 00578 $this->checkWordList($indexArr); 00579 $this->submitWords($indexArr,$this->hash['phash']); 00580 $this->log_pull(); 00581 00582 // Set parsetime 00583 $this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$Pstart); 00584 00585 // Checking external files if configured for. 00586 $this->log_push('Checking external files',''); 00587 if ($this->conf['index_externals']) { 00588 $this->extractLinks($this->conf['content']); 00589 } 00590 $this->log_pull(); 00591 } else { 00592 $this->updateTstamp($this->hash['phash'],$this->conf['mtime']); // Update the timestatmp 00593 $this->updateSetId($this->hash['phash']); 00594 $this->update_grlist($checkCHash['phash'],$this->hash['phash']); // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash. 00595 $this->updateRootline(); 00596 $this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. 
Timestamp, grlist and rootline updated if necessary.'); 00597 } 00598 } else { 00599 $this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]); 00600 } 00601 } 00602 00603 /** 00604 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body. 00605 * 00606 * @param string HTML content to index. To some degree expected to be made by TYPO3 (ei. splitting the header by ":") 00607 * @return array Array of content, having keys "title", "body", "keywords" and "description" set. 00608 * @see splitRegularContent() 00609 */ 00610 function splitHTMLContent($content) { 00611 00612 // divide head from body ( u-ouh :) ) 00613 $contentArr = $this->defaultContentArray; 00614 $contentArr['body'] = stristr($content,'<body'); 00615 $headPart = substr($content,0,-strlen($contentArr['body'])); 00616 00617 // get title 00618 $this->embracingTags($headPart,'TITLE',$contentArr['title'],$dummy2,$dummy); 00619 $titleParts = explode(':',$contentArr['title'],2); 00620 $contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]); 00621 00622 // get keywords and description metatags 00623 if($this->conf['index_metatags']) { 00624 for($i=0;$this->embracingTags($headPart,'meta',$dummy,$headPart,$meta[$i]);$i++) { /*nothing*/ } 00625 for($i=0;isset($meta[$i]);$i++) { 00626 $meta[$i] = t3lib_div::get_tag_attributes($meta[$i]); 00627 if (stristr($meta[$i]['name'], 'keywords')) { 00628 $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($meta[$i]['content']); 00629 } 00630 if (stristr($meta[$i]['name'], 'description')) { 00631 $contentArr['description'] .= ',' . $meta[$i]['content']; 00632 } 00633 } 00634 } 00635 00636 // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags: 00637 $this->typoSearchTags($contentArr['body']); 00638 00639 // Get rid of unwanted sections (ie. 
scripting and style stuff) in body 00640 $tagList = explode(',',$this->excludeSections); 00641 foreach($tagList as $tag) { 00642 while($this->embracingTags($contentArr['body'],$tag,$dummy,$contentArr['body'],$dummy2)); 00643 } 00644 00645 // remove tags, but first make sure we don't concatenate words by doing it 00646 $contentArr['body'] = str_replace('<',' <',$contentArr['body']); 00647 $contentArr['body'] = trim(strip_tags($contentArr['body'])); 00648 00649 $contentArr['keywords'] = trim($contentArr['keywords']); 00650 $contentArr['description'] = trim($contentArr['description']); 00651 00652 // Return array 00653 return $contentArr; 00654 } 00655 00656 /** 00657 * Extract the charset value from HTML meta tag. 00658 * 00659 * @param string HTML content 00660 * @return string The charset value if found. 00661 */ 00662 function getHTMLcharset($content) { 00663 if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i',$content,$reg)) { 00664 if (preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i',$reg[0],$reg2)) { 00665 return $reg2[1]; 00666 } 00667 } 00668 } 00669 00670 /** 00671 * Converts a HTML document to utf-8 00672 * 00673 * @param string HTML content, any charset 00674 * @param string Optional charset (otherwise extracted from HTML) 00675 * @return string Converted HTML 00676 */ 00677 function convertHTMLToUtf8($content,$charset='') { 00678 00679 // Find charset: 00680 $charset = $charset ? 
$charset : $this->getHTMLcharset($content); 00681 $charset = $this->csObj->parse_charset($charset); 00682 00683 // Convert charset: 00684 if ($charset && $charset!=='utf-8') { 00685 $content = $this->csObj->utf8_encode($content, $charset); 00686 } 00687 // Convert entities, assuming document is now UTF-8: 00688 $content = $this->csObj->entities_to_utf8($content, TRUE); 00689 00690 return $content; 00691 } 00692 00693 /** 00694 * Finds first occurence of embracing tags and returns the embraced content and the original string with 00695 * the tag removed in the two passed variables. Returns false if no match found. ie. useful for finding 00696 * <title> of document or removing <script>-sections 00697 * 00698 * @param string String to search in 00699 * @param string Tag name, eg. "script" 00700 * @param string Passed by reference: Content inside found tag 00701 * @param string Passed by reference: Content after found tag 00702 * @param string Passed by reference: Attributes of the found tag. 00703 * @return boolean Returns false if tag was not found, otherwise true. 00704 */ 00705 function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) { 00706 $endTag = '</'.$tagName.'>'; 00707 $startTag = '<'.$tagName; 00708 00709 $isTagInText = stristr($string,$startTag); // stristr used because we want a case-insensitive search for the tag. 
00710 if(!$isTagInText) return false; // if the tag was not found, return false 00711 00712 list($paramList,$isTagInText) = explode('>',substr($isTagInText,strlen($startTag)),2); 00713 $afterTagInText = stristr($isTagInText,$endTag); 00714 if ($afterTagInText) { 00715 $stringBefore = substr($string, 0, strpos(strtolower($string), strtolower($startTag))); 00716 $tagContent = substr($isTagInText,0,strlen($isTagInText)-strlen($afterTagInText)); 00717 $stringAfter = $stringBefore.substr($afterTagInText,strlen($endTag)); 00718 } else { // If there was no ending tag, the tagContent is blank and anything after the tag it self is returned. 00719 $tagContent=''; 00720 $stringAfter = $isTagInText; 00721 } 00722 00723 return true; 00724 } 00725 00726 /** 00727 * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags. 00728 * 00729 * @param string HTML Content, passed by reference 00730 * @return boolean Returns true if a TYPOSEARCH_ tag was found, otherwise false. 00731 */ 00732 function typoSearchTags(&$body) { 00733 $expBody = preg_split('/<\!\-\-[\s]?TYPO3SEARCH_/',$body); 00734 00735 if(count($expBody)>1) { 00736 $body = ''; 00737 00738 foreach($expBody as $val) { 00739 $part = explode('-->',$val,2); 00740 if(trim($part[0])=='begin') { 00741 $body.= $part[1]; 00742 $prev = ''; 00743 } elseif(trim($part[0])=='end') { 00744 $body.= $prev; 00745 } else { 00746 $prev = $val; 00747 } 00748 } 00749 return true; 00750 } else { 00751 return false; 00752 } 00753 } 00754 00755 /** 00756 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed. 
*
 * @param string HTML content
 * @return void
 */
function extractLinks($content) {

	// Get links:
	$list = $this->extractHyperLinks($content);

	// The crawler is optional. $crawler stays NULL when it is not used, so the is_object()
	// checks below never read an undefined variable.
	$crawler = NULL;
	if ($this->indexerConfig['useCrawlerForExternalFiles'] && t3lib_extMgm::isLoaded('crawler')) {
		$this->includeCrawlerClass();
		$crawler = t3lib_div::makeInstance('tx_crawler_lib');
	}

	// Traverse links:
	foreach ($list as $linkInfo) {

		// Decode entities. localPath means: This file is sent by a download script. While the
		// indexed URL has to point to $linkInfo['href'], the absolute path to the file is specified here!
		if ($linkInfo['localPath']) {
			$linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['localPath']);
		} else {
			$linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['href']);
		}

		// Parse URL (parse_url() returns FALSE for seriously malformed URLs):
		$qParts = parse_url($linkSource);
		if (!is_array($qParts)) {
			$qParts = array();
		}

		// Check for jumpurl (TYPO3 specific thing...)
		if (!empty($qParts['query']) && strstr($qParts['query'],'jumpurl=')) {
			parse_str($qParts['query'],$getP);
			$linkSource = (isset($getP['jumpurl']) && is_string($getP['jumpurl'])) ? $getP['jumpurl'] : '';
			// Parse again due to new linkSource!
			$qParts = parse_url($linkSource);
			if (!is_array($qParts)) {
				$qParts = array();
			}
		}

		if (!$linkInfo['localPath'] && !empty($qParts['scheme'])) {
			if ($this->indexerConfig['indexExternalURLs']) {
				// Index external URL (http or otherwise)
				$this->indexExternalUrl($linkSource);
			}
			continue;
		}

		// Links with a query string are not treated as files.
		if (!empty($qParts['query'])) {
			continue;
		}

		$linkSource = urldecode($linkSource);
		if (t3lib_div::isAllowedAbsPath($linkSource)) {
			$localFile = $linkSource;
		} else {
			$localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
		}
		if (!$localFile || !@is_file($localFile)) {
			continue;
		}

		// Index local file, either through the crawler queue or directly:
		if (is_object($crawler)) {
			$params = array('document' => $linkSource);
			if ($linkInfo['localPath']) {
				// Download-script link: remember the public URL of the file.
				$params['alturl'] = $linkInfo['href'];
			}
			$params['conf'] = $this->conf;
			unset($params['conf']['content']);

			$crawler->addQueueEntry_callBack(0,$params,'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files',$this->conf['id']);
			$this->log_setTSlogMessage('media "'.$params['document'].'" added to "crawler" queue.',1);
		} elseif ($linkInfo['localPath']) {
			// The file is read from the local path but registered under the href.
			$fI = pathinfo($linkSource);
			$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
			$this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
		} else {
			$this->indexRegularDocument($linkSource);
		}
	}
}

/**
 * Extracts all links to external documents from the HTML content string
 *
 * @param string $html
 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
 * @see
extractLinks()
 */
function extractHyperLinks($html) {
	$htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
	$htmlParts = $htmlParser->splitTags('a', $html);
	$hyperLinksData = array();

	foreach ($htmlParts as $index => $tagData) {
		// Only the odd-indexed parts of the split result are inspected as tags.
		if (($index % 2) === 0) {
			continue;
		}

		$tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
		$firstTagName = $htmlParser->getFirstTagName($tagData);
		if (strtolower($firstTagName) != 'a') {
			continue;
		}

		$href = isset($tagAttributes[0]['href']) ? $tagAttributes[0]['href'] : '';
		// Skip empty hrefs and pure fragment links ("#anchor").
		// Square brackets replace the {0} offset syntax, which was removed in PHP 8.
		if ($href && $href[0] != '#') {
			$hyperLinksData[] = array(
				'tag' => $tagData,
				'href' => $href,
				'localPath' => $this->createLocalPath($href)
			);
		}
	}

	return $hyperLinksData;
}

/**
 * Extracts the "base href" from content string.
 *
 * @param string Content to analyze
 * @return string The base href or an empty string if not found
 */
public function extractBaseHref($html) {
	$href = '';
	$htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
	$htmlParts = $htmlParser->splitTags('base', $html);

	foreach ($htmlParts as $index => $tagData) {
		// Only the odd-indexed parts of the split result are inspected as tags.
		if (($index % 2) === 0) {
			continue;
		}

		$tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
		$firstTagName = $htmlParser->getFirstTagName($tagData);
		if (strtolower($firstTagName) == 'base') {
			// A <base> tag without href yields an empty string instead of an undefined index.
			$href = isset($tagAttributes[0]['href']) ? $tagAttributes[0]['href'] : '';
			if ($href) {
				break;
			}
		}
	}

	return $href;
}

/******************************************
 *
 * Indexing; external URL
 *
 ******************************************/

/**
 * Index External URLs HTML content
 *
 * Only URLs whose "Content-Type" response header contains "text/html" are indexed. The fetched
 * content is kept in $this->indexExternalUrl_content and written to a temporary file which is
 * passed on to indexRegularDocument().
 *
 * @param string URL, eg. "http://typo3.org/"
 * @return void
 * @see indexRegularDocument()
 */
function indexExternalUrl($externalUrl) {

	// Get headers:
	$urlHeaders = $this->getUrlHeaders($externalUrl);
	if (!is_array($urlHeaders)) {
		// No answer from the server, nothing to index.
		return;
	}

	// HTTP header names are case-insensitive, so "content-type" must match as well.
	$contentType = '';
	foreach ($urlHeaders as $headerName => $headerValue) {
		if (strtolower(trim($headerName)) === 'content-type') {
			$contentType = (string)$headerValue;
			break;
		}
	}
	if (!stristr($contentType,'text/html')) {
		return;
	}

	$content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
	if (!strlen($content)) {
		return;
	}

	// Create temporary file:
	$tmpFile = t3lib_div::tempnam('EXTERNAL_URL');
	if ($tmpFile) {
		t3lib_div::writeFile($tmpFile, $content);

		// Index that file. Using "TRUE" for second parameter to force indexing of external URLs
		// (mtime doesn't make sense, does it?)
		$this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');
		unlink($tmpFile);
	}
}

/**
 * Getting HTTP request headers of URL
 *
 * @param string The URL
 * @return mixed If no answer, returns false.
Otherwise an array where HTTP headers are keys
 */
function getUrlHeaders($url) {
	// Try to get the headers only
	$content = t3lib_div::getURL($url,2);

	if (!strlen($content)) {
		// Explicit FALSE, as documented (previously the function fell through without a return value).
		return FALSE;
	}

	// Compile headers:
	$retVal = array();
	foreach (t3lib_div::trimExplode(LF,$content,1) as $line) {
		if (!strlen(trim($line))) {
			// Stop at the first empty line (= end of header)
			break;
		}

		// Lines without a colon (e.g. the HTTP status line) become a key with a NULL value;
		// array_pad() keeps list() from reading an undefined offset for them.
		list($headKey, $headValue) = array_pad(explode(':', $line, 2), 2, NULL);
		$retVal[$headKey] = $headValue;
	}

	return $retVal;
}

/**
 * Checks if the file is local
 *
 * Tries each createLocalPath* strategy in a fixed order and returns the first non-empty result.
 *
 * @param $sourcePath
 * @return string Absolute path to file if file is local, else empty string
 */
protected function createLocalPath($sourcePath) {
	static $pathFunctions = array(
		'createLocalPathFromT3vars',
		'createLocalPathUsingAbsRefPrefix',
		'createLocalPathUsingDomainURL',
		'createLocalPathFromAbsoluteURL',
		'createLocalPathFromRelativeURL'
	);

	$localPath = '';
	foreach ($pathFunctions as $functionName) {
		$localPath = $this->$functionName($sourcePath);
		if ($localPath != '') {
			break;
		}
	}

	return $localPath;
}

/**
 * Attempts to create a local file path from T3VARs. This is useful for
 * various download extensions that hide actual file name but still want the
 * file to be indexed.
*
 * @param string $sourcePath
 * @return string
 */
protected function createLocalPathFromT3vars($sourcePath) {
	$localPath = '';

	// The registry is optional; isset() avoids reading undefined keys of $GLOBALS['T3_VAR'].
	if (isset($GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'])
		&& is_array($GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'])) {

		$indexLocalFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
		$md5 = t3lib_div::shortMD5($sourcePath);
		// Note: not using self::isAllowedLocalFile here because this method
		// is allowed to index files outside of the web site (for example,
		// protected downloads)
		if (isset($indexLocalFiles[$md5]) && is_file($indexLocalFiles[$md5])) {
			$localPath = $indexLocalFiles[$md5];
		}
	}

	return $localPath;
}

/**
 * Attempts to create a local file path by matching a current request URL.
 *
 * @param string $sourcePath
 * @return string Absolute path if $sourcePath starts with the site URL and points to a file inside the web site, else empty string
 */
protected function createLocalPathUsingDomainURL($sourcePath) {
	$localPath = '';
	$baseURL = t3lib_div::getIndpEnv('TYPO3_SITE_URL');
	$baseURLLength = strlen($baseURL);

	// An empty site URL would match every path, so it is skipped
	// (same guard as in createLocalPathUsingAbsRefPrefix()).
	if ($baseURLLength > 0 && substr($sourcePath, 0, $baseURLLength) == $baseURL) {
		$sourcePath = substr($sourcePath, $baseURLLength);
		$localPath = PATH_site . $sourcePath;
		if (!self::isAllowedLocalFile($localPath)) {
			$localPath = '';
		}
	}

	return $localPath;
}

/**
 * Attempts to create a local file path by matching absRefPrefix. This
 * requires TSFE. If TSFE is missing, this function does nothing.
*
 * @param string $sourcePath
 * @return string
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath) {
	$localPath = '';

	if (isset($GLOBALS['TSFE']) && $GLOBALS['TSFE'] instanceof tslib_fe) {
		$absRefPrefix = isset($GLOBALS['TSFE']->config['config']['absRefPrefix'])
			? $GLOBALS['TSFE']->config['config']['absRefPrefix']
			: '';
		$absRefPrefixLength = strlen($absRefPrefix);

		if ($absRefPrefixLength > 0 && substr($sourcePath, 0, $absRefPrefixLength) == $absRefPrefix) {
			$sourcePath = substr($sourcePath, $absRefPrefixLength);
			$localPath = PATH_site . $sourcePath;
			if (!self::isAllowedLocalFile($localPath)) {
				$localPath = '';
			}
		}
	}

	return $localPath;
}

/**
 * Attempts to create a local file path from the absolute URL without
 * schema.
 *
 * @param string $sourcePath
 * @return string
 */
protected function createLocalPathFromAbsoluteURL($sourcePath) {
	$localPath = '';

	// substr() replaces the {0} string offset (removed in PHP 8) and is also safe for an empty string.
	if (substr($sourcePath, 0, 1) == '/') {
		$sourcePath = substr($sourcePath, 1);
		$localPath = PATH_site . $sourcePath;
		if (!self::isAllowedLocalFile($localPath)) {
			$localPath = '';
		}
	}

	return $localPath;
}

/**
 * Attempts to create a local file path from the relative URL.
 *
 * @param string $sourcePath
 * @return string
 */
protected function createLocalPathFromRelativeURL($sourcePath) {
	$localPath = '';

	if (self::isRelativeURL($sourcePath)) {
		$localPath = PATH_site . $sourcePath;
		if (!self::isAllowedLocalFile($localPath)) {
			$localPath = '';
		}
	}

	return $localPath;
}

/**
 * Checks if URL is relative.
*
 * @param string $url
 * @return boolean
 */
static protected function isRelativeURL($url) {
	$urlParts = @parse_url($url);
	// parse_url() returns FALSE for seriously malformed URLs. Such URLs have neither scheme nor
	// path here and are therefore still reported as relative, as before.
	if (!is_array($urlParts)) {
		$urlParts = array();
	}
	$scheme = isset($urlParts['scheme']) ? $urlParts['scheme'] : '';
	$path = isset($urlParts['path']) ? $urlParts['path'] : '';

	// substr() replaces the {0} string offset, which was removed in PHP 8.
	return ($scheme == '' && substr($path, 0, 1) != '/');
}

/**
 * Checks if the path points to the file inside the web site
 *
 * @param string $filePath
 * @return boolean
 */
static protected function isAllowedLocalFile($filePath) {
	// Resolve "../" parts first so that the prefix check cannot be bypassed with back paths.
	$filePath = t3lib_div::resolveBackPath($filePath);
	$insideWebPath = (substr($filePath, 0, strlen(PATH_site)) == PATH_site);
	$isFile = is_file($filePath);

	return $insideWebPath && $isFile;
}

/******************************************
 *
 * Indexing; external files (PDF, DOC, etc)
 *
 ******************************************/

/**
 * Indexing a regular document given as $file (relative to PATH_site, local file)
 *
 * @param string Relative Filename, relative to PATH_site. It can also be an absolute path as long as it is inside the lockRootPath (validated with t3lib_div::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
 * @param boolean If set, indexing is forced (despite content hashes, mtime etc).
 * @param string Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
 * @param string File extension for temporary file.
 * @return void
 */
function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='') {

	// Init
	$fI = pathinfo($file);
	if ($altExtension) {
		$ext = $altExtension;
	} else {
		// Files without an extension get an empty extension instead of an undefined index.
		$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
	}

	// Create abs-path:
	if (!$contentTmpFile) {
		if (!t3lib_div::isAbsPath($file)) {
			// Relative, prepend PATH_site:
			$absFile = t3lib_div::getFileAbsFileName(PATH_site.$file);
		} else {
			// Absolute, pass-through:
			$absFile = $file;
		}
		$absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
	} else {
		$absFile = $contentTmpFile;
	}

	// Guard clauses for the two "indexing not possible" cases:
	if (!$absFile || !@is_file($absFile)) {
		$this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
		return;
	}
	if (empty($this->external_parsers[$ext])) {
		$this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
		return;
	}

	// Indexing the document:
	$mtime = filemtime($absFile);
	$cParts = $this->fileContentParts($ext,$absFile);

	foreach ($cParts as $cPKey) {
		$this->internal_log = array();
		$this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
		$Pstart = t3lib_div::milliseconds();
		// Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
		$subinfo = array('key' => $cPKey);
		$phash_arr = $this->file_phash_arr = $this->setExtHashes($file,$subinfo);
		$check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);

		if ($check > 0 || $force) {
			if ($check > 0) {
				$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
			} else {
				$this->log_setTSlogMessage('Indexing forced by flag',1);
			}

			// Check external file counter:
			if ($this->externalFileCounter < $this->maxExternalFiles || $force) {

				// Divide into title,keywords,description and body:
				$this->log_push('Split content','');
				$contentParts = $this->readFileContent($ext,$absFile,$cPKey);
				$this->log_pull();

				if (is_array($contentParts)) {
					// Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
					// implode() is called with the separator first; the reversed order was removed in PHP 8.
					$content_md5h = $this->md5inthash(implode('', $contentParts));

					if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {

						// Increment counter:
						$this->externalFileCounter++;

						// Splitting words
						$this->log_push('Extract words from content','');
						$splitInWords = $this->processWordsInArrays($contentParts);
						$this->log_pull();

						// Analyse the indexed words.
						$this->log_push('Analyse the extracted words','');
						$indexArr = $this->indexAnalyze($splitInWords);
						$this->log_pull();

						// Submitting page (phash) record
						$this->log_push('Submitting page','');
						$size = filesize($absFile);
						// Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
						$ctime = filemtime($absFile);
						$this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
						$this->log_pull();

						// Check words and submit to word list if not there
						$this->log_push('Check word list and submit words','');
						$this->checkWordList($indexArr);
						$this->submitWords($indexArr,$phash_arr['phash']);
						$this->log_pull();

						// Set parsetime
						$this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
					} else {
						// Update the timestamp
						$this->updateTstamp($phash_arr['phash'],$mtime);
						$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
					}
				} else {
					$this->log_setTSlogMessage('Could not index file! Unsupported extension.');
				}
			} else {
				$this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
			}
		} else {
			$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		}

		// Checking and setting sections:
		// (Disabled in the original code: $this->submitFile_grlist($phash_arr['phash']); - setting a gr_list record if there is none already, for default fe_group)
		// Setting a section-record for the file. This is done also if the file is not indexed.
		// Notice that section records are deleted when the page is indexed.
		$this->submitFile_section($phash_arr['phash']);
		$this->log_pull();
	}
}

/**
 * Reads the content of an external file being indexed.
 * The content from the external parser MUST be returned in utf-8!
 *
 * @param string File extension, eg. "pdf", "doc" etc.
 * @param string Absolute filename of file (must exist and be validated OK before calling function)
 * @param string Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
 * @return array Standard content array (title, description, keywords, body keys); NULL if no parser object is registered for the extension
 */
function readFileContent($ext,$absFile,$cPKey) {
	// Stays NULL when no parser object exists; indexRegularDocument() tests the result with is_array().
	$contentArr = NULL;

	// Consult relevant external document parser:
	if (is_object($this->external_parsers[$ext])) {
		$contentArr = $this->external_parsers[$ext]->readFileContent($ext,$absFile,$cPKey);
	}

	return $contentArr;
}

/**
 * Creates an array with pointers to divisions of document.
*
 * @param string File extension
 * @param string Absolute filename (must exist and be validated OK before calling function)
 * @return array Array of pointers to sections that the document should be divided into
 */
function fileContentParts($ext,$absFile) {
	// Default: one single part with the pointer "0".
	$cParts = array(0);

	// Consult relevant external document parser:
	if (is_object($this->external_parsers[$ext])) {
		$cParts = $this->external_parsers[$ext]->fileContentParts($ext,$absFile);
	}

	return $cParts;
}

/**
 * Splits non-HTML content (from external files for instance)
 *
 * @param string Input content (non-HTML) to index.
 * @return array Array of content, having the key "body" set (plus "title", "description" and "keywords", but empty)
 * @see splitHTMLContent()
 */
function splitRegularContent($content) {
	$contentArr = $this->defaultContentArray;
	$contentArr['body'] = $content;

	return $contentArr;
}

/**********************************
 *
 * Analysing content, Extracting words
 *
 **********************************/

/**
 * Convert character set and HTML entities in the value of input content array keys
 *
 * @param array Standard content array, passed by reference and modified in place
 * @param string Charset of the input content (converted to utf-8)
 * @return void
 */
function charsetEntity2utf8(&$contentArr, $charset) {
	foreach (array_keys($contentArr) as $key) {
		if (!strlen($contentArr[$key])) {
			continue;
		}

		// Convert charset if necessary
		if ($charset!=='utf-8') {
			$contentArr[$key] = $this->csObj->utf8_encode($contentArr[$key], $charset);
		}

		// decode all numeric / html-entities in the string to real characters:
		$contentArr[$key] = $this->csObj->entities_to_utf8($contentArr[$key],TRUE);
	}
}

/**
 * Processing words in the array from split*Content -functions
 *
 * @param array Array of content to index, see splitHTMLContent() and splitRegularContent()
 * @return array Content input array modified so each key is now an array of words (unique words for title, keywords and description)
 */
function processWordsInArrays($contentArr) {

	// split all parts to words
	foreach (array_keys($contentArr) as $key) {
		$contentArr[$key] = $this->lexerObj->split2Words($contentArr[$key]);
	}

	// For title, keywords, and description we don't want duplicates:
	$contentArr['title'] = array_unique($contentArr['title']);
	$contentArr['keywords'] = array_unique($contentArr['keywords']);
	$contentArr['description'] = array_unique($contentArr['description']);

	// Return modified array:
	return $contentArr;
}

/**
 * Extracts the sample description text from the content array.
 *
 * @param array Content array
 * @return string Description string (empty if the configured description length is 0)
 */
function bodyDescription($contentArr) {
	// Initialised so that a configured length of 0 returns an empty string instead of an undefined variable.
	$bodyDescription = '';

	// Setting description
	$maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
	if ($maxL) {
		// Replace tabs and line breaks with spaces.
		// NOTE(review): the first search string is a plain space here, which is a no-op; verify against the original file.
		$bodyDescription = str_replace(array(' ',TAB,CR,LF),' ',$contentArr['body']);

		// Shorten the string:
		$bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
	}

	return $bodyDescription;
}

/**
 * Analyzes content to use for indexing,
 *
 * @param array Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
 * @return array Index array, keyed by word; the entries are filled in by analyzeHeaderinfo() and analyzeBody()
 */
function indexAnalyze($content) {
	$indexArr = array();

	// The last argument is the bit offset used for the "cmp" value of each word.
	$this->analyzeHeaderinfo($indexArr,$content,'title',7);
	$this->analyzeHeaderinfo($indexArr,$content,'keywords',6);
	$this->analyzeHeaderinfo($indexArr,$content,'description',5);
	$this->analyzeBody($indexArr,$content);

	return $indexArr;
}

/**
 * Calculates relevant information for headercontent
 *
 * @param array Index array, passed by reference
 * @param array Standard content array
 * @param string Key from standard content array
 * @param integer Bit-wise priority to type
 * @return void
 */
function analyzeHeaderinfo(&$retArr,$content,$key,$offset) {
	foreach ($content[$key] as $val) {
		// Max 60 - because the baseword varchar IS 60. This MUST be the same.
		$val = substr($val,0,60);

		// A word seen for the first time starts with cmp = 0 and count = 0.
		$cmp = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
		$count = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;

		$retArr[$val]['cmp'] = $cmp|pow(2,$offset);
		$retArr[$val]['count'] = $count+1;
		$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
		$retArr[$val]['metaphone'] = $this->metaphone($val);
		$this->wordcount++;
	}
}

/**
 * Calculates relevant information for bodycontent
 *
 * @param array Index array, passed by reference
 * @param array Standard content array
 * @return void
 */
function analyzeBody(&$retArr,$content) {
	foreach ($content['body'] as $key => $val) {
		// Max 60 - because the baseword varchar IS 60. This MUST be the same.
		$val = substr($val,0,60);

		if (!isset($retArr[$val])) {
			// Position of the first occurrence of the word in the body.
			$retArr[$val]['first'] = $key;
			$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
			$retArr[$val]['metaphone'] = $this->metaphone($val);
		}

		$count = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
		$retArr[$val]['count'] = $count+1;
		$this->wordcount++;
	}
}

/**
 * Creating metaphone based hash from input word
 *
 * @param string Word to convert
 * @param boolean If set, returns the raw metaphone value (not hashed)
 * @return mixed Metaphone hash integer (or raw value, string)
 */
function metaphone($word,$retRaw=FALSE) {

	// A configured metaphone object takes precedence over PHP's native metaphone().
	if (is_object($this->metaphoneObj)) {
		$tmp = $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid']);
	} else {
		$tmp = metaphone($word);
	}

	// Return raw value?
	if ($retRaw) {
		return $tmp;
	}

	// Otherwise create hash and return integer (0 for an empty metaphone value)
	if ($tmp=='') {
		return 0;
	}
	return hexdec(substr(md5($tmp),0,7));
}

/********************************
 *
 * SQL; TYPO3 Pages
 *
 *******************************/

/**
 * Updates db with information about the page (TYPO3 page, not external media)
 *
 * Writes to index_phash, index_section, index_grlist and index_fulltext, plus index_debug
 * when debug mode is enabled.
 *
 * @return void
 */
function submitPage() {

	// Remove any current data for this phash:
	$this->removeOldIndexedPages($this->hash['phash']);

	// setting new phash_row
	$fields = array(
		'phash' => $this->hash['phash'],
		'phash_grouping' => $this->hash['phash_grouping'],
		'cHashParams' => serialize($this->cHashParams),
		'contentHash' => $this->content_md5h,
		'data_page_id' => $this->conf['id'],
		'data_page_reg1' => $this->conf['page_cache_reg1'],
		'data_page_type' => $this->conf['type'],
		'data_page_mp' => $this->conf['MP'],
		'gr_list' => $this->conf['gr_list'],
		'item_type' => 0,	// TYPO3 page
		'item_title' => $this->contentParts['title'],
		'item_description' => $this->bodyDescription($this->contentParts),
		'item_mtime' => $this->conf['mtime'],
		'item_size' => strlen($this->conf['content']),
		'tstamp' => $GLOBALS['EXEC_TIME'],
		'crdate' => $GLOBALS['EXEC_TIME'],
		'item_crdate' => $this->conf['crdate'],	// Creation date of page
		'sys_language_uid' => $this->conf['sys_language_uid'],	// Sys language uid of the page. Should reflect which language it DOES actually display!
		'externalUrl' => 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);

	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);

	// PROCESSING index_section
	$this->submit_section($this->hash['phash'],$this->hash['phash']);

	// PROCESSING index_grlist
	$this->submit_grlist($this->hash['phash'],$this->hash['phash']);

	// PROCESSING index_fulltext
	$fields = array(
		'phash' => $this->hash['phash'],
		'fulltextdata' => implode(' ', $this->contentParts)
	);
	if ($this->indexerConfig['fullTextDataLength']>0) {
		// NOTE(review): substr() cuts by bytes, so a multi-byte character may be split at the end - confirm this is acceptable.
		$fields['fulltextdata'] = substr($fields['fulltextdata'],0,$this->indexerConfig['fullTextDataLength']);
	}
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);

	// PROCESSING index_debug
	if ($this->indexerConfig['debugMode']) {
		$fields = array(
			'phash' => $this->hash['phash'],
			'debuginfo' => serialize(array(
				'cHashParams' => $this->cHashParams,
				'external_parsers initialized' => array_keys($this->external_parsers),
				'conf' => array_merge($this->conf,array('content'=>substr($this->conf['content'],0,1000))),
				'contentParts' => array_merge($this->contentParts,array('body' => substr($this->contentParts['body'],0,1000))),
				'logs' => $this->internal_log,
				'lexer' => $this->lexerObj->debugString,
			))
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields);
	}
}

/**
 * Stores gr_list in the database.
*
 * @param integer Search result record phash
 * @param integer Actual phash of current content
 * @return void
 * @see update_grlist()
 */
function submit_grlist($hash,$phash_x) {

	// Setting the gr_list record
	$fields = array(
		'phash' => $hash,
		'phash_x' => $phash_x,
		'hash_gr_list' => $this->md5inthash($this->conf['gr_list']),
		'gr_list' => $this->conf['gr_list']
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $fields);
}

/**
 * Stores section
 * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
 *
 * @param integer phash of the search result record (for external files: the phash of the file, see submitFile_section())
 * @param integer phash of the TYPO3 page the record belongs to
 * @return void
 */
function submit_section($hash,$hash_t3) {
	$fields = array(
		'phash' => $hash,
		'phash_t3' => $hash_t3,
		'page_id' => intval($this->conf['id'])
	);

	// Adds the rootline information to $fields.
	$this->getRootLineFields($fields);

	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $fields);
}

/**
 * Removes records for the indexed page, $phash
 *
 * @param integer phash value to flush
 * @return void
 */
function removeOldIndexedPages($phash) {
	$phash = intval($phash);

	// Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
	$tableArr = array('index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');
	foreach ($tableArr as $table) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash='.$phash);
	}

	// Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!).
	// The re-insert of these records are done in indexRegularDocument($file).
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3='.$phash);
}

/********************************
 *
 * SQL; External media
 *
 *******************************/

/**
 * Updates db with information about the file
 *
 * @param array Array with phash and phash_grouping keys for file
 * @param string File name
 * @param array Array of "cHashParams" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
 * @param string File extension determining the type of media.
 * @param integer Modification time of file.
 * @param integer Creation time of file.
 * @param integer Size of file in bytes
 * @param integer Content HASH value.
 * @param array Standard content array (using only title and body for a file)
 * @return void
 */
function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts) {

	// Find item Type (falls back to the extension when the parser maps nothing for it):
	$storeItemType = isset($this->external_parsers[$ext]->ext2itemtype_map[$ext])
		? $this->external_parsers[$ext]->ext2itemtype_map[$ext]
		: '';
	if (!$storeItemType) {
		$storeItemType = $ext;
	}

	// Remove any current data for this phash:
	$this->removeOldIndexedFiles($hash['phash']);

	// Split filename. A $file with a scheme is treated as an external URL.
	// parse_url() may return FALSE, and local files have no "scheme" key.
	$fileParts = parse_url($file);
	$isExternalUrl = (is_array($fileParts) && !empty($fileParts['scheme']));

	// Setting new
	$fields = array(
		'phash' => $hash['phash'],
		'phash_grouping' => $hash['phash_grouping'],
		'cHashParams' => serialize($subinfo),
		'contentHash' => $content_md5h,
		'data_filename' => $file,
		'item_type' => $storeItemType,
		'item_title' => trim($contentParts['title']) ? $contentParts['title'] : basename($file),
		'item_description' => $this->bodyDescription($contentParts),
		'item_mtime' => $mtime,
		'item_size' => $size,
		'item_crdate' => $ctime,
		'tstamp' => $GLOBALS['EXEC_TIME'],
		'crdate' => $GLOBALS['EXEC_TIME'],
		'gr_list' => $this->conf['gr_list'],
		'externalUrl' => $isExternalUrl ? 1 : 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);

	// PROCESSING index_fulltext
	$fields = array(
		'phash' => $hash['phash'],
		'fulltextdata' => implode(' ', $contentParts)
	);
	if ($this->indexerConfig['fullTextDataLength']>0) {
		$fields['fulltextdata'] = substr($fields['fulltextdata'],0,$this->indexerConfig['fullTextDataLength']);
	}
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);

	// PROCESSING index_debug
	if ($this->indexerConfig['debugMode']) {
		$fields = array(
			'phash' => $hash['phash'],
			'debuginfo' => serialize(array(
				'cHashParams' => $subinfo,
				'contentParts' => array_merge($contentParts,array('body' => substr($contentParts['body'],0,1000))),
				'logs' => $this->internal_log,
				'lexer' => $this->lexerObj->debugString,
			))
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields);
	}
}

/**
 * Stores file gr_list for a file IF it does not exist already
 *
 * @param integer phash value of file
 * @return void
 */
function submitFile_grlist($hash) {
	// Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
	$count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows(
		'phash',
		'index_grlist',
		'phash=' . intval($hash) .
			' AND (hash_gr_list=' . $this->md5inthash($this->defaultGrList) .
			' OR hash_gr_list=' . $this->md5inthash($this->conf['gr_list']) . ')'
	);
	if (!$count) {
		$this->submit_grlist($hash,$hash);
	}
}

/**
 * Stores file section for a file IF it does not exist
 *
 * @param integer phash value of file
 * @return void
 */
function submitFile_section($hash) {
	// Testing if there is a section. Counting rows (as submitFile_grlist() does) leaves no
	// unfreed result resource behind.
	$count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows(
		'phash',
		'index_section',
		'phash='.intval($hash).' AND page_id='.intval($this->conf['id'])
	);
	if (!$count) {
		$this->submit_section($hash,$this->hash['phash']);
	}
}

/**
 * Removes records for the indexed file, $phash
 * (index_section is deliberately not touched here, unlike in removeOldIndexedPages())
 *
 * @param integer phash value to flush
 * @return void
 */
function removeOldIndexedFiles($phash) {

	// Removing old registrations for tables.
	$tableArr = array('index_phash', 'index_grlist', 'index_fulltext', 'index_debug');
	foreach ($tableArr as $table) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash='.intval($phash));
	}
}

/********************************
 *
 * SQL Helper functions
 *
 *******************************/

/**
 * Check the mtime / tstamp of the currently indexed page/file (based on phash)
 * Return positive integer if the page needs to be indexed
 *
 * @param integer mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
 * @param integer "phash" used to select any already indexed page to see what its mtime is.
01760 * @return integer Result integer: Generally: <0 = No indexing, >0 = Do indexing (see $this->reasons): -2) Min age was NOT exceeded and so indexing cannot occur. -1) mtime matched so no need to reindex page. 0) N/A 1) Max age exceeded, page must be indexed again. 2) mtime of indexed page doesn't match mtime given for current content and we must index page. 3) No mtime was set, so we will index... 4) No indexed page found, so of course we will index. 01761 */ 01762 function checkMtimeTstamp($mtime,$phash) { 01763 01764 // Select indexed page: 01765 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('item_mtime,tstamp', 'index_phash', 'phash='.intval($phash)); 01766 $out = 0; 01767 01768 // If there was an indexing of the page...: 01769 if ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) { 01770 if ($this->tstamp_maxAge && ($row['tstamp'] + $this->tstamp_maxAge) < $GLOBALS['EXEC_TIME']) { // If max age is exceeded, index the page 01771 $out = 1; // The configured max-age was exceeded for the document and thus it's indexed. 01772 } else { 01773 if (!$this->tstamp_minAge || ($row['tstamp'] + $this->tstamp_minAge) < $GLOBALS['EXEC_TIME']) { // if minAge is not set or if minAge is exceeded, consider at mtime 01774 if ($mtime) { // It mtime is set, then it's tested. If not, the page must clearly be indexed. 01775 if ($row['item_mtime'] != $mtime) { // And if mtime is different from the index_phash mtime, it's about time to re-index. 01776 $out = 2; // The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed. 01777 } else { 01778 $out = -1; // mtime matched the document, so no changes detected and no content updated 01779 if ($this->tstamp_maxAge) { 01780 $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($row['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . 
' seconds to expire time).', 1); 01781 } else { 01782 $this->updateTstamp($phash); // Update the timestatmp 01783 $this->log_setTSlogMessage('mtime matched, timestamp updated.',1); 01784 } 01785 } 01786 } else {$out = 3; } // The minimum age was exceed, but mtime was not set, so the page was indexed. 01787 } else {$out = -2;} // The minimum age was not exceeded 01788 } 01789 } else {$out = 4;} // Page has never been indexed (is not represented in the index_phash table). 01790 return $out; 01791 } 01792 01793 /** 01794 * Check content hash in phash table 01795 * 01796 * @return mixed Returns true if the page needs to be indexed (that is, there was no result), otherwise the phash value (in an array) of the phash record to which the grlist_record should be related! 01797 */ 01798 function checkContentHash() { 01799 // With this query the page will only be indexed if it's content is different from the same "phash_grouping" -page. 01800 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_phash A', 'A.phash_grouping='.intval($this->hash['phash_grouping']).' AND A.contentHash='.intval($this->content_md5h)); 01801 if ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) { 01802 return $row; 01803 } 01804 return 1; 01805 } 01806 01807 /** 01808 * Check content hash for external documents 01809 * Returns true if the document needs to be indexed (that is, there was no result) 01810 * 01811 * @param integer phash value to check (phash_grouping) 01812 * @param integer Content hash to check 01813 * @return boolean Returns true if the document needs to be indexed (that is, there was no result) 01814 */ 01815 function checkExternalDocContentHash($hashGr,$content_md5h) { 01816 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A', 'A.phash_grouping='.intval($hashGr).' 
AND A.contentHash='.intval($content_md5h)); 01817 if ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) { 01818 return 0; 01819 } 01820 return 1; 01821 } 01822 01823 /** 01824 * Checks if a grlist record has been set for the phash value input (looking at the "real" phash of the current content, not the linked-to phash of the common search result page) 01825 * 01826 * @param integer Phash integer to test. 01827 * @return void 01828 */ 01829 function is_grlist_set($phash_x) { 01830 return $GLOBALS['TYPO3_DB']->exec_SELECTcountRows( 01831 'phash_x', 01832 'index_grlist', 01833 'phash_x=' . intval($phash_x) 01834 ); 01835 } 01836 01837 /** 01838 * Check if an grlist-entry for this hash exists and if not so, write one. 01839 * 01840 * @param integer phash of the search result that should be found 01841 * @param integer The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; This is the whole reason for the grlist table in fact... 01842 * @return void 01843 * @see submit_grlist() 01844 */ 01845 function update_grlist($phash,$phash_x) { 01846 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', 'phash='.intval($phash).' AND hash_gr_list='.$this->md5inthash($this->conf['gr_list'])); 01847 if (!$GLOBALS['TYPO3_DB']->sql_num_rows($res)) { 01848 $this->submit_grlist($phash,$phash_x); 01849 $this->log_setTSlogMessage("Inserted gr_list '".$this->conf['gr_list']."' for phash '".$phash."'",1); 01850 } 01851 } 01852 01853 /** 01854 * Update tstamp for a phash row. 01855 * 01856 * @param integer phash value 01857 * @param integer If set, update the mtime field to this value. 
01858 * @return void 01859 */ 01860 function updateTstamp($phash,$mtime=0) { 01861 $updateFields = array( 01862 'tstamp' => $GLOBALS['EXEC_TIME'] 01863 ); 01864 if ($mtime) { $updateFields['item_mtime'] = intval($mtime); } 01865 01866 $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash='.intval($phash), $updateFields); 01867 } 01868 01869 /** 01870 * Update SetID of the index_phash record. 01871 * 01872 * @param integer phash value 01873 * @return void 01874 */ 01875 function updateSetId($phash) { 01876 $updateFields = array( 01877 'freeIndexSetId' => intval($this->conf['freeIndexSetId']) 01878 ); 01879 01880 $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash='.intval($phash), $updateFields); 01881 } 01882 01883 /** 01884 * Update parsetime for phash row. 01885 * 01886 * @param integer phash value. 01887 * @param integer Parsetime value to set. 01888 * @return void 01889 */ 01890 function updateParsetime($phash,$parsetime) { 01891 $updateFields = array( 01892 'parsetime' => intval($parsetime) 01893 ); 01894 01895 $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash='.intval($phash), $updateFields); 01896 } 01897 01898 /** 01899 * Update section rootline for the page 01900 * 01901 * @return void 01902 */ 01903 function updateRootline() { 01904 01905 $updateFields = array(); 01906 $this->getRootLineFields($updateFields); 01907 01908 $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', 'page_id='.intval($this->conf['id']), $updateFields); 01909 } 01910 01911 /** 01912 * Adding values for root-line fields. 01913 * rl0, rl1 and rl2 are standard. A hook might add more. 
01914 * 01915 * @param array Field array, passed by reference 01916 * @return void 01917 */ 01918 function getRootLineFields(&$fieldArr) { 01919 01920 $fieldArr['rl0'] = intval($this->conf['rootline_uids'][0]); 01921 $fieldArr['rl1'] = intval($this->conf['rootline_uids'][1]); 01922 $fieldArr['rl2'] = intval($this->conf['rootline_uids'][2]); 01923 01924 if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'])) { 01925 foreach($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] as $fieldName => $rootLineLevel) { 01926 $fieldArr[$fieldName] = intval($this->conf['rootline_uids'][$rootLineLevel]); 01927 } 01928 } 01929 } 01930 01931 /** 01932 * Removes any indexed pages with userlogins which has the same contentHash 01933 * NOT USED anywhere inside this class! 01934 * 01935 * @return void 01936 */ 01937 function removeLoginpagesWithContentHash() { 01938 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A,index_grlist B', ' 01939 A.phash=B.phash 01940 AND A.phash_grouping='.intval($this->hash['phash_grouping']).' 01941 AND B.hash_gr_list!='.$this->md5inthash($this->defaultGrList).' 01942 AND A.contentHash='.intval($this->content_md5h)); 01943 while($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) { 01944 $this->log_setTSlogMessage("The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. 
Therefore the old similar page with phash='".$row['phash']."' are now removed.",1); 01945 $this->removeOldIndexedPages($row['phash']); 01946 } 01947 } 01948 01949 /** 01950 * Includes the crawler class 01951 * 01952 * @return void 01953 */ 01954 function includeCrawlerClass() { 01955 global $TYPO3_CONF_VARS; 01956 01957 require_once(t3lib_extMgm::extPath('crawler').'class.tx_crawler_lib.php'); 01958 } 01959 01960 01961 01962 01963 01964 01965 01966 01967 01968 01969 /******************************** 01970 * 01971 * SQL; Submitting words 01972 * 01973 *******************************/ 01974 01975 /** 01976 * Adds new words to db 01977 * 01978 * @param array Word List array (where each word has information about position etc). 01979 * @return void 01980 */ 01981 function checkWordList($wl) { 01982 $phashArr = array(); 01983 foreach ($wl as $key => $value) { 01984 $phashArr[] = $wl[$key]['hash']; 01985 } 01986 if (count($phashArr)) { 01987 $cwl = implode(',',$phashArr); 01988 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', 'wid IN ('.$cwl.')'); 01989 01990 if($GLOBALS['TYPO3_DB']->sql_num_rows($res)!=count($wl)) { 01991 $this->log_setTSlogMessage('Inserting words: '.(count($wl)-$GLOBALS['TYPO3_DB']->sql_num_rows($res)),1); 01992 while($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) { 01993 unset($wl[$row['baseword']]); 01994 } 01995 01996 foreach ($wl as $key => $val) { 01997 $insertFields = array( 01998 'wid' => $val['hash'], 01999 'baseword' => $key, 02000 'metaphone' => $val['metaphone'] 02001 ); 02002 // A duplicate-key error will occur here if a word is NOT unset in the unset() line. However as long as the words in $wl are NOT longer as 60 chars (the baseword varchar is 60 characters...) this is not a problem. 
02003 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', $insertFields); 02004 } 02005 } 02006 } 02007 } 02008 02009 /** 02010 * Submits RELATIONS between words and phash 02011 * 02012 * @param array Word list array 02013 * @param integer phash value 02014 * @return void 02015 */ 02016 function submitWords($wl,$phash) { 02017 $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash='.intval($phash)); 02018 02019 foreach($wl as $val) { 02020 $insertFields = array( 02021 'phash' => $phash, 02022 'wid' => $val['hash'], 02023 'count' => $val['count'], 02024 'first' => $val['first'], 02025 'freq' => $this->freqMap(($val['count']/$this->wordcount)), 02026 'flags' => ($val['cmp'] & $this->flagBitMask) 02027 ); 02028 02029 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', $insertFields); 02030 } 02031 } 02032 02033 /** 02034 * maps frequency from a real number in [0;1] to an integer in [0;$this->freqRange] with anything above $this->freqMax as 1 02035 * and back. 02036 * 02037 * @param double Frequency 02038 * @return integer Frequency in range. 
02039 */ 02040 function freqMap($freq) { 02041 $mapFactor = $this->freqMax*100*$this->freqRange; 02042 if($freq<1) { 02043 $newFreq = $freq*$mapFactor; 02044 $newFreq = $newFreq>$this->freqRange?$this->freqRange:$newFreq; 02045 } else { 02046 $newFreq = $freq/$mapFactor; 02047 } 02048 return $newFreq; 02049 02050 } 02051 02052 02053 02054 02055 02056 02057 02058 02059 02060 02061 02062 /******************************** 02063 * 02064 * Hashing 02065 * 02066 *******************************/ 02067 02068 /** 02069 * Get search hash, T3 pages 02070 * 02071 * @return void 02072 */ 02073 function setT3Hashes() { 02074 02075 // Set main array: 02076 $hArray = array( 02077 'id' => (integer)$this->conf['id'], 02078 'type' => (integer)$this->conf['type'], 02079 'sys_lang' => (integer)$this->conf['sys_language_uid'], 02080 'MP' => (string)$this->conf['MP'], 02081 'cHash' => $this->cHashParams 02082 ); 02083 02084 // Set grouping hash (Identifies a "page" combined of id, type, language, mountpoint and cHash parameters): 02085 $this->hash['phash_grouping'] = $this->md5inthash(serialize($hArray)); 02086 02087 // Add gr_list and set plain phash (Subdivision where special page composition based on login is taken into account as well. It is expected that such pages are normally similar regardless of the login.) 02088 $hArray['gr_list'] = (string)$this->conf['gr_list']; 02089 $this->hash['phash'] = $this->md5inthash(serialize($hArray)); 02090 } 02091 02092 /** 02093 * Get search hash, external files 02094 * 02095 * @param string File name / path which identifies it on the server 02096 * @param array Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing. 02097 * @return array Array with "phash_grouping" and "phash" inside. 
02098 */ 02099 function setExtHashes($file,$subinfo=array()) { 02100 // Set main array: 02101 $hash = array(); 02102 $hArray = array( 02103 'file' => $file, 02104 ); 02105 02106 // Set grouping hash: 02107 $hash['phash_grouping'] = $this->md5inthash(serialize($hArray)); 02108 02109 // Add subinfo 02110 $hArray['subinfo'] = $subinfo; 02111 $hash['phash'] = $this->md5inthash(serialize($hArray)); 02112 02113 return $hash; 02114 } 02115 02116 /** 02117 * md5 integer hash 02118 * Using 7 instead of 8 just because that makes the integers lower than 32 bit (28 bit) and so they do not interfere with UNSIGNED integers or PHP-versions which has varying output from the hexdec function. 02119 * 02120 * @param string String to hash 02121 * @return integer Integer intepretation of the md5 hash of input string. 02122 */ 02123 function md5inthash($str) { 02124 return hexdec(substr(md5($str),0,7)); 02125 } 02126 02127 02128 02129 02130 02131 02132 02133 02134 02135 02136 02137 /********************************* 02138 * 02139 * Internal logging functions 02140 * 02141 *********************************/ 02142 02143 /** 02144 * Push function wrapper for TT logging 02145 * 02146 * @param string Title to set 02147 * @param string Key (?) 
02148 * @return void 02149 */ 02150 function log_push($msg,$key) { 02151 if (is_object($GLOBALS['TT'])) $GLOBALS['TT']->push($msg,$key); 02152 } 02153 02154 /** 02155 * Pull function wrapper for TT logging 02156 * 02157 * @return void 02158 */ 02159 function log_pull() { 02160 if (is_object($GLOBALS['TT'])) $GLOBALS['TT']->pull(); 02161 } 02162 02163 /** 02164 * Set log message function wrapper for TT logging 02165 * 02166 * @param string Message to set 02167 * @param integer Error number 02168 * @return void 02169 */ 02170 function log_setTSlogMessage($msg, $errorNum=0) { 02171 if (is_object($GLOBALS['TT'])) $GLOBALS['TT']->setTSlogMessage($msg,$errorNum); 02172 $this->internal_log[] = $msg; 02173 } 02174 02175 02176 02177 02178 02179 02180 02181 02182 /************************** 02183 * 02184 * tslib_fe hooks: 02185 * 02186 **************************/ 02187 02188 /** 02189 * Makes sure that keywords are space-separated. This is impotant for their 02190 * proper displaying as a part of fulltext index. 02191 * 02192 * @param string $keywordList 02193 * @return string 02194 * @see http://bugs.typo3.org/view.php?id=1436 02195 */ 02196 protected function addSpacesToKeywordList($keywordList) { 02197 $keywords = t3lib_div::trimExplode(',', $keywordList); 02198 return ' ' . implode(', ', $keywords) . ' '; 02199 } 02200 } 02201 02202 02203 if (defined('TYPO3_MODE') && isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php'])) { 02204 include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']); 02205 } 02206 ?>
1.8.0