|
TYPO3 API
SVNRelease
|
00001 <?php 00002 /*************************************************************** 00003 * Copyright notice 00004 * 00005 * (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com) 00006 * All rights reserved 00007 * 00008 * This script is part of the TYPO3 project. The TYPO3 project is 00009 * free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 2 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * The GNU General Public License can be found at 00015 * http://www.gnu.org/copyleft/gpl.html. 00016 * A copy is found in the textfile GPL.txt and important notices to the license 00017 * from the author is found in LICENSE.txt distributed with these scripts. 00018 * 00019 * 00020 * This script is distributed in the hope that it will be useful, 00021 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00022 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00023 * GNU General Public License for more details. 00024 * 00025 * This copyright notice MUST APPEAR in all copies of the script! 00026 ***************************************************************/ 00027 /** 00028 * Crawler hook for indexed search. 
Works with the "crawler" extension 00029 * 00030 * @author Kasper Skårhøj <kasperYYYY@typo3.com> 00031 */ 00032 /** 00033 * [CLASS/FUNCTION INDEX of SCRIPT] 00034 * 00035 * 00036 * 00037 * 87: class tx_indexedsearch_crawler 00038 * 106: function crawler_init(&$pObj) 00039 * 219: function crawler_execute($params,&$pObj) 00040 * 285: function crawler_execute_type1($cfgRec,&$session_data,$params,&$pObj) 00041 * 345: function crawler_execute_type2($cfgRec,&$session_data,$params,&$pObj) 00042 * 414: function crawler_execute_type3($cfgRec,&$session_data,$params,&$pObj) 00043 * 458: function crawler_execute_type4($cfgRec,&$session_data,$params,&$pObj) 00044 * 513: function cleanUpOldRunningConfigurations() 00045 * 00046 * SECTION: Helper functions 00047 * 579: function checkUrl($url,$urlLog,$baseUrl) 00048 * 602: function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId) 00049 * 645: function indexSingleRecord($r,$cfgRec,$rl=NULL) 00050 * 694: function loadIndexerClass() 00051 * 706: function getUidRootLineForClosestTemplate($id) 00052 * 739: function generateNextIndexingTime($cfgRec) 00053 * 778: function checkDeniedSuburls($url, $url_deny) 00054 * 798: function addQueueEntryForHook($cfgRec, $title) 00055 * 00056 * SECTION: Hook functions for TCEmain (indexing of records) 00057 * 830: function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, &$pObj) 00058 * 00059 * 00060 * 879: class tx_indexedsearch_files 00061 * 888: function crawler_execute($params,&$pObj) 00062 * 913: function loadIndexerClass() 00063 * 00064 * TOTAL FUNCTIONS: 18 00065 * (This index is automatically created/updated by the extension "extdeveval") 00066 * 00067 */ 00068 00069 00070 00071 00072 # To make sure the backend charset is available: 00073 if (!is_object($GLOBALS['LANG'])) { 00074 $GLOBALS['LANG'] = t3lib_div::makeInstance('language'); 00075 $GLOBALS['LANG']->init($GLOBALS['BE_USER']->uc['lang']); 00076 } 00077 00078 00079 /** 00080 * Crawler hook for indexed search. 
Works with the "crawler" extension 00081 * 00082 * @author Kasper Skårhøj <kasperYYYY@typo3.com> 00083 * @package TYPO3 00084 * @subpackage tx_indexedsearch 00085 */ 00086 class tx_indexedsearch_crawler { 00087 00088 // Static: 00089 var $secondsPerExternalUrl = 3; // Number of seconds to use as interval between queued indexing operations of URLs / files (types 2 & 3) 00090 00091 // Internal, dynamic: 00092 var $instanceCounter = 0; // Counts up for each added URL (type 3) 00093 00094 // Internal, static: 00095 var $callBack = 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'; // The object reference to this class. 00096 00097 /** 00098 * Initialization of crawler hook. 00099 * This function is asked for each instance of the crawler and we must check if something is timed to happen and if so put entry(s) in the crawlers log to start processing. 00100 * In reality we select indexing configurations and evaluate if any of them needs to run. 00101 * 00102 * @param object Parent object (tx_crawler lib) 00103 * @return void 00104 */ 00105 function crawler_init(&$pObj){ 00106 00107 // Select all indexing configuration which are waiting to be activated: 00108 $indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows( 00109 '*', 00110 'index_config', 00111 'hidden=0 00112 AND (starttime=0 OR starttime<=' . $GLOBALS['EXEC_TIME'] . ') 00113 AND timer_next_indexing<' . $GLOBALS['EXEC_TIME'] . 
' 00114 AND set_id=0 00115 '.t3lib_BEfunc::deleteClause('index_config') 00116 ); 00117 00118 // For each configuration, check if it should be executed and if so, start: 00119 foreach($indexingConfigurations as $cfgRec) { 00120 00121 // Generate a unique set-ID: 00122 $setId = t3lib_div::md5int(microtime()); 00123 00124 // Get next time: 00125 $nextTime = $this->generateNextIndexingTime($cfgRec); 00126 00127 // Start process by updating index-config record: 00128 $field_array = array ( 00129 'set_id' => $setId, 00130 'timer_next_indexing' => $nextTime, 00131 'session_data' => '', 00132 ); 00133 $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config','uid='.intval($cfgRec['uid']), $field_array); 00134 00135 // Based on configuration type: 00136 switch($cfgRec['type']) { 00137 case 1: // RECORDS: 00138 00139 // Parameters: 00140 $params = array( 00141 'indexConfigUid' => $cfgRec['uid'], 00142 'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'), 00143 'url' => 'Records (start)', // Just for show. 00144 ); 00145 // 00146 $pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']); 00147 break; 00148 case 2: // FILES: 00149 00150 // Parameters: 00151 $params = array( 00152 'indexConfigUid' => $cfgRec['uid'], // General 00153 'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'), // General 00154 'url' => $cfgRec['filepath'], // Partly general... (for URL and file types) 00155 'depth' => 0 // Specific for URL and file types 00156 ); 00157 00158 $pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']); 00159 break; 00160 case 3: // External URL: 00161 00162 // Parameters: 00163 $params = array( 00164 'indexConfigUid' => $cfgRec['uid'], // General 00165 'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'), // General 00166 'url' => $cfgRec['externalUrl'], // Partly general... 
(for URL and file types) 00167 'depth' => 0 // Specific for URL and file types 00168 ); 00169 00170 $pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']); 00171 break; 00172 case 4: // Page tree 00173 00174 // Parameters: 00175 $params = array( 00176 'indexConfigUid' => $cfgRec['uid'], // General 00177 'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'), // General 00178 'url' => intval($cfgRec['alternative_source_pid']), // Partly general... (for URL and file types and page tree (root)) 00179 'depth' => 0 // Specific for URL and file types and page tree 00180 ); 00181 00182 $pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']); 00183 break; 00184 case 5: // Meta configuration, nothing to do: 00185 # NOOP 00186 break; 00187 default: 00188 if ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]) { 00189 $hookObj = t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]); 00190 00191 if (is_object($hookObj)) { 00192 00193 // Parameters: 00194 $params = array( 00195 'indexConfigUid' => $cfgRec['uid'], // General 00196 'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].'/CUSTOM]'), // General 00197 'url' => $hookObj->initMessage($message), 00198 ); 00199 00200 $pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']); 00201 } 00202 } 00203 break; 00204 } 00205 } 00206 00207 // Finally, look up all old index configurations which are finished and needs to be reset and done. 00208 $this->cleanUpOldRunningConfigurations(); 00209 } 00210 00211 /** 00212 * Call back function for execution of a log element 00213 * 00214 * @param array Params from log element. 
Must contain $params['indexConfigUid'] 00215 * @param object Parent object (tx_crawler lib) 00216 * @return array Result array 00217 */ 00218 function crawler_execute($params,&$pObj) { 00219 00220 // Indexer configuration ID must exist: 00221 if ($params['indexConfigUid']) { 00222 00223 // Load the indexing configuration record: 00224 $cfgRec = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow( 00225 '*', 00226 'index_config', 00227 'uid='.intval($params['indexConfigUid']) 00228 ); 00229 00230 if (is_array($cfgRec)) { 00231 00232 // Unpack session data: 00233 $session_data = unserialize($cfgRec['session_data']); 00234 00235 // Select which type: 00236 switch($cfgRec['type']) { 00237 case 1: // Records: 00238 $this->crawler_execute_type1($cfgRec,$session_data,$params,$pObj); 00239 break; 00240 case 2: // Files 00241 $this->crawler_execute_type2($cfgRec,$session_data,$params,$pObj); 00242 break; 00243 case 3: // External URL: 00244 $this->crawler_execute_type3($cfgRec,$session_data,$params,$pObj); 00245 break; 00246 case 4: // Page tree: 00247 $this->crawler_execute_type4($cfgRec,$session_data,$params,$pObj); 00248 break; 00249 case 5: // Meta 00250 # NOOP (should never enter here!) 
00251 break; 00252 default: 00253 if ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]) { 00254 $hookObj = t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]); 00255 00256 if (is_object($hookObj)) { 00257 $this->pObj = $pObj; // For addQueueEntryForHook() 00258 $hookObj->indexOperation($cfgRec,$session_data,$params,$this); 00259 } 00260 } 00261 break; 00262 } 00263 00264 // Save process data which might be modified: 00265 $field_array = array ( 00266 'session_data' => serialize($session_data) 00267 ); 00268 $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config','uid='.intval($cfgRec['uid']), $field_array); 00269 } 00270 } 00271 00272 return array('log' => $params); 00273 } 00274 00275 /** 00276 * Indexing records from a table 00277 * 00278 * @param array Indexing Configuration Record 00279 * @param array Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call! 00280 * @param array Parameters from the log queue. 00281 * @param object Parent object (from "crawler" extension!) 00282 * @return void 00283 */ 00284 function crawler_execute_type1($cfgRec,&$session_data,$params,&$pObj) { 00285 if ($cfgRec['table2index'] && isset($GLOBALS['TCA'][$cfgRec['table2index']])) { 00286 00287 // Init session data array if not already: 00288 if (!is_array($session_data)) { 00289 $session_data = array( 00290 'uid' => 0 00291 ); 00292 } 00293 00294 // Init: 00295 $pid = intval($cfgRec['alternative_source_pid']) ? intval($cfgRec['alternative_source_pid']) : $cfgRec['pid']; 00296 $numberOfRecords = $cfgRec['recordsbatch'] ? 
t3lib_div::intInRange($cfgRec['recordsbatch'],1) : 100; 00297 00298 // Get root line: 00299 $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']); 00300 00301 // Select 00302 $recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows( 00303 '*', 00304 $cfgRec['table2index'], 00305 'pid = '.intval($pid).' 00306 AND uid > '.intval($session_data['uid']). 00307 t3lib_BEfunc::deleteClause($cfgRec['table2index']). 00308 t3lib_BEfunc::BEenableFields($cfgRec['table2index']), 00309 '', 00310 'uid', 00311 $numberOfRecords 00312 ); 00313 00314 // Traverse: 00315 if (count($recs)) { 00316 foreach($recs as $r) { 00317 00318 // Index single record: 00319 $this->indexSingleRecord($r,$cfgRec,$rl); 00320 00321 // Update the UID we last processed: 00322 $session_data['uid'] = $r['uid']; 00323 } 00324 00325 // Finally, set entry for next indexing of batch of records: 00326 $nparams = array( 00327 'indexConfigUid' => $cfgRec['uid'], 00328 'url' => 'Records from UID#'.($r['uid']+1).'-?', 00329 'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']') 00330 ); 00331 $pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid']); 00332 } 00333 } 00334 } 00335 00336 /** 00337 * Indexing files from fileadmin 00338 * 00339 * @param array Indexing Configuration Record 00340 * @param array Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call! 00341 * @param array Parameters from the log queue. 00342 * @param object Parent object (from "crawler" extension!) 
00343 * @return void 00344 */ 00345 function crawler_execute_type2($cfgRec,&$session_data,$params,&$pObj) { 00346 00347 // Prepare path, making it absolute and checking: 00348 $readpath = $params['url']; 00349 if (!t3lib_div::isAbsPath($readpath)) { 00350 $readpath = t3lib_div::getFileAbsFileName($readpath); 00351 } 00352 00353 if (t3lib_div::isAllowedAbsPath($readpath)) { 00354 if (@is_file($readpath)) { // If file, index it! 00355 00356 // Get root line (need to provide this when indexing external files) 00357 $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']); 00358 00359 // Load indexer if not yet. 00360 $this->loadIndexerClass(); 00361 00362 // (Re)-Indexing file on page. 00363 $indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer'); 00364 $indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl); 00365 $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']); 00366 $indexerObj->hash['phash'] = -1; // EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!) 
00367 00368 // Index document: 00369 $indexerObj->indexRegularDocument(substr($readpath,strlen(PATH_site)), TRUE); 00370 } elseif (@is_dir($readpath)) { // If dir, read content and create new pending items for log: 00371 00372 // Select files and directories in path: 00373 $extList = implode(',',t3lib_div::trimExplode(',',$cfgRec['extensions'],1)); 00374 $fileArr = array(); 00375 $files = t3lib_div::getAllFilesAndFoldersInPath($fileArr,$readpath,$extList,0,0); 00376 00377 $directoryList = t3lib_div::get_dirs($readpath); 00378 if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) { 00379 foreach ($directoryList as $subdir) { 00380 if ((string)$subdir!='') { 00381 $files[]= $readpath.$subdir.'/'; 00382 } 00383 } 00384 } 00385 $files = t3lib_div::removePrefixPathFromList($files,PATH_site); 00386 00387 // traverse the items and create log entries: 00388 foreach($files as $path) { 00389 $this->instanceCounter++; 00390 if ($path!==$params['url']) { 00391 // Parameters: 00392 $nparams = array( 00393 'indexConfigUid' => $cfgRec['uid'], 00394 'url' => $path, 00395 'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'), 00396 'depth' => $params['depth']+1 00397 ); 00398 $pObj->addQueueEntry_callBack( 00399 $cfgRec['set_id'], 00400 $nparams, 00401 $this->callBack, 00402 $cfgRec['pid'], 00403 $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl 00404 ); 00405 } 00406 } 00407 } 00408 } 00409 } 00410 00411 /** 00412 * Indexing External URLs 00413 * 00414 * @param array Indexing Configuration Record 00415 * @param array Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call! 00416 * @param array Parameters from the log queue. 00417 * @param object Parent object (from "crawler" extension!) 
00418 * @return void 00419 */ 00420 function crawler_execute_type3($cfgRec,&$session_data,$params,&$pObj) { 00421 00422 // Init session data array if not already: 00423 if (!is_array($session_data)) { 00424 $session_data = array( 00425 'urlLog' => array($params['url']) 00426 ); 00427 } 00428 00429 // Index the URL: 00430 $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']); 00431 $subUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rl, $cfgRec['uid'], $cfgRec['set_id']); 00432 00433 // Add more elements to log now: 00434 if ($params['depth'] < $cfgRec['depth']) { 00435 foreach($subUrls as $url) { 00436 if ($url = $this->checkUrl($url,$session_data['urlLog'],$cfgRec['externalUrl'])) { 00437 if (!$this->checkDeniedSuburls($url, $cfgRec['url_deny'])) { 00438 $this->instanceCounter++; 00439 $session_data['urlLog'][] = $url; 00440 00441 // Parameters: 00442 $nparams = array( 00443 'indexConfigUid' => $cfgRec['uid'], 00444 'url' => $url, 00445 'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'), 00446 'depth' => $params['depth']+1 00447 ); 00448 $pObj->addQueueEntry_callBack( 00449 $cfgRec['set_id'], 00450 $nparams, 00451 $this->callBack, 00452 $cfgRec['pid'], 00453 $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl 00454 ); 00455 } 00456 } 00457 } 00458 } 00459 } 00460 00461 /** 00462 * Page tree indexing type 00463 * 00464 * @param array Indexing Configuration Record 00465 * @param array Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call! 00466 * @param array Parameters from the log queue. 00467 * @param object Parent object (from "crawler" extension!) 
00468 * @return void 00469 */ 00470 function crawler_execute_type4($cfgRec,&$session_data,$params,&$pObj) { 00471 00472 // Base page uid: 00473 $pageUid = intval($params['url']); 00474 00475 // Get array of URLs from page: 00476 $pageRow = t3lib_BEfunc::getRecord('pages',$pageUid); 00477 $res = $pObj->getUrlsForPageRow($pageRow); 00478 00479 $duplicateTrack = array(); // Registry for duplicates 00480 $downloadUrls = array(); // Dummy. 00481 00482 // Submit URLs: 00483 if (count($res)) { 00484 foreach($res as $paramSetKey => $vv) { 00485 $urlList = $pObj->urlListFromUrlArray( 00486 $vv, 00487 $pageRow, 00488 $GLOBALS['EXEC_TIME'], 00489 30, 00490 1, 00491 0, 00492 $duplicateTrack, 00493 $downloadUrls, 00494 array('tx_indexedsearch_reindex') 00495 ); 00496 } 00497 } 00498 00499 // Add subpages to log now: 00500 if ($params['depth'] < $cfgRec['depth']) { 00501 00502 // Subpages selected 00503 $recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows( 00504 'uid,title', 00505 'pages', 00506 'pid = '.intval($pageUid). 
00507 t3lib_BEfunc::deleteClause('pages') 00508 ); 00509 00510 // Traverse subpages and add to queue: 00511 if (count($recs)) { 00512 foreach($recs as $r) { 00513 $this->instanceCounter++; 00514 $url = 'pages:'.$r['uid'].': '.$r['title']; 00515 $session_data['urlLog'][] = $url; 00516 00517 // Parameters: 00518 $nparams = array( 00519 'indexConfigUid' => $cfgRec['uid'], 00520 'url' => $r['uid'], 00521 'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'), 00522 'depth' => $params['depth']+1 00523 ); 00524 $pObj->addQueueEntry_callBack( 00525 $cfgRec['set_id'], 00526 $nparams, 00527 $this->callBack, 00528 $cfgRec['pid'], 00529 $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl 00530 ); 00531 } 00532 } 00533 } 00534 } 00535 00536 /** 00537 * Look up all old index configurations which are finished and needs to be reset and done 00538 * 00539 * @return void 00540 */ 00541 function cleanUpOldRunningConfigurations() { 00542 00543 // Lookup running index configurations: 00544 $runningIndexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows( 00545 'uid,set_id', 00546 'index_config', 00547 'set_id!=0'.t3lib_BEfunc::deleteClause('index_config') 00548 ); 00549 00550 // For each running configuration, look up how many log entries there are which are scheduled for execution and if none, clear the "set_id" (means; Processing was DONE) 00551 foreach($runningIndexingConfigurations as $cfgRec) { 00552 00553 // Look for ended processes: 00554 $queued_items = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows( 00555 '*', 00556 'tx_crawler_queue', 00557 'set_id=' . intval($cfgRec['set_id']) . ' AND exec_time=0' 00558 ); 00559 00560 if (!$queued_items) { 00561 00562 // Lookup old phash rows: 00563 $oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows( 00564 'phash', 00565 'index_phash', 00566 'freeIndexUid='.intval($cfgRec['uid']).' 
AND freeIndexSetId!='.$cfgRec['set_id'] 00567 ); 00568 00569 foreach($oldPhashRows as $pHashRow) { 00570 // Removing old registrations for all tables (code copied from class.tx_indexedsearch_modfunc1.php) 00571 $tableArr = explode(',','index_phash,index_rel,index_section,index_grlist,index_fulltext,index_debug'); 00572 foreach($tableArr as $table) { 00573 $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash='.intval($pHashRow['phash'])); 00574 } 00575 } 00576 00577 // End process by updating index-config record: 00578 $field_array = array ( 00579 'set_id' => 0, 00580 'session_data' => '', 00581 ); 00582 $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config','uid='.intval($cfgRec['uid']), $field_array); 00583 } 00584 } 00585 } 00586 00587 00588 00589 00590 00591 00592 00593 /***************************************** 00594 * 00595 * Helper functions 00596 * 00597 *****************************************/ 00598 00599 /** 00600 * Check if an input URL are allowed to be indexed. Depends on whether it is already present in the url log. 00601 * 00602 * @param string URL string to check 00603 * @param array Array of already indexed URLs (input url is looked up here and must not exist already) 00604 * @param string Base URL of the indexing process (input URL must be "inside" the base URL!) 00605 * @return string Returls the URL if OK, otherwise false 00606 */ 00607 function checkUrl($url,$urlLog,$baseUrl) { 00608 $url = preg_replace('/\/\/$/','/',$url); 00609 list($url) = explode('#',$url); 00610 00611 if (!strstr($url,'../')) { 00612 if (t3lib_div::isFirstPartOfStr($url,$baseUrl)) { 00613 if (!in_array($url,$urlLog)) { 00614 return $url; 00615 } 00616 } 00617 } 00618 } 00619 00620 /** 00621 * Indexing External URL 00622 * 00623 * @param string URL, http://.... 00624 * @param integer Page id to relate indexing to. 
00625 * @param array Rootline array to relate indexing to 00626 * @param integer Configuration UID 00627 * @param integer Set ID value 00628 * @return array URLs found on this page 00629 */ 00630 function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId) { 00631 00632 // Load indexer if not yet. 00633 $this->loadIndexerClass(); 00634 00635 // Index external URL: 00636 $indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer'); 00637 $indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl); 00638 $indexerObj->backend_setFreeIndexUid($cfgUid, $setId); 00639 $indexerObj->hash['phash'] = -1; // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!) 00640 00641 $indexerObj->indexExternalUrl($url); 00642 $url_qParts = parse_url($url); 00643 00644 $baseAbsoluteHref = $url_qParts['scheme'] . '://' . $url_qParts['host']; 00645 $baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content); 00646 if (!$baseHref) { 00647 // Extract base href from current URL 00648 $baseHref = $baseAbsoluteHref; 00649 $baseHref .= substr($url_qParts['path'], 0, strrpos($url_qParts['path'], '/')); 00650 } 00651 $baseHref = rtrim($baseHref, '/'); 00652 00653 // Get URLs on this page: 00654 $subUrls = array(); 00655 $list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content); 00656 00657 // Traverse links: 00658 foreach ($list as $count => $linkInfo) { 00659 00660 // Decode entities: 00661 $subUrl = t3lib_div::htmlspecialchars_decode($linkInfo['href']); 00662 00663 $qParts = parse_url($subUrl); 00664 if (!$qParts['scheme']) { 00665 $relativeUrl = t3lib_div::resolveBackPath($subUrl); 00666 if ($relativeUrl{0} === '/') { 00667 $subUrl = $baseAbsoluteHref . $relativeUrl; 00668 } else { 00669 $subUrl = $baseHref . '/' . 
$relativeUrl; 00670 } 00671 } 00672 00673 $subUrls[] = $subUrl; 00674 } 00675 00676 return $subUrls; 00677 } 00678 00679 /** 00680 * Indexing Single Record 00681 * 00682 * @param array Record to index 00683 * @param array Configuration Record 00684 * @param array Rootline array to relate indexing to 00685 * @return void 00686 */ 00687 function indexSingleRecord($r,$cfgRec,$rl=NULL) { 00688 00689 // Load indexer if not yet. 00690 $this->loadIndexerClass(); 00691 00692 00693 // Init: 00694 $rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']); 00695 $fieldList = t3lib_div::trimExplode(',',$cfgRec['fieldlist'],1); 00696 $languageField = $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['languageField']; 00697 $sys_language_uid = $languageField ? $r[$languageField] : 0; 00698 00699 // (Re)-Indexing a row from a table: 00700 $indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer'); 00701 parse_str(str_replace('###UID###',$r['uid'],$cfgRec['get_params']),$GETparams); 00702 $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, $cfgRec['chashcalc'] ? TRUE : FALSE); 00703 $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']); 00704 $indexerObj->forceIndexing = TRUE; 00705 00706 $theContent = ''; 00707 foreach($fieldList as $k => $v) { 00708 if (!$k) { 00709 $theTitle = $r[$v]; 00710 } else { 00711 $theContent.= $r[$v].' '; 00712 } 00713 } 00714 00715 // Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid()) 00716 $indexerObj->backend_indexAsTYPO3Page( 00717 strip_tags(str_replace('<', ' <', $theTitle)), 00718 '', 00719 '', 00720 strip_tags(str_replace('<', ' <', $theContent)), 00721 $GLOBALS['LANG']->charSet, // Requires that 00722 $r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['tstamp']], 00723 $r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['crdate']], 00724 $r['uid'] 00725 ); 00726 } 00727 00728 /** 00729 * Include indexer class. 
00730 * 00731 * @return void 00732 */ 00733 function loadIndexerClass() { 00734 global $TYPO3_CONF_VARS; 00735 require_once(t3lib_extMgm::extPath('indexed_search').'class.indexer.php'); 00736 } 00737 00738 /** 00739 * Get rootline for closest TypoScript template root. 00740 * Algorithm same as used in Web > Template, Object browser 00741 * 00742 * @param integer The page id to traverse rootline back from 00743 * @return array Array where the root lines uid values are found. 00744 */ 00745 function getUidRootLineForClosestTemplate($id) { 00746 global $TYPO3_CONF_VARS; 00747 00748 $tmpl = t3lib_div::makeInstance("t3lib_tsparser_ext"); 00749 $tmpl->tt_track = 0; // Do not log time-performance information 00750 $tmpl->init(); 00751 00752 // Gets the rootLine 00753 $sys_page = t3lib_div::makeInstance("t3lib_pageSelect"); 00754 $rootLine = $sys_page->getRootLine($id); 00755 $tmpl->runThroughTemplates($rootLine,0); // This generates the constants/config + hierarchy info for the template. 00756 00757 // Root line uids 00758 $rootline_uids = array(); 00759 foreach($tmpl->rootLine as $rlkey => $rldat) { 00760 $rootline_uids[$rlkey] = $rldat['uid']; 00761 } 00762 00763 return $rootline_uids; 00764 } 00765 00766 /** 00767 * Generate the unix time stamp for next visit. 00768 * 00769 * @param array Index configuration record 00770 * @return integer The next time stamp 00771 */ 00772 function generateNextIndexingTime($cfgRec) { 00773 $currentTime = $GLOBALS['EXEC_TIME']; 00774 00775 // Now, find a midnight time to use for offset calculation. 
This has to differ depending on whether we have frequencies within a day or more than a day; Less than a day, we don't care which day to use for offset, more than a day we want to respect the currently entered day as offset regardless of when the script is run - thus the day-of-week used in case "Weekly" is selected will be respected 00776 if ($cfgRec['timer_frequency']<=24*3600) { 00777 $aMidNight = mktime (0,0,0)-1*24*3600; 00778 } else { 00779 $lastTime = $cfgRec['timer_next_indexing'] ? $cfgRec['timer_next_indexing'] : $GLOBALS['EXEC_TIME']; 00780 $aMidNight = mktime (0,0,0, date('m',$lastTime), date('d',$lastTime), date('y',$lastTime)); 00781 } 00782 00783 // Find last offset time plus frequency in seconds: 00784 $lastSureOffset = $aMidNight+t3lib_div::intInRange($cfgRec['timer_offset'],0,86400); 00785 $frequencySeconds = t3lib_div::intInRange($cfgRec['timer_frequency'],1); 00786 00787 // Now, find out how many blocks of the length of frequency there is until the next time: 00788 $frequencyBlocksUntilNextTime = ceil(($currentTime-$lastSureOffset)/$frequencySeconds); 00789 00790 // Set next time to the offset + the frequencyblocks multiplied with the frequency length in seconds. 00791 $nextTime = $lastSureOffset + $frequencyBlocksUntilNextTime*$frequencySeconds; 00792 00793 return $nextTime; 00794 } 00795 00796 /** 00797 * Checks if $url has any of the URls in the $url_deny "list" in it and if so, returns true. 00798 * 00799 * @param string URL to test 00800 * @param string String where URLs are separated by line-breaks; If any of these strings is the first part of $url, the function returns TRUE (to indicate denial of decend) 00801 * @return boolean TRUE if there is a matching URL (hence, do not index!) 
00802 */ 00803 function checkDeniedSuburls($url, $url_deny) { 00804 if (trim($url_deny)) { 00805 $url_denyArray = t3lib_div::trimExplode(LF,$url_deny,1); 00806 foreach($url_denyArray as $testurl) { 00807 if (t3lib_div::isFirstPartOfStr($url,$testurl)) { 00808 echo $url.' /// '.$url_deny.LF; 00809 return TRUE; 00810 } 00811 } 00812 } 00813 return FALSE; 00814 } 00815 00816 /** 00817 * Adding entry in queue for Hook 00818 * 00819 * @param array Configuration record 00820 * @param string Title/URL 00821 * @return void 00822 */ 00823 function addQueueEntryForHook($cfgRec, $title) { 00824 00825 $nparams = array( 00826 'indexConfigUid' => $cfgRec['uid'], // This must ALWAYS be the cfgRec uid! 00827 'url' => $title, 00828 'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']') // Also just for information. Its good style to show that its an indexing configuration that added the entry. 00829 ); 00830 $this->pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid']); 00831 } 00832 00833 /** 00834 * Deletes all data stored by indexed search for a given page 00835 * 00836 * @param integer Uid of the page to delete all pHash 00837 * @return void 00838 */ 00839 function deleteFromIndex($id) { 00840 00841 // Lookup old phash rows: 00842 $oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('phash','index_section', 'page_id='.intval($id)); 00843 00844 if (count($oldPhashRows)) { 00845 $pHashesToDelete = array(); 00846 foreach ($oldPhashRows as $pHashRow) { 00847 $pHashesToDelete[] = $pHashRow['phash']; 00848 } 00849 00850 $where_clause = 'phash IN ('.implode(',',$GLOBALS['TYPO3_DB']->cleanIntArray($pHashesToDelete)).')'; 00851 $tables = explode(',', 'index_debug,index_fulltext,index_grlist,index_phash,index_rel,index_section'); 00852 foreach ($tables as $table) { 00853 $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, $where_clause); 00854 } 00855 } 00856 } 00857 00858 00859 00860 00861 00862 00863 00864 /************************* 00865 * 
* Hook functions for TCEmain (indexing of records)
	 *
	 *************************/

	/**
	 * TCEmain hook function for commands: removes the index data of a page when
	 * the command "delete" is issued for a record of the table "pages".
	 *
	 * @param	string		TCEmain command
	 * @param	string		Table name
	 * @param	string		Record ID. If new record its a string pointing to index inside t3lib_tcemain::substNEWwithIDs
	 * @param	mixed		Target value (ignored)
	 * @param	object		Reference to tcemain calling object
	 * @return	void
	 */
	function processCmdmap_preProcess($command, $table, $id, $value, $pObj) {

			// Clean up the index
		if ($command=='delete' && $table == 'pages') {
			$this->deleteFromIndex($id);
		}
	}

	/**
	 * TCEmain hook function for on-the-fly indexing of database records
	 *
	 * For a page updated with hidden=1 or no_search=1 the index data of that page
	 * is deleted. After that the full record is loaded and handed to
	 * indexSingleRecord() once for every matching indexing configuration.
	 *
	 * @param	string		Status "new" or "update"
	 * @param	string		Table name
	 * @param	string		Record ID. If new record its a string pointing to index inside t3lib_tcemain::substNEWwithIDs
	 * @param	array		Field array of updated fields in the operation
	 * @param	object		Reference to tcemain calling object
	 * @return	void
	 */
	function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj) {

			// Check if any fields are actually updated:
		if (!count($fieldArray)) {
			return;
		}

			// Translate new ids.
		if ($status=='new') {
			$id = $pObj->substNEWwithIDs[$id];

		} elseif ($table=='pages' && $status=='update' && ((array_key_exists('hidden',$fieldArray) && $fieldArray['hidden']==1) || (array_key_exists('no_search',$fieldArray) && $fieldArray['no_search']==1))) {

				// If the page should be hidden or not indexed after update, delete index for this page
				// (loose comparison on purpose: the submitted value may be integer 1 or string "1")
			$this->deleteFromIndex($id);
		}

			// Get full record and if exists, search for indexing configurations:
		$currentRecord = t3lib_BEfunc::getRecord($table,$id);
		if (!is_array($currentRecord)) {
			return;
		}

			// Select all (not running) indexing configurations of type "record" (1) and which points to this table and is located on the same page as the record or pointing to the right source PID
		$indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
			'*',
			'index_config',
			'hidden=0
				AND (starttime=0 OR starttime<=' . intval($GLOBALS['EXEC_TIME']) . ')
				AND set_id=0
				AND type=1
				AND table2index='.$GLOBALS['TYPO3_DB']->fullQuoteStr($table,'index_config').'
				AND (
						(alternative_source_pid=0 AND pid='.intval($currentRecord['pid']).')
						OR (alternative_source_pid='.intval($currentRecord['pid']).')
					)
				AND records_indexonchange=1
				'.t3lib_BEfunc::deleteClause('index_config')
		);

			// Only iterate if the query actually delivered an array of rows (a failed query does not).
		if (is_array($indexingConfigurations)) {
			foreach($indexingConfigurations as $cfgRec) {
				$this->indexSingleRecord($currentRecord,$cfgRec);
			}
		}
	}
}


/**
 * Crawler hook for indexed search. Works with the "crawler" extension
 * This hook is specifically used to index external files found on pages through the crawler extension.
*
 * @author	Kasper Skårhøj <kasperYYYY@typo3.com>
 * @package TYPO3
 * @subpackage tx_indexedsearch
 * @see tx_indexedsearch_indexer::extractLinks()
 */
class tx_indexedsearch_files {

	/**
	 * Call back function for execution of a log element
	 *
	 * Indexes the file named in $params['document'] with a freshly initialized
	 * tx_indexedsearch_indexer. If $params['alturl'] is set, that value is passed
	 * as the document to index, while $params['document'] and its lowercased
	 * file extension are passed as additional arguments.
	 *
	 * @param	array		Params from log element. The keys "conf", "document" and "alturl" are read.
	 * @param	object		Parent object (tx_crawler lib)
	 * @return	array		Result array. Nothing is returned if $params['conf'] is not an array.
	 */
	function crawler_execute($params,&$pObj) {

			// Load indexer if not yet.
		$this->loadIndexerClass();

		if (isset($params['conf']) && is_array($params['conf'])) {

				// Initialize the indexer class:
			$indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
			$indexerObj->conf = $params['conf'];
			$indexerObj->init();

				// Index document:
			if (!empty($params['alturl'])) {
				$fI = pathinfo($params['document']);
					// pathinfo() leaves out "extension" for names without one; use an empty string then.
				$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
				$indexerObj->indexRegularDocument($params['alturl'], TRUE, $params['document'], $ext);
			} else {
				$indexerObj->indexRegularDocument($params['document'], TRUE);
			}

				// Return OK:
			return array('content' => array());
		}
	}

	/**
	 * Include indexer class.
	 *
	 * @return	void
	 */
	function loadIndexerClass() {
			// NOTE(review): the global import looks unused, but the required file is evaluated in this
			// function's scope and presumably reads $TYPO3_CONF_VARS - confirm before removing it.
		global $TYPO3_CONF_VARS;
		require_once(t3lib_extMgm::extPath('indexed_search').'class.indexer.php');
	}
}


if (defined('TYPO3_MODE') && isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php'])) {
	include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php']);
}

?>
1.8.0