TYPO3 API  SVNRelease
class.crawler.php
Go to the documentation of this file.
00001 <?php
00002 /***************************************************************
00003 *  Copyright notice
00004 *
00005 *  (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
00006 *  All rights reserved
00007 *
00008 *  This script is part of the TYPO3 project. The TYPO3 project is
00009 *  free software; you can redistribute it and/or modify
00010 *  it under the terms of the GNU General Public License as published by
00011 *  the Free Software Foundation; either version 2 of the License, or
00012 *  (at your option) any later version.
00013 *
00014 *  The GNU General Public License can be found at
00015 *  http://www.gnu.org/copyleft/gpl.html.
00016 *  A copy is found in the textfile GPL.txt and important notices to the license
00017 *  from the author is found in LICENSE.txt distributed with these scripts.
00018 *
00019 *
00020 *  This script is distributed in the hope that it will be useful,
00021 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00022 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00023 *  GNU General Public License for more details.
00024 *
00025 *  This copyright notice MUST APPEAR in all copies of the script!
00026 ***************************************************************/
00027 /**
00028  * Crawler hook for indexed search. Works with the "crawler" extension
00029  *
00030  * @author  Kasper Skårhøj <kasperYYYY@typo3.com>
00031  */
00032 /**
00033  * [CLASS/FUNCTION INDEX of SCRIPT]
00034  *
00035  *
00036  *
00037  *   87: class tx_indexedsearch_crawler
00038  *  106:     function crawler_init(&$pObj)
00039  *  219:     function crawler_execute($params,&$pObj)
00040  *  285:     function crawler_execute_type1($cfgRec,&$session_data,$params,&$pObj)
00041  *  345:     function crawler_execute_type2($cfgRec,&$session_data,$params,&$pObj)
00042  *  414:     function crawler_execute_type3($cfgRec,&$session_data,$params,&$pObj)
00043  *  458:     function crawler_execute_type4($cfgRec,&$session_data,$params,&$pObj)
00044  *  513:     function cleanUpOldRunningConfigurations()
00045  *
00046  *              SECTION: Helper functions
00047  *  579:     function checkUrl($url,$urlLog,$baseUrl)
00048  *  602:     function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
00049  *  645:     function indexSingleRecord($r,$cfgRec,$rl=NULL)
00050  *  694:     function loadIndexerClass()
00051  *  706:     function getUidRootLineForClosestTemplate($id)
00052  *  739:     function generateNextIndexingTime($cfgRec)
00053  *  778:     function checkDeniedSuburls($url, $url_deny)
00054  *  798:     function addQueueEntryForHook($cfgRec, $title)
00055  *
00056  *              SECTION: Hook functions for TCEmain (indexing of records)
00057  *  830:     function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, &$pObj)
00058  *
00059  *
00060  *  879: class tx_indexedsearch_files
00061  *  888:     function crawler_execute($params,&$pObj)
00062  *  913:     function loadIndexerClass()
00063  *
00064  * TOTAL FUNCTIONS: 18
00065  * (This index is automatically created/updated by the extension "extdeveval")
00066  *
00067  */
00068 
00069 
00070 
00071 
# Ensure a language object exists, so that the backend charset is available:
if (is_object($GLOBALS['LANG']) === FALSE) {
    $GLOBALS['LANG'] = t3lib_div::makeInstance('language');
    $GLOBALS['LANG']->init($GLOBALS['BE_USER']->uc['lang']);
}
00077 
00078 
00079 /**
00080  * Crawler hook for indexed search. Works with the "crawler" extension
00081  *
00082  * @author  Kasper Skårhøj <kasperYYYY@typo3.com>
00083  * @package TYPO3
00084  * @subpackage tx_indexedsearch
00085  */
00086 class tx_indexedsearch_crawler {
00087 
00088         // Static:
00089     var $secondsPerExternalUrl = 3;     // Number of seconds to use as interval between queued indexing operations of URLs / files (types 2 & 3)
00090 
00091         // Internal, dynamic:
00092     var $instanceCounter = 0;       // Counts up for each added URL (type 3)
00093 
00094         // Internal, static:
00095     var $callBack = 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler';       // The object reference to this class.
00096 
00097     /**
00098      * Initialization of crawler hook.
00099      * This function is called for each instance of the crawler. It must check whether anything is due to happen and, if so, put one or more entries in the crawler's log to start processing.
00100      * In reality we select indexing configurations and evaluate if any of them needs to run.
00101      *
00102      * @param   object      Parent object (tx_crawler lib)
00103      * @return  void
00104      */
00105     function crawler_init(&$pObj){
00106 
00107             // Select all indexing configuration which are waiting to be activated:
00108         $indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
00109             '*',
00110             'index_config',
00111             'hidden=0
00112                 AND (starttime=0 OR starttime<=' . $GLOBALS['EXEC_TIME'] . ')
00113                 AND timer_next_indexing<' . $GLOBALS['EXEC_TIME'] . '
00114                 AND set_id=0
00115                 '.t3lib_BEfunc::deleteClause('index_config')
00116         );
00117 
00118             // For each configuration, check if it should be executed and if so, start:
00119         foreach($indexingConfigurations as $cfgRec) {
00120 
00121                 // Generate a unique set-ID:
00122             $setId = t3lib_div::md5int(microtime());
00123 
00124                 // Get next time:
00125             $nextTime = $this->generateNextIndexingTime($cfgRec);
00126 
00127                 // Start process by updating index-config record:
00128             $field_array = array (
00129                 'set_id' => $setId,
00130                 'timer_next_indexing' => $nextTime,
00131                 'session_data' => '',
00132             );
00133             $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config','uid='.intval($cfgRec['uid']), $field_array);
00134 
00135                 // Based on configuration type:
00136             switch($cfgRec['type']) {
00137                 case 1: // RECORDS:
00138 
00139                         // Parameters:
00140                     $params = array(
00141                         'indexConfigUid' => $cfgRec['uid'],
00142                         'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
00143                         'url' => 'Records (start)', // Just for show.
00144                     );
00145                         //
00146                     $pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']);
00147                 break;
00148                 case 2: // FILES:
00149 
00150                         // Parameters:
00151                     $params = array(
00152                         'indexConfigUid' => $cfgRec['uid'],     // General
00153                         'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),  // General
00154                         'url' => $cfgRec['filepath'],   // Partly general... (for URL and file types)
00155                         'depth' => 0    // Specific for URL and file types
00156                     );
00157 
00158                     $pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']);
00159                 break;
00160                 case 3: // External URL:
00161 
00162                         // Parameters:
00163                     $params = array(
00164                         'indexConfigUid' => $cfgRec['uid'],     // General
00165                         'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),  // General
00166                         'url' => $cfgRec['externalUrl'],    // Partly general... (for URL and file types)
00167                         'depth' => 0    // Specific for URL and file types
00168                     );
00169 
00170                     $pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']);
00171                 break;
00172                 case 4: // Page tree
00173 
00174                         // Parameters:
00175                     $params = array(
00176                         'indexConfigUid' => $cfgRec['uid'],     // General
00177                         'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),  // General
00178                         'url' => intval($cfgRec['alternative_source_pid']), // Partly general... (for URL and file types and page tree (root))
00179                         'depth' => 0    // Specific for URL and file types and page tree
00180                     );
00181 
00182                     $pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']);
00183                 break;
00184                 case 5: // Meta configuration, nothing to do:
00185                     # NOOP
00186                 break;
00187                 default:
00188                     if ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']])   {
00189                         $hookObj = t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);
00190 
00191                         if (is_object($hookObj))    {
00192 
00193                                 // Parameters:
00194                             $params = array(
00195                                 'indexConfigUid' => $cfgRec['uid'],     // General
00196                                 'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].'/CUSTOM]'),   // General
00197                                 'url' => $hookObj->initMessage($message),
00198                             );
00199 
00200                             $pObj->addQueueEntry_callBack($setId,$params,$this->callBack,$cfgRec['pid']);
00201                         }
00202                     }
00203                 break;
00204             }
00205         }
00206 
00207             // Finally, look up all old index configurations which are finished and needs to be reset and done.
00208         $this->cleanUpOldRunningConfigurations();
00209     }
00210 
00211     /**
00212      * Call back function for execution of a log element
00213      *
00214      * @param   array       Params from log element. Must contain $params['indexConfigUid']
00215      * @param   object      Parent object (tx_crawler lib)
00216      * @return  array       Result array
00217      */
00218     function crawler_execute($params,&$pObj)    {
00219 
00220             // Indexer configuration ID must exist:
00221         if ($params['indexConfigUid'])  {
00222 
00223                 // Load the indexing configuration record:
00224             $cfgRec = $GLOBALS['TYPO3_DB']->exec_SELECTgetSingleRow(
00225                 '*',
00226                 'index_config',
00227                 'uid='.intval($params['indexConfigUid'])
00228             );
00229 
00230             if (is_array($cfgRec))  {
00231 
00232                     // Unpack session data:
00233                 $session_data = unserialize($cfgRec['session_data']);
00234 
00235                     // Select which type:
00236                 switch($cfgRec['type']) {
00237                     case 1: // Records:
00238                         $this->crawler_execute_type1($cfgRec,$session_data,$params,$pObj);
00239                     break;
00240                     case 2: // Files
00241                         $this->crawler_execute_type2($cfgRec,$session_data,$params,$pObj);
00242                     break;
00243                     case 3: // External URL:
00244                         $this->crawler_execute_type3($cfgRec,$session_data,$params,$pObj);
00245                     break;
00246                     case 4: // Page tree:
00247                         $this->crawler_execute_type4($cfgRec,$session_data,$params,$pObj);
00248                     break;
00249                     case 5: // Meta
00250                         # NOOP (should never enter here!)
00251                     break;
00252                     default:
00253                         if ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']])   {
00254                             $hookObj = t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);
00255 
00256                             if (is_object($hookObj))    {
00257                                 $this->pObj = $pObj;    // For addQueueEntryForHook()
00258                                 $hookObj->indexOperation($cfgRec,$session_data,$params,$this);
00259                             }
00260                         }
00261                     break;
00262                 }
00263 
00264                     // Save process data which might be modified:
00265                 $field_array = array (
00266                     'session_data' => serialize($session_data)
00267                 );
00268                 $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config','uid='.intval($cfgRec['uid']), $field_array);
00269             }
00270         }
00271 
00272         return array('log' => $params);
00273     }
00274 
00275     /**
00276      * Indexing records from a table
00277      *
00278      * @param   array       Indexing Configuration Record
00279      * @param   array       Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
00280      * @param   array       Parameters from the log queue.
00281      * @param   object      Parent object (from "crawler" extension!)
00282      * @return  void
00283      */
00284     function crawler_execute_type1($cfgRec,&$session_data,$params,&$pObj)   {
00285         if ($cfgRec['table2index'] && isset($GLOBALS['TCA'][$cfgRec['table2index']]))   {
00286 
00287                 // Init session data array if not already:
00288             if (!is_array($session_data))   {
00289                 $session_data = array(
00290                     'uid' => 0
00291                 );
00292             }
00293 
00294                 // Init:
00295             $pid = intval($cfgRec['alternative_source_pid']) ? intval($cfgRec['alternative_source_pid']) : $cfgRec['pid'];
00296             $numberOfRecords = $cfgRec['recordsbatch'] ? t3lib_div::intInRange($cfgRec['recordsbatch'],1) : 100;
00297 
00298                 // Get root line:
00299             $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
00300 
00301                 // Select
00302             $recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
00303                         '*',
00304                         $cfgRec['table2index'],
00305                         'pid = '.intval($pid).'
00306                             AND uid > '.intval($session_data['uid']).
00307                             t3lib_BEfunc::deleteClause($cfgRec['table2index']).
00308                             t3lib_BEfunc::BEenableFields($cfgRec['table2index']),
00309                         '',
00310                         'uid',
00311                         $numberOfRecords
00312                     );
00313 
00314                 // Traverse:
00315             if (count($recs))   {
00316                 foreach($recs as $r)    {
00317 
00318                         // Index single record:
00319                     $this->indexSingleRecord($r,$cfgRec,$rl);
00320 
00321                         // Update the UID we last processed:
00322                     $session_data['uid'] = $r['uid'];
00323                 }
00324 
00325                     // Finally, set entry for next indexing of batch of records:
00326                 $nparams = array(
00327                     'indexConfigUid' => $cfgRec['uid'],
00328                     'url' => 'Records from UID#'.($r['uid']+1).'-?',
00329                     'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']')
00330                 );
00331                 $pObj->addQueueEntry_callBack($cfgRec['set_id'],$nparams,$this->callBack,$cfgRec['pid']);
00332             }
00333         }
00334     }
00335 
00336     /**
00337      * Indexing files from fileadmin
00338      *
00339      * @param   array       Indexing Configuration Record
00340      * @param   array       Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
00341      * @param   array       Parameters from the log queue.
00342      * @param   object      Parent object (from "crawler" extension!)
00343      * @return  void
00344      */
00345     function crawler_execute_type2($cfgRec,&$session_data,$params,&$pObj)   {
00346 
00347             // Prepare path, making it absolute and checking:
00348         $readpath = $params['url'];
00349         if (!t3lib_div::isAbsPath($readpath))   {
00350             $readpath = t3lib_div::getFileAbsFileName($readpath);
00351         }
00352 
00353         if (t3lib_div::isAllowedAbsPath($readpath)) {
00354             if (@is_file($readpath))    {   // If file, index it!
00355 
00356                     // Get root line (need to provide this when indexing external files)
00357                 $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
00358 
00359                     // Load indexer if not yet.
00360                 $this->loadIndexerClass();
00361 
00362                     // (Re)-Indexing file on page.
00363                 $indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
00364                 $indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
00365                 $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
00366                 $indexerObj->hash['phash'] = -1;    // EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
00367 
00368                     // Index document:
00369                 $indexerObj->indexRegularDocument(substr($readpath,strlen(PATH_site)), TRUE);
00370             } elseif (@is_dir($readpath)) { // If dir, read content and create new pending items for log:
00371 
00372                     // Select files and directories in path:
00373                 $extList = implode(',',t3lib_div::trimExplode(',',$cfgRec['extensions'],1));
00374                 $fileArr = array();
00375                 $files = t3lib_div::getAllFilesAndFoldersInPath($fileArr,$readpath,$extList,0,0);
00376 
00377                 $directoryList = t3lib_div::get_dirs($readpath);
00378                 if (is_array($directoryList) && $params['depth'] < $cfgRec['depth'])    {
00379                     foreach ($directoryList as $subdir) {
00380                         if ((string)$subdir!='')    {
00381                             $files[]= $readpath.$subdir.'/';
00382                         }
00383                     }
00384                 }
00385                 $files = t3lib_div::removePrefixPathFromList($files,PATH_site);
00386 
00387                     // traverse the items and create log entries:
00388                 foreach($files as $path)    {
00389                     $this->instanceCounter++;
00390                     if ($path!==$params['url']) {
00391                             // Parameters:
00392                         $nparams = array(
00393                             'indexConfigUid' => $cfgRec['uid'],
00394                             'url' => $path,
00395                             'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
00396                             'depth' => $params['depth']+1
00397                         );
00398                         $pObj->addQueueEntry_callBack(
00399                             $cfgRec['set_id'],
00400                             $nparams,
00401                             $this->callBack,
00402                             $cfgRec['pid'],
00403                             $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
00404                         );
00405                     }
00406                 }
00407             }
00408         }
00409     }
00410 
00411     /**
00412      * Indexing External URLs
00413      *
00414      * @param   array       Indexing Configuration Record
00415      * @param   array       Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
00416      * @param   array       Parameters from the log queue.
00417      * @param   object      Parent object (from "crawler" extension!)
00418      * @return  void
00419      */
00420     function crawler_execute_type3($cfgRec,&$session_data,$params,&$pObj)   {
00421 
00422             // Init session data array if not already:
00423         if (!is_array($session_data))   {
00424             $session_data = array(
00425                 'urlLog' => array($params['url'])
00426             );
00427         }
00428 
00429             // Index the URL:
00430         $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
00431         $subUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rl, $cfgRec['uid'], $cfgRec['set_id']);
00432 
00433             // Add more elements to log now:
00434         if ($params['depth'] < $cfgRec['depth'])    {
00435             foreach($subUrls as $url)   {
00436                 if ($url = $this->checkUrl($url,$session_data['urlLog'],$cfgRec['externalUrl']))    {
00437                     if (!$this->checkDeniedSuburls($url, $cfgRec['url_deny']))  {
00438                         $this->instanceCounter++;
00439                         $session_data['urlLog'][] = $url;
00440 
00441                             // Parameters:
00442                         $nparams = array(
00443                             'indexConfigUid' => $cfgRec['uid'],
00444                             'url' => $url,
00445                             'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
00446                             'depth' => $params['depth']+1
00447                         );
00448                         $pObj->addQueueEntry_callBack(
00449                             $cfgRec['set_id'],
00450                             $nparams,
00451                             $this->callBack,
00452                             $cfgRec['pid'],
00453                             $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
00454                         );
00455                     }
00456                 }
00457             }
00458         }
00459     }
00460 
00461     /**
00462      * Page tree indexing type
00463      *
00464      * @param   array       Indexing Configuration Record
00465      * @param   array       Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
00466      * @param   array       Parameters from the log queue.
00467      * @param   object      Parent object (from "crawler" extension!)
00468      * @return  void
00469      */
00470     function crawler_execute_type4($cfgRec,&$session_data,$params,&$pObj)   {
00471 
00472             // Base page uid:
00473         $pageUid = intval($params['url']);
00474 
00475             // Get array of URLs from page:
00476         $pageRow = t3lib_BEfunc::getRecord('pages',$pageUid);
00477         $res = $pObj->getUrlsForPageRow($pageRow);
00478 
00479         $duplicateTrack = array();  // Registry for duplicates
00480         $downloadUrls = array();    // Dummy.
00481 
00482             // Submit URLs:
00483         if (count($res))    {
00484             foreach($res as $paramSetKey => $vv)    {
00485                 $urlList = $pObj->urlListFromUrlArray(
00486                     $vv,
00487                     $pageRow,
00488                     $GLOBALS['EXEC_TIME'],
00489                     30,
00490                     1,
00491                     0,
00492                     $duplicateTrack,
00493                     $downloadUrls,
00494                     array('tx_indexedsearch_reindex')
00495                 );
00496             }
00497         }
00498 
00499             // Add subpages to log now:
00500         if ($params['depth'] < $cfgRec['depth'])    {
00501 
00502                 // Subpages selected
00503             $recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
00504                 'uid,title',
00505                 'pages',
00506                 'pid = '.intval($pageUid).
00507                     t3lib_BEfunc::deleteClause('pages')
00508             );
00509 
00510                 // Traverse subpages and add to queue:
00511             if (count($recs))   {
00512                 foreach($recs as $r)    {
00513                     $this->instanceCounter++;
00514                     $url = 'pages:'.$r['uid'].': '.$r['title'];
00515                     $session_data['urlLog'][] = $url;
00516 
00517                             // Parameters:
00518                     $nparams = array(
00519                         'indexConfigUid' => $cfgRec['uid'],
00520                         'url' => $r['uid'],
00521                         'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
00522                         'depth' => $params['depth']+1
00523                     );
00524                     $pObj->addQueueEntry_callBack(
00525                         $cfgRec['set_id'],
00526                         $nparams,
00527                         $this->callBack,
00528                         $cfgRec['pid'],
00529                         $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
00530                     );
00531                 }
00532             }
00533         }
00534     }
00535 
00536     /**
00537      * Look up all old index configurations which are finished and need to be reset
00538      *
00539      * @return  void
00540      */
00541     function cleanUpOldRunningConfigurations()  {
00542 
00543             // Lookup running index configurations:
00544         $runningIndexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
00545             'uid,set_id',
00546             'index_config',
00547             'set_id!=0'.t3lib_BEfunc::deleteClause('index_config')
00548         );
00549 
00550             // For each running configuration, look up how many log entries there are which are scheduled for execution and if none, clear the "set_id" (means; Processing was DONE)
00551         foreach($runningIndexingConfigurations as $cfgRec)  {
00552 
00553                 // Look for ended processes:
00554             $queued_items = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows(
00555                 '*',
00556                 'tx_crawler_queue',
00557                 'set_id=' . intval($cfgRec['set_id']) . ' AND exec_time=0'
00558             );
00559 
00560             if (!$queued_items) {
00561 
00562                     // Lookup old phash rows:
00563                 $oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
00564                     'phash',
00565                     'index_phash',
00566                     'freeIndexUid='.intval($cfgRec['uid']).' AND freeIndexSetId!='.$cfgRec['set_id']
00567                 );
00568 
00569                 foreach($oldPhashRows as $pHashRow) {
00570                         // Removing old registrations for all tables (code copied from class.tx_indexedsearch_modfunc1.php)
00571                     $tableArr = explode(',','index_phash,index_rel,index_section,index_grlist,index_fulltext,index_debug');
00572                     foreach($tableArr as $table)    {
00573                         $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash='.intval($pHashRow['phash']));
00574                     }
00575                 }
00576 
00577                     // End process by updating index-config record:
00578                 $field_array = array (
00579                     'set_id' => 0,
00580                     'session_data' => '',
00581                 );
00582                 $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config','uid='.intval($cfgRec['uid']), $field_array);
00583             }
00584         }
00585     }
00586 
00587 
00588 
00589 
00590 
00591 
00592 
00593     /*****************************************
00594      *
00595      * Helper functions
00596      *
00597      *****************************************/
00598 
00599     /**
00600      * Check whether an input URL is allowed to be indexed: it must not contain "../", must be inside the base URL and must not be present in the URL log already.
00601      *
00602      * @param   string      URL string to check
00603      * @param   array       Array of already indexed URLs (input url is looked up here and must not exist already)
00604      * @param   string      Base URL of the indexing process (input URL must be "inside" the base URL!)
00605      * @return  string      Returns the URL if OK, otherwise nothing (NULL)
00606      */
00607     function checkUrl($url,$urlLog,$baseUrl)    {
00608         $url = preg_replace('/\/\/$/','/',$url);
00609         list($url) = explode('#',$url);
00610 
00611         if (!strstr($url,'../'))    {
00612             if (t3lib_div::isFirstPartOfStr($url,$baseUrl)) {
00613                 if (!in_array($url,$urlLog))    {
00614                     return $url;
00615                 }
00616             }
00617         }
00618     }
00619 
00620     /**
00621      * Indexing External URL
00622      *
00623      * @param   string      URL, http://....
00624      * @param   integer     Page id to relate indexing to.
00625      * @param   array       Rootline array to relate indexing to
00626      * @param   integer     Configuration UID
00627      * @param   integer     Set ID value
00628      * @return  array       URLs found on this page
00629      */
function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId) {

    // Make sure the indexer class file has been included.
    $this->loadIndexerClass();

    // Index the external URL itself.
    $indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
    $indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
    $indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
    $indexerObj->hash['phash'] = -1;    // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

    $indexerObj->indexExternalUrl($url);

    // parse_url() returns FALSE for seriously malformed URLs and leaves out
    // every component that is absent, so read each part defensively.
    $url_qParts = parse_url($url);
    if (!is_array($url_qParts)) {
        $url_qParts = array();
    }
    $scheme = isset($url_qParts['scheme']) ? $url_qParts['scheme'] : '';
    $host = isset($url_qParts['host']) ? $url_qParts['host'] : '';
    $path = isset($url_qParts['path']) ? $url_qParts['path'] : '';

    // Scheme + authority of the crawled URL. A non-default port has to be kept,
    // otherwise root-relative links would point at the wrong server.
    $baseAbsoluteHref = $scheme . '://' . $host;
    if (isset($url_qParts['port'])) {
        $baseAbsoluteHref .= ':' . $url_qParts['port'];
    }

    // Prefer a base href extracted from the fetched content; otherwise fall
    // back to the "directory" part of the crawled URL.
    $baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
    if (!$baseHref) {
        $lastSlash = strrpos($path, '/');
        $baseHref = $baseAbsoluteHref;
        if ($lastSlash !== FALSE) {
            $baseHref .= (string) substr($path, 0, $lastSlash);
        }
    }
    $baseHref = rtrim($baseHref, '/');

    // Collect the URLs linked from this page.
    $subUrls = array();
    $list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);

    foreach ($list as $linkInfo) {

        // Decode entities:
        $subUrl = t3lib_div::htmlspecialchars_decode($linkInfo['href']);

        $qParts = parse_url($subUrl);
        if (!is_array($qParts) || empty($qParts['scheme'])) {
            if (strpos($subUrl, '//') === 0) {
                // Protocol-relative link ("//host/path"): it only lacks the scheme.
                $subUrl = $scheme . ':' . $subUrl;
            } else {
                $relativeUrl = t3lib_div::resolveBackPath($subUrl);
                // strpos() instead of the removed $str{0} offset syntax; also
                // safe for an empty string.
                if (strpos($relativeUrl, '/') === 0) {
                    // Root-relative link: append to scheme + authority.
                    $subUrl = $baseAbsoluteHref . $relativeUrl;
                } else {
                    // Document-relative link: append to the base href.
                    $subUrl = $baseHref . '/' . $relativeUrl;
                }
            }
        }

        $subUrls[] = $subUrl;
    }

    return $subUrls;
}
00678 
00679     /**
00680      * Indexing Single Record
00681      *
00682      * @param   array       Record to index
00683      * @param   array       Configuration Record
00684      * @param   array       Rootline array to relate indexing to
00685      * @return  void
00686      */
function indexSingleRecord($r,$cfgRec,$rl=NULL) {

    // Make sure the indexer class file has been included.
    $this->loadIndexerClass();

    // Init: fall back to the rootline of the closest template when none was passed in.
    $rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $fieldList = t3lib_div::trimExplode(',', $cfgRec['fieldlist'], 1);
    $languageField = $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['languageField'];
    $sys_language_uid = $languageField ? $r[$languageField] : 0;

    // (Re)-Indexing a row from a table. The configured GET parameter string
    // gets the record uid substituted for the ###UID### marker.
    $indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
    parse_str(str_replace('###UID###', $r['uid'], $cfgRec['get_params']), $GETparams);
    $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, $cfgRec['chashcalc'] ? TRUE : FALSE);
    $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
    $indexerObj->forceIndexing = TRUE;

    // The first configured field becomes the title, all others are
    // concatenated into the body. Both start out empty so an empty field
    // list does not leave $theTitle undefined.
    $theTitle = '';
    $theContent = '';
    foreach ($fieldList as $k => $v) {
        if (!$k) {
            $theTitle = $r[$v];
        } else {
            $theContent .= $r[$v] . ' ';
        }
    }

    // Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid()).
    // A space is inserted before every "<" so words separated only by tags do
    // not get glued together by strip_tags().
    $indexerObj->backend_indexAsTYPO3Page(
        strip_tags(str_replace('<', ' <', $theTitle)),
        '',
        '',
        strip_tags(str_replace('<', ' <', $theContent)),
        $GLOBALS['LANG']->charSet,  // Requires that $GLOBALS['LANG'] is available here
        $r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['tstamp']],
        $r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['crdate']],
        $r['uid']
    );
}
00727 
00728     /**
00729      * Include indexer class.
00730      *
00731      * @return  void
00732      */
function loadIndexerClass()
{
    // Kept on purpose: the file below is included in this function's scope, so
    // this is the only way it can see $TYPO3_CONF_VARS as a plain variable.
    // NOTE(review): whether class.indexer.php still relies on that is not visible here.
    global $TYPO3_CONF_VARS;

    require_once t3lib_extMgm::extPath('indexed_search') . 'class.indexer.php';
}
00737 
00738     /**
00739      * Get rootline for closest TypoScript template root.
00740      * Algorithm same as used in Web > Template, Object browser
00741      *
00742      * @param   integer     The page id to traverse rootline back from
00743      * @return  array       Array where the root lines uid values are found.
00744      */
function getUidRootLineForClosestTemplate($id)
{
    global $TYPO3_CONF_VARS;

    // Template parser, with time-performance logging switched off.
    $templateParser = t3lib_div::makeInstance('t3lib_tsparser_ext');
    $templateParser->tt_track = 0;
    $templateParser->init();

    // Fetch the page rootline for $id and run the templates along it; the
    // parser exposes the resulting rootline as ->rootLine afterwards.
    $pageSelect = t3lib_div::makeInstance('t3lib_pageSelect');
    $pageRootLine = $pageSelect->getRootLine($id);
    $templateParser->runThroughTemplates($pageRootLine, 0);

    // Reduce every rootline entry to its uid, keeping the original keys.
    $uids = array();
    foreach ($templateParser->rootLine as $level => $pageRow) {
        $uids[$level] = $pageRow['uid'];
    }

    return $uids;
}
00765 
00766     /**
00767      * Generate the unix time stamp for next visit.
00768      *
00769      * @param   array       Index configuration record
00770      * @return  integer     The next time stamp
00771      */
function generateNextIndexingTime($cfgRec) {
    $currentTime = $GLOBALS['EXEC_TIME'];

    // Find a midnight to use as the base for the offset calculation.
    // - Frequency of at most one day: which day is used does not matter, so
    //   simply take yesterday's midnight.
    // - Frequency of more than one day: use the midnight of the previously
    //   scheduled run (or of "now" if there is none), so the originally
    //   entered day - e.g. the weekday of a "Weekly" schedule - is respected
    //   regardless of when the script is run.
    if ($cfgRec['timer_frequency'] <= 24 * 3600) {
        $aMidNight = mktime(0, 0, 0) - 24 * 3600;
    } else {
        $lastTime = $cfgRec['timer_next_indexing'] ? $cfgRec['timer_next_indexing'] : $GLOBALS['EXEC_TIME'];
        // Four-digit year ('Y'): a two-digit year would depend on mktime()'s
        // 70/69 century pivot to be interpreted correctly.
        $aMidNight = mktime(0, 0, 0, (int) date('n', $lastTime), (int) date('j', $lastTime), (int) date('Y', $lastTime));
    }

    // Last sure offset time, and the frequency in seconds (at least 1, which
    // also protects the division below).
    $lastSureOffset = $aMidNight + t3lib_div::intInRange($cfgRec['timer_offset'], 0, 86400);
    $frequencySeconds = t3lib_div::intInRange($cfgRec['timer_frequency'], 1);

    // Number of whole frequency blocks needed to get from the offset to the current time or later:
    $frequencyBlocksUntilNextTime = ceil(($currentTime - $lastSureOffset) / $frequencySeconds);

    // Next time = offset + that many blocks of the frequency length.
    $nextTime = $lastSureOffset + $frequencyBlocksUntilNextTime * $frequencySeconds;

    return $nextTime;
}
00795 
00796     /**
00797      * Checks whether $url begins with any of the URLs in the $url_deny "list" and, if so, returns TRUE.
00798      *
00799      * @param   string      URL to test
00800      * @param   string      String where URLs are separated by line-breaks; If any of these strings is the first part of $url, the function returns TRUE (to indicate denial of descent)
00801      * @return  boolean     TRUE if there is a matching URL (hence, do not index!)
00802      */
function checkDeniedSuburls($url, $url_deny) {
    // An empty or whitespace-only deny list never denies anything.
    if (!trim($url_deny)) {
        return FALSE;
    }

    // One denied URL prefix per line; blank lines are dropped.
    $url_denyArray = t3lib_div::trimExplode(LF, $url_deny, 1);
    foreach ($url_denyArray as $testurl) {
        // Denied as soon as $url starts with one of the listed prefixes.
        // (The former debug "echo" of every match was removed here.)
        if (t3lib_div::isFirstPartOfStr($url, $testurl)) {
            return TRUE;
        }
    }

    return FALSE;
}
00815 
00816     /**
00817      * Adding entry in queue for Hook
00818      *
00819      * @param   array       Configuration record
00820      * @param   string      Title/URL
00821      * @return  void
00822      */
function addQueueEntryForHook($cfgRec, $title)
{
    $configUid = $cfgRec['uid'];

    // Parameters handed to the callback when the queue entry is executed.
    $nparams = array();
    // This must ALWAYS be the cfgRec uid!
    $nparams['indexConfigUid'] = $configUid;
    $nparams['url'] = $title;
    // Informational only: shows that an indexing configuration added the entry.
    $nparams['procInstructions'] = array('[Index Cfg UID#' . $configUid . ']');

    $this->pObj->addQueueEntry_callBack(
        $cfgRec['set_id'],
        $nparams,
        $this->callBack,
        $cfgRec['pid']
    );
}
00832 
00833     /**
00834      * Deletes all data stored by indexed search for a given page
00835      *
00836      * @param   integer     Uid of the page to delete all pHash
00837      * @return  void
00838      */
function deleteFromIndex($id) {

    // Look up the phash values recorded for this page in index_section.
    $oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('phash', 'index_section', 'page_id=' . intval($id));

    // Nothing to do when the lookup gave no rows. The is_array() check also
    // covers a non-array result, on which count()/foreach must not be used.
    if (!is_array($oldPhashRows) || !count($oldPhashRows)) {
        return;
    }

    $pHashesToDelete = array();
    foreach ($oldPhashRows as $pHashRow) {
        $pHashesToDelete[] = $pHashRow['phash'];
    }

    // Remove every row carrying one of these phash values from all index tables.
    $where_clause = 'phash IN (' . implode(',', $GLOBALS['TYPO3_DB']->cleanIntArray($pHashesToDelete)) . ')';
    $tables = explode(',', 'index_debug,index_fulltext,index_grlist,index_phash,index_rel,index_section');
    foreach ($tables as $table) {
        $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, $where_clause);
    }
}
00857 
00858 
00859 
00860 
00861 
00862 
00863 
00864     /*************************
00865      *
00866      * Hook functions for TCEmain (indexing of records)
00867      *
00868      *************************/
00869 
00870     /**
00871      * TCEmain hook function that removes a page's data from the index when the page is deleted
00872      *
00873      * @param   string      TCEmain command
00874      * @param   string      Table name
00875      * @param   string      Record ID. If new record its a string pointing to index inside t3lib_tcemain::substNEWwithIDs
00876      * @param   mixed       Target value (ignored)
00877      * @param   object      Reference to tcemain calling object
00878      * @return  void
00879      */
function processCmdmap_preProcess($command, $table, $id, $value, $pObj)
{
    // Only the deletion of a page is of interest here.
    if ($command != 'delete' || $table != 'pages') {
        return;
    }

    // Clean up the index for the page that is about to be deleted.
    $this->deleteFromIndex($id);
}
00887 
00888     /**
00889      * TCEmain hook function for on-the-fly indexing of database records
00890      *
00891      * @param   string      Status "new" or "update"
00892      * @param   string      Table name
00893      * @param   string      Record ID. If new record its a string pointing to index inside t3lib_tcemain::substNEWwithIDs
00894      * @param   array       Field array of updated fields in the operation
00895      * @param   object      Reference to tcemain calling object
00896      * @return  void
00897      */
function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj)
{
    // No fields were actually written: nothing to do.
    if (!count($fieldArray)) {
        return;
    }

    if ($status == 'new') {
        // Translate the placeholder id of a new record into its real uid.
        $id = $pObj->substNEWwithIDs[$id];
    } elseif ($table == 'pages' && $status == 'update') {
        // A page that was just set to hidden or excluded from search loses its index data.
        $nowHidden = array_key_exists('hidden', $fieldArray) && $fieldArray['hidden'] == 1;
        $nowExcluded = array_key_exists('no_search', $fieldArray) && $fieldArray['no_search'] == 1;
        if ($nowHidden || $nowExcluded) {
            $this->deleteFromIndex($id);
        }
    }

    // Fetch the full record; without it there is nothing to index.
    $currentRecord = t3lib_BEfunc::getRecord($table, $id);
    if (!is_array($currentRecord)) {
        return;
    }

    // Select all (not running) indexing configurations of type "record" (1)
    // which point to this table, are flagged for index-on-change, and are
    // either located on the record's page or name it as alternative source pid.
    $whereClause = 'hidden=0
                        AND (starttime=0 OR starttime<=' . $GLOBALS['EXEC_TIME'] . ')
                        AND set_id=0
                        AND type=1
                        AND table2index='.$GLOBALS['TYPO3_DB']->fullQuoteStr($table,'index_config').'
                        AND (
                                (alternative_source_pid=0 AND pid='.intval($currentRecord['pid']).')
                                OR (alternative_source_pid='.intval($currentRecord['pid']).')
                            )
                        AND records_indexonchange=1
                        '.t3lib_BEfunc::deleteClause('index_config');
    $indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('*', 'index_config', $whereClause);

    // Index the record once per matching configuration.
    foreach ($indexingConfigurations as $cfgRec) {
        $this->indexSingleRecord($currentRecord, $cfgRec);
    }
}
00940 }
00941 
00942 
00943 /**
00944  * Crawler hook for indexed search. Works with the "crawler" extension
00945  * This hook is specifically used to index external files found on pages through the crawler extension.
00946  *
00947  * @author  Kasper Skårhøj <kasperYYYY@typo3.com>
00948  * @package TYPO3
00949  * @subpackage tx_indexedsearch
00950  * @see tx_indexedsearch_indexer::extractLinks()
00951  */
class tx_indexedsearch_files {

    /**
     * Call back function for execution of a log element.
     *
     * Indexes the document named in $params['document']. When
     * $params['alturl'] is set, the content is read from that alternative
     * location and the file extension is taken from the document name.
     *
     * @param   array       Params from log element. Keys read here: 'conf' (indexer configuration array), 'document', 'alturl' (optional).
     * @param   object      Parent object (tx_crawler lib)
     * @return  array       Result array; nothing (NULL) is returned when $params['conf'] is not an array
     */
    function crawler_execute($params,&$pObj)    {

            // Load indexer if not yet.
        $this->loadIndexerClass();

        if (is_array($params['conf']))  {

                // Initialize the indexer class:
            $indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
            $indexerObj->conf = $params['conf'];
            $indexerObj->init();

                // Index document:
            if (!empty($params['alturl']))  {
                    // PATHINFO_EXTENSION gives '' for a name without extension,
                    // instead of a missing array key.
                $ext = strtolower(pathinfo($params['document'], PATHINFO_EXTENSION));
                $indexerObj->indexRegularDocument($params['alturl'], TRUE, $params['document'], $ext);
            } else {
                $indexerObj->indexRegularDocument($params['document'], TRUE);
            }

                // Return OK:
            return array('content' => array());
        }
    }

    /**
     * Include indexer class.
     *
     * @return  void
     */
    function loadIndexerClass() {
            // The include below runs in this function's scope; the global makes
            // $TYPO3_CONF_VARS visible to the included file.
        global $TYPO3_CONF_VARS;
        require_once(t3lib_extMgm::extPath('indexed_search').'class.indexer.php');
    }
}
00997 
00998 
// XCLASS support: when an extension class file is registered for this script
// under the current TYPO3_MODE, include it (once).
if (defined('TYPO3_MODE')) {
    if (isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php'])) {
        include_once $GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php'];
    }
}
01002 
01003 ?>