TYPO3 API  SVNRelease
class.crawlerhook.php
Go to the documentation of this file.
00001 <?php
00002 /***************************************************************
00003 *  Copyright notice
00004 *
00005 *  (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
00006 *  All rights reserved
00007 *
00008 *  This script is part of the TYPO3 project. The TYPO3 project is
00009 *  free software; you can redistribute it and/or modify
00010 *  it under the terms of the GNU General Public License as published by
00011 *  the Free Software Foundation; either version 2 of the License, or
00012 *  (at your option) any later version.
00013 *
00014 *  The GNU General Public License can be found at
00015 *  http://www.gnu.org/copyleft/gpl.html.
00016 *  A copy is found in the textfile GPL.txt and important notices to the license
00017 *  from the author is found in LICENSE.txt distributed with these scripts.
00018 *
00019 *
00020 *  This script is distributed in the hope that it will be useful,
00021 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00022 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00023 *  GNU General Public License for more details.
00024 *
00025 *  This copyright notice MUST APPEAR in all copies of the script!
00026 ***************************************************************/
00027 /**
00028  * Index search crawler hook example
00029  *
00030  * $Id: class.crawlerhook.php 10120 2011-01-18 20:03:36Z ohader $
00031  *
00032  * @author  Kasper Skårhøj <kasperYYYY@typo3.com>
00033  */
00034 /**
00035  * [CLASS/FUNCTION INDEX of SCRIPT]
00036  *
00037  *
00038  *
00039  *   57: class tx_indexedsearch_crawlerhook
00040  *   64:     function initMessage()
00041  *   80:     function indexOperation($cfgRec,&$session_data,$params,&$pObj)
00042  *
00043  * TOTAL FUNCTIONS: 2
00044  * (This index is automatically created/updated by the extension "extdeveval")
00045  *
00046  */
00047 
00048 
00049 
00050 /**
00051  * Index search crawler hook example
00052  *
00053  * @package TYPO3
00054  * @subpackage tx_indexedsearch
00055  * @author  Kasper Skårhøj <kasperYYYY@typo3.com>
00056  */
class tx_indexedsearch_crawlerhook {

    /**
     * Number of indexing steps carried out during one example session.
     */
    const TOTAL_STEPS = 3;

    /**
     * Function is called when an indexing session starts according to the time intervals set for the indexing configuration.
     *
     * @return  string      Text string for the first, initiating queue entry for the crawler.
     */
    public function initMessage() {
        return 'Start of Custom Example Indexing session!';
    }

    /**
     * Carries out one step of the example indexing session. This does two things:
     * 1) Carry out actual indexing of content for the current step (one or more items)
     * 2) If further steps remain, add a new entry to the crawler queue so this hook is called again
     *    (another instance) for the next step. When the last step is done, no entry is added.
     *
     * The step counter is kept in $session_data['step'] and increased by one on every call.
     *
     * @param   array       Indexing Configuration Record (the record which holds the information that lead to this indexing session). The keys "pid", "uid" and "set_id" are read.
     * @param   array       Session data variable. Passed by reference. Changed content is saved and passed back upon next instance in the session.
     * @param   array       Params array from the queue entry. Not used by this example.
     * @param   object      Parent object. Must provide loadIndexerClass(), getUidRootLineForClosestTemplate() and addQueueEntryForHook() (originally documented as the object from the "indexed_search" extension).
     * @return  void
     */
    public function indexOperation($cfgRec, &$session_data, $params, &$pObj) {

            // Init session data array if not already (also covers an array that lacks the step counter):
        if (!is_array($session_data)) {
            $session_data = array();
        }
        if (!isset($session_data['step'])) {
            $session_data['step'] = 0;
        }

            // Increase step counter (this is just an example of how the session data can be used - to track how many instances of indexing are left)
        $session_data['step']++;
        $step = (int)$session_data['step'];

        if ($step >= 1 && $step <= self::TOTAL_STEPS) {

                // Every call is a separate instance, so the shared setup must be done for each step
                // and cannot be carried over in local variables from an earlier step.

                // Load indexer if not yet [DON'T CHANGE]:
            $pObj->loadIndexerClass();

                // Get rootline from the Indexing Record (needed because the indexer relates all search results to a position in the page tree!) [DON'T CHANGE]:
            $rl = $pObj->getUidRootLineForClosestTemplate($cfgRec['pid']);

                // Set up language uid, if any:
            $sys_language_uid = 0;

            switch ($step) {
                case 1:
                    $this->indexExampleItems($cfgRec, $rl, $sys_language_uid);
                break;
                case 2:
                    $this->indexExampleFile($cfgRec, $rl, $sys_language_uid);
                break;
                case 3:
                    $this->indexExampleExternalUrl($cfgRec, $rl, $sys_language_uid);
                break;
            }
        }

            // Finally, set entry for next indexing instance (only if steps remain, so no empty instance is queued after the last step)
        if ($step < self::TOTAL_STEPS) {
                // Just information field, shown in the "crawler" log. It names the step that the queued instance will carry out.
            $title = 'Step #' . ($step + 1) . ' of ' . self::TOTAL_STEPS;
            $pObj->addQueueEntryForHook($cfgRec, $title);
        }
    }

    /**
     * Indexing Example (step 1): Content accessed with GET parameters added to URL.
     *
     * @param   array       Indexing Configuration Record ("pid", "uid" and "set_id" are read)
     * @param   mixed       Rootline value as returned by getUidRootLineForClosestTemplate()
     * @param   integer     Language uid to index for
     * @return  void
     */
    protected function indexExampleItems($cfgRec, $rl, $sys_language_uid) {

            // The example items have no real modification / creation dates, so the current time is used for both.
        $now = time();

            // Set up 2 example items to index:
        $exampleItems = array(
            array(
                'ID' => '123',
                'title' => 'Title of Example 1',
                'content' => 'Vestibulum leo turpis, fringilla sit amet, semper eget, vestibulum ut, arcu. Vestibulum mauris orci, vulputate quis, congue eget, nonummy',
                'tstamp' => $now,
                'create_date' => $now
            ),
            array(
                'ID' => 'example2',
                'title' => 'Title of Example 2',
                'content' => 'Cras tortor turpis, vulputate non, accumsan a, pretium in, magna. Cras turpis turpis, pretium pulvinar, pretium vel, nonummy eu.',
                'tstamp' => $now,
                'create_date' => $now
            )
        );

            // For each item, index it (this is what you might like to do in batches of like 100 items if all your content spans thousands of items!)
        foreach ($exampleItems as $item) {

                // Prepare the GET variables array that must be added to the page URL in order to view result:
            $GETparams = array();
            parse_str('&itemID=' . rawurlencode($item['ID']), $GETparams);

                // Prepare indexer (make instance, initialize it, set special features for indexing parameterized content - probably none of this should be changed by you) [DON'T CHANGE]:
            $indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
            $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, FALSE);
            $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
            $indexerObj->forceIndexing = TRUE;

                // Indexing the content of the item (see tx_indexedsearch_indexer::backend_indexAsTYPO3Page() for options)
            $indexerObj->backend_indexAsTYPO3Page(
                $item['title'],
                '',
                '',
                $item['content'],
                $GLOBALS['LANG']->charSet,  // Charset of content - MUST be set.
                $item['tstamp'],            // Last-modified date
                $item['create_date'],       // Created date
                $item['ID']
            );
        }
    }

    /**
     * Indexing Example (step 2): Content accessed directly in file system.
     *
     * @param   array       Indexing Configuration Record ("pid", "uid" and "set_id" are read)
     * @param   mixed       Rootline value as returned by getUidRootLineForClosestTemplate()
     * @param   integer     Language uid to index for
     * @return  void
     */
    protected function indexExampleFile($cfgRec, $rl, $sys_language_uid) {

            // Prepare indexer (make instance, initialize it - probably none of this should be changed by you) [DON'T CHANGE]:
        $indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
        $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl);
        $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
        $indexerObj->hash['phash'] = -1;    // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

            // Index document:
        $indexerObj->indexRegularDocument('fileadmin/templates/index.html', TRUE);
    }

    /**
     * Indexing Example (step 3): Content accessed on External URLs.
     *
     * @param   array       Indexing Configuration Record ("pid", "uid" and "set_id" are read)
     * @param   mixed       Rootline value as returned by getUidRootLineForClosestTemplate()
     * @param   integer     Language uid to index for
     * @return  void
     */
    protected function indexExampleExternalUrl($cfgRec, $rl, $sys_language_uid) {

            // Prepare indexer:
        $indexerObj = t3lib_div::makeInstance('tx_indexedsearch_indexer');
        $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl);
        $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
        $indexerObj->hash['phash'] = -1;    // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

            // Index external URL (HTML only):
        $indexerObj->indexExternalUrl('http://www.google.com/');
    }
}
00187 
00188 
// XCLASS support: when running inside TYPO3 and an extension class file is registered
// for this script in TYPO3_CONF_VARS, include that file (at most once).
if (defined('TYPO3_MODE')) {
    if (isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/example/class.crawlerhook.php'])) {
        include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/example/class.crawlerhook.php']);
    }
}
00192 
00193 ?>