00001 <?php
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136 class tx_indexedsearch_indexer {
00137
00138
/**
 * Log texts keyed by the numeric result of the mtime/timestamp check.
 * Looked up as $this->reasons[$check] when a log message explains why a
 * document was or was not indexed.
 */
var $reasons = array(
	-1 => 'mtime matched the document, so no changes detected and no content updated',
	-2 => 'The minimum age was not exceeded',
	1 => "The configured max-age was exceeded for the document and thus it's indexed.",
	2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
	3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
	4 => 'Page has never been indexed (is not represented in the index_phash table).'
);

// Comma separated list of tag names; splitHTMLContent() removes these elements from the body before indexing.
var $excludeSections = 'script,style';

// Parser objects keyed by file extension; filled by initializeExternalParsers().
var $external_parsers = array();

// NOTE(review): not referenced in the visible part of the class - presumably the default group list; confirm usage.
var $defaultGrList = '0,-1';

// Age limits set by init() from the "minAge"/"maxAge" extension settings multiplied by 3600.
var $tstamp_maxAge = 0;
var $tstamp_minAge = 0;
// Limit set by init() from "maxExternalFiles"; compared against $externalFileCounter in indexRegularDocument().
var $maxExternalFiles = 0;

// Both flags are switched on by hook_indexContent() when a crawler run requests re-indexing.
var $forceIndexing = FALSE;
var $crawlerActive = FALSE;

// Template of the content parts every splitter starts from.
var $defaultContentArray=array(
	'title' => '',
	'description' => '',
	'keywords' => '',
	'body' => '',
);
// Incremented once per analyzed word by analyzeHeaderinfo() and analyzeBody().
var $wordcount = 0;
// Incremented by indexRegularDocument() for every external file that actually gets indexed.
var $externalFileCounter = 0;

// Settings of the current indexing job (page id, type, content, mtime, ...).
var $conf = array();
// Unserialized extension configuration; loaded in init().
var $indexerConfig = array();
// Hash values of the current page; 'phash' is read throughout indexTypo3PageContent().
var $hash = array();
// Hash values of the external file currently processed; set in indexRegularDocument().
var $file_phash_arr = array();
// Content parts (title, description, keywords, body) of the current page.
var $contentParts = array();
// Hash of the concatenated content parts of the current page.
var $content_md5h = '';
// Reset for every file part in indexRegularDocument(); NOTE(review): written elsewhere, presumably by the log_* methods.
var $internal_log = array();
// Raw content of the last external URL fetched by indexExternalUrl().
var $indexExternalUrl_content = '';

// cHash parameter array derived from $this->conf['cHash_array'] in init().
var $cHashParams = array();

// NOTE(review): not referenced in the visible part of the class - presumably used for word frequency calculation; confirm.
var $freqRange = 32000;
var $freqMax = 0.1;







// Charset conversion object (t3lib_cs instance created in init()).
var $csObj;






// Optional metaphone object; only created in init() when one is configured.
var $metaphoneObj;






// Lexer object used to split content into words; created in init().
var $lexerObj;
00210
00211
00212
00213
00214
00215
00216
00217
00218
/**
 * Hook called for a rendered frontend page: decides whether the page may be
 * indexed, collects the indexing settings from the page object and starts
 * the indexing.
 *
 * A page is only indexed when "index_enable" is set in its configuration,
 * frontend indexing is not disabled (or a crawler run is active), the
 * "no_search" flag is not set, the page is cacheable and the content
 * language equals the requested language. Every rejection is logged.
 *
 * @param	object		Parent object; NOTE(review): presumably the TSFE instance - only its public members are read here
 * @return	void
 */
function hook_indexContent(&$pObj) {

		// The extension configuration is stored serialized in TYPO3_CONF_VARS
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

		// A running crawler that carries the "tx_indexedsearch_reindex" processing instruction forces re-indexing
	$reindexRequested = t3lib_extMgm::isLoaded('crawler')
		&& $pObj->applicationData['tx_crawler']['running']
		&& in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions']);
	if ($reindexRequested) {
		$pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';

		$this->crawlerActive = TRUE;
		$this->forceIndexing = TRUE;
	}

		// Nothing to do (and nothing to log) when indexing is not enabled for the page
	if (!$pObj->config['config']['index_enable']) {
		return;
	}

	$this->log_push('Index page','');

	if ($indexerConfig['disableFrontendIndexing'] && !$this->crawlerActive) {
		$this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
	} elseif ($pObj->page['no_search']) {
		$this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
	} elseif ($pObj->no_cache) {
		$this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
	} elseif (strcmp($pObj->sys_language_uid,$pObj->sys_language_content)) {
		$this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
	} else {
			// Uids of all pages in the rootline, keyed like the rootline itself
		$rootlineUids = array();
		foreach ($pObj->config['rootLine'] as $rlkey => $rldat) {
			$rootlineUids[$rlkey] = $rldat['uid'];
		}

		$this->conf = array(
				// Page identification
			'id' => $pObj->id,
			'type' => $pObj->type,
			'sys_language_uid' => $pObj->sys_language_uid,
			'MP' => $pObj->MP,
			'gr_list' => $pObj->gr_list,

				// cHash and the parameters it was built from
			'cHash' => $pObj->cHash,
			'cHash_array' => $pObj->cHash_array,

			'crdate' => $pObj->page['crdate'],
			'page_cache_reg1' => $pObj->page_cache_reg1,

			'rootline_uids' => $rootlineUids,

				// Content and its meta data
			'content' => $pObj->content,
			'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),
			'metaCharset' => $pObj->metaCharset,
			'mtime' => $pObj->register['SYS_LASTCHANGED'],

				// Indexing options from the page configuration; meta tags are indexed unless configured otherwise
			'index_externals' => $pObj->config['config']['index_externals'],
			'index_descrLgd' => $pObj->config['config']['index_descrLgd'],
			'index_metatags' => isset($pObj->config['config']['index_metatags']) ? $pObj->config['config']['index_metatags'] : true,

				// Not a record / free index configuration
			'recordUid' => 0,
			'freeIndexUid' => 0,
			'freeIndexSetId' => 0,
		);

		$this->init();
		$this->indexTypo3PageContent();
	}

	$this->log_pull();
}
00295
00296
00297
00298
00299
00300
00301
00302
00303
00304
00305
00306
00307
00308
00309
00310
00311
00312
00313
00314
00315
00316
00317
00318
00319
00320
/**
 * Prepares the indexer for indexing content from outside the frontend
 * rendering: fills $this->conf with the given page identification plus fixed
 * defaults and calls init().
 *
 * @param	integer		Page id stored as 'id'
 * @param	integer		Page type stored as 'type'
 * @param	integer		Language uid stored as 'sys_language_uid'
 * @param	string		Mount point value stored as 'MP'
 * @param	array		Rootline uids stored as 'rootline_uids'
 * @param	array		Parameters stored as 'cHash_array'
 * @param	boolean		If set, a cHash value is generated from $cHash_array; otherwise 'cHash' is an empty string
 * @return	void
 */
function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE) {

	$cHash = '';
	if ($createCHash) {
		$cHash = t3lib_div::generateCHash(t3lib_div::implodeArrayForUrl('', $cHash_array));
	}

	$this->conf = array(
			// Page identification; the group list is fixed to '0,-1'
		'id' => $id,
		'type' => $type,
		'sys_language_uid' => $sys_language_uid,
		'MP' => $MP,
		'gr_list' => '0,-1',

			// cHash and its parameters
		'cHash' => $cHash,
		'cHash_array' => $cHash_array,

			// Defaults; the free index values can be changed with backend_setFreeIndexUid()
		'freeIndexUid' => 0,
		'freeIndexSetId' => 0,
		'page_cache_reg1' => '',

		'rootline_uids' => $uidRL,

			// Fixed indexing options
		'index_externals' => 1,
		'index_descrLgd' => 200,
		'index_metatags' => true,
	);

	$this->init();
}
00353
00354
00355
00356
00357
00358
00359
00360
/**
 * Stores the free index uid and set id in the current indexing
 * configuration ($this->conf).
 *
 * @param	integer		Value stored as $this->conf['freeIndexUid']
 * @param	integer		Value stored as $this->conf['freeIndexSetId'] (defaults to 0)
 * @return	void
 */
function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0) {
	$conf =& $this->conf;

	$conf['freeIndexUid'] = $freeIndexUid;
	$conf['freeIndexSetId'] = $freeIndexSetId;
}
00365
00366
00367
00368
00369
00370
00371
00372
00373
00374
00375
00376
00377
00378
/**
 * Indexes arbitrary content as if it was a TYPO3 page: wraps the given
 * values in a minimal HTML document, stores it in $this->conf and runs
 * indexTypo3PageContent(). All values are passed through htmlspecialchars()
 * before they are put into the document.
 *
 * @param	string		Title, placed in the <title> element
 * @param	string		Keywords, placed in the "keywords" meta tag
 * @param	string		Description, placed in the "description" meta tag
 * @param	string		Content, placed in the <body> element
 * @param	string		Charset of the values; stored as 'metaCharset'
 * @param	integer		Modification time stored as 'mtime'
 * @param	integer		Creation time stored as 'crdate'
 * @param	integer		Record uid stored as 'recordUid'
 * @return	void
 */
function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) {

	$escapedTitle = htmlspecialchars($title);
	$escapedKeywords = htmlspecialchars($keywords);
	$escapedDescription = htmlspecialchars($description);
	$escapedContent = htmlspecialchars($content);

		// Time stamps and record relation
	$this->conf['mtime'] = $mtime;
	$this->conf['crdate'] = $crdate;
	$this->conf['recordUid'] = $recordUid;

		// Minimal HTML document the page splitter can work on
	$this->conf['content'] = '
<html>
<head>
<title>'.$escapedTitle.'</title>
<meta name="keywords" content="'.$escapedKeywords.'" />
<meta name="description" content="'.$escapedDescription.'" />
</head>
<body>
'.$escapedContent.'
</body>
</html>';

		// The title is taken from the <title> element, so no separate document title is set
	$this->conf['metaCharset'] = $charset;
	$this->conf['indexedDocTitle'] = '';

	$this->indexTypo3PageContent();
}
00406
00407
00408
00409
00410
00411
00412
00413
00414
00415
00416
00417
00418
00419
00420
00421
00422
00423
00424
00425
00426
00427
00428
00429
/**
 * Initializes the indexer from $this->conf and the extension configuration:
 * cHash parameters, hashes, age/file limits, external parsers, lexer,
 * optional metaphone object and the charset conversion object.
 *
 * @return	void
 */
function init() {

		// cHash parameters: add the cHash itself (when set) and drop the encryption key
	$this->cHashParams = $this->conf['cHash_array'];
	if (is_array($this->cHashParams) && count($this->cHashParams)) {
		if ($this->conf['cHash']) {
			$this->cHashParams['cHash'] = $this->conf['cHash'];
		}
		unset($this->cHashParams['encryptionKey']);
	}

	$this->setT3Hashes();

		// Limits from the extension configuration; minAge/maxAge are multiplied by 3600
	$this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	$this->tstamp_minAge = t3lib_div::intInRange($this->indexerConfig['minAge']*3600,0);
	$this->tstamp_maxAge = t3lib_div::intInRange($this->indexerConfig['maxAge']*3600,0);
	$this->maxExternalFiles = t3lib_div::intInRange($this->indexerConfig['maxExternalFiles'],0,1000,5);
	$this->flagBitMask = t3lib_div::intInRange($this->indexerConfig['flagBitMask'],0,255);

		// External parsers are only needed when external files are to be indexed
	if ($this->conf['index_externals']) {
		$this->initializeExternalParsers();
	}

		// Lexer: a configured one, otherwise the one shipped with the extension
	if ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer']) {
		$lexerObjRef = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'];
	} else {
		$lexerObjRef = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	}
	$this->lexerObj = t3lib_div::getUserObj($lexerObjRef);
	$this->lexerObj->debug = $this->indexerConfig['debugMode'];

		// Metaphone object is optional
	if ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']) {
		$this->metaphoneObj = t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']);
		$this->metaphoneObj->pObj = $this;
	}

		// Charset conversion object
	$this->csObj = t3lib_div::makeInstance('t3lib_cs');
}
00474
00475
00476
00477
00478
00479
00480
00481
/**
 * Creates the parser objects for external files. The parsers are registered
 * per file extension in
 * $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'].
 * A parser is only kept in $this->external_parsers when its initParser()
 * method returns a true value.
 *
 * References that do not resolve to an object are skipped; calling methods on
 * them would otherwise abort the whole request with a fatal error.
 *
 * @return	void
 */
function initializeExternalParsers() {
	global $TYPO3_CONF_VARS;

	if (!isset($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'])
		|| !is_array($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'])) {
		return;
	}

	foreach ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef) {
		$parser = t3lib_div::getUserObj($_objRef);

			// Misconfigured reference: make sure no stale parser stays registered for the extension
		if (!is_object($parser)) {
			unset($this->external_parsers[$extension]);
			continue;
		}

			// Register before initParser() is called, so the parser sees itself through its parent object
		$parser->pObj = $this;
		$this->external_parsers[$extension] = $parser;

			// A parser that cannot be initialized is removed again
		if (!$parser->initParser($extension)) {
			unset($this->external_parsers[$extension]);
		}
	}
}
00497
00498
00499
00500
00501
00502
00503
00504
00505
00506
00507
00508
00509
00510
00511
00512
00513
00514
00515
00516
00517
00518
00519
00520
00521
00522
/**
 * Indexes the page content found in $this->conf['content'].
 *
 * Indexing takes place when the mtime/timestamp check returns a positive
 * code, when no gr_list entry exists for the page hash or when indexing is
 * forced. If the content hash is already known (and the check result is not
 * 1), only timestamp, set id, gr_list and rootline are updated.
 *
 * Fix compared to the former version: implode() is called with the separator
 * first; the reversed argument order is no longer accepted by PHP 8.
 *
 * @return	void
 */
function indexTypo3PageContent() {

	$check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
	$is_grlist = $this->is_grlist_set($this->hash['phash']);

	if (!($check > 0 || !$is_grlist || $this->forceIndexing)) {
		$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		return;
	}

		// Log the reason for indexing
	if ($this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing needed, reason: Forced',1);
	} elseif ($check > 0) {
		$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
	} else {
		$this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
	}

		// Split the HTML into title, keywords, description and body; an explicit document title wins
	$this->log_push('Split content','');
	$this->contentParts = $this->splitHTMLContent($this->conf['content']);
	if ($this->conf['indexedDocTitle']) {
		$this->contentParts['title'] = $this->conf['indexedDocTitle'];
	}
	$this->log_pull();

		// Hash of the complete content
	$this->content_md5h = $this->md5inthash(implode('', $this->contentParts));

		// The content hash check returns an array when the content is already indexed
	$checkCHash = $this->checkContentHash();
	if (is_array($checkCHash) && $check !== 1) {
			// Content unchanged: only refresh the meta information
		$this->updateTstamp($this->hash['phash'],$this->conf['mtime']);
		$this->updateSetId($this->hash['phash']);
		$this->update_grlist($checkCHash['phash'],$this->hash['phash']);
		$this->updateRootline();
		$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
		return;
	}

	$Pstart = t3lib_div::milliseconds();

	$this->log_push('Converting charset of content ('.$this->conf['metaCharset'].') to utf-8','');
	$this->charsetEntity2utf8($this->contentParts,$this->conf['metaCharset']);
	$this->log_pull();

	$this->log_push('Extract words from content','');
	$splitInWords = $this->processWordsInArrays($this->contentParts);
	$this->log_pull();

	$this->log_push('Analyse the extracted words','');
	$indexArr = $this->indexAnalyze($splitInWords);
	$this->log_pull();

	$this->log_push('Submitting page','');
	$this->submitPage();
	$this->log_pull();

	$this->log_push('Check word list and submit words','');
	$this->checkWordList($indexArr);
	$this->submitWords($indexArr,$this->hash['phash']);
	$this->log_pull();

		// Store the time spent on parsing; external files are not part of it
	$this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$Pstart);

	$this->log_push('Checking external files','');
	if ($this->conf['index_externals']) {
		$this->extractLinks($this->conf['content']);
	}
	$this->log_pull();
}
00602
00603
00604
00605
00606
00607
00608
00609
/**
 * Splits an HTML document into the parts title, keywords, description and
 * body.
 *
 * Everything from the first "<body" on is the body, the text before it is the
 * head. If the title contains a colon, only the text after the first colon is
 * used. Keywords and description are collected from meta tags (only if
 * 'index_metatags' is set in $this->conf), each value prefixed with a comma.
 * The body is reduced to the TYPO3SEARCH marked sections, the sections listed
 * in $this->excludeSections are removed and finally all tags are stripped.
 *
 * @param	string		HTML content
 * @return	array		Content parts with the keys of $this->defaultContentArray
 */
function splitHTMLContent($content) {

	$parts = $this->defaultContentArray;
	$parts['body'] = stristr($content,'<body');
	$head = substr($content,0,-strlen($parts['body']));

		// Title: text after the first colon, or the whole title if there is none
	$this->embracingTags($head,'TITLE',$parts['title'],$unusedAfter,$unusedParams);
	$titleSegments = explode(':',$parts['title'],2);
	if (isset($titleSegments[1])) {
		$parts['title'] = trim($titleSegments[1]);
	} else {
		$parts['title'] = trim($titleSegments[0]);
	}

		// Meta tags: first collect the attribute strings of all meta tags, then evaluate them
	if ($this->conf['index_metatags']) {
		$metaTags = array();
		while ($this->embracingTags($head,'meta',$unusedContent,$head,$rawAttributes)) {
			$metaTags[] = $rawAttributes;
		}
		foreach ($metaTags as $rawAttributes) {
			$attributes = t3lib_div::get_tag_attributes($rawAttributes);
			if (stristr($attributes['name'], 'keywords')) {
				$parts['keywords'] .= ',' . $this->addSpacesToKeywordList($attributes['content']);
			}
			if (stristr($attributes['name'], 'description')) {
				$parts['description'] .= ',' . $attributes['content'];
			}
		}
	}

		// Reduce the body to the sections marked for indexing
	$this->typoSearchTags($parts['body']);

		// Remove excluded sections, one tag name at a time, until none is left
	foreach (explode(',',$this->excludeSections) as $tag) {
		while ($this->embracingTags($parts['body'],$tag,$unusedContent,$parts['body'],$unusedParams)) {
		}
	}

		// A space before every tag keeps words apart once the tags are stripped
	$bodyText = str_replace('<',' <',$parts['body']);
	$parts['body'] = trim(strip_tags($bodyText));

	$parts['keywords'] = trim($parts['keywords']);
	$parts['description'] = trim($parts['description']);

	return $parts;
}
00655
00656
00657
00658
00659
00660
00661
/**
 * Extracts the charset from the "Content-Type" http-equiv meta tag of an
 * HTML document.
 *
 * @param	string		HTML content
 * @return	string		The charset value as written in the document; NULL if no such meta tag or no charset is found
 */
function getHTMLcharset($content) {
		// Find the meta tag first, then look for the charset inside of it
	if (!preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i',$content,$metaTagMatch)) {
		return NULL;
	}
	if (!preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i',$metaTagMatch[0],$charsetMatch)) {
		return NULL;
	}

	return $charsetMatch[1];
}
00669
00670
00671
00672
00673
00674
00675
00676
/**
 * Converts HTML content to utf-8, including its entities.
 *
 * @param	string		HTML content
 * @param	string		Charset of the content; if empty it is read from the content with getHTMLcharset()
 * @return	string		Converted content
 */
function convertHTMLToUtf8($content,$charset='') {

		// Fall back to the charset declared in the document
	if (!$charset) {
		$charset = $this->getHTMLcharset($content);
	}
	$charset = $this->csObj->parse_charset($charset);

		// Re-encode unless the content is utf-8 already (or no charset is known)
	if ($charset && $charset!=='utf-8') {
		$content = $this->csObj->utf8_encode($content, $charset);
	}

	return $this->csObj->entities_to_utf8($content, TRUE);
}
00692
00693
00694
00695
00696
00697
00698
00699
00700
00701
00702
00703
00704
/**
 * Finds the first occurrence of a tag (case-insensitive) in a string and
 * returns its content, its parameter list and the string without the tag.
 *
 * If the end tag is found, $stringAfter is the input with the whole element
 * removed. If there is no end tag, $tagContent is empty and $stringAfter is
 * only the text following the start tag.
 *
 * @param	string		String to search in
 * @param	string		Tag name, eg. "script"
 * @param	string		Returns the text between start and end tag
 * @param	string		Returns the remaining string
 * @param	string		Returns the text between the tag name and the closing ">" of the start tag
 * @return	boolean		TRUE if the start tag was found, otherwise FALSE (and no output value is changed)
 */
function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) {
	$openMarker = '<'.$tagName;
	$closeMarker = '</'.$tagName.'>';

	$fromOpenTag = stristr($string,$openMarker);
	if (!$fromOpenTag) {
		return false;
	}

		// Separate the parameters of the start tag from everything following it
	list($paramList,$behindOpenTag) = explode('>',substr($fromOpenTag,strlen($openMarker)),2);

	$fromCloseTag = stristr($behindOpenTag,$closeMarker);
	if (!$fromCloseTag) {
			// No end tag: no content, and only the text behind the start tag remains
		$tagContent = '';
		$stringAfter = $behindOpenTag;
		return true;
	}

	$beforeOpenTag = substr($string, 0, strpos(strtolower($string), strtolower($openMarker)));
	$tagContent = substr($behindOpenTag,0,strlen($behindOpenTag)-strlen($fromCloseTag));
	$stringAfter = $beforeOpenTag.substr($fromCloseTag,strlen($closeMarker));

	return true;
}
00725
00726
00727
00728
00729
00730
00731
/**
 * Reduces the body to the sections enclosed by the markers
 * <!--TYPO3SEARCH_begin--> and <!--TYPO3SEARCH_end-->.
 *
 * The text following a "begin" marker is kept. An "end" marker adds the text
 * remembered from the last chunk that had no known marker (so content in
 * front of an "end" marker without preceding "begin" is kept as well).
 *
 * @param	string		Body content, changed by reference when markers are found
 * @return	boolean		TRUE if at least one marker was found, otherwise FALSE
 */
function typoSearchTags(&$body) {
	$chunks = preg_split('/\<\!\-\-[\s]?TYPO3SEARCH_/',$body);

	if (count($chunks)<=1) {
		return false;
	}

	$body = '';
	foreach ($chunks as $chunk) {
			// Separate the marker name from the content behind the marker
		$pieces = explode('-->',$chunk,2);
		switch (trim($pieces[0])) {
			case 'begin':
				$body.= $pieces[1];
				$prev = '';
			break;
			case 'end':
				$body.= $prev;
			break;
			default:
				$prev = $chunk;
			break;
		}
	}

	return true;
}
00754
00755
00756
00757
00758
00759
00760
/**
 * Finds the hyperlinks in the content and indexes the documents they point
 * to: external URLs (if 'indexExternalURLs' is enabled) and local files. If
 * 'useCrawlerForExternalFiles' is enabled and the crawler extension is
 * loaded, local files are added to the crawler queue instead of being
 * indexed right away.
 *
 * Changes compared to the former version: $crawler is always defined, a
 * failing parse_url() and missing URL parts no longer raise notices, and the
 * duplicated crawler queue code is merged.
 *
 * @param	string		HTML content to search for links
 * @return	void
 */
function extractLinks($content) {

	$list = $this->extractHyperLinks($content);

		// Crawler object is only created when external files are to be queued
	$crawler = NULL;
	if ($this->indexerConfig['useCrawlerForExternalFiles'] && t3lib_extMgm::isLoaded('crawler')) {
		$this->includeCrawlerClass();
		$crawler = t3lib_div::makeInstance('tx_crawler_lib');
	}

	foreach ($list as $linkInfo) {

			// A resolved local path takes precedence over the href
		$hasLocalPath = !empty($linkInfo['localPath']);
		$linkSource = t3lib_div::htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);

			// parse_url() returns FALSE for seriously malformed URLs; they are treated as having no parts
		$qParts = parse_url($linkSource);
		if (!is_array($qParts)) {
			$qParts = array();
		}

			// Links using the jumpurl parameter: continue with the jumpurl target
		if (!empty($qParts['query']) && strstr($qParts['query'],'jumpurl=')) {
			parse_str($qParts['query'],$getP);
			$linkSource = isset($getP['jumpurl']) ? $getP['jumpurl'] : '';
			$qParts = parse_url($linkSource);
			if (!is_array($qParts)) {
				$qParts = array();
			}
		}

		if (!$hasLocalPath && !empty($qParts['scheme'])) {
				// External URL
			if ($this->indexerConfig['indexExternalURLs']) {
				$this->indexExternalUrl($linkSource);
			}
			continue;
		}

			// Links with a query string are not files
		if (!empty($qParts['query'])) {
			continue;
		}

		$linkSource = urldecode($linkSource);
		if (t3lib_div::isAllowedAbsPath($linkSource)) {
			$localFile = $linkSource;
		} else {
			$localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
		}
		if (!$localFile || !@is_file($localFile)) {
			continue;
		}

		if (is_object($crawler)) {
				// Queue the file; 'alturl' is only set for links with a resolved local path
			$params = array('document' => $linkSource);
			if ($hasLocalPath) {
				$params['alturl'] = $linkInfo['href'];
			}
			$params['conf'] = $this->conf;
			unset($params['conf']['content']);

			$crawler->addQueueEntry_callBack(0,$params,'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files',$this->conf['id']);
			$this->log_setTSlogMessage('media "'.$params['document'].'" added to "crawler" queue.',1);
		} elseif ($hasLocalPath) {
				// Index under the href, but read the content from the local path
			$ext = strtolower(pathinfo($linkSource, PATHINFO_EXTENSION));
			$this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
		} else {
			$this->indexRegularDocument($linkSource);
		}
	}
}
00840
00841
00842
00843
00844
00845
00846
00847
/**
 * Extracts all <a> tags with an href from HTML. Links consisting of a
 * fragment only (href starting with "#") are skipped.
 *
 * Fix compared to the former version: the first character of the href is
 * read with square brackets; the curly brace offset syntax is a parse error
 * in PHP 8.
 *
 * @param	string		HTML content
 * @return	array		List of arrays with the keys 'tag' (the tag), 'href' and 'localPath' (result of createLocalPath() for the href)
 */
function extractHyperLinks($html) {
	$htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
	$htmlParts = $htmlParser->splitTags('a', $html);
	$hyperLinksData = array();

	foreach ($htmlParts as $index => $tagData) {
			// Only the odd elements of the split result are evaluated as tags
		if (($index % 2) === 0) {
			continue;
		}
		if (strtolower($htmlParser->getFirstTagName($tagData)) != 'a') {
			continue;
		}

		$tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
		$href = isset($tagAttributes[0]['href']) ? $tagAttributes[0]['href'] : '';

			// $href is not empty when the first character is read
		if ($href && $href[0] !== '#') {
			$hyperLinksData[] = array(
				'tag' => $tagData,
				'href' => $href,
				'localPath' => $this->createLocalPath($href)
			);
		}
	}

	return $hyperLinksData;
}
00871
00872
00873
00874
00875
00876
00877
/**
 * Returns the href of the first <base> tag that has a non-empty href.
 *
 * @param	string		HTML content
 * @return	string		The href; an empty value if no <base> tag with an href is found
 */
public function extractBaseHref($html) {
	$href = '';
	$parser = t3lib_div::makeInstance('t3lib_parseHtml');

	foreach ($parser->splitTags('base', $html) as $position => $tagData) {
			// Only the odd elements of the split result are evaluated as tags
		if (($position % 2) === 0) {
			continue;
		}

		$tagAttributes = $parser->get_tag_attributes($tagData, true);
		$firstTagName = $parser->getFirstTagName($tagData);
		if (strtolower($firstTagName) != 'base') {
			continue;
		}

		$href = $tagAttributes[0]['href'];
		if ($href) {
			return $href;
		}
	}

	return $href;
}
00897
00898
00899
00900
00901
00902
00903
00904
00905
00906
00907
00908
00909
00910
/**
 * Indexes the document behind an external URL, provided the response
 * headers declare a content type containing "text/html". The content is
 * written to a temporary file which is indexed as an "html" document and
 * removed afterwards.
 *
 * Changes compared to the former version: the Content-Type header is looked
 * up case-insensitively (header names are not case-sensitive and are sent in
 * lower case by some servers), a missing header result is handled and the
 * unused evaluation of the URL's file extension is removed.
 *
 * @param	string		URL, eg. "http://typo3.org/"
 * @return	void
 */
function indexExternalUrl($externalUrl) {

		// No headers means the URL could not be fetched
	$urlHeaders = $this->getUrlHeaders($externalUrl);
	if (!is_array($urlHeaders)) {
		return;
	}

		// Find the content type regardless of how the header name is written; the last match wins
	$contentType = '';
	foreach ($urlHeaders as $headerName => $headerValue) {
		if (strtolower(trim($headerName)) === 'content-type') {
			$contentType = (string)$headerValue;
		}
	}
	if (!stristr($contentType,'text/html')) {
		return;
	}

	$content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
	if (!strlen($content)) {
		return;
	}

		// The content has to be in a file in order to be indexed
	$tmpFile = t3lib_div::tempnam('EXTERNAL_URL');
	if ($tmpFile) {
		t3lib_div::writeFile($tmpFile, $content);

		$this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');
		unlink($tmpFile);
	}
}
00936
00937
00938
00939
00940
00941
00942
00943
/**
 * Fetches the headers of a URL and returns them as an array.
 *
 * Every line is split at the first colon; the part in front of it is the
 * key, the rest (not trimmed) the value. Lines without a colon, like the
 * status line, are kept as keys with a NULL value.
 *
 * Fix compared to the former version: lines without a colon no longer raise
 * an "undefined offset" notice.
 *
 * @param	string		URL
 * @return	mixed		Array of headers; NULL if nothing could be fetched
 */
function getUrlHeaders($url) {
		// NOTE(review): the second argument presumably makes getURL() return the headers only - confirm
	$content = t3lib_div::getURL($url,2);

	if (!strlen($content)) {
		return NULL;
	}

	$retVal = array();
	foreach (t3lib_div::trimExplode(LF,$content,1) as $line) {
			// Stop at the first empty line
		if (!strlen(trim($line))) {
			break;
		}

		$lineParts = explode(':', $line, 2);
		$retVal[$lineParts[0]] = isset($lineParts[1]) ? $lineParts[1] : NULL;
	}

	return $retVal;
}
00962
00963
00964
00965
00966
00967
00968
00969
00970
/**
 * Tries to find a local file for a link. A fixed list of resolver methods is
 * tried in order; the first non-empty result wins.
 *
 * @param	string		Link as found in the href attribute
 * @return	string		Local path; empty if no resolver found one
 */
protected function createLocalPath($sourcePath) {
	$resolvers = array(
		'createLocalPathFromT3vars',
		'createLocalPathUsingAbsRefPrefix',
		'createLocalPathUsingDomainURL',
		'createLocalPathFromAbsoluteURL',
		'createLocalPathFromRelativeURL'
	);

	$localPath = '';
	foreach ($resolvers as $resolver) {
		$localPath = $this->$resolver($sourcePath);
		if ($localPath != '') {
			return $localPath;
		}
	}

	return $localPath;
}
00988
00989
00990
00991
00992
00993
00994
00995
00996
/**
 * Looks the link up in the files registered in
 * $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles']. The
 * registry is keyed by the short MD5 hash of the link.
 *
 * @param	string		Link as found in the href attribute
 * @return	string		Registered path if it is an existing file, otherwise an empty string
 */
protected function createLocalPathFromT3vars($sourcePath) {
	$registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
	if (!is_array($registeredFiles)) {
		return '';
	}

	$hash = t3lib_div::shortMD5($sourcePath);
	if (isset($registeredFiles[$hash]) && is_file($registeredFiles[$hash])) {
		return $registeredFiles[$hash];
	}

	return '';
}
01011
01012
01013
01014
01015
01016
01017
/**
 * Resolves links that start with the URL of this site (TYPO3_SITE_URL) to a
 * file below PATH_site.
 *
 * @param	string		Link as found in the href attribute
 * @return	string		Absolute path if it is an allowed, existing file, otherwise an empty string
 */
protected function createLocalPathUsingDomainURL($sourcePath) {
	$siteUrl = t3lib_div::getIndpEnv('TYPO3_SITE_URL');
	$siteUrlLength = strlen($siteUrl);

	if (substr($sourcePath, 0, $siteUrlLength) != $siteUrl) {
		return '';
	}

	$candidate = PATH_site . substr($sourcePath, $siteUrlLength);

	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
01031
01032
01033
01034
01035
01036
01037
01038
/**
 * Resolves links that start with the configured config.absRefPrefix to a
 * file below PATH_site. Only works when $GLOBALS['TSFE'] is a tslib_fe
 * object and the prefix is not empty.
 *
 * @param	string		Link as found in the href attribute
 * @return	string		Absolute path if it is an allowed, existing file, otherwise an empty string
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath) {
	if (!($GLOBALS['TSFE'] instanceof tslib_fe)) {
		return '';
	}

	$prefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
	$prefixLength = strlen($prefix);
	if ($prefixLength <= 0 || substr($sourcePath, 0, $prefixLength) != $prefix) {
		return '';
	}

	$candidate = PATH_site . substr($sourcePath, $prefixLength);

	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
01054
01055
01056
01057
01058
01059
01060
01061
/**
 * Resolves links starting with a slash to a file below PATH_site.
 *
 * Fix compared to the former version: the first character is read with
 * substr(); the curly brace offset syntax is a parse error in PHP 8 and
 * reading offset 0 of an empty string raised a notice.
 *
 * @param	string		Link as found in the href attribute
 * @return	string		Absolute path if it is an allowed, existing file, otherwise an empty string
 */
protected function createLocalPathFromAbsoluteURL($sourcePath) {
	if (substr((string)$sourcePath, 0, 1) !== '/') {
		return '';
	}

		// PATH_site is prepended to the link without its leading slash
	$localPath = PATH_site . substr($sourcePath, 1);

	return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
01073
01074
01075
01076
01077
01078
01079
/**
 * Resolves relative links (see isRelativeURL()) to a file below PATH_site.
 *
 * @param	string		Link as found in the href attribute
 * @return	string		Absolute path if it is an allowed, existing file, otherwise an empty string
 */
protected function createLocalPathFromRelativeURL($sourcePath) {
	if (!self::isRelativeURL($sourcePath)) {
		return '';
	}

	$candidate = PATH_site . $sourcePath;

	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
01090
01091
01092
01093
01094
01095
01096
/**
 * Checks if a URL is relative: it has no scheme and its path does not start
 * with a slash. URLs that cannot be parsed count as relative, as before.
 *
 * Fix compared to the former version: the first character of the path is
 * read with substr() (the curly brace offset syntax is a parse error in
 * PHP 8) and missing URL parts no longer raise notices.
 *
 * @param	string		URL to check
 * @return	boolean		TRUE if the URL is relative
 */
static protected function isRelativeURL($url) {
	$urlParts = @parse_url($url);
	if (!is_array($urlParts)) {
		$urlParts = array();
	}

	$scheme = isset($urlParts['scheme']) ? $urlParts['scheme'] : '';
	$path = isset($urlParts['path']) ? (string)$urlParts['path'] : '';

	return ($scheme == '' && substr($path, 0, 1) !== '/');
}
01101
01102
01103
01104
01105
01106
01107
/**
 * Checks if a path, after resolving back paths ("../"), points to an
 * existing file below PATH_site.
 *
 * @param	string		File path to check
 * @return	boolean		TRUE if the path is inside PATH_site and is a file
 */
static protected function isAllowedLocalFile($filePath) {
	$resolvedPath = t3lib_div::resolveBackPath($filePath);

	$existsAsFile = is_file($resolvedPath);
	$startsWithSitePath = (substr($resolvedPath, 0, strlen(PATH_site)) == PATH_site);

	return $startsWithSitePath && $existsAsFile;
}
01114
01115
01116
01117
01118
01119
01120
01121
01122
01123
01124
01125
01126
01127
01128
01129
/**
 * Indexes a regular document, for instance a PDF or a Word file, using the
 * external parser registered for its file extension. A file can consist of
 * several parts (see fileContentParts()); each part is indexed on its own.
 *
 * Changes compared to the former version: implode() is called with the
 * separator first (the reversed order is no longer accepted by PHP 8), and
 * a missing file extension or parser no longer raises notices.
 *
 * @param	string		File name the document is indexed under; relative to PATH_site or absolute
 * @param	boolean		If set, indexing is forced regardless of timestamp, content hash and file limit
 * @param	string		File to read the content from instead of $file (eg. a temporary file)
 * @param	string		File extension to use instead of the one of $file
 * @return	void
 */
function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='') {

		// File extension: explicitly given or taken from the file name
	$fI = pathinfo($file);
	if ($altExtension) {
		$ext = $altExtension;
	} else {
		$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
	}

		// Absolute file to read; a given content file is used without path checks
	if (!$contentTmpFile) {
		if (!t3lib_div::isAbsPath($file)) {
			$absFile = t3lib_div::getFileAbsFileName(PATH_site.$file);
		} else {
			$absFile = $file;
		}
		$absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
	} else {
		$absFile = $contentTmpFile;
	}

	if (!$absFile || !@is_file($absFile)) {
		$this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
		return;
	}
	if (empty($this->external_parsers[$ext])) {
		$this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
		return;
	}

	$mtime = filemtime($absFile);
	$cParts = $this->fileContentParts($ext,$absFile);

	foreach ($cParts as $cPKey) {
		$this->internal_log = array();
		$this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
		$Pstart = t3lib_div::milliseconds();
		$subinfo = array('key' => $cPKey);
		$phash_arr = $this->file_phash_arr = $this->setExtHashes($file,$subinfo);
		$check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);

		if ($check > 0 || $force) {
			if ($check > 0) {
				$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
			} else {
				$this->log_setTSlogMessage('Indexing forced by flag',1);
			}

				// The limit of files per run does not apply to forced indexing
			if ($this->externalFileCounter < $this->maxExternalFiles || $force) {

				$this->log_push('Split content','');
				$contentParts = $this->readFileContent($ext,$absFile,$cPKey);
				$this->log_pull();

				if (is_array($contentParts)) {
						// Hash of the complete content of this part
					$content_md5h = $this->md5inthash(implode('', $contentParts));

					if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {

						$this->externalFileCounter++;

						$this->log_push('Extract words from content','');
						$splitInWords = $this->processWordsInArrays($contentParts);
						$this->log_pull();

						$this->log_push('Analyse the extracted words','');
						$indexArr = $this->indexAnalyze($splitInWords);
						$this->log_pull();

							// Note: the modification time is also submitted as creation time
						$this->log_push('Submitting page','');
						$size = filesize($absFile);
						$ctime = filemtime($absFile);
						$this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
						$this->log_pull();

						$this->log_push('Check word list and submit words','');
						$this->checkWordList($indexArr);
						$this->submitWords($indexArr,$phash_arr['phash']);
						$this->log_pull();

						$this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
					} else {
							// Content is unchanged; only the timestamp is refreshed
						$this->updateTstamp($phash_arr['phash'],$mtime);
						$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
					}
				} else {
					$this->log_setTSlogMessage('Could not index file! Unsupported extension.');
				}
			} else {
				$this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
			}
		} else {
			$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		}

			// The section relation is submitted in any case
		$this->submitFile_section($phash_arr['phash']);
		$this->log_pull();
	}
}
01226
01227
01228
01229
01230
01231
01232
01233
01234
01235
/**
 * Reads the content of an external file by passing the call on to the parser
 * object registered for the file extension.
 *
 * Fix compared to the former version: the result is defined (NULL) when no
 * parser object exists for the extension, instead of returning an undefined
 * variable.
 *
 * @param	string		File extension, eg. "pdf", "doc" etc.
 * @param	string		Absolute file name of the file to read
 * @param	string		Key of the part of the file to read (see fileContentParts())
 * @return	mixed		Result of the parser (callers expect an array of content parts); NULL if no parser exists
 */
function readFileContent($ext,$absFile,$cPKey) {
	$contentArr = NULL;

	if (isset($this->external_parsers[$ext]) && is_object($this->external_parsers[$ext])) {
		$contentArr = $this->external_parsers[$ext]->readFileContent($ext,$absFile,$cPKey);
	}

	return $contentArr;
}
01245
01246
01247
01248
01249
01250
01251
01252
/**
 * Returns the keys of the parts an external file is split into for
 * indexing, as reported by the parser object of the file extension.
 *
 * @param	string		File extension
 * @param	string		Absolute file name
 * @return	array		Part keys from the parser; array(0) if no parser object exists for the extension
 */
function fileContentParts($ext,$absFile) {
	if (is_object($this->external_parsers[$ext])) {
		return $this->external_parsers[$ext]->fileContentParts($ext,$absFile);
	}

		// Default: the file is one single part
	return array(0);
}
01263
01264
01265
01266
01267
01268
01269
01270
/**
 * Creates the content parts array for plain content: a copy of
 * $this->defaultContentArray with the content as 'body'.
 *
 * @param	string		Content
 * @return	array		Content parts
 */
function splitRegularContent($content) {
	$parts = $this->defaultContentArray;

		// All other parts keep their default values
	$parts['body'] = $content;

	return $parts;
}
01277
01278
01279
01280
01281
01282
01283
01284
01285
01286
01287
01288
01289
01290
01291
01292
01293
01294
01295
01296
01297
01298
01299
01300
01301
01302
01303
/**
 * Converts all non-empty content parts to utf-8 and converts their entities
 * to utf-8 characters.
 *
 * @param	array		Content parts, changed by reference
 * @param	string		Charset of the parts; re-encoding is skipped if it is exactly 'utf-8'
 * @return	void
 */
function charsetEntity2utf8(&$contentArr, $charset) {

	foreach (array_keys($contentArr) as $partName) {
			// Empty parts stay untouched
		if (!strlen($contentArr[$partName])) {
			continue;
		}

		$text = $contentArr[$partName];
		if ($charset!=='utf-8') {
			$text = $this->csObj->utf8_encode($text, $charset);
		}

			// Entities are converted after the re-encoding
		$contentArr[$partName] = $this->csObj->entities_to_utf8($text,TRUE);
	}
}
01319
01320
01321
01322
01323
01324
01325
/**
 * Splits every content part into words with the lexer. Duplicate words are
 * removed from title, keywords and description; the body keeps all words.
 *
 * @param	array		Content parts (strings)
 * @return	array		Content parts, each one an array of words
 */
function processWordsInArrays($contentArr) {

	foreach (array_keys($contentArr) as $partName) {
		$contentArr[$partName] = $this->lexerObj->split2Words($contentArr[$partName]);
	}

		// Each word of the header parts is wanted only once
	foreach (array('title', 'keywords', 'description') as $headerPart) {
		$contentArr[$headerPart] = array_unique($contentArr[$headerPart]);
	}

	return $contentArr;
}
01341
01342
01343
01344
01345
01346
01347
01348
01349
/**
 * Misspelled former name of processWordsInArrays(). Logs the use of a
 * deprecated function and passes the call on.
 *
 * @param	array		Content parts (strings)
 * @return	array		Result of processWordsInArrays()
 * @deprecated	Use processWordsInArrays() instead
 */
function procesWordsInArrays($contentArr) {
	t3lib_div::logDeprecatedFunction();

	$words = $this->processWordsInArrays($contentArr);

	return $words;
}
01355
01356
01357
01358
01359
01360
01361
/**
 * Creates a description from the start of the body: the listed whitespace
 * characters are replaced by spaces and the text is truncated to the length
 * configured in 'index_descrLgd' (0-255; 200 is passed to intInRange() as
 * fallback).
 *
 * @param	array		Content parts; only 'body' is read
 * @return	string		The description
 */
function bodyDescription($contentArr) {

	$maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
	if ($maxL) {
			// Flatten tabs and line breaks first, then cut at the maximum length (utf-8 aware)
		$flattenedBody = str_replace(array(' ',TAB,CR,LF),' ',$contentArr['body']);
		$bodyDescription = $this->csObj->strtrunc('utf-8', $flattenedBody, $maxL);
	}

	return $bodyDescription;
}
01377
01378
01379
01380
01381
01382
01383
/**
 * Builds the word index array for a document from its split content array.
 * The header sections are analysed first (title, keywords, description - in that order), then the body.
 *
 * @param	array		Content array with the keys 'title', 'keywords', 'description' and 'body', each a list of words
 * @return	array		Index array keyed by word, filled by analyzeHeaderinfo() and analyzeBody()
 */
function indexAnalyze($content) {
	$words = array();

	// Section name => bit offset passed to analyzeHeaderinfo()
	$headerSections = array(
		'title' => 7,
		'keywords' => 6,
		'description' => 5,
	);
	foreach ($headerSections as $section => $bitOffset) {
		$this->analyzeHeaderinfo($words,$content,$section,$bitOffset);
	}

	$this->analyzeBody($words,$content);

	return $words;
}
01395
01396
01397
01398
01399
01400
01401
01402
01403
01404
/**
 * Registers the words of one header section (e.g. title) in the index array.
 * For every word the section's flag bit is OR'ed into 'cmp', 'count' is increased by one,
 * 'hash' and 'metaphone' are (re)calculated and $this->wordcount is incremented.
 *
 * @param	array		Index array (by reference), keyed by word
 * @param	array		Content array; $content[$key] must be a list of words
 * @param	string		Section key to read from $content
 * @param	integer		Bit position; pow(2,$offset) is OR'ed into the word's 'cmp' value
 * @return	void
 */
function analyzeHeaderinfo(&$retArr,$content,$key,$offset) {
	foreach ($content[$key] as $val) {
		// Words are cut to 60 bytes before being used as array key
		$val = substr($val,0,60);

		// First time this word gets a flag/count: start from zero instead of reading undefined indexes
		if (!isset($retArr[$val]['cmp'])) {
			$retArr[$val]['cmp'] = 0;
		}
		if (!isset($retArr[$val]['count'])) {
			$retArr[$val]['count'] = 0;
		}

		$retArr[$val]['cmp'] = $retArr[$val]['cmp']|pow(2,$offset);
		$retArr[$val]['count'] = $retArr[$val]['count']+1;
		// Same 7-hex-digit md5 integer that md5inthash() produces
		$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
		$retArr[$val]['metaphone'] = $this->metaphone($val);
		$this->wordcount++;
	}
}
01415
01416
01417
01418
01419
01420
01421
01422
/**
 * Registers the body words in the index array.
 * A word not yet present gets 'first' (its key in the body word list), 'hash' and 'metaphone';
 * for every occurrence 'count' is increased by one and $this->wordcount is incremented.
 * Words already present (e.g. added by analyzeHeaderinfo()) only get their count increased.
 *
 * @param	array		Index array (by reference), keyed by word
 * @param	array		Content array; $content['body'] must be a list of words
 * @return	void
 */
function analyzeBody(&$retArr,$content) {
	foreach ($content['body'] as $key => $val) {
		// Words are cut to 60 bytes before being used as array key
		$val = substr($val,0,60);

		if (!isset($retArr[$val])) {
			$retArr[$val]['first'] = $key;
			// Same 7-hex-digit md5 integer that md5inthash() produces
			$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
			$retArr[$val]['metaphone'] = $this->metaphone($val);
		}

		// Start counting from zero instead of reading an undefined index on the first occurrence
		if (!isset($retArr[$val]['count'])) {
			$retArr[$val]['count'] = 0;
		}
		$retArr[$val]['count'] = $retArr[$val]['count']+1;
		$this->wordcount++;
	}
}
01435
01436
01437
01438
01439
01440
01441
01442
/**
 * Calculates the metaphone value of a word, either through the metaphone object (if one is set) or PHP's metaphone().
 *
 * @param	string		Word to convert
 * @param	boolean		If set, the raw metaphone value is returned instead of its integer hash
 * @return	mixed		Raw metaphone value if $retRaw is set; otherwise 0 for an empty value, else the integer made from the first 7 hex digits of its md5
 */
function metaphone($word,$retRaw=FALSE) {
	// A metaphone object takes precedence over PHP's built-in function and also receives the language uid
	$tmp = is_object($this->metaphoneObj)
		? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
		: metaphone($word);

	if ($retRaw) {
		return $tmp;
	}

	// Loose comparison kept on purpose: anything equal to '' counts as "no metaphone"
	if ($tmp=='') {
		return 0;
	}

	return hexdec(substr(md5($tmp),0,7));
}
01458
01459
01460
01461
01462
01463
01464
01465
01466
01467
01468
01469
01470
01471
01472
01473
01474
01475
01476
01477
01478
01479
01480
01481
01482
01483
01484
/**
 * Writes the currently indexed page to the index tables.
 * Existing rows for the page's phash are removed first; then index_phash, index_section (via submit_section()),
 * index_grlist (via submit_grlist()), index_fulltext and - in debug mode - index_debug are filled.
 *
 * @return	void
 */
function submitPage() {
	$phash = $this->hash['phash'];

	// Remove whatever is stored for this phash already
	$this->removeOldIndexedPages($phash);

	// Main record of the page
	$pageRecord = array(
		'phash' => $phash,
		'phash_grouping' => $this->hash['phash_grouping'],
		'cHashParams' => serialize($this->cHashParams),
		'contentHash' => $this->content_md5h,
		'data_page_id' => $this->conf['id'],
		'data_page_reg1' => $this->conf['page_cache_reg1'],
		'data_page_type' => $this->conf['type'],
		'data_page_mp' => $this->conf['MP'],
		'gr_list' => $this->conf['gr_list'],
		'item_type' => 0,
		'item_title' => $this->contentParts['title'],
		'item_description' => $this->bodyDescription($this->contentParts),
		'item_mtime' => $this->conf['mtime'],
		'item_size' => strlen($this->conf['content']),
		'tstamp' => $GLOBALS['EXEC_TIME'],
		'crdate' => $GLOBALS['EXEC_TIME'],
		'item_crdate' => $this->conf['crdate'],
		'sys_language_uid' => $this->conf['sys_language_uid'],
		'externalUrl' => 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $pageRecord);

	// Section and group list records; the page's phash is used for both arguments
	$this->submit_section($phash,$phash);
	$this->submit_grlist($phash,$phash);

	// Full text: all content parts joined by a space, cut (byte-wise) if a maximum length is configured
	$fullText = implode(' ', $this->contentParts);
	$maxFullTextLength = $this->indexerConfig['fullTextDataLength'];
	if ($maxFullTextLength>0) {
		$fullText = substr($fullText,0,$maxFullTextLength);
	}
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', array(
		'phash' => $phash,
		'fulltextdata' => $fullText
	));

	// Debug record, only in debug mode; content and body are limited to their first 1000 bytes
	if ($this->indexerConfig['debugMode']) {
		$debugInfo = array(
			'cHashParams' => $this->cHashParams,
			'external_parsers initialized' => array_keys($this->external_parsers),
			'conf' => array_merge($this->conf,array('content'=>substr($this->conf['content'],0,1000))),
			'contentParts' => array_merge($this->contentParts,array('body' => substr($this->contentParts['body'],0,1000))),
			'logs' => $this->internal_log,
			'lexer' => $this->lexerObj->debugString,
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', array(
			'phash' => $phash,
			'debuginfo' => serialize($debugInfo)
		));
	}
}
01550
01551
01552
01553
01554
01555
01556
01557
01558
/**
 * Inserts a row into index_grlist that ties a phash to the current group list ($this->conf['gr_list']).
 *
 * @param	integer		Value for the 'phash' column
 * @param	integer		Value for the 'phash_x' column
 * @return	void
 */
function submit_grlist($hash,$phash_x) {
	$grList = $this->conf['gr_list'];

	// The group list is stored both as integer hash and in its original form
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', array(
		'phash' => $hash,
		'phash_x' => $phash_x,
		'hash_gr_list' => $this->md5inthash($grList),
		'gr_list' => $grList
	));
}
01570
01571
01572
01573
01574
01575
01576
01577
01578
/**
 * Inserts a row into index_section for the current page id, completed with the rootline fields.
 *
 * @param	integer		Value for the 'phash' column
 * @param	integer		Value for the 'phash_t3' column
 * @return	void
 */
function submit_section($hash,$hash_t3) {
	$sectionRecord = array();
	$sectionRecord['phash'] = $hash;
	$sectionRecord['phash_t3'] = $hash_t3;
	$sectionRecord['page_id'] = intval($this->conf['id']);

	// Adds rl0, rl1, rl2 and any additionally configured rootline fields
	$this->getRootLineFields($sectionRecord);

	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $sectionRecord);
}
01590
01591
01592
01593
01594
01595
01596
/**
 * Deletes all index rows of a page: rows with the given phash in index_phash, index_section, index_grlist,
 * index_fulltext and index_debug, plus the index_section rows that carry it as phash_t3.
 *
 * @param	integer		phash value (cast to integer before use)
 * @return	void
 */
function removeOldIndexedPages($phash) {
	$phashInt = intval($phash);

	$tables = array('index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');
	foreach ($tables as $table) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash='.$phashInt);
	}

	// Section rows that reference this phash through phash_t3
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3='.$phashInt);
}
01606
01607
01608
01609
01610
01611
01612
01613
01614
01615
01616
01617
01618
01619
01620
01621
01622
01623
01624
01625
01626
01627
01628
01629
01630
01631
01632
01633
01634
01635
01636
01637
01638
01639
/**
 * Writes an indexed external file to index_phash, index_fulltext and - in debug mode - index_debug.
 * Existing rows for the file's phash are removed first (removeOldIndexedFiles()).
 *
 * @param	array		Hash array of the file; the keys 'phash' and 'phash_grouping' are read
 * @param	string		File reference (path or URL); stored as data_filename
 * @param	array		Sub-information of the file; stored serialized as cHashParams
 * @param	string		File extension; selects the external parser whose ext2itemtype_map provides the item type
 * @param	integer		Stored as item_mtime
 * @param	integer		Stored as item_crdate
 * @param	integer		Stored as item_size
 * @param	integer		Stored as contentHash
 * @param	array		Content parts ('title', 'body', ...) of the file
 * @return	void
 */
function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts) {
	// Item type: the parser's mapping for the extension when it has a non-empty one, else the extension itself.
	// empty() applies the same truthiness test as before but stays silent when no parser object or map entry exists.
	$storeItemType = $ext;
	if (!empty($this->external_parsers[$ext]->ext2itemtype_map[$ext])) {
		$storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
	}

	// Remove whatever is stored for this phash already
	$this->removeOldIndexedFiles($hash['phash']);

	// Split the file reference; a scheme part marks it as an external URL
	$fileParts = parse_url($file);

	// Main record of the file
	$fields = array(
		'phash' => $hash['phash'],
		'phash_grouping' => $hash['phash_grouping'],
		'cHashParams' => serialize($subinfo),
		'contentHash' => $content_md5h,
		'data_filename' => $file,
		'item_type' => $storeItemType,
		// Fall back to the file name when the document has no usable title
		'item_title' => trim($contentParts['title']) ? $contentParts['title'] : basename($file),
		'item_description' => $this->bodyDescription($contentParts),
		'item_mtime' => $mtime,
		'item_size' => $size,
		'item_crdate' => $ctime,
		'tstamp' => $GLOBALS['EXEC_TIME'],
		'crdate' => $GLOBALS['EXEC_TIME'],
		'gr_list' => $this->conf['gr_list'],
		// Local paths have no 'scheme' key at all (and parse_url() may return FALSE), hence empty() instead of a direct read
		'externalUrl' => !empty($fileParts['scheme']) ? 1 : 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);

	// Full text: all content parts joined by a space, cut (byte-wise) if a maximum length is configured
	$fields = array(
		'phash' => $hash['phash'],
		'fulltextdata' => implode(' ', $contentParts)
	);
	if ($this->indexerConfig['fullTextDataLength']>0) {
		$fields['fulltextdata'] = substr($fields['fulltextdata'],0,$this->indexerConfig['fullTextDataLength']);
	}
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);

	// Debug record, only in debug mode; the body is limited to its first 1000 bytes
	if ($this->indexerConfig['debugMode']) {
		$fields = array(
			'phash' => $hash['phash'],
			'debuginfo' => serialize(array(
				'cHashParams' => $subinfo,
				'contentParts' => array_merge($contentParts,array('body' => substr($contentParts['body'],0,1000))),
				'logs' => $this->internal_log,
				'lexer' => $this->lexerObj->debugString,
			))
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields);
	}
}
01699
01700
01701
01702
01703
01704
01705
/**
 * Makes sure an index_grlist row exists for a file's phash:
 * if none is found for either the default group list or the current group list, one is inserted via submit_grlist().
 *
 * @param	integer		phash of the file
 * @return	void
 */
function submitFile_grlist($hash) {
	// Matches the hash of the default group list or of the current one
	$grListMatch = 'hash_gr_list=' . $this->md5inthash($this->defaultGrList) .
		' OR hash_gr_list=' . $this->md5inthash($this->conf['gr_list']);
	$where = 'phash=' . intval($hash) . ' AND (' . $grListMatch . ')';

	$existing = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
	if ($existing) {
		return;
	}

	$this->submit_grlist($hash,$hash);
}
01719
01720
01721
01722
01723
01724
01725
/**
 * Makes sure an index_section row exists that ties a file's phash to the current page id;
 * if none is found, one is inserted via submit_section() with the current page's phash as phash_t3.
 *
 * @param	integer		phash of the file
 * @return	void
 */
function submitFile_section($hash) {
	$where = 'phash=' . intval($hash) . ' AND page_id=' . intval($this->conf['id']);

	// Only existence matters, so let the database count (same approach as submitFile_grlist())
	$count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_section', $where);
	if (!$count) {
		$this->submit_section($hash,$this->hash['phash']);
	}
}
01733
01734
01735
01736
01737
01738
01739
/**
 * Deletes the rows of a file from index_phash, index_grlist, index_fulltext and index_debug.
 * index_section is not touched here.
 *
 * @param	integer		phash value (cast to integer before use)
 * @return	void
 */
function removeOldIndexedFiles($phash) {
	$where = 'phash='.intval($phash);

	foreach (array('index_phash', 'index_grlist', 'index_fulltext', 'index_debug') as $table) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($table, $where);
	}
}
01748
01749
01750
01751
01752
01753
01754
01755
01756
01757
01758
01759
01760
01761
01762
01763
01764
01765
01766
01767
01768
01769
01770
01771
01772
01773
01774
01775
/**
 * Decides from the stored index_phash row whether a document has to be (re)indexed.
 * The returned code corresponds to the keys of $this->reasons: positive values mean "index", negative "skip".
 *
 * @param	integer		Current mtime of the document; a false value means "not set"
 * @param	integer		phash of the document
 * @return	integer		4: no row for the phash; 1: max-age exceeded; -2: min-age not exceeded; 3: mtime not set; 2: mtime differs; -1: mtime matches
 */
function checkMtimeTstamp($mtime,$phash) {
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('item_mtime,tstamp', 'index_phash', 'phash='.intval($phash));
	$row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);

	// Never indexed before
	if (!$row) {
		return 4;
	}

	$now = $GLOBALS['EXEC_TIME'];
	$maxAge = $this->tstamp_maxAge;
	$minAge = $this->tstamp_minAge;

	// A max-age is configured and has been exceeded
	if ($maxAge && ($row['tstamp'] + $maxAge) < $now) {
		return 1;
	}

	// A min-age is configured and has not been exceeded yet
	if ($minAge && !(($row['tstamp'] + $minAge) < $now)) {
		return -2;
	}

	// Without an mtime there is nothing to compare against
	if (!$mtime) {
		return 3;
	}

	if ($row['item_mtime'] != $mtime) {
		return 2;
	}

	// mtime matched: the timestamp is refreshed only when no max-age is configured
	if ($maxAge) {
		$this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($row['tstamp'] + $maxAge - $now) . ' seconds to expire time).', 1);
	} else {
		$this->updateTstamp($phash);
		$this->log_setTSlogMessage('mtime matched, timestamp updated.',1);
	}

	return -1;
}
01806
01807
01808
01809
01810
01811
/**
 * Looks for an index_phash row with the current phash_grouping and the current content hash.
 *
 * @return	mixed		The found row (array with the key 'phash') if one exists, otherwise integer 1
 */
function checkContentHash() {
	$where = 'A.phash_grouping='.intval($this->hash['phash_grouping']).' AND A.contentHash='.intval($this->content_md5h);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_phash A', $where);

	// Only the first matching row is of interest
	$row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);

	return $row ? $row : 1;
}
01820
01821
01822
01823
01824
01825
01826
01827
01828
/**
 * Tells whether no index_phash row exists yet for the given phash_grouping / content hash combination.
 *
 * @param	integer		phash_grouping value (cast to integer)
 * @param	integer		Content hash (cast to integer)
 * @return	integer		0 if a matching row exists, 1 if not
 */
function checkExternalDocContentHash($hashGr,$content_md5h) {
	$where = 'A.phash_grouping='.intval($hashGr).' AND A.contentHash='.intval($content_md5h);

	// Only existence matters, so count instead of fetching a complete row
	$count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_phash A', $where);

	return $count ? 0 : 1;
}
01836
01837
01838
01839
01840
01841
01842
/**
 * Counts the index_grlist rows stored for a phash_x value.
 *
 * @param	integer		phash_x value (cast to integer)
 * @return	mixed		Result of exec_SELECTcountRows(); usable as boolean "is any group list row stored"
 */
function is_grlist_set($phash_x) {
	$where = 'phash_x=' . intval($phash_x);

	return $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash_x', 'index_grlist', $where);
}
01850
01851
01852
01853
01854
01855
01856
01857
01858
/**
 * Makes sure an index_grlist row exists for a phash and the current group list ($this->conf['gr_list']);
 * if none is found, one is inserted via submit_grlist() and the insertion is logged.
 *
 * @param	integer		phash to look up (cast to integer for the query)
 * @param	integer		phash_x value stored with a newly inserted row
 * @return	void
 */
function update_grlist($phash,$phash_x) {
	$where = 'phash='.intval($phash).' AND hash_gr_list='.$this->md5inthash($this->conf['gr_list']);

	// Only existence matters, so let the database count (same approach as submitFile_grlist())
	$count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('phash', 'index_grlist', $where);
	if (!$count) {
		$this->submit_grlist($phash,$phash_x);
		$this->log_setTSlogMessage("Inserted gr_list '".$this->conf['gr_list']."' for phash '".$phash."'",1);
	}
}
01866
01867
01868
01869
01870
01871
01872
01873
/**
 * Sets the tstamp of an index_phash row to the current execution time and optionally updates its item_mtime.
 *
 * @param	integer		phash of the row (cast to integer)
 * @param	integer		New item_mtime; only written when it evaluates to true
 * @return	void
 */
function updateTstamp($phash,$mtime=0) {
	$changes = array('tstamp' => $GLOBALS['EXEC_TIME']);

	if ($mtime) {
		$changes['item_mtime'] = intval($mtime);
	}

	$where = 'phash='.intval($phash);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $changes);
}
01882
01883
01884
01885
01886
01887
01888
/**
 * Writes the current freeIndexSetId ($this->conf['freeIndexSetId']) into an index_phash row.
 *
 * @param	integer		phash of the row (cast to integer)
 * @return	void
 */
function updateSetId($phash) {
	$setId = intval($this->conf['freeIndexSetId']);
	$where = 'phash='.intval($phash);

	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, array('freeIndexSetId' => $setId));
}
01896
01897
01898
01899
01900
01901
01902
01903
/**
 * Stores the parse time in an index_phash row.
 *
 * @param	integer		phash of the row (cast to integer)
 * @param	integer		Parse time (cast to integer); unit not visible here
 * @return	void
 */
function updateParsetime($phash,$parsetime) {
	$changes = array();
	$changes['parsetime'] = intval($parsetime);

	$where = 'phash='.intval($phash);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $changes);
}
01911
01912
01913
01914
01915
01916
/**
 * Rewrites the rootline fields of all index_section rows belonging to the current page id.
 *
 * @return	void
 */
function updateRootline() {
	// Collect rl0, rl1, rl2 and any additionally configured rootline fields
	$rootLineFields = array();
	$this->getRootLineFields($rootLineFields);

	$where = 'page_id='.intval($this->conf['id']);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', $where, $rootLineFields);
}
01924
01925
01926
01927
01928
01929
01930
01931
/**
 * Adds the rootline fields to a field array: rl0, rl1 and rl2 from levels 0-2 of $this->conf['rootline_uids'],
 * plus one field per entry of $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['addRootLineFields'] (field name => rootline level).
 * All values are cast to integer, so a missing level results in 0.
 *
 * @param	array		Field array (by reference) to add the values to
 * @return	void
 */
function getRootLineFields(&$fieldArr) {
	// The three standard levels
	foreach (array(0, 1, 2) as $level) {
		$fieldArr['rl' . $level] = intval($this->conf['rootline_uids'][$level]);
	}

	// Additional levels registered in the extension configuration
	$extraFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'];
	if (is_array($extraFields)) {
		foreach ($extraFields as $fieldName => $rootLineLevel) {
			$fieldArr[$fieldName] = intval($this->conf['rootline_uids'][$rootLineLevel]);
		}
	}
}
01944
01945
01946
01947
01948
01949
01950
/**
 * Removes indexed pages that share phash_grouping and content hash with the current page
 * but were stored under a group list other than the default one ($this->defaultGrList).
 * Each removal is logged and carried out by removeOldIndexedPages().
 *
 * @return	void
 */
function removeLoginpagesWithContentHash() {
	// WHERE clause, one condition per line (the leading empty element produces the initial line break)
	$conditions = array(
		'',
		'A.phash=B.phash',
		'AND A.phash_grouping='.intval($this->hash['phash_grouping']),
		'AND B.hash_gr_list!='.$this->md5inthash($this->defaultGrList),
		'AND A.contentHash='.intval($this->content_md5h),
	);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A,index_grlist B', implode("\n", $conditions));

	while ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
		$stalePhash = $row['phash'];
		$this->log_setTSlogMessage("The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash='".$stalePhash."' are now removed.",1);
		$this->removeOldIndexedPages($stalePhash);
	}
}
01962
01963
01964
01965
01966
01967
/**
 * Loads the library class file of the 'crawler' extension.
 *
 * @return	void
 */
function includeCrawlerClass() {
	// Kept in scope on purpose: the required file runs inside this function's variable scope
	// (NOTE(review): presumably it reads $TYPO3_CONF_VARS, e.g. for an XCLASS check - confirm in the crawler extension)
	global $TYPO3_CONF_VARS;

	require_once t3lib_extMgm::extPath('crawler') . 'class.tx_crawler_lib.php';
}
01973
01974
01975
01976
01977
01978
01979
01980
01981
01982
01983
01984
01985
01986
01987
01988
01989
01990
01991
01992
01993
01994
/**
 * Adds the words of a word list that are not yet in index_words to that table.
 * Words are looked up by their 'hash' (column wid); rows found are matched back to the list through their baseword.
 *
 * @param	array		Word list keyed by word; each entry provides 'hash' and 'metaphone'
 * @return	void
 */
function checkWordList($wl) {
	// Collect the word ids; cast to integer because they are placed directly into the SQL IN() list
	$widList = array();
	foreach ($wl as $wordData) {
		$widList[] = intval($wordData['hash']);
	}
	if (!count($widList)) {
		return;
	}

	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', 'wid IN ('.implode(',',$widList).')');
	$knownCount = $GLOBALS['TYPO3_DB']->sql_num_rows($res);
	$totalCount = count($wl);

	// Only when the number of stored words differs from the list there is something to insert
	if ($knownCount!=$totalCount) {
		$this->log_setTSlogMessage('Inserting words: '.($totalCount-$knownCount),1);

		// Drop every word that is stored already ...
		while ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
			unset($wl[$row['baseword']]);
		}

		// ... and insert what is left
		foreach ($wl as $key => $val) {
			$insertFields = array(
				'wid' => $val['hash'],
				'baseword' => $key,
				'metaphone' => $val['metaphone']
			);

			$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', $insertFields);
		}
	}

	// Release the result set (skipped if the query itself failed)
	if ($res) {
		$GLOBALS['TYPO3_DB']->sql_free_result($res);
	}
}
02022
02023
02024
02025
02026
02027
02028
02029
/**
 * Replaces the index_rel rows of a document: existing rows for the phash are deleted,
 * then one row is inserted per entry of the word list.
 *
 * @param	array		Word list; each entry provides 'hash' and 'count' and optionally 'first' and 'cmp'
 * @param	integer		phash of the document
 * @return	void
 */
function submitWords($wl,$phash) {
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash='.intval($phash));

	foreach ($wl as $val) {
		// Guard against a zero word counter; 0 is what freqMap() effectively received before (after a division warning)
		$relativeCount = $this->wordcount ? ($val['count']/$this->wordcount) : 0;

		// 'first' is only set by analyzeBody() and 'cmp' only by analyzeHeaderinfo(), so either may be missing.
		// A missing 'first' is still handed on as NULL and a missing 'cmp' still yields flags 0 - just without reading undefined indexes.
		$first = isset($val['first']) ? $val['first'] : NULL;
		$cmp = isset($val['cmp']) ? $val['cmp'] : 0;

		$insertFields = array(
			'phash' => $phash,
			'wid' => $val['hash'],
			'count' => $val['count'],
			'first' => $first,
			'freq' => $this->freqMap($relativeCount),
			'flags' => ($cmp & $this->flagBitMask)
		);

		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', $insertFields);
	}
}
02046
02047
02048
02049
02050
02051
02052
02053
/**
 * Maps a frequency value onto the range used for storage.
 * The factor is $this->freqMax * 100 * $this->freqRange: values below 1 are multiplied by it and capped at
 * $this->freqRange, all other values are divided by it.
 *
 * @param	float		Frequency (submitWords() passes word count divided by total word count)
 * @return	mixed		Mapped frequency
 */
function freqMap($freq) {
	$mapFactor = $this->freqMax*100*$this->freqRange;

	// Values of 1 and above are scaled down
	if (!($freq<1)) {
		return $freq/$mapFactor;
	}

	// Values below 1 are scaled up, but never beyond the range
	$scaled = $freq*$mapFactor;
	if ($scaled>$this->freqRange) {
		return $this->freqRange;
	}

	return $scaled;
}
02065
02066
02067
02068
02069
02070
02071
02072
02073
02074
02075
02076
02077
02078
02079
02080
02081
02082
02083
02084
02085
02086
/**
 * Calculates the two hashes of the current page and stores them in $this->hash:
 * 'phash_grouping' from id, type, language, MP and cHash parameters; 'phash' from the same values plus the group list.
 *
 * @return	void
 */
function setT3Hashes() {
	// Values that identify the page regardless of the group list
	$groupingInput = array(
		'id' => (int)$this->conf['id'],
		'type' => (int)$this->conf['type'],
		'sys_lang' => (int)$this->conf['sys_language_uid'],
		'MP' => (string)$this->conf['MP'],
		'cHash' => $this->cHashParams
	);
	$this->hash['phash_grouping'] = $this->md5inthash(serialize($groupingInput));

	// The full hash additionally depends on the group list (appended as last element)
	$fullInput = array_merge($groupingInput, array('gr_list' => (string)$this->conf['gr_list']));
	$this->hash['phash'] = $this->md5inthash(serialize($fullInput));
}
02105
02106
02107
02108
02109
02110
02111
02112
/**
 * Calculates the two hashes of an external file.
 *
 * @param	string		File reference
 * @param	array		Sub-information of the file; only part of the 'phash' value
 * @return	array		'phash_grouping' (hash of the file reference alone) and 'phash' (hash of file reference plus sub-information)
 */
function setExtHashes($file,$subinfo=array()) {
	$groupingInput = array('file' => $file);
	$fullInput = array(
		'file' => $file,
		'subinfo' => $subinfo,
	);

	return array(
		'phash_grouping' => $this->md5inthash(serialize($groupingInput)),
		'phash' => $this->md5inthash(serialize($fullInput)),
	);
}
02129
02130
02131
02132
02133
02134
02135
02136
/**
 * Turns a string into an integer hash: the first 7 hex digits of its md5 sum, read as a number.
 *
 * @param	string		String to hash
 * @return	integer		Value between 0 and 0xFFFFFFF
 */
function md5inthash($str) {
	$hexPrefix = substr(md5($str),0,7);

	return hexdec($hexPrefix);
}
02140
02141
02142
02143
02144
02145
02146
02147
/**
 * Calculates a short md5 over the cHash-relevant parameters of a parameter array. Logged as deprecated.
 *
 * @param	array		Parameters, turned into a query string by t3lib_div::implodeArrayForUrl()
 * @return	string		t3lib_div::shortMD5() of the serialized result of t3lib_div::cHashParams()
 * @deprecated	The call is reported through t3lib_div::logDeprecatedFunction()
 */
function makeCHash($paramArray) {
	t3lib_div::logDeprecatedFunction();

	$queryString = t3lib_div::implodeArrayForUrl('', $paramArray);
	$relevantParams = t3lib_div::cHashParams($queryString);

	return t3lib_div::shortMD5(serialize($relevantParams));
}
02157
02158
02159
02160
02161
02162
02163
02164
02165
02166
02167
02168
02169
02170
02171
02172
02173
02174
02175
02176
02177
02178
02179
02180
02181
/**
 * Forwards a push() call to the global $TT object; does nothing when $GLOBALS['TT'] is not an object.
 *
 * @param	string		Passed as first argument to push()
 * @param	string		Passed as second argument to push()
 * @return	void
 */
function log_push($msg,$key) {
	$timeTracker = $GLOBALS['TT'];
	if (!is_object($timeTracker)) {
		return;
	}

	$timeTracker->push($msg,$key);
}
02185
02186
02187
02188
02189
02190
/**
 * Forwards a pull() call to the global $TT object; does nothing when $GLOBALS['TT'] is not an object.
 *
 * @return	void
 */
function log_pull() {
	$timeTracker = $GLOBALS['TT'];
	if (!is_object($timeTracker)) {
		return;
	}

	$timeTracker->pull();
}
02194
02195
02196
02197
02198
02199
02200
02201
/**
 * Records a log message: forwarded to the global $TT object if that is an object,
 * and always appended to $this->internal_log.
 *
 * @param	string		Message
 * @param	integer		Passed as second argument to setTSlogMessage(); not stored in the internal log
 * @return	void
 */
function log_setTSlogMessage($msg, $errorNum=0) {
	$timeTracker = $GLOBALS['TT'];
	if (is_object($timeTracker)) {
		$timeTracker->setTSlogMessage($msg,$errorNum);
	}

	// The internal log collects the messages regardless of the time tracker
	$this->internal_log[] = $msg;
}
02206
02207
02208
02209
02210
02211
02212
02213
02214
02215
02216
02217
02218
02219
02220
02221
02222
02223
02224
02225
02226
02227
/**
 * Deprecated entry point: logs the call as deprecated and hands it to
 * tx_indexedsearch_tslib_fe_hook::headerNoCache() from the indexed_search hooks directory.
 *
 * @param	array		Parameters (by reference), passed on unchanged
 * @param	object		Passed on unchanged as second argument
 * @return	void
 * @deprecated	The call is reported through t3lib_div::logDeprecatedFunction()
 */
function fe_headerNoCache(&$params, $ref) {
	t3lib_div::logDeprecatedFunction();

	require_once t3lib_extMgm::extPath('indexed_search') . 'hooks/class.tx_indexedsearch_tslib_fe_hook.php';

	$hook = t3lib_div::makeInstance('tx_indexedsearch_tslib_fe_hook');
	$hook->headerNoCache($params, $ref);
}
02234
02235
02236
02237
02238
02239
02240
02241
02242
/**
 * Reformats a comma separated keyword list so that the keywords are separated by ", "
 * and the whole list is wrapped in a leading and a trailing space.
 *
 * @param	string		Comma separated keywords
 * @return	string		Reformatted list
 */
protected function addSpacesToKeywordList($keywordList) {
	$trimmedKeywords = t3lib_div::trimExplode(',', $keywordList);
	$spacedList = implode(', ', $trimmedKeywords);

	return ' ' . $spacedList . ' ';
}
02247 }
02248
02249
// XCLASS support: if an extending class file is registered for this file under the current TYPO3_MODE, include it
if (defined('TYPO3_MODE')) {
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']) {
		include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']);
	}
}
02253 ?>