|
TYPO3 API
SVNRelease
|
<?php
/***************************************************************
*  Copyright notice
*
*  (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
*  All rights reserved
*
*  This script is part of the TYPO3 project. The TYPO3 project is
*  free software; you can redistribute it and/or modify
*  it under the terms of the GNU General Public License as published by
*  the Free Software Foundation; either version 2 of the License, or
*  (at your option) any later version.
*
*  The GNU General Public License can be found at
*  http://www.gnu.org/copyleft/gpl.html.
*  A copy is found in the textfile GPL.txt and important notices to the license
*  from the author is found in LICENSE.txt distributed with these scripts.
*
*
*  This script is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU General Public License for more details.
*
*  This copyright notice MUST APPEAR in all copies of the script!
***************************************************************/
/**
 * External standard parsers for indexed_search
 *
 * @author	Kasper Skårhøj <kasperYYYY@typo3.com>
 * @coauthor	Olivier Simah <noname_paris@yahoo.fr>
 */


/**
 * External standard parsers for indexed_search
 * MUST RETURN utf-8 content!
 *
 * Maps file extensions to external command line tools (pdftotext, catdoc, ...)
 * or built-in PHP functions and turns the extracted text into the content
 * array structure produced by the parent indexer object ($this->pObj).
 *
 * @author	Kasper Skårhøj <kasperYYYY@typo3.com>
 * @package TYPO3
 * @subpackage tx_indexedsearch
 */
class tx_indexed_search_extparse {

		// This value is also overridden from config (see initParser()).
		// Zero: whole PDF file is indexed in one.
		// Positive value: (maximum) number of pages at a time, eg. "5" with 10 pages means 1-5,6-10.
		// Negative value: (abs value) number of groups, eg. "-3" with 10 pages means 1-3,4-6,7-10.
	public $pdf_mode = -20;

		// These arrays are configured in initialization:
	public $app = array();					// Tool name => absolute path of the executable
	public $ext2itemtype_map = array();		// File extension => item_type
	public $supportedExtensions = array();	// File extension => TRUE, for every successfully initialized extension

	public $pObj;							// Reference to parent object (indexer class)
	protected $langObject;					// Reference to LANG-Object

	/**
	 * Constructs this external parsers object
	 */
	public function __construct() {
			// Set the language object to be used accordant to current TYPO3_MODE:
		$this->langObject = (TYPO3_MODE == 'FE' ? $GLOBALS['TSFE'] : $GLOBALS['LANG']);
	}

	/**
	 * Initialize external parser for parsing content.
	 *
	 * On success the extension is registered in $this->supportedExtensions and
	 * $this->ext2itemtype_map, and the tool paths needed are stored in $this->app.
	 *
	 * @param	string		File extension
	 * @return	boolean		Returns true if extension is supported/enabled, otherwise false.
	 */
	public function initParser($extension) {
		$indexerConfig = $this->getIndexerConfig();

			// Ignore extensions
		if ($this->isIgnoredExtension($extension, $indexerConfig)) {
			$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:ignoreExtensions'), $extension), 1);
			return FALSE;
		}

		$extOK = FALSE;
		$mainExtension = '';

			// Switch on file extension:
		switch ($extension) {
			case 'pdf':		// PDF (xpdf tools)
				$extOK = $this->registerTools($indexerConfig, 'pdftools', array('pdftotext', 'pdfinfo'), 'pdfTools');
				if ($extOK) {
						// PDF mode:
					$this->pdf_mode = t3lib_div::intInRange(isset($indexerConfig['pdf_mode']) ? $indexerConfig['pdf_mode'] : 0, -100, 100);
				}
			break;
			case 'doc':		// MS Word (catdoc)
				$extOK = $this->registerTools($indexerConfig, 'catdoc', array('catdoc'), 'catdoc');
			break;
			case 'pps':		// MS PowerPoint(?)
			case 'ppt':		// MS PowerPoint (ppthtml)
				$extOK = $this->registerTools($indexerConfig, 'ppthtml', array('ppthtml'), 'ppthtml');
			break;
			case 'xls':		// MS Excel (xlhtml)
				$extOK = $this->registerTools($indexerConfig, 'xlhtml', array('xlhtml'), 'xlhtml');
			break;
			case 'sxc':		// Open Office Calc.
			case 'sxi':		// Open Office Impress
			case 'sxw':		// Open Office Writer
			case 'ods':		// Oasis OpenDocument Spreadsheet
			case 'odp':		// Oasis OpenDocument Presentation
			case 'odt':		// Oasis OpenDocument Text
				$extOK = $this->registerTools($indexerConfig, 'unzip', array('unzip'), 'unzip');
			break;
			case 'rtf':		// Rich Text Format (unrtf)
				$extOK = $this->registerTools($indexerConfig, 'unrtf', array('unrtf'), 'unrtf');
			break;
			case 'txt':		// Raw text
			case 'csv':		// Raw text
			case 'xml':		// PHP strip-tags()
			case 'tif':		// PHP EXIF
				$extOK = TRUE;
			break;
			case 'html':	// PHP strip-tags()
			case 'htm':		// PHP strip-tags()
				$extOK = TRUE;
				$mainExtension = 'html';	// making "html" the common "item_type"
			break;
			case 'jpg':		// PHP EXIF
			case 'jpeg':	// PHP EXIF
				$extOK = TRUE;
				$mainExtension = 'jpeg';	// making "jpeg" the common item_type
			break;
		}

			// If extension was OK:
		if ($extOK) {
			$this->supportedExtensions[$extension] = TRUE;
			$this->ext2itemtype_map[$extension] = $mainExtension ? $mainExtension : $extension;
			return TRUE;
		}

		return FALSE;
	}

	/**
	 * Reads the indexed_search extension configuration.
	 *
	 * @return	array		Unserialized configuration; empty array if it is missing or not an array.
	 */
	protected function getIndexerConfig() {
		$indexerConfig = FALSE;
		if (isset($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'])) {
			$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
		}
		return is_array($indexerConfig) ? $indexerConfig : array();
	}

	/**
	 * Checks a file extension against the "ignoreExtensions" configuration (comma list, compared in lowercase).
	 *
	 * @param	string		File extension
	 * @param	array		Indexer configuration as returned by getIndexerConfig()
	 * @return	boolean		True if the extension is listed as ignored.
	 */
	protected function isIgnoredExtension($extension, array $indexerConfig) {
		$ignoreList = isset($indexerConfig['ignoreExtensions']) ? strtolower($indexerConfig['ignoreExtensions']) : '';
		$ignoreExtensions = t3lib_div::trimExplode(',', $ignoreList, 1);
		return in_array((string)$extension, $ignoreExtensions, TRUE);
	}

	/**
	 * Resolves the executables of one external tool set and stores their paths in $this->app.
	 * Logs a message to the parent object if the tool is disabled or an executable is missing.
	 *
	 * @param	array		Indexer configuration as returned by getIndexerConfig()
	 * @param	string		Configuration key holding the directory of the tool(s), eg. "pdftools"
	 * @param	array		Names of the executables that must exist in that directory (without ".exe")
	 * @param	string		Prefix of the locallang labels "<prefix>Disabled" and "<prefix>NotFound"
	 * @return	boolean		True if all executables were registered.
	 */
	protected function registerTools(array $indexerConfig, $configKey, array $binaries, $labelPrefix) {
		if (empty($indexerConfig[$configKey])) {
			$this->pObj->log_setTSlogMessage($this->sL('LLL:EXT:indexed_search/locallang.xml:' . $labelPrefix . 'Disabled'), 1);
			return FALSE;
		}

			// If windows, apply extension to tool name:
		$exe = (TYPO3_OS == 'WIN') ? '.exe' : '';
		$toolPath = rtrim($indexerConfig[$configKey], '/') . '/';

			// With safe mode enabled the existence check is skipped.
		if (!t3lib_utility_PhpOptions::isSafeModeEnabled()) {
			foreach ($binaries as $binary) {
				if (!@is_file($toolPath . $binary . $exe)) {
					$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:' . $labelPrefix . 'NotFound'), $toolPath), 3);
					return FALSE;
				}
			}
		}

		foreach ($binaries as $binary) {
			$this->app[$binary] = $toolPath . $binary . $exe;
		}
		return TRUE;
	}

	/**
	 * Initialize external parser for backend modules
	 * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
	 *
	 * @param	string		File extension to initialize for.
	 * @return	boolean		Returns true if the extension is known to this class, otherwise false.
	 */
	public function softInit($extension) {
		switch ($extension) {
			case 'pdf':		// PDF
			case 'doc':		// MS Word files
			case 'pps':		// MS PowerPoint
			case 'ppt':		// MS PowerPoint
			case 'xls':		// MS Excel
			case 'sxc':		// Open Office Calc.
			case 'sxi':		// Open Office Impress
			case 'sxw':		// Open Office Writer
			case 'ods':		// Oasis OpenDocument Spreadsheet
			case 'odp':		// Oasis OpenDocument Presentation
			case 'odt':		// Oasis OpenDocument Text
			case 'rtf':		// RTF documents
			case 'txt':		// ASCII Text documents
			case 'html':	// HTML
			case 'htm':		// HTML
			case 'csv':		// Comma Separated Values
			case 'xml':		// Generic XML
			case 'jpg':		// Jpeg images (EXIF comment)
			case 'jpeg':	// Jpeg images (EXIF comment)
			case 'tif':		// TIF images (EXIF comment)
				return TRUE;
		}
		return FALSE;
	}

	/**
	 * Return title of entry in media type selector box.
	 *
	 * @param	string		File extension
	 * @return	mixed		String with label value of entry in media type search selector box (frontend plugin). False if the extension is ignored, its tool is not configured, or it has no entry of its own ("htm", "jpg", unknown extensions).
	 */
	public function searchTypeMediaTitle($extension) {
			// Read indexer-config
		$indexerConfig = $this->getIndexerConfig();

			// Ignore extensions
		if ($this->isIgnoredExtension($extension, $indexerConfig)) {
			return FALSE;
		}

			// Extension => array(required configuration key or NULL, locallang label suffix).
			// "htm" and "jpg" are left out on purpose (duplicates of "html" / "jpeg").
		$mediaTypes = array(
			'pdf' => array('pdftools', 'PDF'),
			'doc' => array('catdoc', 'DOC'),
			'pps' => array('ppthtml', 'PP'),
			'ppt' => array('ppthtml', 'PP'),
			'xls' => array('xlhtml', 'XLS'),
			'sxc' => array('unzip', 'SXC'),
			'sxi' => array('unzip', 'SXI'),
			'sxw' => array('unzip', 'SXW'),
			'ods' => array('unzip', 'ODS'),
			'odp' => array('unzip', 'ODP'),
			'odt' => array('unzip', 'ODT'),
			'rtf' => array('unrtf', 'RTF'),
			'jpeg' => array(NULL, 'Images'),
			'tif' => array(NULL, 'Images'),
			'html' => array(NULL, 'HTML'),
			'txt' => array(NULL, 'TXT'),
			'csv' => array(NULL, 'CSV'),
			'xml' => array(NULL, 'XML'),
		);

		$extensionKey = (string)$extension;
		if (!isset($mediaTypes[$extensionKey])) {
			return FALSE;
		}

		list($configKey, $labelSuffix) = $mediaTypes[$extensionKey];
		if ($configKey !== NULL && empty($indexerConfig[$configKey])) {
			return FALSE;
		}

		return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:extension.' . $labelSuffix), $extension);
	}

	/**
	 * Returns true if the input extension (item_type) is a potentially a multi-page extension
	 *
	 * @param	string		Extension / item_type string
	 * @return	boolean		Return true if multi-page (currently only "pdf")
	 */
	public function isMultiplePageExtension($extension) {
		return (string)$extension === 'pdf';
	}

	/**
	 * Wraps the "splitLabel function" of the language object.
	 *
	 * @param	string		$reference: Reference/key of the label
	 * @param	boolean		$useHtmlSpecialChar: Convert special chars to HTML entities (default: false)
	 * @return	string		The label of the reference/key to be fetched
	 */
	protected function sL($reference, $useHtmlSpecialChar = false) {
		return $this->langObject->sL($reference, $useHtmlSpecialChar);
	}

	/**
	 * Executes a shell command and returns its output lines.
	 *
	 * @param	string		Complete command line; arguments must already be escaped by the caller
	 * @return	array		Output lines of the command (empty array if there was no output)
	 */
	protected function execCommand($cmd) {
		$output = array();
		t3lib_utility_Command::exec($cmd, $output);
		return is_array($output) ? $output : array();
	}


	/************************
	 *
	 * Reading documents (for parsing)
	 *
	 ************************/

	/**
	 * Reads the content of an external file being indexed.
	 *
	 * @param	string		File extension, eg. "pdf", "doc" etc.
	 * @param	string		Absolute filename of file (must exist and be validated OK before calling function)
	 * @param	string		Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted, formatted "low-high".)
	 * @return	mixed		Standard content array (title, description, keywords, body keys). False if the extension was not initialized or is unknown; NULL if the required tool is not registered or no content could be extracted.
	 */
	public function readFileContent($ext, $absFile, $cPKey) {
		$contentArr = NULL;

			// Return immediately if initialization didn't set support up:
		if (empty($this->supportedExtensions[$ext])) {
			return FALSE;
		}

			// Switch by file extension
		switch ($ext) {
			case 'pdf':
				if (!empty($this->app['pdfinfo'])) {
						// Getting pdf-info:
					$pdfInfo = $this->splitPdfInfo($this->execCommand($this->app['pdfinfo'] . ' ' . escapeshellarg($absFile)));
					if (isset($pdfInfo['pages']) && intval($pdfInfo['pages'])) {
							// Page numbers end up on a command line, so they are forced to integers.
						$pageRange = explode('-', (string)$cPKey);
						$low = intval($pageRange[0]);
						$high = isset($pageRange[1]) ? intval($pageRange[1]) : 0;
						$pageArgs = '';
						if ($low > 0) {
							$pageArgs .= ' -f ' . $low;
						}
						if ($high > 0) {
							$pageArgs .= ' -l ' . $high;
						}

							// Get pdf content:
						$content = '';
						$tempFileName = t3lib_div::tempnam('Typo3_indexer');	// Create temporary name
						@unlink($tempFileName);	// Delete if exists, just to be safe.
						$cmd = $this->app['pdftotext'] . $pageArgs . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
						t3lib_utility_Command::exec($cmd);
						if (@is_file($tempFileName)) {
							$content = t3lib_div::getUrl($tempFileName);
							unlink($tempFileName);
						} else {
							$this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:pdfToolsFailed'), $absFile), 2);
						}
						if (strlen($content)) {
							$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
						}
					}
				}
			break;
			case 'doc':
				if (!empty($this->app['catdoc'])) {
					$content = implode(LF, $this->execCommand($this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile)));
					$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
				}
			break;
			case 'pps':
			case 'ppt':
				if (!empty($this->app['ppthtml'])) {
					$content = implode(LF, $this->execCommand($this->app['ppthtml'] . ' ' . escapeshellarg($absFile)));
					$content = $this->pObj->convertHTMLToUtf8($content);
					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
				}
			break;
			case 'xls':
				if (!empty($this->app['xlhtml'])) {
					$content = implode(LF, $this->execCommand($this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile)));
					$content = $this->pObj->convertHTMLToUtf8($content);
					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
				}
			break;
			case 'sxi':
			case 'sxc':
			case 'sxw':
			case 'ods':
			case 'odp':
			case 'odt':
				if (!empty($this->app['unzip'])) {
						// Read content.xml:
					$content_xml = implode(LF, $this->execCommand($this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' content.xml'));

						// Read meta.xml:
					$meta_xml = implode(LF, $this->execCommand($this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' meta.xml'));

					$utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
					$contentArr = $this->pObj->splitRegularContent($utf8_content);
					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!

						// Meta information (only used if meta.xml could be parsed into a tree):
					$metaTree = t3lib_div::xml2tree($meta_xml);
					if (is_array($metaTree)
						&& isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])
						&& is_array($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])) {
						$metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];

						$metaTitle = $this->getMetaValue($metaContent, 'dc:title');
						if ($metaTitle) {
							$contentArr['title'] = $metaTitle;
						}
						$contentArr['description'] = $this->getMetaValue($metaContent, 'dc:subject') . ' ' . $this->getMetaValue($metaContent, 'dc:description');

							// Keywords collected:
						if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
							if (!isset($contentArr['keywords'])) {
								$contentArr['keywords'] = '';
							}
							foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
								$contentArr['keywords'] .= (isset($kwDat['values'][0]) ? $kwDat['values'][0] : '') . ' ';
							}
						}
					}
				}
			break;
			case 'rtf':
				if (!empty($this->app['unrtf'])) {
					$fileContent = implode(LF, $this->execCommand($this->app['unrtf'] . ' ' . escapeshellarg($absFile)));
					$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
					$contentArr = $this->pObj->splitHTMLContent($fileContent);
				}
			break;
			case 'txt':
			case 'csv':		// Raw text
				$content = t3lib_div::getUrl($absFile);
					// TODO: Auto-registration of charset???? -> utf-8 (Current assuming western europe...)
				$content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
				$contentArr = $this->pObj->splitRegularContent($content);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
			case 'html':
			case 'htm':
				$fileContent = t3lib_div::getUrl($absFile);
				$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
				$contentArr = $this->pObj->splitHTMLContent($fileContent);
			break;
			case 'xml':		// PHP strip-tags()
				$fileContent = t3lib_div::getUrl($absFile);

					// Finding charset (from the XML declaration within the first 200 bytes; utf-8 if none is declared):
				$reg = array();
				preg_match('/^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
				$charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';

					// Converting content:
				$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
				$contentArr = $this->pObj->splitRegularContent($fileContent);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
			case 'jpg':		// PHP EXIF
			case 'jpeg':	// PHP EXIF
			case 'tif':		// PHP EXIF
				$exif = function_exists('exif_read_data') ? exif_read_data($absFile, 'IFD0') : FALSE;

				$comment = '';
				if ($exif) {
						// The comments in JPEG files are utf-8, while in Tif files they are 7-bit ascii.
					$exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
					$exifDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
					$comment = trim($exifComment . ' ' . $exifDescription);
				}
				$contentArr = $this->pObj->splitRegularContent($comment);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
			default:
				return FALSE;
		}

			// If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
		if (is_array($contentArr) && empty($contentArr['title'])) {
			$contentArr['title'] = str_replace('_', ' ', basename($absFile));	// Substituting "_" for " " because many filenames may have this instead of a space char.
		}

		return $contentArr;
	}

	/**
	 * Fetches the first value of a tag from the "office:meta" branch of a parsed OpenDocument meta.xml tree.
	 *
	 * @param	array		The "ch" (children) array of the office:meta element
	 * @param	string		Tag name, eg. "dc:title"
	 * @return	string		First value of the tag, or empty string if the tag is absent.
	 */
	protected function getMetaValue(array $metaContent, $tagName) {
		return isset($metaContent[$tagName][0]['values'][0]) ? $metaContent[$tagName][0]['values'][0] : '';
	}

	/**
	 * Creates an array with pointers to divisions of document.
	 * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero) coming back.
	 * The same single-element array is returned for PDF files if the pdfinfo tool is not registered or reports no pages.
	 *
	 * @param	string		File extension
	 * @param	string		Absolute filename (must exist and be validated OK before calling function)
	 * @return	array		Array of pointers ("low-high" page ranges) to sections that the document should be divided into
	 */
	public function fileContentParts($ext, $absFile) {
		$cParts = array(0);

			// Without a registered pdfinfo tool there is no command to run.
		if ($ext != 'pdf' || empty($this->app['pdfinfo'])) {
			return $cParts;
		}

			// Getting pdf-info:
		$pdfInfo = $this->splitPdfInfo($this->execCommand($this->app['pdfinfo'] . ' ' . escapeshellarg($absFile)));
		$pages = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;

		if ($pages > 0) {
			$cParts = array();

				// Calculate mode
			if ($this->pdf_mode > 0) {
				$iter = (int)ceil($pages / $this->pdf_mode);
			} else {
				$iter = t3lib_div::intInRange(abs($this->pdf_mode), 1, $pages);
			}

				// Traverse and create intervals.
				// Multiplying before dividing keeps the boundaries exact, so the last interval always ends on the last page.
			for ($a = 0; $a < $iter; $a++) {
				$low = (int)floor(($a * $pages) / $iter) + 1;
				$high = (int)floor((($a + 1) * $pages) / $iter);
				$cParts[] = $low . '-' . $high;
			}
		}

		return $cParts;
	}

	/**
	 * Analysing PDF info into a useable format.
	 * Each "Key: value" line becomes an entry with the lowercased, trimmed key; lines without a colon or with an empty key are skipped.
	 *
	 * @param	array		Array of PDF content, coming from the pdfinfo tool
	 * @return	array		Result array
	 * @access private
	 * @see fileContentParts()
	 */
	public function splitPdfInfo($pdfInfoArray) {
		$res = array();
		if (is_array($pdfInfoArray)) {
			foreach ($pdfInfoArray as $line) {
				$parts = explode(':', $line, 2);
				if (count($parts) > 1 && trim($parts[0])) {
					$res[strtolower(trim($parts[0]))] = trim($parts[1]);
				}
			}
		}
		return $res;
	}

	/**
	 * Removes some strange char(12) characters and line breaks that tend to occur in the end of the string from external files.
	 *
	 * @param	string		String to clean up
	 * @return	string		String
	 */
	public function removeEndJunk($string) {
		return trim(preg_replace('/[' . LF . chr(12) . ']*$/', '', $string));
	}


	/************************
	 *
	 * Backend analyzer
	 *
	 ************************/

	/**
	 * Return icon for file extension
	 *
	 * @param	string		File extension, lowercase.
	 * @return	string		Relative file reference, resolvable by t3lib_div::getFileAbsFileName()
	 */
	public function getIcon($extension) {
		if ($extension == 'htm') {
			$extension = 'html';
		}
		if ($extension == 'jpeg') {
			$extension = 'jpg';
		}
		return 'EXT:indexed_search/pi/res/' . $extension . '.gif';
	}
}

if (defined('TYPO3_MODE') && isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php'])) {
	include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);
}

?>
1.8.0