TYPO3 API  SVNRelease
class.external_parser.php
Go to the documentation of this file.
00001 <?php
00002 /***************************************************************
00003 *  Copyright notice
00004 *
00005 *  (c) 2001-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
00006 *  All rights reserved
00007 *
00008 *  This script is part of the TYPO3 project. The TYPO3 project is
00009 *  free software; you can redistribute it and/or modify
00010 *  it under the terms of the GNU General Public License as published by
00011 *  the Free Software Foundation; either version 2 of the License, or
00012 *  (at your option) any later version.
00013 *
00014 *  The GNU General Public License can be found at
00015 *  http://www.gnu.org/copyleft/gpl.html.
00016 *  A copy is found in the textfile GPL.txt and important notices to the license
00017 *  from the author is found in LICENSE.txt distributed with these scripts.
00018 *
00019 *
00020 *  This script is distributed in the hope that it will be useful,
00021 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00022 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00023 *  GNU General Public License for more details.
00024 *
00025 *  This copyright notice MUST APPEAR in all copies of the script!
00026 ***************************************************************/
00027 /**
00028  * External standard parsers for indexed_search
00029  *
00030  * @author  Kasper Skårhøj <kasperYYYY@typo3.com>
00031  * @coauthor    Olivier Simah <noname_paris@yahoo.fr>
00032  */
00033 /**
00034  * [CLASS/FUNCTION INDEX of SCRIPT]
00035  *
00036  *
00037  *
00038  *   75: class tx_indexed_search_extparse
00039  *   94:     function initParser($extension)
00040  *  214:     function softInit($extension)
00041  *  247:     function searchTypeMediaTitle($extension)
00042  *  323:     function isMultiplePageExtension($extension)
00043  *
00044  *              SECTION: Reading documents (for parsing)
00045  *  354:     function readFileContent($ext,$absFile,$cPKey)
00046  *  521:     function fileContentParts($ext,$absFile)
00047  *  560:     function splitPdfInfo($pdfInfoArray)
00048  *  579:     function removeEndJunk($string)
00049  *
00050  *              SECTION: Backend analyzer
00051  *  606:     function getIcon($extension)
00052  *
00053  * TOTAL FUNCTIONS: 9
00054  * (This index is automatically created/updated by the extension "extdeveval")
00055  *
00056  */
00057 
00058 
00059 
00060 
00061 
00062 
00063 
00064 
00065 
00066 
/**
 * External standard parsers for indexed_search.
 * MUST RETURN utf-8 content!
 *
 * Typical usage, as far as this class shows: call initParser() for a file
 * extension first (it registers the external tools and marks the extension as
 * supported), then fileContentParts() and readFileContent() to extract the
 * text. softInit(), searchTypeMediaTitle() and getIcon() only describe what
 * could be supported and need no configured tools.
 *
 * @author  Kasper Skårhøj <kasperYYYY@typo3.com>
 * @package TYPO3
 * @subpackage tx_indexedsearch
 */
class tx_indexed_search_extparse {

    /**
     * PDF splitting mode; overridden from the extension configuration in initParser().
     *
     * - zero: the whole PDF file is indexed as one section.
     * - positive value: upper bound for the number of pages per section. The document
     *   is cut into ceil(pages / value) sections of (nearly) equal size, e.g. "5" on a
     *   12 page document gives 1-4, 5-8, 9-12.
     * - negative value: the absolute value is the number of sections (never more than
     *   there are pages), e.g. "-3" on a 10 page document gives 1-3, 4-6, 7-10.
     *
     * @var integer
     */
    public $pdf_mode = -20;

    /**
     * Absolute paths of the external tools, keyed by tool name ("pdfinfo", "catdoc", ...).
     * Filled by initParser().
     *
     * @var array
     */
    public $app = array();

    /**
     * Maps an initialized file extension to its common item_type (e.g. "htm" => "html").
     * Filled by initParser().
     *
     * @var array
     */
    public $ext2itemtype_map = array();

    /**
     * File extensions that initParser() accepted, as array keys with value TRUE.
     *
     * @var array
     */
    public $supportedExtensions = array();

    /**
     * Reference to parent object (indexer class). Used for logging and for
     * splitting/converting the extracted content.
     *
     * @var object
     */
    public $pObj;

    /**
     * Reference to the language object whose sL() method resolves "LLL:" labels.
     *
     * @var object
     */
    protected $langObject;

    /**
     * Constructs this external parsers object
     */
    public function __construct() {
            // Set the language object to be used accordant to current TYPO3_MODE:
        $this->langObject = (TYPO3_MODE == 'FE' ? $GLOBALS['TSFE'] : $GLOBALS['LANG']);
    }

    /**
     * Initialize external parser for parsing content.
     *
     * On success the extension is registered in $this->supportedExtensions and
     * $this->ext2itemtype_map, and the required tools in $this->app.
     *
     * @param   string      File extension
     * @return  boolean     Returns true if extension is supported/enabled, otherwise false.
     */
    public function initParser($extension) {
        $indexerConfig = $this->getIndexerConfig();

            // Ignore extensions
        if ($this->isIgnoredExtension($extension, $indexerConfig)) {
            $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:ignoreExtensions'), $extension), 1);
            return FALSE;
        }

        $extOK = FALSE;
        $mainExtension = '';

            // Switch on file extension:
        switch ($extension) {
            case 'pdf':     // PDF (pdftotext + pdfinfo)
                $extOK = $this->registerExternalTools($indexerConfig, 'pdftools', array('pdftotext', 'pdfinfo'), 'pdfTools');
                if ($extOK) {
                        // PDF mode:
                    $this->pdf_mode = t3lib_div::intInRange(isset($indexerConfig['pdf_mode']) ? $indexerConfig['pdf_mode'] : 0, -100, 100);
                }
            break;
            case 'doc':     // MS Word (catdoc)
                $extOK = $this->registerExternalTools($indexerConfig, 'catdoc', array('catdoc'), 'catdoc');
            break;
            case 'pps':     // MS PowerPoint(?)
            case 'ppt':     // MS PowerPoint (ppthtml)
                $extOK = $this->registerExternalTools($indexerConfig, 'ppthtml', array('ppthtml'), 'ppthtml');
            break;
            case 'xls':     // MS Excel (xlhtml)
                $extOK = $this->registerExternalTools($indexerConfig, 'xlhtml', array('xlhtml'), 'xlhtml');
            break;
            case 'sxc':     // Open Office Calc.
            case 'sxi':     // Open Office Impress
            case 'sxw':     // Open Office Writer
            case 'ods':     // Oasis OpenDocument Spreadsheet
            case 'odp':     // Oasis OpenDocument Presentation
            case 'odt':     // Oasis OpenDocument Text
                $extOK = $this->registerExternalTools($indexerConfig, 'unzip', array('unzip'), 'unzip');
            break;
            case 'rtf':     // RTF (unrtf)
                $extOK = $this->registerExternalTools($indexerConfig, 'unrtf', array('unrtf'), 'unrtf');
            break;
            case 'txt':     // Raw text
            case 'csv':     // Raw text
            case 'xml':     // PHP strip-tags()
            case 'tif':     // PHP EXIF
                $extOK = TRUE;
            break;
            case 'html':    // PHP strip-tags()
            case 'htm':     // PHP strip-tags()
                $extOK = TRUE;
                $mainExtension = 'html';    // making "html" the common "item_type"
            break;
            case 'jpg':     // PHP EXIF
            case 'jpeg':    // PHP EXIF
                $extOK = TRUE;
                $mainExtension = 'jpeg';    // making "jpeg" the common item_type
            break;
        }

        if (!$extOK) {
                // Explicit FALSE (instead of falling off the end) as documented.
            return FALSE;
        }

        $this->supportedExtensions[$extension] = TRUE;
        $this->ext2itemtype_map[$extension] = $mainExtension ? $mainExtension : $extension;
        return TRUE;
    }

    /**
     * Reads the indexed_search extension configuration.
     *
     * @return  array       Unserialized configuration; empty array if it is missing or not an array.
     */
    protected function getIndexerConfig() {
        if (!isset($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'])) {
            return array();
        }
        $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
        return is_array($indexerConfig) ? $indexerConfig : array();
    }

    /**
     * Tells whether an extension is listed in the "ignoreExtensions" configuration
     * (comma separated list, compared in lowercase).
     *
     * @param   string      File extension
     * @param   array       Indexer configuration as returned by getIndexerConfig()
     * @return  boolean     TRUE if the extension shall be ignored
     */
    protected function isIgnoredExtension($extension, array $indexerConfig) {
        $ignoreList = isset($indexerConfig['ignoreExtensions']) ? strtolower($indexerConfig['ignoreExtensions']) : '';
        $ignoreExtensions = t3lib_div::trimExplode(',', $ignoreList, 1);
        return in_array((string)$extension, $ignoreExtensions, TRUE);
    }

    /**
     * Registers the external tools of one parser in $this->app.
     *
     * The directory is read from $indexerConfig[$configKey]. Unless PHP safe mode is
     * enabled, every tool must exist as a file in that directory. Failures are written
     * to the parent object's log using the labels "<labelPrefix>Disabled" and
     * "<labelPrefix>NotFound".
     *
     * @param   array       Indexer configuration as returned by getIndexerConfig()
     * @param   string      Configuration key holding the tool directory
     * @param   array       Tool names (without path and without ".exe")
     * @param   string      Prefix of the locallang labels used for log messages
     * @return  boolean     TRUE if all tools were registered
     */
    protected function registerExternalTools(array $indexerConfig, $configKey, array $toolNames, $labelPrefix) {
        if (empty($indexerConfig[$configKey])) {
            $this->pObj->log_setTSlogMessage($this->sL('LLL:EXT:indexed_search/locallang.xml:' . $labelPrefix . 'Disabled'), 1);
            return FALSE;
        }

            // If windows, apply extension to tool name:
        $exe = (TYPO3_OS === 'WIN') ? '.exe' : '';
        $toolPath = rtrim($indexerConfig[$configKey], '/') . '/';

            // In safe mode the files cannot be checked, so they are assumed to exist.
        if (!t3lib_utility_PhpOptions::isSafeModeEnabled()) {
            foreach ($toolNames as $toolName) {
                if (!@is_file($toolPath . $toolName . $exe)) {
                    $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:' . $labelPrefix . 'NotFound'), $toolPath), 3);
                    return FALSE;
                }
            }
        }

        foreach ($toolNames as $toolName) {
            $this->app[$toolName] = $toolPath . $toolName . $exe;
        }
        return TRUE;
    }

    /**
     * Initialize external parser for backend modules
     * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
     *
     * @param   string      File extension to initialize for.
     * @return  boolean     Returns true if the extension is one of the known extensions, otherwise false.
     */
    public function softInit($extension) {
        switch ($extension) {
            case 'pdf':     // PDF
            case 'doc':     // MS Word files
            case 'pps':     // MS PowerPoint
            case 'ppt':     // MS PowerPoint
            case 'xls':     // MS Excel
            case 'sxc':     // Open Office Calc.
            case 'sxi':     // Open Office Impress
            case 'sxw':     // Open Office Writer
            case 'ods':     // Oasis OpenDocument Spreadsheet
            case 'odp':     // Oasis OpenDocument Presentation
            case 'odt':     // Oasis OpenDocument Text
            case 'rtf':     // RTF documents
            case 'txt':     // ASCII Text documents
            case 'html':    // HTML
            case 'htm':     // HTML
            case 'csv':     // Comma Separated Values
            case 'xml':     // Generic XML
            case 'jpg':     // Jpeg images (EXIF comment)
            case 'jpeg':    // Jpeg images (EXIF comment)
            case 'tif':     // TIF images (EXIF comment)
                return TRUE;
        }
        return FALSE;
    }

    /**
     * Return title of entry in media type selector box.
     *
     * @param   string      File extension
     * @return  mixed       String with label value of entry in media type search selector box (frontend plugin). FALSE if there is no entry (ignored extension, disabled parser, duplicate or unknown extension).
     */
    public function searchTypeMediaTitle($extension) {
            // Read indexer-config
        $indexerConfig = $this->getIndexerConfig();

            // Ignore extensions
        if ($this->isIgnoredExtension($extension, $indexerConfig)) {
            return FALSE;
        }

            // extension => array(configuration key that must be set ('' = none), label suffix)
            // "htm" and "jpg" are intentionally missing: they are duplicates of "html" / "jpeg".
        $titleMap = array(
            'pdf'  => array('pdftools', 'PDF'),
            'doc'  => array('catdoc', 'DOC'),
            'pps'  => array('ppthtml', 'PP'),       // MS PowerPoint(?)
            'ppt'  => array('ppthtml', 'PP'),       // MS PowerPoint
            'xls'  => array('xlhtml', 'XLS'),       // MS Excel
            'sxc'  => array('unzip', 'SXC'),        // Open Office Calc.
            'sxi'  => array('unzip', 'SXI'),        // Open Office Impress
            'sxw'  => array('unzip', 'SXW'),        // Open Office Writer
            'ods'  => array('unzip', 'ODS'),        // Oasis OpenDocument Spreadsheet
            'odp'  => array('unzip', 'ODP'),        // Oasis OpenDocument Presentation
            'odt'  => array('unzip', 'ODT'),        // Oasis OpenDocument Text
            'rtf'  => array('unrtf', 'RTF'),
            'jpeg' => array('', 'Images'),          // PHP EXIF
            'tif'  => array('', 'Images'),          // PHP EXIF
            'html' => array('', 'HTML'),            // PHP strip-tags()
            'txt'  => array('', 'TXT'),             // Raw text
            'csv'  => array('', 'CSV'),             // Raw text
            'xml'  => array('', 'XML'),             // PHP strip-tags()
        );

        $mapKey = (string)$extension;
        if (!isset($titleMap[$mapKey])) {
            return FALSE;
        }

        list($configKey, $labelSuffix) = $titleMap[$mapKey];
        if ($configKey !== '' && empty($indexerConfig[$configKey])) {
                // The parser for this type is not configured.
            return FALSE;
        }

        return sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:extension.' . $labelSuffix), $extension);
    }

    /**
     * Returns true if the input extension (item_type) is a potentially a multi-page extension
     *
     * @param   string      Extension / item_type string
     * @return  boolean     Return true if multi-page (currently only "pdf"), otherwise false
     */
    public function isMultiplePageExtension($extension) {
        return (string)$extension === 'pdf';
    }

    /**
     * Wraps the "splitLabel function" of the language object.
     *
     * @param   string      $reference: Reference/key of the label
     * @param   boolean     $useHtmlSpecialChar: Convert special chars to HTML entities (default: false)
     * @return  string      The label of the reference/key to be fetched
     */
    protected function sL($reference, $useHtmlSpecialChar = false) {
        return $this->langObject->sL($reference, $useHtmlSpecialChar);
    }


    /************************
     *
     * Reading documents (for parsing)
     *
     ************************/

    /**
     * Reads the content of an external file being indexed.
     *
     * @param   string      File extension, eg. "pdf", "doc" etc.
     * @param   string      Absolute filename of file (must exist and be validated OK before calling function)
     * @param   string      Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
     * @return  mixed       Standard content array (title, description, keywords, body keys). FALSE if the extension was not initialized or is unknown. NULL if nothing could be extracted.
     */
    public function readFileContent($ext, $absFile, $cPKey) {
        $contentArr = NULL;

            // Return immediately if initialization didn't set support up:
        if (empty($this->supportedExtensions[$ext])) {
            return FALSE;
        }

            // Switch by file extension
        switch ($ext) {
            case 'pdf':
                if (!empty($this->app['pdfinfo'])) {
                        // Getting pdf-info:
                    $pdfInfo = $this->splitPdfInfo($this->execToLines($this->app['pdfinfo'] . ' ' . escapeshellarg($absFile)));
                    if (isset($pdfInfo['pages']) && intval($pdfInfo['pages'])) {
                            // The section pointer has the form "low-high". Both values are forced
                            // to integers because they become part of a shell command.
                        $range = explode('-', $cPKey, 2);
                        $low = intval($range[0]);
                        $high = isset($range[1]) ? intval($range[1]) : 0;

                            // Get pdf content:
                        $tempFileName = t3lib_div::tempnam('Typo3_indexer');        // Create temporary name
                        @unlink($tempFileName);     // Delete if exists, just to be safe.
                        $cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . escapeshellarg($tempFileName);
                        t3lib_utility_Command::exec($cmd);

                        $content = '';
                        if (@is_file($tempFileName)) {
                            $content = t3lib_div::getUrl($tempFileName);
                            unlink($tempFileName);
                        } else {
                            $this->pObj->log_setTSlogMessage(sprintf($this->sL('LLL:EXT:indexed_search/locallang.xml:pdfToolsFailed'), $absFile), 2);
                        }
                        if (strlen($content)) {
                            $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                        }
                    }
                }
            break;
            case 'doc':
                if (!empty($this->app['catdoc'])) {
                    $content = $this->execToString($this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile));
                    $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                }
            break;
            case 'pps':
            case 'ppt':
                if (!empty($this->app['ppthtml'])) {
                    $content = $this->execToString($this->app['ppthtml'] . ' ' . escapeshellarg($absFile));
                    $content = $this->pObj->convertHTMLToUtf8($content);
                    $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                    $contentArr['title'] = basename($absFile);  // Make sure the title doesn't expose the absolute path!
                }
            break;
            case 'xls':
                if (!empty($this->app['xlhtml'])) {
                    $content = $this->execToString($this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile));
                    $content = $this->pObj->convertHTMLToUtf8($content);
                    $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                    $contentArr['title'] = basename($absFile);  // Make sure the title doesn't expose the absolute path!
                }
            break;
            case 'sxi':
            case 'sxc':
            case 'sxw':
            case 'ods':
            case 'odp':
            case 'odt':
                if (!empty($this->app['unzip'])) {
                        // Read content.xml:
                    $content_xml = $this->execToString($this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' content.xml');

                        // Read meta.xml:
                    $meta_xml = $this->execToString($this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' meta.xml');

                        // A space is put in front of every tag so words of adjacent elements don't melt together.
                    $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
                    $contentArr = $this->pObj->splitRegularContent($utf8_content);
                    $contentArr['title'] = basename($absFile);  // Make sure the title doesn't expose the absolute path!

                        // Meta information (the parse result is only used if it is an array with the expected branch)
                    $metaTree = t3lib_div::xml2tree($meta_xml);
                    $metaContent = NULL;
                    if (is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])) {
                        $metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
                    }
                    if (is_array($metaContent)) {
                        $metaTitle = $this->firstMetaValue($metaContent, 'dc:title');
                        if ($metaTitle) {
                            $contentArr['title'] = $metaTitle;
                        }
                        $contentArr['description'] = $this->firstMetaValue($metaContent, 'dc:subject') . ' ' . $this->firstMetaValue($metaContent, 'dc:description');

                            // Keywords collected:
                        if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
                            if (!isset($contentArr['keywords'])) {
                                $contentArr['keywords'] = '';
                            }
                            foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
                                $contentArr['keywords'] .= (isset($kwDat['values'][0]) ? $kwDat['values'][0] : '') . ' ';
                            }
                        }
                    }
                }
            break;
            case 'rtf':
                if (!empty($this->app['unrtf'])) {
                    $fileContent = $this->execToString($this->app['unrtf'] . ' ' . escapeshellarg($absFile));
                    $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
                    $contentArr = $this->pObj->splitHTMLContent($fileContent);
                }
            break;
            case 'txt':
            case 'csv':     // Raw text
                $content = t3lib_div::getUrl($absFile);
                    // TODO: Auto-registration of charset???? -> utf-8 (Current assuming western europe...)
                $content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
                $contentArr = $this->pObj->splitRegularContent($content);
                $contentArr['title'] = basename($absFile);  // Make sure the title doesn't expose the absolute path!
            break;
            case 'html':
            case 'htm':
                $fileContent = t3lib_div::getUrl($absFile);
                $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
                $contentArr = $this->pObj->splitHTMLContent($fileContent);
            break;
            case 'xml':     // PHP strip-tags()
                $fileContent = t3lib_div::getUrl($absFile);

                    // Finding charset in the XML declaration (only the first 200 bytes are inspected):
                $reg = array();
                preg_match('/^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent, 0, 200), $reg);
                $charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';

                    // Converting content:
                $fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<', ' <', $fileContent)), $charset);
                $contentArr = $this->pObj->splitRegularContent($fileContent);
                $contentArr['title'] = basename($absFile);  // Make sure the title doesn't expose the absolute path!
            break;
            case 'jpg':     // PHP EXIF
            case 'jpeg':    // PHP EXIF
            case 'tif':     // PHP EXIF
                $exif = function_exists('exif_read_data') ? exif_read_data($absFile, 'IFD0') : FALSE;

                $comment = '';
                if ($exif) {
                        // The comments in JPEG files are utf-8, while in Tif files they are 7-bit ascii.
                    $exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
                    $exifDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
                    $comment = trim($exifComment . ' ' . $exifDescription);
                }
                $contentArr = $this->pObj->splitRegularContent($comment);
                $contentArr['title'] = basename($absFile);  // Make sure the title doesn't expose the absolute path!
            break;
            default:
                return FALSE;
        }

            // If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
        if (is_array($contentArr) && empty($contentArr['title'])) {
            $contentArr['title'] = str_replace('_', ' ', basename($absFile)); // Substituting "_" for " " because many filenames may have this instead of a space char.
        }

        return $contentArr;
    }

    /**
     * Executes a shell command and returns its output lines.
     *
     * @param   string      Complete command line (arguments must already be escaped)
     * @return  array       Output lines; empty array if there was no output
     */
    protected function execToLines($cmd) {
        $lines = array();
        t3lib_utility_Command::exec($cmd, $lines);
        return is_array($lines) ? $lines : array();
    }

    /**
     * Executes a shell command and returns its output as one string, lines joined with LF.
     *
     * @param   string      Complete command line (arguments must already be escaped)
     * @return  string      Command output
     */
    protected function execToString($cmd) {
        return implode(LF, $this->execToLines($cmd));
    }

    /**
     * Returns the first value of an element from the "office:meta" branch of a parsed meta.xml.
     *
     * @param   array       Children of the office:meta element (t3lib_div::xml2tree() structure)
     * @param   string      Element name, e.g. "dc:title"
     * @return  string      The value, or an empty string if the element is missing
     */
    protected function firstMetaValue(array $metaContent, $elementName) {
        return isset($metaContent[$elementName][0]['values'][0]) ? $metaContent[$elementName][0]['values'][0] : '';
    }

    /**
     * Creates an array with pointers to divisions of document.
     * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero) coming back.
     * The same single element array is returned for a PDF if pdfinfo is not registered or reports no pages.
     *
     * @param   string      File extension
     * @param   string      Absolute filename (must exist and be validated OK before calling function)
     * @return  array       Array of pointers ("low-high" page ranges for PDF) to sections that the document should be divided into
     */
    public function fileContentParts($ext, $absFile) {
        $cParts = array(0);

            // Without a registered pdfinfo tool the command line would start with the file name itself.
        if ((string)$ext === 'pdf' && !empty($this->app['pdfinfo'])) {
                // Getting pdf-info:
            $pdfInfo = $this->splitPdfInfo($this->execToLines($this->app['pdfinfo'] . ' ' . escapeshellarg($absFile)));

            $sections = $this->calculatePdfSections(isset($pdfInfo['pages']) ? $pdfInfo['pages'] : 0);
            if (count($sections)) {
                $cParts = $sections;
            }
        }
        return $cParts;
    }

    /**
     * Divides a number of pages into "low-high" ranges according to $this->pdf_mode.
     *
     * @param   integer     Number of pages in the document
     * @return  array       List of "low-high" strings covering page 1 to the last page; empty if there are no pages
     */
    protected function calculatePdfSections($pages) {
        $pages = intval($pages);
        if ($pages < 1) {
            return array();
        }

            // Calculate the number of sections:
        if ($this->pdf_mode > 0) {
            $sectionCount = (int)ceil($pages / $this->pdf_mode);
        } else {
            $sectionCount = max(1, min(abs(intval($this->pdf_mode)), $pages));
        }

            // Traverse and create intervals. The product is calculated before the division:
            // "$a * ($pages / $sectionCount)" can end up slightly below a whole number
            // (e.g. 100 * (115 / 100)), which made floor() drop the last page.
        $sections = array();
        for ($a = 0; $a < $sectionCount; $a++) {
            $low = (int)floor(($a * $pages) / $sectionCount) + 1;
            $high = (int)floor((($a + 1) * $pages) / $sectionCount);
            $sections[] = $low . '-' . $high;
        }
        return $sections;
    }

    /**
     * Analysing PDF info into a useable format.
     * Every "Key: value" line becomes an entry with the lowercased, trimmed key. Lines without a colon or without a key are skipped.
     *
     * @param   array       Array of PDF content, coming from the pdfinfo tool
     * @return  array       Result array
     * @access private
     * @see fileContentParts()
     */
    public function splitPdfInfo($pdfInfoArray) {
        $res = array();
        if (!is_array($pdfInfoArray)) {
            return $res;
        }
        foreach ($pdfInfoArray as $line) {
                // Only split at the first colon; values (e.g. dates) may contain colons themselves.
            $parts = explode(':', $line, 2);
            if (count($parts) > 1 && trim($parts[0])) {
                $res[strtolower(trim($parts[0]))] = trim($parts[1]);
            }
        }
        return $res;
    }

    /**
     * Removes some strange char(12) characters and line breaks that tend to occur in the end of the string from external files.
     * The result is also trimmed on both ends.
     *
     * @param   string      String to clean up
     * @return  string      String
     */
    public function removeEndJunk($string) {
        return trim(preg_replace('/['.LF.chr(12).']*$/', '', $string));
    }


    /************************
     *
     * Backend analyzer
     *
     ************************/

    /**
     * Return icon for file extension
     *
     * @param   string      File extension, lowercase.
     * @return  string      Relative file reference, resolvable by t3lib_div::getFileAbsFileName()
     */
    public function getIcon($extension) {
            // "htm" and "jpeg" share the icon of "html" and "jpg".
        if ((string)$extension === 'htm') {
            $extension = 'html';
        } elseif ((string)$extension === 'jpeg') {
            $extension = 'jpg';
        }
        return 'EXT:indexed_search/pi/res/' . $extension . '.gif';
    }
}
00674 
// XCLASS hook: if an extending class file is registered in TYPO3_CONF_VARS
// for this script under the current TYPO3_MODE, load it (once).
if (
    defined('TYPO3_MODE')
    && isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php'])
) {
    include_once $GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php'];
}
00678 
00679 ?>