TYPO3 API  SVNRelease
class.t3lib_parsehtml.php
Go to the documentation of this file.
00001 <?php
00002 /***************************************************************
00003  *  Copyright notice
00004  *
00005  *  (c) 1999-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
00006  *  All rights reserved
00007  *
00008  *  This script is part of the TYPO3 project. The TYPO3 project is
00009  *  free software; you can redistribute it and/or modify
00010  *  it under the terms of the GNU General Public License as published by
00011  *  the Free Software Foundation; either version 2 of the License, or
00012  *  (at your option) any later version.
00013  *
00014  *  The GNU General Public License can be found at
00015  *  http://www.gnu.org/copyleft/gpl.html.
00016  *  A copy is found in the textfile GPL.txt and important notices to the license
00017  *  from the author is found in LICENSE.txt distributed with these scripts.
00018  *
00019  *
00020  *  This script is distributed in the hope that it will be useful,
00021  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00022  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00023  *  GNU General Public License for more details.
00024  *
00025  *  This copyright notice MUST APPEAR in all copies of the script!
00026  ***************************************************************/
00027 /**
00028  * Contains class with functions for parsing HTML code.
00029  *
00030  * $Id: class.t3lib_parsehtml.php 10121 2011-01-18 20:15:30Z ohader $
00031  * Revised for TYPO3 3.6 July/2003 by Kasper Skårhøj
00032  *
00033  * @author  Kasper Skårhøj <kasperYYYY@typo3.com>
00034  */
00035 /**
00036  * [CLASS/FUNCTION INDEX of SCRIPT]
00037  *
00038  *
00039  *
00040  *  106: class t3lib_parsehtml
00041  *  123:     function getSubpart($content, $marker)
00042  *  156:     function substituteSubpart($content,$marker,$subpartContent,$recursive=1,$keepMarker=0)
00043  *
00044  *            SECTION: Parsing HTML code
00045  *  247:     function splitIntoBlock($tag,$content,$eliminateExtraEndTags=0)
00046  *  308:     function splitIntoBlockRecursiveProc($tag,$content,&$procObj,$callBackContent,$callBackTags,$level=0)
00047  *  344:     function splitTags($tag,$content)
00048  *  378:     function getAllParts($parts,$tag_parts=1,$include_tag=1)
00049  *  396:     function removeFirstAndLastTag($str)
00050  *  412:     function getFirstTag($str)
00051  *  426:     function getFirstTagName($str,$preserveCase=FALSE)
00052  *  445:     function get_tag_attributes($tag,$deHSC=0)
00053  *  486:     function split_tag_attributes($tag)
00054  *  524:     function checkTagTypeCounts($content,$blockTags='a,b,blockquote,body,div,em,font,form,h1,h2,h3,h4,h5,h6,i,li,map,ol,option,p,pre,select,span,strong,table,td,textarea,tr,u,ul', $soloTags='br,hr,img,input,area')
00055  *
00056  *            SECTION: Clean HTML code
00057  *  617:     function HTMLcleaner($content, $tags=array(),$keepAll=0,$hSC=0,$addConfig=array())
00058  *  814:     function bidir_htmlspecialchars($value,$dir)
00059  *  837:     function prefixResourcePath($main_prefix,$content,$alternatives=array(),$suffix='')
00060  *  919:     function prefixRelPath($prefix,$srcVal,$suffix='')
00061  *  937:     function cleanFontTags($value,$keepFace=0,$keepSize=0,$keepColor=0)
00062  *  967:     function mapTags($value,$tags=array(),$ltChar='<',$ltChar2='<')
00063  *  982:     function unprotectTags($content,$tagList='')
00064  * 1015:     function stripTagsExcept($value,$tagList)
00065  * 1038:     function caseShift($str,$flag,$cacheKey='')
00066  * 1065:     function compileTagAttribs($tagAttrib,$meta=array(), $xhtmlClean=0)
00067  * 1093:     function get_tag_attributes_classic($tag,$deHSC=0)
00068  * 1106:     function indentLines($content, $number=1, $indentChar=TAB)
00069  * 1123:     function HTMLparserConfig($TSconfig,$keepTags=array())
00070  * 1247:     function XHTML_clean($content)
00071  * 1269:     function processTag($value,$conf,$endTag,$protected=0)
00072  * 1315:     function processContent($value,$dir,$conf)
00073  *
00074  * TOTAL FUNCTIONS: 28
00075  * (This index is automatically created/updated by the extension "extdeveval")
00076  *
00077  */
00078 
00079 
00080 /**
00081  * Functions for parsing HTML.
00082  * You are encouraged to use this class in your own applications
00083  *
00084  * @author  Kasper Skårhøj <kasperYYYY@typo3.com>
00085  * @package TYPO3
00086  * @subpackage t3lib
00087  */
00088 class t3lib_parsehtml {
00089 
00090     protected $caseShift_cache = array();
00091         // Void elements that do not have closing tags, as defined by HTML5, except link element
00092     const VOID_ELEMENTS = 'area|base|br|col|command|embed|hr|img|input|keygen|meta|param|source|track|wbr';
00093 
/**
 * Extracts the text enclosed by the first two occurrences of $marker in $content.
 *
 * Markers are frequently embedded in HTML comments (eg. "<!-- ###PART### begin -->").
 * In that case the comment remainders that end up at the edges of the extracted
 * text (a leading "... -->" and/or a trailing "<!-- ...") are cut away.
 *
 * @param string $content Content with the subpart wrapped in fx. "###CONTENT_PART###" inside.
 * @param string $marker Marker string, eg. "###CONTENT_PART###"
 * @return string The subpart, or an empty string if the marker is not found twice
 */
public static function getSubpart($content, $marker) {
    $openPos = strpos($content, $marker);
    if ($openPos === FALSE) {
        return '';
    }

    $innerStart = $openPos + strlen($marker);
    $closePos = strpos($content, $marker, $innerStart);
    if ($closePos === FALSE) {
            // No closing marker: nothing is returned (rather than everything up to the end)
        return '';
    }

    $inner = substr($content, $innerStart, $closePos - $innerStart);

        // Each entry: pattern to try, and the capture group holding the cleaned subpart.
        // Order matters: comment remainders on both sides, then trailing only, then leading only.
    $strippers = array(
        array('/^([^<]*\-\->)(.*)(<\!\-\-[^>]*)$/s', 2),
        array('/(.*)(<\!\-\-[^>]*)$/s', 1),
        array('/^([^<]*\-\->)(.*)$/s', 2),
    );
    foreach ($strippers as $stripper) {
        $found = array();
        if (preg_match($stripper[0], $inner, $found) === 1) {
            return $found[$stripper[1]];
        }
    }

    return $inner;
}
00137 
/**
 * Replaces the subpart delimited by two occurrences of $marker with $subpartContent.
 *
 * When $keepMarker is not set, the markers are removed together with the HTML
 * comment remainders surrounding them ("<!-- ..." before the first marker and
 * "... -->" after the second one).
 *
 * @param string $content Content with the subpart wrapped in fx. "###CONTENT_PART###" inside.
 * @param string $marker Marker string, eg. "###CONTENT_PART###"
 * @param mixed $subpartContent Replacement string. If it is an array, its [0] and [1] elements are wrapped around the existing subpart content instead.
 * @param boolean $recursive If set, the content following the second marker is processed the same way, so subsequent subparts with the same marker are substituted as well.
 * @param boolean $keepMarker If set, the markers around the subpart are kept in the output.
 * @return string Processed content; the unchanged input if the marker does not occur twice
 */
public static function substituteSubpart($content, $marker, $subpartContent, $recursive = 1, $keepMarker = 0) {
    $markerLength = strlen($marker);

    $openPos = strpos($content, $marker);
    if ($openPos === FALSE) {
        return $content;
    }

    $innerStart = $openPos + $markerLength;
    $closePos = strpos($content, $marker, $innerStart);
    if ($closePos === FALSE) {
        return $content;
    }

    $head = substr($content, 0, $openPos);
    $inner = substr($content, $innerStart, $closePos - $innerStart);
    $tail = substr($content, $closePos + $markerLength);

    if ($recursive) {
            // Handle any further subparts in the remainder before assembling the result
        $tail = self::substituteSubpart($tail, $marker, $subpartContent, $recursive, $keepMarker);
    }

    $wrapExisting = is_array($subpartContent);
    $found = array();

    if ($keepMarker) {
            // The markers stay in place in every case below
        $head .= $marker;
        $tail = $marker . $tail;

            // Comment remainders next to the markers are moved out of the subpart
            // so that they stay attached to the markers.
        if (preg_match('/^([^<]*\-\->)(.*)(<\!\-\-[^>]*)$/s', $inner, $found) === 1) {
            $head .= $found[1];
            $inner = $found[2];
            $tail = $found[3] . $tail;
        } elseif (preg_match('/^(.*)(<\!\-\-[^>]*)$/s', $inner, $found) === 1) {
            $inner = $found[1];
            $tail = $found[2] . $tail;
        } elseif (preg_match('/^([^<]*\-\->)(.*)$/s', $inner, $found) === 1) {
            $head .= $found[1];
            $inner = $found[2];
        }
    } else {
            // Drop the opening of the comment that contained the first marker
        if (preg_match('/^(.*)<\!\-\-[^>]*$/s', $head, $found) === 1) {
            $head = $found[1];
        }

            // The existing subpart is only reused when it gets wrapped, so only then it is cleaned
        if ($wrapExisting) {
            if (preg_match('/^([^<]*\-\->)(.*)(<\!\-\-[^>]*)$/s', $inner, $found) === 1) {
                $inner = $found[2];
            } elseif (preg_match('/^(.*)(<\!\-\-[^>]*)$/s', $inner, $found) === 1) {
                $inner = $found[1];
            } elseif (preg_match('/^([^<]*\-\->)(.*)$/s', $inner, $found) === 1) {
                $inner = $found[2];
            }
        }

            // Drop the end of the comment that contained the second marker
        if (preg_match('/^[^<]*\-\->(.*)$/s', $tail, $found) === 1) {
            $tail = $found[1];
        }
    }

    $replacement = $wrapExisting
        ? $subpartContent[0] . $inner . $subpartContent[1]
        : $subpartContent;

    return $head . $replacement . $tail;
}
00227 
/**
 * Substitutes multiple subparts at once.
 *
 * Every key of $subpartsContent is used as a subpart marker and its value as
 * the replacement; the substitutions are applied one after another through
 * substituteSubpart() with its default options.
 *
 * @param string $content The content stream, typically HTML template content.
 * @param array $subpartsContent Key/value pairs being subpart marker/content values.
 * @return string The processed HTML content string.
 * @see substituteSubpart()
 */
public static function substituteSubpartArray($content, array $subpartsContent) {
    $processed = $content;

    foreach (array_keys($subpartsContent) as $marker) {
        $processed = self::substituteSubpart($processed, $marker, $subpartsContent[$marker]);
    }

    return $processed;
}
00246 
00247 
/**
 * Replaces every occurrence of a marker string in the content.
 * This is a plain str_replace(); no wrapping or case handling is applied.
 *
 * @param string $content The content stream, typically HTML template content.
 * @param string $marker The marker string, typically on the form "###[the marker string]###"
 * @param mixed $markContent The content to insert instead of the marker string found.
 * @return string The processed HTML content string.
 * @see substituteSubpart()
 */
public static function substituteMarker($content, $marker, $markContent) {
    $substituted = str_replace($marker, $markContent, $content);

    return $substituted;
}
00261 
00262 
/**
 * Traverses $markContentArray and substitutes, for each key, the marker of the
 * same name (optionally wrapped and upper-cased) with the key's value.
 * Useful for substituting a data record into some content: with the wrap value
 * "###|###" and $uppercase set, the key "myfield" is represented by the marker
 * "###MYFIELD###".
 *
 * @param string $content The content stream, typically HTML template content.
 * @param array $markContentArray Key/value pairs being marker/content values. If this is not an array, the content is returned unchanged.
 * @param string $wrap A wrap value - [part 1] | [part 2] - for the markers before substitution
 * @param boolean $uppercase If set, all marker string substitution is done with upper-case markers.
 * @param boolean $deleteUnused If set, all remaining markers matching the wrap are deleted ("###|###" is assumed when $wrap is empty).
 * @return string The processed output stream
 * @see substituteMarker(), substituteMarkerInObject(), TEMPLATE()
 */
public static function substituteMarkerArray($content, $markContentArray, $wrap = '', $uppercase = 0, $deleteUnused = 0) {
    if (!is_array($markContentArray)) {
        return $content;
    }

    $wrapArr = t3lib_div::trimExplode('|', $wrap);
        // A wrap without "|" (including the default empty wrap) has no second part;
        // fall back to an empty string instead of reading an undefined index.
    $wrapBefore = isset($wrapArr[0]) ? $wrapArr[0] : '';
    $wrapAfter = isset($wrapArr[1]) ? $wrapArr[1] : '';

    foreach ($markContentArray as $marker => $markContent) {
        if ($uppercase) {
                // use strtr instead of strtoupper to avoid locale problems with Turkish
            $marker = strtr(
                $marker,
                'abcdefghijklmnopqrstuvwxyz',
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
            );
        }

        $content = str_replace($wrapBefore . $marker . $wrapAfter, $markContent, $content);
    }

    if ($deleteUnused) {
        if (empty($wrap)) {
            $wrapBefore = '###';
            $wrapAfter = '###';
        }

            // The delimiter must be passed to preg_quote(), otherwise a "/" inside
            // the wrap would terminate the pattern prematurely.
        $content = preg_replace(
            '/' . preg_quote($wrapBefore, '/') . '([A-Z0-9_|\-]*)' . preg_quote($wrapAfter, '/') . '/is',
            '',
            $content
        );
    }

    return $content;
}
00313 
00314 
00315     /************************************
00316      *
00317      * Parsing HTML code
00318      *
00319      ************************************/
00320 
/**
 * Returns an array with the $content divided by tag-blocks specified with the list of tags, $tag.
 * Even numbers in the array are outside the blocks, odd numbers are block-content
 * (including the block's own start and end tag). Nested blocks of the listed
 * tags stay inside their outermost block.
 * Use ->getAllParts() and ->removeFirstAndLastTag() to process the content if needed.
 *
 * @param string $tag List of tags, comma separated.
 * @param string $content HTML-content
 * @param boolean $eliminateExtraEndTags If set, end tags without a matching start tag are dropped from the output - you should probably set this in most cases.
 * @return array Even numbers in the array are outside the blocks, Odd numbers are block-content.
 * @see splitTags(), getAllParts(), removeFirstAndLastTag()
 */
function splitIntoBlock($tag, $content, $eliminateExtraEndTags = 0) {
    $tags = array_unique(t3lib_div::trimExplode(',', $tag, 1));
    $regexStr = '/<\/?(' . implode('|', $tags) . ')(\s*>|\s[^>]*>)/si';

        // preg_split() discards the matched tags; they are recovered below by
        // walking through $content with $pointer.
    $parts = preg_split($regexStr, $content);

    $newParts = array();
    $pointer = strlen($parts[0]);
    $buffer = $parts[0];
    $nested = 0;

        // Start at index 1: the first part is already in the buffer.
        // (A plain loop replaces each(), which was removed in PHP 8.0.)
    $partCount = count($parts);
    for ($i = 1; $i < $partCount; $i++) {
        $v = $parts[$i];
        $isEndTag = substr($content, $pointer, 2) == '</' ? 1 : 0;
            // Length of the tag at $pointer, up to and including its ">"
        $tagLen = strcspn($content, '>', $pointer) + 1;

        if (!$isEndTag) { // We meet a start-tag:
            if (!$nested) { // Ground level:
                $newParts[] = $buffer; // previous buffer stored
                $buffer = '';
            }
            $nested++; // We are inside now!
            $mbuffer = substr($content, $pointer, strlen($v) + $tagLen); // The tag plus the content following it
            $pointer += strlen($mbuffer);
            $buffer .= $mbuffer;
        } else { // If we meet an endtag:
            $nested--; // decrease nested-level
            $eliminated = 0;
            if ($eliminateExtraEndTags && $nested < 0) {
                    // Surplus end tag: skip it instead of adding it to the buffer
                $nested = 0;
                $eliminated = 1;
            } else {
                $buffer .= substr($content, $pointer, $tagLen); // add the endtag to current buffer
            }
            $pointer += $tagLen;
            if (!$nested && !$eliminated) { // if we're back on ground level, (and not by eliminating tags...
                $newParts[] = $buffer;
                $buffer = '';
            }
            $mbuffer = substr($content, $pointer, strlen($v)); // The content following the end tag
            $pointer += strlen($mbuffer);
            $buffer .= $mbuffer;
        }
    }
    $newParts[] = $buffer;
    return $newParts;
}
00380 
/**
 * Splits content into blocks *recursively* and lets call back methods process
 * the tags and the content between them.
 *
 * For every block the tag call back receives an array with the keys
 * "tag_start", "tag_end", "tag_name", "add_level" and "content" (the already
 * processed inner content) and must return that array; the block is then
 * rebuilt as tag_start + content + tag_end.
 *
 * @param string $tag Tag list, see splitIntoBlock()
 * @param string $content Content, see splitIntoBlock()
 * @param object $procObj Object where call back methods are.
 * @param string $callBackContent Name of call back method for content; "function callBackContent($str,$level)"
 * @param string $callBackTags Name of call back method for tags; "function callBackTags($tags,$level)"
 * @param integer $level Indent level
 * @return string Processed content
 * @see splitIntoBlock()
 */
function splitIntoBlockRecursiveProc($tag, $content, &$procObj, $callBackContent, $callBackTags, $level = 0) {
    $segments = $this->splitIntoBlock($tag, $content, TRUE);

    foreach (array_keys($segments) as $index) {
        $isBlock = ($index % 2) != 0;

        if (!$isBlock) {
                // Content outside the blocks
            if ($callBackContent) {
                $segments[$index] = $procObj->$callBackContent($segments[$index], $level);
            }
            continue;
        }

        $block = $segments[$index];
        $tagName = $this->getFirstTagName($block, TRUE);

        $tagInfo = array();
        $tagInfo['tag_start'] = $this->getFirstTag($block);
        $tagInfo['tag_end'] = '</' . $tagName . '>';
        $tagInfo['tag_name'] = strtolower($tagName);
        $tagInfo['add_level'] = 1;
            // Process the inside of the block one level deeper
        $tagInfo['content'] = $this->splitIntoBlockRecursiveProc(
            $tag,
            $this->removeFirstAndLastTag($block),
            $procObj,
            $callBackContent,
            $callBackTags,
            $level + $tagInfo['add_level']
        );

        if ($callBackTags) {
            $tagInfo = $procObj->$callBackTags($tagInfo, $level);
        }

        $segments[$index] = $tagInfo['tag_start'] . $tagInfo['content'] . $tagInfo['tag_end'];
    }

    return implode('', $segments);
}
00422 
/**
 * Returns an array with the $content divided at the single tags specified with the list of tags, $tag.
 * Even numbers in the array are the content between the tags, odd numbers are the matched tags themselves.
 * Only start tags (optionally self-closed with "/>") are matched; end tags stay part of the content.
 * Use ->getAllParts() to extract either kind.
 *
 * @param string $tag List of tags, comma separated.
 * @param string $content HTML-content
 * @return array Even numbers in the array are outside the tags, Odd numbers are the tags.
 * @see splitIntoBlock(), getAllParts(), removeFirstAndLastTag()
 */
function splitTags($tag, $content) {
    $tags = t3lib_div::trimExplode(',', $tag, 1);
    $regexStr = '/<(' . implode('|', $tags) . ')(\s[^>]*)?\/?>/si';

        // preg_split() discards the matched tags; they are recovered below by
        // walking through $content with $pointer.
    $parts = preg_split($regexStr, $content);

    $pointer = strlen($parts[0]);
    $newParts = array();
    $newParts[] = $parts[0];

        // Start at index 1: the first part has been stored already.
        // (A plain loop replaces each(), which was removed in PHP 8.0.)
    $partCount = count($parts);
    for ($i = 1; $i < $partCount; $i++) {
            // Length of the tag at $pointer, up to and including its ">"
        $tagLen = strcspn($content, '>', $pointer) + 1;

            // Set tag:
        $tagString = substr($content, $pointer, $tagLen);
        $newParts[] = $tagString;
        $pointer += strlen($tagString);

            // Set content:
        $newParts[] = $parts[$i];
        $pointer += strlen($parts[$i]);
    }
    return $newParts;
}
00457 
/**
 * Picks either the odd (tag) or the even (non-tag) entries from the result of
 * ->splitIntoBlock()/->splitTags() and returns them as a re-indexed list.
 *
 * @param array $parts Parts generated by ->splitIntoBlock() or ->splitTags()
 * @param boolean $tag_parts Whether to return the tag-parts (default,true) or what was outside the tags.
 * @param boolean $include_tag If not set, each returned entry is passed through ->removeFirstAndLastTag() (most useful for input made by ->splitIntoBlock())
 * @return array Tag-parts/Non-tag-parts depending on input argument settings
 * @see splitIntoBlock(), splitTags()
 */
function getAllParts($parts, $tag_parts = 1, $include_tag = 1) {
        // Shifting the index by one turns the "odd" test into an "even" test
    $offset = $tag_parts ? 0 : 1;
    $selected = array();

    foreach ($parts as $index => $part) {
        if ((($index + $offset) % 2) == 0) {
            continue;
        }
        $selected[] = $include_tag ? $part : $this->removeFirstAndLastTag($part);
    }

    return $selected;
}
00479 
/**
 * Removes the first and last tag in the string.
 * Everything up to and including the first ">" and everything from the last
 * "<" onwards is cut away, so text before the first and after the last tag is
 * removed as well.
 *
 * @param string $str String to process
 * @return string What is left between the first ">" and the last "<"
 */
function removeFirstAndLastTag($str) {
        // Position right after the end of the first tag
    $contentStart = strpos($str, '>') + 1;
        // Position where the last tag begins
    $lastTagStart = strrpos($str, '<');

    return substr($str, $contentStart, $lastTagStart - $contentStart);
}
00495 
/**
 * Returns the first tag in $str.
 * Actually everything from the beginning of $str up to and including the
 * first ">" is returned, so make sure the tag is the first thing in the string.
 *
 * @param string $str HTML string with tags
 * @return string
 */
function getFirstTag($str) {
    return substr($str, 0, strpos($str, '>') + 1);
}
00508 
/**
 * Returns the NAME of the tag that $str starts with (leading whitespace is ignored).
 * The name ends at the first whitespace character or ">".
 *
 * @param string $str HTML tag
 * @param boolean $preserveCase If set, the tag name is returned as written instead of being converted to upper case.
 * @return string Tag name (upper case unless $preserveCase is set), or an empty string if $str does not start with a tag
 * @see getFirstTag()
 */
function getFirstTagName($str, $preserveCase = FALSE) {
    $found = array();

    if (preg_match('/^\s*<([^\s>]+)(\s|>)/', $str, $found) !== 1) {
        return '';
    }

    return $preserveCase ? $found[1] : strtoupper($found[1]);
}
00527 
/**
 * Returns an array with all attributes as keys. Attribute names are lowercased.
 * If an attribute is empty (shorthand), then the value for the key is empty. You can check if it existed with isset()
 *
 * The second returned array holds meta data per attribute: "origTag" (the name
 * as written, reduced to alphanumerics, "_", ":" and "-") and, when a value
 * was assigned, "dashType" (the quote character around the value, or an empty string).
 *
 * @param string $tag Tag: $tag is either a whole tag (eg '<TAG OPTION ATTRIB=VALUE>') or the parameterlist (ex ' OPTION ATTRIB=VALUE>')
 * @param boolean $deHSC If set, the attribute values are de-htmlspecialchar'ed. Should actually always be set!
 * @return array array(Tag attributes,Attribute meta-data)
 */
function get_tag_attributes($tag, $deHSC = 0) {
    list($components, $metaC) = $this->split_tag_attributes($tag);
    $name = ''; // attribute name is stored here
    $valuemode = FALSE;
    $attributes = array();
    $attributesMeta = array();

    if (!is_array($components)) {
            // Always hand back the documented pair instead of an implicit NULL
        return array($attributes, $attributesMeta);
    }

    foreach ($components as $key => $val) {
            // Only an UNQUOTED "=" is the assignment sign. A quoted value that
            // happens to be "=" (eg. title="=") has its quotes stripped by
            // split_tag_attributes() and must be treated as a value.
        if ($val === '=' && $metaC[$key] === '') {
            $valuemode = TRUE;
            continue;
        }

        if ($valuemode) {
                // A value is only assigned if an attribute name is waiting for it
            if ($name) {
                $attributes[$name] = $deHSC ? t3lib_div::htmlspecialchars_decode($val) : $val;
                $attributesMeta[$name]['dashType'] = $metaC[$key];
                $name = '';
            }
        } else {
            if ($namekey = preg_replace('/[^[:alnum:]_\:\-]/', '', $val)) {
                $name = strtolower($namekey);
                $attributesMeta[$name] = array();
                $attributesMeta[$name]['origTag'] = $namekey;
                $attributes[$name] = '';
            }
        }
        $valuemode = FALSE;
    }

    return array($attributes, $attributesMeta);
}
00567 
/**
 * Returns the 'components' of an attribute list: attribute names, "=" signs
 * and values in the order they occur. Quoted values are returned without
 * their quotes. The result is normally analyzed by get_tag_attributes().
 * Removes tag-name if found
 *
 * @param string $tag The tag or attributes
 * @return array array(components, quote characters); the second array holds for each component the quote character it was wrapped in, or an empty string
 * @access private
 * @see t3lib_div::split_tag_attributes()
 */
function split_tag_attributes($tag) {
    $tagMatch = array();
        // Group 2 is the attribute list without the leading "<tagname " and the trailing ">"
    if (preg_match('/(<[^\s]+\s+)?(.*?)\s*(>)?$/s', $tag, $tagMatch) !== 1) {
        return array(array(), array());
    }

    $components = array();
    $quoteChars = array();
    $tokens = array();

    if (preg_match_all('/("[^"]*"|\'[^\']*\'|[^\s"\'\=]+|\=)/s', $tagMatch[2], $tokens) > 0) {
        foreach ($tokens[1] as $token) {
            $lead = substr($token, 0, 1);
            $isQuoted = in_array($lead, array('"', "'"), TRUE);

            $quoteChars[] = $isQuoted ? $lead : '';
            $components[] = $isQuoted ? substr($token, 1, -1) : $token;
        }
    }

    return array($components, $quoteChars);
}
00601 
/**
 * Checks whether block/solo tags are found in the correct amounts in HTML content.
 * Block tags are tags which are required to have an equal amount of start and end tags, eg. "<table>...</table>"
 * Solo tags are tags which are required to have ONLY start tags.
 * The content is lowercased before counting, so tag names must be given in lowercase.
 *
 * A start tag is counted when the tag name is directly followed by whitespace or ">".
 * NOTICE: "<br />" is therefore counted, while "<br/>" (no whitespace before the slash) is not.
 * NOTICE: Correct XHTML might actually fail since "<br></br>" is allowed; with "br" in the solo-tag list this results in a warning.
 * NOTICE: Correct HTML allows eg. <p> and <li> to be NON-ended (implicitly ended by other tags). However this is NOT accepted by this function (with "p" and "li" in the block-tag list) and it will result in an ERROR!
 *
 * The returned array has the keys:
 *   "counts"   => number of start tags per tag name (only tags that occur)
 *   "errors"   => messages for block tags with more start tags than end tags
 *   "warnings" => messages for block tags with more end tags than start tags, and for solo tags with end tags
 *   "blocks"   => per block tag: array(start count, end count, difference)
 *   "solo"     => per solo tag: array(start count, end count)
 *
 * @param string $content HTML content to analyze
 * @param string $blockTags Tag names for block tags (eg. table or div or p) in lowercase, commalist (eg. "table,div,p")
 * @param string $soloTags Tag names for solo tags (eg. img, br or input) in lowercase, commalist ("img,br,input")
 * @return array Analyse data.
 */
function checkTagTypeCounts($content, $blockTags = 'a,b,blockquote,body,div,em,font,form,h1,h2,h3,h4,h5,h6,i,li,map,ol,option,p,pre,select,span,strong,table,td,textarea,tr,u,ul', $soloTags = 'br,hr,img,input,area') {
    $content = strtolower($content);
    $analyzedOutput = array();
    $analyzedOutput['counts'] = array(); // Counts appearances of start-tags
    $analyzedOutput['errors'] = array(); // Lists ERRORS
    $analyzedOutput['warnings'] = array(); // Lists warnings.
    $analyzedOutput['blocks'] = array(); // Lists stats for block-tags
    $analyzedOutput['solo'] = array(); // Lists stats for solo-tags
    $unused = array();

        // Block tags, must have endings...
    $blockTags = explode(',', $blockTags);
    foreach ($blockTags as $tagName) {
            // The tag name is quoted so that it is matched literally (a "." must
            // not act as a wildcard, a "/" must not end the pattern).
        $quotedName = preg_quote($tagName, '/');
        $countBegin = preg_match_all('/<' . $quotedName . '(\s|>)/s', $content, $unused);
        $countEnd = preg_match_all('/<\/' . $quotedName . '(\s|>)/s', $content, $unused);
        $analyzedOutput['blocks'][$tagName] = array($countBegin, $countEnd, $countBegin - $countEnd);
        if ($countBegin) {
            $analyzedOutput['counts'][$tagName] = $countBegin;
        }
        if ($countBegin - $countEnd) {
            if ($countBegin - $countEnd > 0) {
                $analyzedOutput['errors'][$tagName] = 'There were more start-tags (' . $countBegin . ') than end-tags (' . $countEnd . ') for the element "' . $tagName . '". There should be an equal amount!';
            } else {
                $analyzedOutput['warnings'][$tagName] = 'There were more end-tags (' . $countEnd . ') than start-tags (' . $countBegin . ') for the element "' . $tagName . '". There should be an equal amount! However the problem is not fatal.';
            }
        }
    }

        // Solo tags, must NOT have endings...
    $soloTags = explode(',', $soloTags);
    foreach ($soloTags as $tagName) {
        $quotedName = preg_quote($tagName, '/');
        $countBegin = preg_match_all('/<' . $quotedName . '(\s|>)/s', $content, $unused);
        $countEnd = preg_match_all('/<\/' . $quotedName . '(\s|>)/s', $content, $unused);
        $analyzedOutput['solo'][$tagName] = array($countBegin, $countEnd);
        if ($countBegin) {
            $analyzedOutput['counts'][$tagName] = $countBegin;
        }
        if ($countEnd) {
            $analyzedOutput['warnings'][$tagName] = 'There were end-tags found (' . $countEnd . ') for the element "' . $tagName . '". This was not expected (although XHTML technically allows it).';
        }
    }

    return $analyzedOutput;
}
00658 
00659 
00660     /*********************************
00661      *
00662      * Clean HTML code
00663      *
00664      *********************************/
00665 
00666     /**
00667      * Function that can clean up HTML content according to configuration given in the $tags array.
00668      *
00669      * Initializing the $tags array to allow a list of tags (in this case <B>,<I>,<U> and <A>), set it like this:        $tags = array_flip(explode(',','b,a,i,u'))
00670      * If the value of the $tags[$tagname] entry is an array, advanced processing of the tags is initialized. These are the options:
00671      *
00672      *   $tags[$tagname] = Array(
00673      *       'overrideAttribs' => ''        If set, this string is preset as the attributes of the tag
00674      *       'allowedAttribs' =>   '0' (zero) = no attributes allowed, '[commalist of attributes]' = only allowed attributes. If blank, all attributes are allowed.
00675      *       'fixAttrib' => Array(
00676      *           '[attribute name]' => Array (
00677      *               'set' => Force the attribute value to this value.
00678      *               'unset' => Boolean: If set, the attribute is unset.
00679      *               'default' =>    If no attribute exists by this name, this value is set as default value (if this value is not blank)
00680      *               'always' =>     Boolean. If set, the attribute is always processed. Normally an attribute is processed only if it exists
00681      *               'trim,intval,lower,upper' =>    All booleans. If any of these keys are set, the value is passed through the respective PHP-functions.
00682      *               'range' => Array ('[low limit]','[high limit, optional]')      Setting integer range.
00683      *               'list' => Array ('[value1/default]','[value2]','[value3]')     Attribute must be in this list. If not, the value is set to the first element.
00684      *               'removeIfFalse' =>  Boolean/'blank'.   If set, then the attribute is removed if it is 'false'. If this value is set to 'blank' then the value must be a blank string (that means a 'zero' value will not be removed)
00685      *               'removeIfEquals' =>     [value]    If the attribute value matches the value set here, then it is removed.
00686      *               'casesensitiveComp' => 1   If set, then the removeIfEquals and list comparisons will be case sensitive. Otherwise not.
00687      *           )
00688      *       ),
00689      *       'protect' => '',   Boolean. If set, the tag <> is converted to &lt; and &gt;
00690      *       'remap' => '',     String. If set, the tagname is remapped to this tagname
00691      *       'rmTagIfNoAttrib' => '',   Boolean. If set, then the tag is removed if no attributes happend to be there.
00692      *       'nesting' => '',   Boolean/'global'. If set true, then this tag must have starting and ending tags in the correct order. Any tags not in this order will be discarded. Thus '</B><B><I></B></I></B>' will be converted to '<B><I></B></I>'. Is the value 'global' then true nesting in relation to other tags marked for 'global' nesting control is preserved. This means that if <B> and <I> are set for global nesting then this string '</B><B><I></B></I></B>' is converted to '<B></B>'
00693      *   )
00694      *
00695      * @param   string      $content; is the HTML-content being processed. This is also the result being returned.
00696      * @param   array       $tags; is an array where each key is a tagname in lowercase. Only tags present as keys in this array are preserved. The value of the key can be an array with a vast number of options to configure.
00697      * @param   string      $keepAll; boolean/'protect', if set, then all tags are kept regardless of tags present as keys in $tags-array. If 'protect' then the preserved tags have their <> converted to &lt; and &gt;
00698      * @param   integer     $hSC; Values -1,0,1,2: Set to zero= disabled, set to 1 then the content BETWEEN tags is htmlspecialchar()'ed, set to -1 its the opposite and set to 2 the content will be HSC'ed BUT with preservation for real entities (eg. "&amp;" or "&#234;")
00699      * @param   array       Configuration array send along as $conf to the internal functions ->processContent() and ->processTag()
00700      * @return  string      Processed HTML content
00701      */
00702     function HTMLcleaner($content, $tags = array(), $keepAll = 0, $hSC = 0, $addConfig = array()) {
00703         $newContent = array();
00704         $tokArr = explode('<', $content);
00705         $newContent[] = $this->processContent(current($tokArr), $hSC, $addConfig);
00706         next($tokArr);
00707 
00708         $c = 1;
00709         $tagRegister = array();
00710         $tagStack = array();
00711         $inComment = FALSE;
00712         $skipTag = FALSE;
00713         while (list(, $tok) = each($tokArr)) {
00714             if ($inComment) {
00715                 if (($eocPos = strpos($tok, '-->')) === FALSE) {
00716                         // End of comment is not found in the token. Go futher until end of comment is found in other tokens.
00717                     $newContent[$c++] = '<' . $tok;
00718                     continue;
00719                 }
00720                     // Comment ends in the middle of the token: add comment and proceed with rest of the token
00721                 $newContent[$c++] = '<' . substr($tok, 0, $eocPos + 3);
00722                 $tok = substr($tok, $eocPos + 3);
00723                 $inComment = FALSE;
00724                 $skipTag = TRUE;
00725             }
00726             elseif (substr($tok, 0, 3) == '!--') {
00727                 if (($eocPos = strpos($tok, '-->')) === FALSE) {
00728                         // Comment started in this token but it does end in the same token. Set a flag to skip till the end of comment
00729                     $newContent[$c++] = '<' . $tok;
00730                     $inComment = TRUE;
00731                     continue;
00732                 }
00733                     // Start and end of comment are both in the current token. Add comment and proceed with rest of the token
00734                 $newContent[$c++] = '<' . substr($tok, 0, $eocPos + 3);
00735                 $tok = substr($tok, $eocPos + 3);
00736                 $skipTag = TRUE;
00737             }
00738             $firstChar = substr($tok, 0, 1);
00739             if (!$skipTag && preg_match('/[[:alnum:]\/]/', $firstChar) == 1) { // It is a tag... (first char is a-z0-9 or /) (fixed 19/01 2004). This also avoids triggering on <?xml..> and <!DOCTYPE..>
00740                 $tagEnd = strpos($tok, '>');
00741                 if ($tagEnd) { // If there is and end-bracket...    tagEnd can't be 0 as the first character can't be a >
00742                     $endTag = $firstChar == '/' ? 1 : 0;
00743                     $tagContent = substr($tok, $endTag, $tagEnd - $endTag);
00744                     $tagParts = preg_split('/\s+/s', $tagContent, 2);
00745                     $tagName = strtolower($tagParts[0]);
00746                     $emptyTag = 0;
00747                     if (isset($tags[$tagName])) {
00748                         if (is_array($tags[$tagName])) { // If there is processing to do for the tag:
00749                             if (preg_match('/^(' . self::VOID_ELEMENTS . ' )$/i', $tagName)) {
00750                                 $emptyTag = 1;
00751                             }
00752                             if (!$endTag) { // If NOT an endtag, do attribute processing (added dec. 2003)
00753                                     // Override attributes
00754                                 if (strcmp($tags[$tagName]['overrideAttribs'], '')) {
00755                                     $tagParts[1] = $tags[$tagName]['overrideAttribs'];
00756                                 }
00757 
00758                                     // Allowed tags
00759                                 if (strcmp($tags[$tagName]['allowedAttribs'], '')) {
00760                                     if (!strcmp($tags[$tagName]['allowedAttribs'], '0')) { // No attribs allowed
00761                                         $tagParts[1] = '';
00762                                     } elseif (trim($tagParts[1])) {
00763                                         $tagAttrib = $this->get_tag_attributes($tagParts[1]);
00764                                         $tagParts[1] = '';
00765                                         $newTagAttrib = array();
00766                                         if (!($tList = $tags[$tagName]['_allowedAttribs'])) {
00767                                                 // Just explode attribts for tag once
00768                                             $tList = $tags[$tagName]['_allowedAttribs'] = t3lib_div::trimExplode(',', strtolower($tags[$tagName]['allowedAttribs']), 1);
00769                                         }
00770                                         foreach ($tList as $allowTag) {
00771                                             if (isset($tagAttrib[0][$allowTag])) {
00772                                                 $newTagAttrib[$allowTag] = $tagAttrib[0][$allowTag];
00773                                             }
00774                                         }
00775                                         $tagParts[1] = $this->compileTagAttribs($newTagAttrib, $tagAttrib[1]);
00776                                     }
00777                                 }
00778 
00779                                     // Fixed attrib values
00780                                 if (is_array($tags[$tagName]['fixAttrib'])) {
00781                                     $tagAttrib = $this->get_tag_attributes($tagParts[1]);
00782                                     $tagParts[1] = '';
00783                                     foreach ($tags[$tagName]['fixAttrib'] as $attr => $params) {
00784                                         if (strlen($params['set'])) {
00785                                             $tagAttrib[0][$attr] = $params['set'];
00786                                         }
00787                                         if (strlen($params['unset'])) {
00788                                             unset($tagAttrib[0][$attr]);
00789                                         }
00790                                         if (strcmp($params['default'], '') && !isset($tagAttrib[0][$attr])) {
00791                                             $tagAttrib[0][$attr] = $params['default'];
00792                                         }
00793                                         if ($params['always'] || isset($tagAttrib[0][$attr])) {
00794                                             if ($params['trim']) {
00795                                                 $tagAttrib[0][$attr] = trim($tagAttrib[0][$attr]);
00796                                             }
00797                                             if ($params['intval']) {
00798                                                 $tagAttrib[0][$attr] = intval($tagAttrib[0][$attr]);
00799                                             }
00800                                             if ($params['lower']) {
00801                                                 $tagAttrib[0][$attr] = strtolower($tagAttrib[0][$attr]);
00802                                             }
00803                                             if ($params['upper']) {
00804                                                 $tagAttrib[0][$attr] = strtoupper($tagAttrib[0][$attr]);
00805                                             }
00806                                             if ($params['range']) {
00807                                                 if (isset($params['range'][1])) {
00808                                                     $tagAttrib[0][$attr] = t3lib_div::intInRange($tagAttrib[0][$attr], intval($params['range'][0]), intval($params['range'][1]));
00809                                                 } else {
00810                                                     $tagAttrib[0][$attr] = t3lib_div::intInRange($tagAttrib[0][$attr], intval($params['range'][0]));
00811                                                 }
00812                                             }
00813                                             if (is_array($params['list'])) {
00814                                                     // For the class attribute, remove from the attribute value any class not in the list
00815                                                     // Classes are case sensitive
00816                                                 if ($attr == 'class') {
00817                                                     $newClasses = array();
00818                                                     $classes = t3lib_div::trimExplode(' ', $tagAttrib[0][$attr], TRUE);
00819                                                     foreach ($classes as $class) {
00820                                                         if (in_array($class, $params['list'])) {
00821                                                             $newClasses[] = $class;
00822                                                         }
00823                                                     }
00824                                                     if (count($newClasses)) {
00825                                                         $tagAttrib[0][$attr] = implode(' ', $newClasses);
00826                                                     } else {
00827                                                         $tagAttrib[0][$attr] = '';
00828                                                     }
00829                                                 } else {
00830                                                     if (!in_array($this->caseShift($tagAttrib[0][$attr], $params['casesensitiveComp']), $this->caseShift($params['list'], $params['casesensitiveComp'], $tagName))) {
00831                                                         $tagAttrib[0][$attr] = $params['list'][0];
00832                                                     }
00833                                                 }
00834                                             }
00835                                             if (($params['removeIfFalse'] && $params['removeIfFalse'] != 'blank' && !$tagAttrib[0][$attr]) || ($params['removeIfFalse'] == 'blank' && !strcmp($tagAttrib[0][$attr], ''))) {
00836                                                 unset($tagAttrib[0][$attr]);
00837                                             }
00838                                             if (strcmp($params['removeIfEquals'], '') && !strcmp($this->caseShift($tagAttrib[0][$attr], $params['casesensitiveComp']), $this->caseShift($params['removeIfEquals'], $params['casesensitiveComp']))) {
00839                                                 unset($tagAttrib[0][$attr]);
00840                                             }
00841                                             if ($params['prefixLocalAnchors']) {
00842                                                 if (substr($tagAttrib[0][$attr], 0, 1) == '#') {
00843                                                     $prefix = t3lib_div::getIndpEnv('TYPO3_REQUEST_URL');
00844                                                     $tagAttrib[0][$attr] = $prefix . $tagAttrib[0][$attr];
00845                                                     if ($params['prefixLocalAnchors'] == 2 && t3lib_div::isFirstPartOfStr($prefix, t3lib_div::getIndpEnv('TYPO3_SITE_URL'))) {
00846                                                         $tagAttrib[0][$attr] = substr($tagAttrib[0][$attr], strlen(t3lib_div::getIndpEnv('TYPO3_SITE_URL')));
00847                                                     }
00848                                                 }
00849                                             }
00850                                             if ($params['prefixRelPathWith']) {
00851                                                 $urlParts = parse_url($tagAttrib[0][$attr]);
00852                                                 if (!$urlParts['scheme'] && substr($urlParts['path'], 0, 1) != '/') { // If it is NOT an absolute URL (by http: or starting "/")
00853                                                     $tagAttrib[0][$attr] = $params['prefixRelPathWith'] . $tagAttrib[0][$attr];
00854                                                 }
00855                                             }
00856                                             if ($params['userFunc']) {
00857                                                 $tagAttrib[0][$attr] = t3lib_div::callUserFunction($params['userFunc'], $tagAttrib[0][$attr], $this);
00858                                             }
00859                                         }
00860                                     }
00861                                     $tagParts[1] = $this->compileTagAttribs($tagAttrib[0], $tagAttrib[1]);
00862                                 }
00863                             } else { // If endTag, remove any possible attributes:
00864                                 $tagParts[1] = '';
00865                             }
00866 
00867                                 // Protecting the tag by converting < and > to &lt; and &gt; ??
00868                             if ($tags[$tagName]['protect']) {
00869                                 $lt = '&lt;';
00870                                 $gt = '&gt;';
00871                             } else {
00872                                 $lt = '<';
00873                                 $gt = '>';
00874                             }
00875                                 // Remapping tag name?
00876                             if ($tags[$tagName]['remap']) {
00877                                 $tagParts[0] = $tags[$tagName]['remap'];
00878                             }
00879 
00880                                 // rmTagIfNoAttrib
00881                             if ($endTag || trim($tagParts[1]) || !$tags[$tagName]['rmTagIfNoAttrib']) {
00882                                 $setTag = 1;
00883                                     // Remove this closing tag if $tagName was among $TSconfig['removeTags']
00884                                 if ($endTag && $tags[$tagName]['allowedAttribs'] === 0 && $tags[$tagName]['rmTagIfNoAttrib'] === 1) {
00885                                     $setTag = 0;
00886                                 }
00887                                 if ($tags[$tagName]['nesting']) {
00888                                     if (!is_array($tagRegister[$tagName])) {
00889                                         $tagRegister[$tagName] = array();
00890                                     }
00891 
00892                                     if ($endTag) {
00893                                         /*                                      if ($tags[$tagName]['nesting']=='global')   {
00894                                                      $lastEl = end($tagStack);
00895                                                      $correctTag = !strcmp($tagName,$lastEl);
00896                                                  } else $correctTag=1;
00897              */
00898                                         $correctTag = 1;
00899                                         if ($tags[$tagName]['nesting'] == 'global') {
00900                                             $lastEl = end($tagStack);
00901                                             if (strcmp($tagName, $lastEl)) {
00902                                                 if (in_array($tagName, $tagStack)) {
00903                                                     while (count($tagStack) && strcmp($tagName, $lastEl)) {
00904                                                         $elPos = end($tagRegister[$lastEl]);
00905                                                         unset($newContent[$elPos]);
00906 
00907                                                         array_pop($tagRegister[$lastEl]);
00908                                                         array_pop($tagStack);
00909                                                         $lastEl = end($tagStack);
00910                                                     }
00911                                                 } else {
00912                                                     $correctTag = 0; // In this case the
00913                                                 }
00914                                             }
00915                                         }
00916                                         if (!count($tagRegister[$tagName]) || !$correctTag) {
00917                                             $setTag = 0;
00918                                         } else {
00919                                             array_pop($tagRegister[$tagName]);
00920                                             if ($tags[$tagName]['nesting'] == 'global') {
00921                                                 array_pop($tagStack);
00922                                             }
00923                                         }
00924                                     } else {
00925                                         array_push($tagRegister[$tagName], $c);
00926                                         if ($tags[$tagName]['nesting'] == 'global') {
00927                                             array_push($tagStack, $tagName);
00928                                         }
00929                                     }
00930                                 }
00931 
00932                                 if ($setTag) {
00933                                         // Setting the tag
00934                                     $newContent[$c++] = $this->processTag($lt . ($endTag ? '/' : '') . trim($tagParts[0] . ' ' . $tagParts[1]) . ($emptyTag ? ' /' : '' ) . $gt, $addConfig, $endTag, $lt == '&lt;');
00935                                 }
00936                             }
00937                         } else {
00938                             $newContent[$c++] = $this->processTag('<' . ($endTag ? '/' : '') . $tagContent . '>', $addConfig, $endTag);
00939                         }
00940                     } elseif ($keepAll) { // This is if the tag was not defined in the array for processing:
00941                         if (!strcmp($keepAll, 'protect')) {
00942                             $lt = '&lt;';
00943                             $gt = '&gt;';
00944                         } else {
00945                             $lt = '<';
00946                             $gt = '>';
00947                         }
00948                         $newContent[$c++] = $this->processTag($lt . ($endTag ? '/' : '') . $tagContent . $gt, $addConfig, $endTag, $lt == '&lt;');
00949                     }
00950                     $newContent[$c++] = $this->processContent(substr($tok, $tagEnd + 1), $hSC, $addConfig);
00951                 } else {
00952                     $newContent[$c++] = $this->processContent('<' . $tok, $hSC, $addConfig); // There were not end-bracket, so no tag...
00953                 }
00954             } else {
00955                 $newContent[$c++] = $this->processContent(($skipTag ? '' : '<') . $tok, $hSC, $addConfig); // It was not a tag anyways
00956                 $skipTag = FALSE;
00957             }
00958         }
00959 
00960             // Unsetting tags:
00961         foreach ($tagRegister as $tag => $positions) {
00962             foreach ($positions as $pKey) {
00963                 unset($newContent[$pKey]);
00964             }
00965         }
00966 
00967         return implode('', $newContent);
00968     }
00969 
00970     /**
00971      * Converts htmlspecialchars forth ($dir=1) AND back ($dir=-1)
00972      *
00973      * @param   string      Input value
00974      * @param   integer     Direction: forth ($dir=1, dir=2 for preserving entities) AND back ($dir=-1)
00975      * @return  string      Output value
00976      */
00977     function bidir_htmlspecialchars($value, $dir) {
00978         if ($dir == 1) {
00979             $value = htmlspecialchars($value);
00980         } elseif ($dir == 2) {
00981             $value = t3lib_div::deHSCentities(htmlspecialchars($value));
00982         } elseif ($dir == -1) {
00983             $value = str_replace('&gt;', '>', $value);
00984             $value = str_replace('&lt;', '<', $value);
00985             $value = str_replace('&quot;', '"', $value);
00986             $value = str_replace('&amp;', '&', $value);
00987         }
00988         return $value;
00989     }
00990 
00991     /**
00992      * Prefixes the relative paths of hrefs/src/action in the tags [td,table,body,img,input,form,link,script,a] in the $content with the $main_prefix or and alternative given by $alternatives
00993      *
00994      * @param   string      Prefix string
00995      * @param   string      HTML content
00996      * @param   array       Array with alternative prefixes for certain of the tags. key=>value pairs where the keys are the tag element names in uppercase
00997      * @param   string      Suffix string (put after the resource).
00998      * @return  string      Processed HTML content
00999      */
01000     function prefixResourcePath($main_prefix, $content, $alternatives = array(), $suffix = '') {
01001 
01002         $parts = $this->splitTags('embed,td,table,body,img,input,form,link,script,a,param', $content);
01003         foreach ($parts as $k => $v) {
01004             if ($k % 2) {
01005                 $params = $this->get_tag_attributes($v);
01006                 $tagEnd = substr($v, -2) == '/>' ? ' />' : '>'; // Detect tag-ending so that it is re-applied correctly.
01007                 $firstTagName = $this->getFirstTagName($v); // The 'name' of the first tag
01008                 $somethingDone = 0;
01009                 $prefix = isset($alternatives[strtoupper($firstTagName)]) ? $alternatives[strtoupper($firstTagName)] : $main_prefix;
01010                 switch (strtolower($firstTagName)) {
01011                         // background - attribute:
01012                     case 'td':
01013                     case 'body':
01014                     case 'table':
01015                         $src = $params[0]['background'];
01016                         if ($src) {
01017                             $params[0]['background'] = $this->prefixRelPath($prefix, $params[0]['background'], $suffix);
01018                             $somethingDone = 1;
01019                         }
01020                     break;
01021                         // src attribute
01022                     case 'img':
01023                     case 'input':
01024                     case 'script':
01025                     case 'embed':
01026                         $src = $params[0]['src'];
01027                         if ($src) {
01028                             $params[0]['src'] = $this->prefixRelPath($prefix, $params[0]['src'], $suffix);
01029                             $somethingDone = 1;
01030                         }
01031                     break;
01032                     case 'link':
01033                     case 'a':
01034                         $src = $params[0]['href'];
01035                         if ($src) {
01036                             $params[0]['href'] = $this->prefixRelPath($prefix, $params[0]['href'], $suffix);
01037                             $somethingDone = 1;
01038                         }
01039                     break;
01040                         // action attribute
01041                     case 'form':
01042                         $src = $params[0]['action'];
01043                         if ($src) {
01044                             $params[0]['action'] = $this->prefixRelPath($prefix, $params[0]['action'], $suffix);
01045                             $somethingDone = 1;
01046                         }
01047                     break;
01048                         // value attribute
01049                     case 'param':
01050                         $test = $params[0]['name'];
01051                         if ($test && $test === 'movie') {
01052                             if ($params[0]['value']) {
01053                                 $params[0]['value'] = $this->prefixRelPath($prefix, $params[0]['value'], $suffix);
01054                                 $somethingDone = 1;
01055                             }
01056                         }
01057                     break;
01058                 }
01059                 if ($somethingDone) {
01060                     $tagParts = preg_split('/\s+/s', $v, 2);
01061                     $tagParts[1] = $this->compileTagAttribs($params[0], $params[1]);
01062                     $parts[$k] = '<' . trim(strtolower($firstTagName) . ' ' . $tagParts[1]) . $tagEnd;
01063                 }
01064             }
01065         }
01066         $content = implode('', $parts);
01067 
01068             // Fix <style> section:
01069         $prefix = isset($alternatives['style']) ? $alternatives['style'] : $main_prefix;
01070         if (strlen($prefix)) {
01071             $parts = $this->splitIntoBlock('style', $content);
01072             foreach ($parts as $k => $v) {
01073                 if ($k % 2) {
01074                     $parts[$k] = preg_replace('/(url[[:space:]]*\([[:space:]]*["\']?)([^"\')]*)(["\']?[[:space:]]*\))/i', '\1' . $prefix . '\2' . $suffix . '\3', $parts[$k]);
01075                 }
01076             }
01077             $content = implode('', $parts);
01078         }
01079 
01080         return $content;
01081     }
01082 
01083     /**
01084      * Internal sub-function for ->prefixResourcePath()
01085      *
01086      * @param   string      Prefix string
01087      * @param   string      Relative path/URL
01088      * @param   string      Suffix string
01089      * @return  string      Output path, prefixed if no scheme in input string
01090      * @access private
01091      */
01092     function prefixRelPath($prefix, $srcVal, $suffix = '') {
01093             // Only prefix if it's not an absolute URL or
01094             // only a link to a section within the page.
01095         if (substr($srcVal, 0, 1) != '/' && substr($srcVal, 0, 1) != '#') {
01096             $urlParts = parse_url($srcVal);
01097                 // only prefix URLs without a scheme
01098             if (!$urlParts['scheme']) {
01099                 $srcVal = $prefix . $srcVal . $suffix;
01100             }
01101         }
01102         return $srcVal;
01103     }
01104 
01105     /**
01106      * Cleans up the input $value for fonttags.
01107      * If keepFace,-Size and -Color is set then font-tags with an allowed property is kept. Else deleted.
01108      *
01109      * @param   string      HTML content with font-tags inside to clean up.
01110      * @param   boolean     If set, keep "face" attribute
01111      * @param   boolean     If set, keep "size" attribute
01112      * @param   boolean     If set, keep "color" attribute
01113      * @return  string      Processed HTML content
01114      */
01115     function cleanFontTags($value, $keepFace = 0, $keepSize = 0, $keepColor = 0) {
01116         $fontSplit = $this->splitIntoBlock('font', $value); // ,1 ?? - could probably be more stable if splitTags() was used since this depends on end-tags being properly set!
01117         foreach ($fontSplit as $k => $v) {
01118             if ($k % 2) { // font:
01119                 $attribArray = $this->get_tag_attributes_classic($this->getFirstTag($v));
01120                 $newAttribs = array();
01121                 if ($keepFace && $attribArray['face']) {
01122                     $newAttribs[] = 'face="' . $attribArray['face'] . '"';
01123                 }
01124                 if ($keepSize && $attribArray['size']) {
01125                     $newAttribs[] = 'size="' . $attribArray['size'] . '"';
01126                 }
01127                 if ($keepColor && $attribArray['color']) {
01128                     $newAttribs[] = 'color="' . $attribArray['color'] . '"';
01129                 }
01130 
01131                 $innerContent = $this->cleanFontTags($this->removeFirstAndLastTag($v), $keepFace, $keepSize, $keepColor);
01132                 if (count($newAttribs)) {
01133                     $fontSplit[$k] = '<font ' . implode(' ', $newAttribs) . '>' . $innerContent . '</font>';
01134                 } else {
01135                     $fontSplit[$k] = $innerContent;
01136                 }
01137             }
01138         }
01139         return implode('', $fontSplit);
01140     }
01141 
01142     /**
01143      * This is used to map certain tag-names into other names.
01144      *
01145      * @param   string      HTML content
01146      * @param   array       Array with tag key=>value pairs where key is from-tag and value is to-tag
01147      * @param   string      Alternative less-than char to search for (search regex string)
01148      * @param   string      Alternative less-than char to replace with (replace regex string)
01149      * @return  string      Processed HTML content
01150      */
01151     function mapTags($value, $tags = array(), $ltChar = '<', $ltChar2 = '<') {
01152 
01153         foreach ($tags as $from => $to) {
01154             $value = preg_replace('/' . preg_quote($ltChar) . '(\/)?' . $from . '\s([^>])*(\/)?>/', $ltChar2 . '$1' . $to . ' $2$3>', $value);
01155         }
01156         return $value;
01157     }
01158 
01159     /**
01160      * This converts htmlspecialchar()'ed tags (from $tagList) back to real tags. Eg. '&lt;strong&gt' would be converted back to '<strong>' if found in $tagList
01161      *
01162      * @param   string      HTML content
01163      * @param   string      Tag list, separated by comma. Lowercase!
01164      * @return  string      Processed HTML content
01165      */
01166     function unprotectTags($content, $tagList = '') {
01167         $tagsArray = t3lib_div::trimExplode(',', $tagList, 1);
01168         $contentParts = explode('&lt;', $content);
01169         next($contentParts); // bypass the first
01170         while (list($k, $tok) = each($contentParts)) {
01171             $firstChar = substr($tok, 0, 1);
01172             if (strcmp(trim($firstChar), '')) {
01173                 $subparts = explode('&gt;', $tok, 2);
01174                 $tagEnd = strlen($subparts[0]);
01175                 if (strlen($tok) != $tagEnd) {
01176                     $endTag = $firstChar == '/' ? 1 : 0;
01177                     $tagContent = substr($tok, $endTag, $tagEnd - $endTag);
01178                     $tagParts = preg_split('/\s+/s', $tagContent, 2);
01179                     $tagName = strtolower($tagParts[0]);
01180                     if (!strcmp($tagList, '') || in_array($tagName, $tagsArray)) {
01181                         $contentParts[$k] = '<' . $subparts[0] . '>' . $subparts[1];
01182                     } else {
01183                         $contentParts[$k] = '&lt;' . $tok;
01184                     }
01185                 } else {
01186                     $contentParts[$k] = '&lt;' . $tok;
01187                 }
01188             } else {
01189                 $contentParts[$k] = '&lt;' . $tok;
01190             }
01191         }
01192 
01193         return implode('', $contentParts);
01194     }
01195 
01196     /**
01197      * Strips tags except the tags in the list, $tagList
01198      * OBSOLETE - use PHP function strip_tags()
01199      *
01200      * @param   string      Value to process
01201      * @param   string      List of tags
01202      * @return  string      Output value
01203      * @ignore
01204      */
01205     function stripTagsExcept($value, $tagList) {
01206         $tags = t3lib_div::trimExplode(',', $tagList, 1);
01207         $forthArr = array();
01208         $backArr = array();
01209         foreach ($tags as $theTag) {
01210             $forthArr[$theTag] = md5($theTag);
01211             $backArr[md5($theTag)] = $theTag;
01212         }
01213         $value = $this->mapTags($value, $forthArr, '<', '_');
01214         $value = strip_tags($value);
01215         $value = $this->mapTags($value, $backArr, '_', '<');
01216         return $value;
01217     }
01218 
01219     /**
01220      * Internal function for case shifting of a string or whole array
01221      *
01222      * @param   mixed       Input string/array
01223      * @param   boolean     If $str is a string AND this boolean(caseSensitive) is false, the string is returned in uppercase
01224      * @param   string      Key string used for internal caching of the results. Could be an MD5 hash of the serialized version of the input $str if that is an array.
01225      * @return  string      Output string, processed
01226      * @access private
01227      */
01228     function caseShift($str, $flag, $cacheKey = '') {
01229         $cacheKey .= $flag ? 1 : 0;
01230         if (is_array($str)) {
01231             if (!$cacheKey || !isset($this->caseShift_cache[$cacheKey])) {
01232                 reset($str);
01233                 foreach ($str as $k => $v) {
01234                     if (!$flag) {
01235                         $str[$k] = strtoupper($v);
01236                     }
01237                 }
01238                 if ($cacheKey) {
01239                     $this->caseShift_cache[$cacheKey] = $str;
01240                 }
01241             } else {
01242                 $str = $this->caseShift_cache[$cacheKey];
01243             }
01244         } elseif (!$flag) {
01245             $str = strtoupper($str);
01246         }
01247         return $str;
01248     }
01249 
01250     /**
01251      * Compiling an array with tag attributes into a string
01252      *
01253      * @param   array       Tag attributes
01254      * @param   array       Meta information about these attributes (like if they were quoted)
01255      * @param   boolean     If set, then the attribute names will be set in lower case, value quotes in double-quotes and the value will be htmlspecialchar()'ed
01256      * @return  string      Imploded attributes, eg: 'attribute="value" attrib2="value2"'
01257      * @access private
01258      */
01259     function compileTagAttribs($tagAttrib, $meta = array(), $xhtmlClean = 0) {
01260         $accu = array();
01261         foreach ($tagAttrib as $k => $v) {
01262             if ($xhtmlClean) {
01263                 $attr = strtolower($k);
01264                 if (strcmp($v, '') || isset($meta[$k]['dashType'])) {
01265                     $attr .= '="' . htmlspecialchars($v) . '"';
01266                 }
01267             } else {
01268                 $attr = $meta[$k]['origTag'] ? $meta[$k]['origTag'] : $k;
01269                 if (strcmp($v, '') || isset($meta[$k]['dashType'])) {
01270                     $dash = $meta[$k]['dashType'] ? $meta[$k]['dashType'] : (t3lib_div::testInt($v) ? '' : '"');
01271                     $attr .= '=' . $dash . $v . $dash;
01272                 }
01273             }
01274             $accu[] = $attr;
01275         }
01276         return implode(' ', $accu);
01277     }
01278 
01279     /**
01280      * Get tag attributes, the classic version (which had some limitations?)
01281      *
01282      * @param   string      The tag
01283      * @param   boolean     De-htmlspecialchar flag.
01284      * @return  array
01285      * @access private
01286      */
01287     function get_tag_attributes_classic($tag, $deHSC = 0) {
01288         $attr = $this->get_tag_attributes($tag, $deHSC);
01289         return is_array($attr[0]) ? $attr[0] : array();
01290     }
01291 
01292     /**
01293      * Indents input content with $number instances of $indentChar
01294      *
01295      * @param   string      Content string, multiple lines.
01296      * @param   integer     Number of indents
01297      * @param   string      Indent character/string
01298      * @return  string      Indented code (typ. HTML)
01299      */
01300     function indentLines($content, $number = 1, $indentChar = TAB) {
01301         $preTab = str_pad('', $number * strlen($indentChar), $indentChar);
01302         $lines = explode(LF, str_replace(CR, '', $content));
01303         foreach ($lines as $k => $v) {
01304             $lines[$k] = $preTab . $v;
01305         }
01306         return implode(LF, $lines);
01307     }
01308 
01309     /**
01310      * Converts TSconfig into an array for the HTMLcleaner function.
01311      *
01312      * @param   array       TSconfig for HTMLcleaner
01313      * @param   array       Array of tags to keep (?)
01314      * @return  array
01315      * @access private
01316      */
01317     function HTMLparserConfig($TSconfig, $keepTags = array()) {
01318             // Allow tags (base list, merged with incoming array)
01319         $alTags = array_flip(t3lib_div::trimExplode(',', strtolower($TSconfig['allowTags']), 1));
01320         $keepTags = array_merge($alTags, $keepTags);
01321 
01322             // Set config properties.
01323         if (is_array($TSconfig['tags.'])) {
01324             foreach ($TSconfig['tags.'] as $key => $tagC) {
01325                 if (!is_array($tagC) && $key == strtolower($key)) {
01326                     if (!strcmp($tagC, '0')) {
01327                         unset($keepTags[$key]);
01328                     }
01329                     if (!strcmp($tagC, '1') && !isset($keepTags[$key])) {
01330                         $keepTags[$key] = 1;
01331                     }
01332                 }
01333             }
01334 
01335             foreach ($TSconfig['tags.'] as $key => $tagC) {
01336                 if (is_array($tagC) && $key == strtolower($key)) {
01337                     $key = substr($key, 0, -1);
01338                     if (!is_array($keepTags[$key])) {
01339                         $keepTags[$key] = array();
01340                     }
01341                     if (is_array($tagC['fixAttrib.'])) {
01342                         foreach ($tagC['fixAttrib.'] as $atName => $atConfig) {
01343                             if (is_array($atConfig)) {
01344                                 $atName = substr($atName, 0, -1);
01345                                 if (!is_array($keepTags[$key]['fixAttrib'][$atName])) {
01346                                     $keepTags[$key]['fixAttrib'][$atName] = array();
01347                                 }
01348                                 $keepTags[$key]['fixAttrib'][$atName] = array_merge($keepTags[$key]['fixAttrib'][$atName], $atConfig); // Candidate for t3lib_div::array_merge() if integer-keys will some day make trouble...
01349                                 if (strcmp($keepTags[$key]['fixAttrib'][$atName]['range'], '')) {
01350                                     $keepTags[$key]['fixAttrib'][$atName]['range'] = t3lib_div::trimExplode(',', $keepTags[$key]['fixAttrib'][$atName]['range']);
01351                                 }
01352                                 if (strcmp($keepTags[$key]['fixAttrib'][$atName]['list'], '')) {
01353                                     $keepTags[$key]['fixAttrib'][$atName]['list'] = t3lib_div::trimExplode(',', $keepTags[$key]['fixAttrib'][$atName]['list']);
01354                                 }
01355                             }
01356                         }
01357                     }
01358                     unset($tagC['fixAttrib.']);
01359                     unset($tagC['fixAttrib']);
01360                     $keepTags[$key] = array_merge($keepTags[$key], $tagC); // Candidate for t3lib_div::array_merge() if integer-keys will some day make trouble...
01361                 }
01362             }
01363         }
01364             // localNesting
01365         if ($TSconfig['localNesting']) {
01366             $lN = t3lib_div::trimExplode(',', strtolower($TSconfig['localNesting']), 1);
01367             foreach ($lN as $tn) {
01368                 if (isset($keepTags[$tn])) {
01369                     $keepTags[$tn]['nesting'] = 1;
01370                 }
01371             }
01372         }
01373         if ($TSconfig['globalNesting']) {
01374             $lN = t3lib_div::trimExplode(',', strtolower($TSconfig['globalNesting']), 1);
01375             foreach ($lN as $tn) {
01376                 if (isset($keepTags[$tn])) {
01377                     if (!is_array($keepTags[$tn])) {
01378                         $keepTags[$tn] = array();
01379                     }
01380                     $keepTags[$tn]['nesting'] = 'global';
01381                 }
01382             }
01383         }
01384         if ($TSconfig['rmTagIfNoAttrib']) {
01385             $lN = t3lib_div::trimExplode(',', strtolower($TSconfig['rmTagIfNoAttrib']), 1);
01386             foreach ($lN as $tn) {
01387                 if (isset($keepTags[$tn])) {
01388                     if (!is_array($keepTags[$tn])) {
01389                         $keepTags[$tn] = array();
01390                     }
01391                     $keepTags[$tn]['rmTagIfNoAttrib'] = 1;
01392                 }
01393             }
01394         }
01395         if ($TSconfig['noAttrib']) {
01396             $lN = t3lib_div::trimExplode(',', strtolower($TSconfig['noAttrib']), 1);
01397             foreach ($lN as $tn) {
01398                 if (isset($keepTags[$tn])) {
01399                     if (!is_array($keepTags[$tn])) {
01400                         $keepTags[$tn] = array();
01401                     }
01402                     $keepTags[$tn]['allowedAttribs'] = 0;
01403                 }
01404             }
01405         }
01406         if ($TSconfig['removeTags']) {
01407             $lN = t3lib_div::trimExplode(',', strtolower($TSconfig['removeTags']), 1);
01408             foreach ($lN as $tn) {
01409                 $keepTags[$tn] = array();
01410                 $keepTags[$tn]['allowedAttribs'] = 0;
01411                 $keepTags[$tn]['rmTagIfNoAttrib'] = 1;
01412             }
01413         }
01414 
01415             // Create additional configuration:
01416         $addConfig = array();
01417         if ($TSconfig['xhtml_cleaning']) {
01418             $addConfig['xhtml'] = 1;
01419         }
01420 
01421         return array(
01422             $keepTags,
01423             '' . $TSconfig['keepNonMatchedTags'],
01424             intval($TSconfig['htmlSpecialChars']),
01425             $addConfig
01426         );
01427     }
01428 
01429     /**
01430      * Tries to convert the content to be XHTML compliant and other stuff like that.
01431      * STILL EXPERIMENTAL. See comments below.
01432      *
01433      *           What it does NOT do (yet) according to XHTML specs.:
01434      *           - Wellformedness: Nesting is NOT checked
01435      *           - name/id attribute issue is not observed at this point.
01436      *           - Certain nesting of elements not allowed. Most interesting, <PRE> cannot contain img, big,small,sub,sup ...
01437      *           - Wrapping scripts and style element contents in CDATA - or alternatively they should have entitites converted.
01438      *           - Setting charsets may put some special requirements on both XML declaration/ meta-http-equiv. (C.9)
01439      *           - UTF-8 encoding is in fact expected by XML!!
01440      *           - stylesheet element and attribute names are NOT converted to lowercase
01441      *           - ampersands (and entities in general I think) MUST be converted to an entity reference! (&amps;). This may mean further conversion of non-tag content before output to page. May be related to the charset issue as a whole.
01442      *           - Minimized values not allowed: Must do this: selected="selected"
01443      *
01444      *           What it does at this point:
01445      *           - All tags (frame,base,meta,link + img,br,hr,area,input) is ended with "/>" - others?
01446      *           - Lowercase for elements and attributes
01447      *           - All attributes in quotes
01448      *           - Add "alt" attribute to img-tags if it's not there already.
01449      *
01450      * @param   string      Content to clean up
01451      * @return  string      Cleaned up content returned.
01452      * @access private
01453      */
01454     function XHTML_clean($content) {
01455         $content = $this->HTMLcleaner(
01456             $content,
01457             array(), // No tags treated specially
01458             1, // Keep ALL tags.
01459             0, // All content is htmlspecialchar()'ed (or ??) - if we do, <script> content will break...
01460             array('xhtml' => 1)
01461         );
01462         return $content;
01463     }
01464 
01465     /**
01466      * Processing all tags themselves
01467      * (Some additions by Sacha Vorbeck)
01468      *
01469      * @param   string      Tag to process
01470      * @param   array       Configuration array passing instructions for processing. If count()==0, function will return value unprocessed. See source code for details
01471      * @param   boolean     Is endtag, then set this.
01472      * @param   boolean     If set, just return value straight away
01473      * @return  string      Processed value.
01474      * @access private
01475      */
01476     function processTag($value, $conf, $endTag, $protected = 0) {
01477             // Return immediately if protected or no parameters
01478         if ($protected || !count($conf)) {
01479             return $value;
01480         }
01481             // OK then, begin processing for XHTML output:
01482             // STILL VERY EXPERIMENTAL!!
01483         if ($conf['xhtml']) {
01484             if ($endTag) { // Endtags are just set lowercase right away
01485                 $value = strtolower($value);
01486             } elseif (substr($value, 0, 4) != '<!--') { // ... and comments are ignored.
01487                 $inValue = substr($value, 1, (substr($value, -2) == '/>' ? -2 : -1)); // Finding inner value with out < >
01488                 list($tagName, $tagP) = preg_split('/\s+/s', $inValue, 2); // Separate attributes and tagname
01489                 $tagName = strtolower($tagName);
01490 
01491                 // Process attributes
01492                 $tagAttrib = $this->get_tag_attributes($tagP);
01493                 if (!strcmp($tagName, 'img') && !isset($tagAttrib[0]['alt'])) {
01494                     $tagAttrib[0]['alt'] = '';
01495                 } // Set alt attribute for all images (not XHTML though...)
01496                 if (!strcmp($tagName, 'script') && !isset($tagAttrib[0]['type'])) {
01497                     $tagAttrib[0]['type'] = 'text/javascript';
01498                 } // Set type attribute for all script-tags
01499                 $outA = array();
01500                 foreach ($tagAttrib[0] as $attrib_name => $attrib_value) {
01501                         // Set attributes: lowercase, always in quotes, with htmlspecialchars converted.
01502                     $outA[] = $attrib_name . '="' . $this->bidir_htmlspecialchars($attrib_value, 2) . '"';
01503                 }
01504                 $newTag = '<' . trim($tagName . ' ' . implode(' ', $outA));
01505                     // All tags that are standalone (not wrapping, not having endtags) should be ended with '/>'
01506                 if (t3lib_div::inList('img,br,hr,meta,link,base,area,input,param,col', $tagName) || substr($value, -2) == '/>') {
01507                     $newTag .= ' />';
01508                 } else {
01509                     $newTag .= '>';
01510                 }
01511                 $value = $newTag;
01512             }
01513         }
01514 
01515         return $value;
01516     }
01517 
01518     /**
01519      * Processing content between tags for HTML_cleaner
01520      *
01521      * @param   string      The value
01522      * @param   integer     Direction, either -1 or +1. 0 (zero) means no change to input value.
01523      * @param   mixed       Not used, ignore.
01524      * @return  string      The processed value.
01525      * @access private
01526      */
01527     function processContent($value, $dir, $conf) {
01528         if ($dir != 0) {
01529             $value = $this->bidir_htmlspecialchars($value, $dir);
01530         }
01531         return $value;
01532     }
01533 }
01534 
01535 
// XCLASS hook: if TYPO3_MODE is defined and an extension file is registered for
// this script in TYPO3_CONF_VARS, include that file (once).
if (defined('TYPO3_MODE')) {
    if (isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_parsehtml.php'])) {
        include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_parsehtml.php']);
    }
}
01539 
01540 ?>