Parse.php

Go to the documentation of this file.
00001 <?php
00002 
00003 /**
00004  * This module implements a VERY limited parser that finds <link> tags
00005  * in the head of HTML or XHTML documents and parses out their
00006  * attributes according to the OpenID spec. It is a liberal parser,
00007  * but it requires these things from the data in order to work:
00008  *
00009  * - There must be an open <html> tag
00010  *
00011  * - There must be an open <head> tag inside of the <html> tag
00012  *
00013  * - Only <link>s that are found inside of the <head> tag are parsed
00014  *   (this is by design)
00015  *
00016  * - The parser follows the OpenID specification in resolving the
00017  *   attributes of the link tags. This means that the attributes DO
00018  *   NOT get resolved as they would by an XML or HTML parser. In
00019  *   particular, only certain entities get replaced, and href
00020  *   attributes do not get resolved relative to a base URL.
00021  *
00022  * From http://openid.net/specs.bml:
00023  *
00024  * - The openid.server URL MUST be an absolute URL. OpenID consumers
00025  *   MUST NOT attempt to resolve relative URLs.
00026  *
00027  * - The openid.server URL MUST NOT include entities other than &amp;,
00028  *   &lt;, &gt;, and &quot;.
00029  *
00030  * The parser ignores SGML comments and <![CDATA[blocks]]>. Both kinds
00031  * of quoting are allowed for attributes.
00032  *
00033  * The parser deals with invalid markup in these ways:
00034  *
00035  * - Tag names are not case-sensitive
00036  *
00037  * - The <html> tag is accepted even when it is not at the top level
00038  *
00039  * - The <head> tag is accepted even when it is not a direct child of
00040  *   the <html> tag, but a <html> tag must be an ancestor of the
00041  *   <head> tag
00042  *
00043  * - <link> tags are accepted even when they are not direct children
00044  *   of the <head> tag, but a <head> tag must be an ancestor of the
00045  *   <link> tag
00046  *
00047  * - If there is no closing tag for an open <html> or <head> tag, the
00048  *   remainder of the document is viewed as being inside of the
00049  *   tag. If there is no closing tag for a <link> tag, the link tag is
00050  *   treated as a short tag. Exceptions to this rule are that <html>
00051  *   closes <html> and <body> or <head> closes <head>
00052  *
00053  * - Attributes of the <link> tag are not required to be quoted.
00054  *
00055  * - In the case of duplicated attribute names, the attribute coming
00056  *   last in the tag will be the value returned.
00057  *
00058  * - Any text that does not parse as an attribute within a link tag
00059  *   will be ignored. (e.g. <link pumpkin rel='openid.server' /> will
00060  *   ignore pumpkin)
00061  *
00062  * - If there is more than one <html> or <head> tag, the parser only
00063  *   looks inside of the first one.
00064  *
00065  * - The contents of closed <script> elements are ignored entirely. An
00066  *   unclosed <script> tag is itself ignored; markup after it is still parsed.
00067  *
00068  * - Any other invalid markup is ignored, including unclosed SGML
00069  *   comments and unclosed <![CDATA[blocks.
00070  *
00071  * PHP versions 4 and 5
00072  *
00073  * LICENSE: See the COPYING file included in this distribution.
00074  *
00075  * @access private
00076  * @package OpenID
00077  * @author JanRain, Inc. <openid@janrain.com>
00078  * @copyright 2005-2008 Janrain, Inc.
00079  * @license http://www.apache.org/licenses/LICENSE-2.0 Apache
00080  */
00081 
00082 /**
00083  * Require Auth_OpenID::arrayGet().
00084  */
00085 require_once "Auth/OpenID.php";
00086 
/**
 * Liberal, regex-based finder for <link> tags inside the <head> of an
 * HTML or XHTML document.
 *
 * Typical use: call parseLinkAttrs() to obtain one attribute array per
 * <link> tag, then findFirstHref() / findLinksRel() to select by "rel".
 */
class Auth_OpenID_Parse {

    /**
     * Specify some flags for use with regex matching ("s": dot matches
     * newlines, "i": case-insensitive).
     */
    var $_re_flags = "si";

    /**
     * Stuff to remove before we start looking for tags: SGML comments,
     * CDATA blocks and closed <script> elements. The constructor wraps
     * this in delimiters and appends the flags.
     */
    var $_removed_re =
           "<!--.*?-->|<!\[CDATA\[.*?\]\]>|<script\b(?!:)[^>]*>.*?<\/script>";

    /**
     * Starts with the tag name at a word boundary, where the tag name
     * is not a namespace. Used as a sprintf() format by tagMatcher():
     * first %s is the tag name, second %s is the closing tag name(s).
     */
    var $_tag_expr = "<%s\b(?!:)([^>]*?)(?:\/>|>(.*?)(?:<\/?%s\s*>|\Z))";

    /**
     * Matches one name=value attribute; the value may be double-quoted,
     * single-quoted or unquoted. The constructor wraps this in
     * delimiters and appends the flags.
     */
    var $_attr_find = '\b(\w+)=("[^"]*"|\'[^\']*\'|[^\'"\s\/<>]+)';

    /**
     * sprintf() formats used by openTag() and closeTag().
     */
    var $_open_tag_expr = "<%s\b";
    var $_close_tag_expr = "<((\/%s\b)|(%s[^>\/]*\/))>";

    /**
     * Complete regex matching a single <link ...> tag. Set by the
     * constructor.
     */
    var $_link_find;

    /**
     * Map of entity name => replacement character. Set by the
     * constructor.
     */
    var $_entity_replacements;

    /**
     * Pattern fragment matching any of the supported entities. Set by
     * the constructor.
     */
    var $_ent_replace;

    /**
     * Build the complete regular expressions from the pattern
     * fragments declared above.
     */
    function __construct()
    {
        $this->_link_find = sprintf("/<link\b(?!:)([^>]*)(?!<)>/%s",
                                    $this->_re_flags);

        $this->_entity_replacements = array(
                                            'amp' => '&',
                                            'lt' => '<',
                                            'gt' => '>',
                                            'quot' => '"'
                                            );

        $this->_attr_find = sprintf("/%s/%s",
                                    $this->_attr_find,
                                    $this->_re_flags);

        $this->_removed_re = sprintf("/%s/%s",
                                     $this->_removed_re,
                                     $this->_re_flags);

        // Alternation over the entity NAMES (amp|lt|gt|quot), not over
        // the characters they decode to.
        $this->_ent_replace =
            sprintf("&(%s);", implode("|",
                                      array_keys($this->_entity_replacements)));
    }

    /**
     * PHP 4 style constructor, kept so that PHP 4 and explicit
     * parent::Auth_OpenID_Parse() calls keep working. PHP 8 no longer
     * treats a method named after the class as a constructor, so the
     * real work lives in __construct().
     */
    function Auth_OpenID_Parse()
    {
        $this->__construct();
    }

    /**
     * Returns a regular expression that will match a given tag in an
     * SGML string.
     *
     * @param string $tag_name Name of the tag to match
     * @param array $close_tags Optional names of additional tags whose
     * opening or closing also terminates the match
     * @return string A complete regex, delimiters and flags included
     */
    function tagMatcher($tag_name, $close_tags = null)
    {
        if ($close_tags) {
            $options = implode("|", array_merge(array($tag_name), $close_tags));
            $closer = sprintf("(?:%s)", $options);
        } else {
            $closer = $tag_name;
        }

        $expr = sprintf($this->_tag_expr, $tag_name, $closer);
        return sprintf("/%s/%s", $expr, $this->_re_flags);
    }

    /**
     * Returns a regex matching the start of an opening $tag_name tag.
     */
    function openTag($tag_name)
    {
        $expr = sprintf($this->_open_tag_expr, $tag_name);
        return sprintf("/%s/%s", $expr, $this->_re_flags);
    }

    /**
     * Returns a regex matching either a closing $tag_name tag or a
     * $tag_name tag ending in "/>".
     */
    function closeTag($tag_name)
    {
        $expr = sprintf($this->_close_tag_expr, $tag_name, $tag_name);
        return sprintf("/%s/%s", $expr, $this->_re_flags);
    }

    /**
     * Locate the first "<html" in $s.
     *
     * @param string $s The text to search
     * @return mixed Byte offset of the match, or false if there is none
     */
    function htmlBegin($s)
    {
        $matches = array();
        $result = preg_match($this->openTag('html'), $s,
                             $matches, PREG_OFFSET_CAPTURE);
        if ($result === false || !$matches) {
            return false;
        }
        // Return the offset of the first match.
        return $matches[0][1];
    }

    /**
     * Locate the first tag in $s that closes <html>.
     *
     * @param string $s The text to search
     * @return mixed Byte offset, or false if there is no such tag
     */
    function htmlEnd($s)
    {
        $matches = array();
        $result = preg_match($this->closeTag('html'), $s,
                             $matches, PREG_OFFSET_CAPTURE);
        if ($result === false || !$matches) {
            return false;
        }
        // Return the offset of the last captured group. Both
        // alternatives of the pattern start right after the "<", so
        // this is the offset of the character following the "<".
        return $matches[count($matches) - 1][1];
    }

    /**
     * Returns a regex matching a <head> element that is terminated by
     * a head, body or html tag, or by the end of the input.
     */
    function headFind()
    {
        return $this->tagMatcher('head', array('body', 'html'));
    }

    /**
     * Decode the entities listed in $this->_entity_replacements.
     *
     * @param string $str Attribute value to decode
     * @return string $str with each supported entity decoded once
     */
    function replaceEntities($str)
    {
        $pairs = array();
        foreach ($this->_entity_replacements as $name => $char) {
            $pairs['&' . $name . ';'] = $char;
        }

        // strtr() does not rescan text it has already replaced, so
        // "&amp;lt;" becomes "&lt;" rather than being decoded twice.
        return strtr($str, $pairs);
    }

    /**
     * Strip one pair of matching surrounding quotes from $str.
     *
     * @param string $str A raw attribute value
     * @return string The value without its enclosing quotes, or $str
     * unchanged when it is not quoted
     */
    function removeQuotes($str)
    {
        $matches = array();
        // The "s" modifier lets "." span newlines: quoted attribute
        // values may contain line breaks.
        $double = '/^"(.*)"$/s';
        $single = '/^\'(.*)\'$/s';

        if (preg_match($double, $str, $matches)) {
            return $matches[1];
        } else if (preg_match($single, $str, $matches)) {
            return $matches[1];
        } else {
            return $str;
        }
    }

    /**
     * Find all link tags in a string representing a HTML document and
     * return a list of their attributes.
     *
     * @param string $html The text to parse
     * @return array $list An array of arrays of attributes, one for each
     * link tag
     */
    function parseLinkAttrs($html)
    {
        $stripped = preg_replace($this->_removed_re,
                                 "",
                                 $html);

        // preg_replace() yields null when PCRE reports an error.
        if (!is_string($stripped)) {
            return array();
        }

        $html_begin = $this->htmlBegin($stripped);
        $html_end = $this->htmlEnd($stripped);

        if ($html_begin === false) {
            return array();
        }

        if ($html_end === false) {
            $html_end = strlen($stripped);
        }

        $stripped = substr($stripped, $html_begin,
                           $html_end - $html_begin);

        // Try to find the <HEAD> tag.
        $head_re = $this->headFind();
        $head_matches = array();
        if (!preg_match($head_re, $stripped, $head_matches)) {
            return array();
        }

        $link_data = array();
        $link_matches = array();

        if (!preg_match_all($this->_link_find, $head_matches[0],
                            $link_matches)) {
            return array();
        }

        foreach ($link_matches[0] as $link) {
            $attr_matches = array();
            preg_match_all($this->_attr_find, $link, $attr_matches);
            $link_attrs = array();
            foreach ($attr_matches[0] as $index => $full_match) {
                $name = $attr_matches[1][$index];
                $value = $this->replaceEntities(
                              $this->removeQuotes($attr_matches[2][$index]));

                // Names are lower-cased; a repeated attribute
                // overwrites the earlier value.
                $link_attrs[strtolower($name)] = $value;
            }
            $link_data[] = $link_attrs;
        }

        return $link_data;
    }

    /**
     * Does $target_rel appear among the whitespace-separated
     * relationships in $rel_attr? The comparison is case-insensitive.
     *
     * @param string $rel_attr Value of a "rel" attribute
     * @param string $target_rel Relationship to look for
     * @return int 1 if present, 0 otherwise
     */
    function relMatches($rel_attr, $target_rel)
    {
        // XXX: TESTME
        $target = strtolower($target_rel);
        $rels = preg_split("/\s+/", trim($rel_attr));
        foreach ($rels as $rel) {
            // Strict comparison: "==" would treat numeric-looking
            // strings such as "1e3" and "1000" as equal.
            if (strtolower($rel) === $target) {
                return 1;
            }
        }

        return 0;
    }

    /**
     * Does this link have target_rel as a relationship?
     *
     * @param array $link_attrs Attributes of one link tag
     * @param string $target_rel Relationship to look for
     * @return bool
     */
    function linkHasRel($link_attrs, $target_rel)
    {
        // XXX: TESTME
        $rel_attr = Auth_OpenID::arrayGet($link_attrs, 'rel', null);
        return ($rel_attr && $this->relMatches($rel_attr,
                                               $target_rel));
    }

    /**
     * Filter the list of link attributes on whether it has
     * target_rel as a relationship.
     *
     * @param array $link_attrs_list Result of parseLinkAttrs()
     * @param string $target_rel Relationship to look for
     * @return array The matching entries, in their original order
     */
    function findLinksRel($link_attrs_list, $target_rel)
    {
        // XXX: TESTME
        $result = array();
        foreach ($link_attrs_list as $attr) {
            if ($this->linkHasRel($attr, $target_rel)) {
                $result[] = $attr;
            }
        }

        return $result;
    }

    /**
     * Return the value of the href attribute for the first link
     * tag in the list that has target_rel as a relationship.
     *
     * @param array $link_attrs_list Result of parseLinkAttrs()
     * @param string $target_rel Relationship to look for
     * @return mixed null when no link has the relationship; otherwise
     * whatever Auth_OpenID::arrayGet() returns for 'href' with a
     * default of null
     */
    function findFirstHref($link_attrs_list, $target_rel)
    {
        // XXX: TESTME
        $matches = $this->findLinksRel($link_attrs_list,
                                       $target_rel);
        if (!$matches) {
            return null;
        }
        $first = $matches[0];
        return Auth_OpenID::arrayGet($first, 'href', null);
    }
}
00332 
/**
 * Extract the server and delegate URLs from the <link> tags of an
 * HTML document.
 *
 * @param string $html_text The document to parse
 * @param string $server_rel The "rel" value identifying the server link
 * @param string $delegate_rel The "rel" value identifying the delegate link
 * @return mixed false when no href is found for $server_rel; otherwise
 * array($delegate_url, $server_url), where $delegate_url is null when
 * no href is found for $delegate_rel
 */
function Auth_OpenID_legacy_discover($html_text, $server_rel,
                                     $delegate_rel)
{
    $parser = new Auth_OpenID_Parse();
    $links = $parser->parseLinkAttrs($html_text);

    // Without a server link there is nothing to report.
    $server = $parser->findFirstHref($links, $server_rel);
    if ($server === null) {
        return false;
    }

    $delegate = $parser->findFirstHref($links, $delegate_rel);

    return array($delegate, $server);
}
00351 
00352 ?>

Generated on Sat Sep 4 04:17:21 2010 for TYPO3 API by  doxygen 1.4.7