Proyectos de Subversion Moodle

Rev

Autoría | Ultima modificación | Ver Log |

<?php

/**
 * Copyright (c) 2008, David R. Nadeau, NadeauSoftware.com.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      * Redistributions of source code must retain the above copyright
 *        notice, this list of conditions and the following disclaimer.
 *
 *      * Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials provided
 *        with the distribution.
 *
 *      * Neither the names of David R. Nadeau or NadeauSoftware.com, nor
 *        the names of its contributors may be used to endorse or promote
 *        products derived from this software without specific prior
 *        written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
 * OF SUCH DAMAGE.
 */

/*
 * This is a BSD License approved by the Open Source Initiative (OSI).
 * See:  http://www.opensource.org/licenses/bsd-license.php
 */

// Stop executing immediately unless the MOODLE_INTERNAL constant is defined.
// NOTE(review): presumably that constant is set by Moodle's bootstrap so this
// library cannot be requested directly - confirm against the including code.
defined('MOODLE_INTERNAL') || die();

/**
 * Combine a base URL and a relative URL to produce a new
 * absolute URL.  The base URL is often the URL of a page,
 * and the relative URL is a URL embedded on that page.
 *
 * This function implements the "absolutize" algorithm from
 * the RFC3986 specification for URLs.
 *
 * This function supports multi-byte characters with the UTF-8 encoding,
 * per the URL specification.
 *
 * Parameters:
 *      baseUrl         the absolute base URL.
 *
 *      relativeUrl     the relative URL to convert.
 *
 * Return values:
 *      An absolute URL that combines parts of the base and relative
 *      URLs, or FALSE if the base URL is not absolute or if either
 *      URL cannot be parsed.
 */
function url_to_absolute( $baseUrl, $relativeUrl )
{
        // A relative URL that already carries a scheme is absolute: only its
        // path needs cleaning and the base URL is irrelevant.
        $r = split_url( $relativeUrl );
        if ( $r === FALSE )
                return FALSE;
        if ( !empty( $r['scheme'] ) )
        {
                if ( isset( $r['path'] ) && $r['path'] !== '' && $r['path'][0] == '/' )
                        $r['path'] = url_remove_dot_segments( $r['path'] );
                return join_url( $r );
        }

        // Make sure the base URL is absolute (has a scheme and a host).
        // Explicit checks are used instead of empty() so that the legal
        // host "0" is not mistaken for a missing host.
        $b = split_url( $baseUrl );
        if ( $b === FALSE || empty( $b['scheme'] ) ||
                !isset( $b['host'] ) || $b['host'] === '' )
                return FALSE;
        $r['scheme'] = $b['scheme'];
        if ( !isset( $b['path'] ) ) {
                $b['path'] = '';
        }

        // empty() would treat the one-character path "0" as "no path" and
        // silently return the base URL, so test for presence explicitly.
        $hasPath = isset( $r['path'] ) && $r['path'] !== '';

        // If relative URL has an authority ("//host/..."), clean path and return.
        if ( isset( $r['host'] ) )
        {
                if ( $hasPath )
                        $r['path'] = url_remove_dot_segments( $r['path'] );
                return join_url( $r );
        }
        unset( $r['port'] );
        unset( $r['user'] );
        unset( $r['pass'] );

        // Copy base authority.
        $r['host'] = $b['host'];
        if ( isset( $b['port'] ) ) $r['port'] = $b['port'];
        if ( isset( $b['user'] ) ) $r['user'] = $b['user'];
        if ( isset( $b['pass'] ) ) $r['pass'] = $b['pass'];

        // If relative URL has no path, use the base path; the base query is
        // inherited only when the relative URL has no query of its own.
        if ( !$hasPath )
        {
                if ( $b['path'] !== '' )
                        $r['path'] = $b['path'];
                if ( !isset( $r['query'] ) && isset( $b['query'] ) )
                        $r['query'] = $b['query'];
                return join_url( $r );
        }

        // If relative URL path doesn't start with /, merge with base path:
        // keep everything of the base path before its last '/'.
        if ( $r['path'][0] != '/' ) {
                $base = core_text::strrchr( $b['path'], '/', TRUE );
                if ( $base === FALSE ) {
                        $base = '';
                }
                $r['path'] = $base . '/' . $r['path'];
        }
        $r['path'] = url_remove_dot_segments( $r['path'] );
        return join_url( $r );
}

/**
 * Filter out "." and ".." segments from a URL's path and return
 * the result.
 *
 * This function implements the "remove_dot_segments" algorithm from
 * the RFC3986 specification for URLs.
 *
 * This function supports multi-byte characters with the UTF-8 encoding,
 * per the URL specification.
 *
 * Parameters:
 *      path    the path to filter
 *
 * Return values:
 *      The filtered path with "." and ".." removed.
 */
function url_remove_dot_segments( $path )
{
        // Nothing to normalise; this also avoids reading $path[0] on an
        // empty string, which raises a warning.
        $path = (string) $path;
        if ( $path === '' )
                return '';

        // '/' is a single ASCII byte that can never be part of a UTF-8
        // multi-byte sequence, so byte-wise splitting is safe for UTF-8
        // paths and does not fail on malformed input.
        $inSegs  = explode( '/', $path );
        $outSegs = array( );
        foreach ( $inSegs as $seg )
        {
                // Empty segments (doubled slashes) and "." are dropped.
                if ( $seg === '' || $seg === '.' )
                        continue;
                // ".." removes the previously kept segment, if any.
                if ( $seg === '..' )
                        array_pop( $outSegs );
                else
                        $outSegs[] = $seg;
        }
        $outPath = implode( '/', $outSegs );

        // Preserve the leading slash of an absolute path.
        if ( $path[0] === '/' ) {
                $outPath = '/' . $outPath;
        }

        // The path is non-empty, so the last segment is '' exactly when the
        // path ends with '/'.
        $lastSeg        = $inSegs[count( $inSegs ) - 1];
        $endsWithSlash  = ( $lastSeg === '' );
        // RFC 3986 section 5.2.4: a trailing "." or ".." names a directory,
        // so the result keeps a trailing slash ("/b/c/." => "/b/c/").
        $endsWithDotSeg = ( $lastSeg === '.' || $lastSeg === '..' );

        if ( $outPath !== '/' &&
                ( $endsWithSlash || ( $endsWithDotSeg && $outPath !== '' ) ) ) {
                $outPath .= '/';
        }
        return $outPath;
}

/**
 * This function parses an absolute or relative URL and splits it
 * into individual components.
 *
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * A portion of the ABNFs are repeated here:
 *
 *      URI-reference   = URI
 *                      / relative-ref
 *
 *      URI             = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 *
 *      relative-ref    = relative-part [ "?" query ] [ "#" fragment ]
 *
 *      hier-part       = "//" authority path-abempty
 *                      / path-absolute
 *                      / path-rootless
 *                      / path-empty
 *
 *      relative-part   = "//" authority path-abempty
 *                      / path-absolute
 *                      / path-noscheme
 *                      / path-empty
 *
 *      authority       = [ userinfo "@" ] host [ ":" port ]
 *
 * So, a URL has the following major components:
 *
 *      scheme
 *              The name of a method used to interpret the rest of
 *              the URL.  Examples:  "http", "https", "mailto", "file".
 *
 *      authority
 *              The name of the authority governing the URL's name
 *              space.  Examples:  "example.com", "user@example.com",
 *              "example.com:80", "user:password@example.com:80".
 *
 *              The authority may include a host name, port number,
 *              user name, and password.
 *
 *              The host may be a name, an IPv4 numeric address, or
 *              an IPv6 numeric address.
 *
 *      path
 *              The hierarchical path to the URL's resource.
 *              Examples:  "/index.htm", "/scripts/page.php".
 *
 *      query
 *              The data for a query.  Examples:  "?search=google.com".
 *
 *      fragment
 *              The name of a secondary resource relative to that named
 *              by the path.  Examples:  "#section1", "#header".
 *
 * An "absolute" URL must include a scheme and path.  The authority, query,
 * and fragment components are optional.
 *
 * A "relative" URL does not include a scheme and must include a path.  The
 * authority, query, and fragment components are optional.
 *
 * This function splits the $url argument into the following components
 * and returns them in an associative array.  Keys to that array include:
 *
 *      "scheme"        The scheme, such as "http".
 *      "host"          The host name, IPv4, or IPv6 address.
 *      "port"          The port number.
 *      "user"          The user name.
 *      "pass"          The user password.
 *      "path"          The path, such as a file path for "http".
 *      "query"         The query.
 *      "fragment"      The fragment.
 *
 * One or more of these may not be present, depending upon the URL.
 *
 * Optionally, the "user", "pass", "host" (if a name, not an IP address),
 * "path", "query", and "fragment" may have percent-encoded characters
 * decoded.  The "scheme" and "port" cannot include percent-encoded
 * characters and are never decoded.  Decoding occurs after the URL has
 * been parsed.
 *
 * Parameters:
 *      url             the URL to parse.
 *
 *      decode          an optional boolean flag selecting whether
 *                      to decode percent encoding or not.  Default = FALSE.
 *
 * Return values:
 *      the associative array of URL parts, or FALSE if the URL is
 *      too malformed to recognize any parts.
 */
function split_url( $url, $decode=FALSE)
{
        // Character sets from RFC3986.
        $xunressub     = 'a-zA-Z\d\-._~\!$&\'()*+,;=';
        $xpchar        = $xunressub . ':@% ';

        // Scheme from RFC3986.  The '-' is escaped so that "+-." is not read
        // as a character range (which would also admit ',').
        $xscheme        = '([a-zA-Z][a-zA-Z\d+\-.]*)';

        // User info (user + password) from RFC3986.
        $xuserinfo     = '((['  . $xunressub . '%]*)' .
                         '(:([' . $xunressub . ':%]*))?)';

        // IPv4 from RFC3986 (without digit constraints).
        $xipv4         = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})';

        // IPv6 from RFC2732 (without digit and grouping constraints).
        $xipv6         = '(\[([a-fA-F\d.:]+)\])';

        // Host name from RFC1035.  Technically, must start with a letter.
        // Relax that restriction to better parse URL structure, then
        // leave host name validation to application.
        $xhost_name    = '([a-zA-Z\d\-.%]+)';

        // Authority from RFC3986.  Skip IP future.
        $xhost         = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')';
        $xport         = '(\d*)';
        $xauthority    = '((' . $xuserinfo . '@)?' . $xhost .
                         '?(:' . $xport . ')?)';

        // Path from RFC3986.  Blend absolute & relative for efficiency.
        $xslash_seg    = '(/[' . $xpchar . ']*)';
        $xpath_authabs = '((//' . $xauthority . ')((/[' . $xpchar . ']*)*))';
        $xpath_rel     = '([' . $xpchar . ']+' . $xslash_seg . '*)';
        $xpath_abs     = '(/(' . $xpath_rel . ')?)';
        $xapath        = '(' . $xpath_authabs . '|' . $xpath_abs .
                         '|' . $xpath_rel . ')';

        // Query and fragment from RFC3986.
        $xqueryfrag    = '([' . $xpchar . '/?' . ']*)';

        // URL.
        $xurl          = '^(' . $xscheme . ':)?' .  $xapath . '?' .
                         '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?$';


        // Split the URL into components.
        if ( !preg_match( '!' . $xurl . '!', $url, $m ) )
                return FALSE;

        // preg_match() omits trailing groups that did not participate.  Pad
        // the result so every group index 0..30 exists, which lets the code
        // below compare against '' instead of using empty() (empty() treats
        // the legitimate value "0" as missing).
        $m += array_fill( 0, 31, '' );

        // Always return an array, even when no component was captured
        // (e.g. for an empty URL).
        $parts      = array( );
        $hostIsName = FALSE;    // only host names, never IP literals, are decoded

        if ( $m[2] !== '' )             $parts['scheme']  = strtolower( $m[2] );

        // Group 7 is "userinfo@"; when present the user (group 9) may be ''.
        if ( $m[7] !== '' )             $parts['user']    = $m[9];
        // Group 10 is ":password"; group 11 is the password itself.
        if ( $m[10] !== '' )            $parts['pass']    = $m[11];

        if ( $m[13] !== '' ) {                  // host name
                $parts['host'] = $m[13];
                $hostIsName    = TRUE;
        }
        else if ( $m[14] !== '' )       $parts['host']    = $m[14];     // IPv4
        else if ( $m[16] !== '' )       $parts['host']    = $m[16];     // IPv6, brackets stripped
        else if ( $m[5] !== '' )        $parts['host']    = '';         // "//" with empty authority
        if ( $m[17] !== '' )            $parts['port']    = $m[18];

        // Exactly one of the three path alternatives can have matched.
        if ( $m[19] !== '' )            $parts['path']    = $m[19];     // after an authority
        else if ( $m[21] !== '' )       $parts['path']    = $m[21];     // absolute path
        else if ( $m[25] !== '' )       $parts['path']    = $m[25];     // relative path

        if ( $m[27] !== '' )            $parts['query']   = $m[28];
        if ( $m[29] !== '' )            $parts['fragment']= $m[30];

        if ( !$decode )
                return $parts;

        // Percent-decode after parsing.  Scheme and port are never decoded.
        foreach ( array( 'user', 'pass', 'path', 'query', 'fragment' ) as $key )
        {
                if ( isset( $parts[$key] ) )
                        $parts[$key] = rawurldecode( $parts[$key] );
        }
        if ( $hostIsName )
                $parts['host'] = rawurldecode( $parts['host'] );
        return $parts;
}

/**
 * This function joins together URL components to form a complete URL.
 *
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * This function implements the specification's "component recomposition"
 * algorithm for combining URI components into a full URI string.
 *
 * The $parts argument is an associative array containing zero or
 * more of the following:
 *
 *      "scheme"        The scheme, such as "http".
 *      "host"          The host name, IPv4, or IPv6 address.
 *      "port"          The port number.
 *      "user"          The user name.
 *      "pass"          The user password.
 *      "path"          The path, such as a file path for "http".
 *      "query"         The query.
 *      "fragment"      The fragment.
 *
 * The "port", "user", and "pass" values are only used when a "host"
 * is present.
 *
 * The optional $encode argument indicates if appropriate URL components
 * should be percent-encoded as they are assembled into the URL.  Encoding
 * is only applied to the "user", "pass", "host" (if a host name, not an
 * IP address), "path", "query", and "fragment" components.  The "scheme"
 * and "port" are never encoded.  When a "scheme" and "host" are both
 * present, the "path" is presumed to be hierarchical and encoding
 * processes each segment of the hierarchy separately (i.e., the slashes
 * are left alone).
 *
 * The assembled URL string is returned.
 *
 * Parameters:
 *      parts           an associative array of strings containing the
 *                      individual parts of a URL.
 *
 *      encode          an optional boolean flag selecting whether
 *                      to do percent encoding or not.  Default = FALSE.
 *
 * Return values:
 *      Returns the assembled URL string.  The string is an absolute
 *      URL if a scheme is supplied, and a relative URL if not.  An
 *      empty string is returned if the $parts array does not contain
 *      any of the needed values.
 */
function join_url( $parts, $encode=FALSE)
{
        if ( $encode )
        {
                if ( isset( $parts['user'] ) )
                        $parts['user']     = rawurlencode( $parts['user'] );
                if ( isset( $parts['pass'] ) )
                        $parts['pass']     = rawurlencode( $parts['pass'] );
                // Leave IP-literal hosts (bracketed or bare) untouched.  The
                // whole alternation is anchored so that a host name which
                // merely ends in hex digits/dots is still encoded.
                if ( isset( $parts['host'] ) &&
                        !preg_match( '!^(\[[\da-f.:]+\]|[\da-f.:]+)$!ui', $parts['host'] ) )
                        $parts['host']     = rawurlencode( $parts['host'] );
                // Encode each path segment but keep the '/' separators.
                if ( isset( $parts['path'] ) && $parts['path'] !== '' )
                        $parts['path']     = preg_replace( '!%2F!ui', '/',
                                rawurlencode( $parts['path'] ) );
                if ( isset( $parts['query'] ) )
                        $parts['query']    = rawurlencode( $parts['query'] );
                if ( isset( $parts['fragment'] ) )
                        $parts['fragment'] = rawurlencode( $parts['fragment'] );
        }

        // Explicit test instead of empty(): the path "0" is a valid path.
        $hasPath = isset( $parts['path'] ) && $parts['path'] !== '';

        $url = '';
        if ( !empty( $parts['scheme'] ) )
                $url .= $parts['scheme'] . ':';
        if ( isset( $parts['host'] ) )
        {
                // Authority: [user[:pass]@]host[:port]
                $url .= '//';
                if ( isset( $parts['user'] ) )
                {
                        $url .= $parts['user'];
                        if ( isset( $parts['pass'] ) )
                                $url .= ':' . $parts['pass'];
                        $url .= '@';
                }
                if ( preg_match( '!^[\da-f]*:[\da-f.:]+$!ui', $parts['host'] ) )
                        $url .= '[' . $parts['host'] . ']';     // IPv6
                else
                        $url .= $parts['host'];                 // IPv4 or name
                if ( isset( $parts['port'] ) )
                        $url .= ':' . $parts['port'];
                // A path that follows an authority must begin with '/'.
                if ( $hasPath && $parts['path'][0] != '/' )
                        $url .= '/';
        }
        if ( $hasPath )
                $url .= $parts['path'];
        if ( isset( $parts['query'] ) )
                $url .= '?' . $parts['query'];
        if ( isset( $parts['fragment'] ) )
                $url .= '#' . $parts['fragment'];
        return $url;
}

/**
 * This function encodes URL to form a URL which is properly
 * percent encoded to replace disallowed characters.
 *
 * RFC3986 specifies the allowed characters in the URL as well as
 * reserved characters in the URL. This function replaces all the
 * disallowed characters in the URL with their respective percent
 * encodings. Already encoded characters are not encoded again,
 * such as '%20' is not encoded to '%2520'.
 *
 * Parameters:
 *      url             the url to encode.
 *
 * Return values:
 *      Returns the encoded URL string.
 */
function encode_url($url) {
  // Every RFC3986 reserved character (plus '%') mapped to the pattern that
  // recognises its percent-encoded form.  '%' comes last so that sequences
  // which were already encoded in the input ("%20" -> "%2520") are restored
  // only after all other characters have been handled.
  $restore = array(
    ':'  => '!%3A!ui',
    '/'  => '!%2F!ui',
    '?'  => '!%3F!ui',
    '#'  => '!%23!ui',
    '['  => '!%5B!ui',
    ']'  => '!%5D!ui',
    '@'  => '!%40!ui',
    '!'  => '!%21!ui',
    '$'  => '!%24!ui',
    '&'  => '!%26!ui',
    '\'' => '!%27!ui',
    '('  => '!%28!ui',
    ')'  => '!%29!ui',
    '*'  => '!%2A!ui',
    '+'  => '!%2B!ui',
    ','  => '!%2C!ui',
    ';'  => '!%3B!ui',
    '='  => '!%3D!ui',
    '%'  => '!%25!ui',
  );
  $patterns   = array_values($restore);
  $characters = array_keys($restore);

  // Encode everything first, then turn the reserved characters back into
  // their literal form.
  $encoded = rawurlencode($url);
  return preg_replace($patterns, $characters, $encoded);
}

/**
 * Extract URLs from a web page.
 *
 * URLs are extracted from a long list of tags and attributes as defined
 * by the HTML 2.0, HTML 3.2, HTML 4.01, and draft HTML 5.0 specifications.
 * URLs are also extracted from tags and attributes that are common
 * extensions of HTML, from the draft Forms 2.0 specification, from XHTML,
 * and from WML 1.3 and 2.0.
 *
 * The function returns an associative array of associative arrays of
 * arrays of URLs.  The outermost array's keys are the tag (element) name,
 * such as "a" for <a> or "img" for <img>.  The values for these entries
 * are associative arrays where the keys are attribute names for those
 * tags, such as "href" for <a href="...">.  Finally, the values for
 * those arrays are URLs found in those tags and attributes throughout
 * the text.
 *
 * Parameters:
 *      text            the UTF-8 text to scan
 *
 * Return values:
 *      an associative array where keys are tags and values are an
 *      associative array where keys are attributes and values are
 *      an array of URLs.
 *
 * See:
 *      http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_web_page
 */
function extract_html_urls( $text )
{
        // Element/attribute pairs whose value is a URL (or a list of URLs).
        $match_elements = array(
                // HTML
                array('element'=>'a',           'attribute'=>'href'),           // 2.0
                array('element'=>'a',           'attribute'=>'urn'),            // 2.0
                array('element'=>'base',        'attribute'=>'href'),           // 2.0
                array('element'=>'form',        'attribute'=>'action'),         // 2.0
                array('element'=>'img',         'attribute'=>'src'),            // 2.0
                array('element'=>'link',        'attribute'=>'href'),           // 2.0

                array('element'=>'applet',      'attribute'=>'code'),           // 3.2
                array('element'=>'applet',      'attribute'=>'codebase'),       // 3.2
                array('element'=>'area',        'attribute'=>'href'),           // 3.2
                array('element'=>'body',        'attribute'=>'background'),     // 3.2
                array('element'=>'img',         'attribute'=>'usemap'),         // 3.2
                array('element'=>'input',       'attribute'=>'src'),            // 3.2

                array('element'=>'applet',      'attribute'=>'archive'),        // 4.01
                array('element'=>'applet',      'attribute'=>'object'),         // 4.01
                array('element'=>'blockquote',  'attribute'=>'cite'),           // 4.01
                array('element'=>'del',         'attribute'=>'cite'),           // 4.01
                array('element'=>'frame',       'attribute'=>'longdesc'),       // 4.01
                array('element'=>'frame',       'attribute'=>'src'),            // 4.01
                array('element'=>'head',        'attribute'=>'profile'),        // 4.01
                array('element'=>'iframe',      'attribute'=>'longdesc'),       // 4.01
                array('element'=>'iframe',      'attribute'=>'src'),            // 4.01
                array('element'=>'img',         'attribute'=>'longdesc'),       // 4.01
                array('element'=>'input',       'attribute'=>'usemap'),         // 4.01
                array('element'=>'ins',         'attribute'=>'cite'),           // 4.01
                array('element'=>'object',      'attribute'=>'archive'),        // 4.01
                array('element'=>'object',      'attribute'=>'classid'),        // 4.01
                array('element'=>'object',      'attribute'=>'codebase'),       // 4.01
                array('element'=>'object',      'attribute'=>'data'),           // 4.01
                array('element'=>'object',      'attribute'=>'usemap'),         // 4.01
                array('element'=>'q',           'attribute'=>'cite'),           // 4.01
                array('element'=>'script',      'attribute'=>'src'),            // 4.01

                array('element'=>'audio',       'attribute'=>'src'),            // 5.0
                array('element'=>'command',     'attribute'=>'icon'),           // 5.0
                array('element'=>'embed',       'attribute'=>'src'),            // 5.0
                array('element'=>'event-source','attribute'=>'src'),            // 5.0
                array('element'=>'html',        'attribute'=>'manifest'),       // 5.0
                array('element'=>'source',      'attribute'=>'src'),            // 5.0
                array('element'=>'video',       'attribute'=>'src'),            // 5.0
                array('element'=>'video',       'attribute'=>'poster'),         // 5.0

                array('element'=>'bgsound',     'attribute'=>'src'),            // Extension
                array('element'=>'body',        'attribute'=>'credits'),        // Extension
                array('element'=>'body',        'attribute'=>'instructions'),   // Extension
                array('element'=>'body',        'attribute'=>'logo'),           // Extension
                array('element'=>'div',         'attribute'=>'href'),           // Extension
                array('element'=>'div',         'attribute'=>'src'),            // Extension
                array('element'=>'embed',       'attribute'=>'code'),           // Extension
                array('element'=>'embed',       'attribute'=>'pluginspage'),    // Extension
                array('element'=>'html',        'attribute'=>'background'),     // Extension
                array('element'=>'ilayer',      'attribute'=>'src'),            // Extension
                array('element'=>'img',         'attribute'=>'dynsrc'),         // Extension
                array('element'=>'img',         'attribute'=>'lowsrc'),         // Extension
                array('element'=>'input',       'attribute'=>'dynsrc'),         // Extension
                array('element'=>'input',       'attribute'=>'lowsrc'),         // Extension
                array('element'=>'table',       'attribute'=>'background'),     // Extension
                array('element'=>'td',          'attribute'=>'background'),     // Extension
                array('element'=>'th',          'attribute'=>'background'),     // Extension
                array('element'=>'layer',       'attribute'=>'src'),            // Extension
                array('element'=>'xml',         'attribute'=>'src'),            // Extension

                array('element'=>'button',      'attribute'=>'action'),         // Forms 2.0
                array('element'=>'datalist',    'attribute'=>'data'),           // Forms 2.0
                array('element'=>'form',        'attribute'=>'data'),           // Forms 2.0
                array('element'=>'input',       'attribute'=>'action'),         // Forms 2.0
                array('element'=>'select',      'attribute'=>'data'),           // Forms 2.0

                // XHTML
                array('element'=>'html',        'attribute'=>'xmlns'),

                // WML
                array('element'=>'access',      'attribute'=>'path'),           // 1.3
                array('element'=>'card',        'attribute'=>'onenterforward'), // 1.3
                array('element'=>'card',        'attribute'=>'onenterbackward'),// 1.3
                array('element'=>'card',        'attribute'=>'ontimer'),        // 1.3
                array('element'=>'go',          'attribute'=>'href'),           // 1.3
                array('element'=>'option',      'attribute'=>'onpick'),         // 1.3
                array('element'=>'template',    'attribute'=>'onenterforward'), // 1.3
                array('element'=>'template',    'attribute'=>'onenterbackward'),// 1.3
                array('element'=>'template',    'attribute'=>'ontimer'),        // 1.3
                array('element'=>'wml',         'attribute'=>'xmlns'),          // 2.0
        );

        // <meta http-equiv="..."> names whose content attribute holds a URL.
        $match_metas = array(
                'content-base',
                'content-location',
                'referer',
                'location',
                'refresh',
        );

        $urls = array( );

        // Extract all elements (the text between '<' and '>' of opening tags).
        if ( !preg_match_all( '/<([a-z][^>]*)>/iu', $text, $matches ) )
                return array( );
        $elements = $matches[1];

        // Attribute value: group 3 is a double-quoted value (quotes removed),
        // group 4 an unquoted value.
        $value_pattern = '=(("([^"]*)")|([^\s]*))';

        // Match elements and attributes
        foreach ( $match_elements as $match_element )
        {
                $name = $match_element['element'];
                $attr = $match_element['attribute'];
                $pattern = '/^' . $name . '\s.*' . $attr . $value_pattern . '/iu';

                // Only the "archive" attribute holds a list of URLs: space
                // separated on <object>, comma separated on <applet>.  Every
                // other attribute is a single URL and must not be split.
                if ( $name == 'object' && $attr == 'archive' )
                        $split_pattern = '/\s+/u';      // Space-separated URL list
                else if ( $name == 'applet' && $attr == 'archive' )
                        $split_pattern = '/,\s*/u';     // Comma-separated URL list
                else
                        $split_pattern = NULL;          // Single URL

                foreach ( $elements as $element )
                {
                        if ( !preg_match( $pattern, $element, $match ) )
                                continue;

                        // Prefer the quoted value; group 4 is absent from
                        // $match when the quoted alternative matched.
                        if ( isset( $match[3] ) && $match[3] !== '' )
                                $m = $match[3];
                        else
                                $m = isset( $match[4] ) ? $match[4] : '';

                        if ( $split_pattern === NULL )
                        {
                                $urls[$name][$attr][] = $m;
                                continue;
                        }
                        $msplit = preg_split( $split_pattern, $m, -1, PREG_SPLIT_NO_EMPTY );
                        if ( $msplit === FALSE )
                                continue;
                        foreach ( $msplit as $ms )
                                $urls[$name][$attr][] = $ms;
                }
        }

        // Match meta http-equiv elements
        $content_pattern = '/content'  . $value_pattern . '/iu';
        $refresh_pattern = '/\d*;\s*(url=)?(.*)$/iu';
        foreach ( $match_metas as $match_meta )
        {
                $attr_pattern = '/http-equiv="?' . $match_meta . '"?/iu';
                foreach ( $elements as $element )
                {
                        if ( !preg_match( '/^meta/iu', $element ) ||
                                !preg_match( $attr_pattern, $element ) ||
                                !preg_match( $content_pattern, $element, $match ) )
                                continue;
                        if ( isset( $match[3] ) && $match[3] !== '' )
                                $m = $match[3];
                        else
                                $m = isset( $match[4] ) ? $match[4] : '';
                        if ( $match_meta != 'refresh' )
                                $urls['meta']['http-equiv'][] = $m;
                        // "refresh" content looks like "5; url=http://..."
                        else if ( preg_match( $refresh_pattern, $m, $match ) )
                                $urls['meta']['http-equiv'][] = $match[2];
                }
        }

        // Match style attributes
        $urls['style'] = array( );
        $style_pattern = '/style' . $value_pattern . '/iu';
        foreach ( $elements as $element )
        {
                if ( !preg_match( $style_pattern, $element, $match ) )
                        continue;
                if ( isset( $match[3] ) && $match[3] !== '' )
                        $m = $match[3];
                else
                        $m = isset( $match[4] ) ? $match[4] : '';
                $style_urls = extract_css_urls( $m );
                if ( !empty( $style_urls ) )
                        $urls['style'] = array_merge_recursive(
                                $urls['style'], $style_urls );
        }

        // Match style bodies
        if ( preg_match_all( '/<style[^>]*>(.*?)<\/style>/siu', $text, $style_bodies ) )
        {
                foreach ( $style_bodies[1] as $style_body )
                {
                        $style_urls = extract_css_urls( $style_body );
                        if ( !empty( $style_urls ) )
                                $urls['style'] = array_merge_recursive(
                                        $urls['style'], $style_urls );
                }
        }

        // Do not report a "style" key when no CSS URL was found.
        if ( empty($urls['style']) )
                unset( $urls['style'] );

        return $urls;
}
/**
 * Extract URLs from UTF-8 CSS text.
 *
 * URLs within @import statements and url() property functions are extracted
 * and returned in an associative array of arrays.  Array keys indicate
 * the use context for the URL, including:
 *
 *      "import"
 *      "property"
 *
 * Each value in the associative array is an array of URLs.
 *
 * Parameters:
 *      text            the UTF-8 text to scan
 *
 * Return values:
 *      an associative array of arrays of URLs.
 *
 * See:
 *      http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_css_file
 */
function extract_css_urls( $text )
{
        $urls = array( );

        // A URL is a run of characters other than quote, comma, space,
        // parenthesis or backslash, optionally interleaved with
        // backslash-escaped characters.
        $url_pattern     = '(([^\\\\\'", \(\)]*(\\\\.)?)+)';
        $urlfunc_pattern = 'url\(\s*[\'"]?' . $url_pattern . '[\'"]?\s*\)';
        $pattern         = '/(' .
                 '(@import\s*[\'"]' . $url_pattern     . '[\'"])' .
                '|(@import\s*'      . $urlfunc_pattern . ')'      .
                '|('                . $urlfunc_pattern . ')'      .  ')/iu';
        if ( !preg_match_all( $pattern, $text, $matches ) )
                return $urls;

        // Capture group holding the URL for each syntax, and the context key
        // under which it is reported.  Processing order is preserved.
        $groups = array(
                3  => 'import',         // @import '...'  /  @import "..."
                7  => 'import',         // @import url(...) with optional quotes
                11 => 'property',       // url(...) with optional quotes
        );

        foreach ( $groups as $index => $context )
        {
                foreach ( $matches[$index] as $match )
                {
                        // Groups that did not take part in a match are ''.
                        // Compare explicitly: empty() would also discard the
                        // valid relative URL "0".
                        if ( $match === '' )
                                continue;
                        // Remove CSS backslash escapes ("\," => ",").
                        $urls[$context][] =
                                preg_replace( '/\\\\(.)/u', '\\1', $match );
                }
        }

        return $urls;
}