<?php
// Listing recovered from the "Proyectos de Subversion Moodle" repository
// viewer (Rev 1, author: efrain).

/**
 * Copyright (c) 2008, David R. Nadeau, NadeauSoftware.com.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *	* Redistributions of source code must retain the above copyright
 *	  notice, this list of conditions and the following disclaimer.
 *
 *	* Redistributions in binary form must reproduce the above
 *	  copyright notice, this list of conditions and the following
 *	  disclaimer in the documentation and/or other materials provided
 *	  with the distribution.
 *
 *	* Neither the names of David R. Nadeau or NadeauSoftware.com, nor
 *	  the names of its contributors may be used to endorse or promote
 *	  products derived from this software without specific prior
 *	  written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
 * OF SUCH DAMAGE.
 */

/*
 * This is a BSD License approved by the Open Source Initiative (OSI).
 * See:  http://www.opensource.org/licenses/bsd-license.php
 */

// Stop immediately unless the including script has defined MOODLE_INTERNAL.
defined('MOODLE_INTERNAL') || die();

/**
 * Combine a base URL and a relative URL to produce a new
 * absolute URL.  The base URL is often the URL of a page,
 * and the relative URL is a URL embedded on that page.
 *
 * This function implements the "absolutize" algorithm from
 * the RFC3986 specification for URLs.
 *
 * This function supports multi-byte characters with the UTF-8 encoding,
 * per the URL specification.
 *
 * Parameters:
 * 	baseUrl		the absolute base URL.
 *
 * 	relativeUrl	the relative URL to convert.
 *
 * Return values:
 * 	An absolute URL that combines parts of the base and relative
 * 	URLs, or FALSE if the base URL is not absolute (it needs both a
 * 	scheme and a host) or if either URL cannot be parsed.
 */
function url_to_absolute( $baseUrl, $relativeUrl )
{
	$r = split_url( $relativeUrl );
	if ( $r === FALSE )
		return FALSE;
	// Treat "nothing recognised" (e.g. an empty reference) as an empty
	// set of parts so the array accesses below are always well defined.
	if ( !is_array( $r ) )
		$r = array( );

	// A path of "0" is a real path, so test for the empty string
	// explicitly instead of relying on empty().
	$hasPath = isset( $r['path'] ) && $r['path'] !== '';

	// If relative URL has a scheme, clean path and return.
	if ( !empty( $r['scheme'] ) )
	{
		if ( $hasPath && $r['path'][0] == '/' )
			$r['path'] = url_remove_dot_segments( $r['path'] );
		return join_url( $r );
	}

	// Make sure the base URL is absolute.
	$b = split_url( $baseUrl );
	if ( !is_array( $b ) || empty( $b['scheme'] ) ||
		!isset( $b['host'] ) || $b['host'] === '' )
		return FALSE;
	$r['scheme'] = $b['scheme'];
	$basePath = isset( $b['path'] ) ? $b['path'] : '';

	// If relative URL has an authority, clean path and return.
	if ( isset( $r['host'] ) )
	{
		if ( $hasPath )
			$r['path'] = url_remove_dot_segments( $r['path'] );
		return join_url( $r );
	}

	// No authority in the relative URL: discard any stray authority
	// pieces and copy the base authority instead.
	unset( $r['port'], $r['user'], $r['pass'] );
	$r['host'] = $b['host'];
	foreach ( array( 'port', 'user', 'pass' ) as $key )
	{
		if ( isset( $b[$key] ) )
			$r[$key] = $b[$key];
	}

	// If relative URL has no path, use base path (and the base query
	// when the relative URL does not bring its own).
	if ( !$hasPath )
	{
		if ( $basePath !== '' )
			$r['path'] = $basePath;
		if ( !isset( $r['query'] ) && isset( $b['query'] ) )
			$r['query'] = $b['query'];
		return join_url( $r );
	}

	// If relative URL path doesn't start with /, merge with base path:
	// keep everything in the base path before its last "/".
	if ( $r['path'][0] != '/' )
	{
		$base = core_text::strrchr( $basePath, '/', TRUE );
		if ( $base === FALSE )
			$base = '';
		$r['path'] = $base . '/' . $r['path'];
	}
	$r['path'] = url_remove_dot_segments( $r['path'] );
	return join_url( $r );
}

/**
 * Filter out "." and ".." segments from a URL's path and return
 * the result.
 *
 * This function implements the "remove_dot_segments" algorithm from
 * the RFC3986 specification for URLs.  In addition, empty segments
 * (doubled slashes) are collapsed.
 *
 * The path is handled byte-wise.  This is safe for UTF-8 text because
 * "/" and "." are single-byte characters that never occur inside a
 * multi-byte sequence.
 *
 * Parameters:
 * 	path	the path to filter
 *
 * Return values:
 * 	The filtered path with "." and ".." removed.  A leading slash is
 * 	preserved.  A trailing slash is kept when the input ends with "/"
 * 	and is added when the input ends with a "." or ".." segment and
 * 	something is left of the path (so "/a/b/.." yields "/a/").
 * 	An empty path yields an empty string.
 */
function url_remove_dot_segments( $path )
{
	$path = (string) $path;
	if ( $path === '' )
		return '';

	$inSegs  = explode( '/', $path );
	$outSegs = array( );
	foreach ( $inSegs as $seg )
	{
		if ( $seg === '' || $seg === '.' )
			continue;
		if ( $seg === '..' )
			array_pop( $outSegs );	// step up; no-op when already at the top
		else
			$outSegs[] = $seg;
	}
	$outPath = implode( '/', $outSegs );

	if ( $path[0] === '/' )
		$outPath = '/' . $outPath;

	// Decide whether the result names a "directory".  The last input
	// segment is '' exactly when the path ends with "/".
	$lastSeg       = $inSegs[count( $inSegs ) - 1];
	$endsWithSlash = ( $lastSeg === '' );
	$endsWithDots  = ( $lastSeg === '.' || $lastSeg === '..' );
	if ( $outPath !== '/' &&
		( $endsWithSlash || ( $endsWithDots && $outPath !== '' ) ) )
		$outPath .= '/';

	return $outPath;
}

/**
 * This function parses an absolute or relative URL and splits it
 * into individual components.
 *
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * A portion of the ABNFs are repeated here:
 *
 *	URI-reference	= URI
 *			/ relative-ref
 *
 *	URI		= scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 *
 *	relative-ref	= relative-part [ "?" query ] [ "#" fragment ]
 *
 *	hier-part	= "//" authority path-abempty
 *			/ path-absolute
 *			/ path-rootless
 *			/ path-empty
 *
 *	relative-part	= "//" authority path-abempty
 *			/ path-absolute
 *			/ path-noscheme
 *			/ path-empty
 *
 *	authority	= [ userinfo "@" ] host [ ":" port ]
 *
 * So, a URL has the following major components:
 *
 *	scheme
 *		The name of a method used to interpret the rest of
 *		the URL.  Examples:  "http", "https", "mailto", "file".
 *
 *	authority
 *		The name of the authority governing the URL's name
 *		space.  Examples:  "example.com", "user@example.com",
 *		"example.com:80", "user:password@example.com:80".
 *
 *		The authority may include a host name, port number,
 *		user name, and password.
 *
 *		The host may be a name, an IPv4 numeric address, or
 *		an IPv6 numeric address.
 *
 *	path
 *		The hierarchical path to the URL's resource.
 *		Examples:  "/index.htm", "/scripts/page.php".
 *
 *	query
 *		The data for a query.  Examples:  "?search=google.com".
 *
 *	fragment
 *		The name of a secondary resource relative to that named
 *		by the path.  Examples:  "#section1", "#header".
 *
 * An "absolute" URL must include a scheme and path.  The authority, query,
 * and fragment components are optional.
 *
 * A "relative" URL does not include a scheme and must include a path.  The
 * authority, query, and fragment components are optional.
 *
 * This function splits the $url argument into the following components
 * and returns them in an associative array.  Keys to that array include:
 *
 *	"scheme"	The scheme, such as "http" (always lower-cased).
 *	"host"		The host name, IPv4, or IPv6 address (an IPv6
 *			address is returned without its brackets).
 *	"port"		The port number.
 *	"user"		The user name.
 *	"pass"		The user password.
 *	"path"		The path, such as a file path for "http".
 *	"query"		The query.
 *	"fragment"	The fragment.
 *
 * One or more of these may not be present, depending upon the URL.
 * An empty URL yields an empty array.
 *
 * Optionally, the "user", "pass", "host" (only when it matched as a
 * host name), "path", "query", and "fragment" may have percent-encoded
 * characters decoded.  The "scheme" and "port" cannot include
 * percent-encoded characters and are never decoded.  Decoding occurs
 * after the URL has been parsed.
 *
 * Parameters:
 * 	url		the URL to parse.
 *
 * 	decode		an optional boolean flag selecting whether
 * 			to decode percent encoding or not.  Default = FALSE.
 *
 * Return values:
 * 	the associative array of URL parts, or FALSE if the URL is
 * 	too malformed to recognize any parts.
 */
function split_url( $url, $decode=FALSE)
{
	// Character sets from RFC3986.
	$xunressub     = 'a-zA-Z\d\-._~\!$&\'()*+,;=';
	$xpchar        = $xunressub . ':@% ';

	// Scheme from RFC3986.  The hyphen is escaped so that "+", "-" and
	// "." are three literals rather than the range "+" .. "." (which
	// would also admit ",").
	$xscheme        = '([a-zA-Z][a-zA-Z\d+.\-]*)';

	// User info (user + password) from RFC3986.
	$xuserinfo     = '((['  . $xunressub . '%]*)' .
	                 '(:([' . $xunressub . ':%]*))?)';

	// IPv4 from RFC3986 (without digit constraints).
	$xipv4         = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})';

	// IPv6 from RFC2732 (without digit and grouping constraints).
	$xipv6         = '(\[([a-fA-F\d.:]+)\])';

	// Host name from RFC1035.  Technically, must start with a letter.
	// Relax that restriction to better parse URL structure, then
	// leave host name validation to application.
	$xhost_name    = '([a-zA-Z\d\-.%]+)';

	// Authority from RFC3986.  Skip IP future.
	$xhost         = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')';
	$xport         = '(\d*)';
	$xauthority    = '((' . $xuserinfo . '@)?' . $xhost .
		         '?(:' . $xport . ')?)';

	// Path from RFC3986.  Blend absolute & relative for efficiency.
	$xslash_seg    = '(/[' . $xpchar . ']*)';
	$xpath_authabs = '((//' . $xauthority . ')((/[' . $xpchar . ']*)*))';
	$xpath_rel     = '([' . $xpchar . ']+' . $xslash_seg . '*)';
	$xpath_abs     = '(/(' . $xpath_rel . ')?)';
	$xapath        = '(' . $xpath_authabs . '|' . $xpath_abs .
			 '|' . $xpath_rel . ')';

	// Query and fragment from RFC3986.
	$xqueryfrag    = '([' . $xpchar . '/?' . ']*)';

	// URL.
	$xurl          = '^(' . $xscheme . ':)?' .  $xapath . '?' .
	                 '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?$';

	// Split the URL into components.
	if ( !preg_match( '!' . $xurl . '!', $url, $m ) )
		return FALSE;

	// TRUE when capture group $i took part in the match with a non-empty
	// value.  Compared against '' (not empty()) so that "0" counts.
	$got = function ( $i ) use ( $m )
	{
		return isset( $m[$i] ) && $m[$i] !== '';
	};

	// Capture group map: 2 scheme, 7 "userinfo@", 9 user, 10 ":pass",
	// 11 pass, 13 host name, 14 IPv4, 16 IPv6 (no brackets),
	// 5 "//authority", 17 ":port", 18 port, 19 path after authority,
	// 21 absolute path, 25 relative path, 27/28 query, 29/30 fragment.
	$parts      = array( );
	$hostIsName = FALSE;

	if ( $got( 2 ) )
		$parts['scheme'] = strtolower( $m[2] );

	if ( $got( 7 ) )
		$parts['user'] = isset( $m[9] ) ? $m[9] : '';
	if ( $got( 10 ) )
		$parts['pass'] = $m[11];

	if ( $got( 13 ) )
	{
		$parts['host'] = $m[13];
		$hostIsName    = TRUE;
	}
	else if ( $got( 14 ) )
		$parts['host'] = $m[14];
	else if ( $got( 16 ) )
		$parts['host'] = $m[16];
	else if ( $got( 5 ) )
		$parts['host'] = '';	// "//" present but no host, e.g. file:///
	if ( $got( 17 ) )
		$parts['port'] = $m[18];

	if ( $got( 19 ) )
		$parts['path'] = $m[19];
	else if ( $got( 21 ) )
		$parts['path'] = $m[21];
	else if ( $got( 25 ) )
		$parts['path'] = $m[25];

	if ( $got( 27 ) )
		$parts['query'] = $m[28];
	if ( $got( 29 ) )
		$parts['fragment'] = $m[30];

	if ( !$decode )
		return $parts;

	// Percent-decode the textual components.  The host is decoded only
	// when it matched as a host name (not as an IP literal).
	foreach ( array( 'user', 'pass', 'path', 'query', 'fragment' ) as $key )
	{
		if ( isset( $parts[$key] ) )
			$parts[$key] = rawurldecode( $parts[$key] );
	}
	if ( $hostIsName )
		$parts['host'] = rawurldecode( $parts['host'] );
	return $parts;
}

/**
 * This function joins together URL components to form a complete URL.
 *
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * This function implements the specification's "component recomposition"
 * algorithm for combining URI components into a full URI string.
 *
 * The $parts argument is an associative array containing zero or
 * more of the following:
 *
 *	"scheme"	The scheme, such as "http".
 *	"host"		The host name, IPv4, or IPv6 address.
 *	"port"		The port number.
 *	"user"		The user name.
 *	"pass"		The user password.
 *	"path"		The path, such as a file path for "http".
 *	"query"		The query.
 *	"fragment"	The fragment.
 *
 * The "port", "user", and "pass" values are only used when a "host"
 * is present.  The "pass" value is only used when a "user" is present.
 *
 * The optional $encode argument indicates if appropriate URL components
 * should be percent-encoded as they are assembled into the URL.  Encoding
 * is only applied to the "user", "pass", "host" (if a host name, not an
 * IP address), "path", "query", and "fragment" components.  The "scheme"
 * and "port" are never encoded.  Slashes in the "path" are left alone
 * so that each segment of the hierarchy is encoded separately.
 *
 * The assembled URL string is returned.
 *
 * Parameters:
 * 	parts		an associative array of strings containing the
 * 			individual parts of a URL.
 *
 * 	encode		an optional boolean flag selecting whether
 * 			to do percent encoding or not.  Default = FALSE.
 *
 * Return values:
 * 	Returns the assembled URL string.  The string is an absolute
 * 	URL if a scheme is supplied, and a relative URL if not.  An
 * 	empty string is returned if the $parts array does not contain
 * 	any of the needed values.
 */
function join_url( $parts, $encode=FALSE)
{
	if ( $encode )
	{
		foreach ( array( 'user', 'pass' ) as $key )
		{
			if ( isset( $parts[$key] ) )
				$parts[$key] = rawurlencode( $parts[$key] );
		}

		// Leave IP literals alone: either a bracketed IPv6 address or
		// a bare run of hex digits, dots and colons.  The whole host
		// must match, so ordinary names that merely end in hex
		// characters are still encoded.
		if ( isset( $parts['host'] ) &&
			!preg_match( '!^(\[[\da-f.:]+\]|[\da-f.:]+)$!ui', $parts['host'] ) )
			$parts['host'] = rawurlencode( $parts['host'] );

		// Encode the path, then restore the slashes between segments.
		if ( !empty( $parts['path'] ) )
			$parts['path'] = preg_replace( '!%2F!ui', '/',
				rawurlencode( $parts['path'] ) );

		foreach ( array( 'query', 'fragment' ) as $key )
		{
			if ( isset( $parts[$key] ) )
				$parts[$key] = rawurlencode( $parts[$key] );
		}
	}

	$url = '';
	if ( !empty( $parts['scheme'] ) )
		$url .= $parts['scheme'] . ':';

	if ( isset( $parts['host'] ) )
	{
		$url .= '//';
		if ( isset( $parts['user'] ) )
		{
			$url .= $parts['user'];
			if ( isset( $parts['pass'] ) )
				$url .= ':' . $parts['pass'];
			$url .= '@';
		}

		// An unbracketed IPv6 address must be wrapped in brackets.
		if ( preg_match( '!^[\da-f]*:[\da-f.:]+$!ui', $parts['host'] ) )
			$url .= '[' . $parts['host'] . ']';	// IPv6
		else
			$url .= $parts['host'];			// IPv4 or name

		if ( isset( $parts['port'] ) )
			$url .= ':' . $parts['port'];

		// After an authority the path must begin with a slash.
		if ( !empty( $parts['path'] ) && $parts['path'][0] != '/' )
			$url .= '/';
	}

	if ( !empty( $parts['path'] ) )
		$url .= $parts['path'];
	if ( isset( $parts['query'] ) )
		$url .= '?' . $parts['query'];
	if ( isset( $parts['fragment'] ) )
		$url .= '#' . $parts['fragment'];
	return $url;
}

/**
 * This function encodes URL to form a URL which is properly
 * percent encoded to replace disallowed characters.
 *
 * RFC3986 specifies the allowed characters in the URL as well as
 * reserved characters in the URL. This function replaces all the
 * disallowed characters in the URL with their respective percent
 * encodings. Already encoded characters are not encoded again,
 * such as '%20' is not encoded to '%2520'.
 *
 * Parameters:
 * 	url		the url to encode.
 *
 * Return values:
 * 	Returns the encoded URL string.
 */
function encode_url($url) {
  // Reserved character => pattern matching its percent-encoded form.
  // The order matters: "%" must be restored LAST, otherwise an input
  // such as "%3A" (encoded to "%253A") would first become "%3A" and
  // then be wrongly turned into ":".
  $restore = array(
    ':' => '!%3A!ui',
    '/' => '!%2F!ui',
    '?' => '!%3F!ui',
    '#' => '!%23!ui',
    '[' => '!%5B!ui',
    ']' => '!%5D!ui',
    '@' => '!%40!ui',
    '!' => '!%21!ui',
    '$' => '!%24!ui',
    '&' => '!%26!ui',
    "'" => '!%27!ui',
    '(' => '!%28!ui',
    ')' => '!%29!ui',
    '*' => '!%2A!ui',
    '+' => '!%2B!ui',
    ',' => '!%2C!ui',
    ';' => '!%3B!ui',
    '=' => '!%3D!ui',
    '%' => '!%25!ui',
  );

  // Encode everything, then put the reserved characters back one by one.
  $encoded = rawurlencode($url);
  foreach ($restore as $char => $pattern) {
    $encoded = preg_replace($pattern, $char, $encoded);
  }
  return $encoded;
}

/**
 * Extract URLs from a web page.
 *
 * URLs are extracted from a long list of tags and attributes as defined
 * by the HTML 2.0, HTML 3.2, HTML 4.01, and draft HTML 5.0 specifications.
 * URLs are also extracted from tags and attributes that are common
 * extensions of HTML, from the draft Forms 2.0 specification, from XHTML,
 * and from WML 1.3 and 2.0.
 *
 * The function returns an associative array of associative arrays of
 * arrays of URLs.  The outermost array's keys are the tag (element) name,
 * such as "a" for <a> or "img" for <img>.  The values for these entries
 * are associative arrays where the keys are attribute names for those
 * tags, such as "href" for <a href="...">.  Finally, the values for
 * those arrays are URLs found in those tags and attributes throughout
 * the text.
 *
 * Besides plain elements, URLs are collected from <meta http-equiv>
 * elements (under "meta" / "http-equiv") and from CSS found in style
 * attributes and <style> bodies (under "style", using the keys returned
 * by extract_css_urls()).
 *
 * The "archive" attribute holds a list of URLs: space-separated on
 * <object>, comma-separated on <applet>.  Each list entry is returned
 * as its own URL.  Every other attribute is treated as a single URL.
 *
 * Parameters:
 * 	text		the UTF-8 text to scan
 *
 * Return values:
 * 	an associative array where keys are tags and values are an
 * 	associative array where keys are attributes and values are
 * 	an array of URLs.  An empty array is returned when no element
 * 	is found.
 *
 * See:
 * 	http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_web_page
 */
function extract_html_urls( $text )
{
	$match_elements = array(
		// HTML
		array('element'=>'a',		'attribute'=>'href'),		// 2.0
		array('element'=>'a',		'attribute'=>'urn'),		// 2.0
		array('element'=>'base',	'attribute'=>'href'),		// 2.0
		array('element'=>'form',	'attribute'=>'action'),		// 2.0
		array('element'=>'img',		'attribute'=>'src'),		// 2.0
		array('element'=>'link',	'attribute'=>'href'),		// 2.0

		array('element'=>'applet',	'attribute'=>'code'),		// 3.2
		array('element'=>'applet',	'attribute'=>'codebase'),	// 3.2
		array('element'=>'area',	'attribute'=>'href'),		// 3.2
		array('element'=>'body',	'attribute'=>'background'),	// 3.2
		array('element'=>'img',		'attribute'=>'usemap'),		// 3.2
		array('element'=>'input',	'attribute'=>'src'),		// 3.2

		array('element'=>'applet',	'attribute'=>'archive'),	// 4.01
		array('element'=>'applet',	'attribute'=>'object'),		// 4.01
		array('element'=>'blockquote',	'attribute'=>'cite'),		// 4.01
		array('element'=>'del',		'attribute'=>'cite'),		// 4.01
		array('element'=>'frame',	'attribute'=>'longdesc'),	// 4.01
		array('element'=>'frame',	'attribute'=>'src'),		// 4.01
		array('element'=>'head',	'attribute'=>'profile'),	// 4.01
		array('element'=>'iframe',	'attribute'=>'longdesc'),	// 4.01
		array('element'=>'iframe',	'attribute'=>'src'),		// 4.01
		array('element'=>'img',		'attribute'=>'longdesc'),	// 4.01
		array('element'=>'input',	'attribute'=>'usemap'),		// 4.01
		array('element'=>'ins',		'attribute'=>'cite'),		// 4.01
		array('element'=>'object',	'attribute'=>'archive'),	// 4.01
		array('element'=>'object',	'attribute'=>'classid'),	// 4.01
		array('element'=>'object',	'attribute'=>'codebase'),	// 4.01
		array('element'=>'object',	'attribute'=>'data'),		// 4.01
		array('element'=>'object',	'attribute'=>'usemap'),		// 4.01
		array('element'=>'q',		'attribute'=>'cite'),		// 4.01
		array('element'=>'script',	'attribute'=>'src'),		// 4.01

		array('element'=>'audio',	'attribute'=>'src'),		// 5.0
		array('element'=>'command',	'attribute'=>'icon'),		// 5.0
		array('element'=>'embed',	'attribute'=>'src'),		// 5.0
		array('element'=>'event-source','attribute'=>'src'),		// 5.0
		array('element'=>'html',	'attribute'=>'manifest'),	// 5.0
		array('element'=>'source',	'attribute'=>'src'),		// 5.0
		array('element'=>'video',	'attribute'=>'src'),		// 5.0
		array('element'=>'video',	'attribute'=>'poster'),		// 5.0

		array('element'=>'bgsound',	'attribute'=>'src'),		// Extension
		array('element'=>'body',	'attribute'=>'credits'),	// Extension
		array('element'=>'body',	'attribute'=>'instructions'),	// Extension
		array('element'=>'body',	'attribute'=>'logo'),		// Extension
		array('element'=>'div',		'attribute'=>'href'),		// Extension
		array('element'=>'div',		'attribute'=>'src'),		// Extension
		array('element'=>'embed',	'attribute'=>'code'),		// Extension
		array('element'=>'embed',	'attribute'=>'pluginspage'),	// Extension
		array('element'=>'html',	'attribute'=>'background'),	// Extension
		array('element'=>'ilayer',	'attribute'=>'src'),		// Extension
		array('element'=>'img',		'attribute'=>'dynsrc'),		// Extension
		array('element'=>'img',		'attribute'=>'lowsrc'),		// Extension
		array('element'=>'input',	'attribute'=>'dynsrc'),		// Extension
		array('element'=>'input',	'attribute'=>'lowsrc'),		// Extension
		array('element'=>'table',	'attribute'=>'background'),	// Extension
		array('element'=>'td',		'attribute'=>'background'),	// Extension
		array('element'=>'th',		'attribute'=>'background'),	// Extension
		array('element'=>'layer',	'attribute'=>'src'),		// Extension
		array('element'=>'xml',		'attribute'=>'src'),		// Extension

		array('element'=>'button',	'attribute'=>'action'),		// Forms 2.0
		array('element'=>'datalist',	'attribute'=>'data'),		// Forms 2.0
		array('element'=>'form',	'attribute'=>'data'),		// Forms 2.0
		array('element'=>'input',	'attribute'=>'action'),		// Forms 2.0
		array('element'=>'select',	'attribute'=>'data'),		// Forms 2.0

		// XHTML
		array('element'=>'html',	'attribute'=>'xmlns'),

		// WML
		array('element'=>'access',	'attribute'=>'path'),		// 1.3
		array('element'=>'card',	'attribute'=>'onenterforward'),	// 1.3
		array('element'=>'card',	'attribute'=>'onenterbackward'),// 1.3
		array('element'=>'card',	'attribute'=>'ontimer'),	// 1.3
		array('element'=>'go',		'attribute'=>'href'),		// 1.3
		array('element'=>'option',	'attribute'=>'onpick'),		// 1.3
		array('element'=>'template',	'attribute'=>'onenterforward'),	// 1.3
		array('element'=>'template',	'attribute'=>'onenterbackward'),// 1.3
		array('element'=>'template',	'attribute'=>'ontimer'),	// 1.3
		array('element'=>'wml',		'attribute'=>'xmlns'),		// 2.0
	);

	$match_metas = array(
		'content-base',
		'content-location',
		'referer',
		'location',
		'refresh',
	);

	// Extract all elements (opening tags only: "<" followed by a letter).
	if ( !preg_match_all( '/<([a-z][^>]*)>/iu', $text, $matches ) )
		return array( );
	$elements = $matches[1];
	$urls     = array( );

	// Attribute value: group 3 is a double-quoted value (without the
	// quotes), group 4 is an unquoted value.
	$value_pattern = '=(("([^"]*)")|([^\s]*))';

	// Pick the captured attribute value out of a $value_pattern match.
	// Group 4 is absent from the match array when the quoted form
	// matched, so it must be guarded.
	$attr_value = function ( $match )
	{
		if ( isset( $match[3] ) && $match[3] !== '' )
			return $match[3];
		return isset( $match[4] ) ? $match[4] : '';
	};

	// Match elements and attributes
	foreach ( $match_elements as $match_element )
	{
		$name = $match_element['element'];
		$attr = $match_element['attribute'];
		$pattern = '/^' . $name . '\s.*' . $attr . $value_pattern . '/iu';

		// Only "archive" carries a URL list, and its separator depends
		// on the element.
		if ( $attr == 'archive' && $name == 'object' )
			$split_pattern = '/\s+/u';	// Space-separated URL list
		else if ( $attr == 'archive' && $name == 'applet' )
			$split_pattern = '/,\s*/u';	// Comma-separated URL list
		else
			$split_pattern = NULL;		// Single URL

		foreach ( $elements as $element )
		{
			if ( !preg_match( $pattern, $element, $match ) )
				continue;
			$m = $attr_value( $match );
			if ( $split_pattern === NULL )
			{
				$urls[$name][$attr][] = $m;
				continue;
			}
			$msplit = preg_split( $split_pattern, $m, -1, PREG_SPLIT_NO_EMPTY );
			foreach ( $msplit as $ms )
				$urls[$name][$attr][] = $ms;
		}
	}

	// Match meta http-equiv elements
	$content_pattern = '/content'  . $value_pattern . '/iu';
	$refresh_pattern = '/\d*;\s*(url=)?(.*)$/iu';
	foreach ( $match_metas as $match_meta )
	{
		$attr_pattern = '/http-equiv="?' . $match_meta . '"?/iu';
		foreach ( $elements as $element )
		{
			if ( !preg_match( '/^meta/iu', $element ) ||
				!preg_match( $attr_pattern, $element ) ||
				!preg_match( $content_pattern, $element, $match ) )
				continue;
			$m = $attr_value( $match );
			if ( $match_meta != 'refresh' )
				$urls['meta']['http-equiv'][] = $m;
			// "refresh" content looks like "5; url=...": keep the URL only.
			else if ( preg_match( $refresh_pattern, $m, $match ) )
				$urls['meta']['http-equiv'][] = $match[2];
		}
	}

	// Match style attributes
	$urls['style'] = array( );
	$style_pattern = '/style' . $value_pattern . '/iu';
	foreach ( $elements as $element )
	{
		if ( !preg_match( $style_pattern, $element, $match ) )
			continue;
		$style_urls = extract_css_urls( $attr_value( $match ) );
		if ( !empty( $style_urls ) )
			$urls['style'] = array_merge_recursive(
				$urls['style'], $style_urls );
	}

	// Match style bodies
	if ( preg_match_all( '/<style[^>]*>(.*?)<\/style>/siu', $text, $style_bodies ) )
	{
		foreach ( $style_bodies[1] as $style_body )
		{
			$style_urls = extract_css_urls( $style_body );
			if ( !empty( $style_urls ) )
				$urls['style'] = array_merge_recursive(
					$urls['style'], $style_urls );
		}
	}
	if ( empty($urls['style']) )
		unset( $urls['style'] );

	return $urls;
}

/**
 * Extract URLs from UTF-8 CSS text.
 *
 * URLs within @import statements and url() property functions are extracted
 * and returned in an associative array of arrays.  Array keys indicate
 * the use context for the URL, including:
 *
 * 	"import"
 * 	"property"
 *
 * Each value in the associative array is an array of URLs.  Backslash
 * escapes inside a URL are resolved (the backslash is dropped and the
 * escaped character kept).  A key is only present when at least one
 * URL was found for it.
 *
 * Parameters:
 * 	text		the UTF-8 text to scan
 *
 * Return values:
 * 	an associative array of arrays of URLs.
 *
 * See:
 * 	http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_css_file
 */
function extract_css_urls( $text )
{
	$urls = array( );

	$url_pattern     = '(([^\\\\\'", \(\)]*(\\\\.)?)+)';
	$urlfunc_pattern = 'url\(\s*[\'"]?' . $url_pattern . '[\'"]?\s*\)';
	$pattern         = '/(' .
		 '(@import\s*[\'"]' . $url_pattern     . '[\'"])' .
		'|(@import\s*'      . $urlfunc_pattern . ')'      .
		'|('                . $urlfunc_pattern . ')'      .  ')/iu';
	if ( !preg_match_all( $pattern, $text, $matches ) )
		return $urls;

	// Capture group holding the URL for each syntax, and the context
	// key it is reported under.  Groups are processed in this order:
	//	 3	@import '...'  /  @import "..."
	//	 7	@import url(...)  (optionally quoted)
	//	11	url(...)  (optionally quoted)
	$groups = array(
		3  => 'import',
		7  => 'import',
		11 => 'property',
	);
	foreach ( $groups as $group => $context )
	{
		foreach ( $matches[$group] as $match )
		{
			// Groups of the alternatives that did not match are
			// empty strings; skip those (but keep a URL of "0").
			if ( $match === '' )
				continue;
			$urls[$context][] = preg_replace( '/\\\\(.)/u', '\\1', $match );
		}
	}

	return $urls;
}