1 |
efrain |
1 |
<?php
|
|
|
2 |
|
|
|
3 |
/**
|
|
|
4 |
* Copyright (c) 2008, David R. Nadeau, NadeauSoftware.com.
|
|
|
5 |
* All rights reserved.
|
|
|
6 |
*
|
|
|
7 |
* Redistribution and use in source and binary forms, with or without
|
|
|
8 |
* modification, are permitted provided that the following conditions
|
|
|
9 |
* are met:
|
|
|
10 |
*
|
|
|
11 |
* * Redistributions of source code must retain the above copyright
|
|
|
12 |
* notice, this list of conditions and the following disclaimer.
|
|
|
13 |
*
|
|
|
14 |
* * Redistributions in binary form must reproduce the above
|
|
|
15 |
* copyright notice, this list of conditions and the following
|
|
|
16 |
* disclaimer in the documentation and/or other materials provided
|
|
|
17 |
* with the distribution.
|
|
|
18 |
*
|
|
|
19 |
* * Neither the names of David R. Nadeau or NadeauSoftware.com, nor
|
|
|
20 |
* the names of its contributors may be used to endorse or promote
|
|
|
21 |
* products derived from this software without specific prior
|
|
|
22 |
* written permission.
|
|
|
23 |
*
|
|
|
24 |
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
|
25 |
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
|
26 |
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
|
|
27 |
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
|
|
28 |
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
|
29 |
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
|
30 |
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
|
31 |
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
|
32 |
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
33 |
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
|
|
|
34 |
* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
|
|
|
35 |
* OF SUCH DAMAGE.
|
|
|
36 |
*/
|
|
|
37 |
|
|
|
38 |
/*
|
|
|
39 |
* This is a BSD License approved by the Open Source Initiative (OSI).
|
|
|
40 |
* See: http://www.opensource.org/licenses/bsd-license.php
|
|
|
41 |
*/
|
|
|
42 |
|
|
|
43 |
// Stop execution immediately unless the MOODLE_INTERNAL constant has already
// been defined by whatever script included this file.
defined('MOODLE_INTERNAL') || die();
|
|
|
44 |
|
|
|
45 |
/**
 * Resolve a (possibly relative) URL against an absolute base URL.
 *
 * The base URL is typically the address of a page and the relative URL
 * a link found on that page.  The steps follow the reference resolution
 * ("absolutize") procedure of RFC3986:
 *
 *   1. A relative URL that already has a scheme is returned on its own
 *      (dot segments are removed when its path starts with "/").
 *   2. Otherwise the scheme is taken from the base URL.
 *   3. A relative URL with its own authority ("//host/...") keeps that
 *      authority and path.
 *   4. Otherwise the authority is copied from the base URL; an empty
 *      relative path inherits the base path (and the base query when the
 *      relative URL has none); a path not starting with "/" is merged
 *      with the directory part of the base path.
 *
 * Parameters:
 *   baseUrl      the absolute base URL; it must have a scheme and a host.
 *
 *   relativeUrl  the URL to convert.
 *
 * Return values:
 *   The combined absolute URL as a string, or FALSE if the base URL is
 *   not absolute or if either URL cannot be parsed by split_url().
 */
function url_to_absolute( $baseUrl, $relativeUrl )
{
    $rel = split_url( $relativeUrl );
    if ( $rel === FALSE ) {
        return FALSE;
    }

    // Step 1: the "relative" URL is already absolute.
    if ( !empty( $rel['scheme'] ) ) {
        $startsAtRoot = !empty( $rel['path'] ) && $rel['path'][0] == '/';
        if ( $startsAtRoot ) {
            $rel['path'] = url_remove_dot_segments( $rel['path'] );
        }
        return join_url( $rel );
    }

    // The base must be usable as an absolute URL from here on.
    $base = split_url( $baseUrl );
    $baseIsAbsolute = $base !== FALSE
        && !empty( $base['scheme'] )
        && !empty( $base['host'] );
    if ( !$baseIsAbsolute ) {
        return FALSE;
    }

    // Step 2: inherit the scheme.
    $rel['scheme'] = $base['scheme'];

    // A missing (or otherwise empty) base path is treated as ''.
    $basePath = empty( $base['path'] ) ? '' : $base['path'];

    // Step 3: network-path reference - keep the relative authority.
    if ( isset( $rel['host'] ) ) {
        if ( !empty( $rel['path'] ) ) {
            $rel['path'] = url_remove_dot_segments( $rel['path'] );
        }
        return join_url( $rel );
    }

    // Step 4: replace any authority leftovers with the base authority.
    foreach ( array( 'port', 'user', 'pass' ) as $key ) {
        unset( $rel[$key] );
    }
    $rel['host'] = $base['host'];
    foreach ( array( 'port', 'user', 'pass' ) as $key ) {
        if ( isset( $base[$key] ) ) {
            $rel[$key] = $base[$key];
        }
    }

    // No relative path: reuse the base path, and the base query when the
    // relative URL did not bring its own.
    if ( empty( $rel['path'] ) ) {
        if ( $basePath !== '' ) {
            $rel['path'] = $basePath;
        }
        if ( !isset( $rel['query'] ) && isset( $base['query'] ) ) {
            $rel['query'] = $base['query'];
        }
        return join_url( $rel );
    }

    // Path not anchored at the root: prepend everything of the base path
    // that precedes its last "/".
    if ( $rel['path'][0] != '/' ) {
        $directory = core_text::strrchr( $basePath, '/', TRUE );
        if ( $directory === FALSE ) {
            $directory = '';
        }
        $rel['path'] = $directory . '/' . $rel['path'];
    }

    $rel['path'] = url_remove_dot_segments( $rel['path'] );
    return join_url( $rel );
}
|
|
|
126 |
|
|
|
127 |
/**
 * Filter out "." and ".." segments from a URL's path and return
 * the result.
 *
 * This function follows the "remove_dot_segments" algorithm from
 * the RFC3986 specification for URLs, with one deliberate difference:
 * empty segments (runs of "/") are collapsed as well.
 *
 * A path whose last segment is ".", ".." or empty (i.e. the path ends
 * in "/") refers to a directory, so the result keeps a trailing "/":
 * "/a/b/.." becomes "/a/" and "/a/b/." becomes "/a/b/".
 *
 * The path is split on the ASCII "/" byte.  That byte never occurs
 * inside a multi-byte UTF-8 sequence, so UTF-8 paths are handled
 * correctly without any multi-byte string functions.
 *
 * Parameters:
 *   path   the path to filter
 *
 * Return values:
 *   The filtered path with "." and ".." removed.  An empty path is
 *   returned unchanged.
 */
function url_remove_dot_segments( $path )
{
    $path = (string) $path;
    if ( $path === '' )
        return '';

    $inSegs  = explode( '/', $path );
    $outSegs = array( );
    foreach ( $inSegs as $seg )
    {
        if ( $seg === '' || $seg === '.' )
            continue;
        if ( $seg === '..' )
            array_pop( $outSegs );      // ".." above the root is ignored
        else
            $outSegs[] = $seg;
    }
    $outPath = implode( '/', $outSegs );

    // Keep the path anchored at the root if it was before.
    if ( $path[0] === '/' )
        $outPath = '/' . $outPath;

    // Restore the trailing slash when the input named a directory.  The
    // result must already contain something other than the root slash,
    // otherwise "/" would turn into "//" or a relative path into "/".
    $lastSeg     = end( $inSegs );
    $isDirectory = ( $lastSeg === '' || $lastSeg === '.' || $lastSeg === '..' );
    if ( $isDirectory && $outPath !== '' && $outPath !== '/' )
        $outPath .= '/';

    return $outPath;
}
|
|
|
169 |
|
|
|
170 |
/**
 * This function parses an absolute or relative URL and splits it
 * into individual components.
 *
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * A portion of the ABNFs are repeated here:
 *
 *   URI-reference = URI
 *                 / relative-ref
 *
 *   URI           = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 *
 *   relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
 *
 *   hier-part     = "//" authority path-abempty
 *                 / path-absolute
 *                 / path-rootless
 *                 / path-empty
 *
 *   relative-part = "//" authority path-abempty
 *                 / path-absolute
 *                 / path-noscheme
 *                 / path-empty
 *
 *   authority     = [ userinfo "@" ] host [ ":" port ]
 *
 * So, a URL has the following major components:
 *
 *   scheme
 *     The name of a method used to interpret the rest of
 *     the URL.  Examples:  "http", "https", "mailto", "file".
 *
 *   authority
 *     The name of the authority governing the URL's name
 *     space.  Examples:  "example.com", "user@example.com",
 *     "example.com:80", "user:password@example.com:80".
 *
 *     The authority may include a host name, port number,
 *     user name, and password.
 *
 *     The host may be a name, an IPv4 numeric address, or
 *     an IPv6 numeric address.
 *
 *   path
 *     The hierarchical path to the URL's resource.
 *     Examples:  "/index.htm", "/scripts/page.php".
 *
 *   query
 *     The data for a query.  Examples:  "?search=google.com".
 *
 *   fragment
 *     The name of a secondary resource relative to that named
 *     by the path.  Examples:  "#section1", "#header".
 *
 * This function splits the $url argument into the following components
 * and returns them in an associative array.  Keys to that array include:
 *
 *   "scheme"    The scheme, such as "http" (always lower-cased).
 *   "host"      The host name, IPv4, or IPv6 address (an IPv6 address
 *               is returned without its square brackets).  The host is
 *               an empty string when "//" is present without a host.
 *   "port"      The port number.
 *   "user"      The user name.
 *   "pass"      The user password.
 *   "path"      The path, such as a file path for "http".
 *   "query"     The query (without the leading "?").
 *   "fragment"  The fragment (without the leading "#").
 *
 * Only the components present in the URL are set; an empty URL yields
 * an empty array.
 *
 * Optionally, the "user", "pass", "host" (if a name, not an IP address),
 * "path", "query", and "fragment" may have percent-encoded characters
 * decoded.  The "scheme" and "port" cannot include percent-encoded
 * characters and are never decoded.  Decoding occurs after the URL has
 * been parsed.
 *
 * Parameters:
 *   url     the URL to parse.
 *
 *   decode  an optional boolean flag selecting whether
 *           to decode percent encoding or not.  Default = FALSE.
 *
 * Return values:
 *   the associative array of URL parts, or FALSE if the URL is
 *   too malformed to recognize any parts.
 */
function split_url( $url, $decode=FALSE)
{
    // Character sets from RFC3986.
    $xunressub     = 'a-zA-Z\d\-._~\!$&\'()*+,;=';
    $xpchar        = $xunressub . ':@% ';

    // Scheme from RFC3986.  The "-" is escaped: an unescaped "+-." is a
    // character range that would also admit ",".
    $xscheme       = '([a-zA-Z][a-zA-Z\d+\-.]*)';

    // User info (user + password) from RFC3986.
    $xuserinfo     = '(([' . $xunressub . '%]*)' .
                     '(:([' . $xunressub . ':%]*))?)';

    // IPv4 from RFC3986 (without digit constraints).
    $xipv4         = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})';

    // IPv6 from RFC2732 (without digit and grouping constraints).
    $xipv6         = '(\[([a-fA-F\d.:]+)\])';

    // Host name from RFC1035.  Technically, must start with a letter.
    // Relax that restriction to better parse URL structure, then
    // leave host name validation to application.
    $xhost_name    = '([a-zA-Z\d\-.%]+)';

    // Authority from RFC3986.  Skip IP future.
    $xhost         = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')';
    $xport         = '(\d*)';
    $xauthority    = '((' . $xuserinfo . '@)?' . $xhost .
                     '?(:' . $xport . ')?)';

    // Path from RFC3986.  Blend absolute & relative for efficiency.
    $xslash_seg    = '(/[' . $xpchar . ']*)';
    $xpath_authabs = '((//' . $xauthority . ')((/[' . $xpchar . ']*)*))';
    $xpath_rel     = '([' . $xpchar . ']+' . $xslash_seg . '*)';
    $xpath_abs     = '(/(' . $xpath_rel . ')?)';
    $xapath        = '(' . $xpath_authabs . '|' . $xpath_abs .
                     '|' . $xpath_rel . ')';

    // Query and fragment from RFC3986.
    $xqueryfrag    = '([' . $xpchar . '/?' . ']*)';

    // URL.
    $xurl          = '^(' . $xscheme . ':)?' .  $xapath . '?' .
                     '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?$';

    // Split the URL into components.
    if ( !preg_match( '!' . $xurl . '!', $url, $m ) )
        return FALSE;

    // Capture group numbers used below (they follow the nesting of the
    // sub-patterns above):
    //    2 scheme            5 "//" + authority    7 userinfo + "@"
    //    9 user             10 ":" + password     11 password
    //   13 host name        14 IPv4               16 IPv6 (no brackets)
    //   17 ":" + port       18 port
    //   19 path after an authority                21 absolute path
    //   25 relative path
    //   27 "?" + query      28 query
    //   29 "#" + fragment   30 fragment
    // preg_match() leaves unmatched groups as '' (or omits trailing ones),
    // hence the isset()/'' tests.  empty() must not be used on groups that
    // hold a bare value, because it would discard a legitimate "0".
    $parts = array();

    if ( !empty($m[2]) )            $parts['scheme']  = strtolower($m[2]);

    if ( !empty($m[7]) )            $parts['user']    = isset( $m[9] ) ? $m[9] : '';
    if ( !empty($m[10]) )           $parts['pass']    = $m[11];

    // Only host names (not IP literals) are percent-decoded later on.
    $isHostName = FALSE;
    if ( isset($m[13]) && $m[13] !== '' )
    {
        $parts['host'] = $m[13];
        $isHostName    = TRUE;
    }
    else if ( isset($m[14]) && $m[14] !== '' )  $parts['host'] = $m[14];
    else if ( isset($m[16]) && $m[16] !== '' )  $parts['host'] = $m[16];
    else if ( !empty( $m[5] ) )                 $parts['host'] = '';
    if ( !empty($m[17]) )           $parts['port']    = $m[18];

    if ( isset($m[19]) && $m[19] !== '' )       $parts['path'] = $m[19];
    else if ( isset($m[21]) && $m[21] !== '' )  $parts['path'] = $m[21];
    else if ( isset($m[25]) && $m[25] !== '' )  $parts['path'] = $m[25];

    if ( !empty($m[27]) )           $parts['query']   = $m[28];
    if ( !empty($m[29]) )           $parts['fragment']= $m[30];

    if ( !$decode )
        return $parts;

    // Values that empty() treats as empty ('' and "0") decode to themselves,
    // so skipping them here loses nothing.
    if ( !empty($parts['user']) )
        $parts['user']     = rawurldecode( $parts['user'] );
    if ( !empty($parts['pass']) )
        $parts['pass']     = rawurldecode( $parts['pass'] );
    if ( !empty($parts['path']) )
        $parts['path']     = rawurldecode( $parts['path'] );
    if ( $isHostName )
        $parts['host']     = rawurldecode( $parts['host'] );
    if ( !empty($parts['query']) )
        $parts['query']    = rawurldecode( $parts['query'] );
    if ( !empty($parts['fragment']) )
        $parts['fragment'] = rawurldecode( $parts['fragment'] );
    return $parts;
}
|
|
|
347 |
|
|
|
348 |
/**
 * This function joins together URL components to form a complete URL.
 *
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * This function implements the specification's "component recomposition"
 * algorithm for combining URI components into a full URI string.
 *
 * The $parts argument is an associative array containing zero or
 * more of the following:
 *
 *   "scheme"    The scheme, such as "http".
 *   "host"      The host name, IPv4, or IPv6 address.
 *   "port"      The port number.
 *   "user"      The user name.
 *   "pass"      The user password.
 *   "path"      The path, such as a file path for "http".
 *   "query"     The query.
 *   "fragment"  The fragment.
 *
 * The "port", "user", and "pass" values are only used when a "host"
 * is present.  The "pass" value is only used when a "user" is present.
 * An IPv6 host given without square brackets gets them added.  When a
 * host is present and the path does not start with "/", a "/" is
 * inserted between the authority and the path.
 *
 * The optional $encode argument indicates if appropriate URL components
 * should be percent-encoded as they are assembled into the URL.  Encoding
 * is only applied to the "user", "pass", "host" (if a host name, not an
 * IP address), "path", "query", and "fragment" components.  The "scheme"
 * and "port" are never encoded.  Slashes in the "path" are left alone so
 * that each segment of the hierarchy is encoded separately.
 *
 * Parameters:
 *   parts   an associative array of strings containing the
 *           individual parts of a URL.
 *
 *   encode  an optional boolean flag selecting whether
 *           to do percent encoding or not.  Default = FALSE.
 *
 * Return values:
 *   Returns the assembled URL string.  The string is an absolute
 *   URL if a scheme is supplied, and a relative URL if not.  An
 *   empty string is returned if the $parts array does not contain
 *   any of the needed values.
 */
function join_url( $parts, $encode=FALSE)
{
    // A path is "present" when it is a non-empty string.  empty() is not
    // used for this test because it would discard the valid path "0".
    $path = isset( $parts['path'] ) ? (string) $parts['path'] : '';

    if ( $encode )
    {
        if ( isset( $parts['user'] ) )
            $parts['user']     = rawurlencode( $parts['user'] );
        if ( isset( $parts['pass'] ) )
            $parts['pass']     = rawurlencode( $parts['pass'] );
        // Leave IP literals alone: either a bracketed IPv6 address or a
        // string made only of hex digits, dots and colons.  The pattern is
        // anchored at both ends so that it tests the whole host.
        if ( isset( $parts['host'] ) &&
            !preg_match( '!^(\[[\da-f.:]+\]|[\da-f.:]+)$!ui', $parts['host'] ) )
            $parts['host']     = rawurlencode( $parts['host'] );
        if ( $path !== '' )
            $path              = preg_replace( '!%2F!ui', '/',
                rawurlencode( $path ) );
        if ( isset( $parts['query'] ) )
            $parts['query']    = rawurlencode( $parts['query'] );
        if ( isset( $parts['fragment'] ) )
            $parts['fragment'] = rawurlencode( $parts['fragment'] );
    }

    $url = '';
    if ( !empty( $parts['scheme'] ) )
        $url .= $parts['scheme'] . ':';
    if ( isset( $parts['host'] ) )
    {
        $url .= '//';
        if ( isset( $parts['user'] ) )
        {
            $url .= $parts['user'];
            if ( isset( $parts['pass'] ) )
                $url .= ':' . $parts['pass'];
            $url .= '@';
        }
        if ( preg_match( '!^[\da-f]*:[\da-f.:]+$!ui', $parts['host'] ) )
            $url .= '[' . $parts['host'] . ']'; // IPv6
        else
            $url .= $parts['host'];             // IPv4 or name
        if ( isset( $parts['port'] ) )
            $url .= ':' . $parts['port'];
        // An authority must be separated from a rootless path.
        if ( $path !== '' && $path[0] != '/' )
            $url .= '/';
    }
    if ( $path !== '' )
        $url .= $path;
    if ( isset( $parts['query'] ) )
        $url .= '?' . $parts['query'];
    if ( isset( $parts['fragment'] ) )
        $url .= '#' . $parts['fragment'];
    return $url;
}
|
|
|
444 |
|
|
|
445 |
/**
 * Percent-encode a complete URL while leaving its structure intact.
 *
 * The whole string is first passed through rawurlencode(), which
 * encodes every character outside the RFC3986 "unreserved" set.  The
 * percent-encodings of the RFC3986 reserved characters, and of "%"
 * itself, are then turned back into the literal characters.  Restoring
 * "%" is what keeps already encoded input stable: '%20' stays '%20'
 * instead of becoming '%2520'.
 *
 * Parameters:
 *   url     the url to encode.
 *
 * Return values:
 *   Returns the encoded URL string.
 */
function encode_url($url) {
    // Pattern matching an encoded reserved character => the literal to
    // restore.  The entries are applied one after another in this order;
    // "%" comes last so that a restored "%" can never combine with the
    // following characters into a sequence an earlier entry would undo.
    $restore = array(
        '!%3A!ui' => ":",
        '!%2F!ui' => "/",
        '!%3F!ui' => "?",
        '!%23!ui' => "#",
        '!%5B!ui' => "[",
        '!%5D!ui' => "]",
        '!%40!ui' => "@",
        '!%21!ui' => "!",
        '!%24!ui' => "$",
        '!%26!ui' => "&",
        '!%27!ui' => "'",
        '!%28!ui' => "(",
        '!%29!ui' => ")",
        '!%2A!ui' => "*",
        '!%2B!ui' => "+",
        '!%2C!ui' => ",",
        '!%3B!ui' => ";",
        '!%3D!ui' => "=",
        '!%25!ui' => "%",
    );

    $encoded = rawurlencode($url);
    foreach ($restore as $pattern => $literal) {
        $encoded = preg_replace($pattern, $literal, $encoded);
    }
    return $encoded;
}
|
|
|
488 |
|
|
|
489 |
/**
 * Extract URLs from a web page.
 *
 * URLs are extracted from a long list of tags and attributes as defined
 * by the HTML 2.0, HTML 3.2, HTML 4.01, and draft HTML 5.0 specifications.
 * URLs are also extracted from tags and attributes that are common
 * extensions of HTML, from the draft Forms 2.0 specification, from XHTML,
 * and from WML 1.3 and 2.0.
 *
 * The function returns an associative array of associative arrays of
 * arrays of URLs.  The outermost array's keys are the tag (element) name,
 * such as "a" for <a> or "img" for <img>.  The values for these entries
 * are associative arrays where the keys are attribute names for those
 * tags, such as "href" for <a href="...">.  Finally, the values for
 * those arrays are URLs found in those tags and attributes throughout
 * the text.
 *
 * Three kinds of entries do not follow the tag/attribute scheme exactly:
 *
 *   - URLs from <meta http-equiv="..." content="..."> elements are
 *     collected under ["meta"]["http-equiv"].
 *   - URLs from style attributes and <style> bodies are collected under
 *     ["style"], in the format returned by extract_css_urls().
 *   - The "archive" attribute holds a list of URLs (space-separated on
 *     <object>, comma-separated on <applet>); every list member is
 *     returned as a separate URL.
 *
 * Attribute values are recognized when double-quoted or unquoted.
 *
 * Parameters:
 *   text    the UTF-8 text to scan
 *
 * Return values:
 *   an associative array where keys are tags and values are an
 *   associative array where keys are attributes and values are
 *   an array of URLs.  The array is empty when nothing was found.
 *
 * See:
 *   http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_web_page
 */
function extract_html_urls( $text )
{
    $match_elements = array(
        // HTML
        array('element'=>'a',       'attribute'=>'href'),       // 2.0
        array('element'=>'a',       'attribute'=>'urn'),        // 2.0
        array('element'=>'base',    'attribute'=>'href'),       // 2.0
        array('element'=>'form',    'attribute'=>'action'),     // 2.0
        array('element'=>'img',     'attribute'=>'src'),        // 2.0
        array('element'=>'link',    'attribute'=>'href'),       // 2.0

        array('element'=>'applet',  'attribute'=>'code'),       // 3.2
        array('element'=>'applet',  'attribute'=>'codebase'),   // 3.2
        array('element'=>'area',    'attribute'=>'href'),       // 3.2
        array('element'=>'body',    'attribute'=>'background'), // 3.2
        array('element'=>'img',     'attribute'=>'usemap'),     // 3.2
        array('element'=>'input',   'attribute'=>'src'),        // 3.2

        array('element'=>'applet',  'attribute'=>'archive'),    // 4.01
        array('element'=>'applet',  'attribute'=>'object'),     // 4.01
        array('element'=>'blockquote','attribute'=>'cite'),     // 4.01
        array('element'=>'del',     'attribute'=>'cite'),       // 4.01
        array('element'=>'frame',   'attribute'=>'longdesc'),   // 4.01
        array('element'=>'frame',   'attribute'=>'src'),        // 4.01
        array('element'=>'head',    'attribute'=>'profile'),    // 4.01
        array('element'=>'iframe',  'attribute'=>'longdesc'),   // 4.01
        array('element'=>'iframe',  'attribute'=>'src'),        // 4.01
        array('element'=>'img',     'attribute'=>'longdesc'),   // 4.01
        array('element'=>'input',   'attribute'=>'usemap'),     // 4.01
        array('element'=>'ins',     'attribute'=>'cite'),       // 4.01
        array('element'=>'object',  'attribute'=>'archive'),    // 4.01
        array('element'=>'object',  'attribute'=>'classid'),    // 4.01
        array('element'=>'object',  'attribute'=>'codebase'),   // 4.01
        array('element'=>'object',  'attribute'=>'data'),       // 4.01
        array('element'=>'object',  'attribute'=>'usemap'),     // 4.01
        array('element'=>'q',       'attribute'=>'cite'),       // 4.01
        array('element'=>'script',  'attribute'=>'src'),        // 4.01

        array('element'=>'audio',   'attribute'=>'src'),        // 5.0
        array('element'=>'command', 'attribute'=>'icon'),       // 5.0
        array('element'=>'embed',   'attribute'=>'src'),        // 5.0
        array('element'=>'event-source','attribute'=>'src'),    // 5.0
        array('element'=>'html',    'attribute'=>'manifest'),   // 5.0
        array('element'=>'source',  'attribute'=>'src'),        // 5.0
        array('element'=>'video',   'attribute'=>'src'),        // 5.0
        array('element'=>'video',   'attribute'=>'poster'),     // 5.0

        array('element'=>'bgsound', 'attribute'=>'src'),        // Extension
        array('element'=>'body',    'attribute'=>'credits'),    // Extension
        array('element'=>'body',    'attribute'=>'instructions'),// Extension
        array('element'=>'body',    'attribute'=>'logo'),       // Extension
        array('element'=>'div',     'attribute'=>'href'),       // Extension
        array('element'=>'div',     'attribute'=>'src'),        // Extension
        array('element'=>'embed',   'attribute'=>'code'),       // Extension
        array('element'=>'embed',   'attribute'=>'pluginspage'),// Extension
        array('element'=>'html',    'attribute'=>'background'), // Extension
        array('element'=>'ilayer',  'attribute'=>'src'),        // Extension
        array('element'=>'img',     'attribute'=>'dynsrc'),     // Extension
        array('element'=>'img',     'attribute'=>'lowsrc'),     // Extension
        array('element'=>'input',   'attribute'=>'dynsrc'),     // Extension
        array('element'=>'input',   'attribute'=>'lowsrc'),     // Extension
        array('element'=>'table',   'attribute'=>'background'), // Extension
        array('element'=>'td',      'attribute'=>'background'), // Extension
        array('element'=>'th',      'attribute'=>'background'), // Extension
        array('element'=>'layer',   'attribute'=>'src'),        // Extension
        array('element'=>'xml',     'attribute'=>'src'),        // Extension

        array('element'=>'button',  'attribute'=>'action'),     // Forms 2.0
        array('element'=>'datalist','attribute'=>'data'),       // Forms 2.0
        array('element'=>'form',    'attribute'=>'data'),       // Forms 2.0
        array('element'=>'input',   'attribute'=>'action'),     // Forms 2.0
        array('element'=>'select',  'attribute'=>'data'),       // Forms 2.0

        // XHTML
        array('element'=>'html',    'attribute'=>'xmlns'),

        // WML
        array('element'=>'access',  'attribute'=>'path'),       // 1.3
        array('element'=>'card',    'attribute'=>'onenterforward'), // 1.3
        array('element'=>'card',    'attribute'=>'onenterbackward'),// 1.3
        array('element'=>'card',    'attribute'=>'ontimer'),    // 1.3
        array('element'=>'go',      'attribute'=>'href'),       // 1.3
        array('element'=>'option',  'attribute'=>'onpick'),     // 1.3
        array('element'=>'template','attribute'=>'onenterforward'), // 1.3
        array('element'=>'template','attribute'=>'onenterbackward'),// 1.3
        array('element'=>'template','attribute'=>'ontimer'),    // 1.3
        array('element'=>'wml',     'attribute'=>'xmlns'),      // 2.0
    );

    $match_metas = array(
        'content-base',
        'content-location',
        'referer',
        'location',
        'refresh',
    );

    $urls = array( );

    // Extract all elements (the text between "<" and ">" of opening tags).
    if ( !preg_match_all( '/<([a-z][^>]*)>/iu', $text, $matches ) )
        return array( );
    $elements = $matches[1];

    // Attribute value: group 3 = double-quoted value, group 4 = bare value.
    $value_pattern = '=(("([^"]*)")|([^\s]*))';

    // Match elements and attributes
    foreach ( $match_elements as $match_element )
    {
        $name = $match_element['element'];
        $attr = $match_element['attribute'];
        $pattern = '/^' . $name . '\s.*' . $attr . $value_pattern . '/iu';

        // Only "archive" holds a list of URLs; every other attribute in the
        // table is a single URL and must not be split.
        $split_pattern = NULL;
        if ( $attr == 'archive' )
        {
            if ( $name == 'object' )
                $split_pattern = '/\s+/u';  // Space-separated URL list
            else
                $split_pattern = '/,\s*/u'; // Comma-separated URL list
        }

        foreach ( $elements as $element )
        {
            if ( !preg_match( $pattern, $element, $match ) )
                continue;
            $m = empty($match[3]) ? (!empty($match[4])?$match[4]:'') : $match[3];
            if ( $split_pattern === NULL )
                $urls[$name][$attr][] = $m;
            else
            {
                $msplit = preg_split( $split_pattern, $m, -1, PREG_SPLIT_NO_EMPTY );
                foreach ( $msplit as $ms )
                    $urls[$name][$attr][] = $ms;
            }
        }
    }

    // Match meta http-equiv elements
    foreach ( $match_metas as $match_meta )
    {
        $attr_pattern    = '/http-equiv="?' . $match_meta . '"?/iu';
        $content_pattern = '/content'  . $value_pattern . '/iu';
        $refresh_pattern = '/\d*;\s*(url=)?(.*)$/iu';
        foreach ( $elements as $element )
        {
            if ( !preg_match( '/^meta/iu', $element ) ||
                !preg_match( $attr_pattern, $element ) ||
                !preg_match( $content_pattern, $element, $match ) )
                continue;
            // Group 4 is absent from $match when the quoted form matched.
            $m = empty($match[3]) ? (isset($match[4]) ? $match[4] : '') : $match[3];
            if ( $match_meta != 'refresh' )
                $urls['meta']['http-equiv'][] = $m;
            else if ( preg_match( $refresh_pattern, $m, $match ) )
                $urls['meta']['http-equiv'][] = $match[2];
        }
    }

    // Match style attributes
    $urls['style'] = array( );
    $style_pattern = '/style' . $value_pattern . '/iu';
    foreach ( $elements as $element )
    {
        if ( !preg_match( $style_pattern, $element, $match ) )
            continue;
        $m = empty($match[3]) ? (isset($match[4]) ? $match[4] : '') : $match[3];
        $style_urls = extract_css_urls( $m );
        if ( !empty( $style_urls ) )
            $urls['style'] = array_merge_recursive(
                $urls['style'], $style_urls );
    }

    // Match style bodies
    if ( preg_match_all( '/<style[^>]*>(.*?)<\/style>/siu', $text, $style_bodies ) )
    {
        foreach ( $style_bodies[1] as $style_body )
        {
            $style_urls = extract_css_urls( $style_body );
            if ( !empty( $style_urls ) )
                $urls['style'] = array_merge_recursive(
                    $urls['style'], $style_urls );
        }
    }
    if ( empty($urls['style']) )
        unset( $urls['style'] );

    return $urls;
}
|
|
|
698 |
/**
 * Collect the URLs referenced by a piece of UTF-8 CSS text.
 *
 * Two kinds of references are recognized and reported under separate
 * keys of the returned associative array:
 *
 *   "import"    URLs of @import statements, written either as a quoted
 *               string or with the url() function.
 *   "property"  URLs of any other url() function.
 *
 * Each value is an array of URLs.  Backslash escapes inside a URL are
 * resolved (the backslash is dropped and the escaped character kept).
 * A key is only present when at least one URL of that kind was found.
 * Within "import", the URLs of quoted @import statements precede those
 * of @import url() statements.
 *
 * Parameters:
 *   text    the UTF-8 text to scan
 *
 * Return values:
 *   an associative array of arrays of URLs.
 *
 * See:
 *   http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_css_file
 */
function extract_css_urls( $text )
{
    $found = array( );

    // A URL: runs of ordinary characters, each optionally followed by a
    // backslash-escaped character.
    $url_pattern     = '(([^\\\\\'", \(\)]*(\\\\.)?)+)';
    $urlfunc_pattern = 'url\(\s*[\'"]?' . $url_pattern . '[\'"]?\s*\)';

    $quoted_import  = '(@import\s*[\'"]' . $url_pattern . '[\'"])';
    $urlfunc_import = '(@import\s*' . $urlfunc_pattern . ')';
    $plain_urlfunc  = '(' . $urlfunc_pattern . ')';
    $pattern = '/(' . $quoted_import . '|' . $urlfunc_import . '|' .
        $plain_urlfunc . ')/iu';

    if ( !preg_match_all( $pattern, $text, $matches ) )
        return $found;

    // Capture group holding the URL of each alternative => result key:
    //    3  @import '...'  /  @import "..."
    //    7  @import url(...)
    //   11  url(...)
    $url_groups = array(
        3  => 'import',
        7  => 'import',
        11 => 'property',
    );

    foreach ( $url_groups as $group => $context )
    {
        foreach ( $matches[$group] as $raw )
        {
            // Alternatives that did not take part in a match are empty.
            if ( empty($raw) )
                continue;
            $found[$context][] = preg_replace( '/\\\\(.)/u', '\\1', $raw );
        }
    }

    return $found;
}
|