Proyectos de Subversion Moodle

Rev

| Ultima modificación | Ver Log |

Rev Autor Línea Nro. Línea
1 efrain 1
<?php

/*
 * Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>
 *
 * This script is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * The GNU General Public License can be found at
 * http://www.gnu.org/copyleft/gpl.html.
 *
 * This script is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

namespace Html2Text;
21
 
22
/**
 * Converts an HTML fragment or document into formatted plain text.
 *
 * The conversion is a pipeline of regular-expression substitutions
 * (see converter()): blockquotes and PRE blocks are pre-processed, block and
 * inline tags are turned into textual equivalents, the remaining tags are
 * stripped, entities are decoded and the result is optionally word-wrapped.
 *
 * The result is computed lazily by getText() and cached until the source HTML
 * or the base URL changes.
 */
class Html2Text
{
    /** Character encoding used for multibyte and entity handling. */
    const ENCODING = 'UTF-8';

    /**
     * Flags handed to html_entity_decode() and htmlspecialchars().
     * Assigned by the constructor on every construction path.
     *
     * @var int $htmlFuncFlags
     */
    protected $htmlFuncFlags;

    /**
     * Contains the HTML content to convert.
     *
     * @var string $html
     */
    protected $html;

    /**
     * Contains the converted, formatted text.
     *
     * @var string $text
     */
    protected $text;

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $replace.
     *
     * The patterns are applied in order; newlines are collapsed by the second
     * pattern, which is why the later tag patterns do not need the "s" flag.
     *
     * @var array $search
     * @see $replace
     */
    protected $search = array(
        "/\r/",                                           // Non-legal carriage return
        "/[\n\t]+/",                                      // Newlines and tabs
        '/<head\b[^>]*>.*?<\/head>/i',                    // <head>
        '/<script\b[^>]*>.*?<\/script>/i',                // <script>s -- which strip_tags supposedly has problems with
        '/<style\b[^>]*>.*?<\/style>/i',                  // <style>s -- which strip_tags supposedly has problems with
        '/<i\b[^>]*>(.*?)<\/i>/i',                        // <i>
        '/<em\b[^>]*>(.*?)<\/em>/i',                      // <em>
        '/<ins\b[^>]*>(.*?)<\/ins>/i',                    // <ins>
        '/(<ul\b[^>]*>|<\/ul>)/i',                        // <ul> and </ul>
        '/(<ol\b[^>]*>|<\/ol>)/i',                        // <ol> and </ol>
        '/(<dl\b[^>]*>|<\/dl>)/i',                        // <dl> and </dl>
        '/<li\b[^>]*>(.*?)<\/li>/i',                      // <li> and </li>
        '/<dd\b[^>]*>(.*?)<\/dd>/i',                      // <dd> and </dd>
        '/<dt\b[^>]*>(.*?)<\/dt>/i',                      // <dt> and </dt>
        '/<li\b[^>]*>/i',                                 // <li>
        '/<hr\b[^>]*>/i',                                 // <hr>
        '/<div\b[^>]*>/i',                                // <div>
        '/(<table\b[^>]*>|<\/table>)/i',                  // <table> and </table>
        '/(<tr\b[^>]*>|<\/tr>)/i',                        // <tr> and </tr>
        '/<td\b[^>]*>(.*?)<\/td>/i',                      // <td> and </td>
        '/<span class="_html2text_ignore">.+?<\/span>/i', // <span class="_html2text_ignore">...</span>
        '/<(img)\b[^>]*alt=\"([^>"]+)\"[^>]*>/i',         // <img> with alt tag
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     * Entry N is the replacement for entry N of $search.
     *
     * @var array $replace
     * @see $search
     */
    protected $replace = array(
        '',                              // Non-legal carriage return
        ' ',                             // Newlines and tabs
        '',                              // <head>
        '',                              // <script>s -- which strip_tags supposedly has problems with
        '',                              // <style>s -- which strip_tags supposedly has problems with
        '_\\1_',                         // <i>
        '_\\1_',                         // <em>
        '_\\1_',                         // <ins>
        "\n\n",                          // <ul> and </ul>
        "\n\n",                          // <ol> and </ol>
        "\n\n",                          // <dl> and </dl>
        "\t* \\1\n",                     // <li> and </li>
        " \\1\n",                        // <dd> and </dd>
        "\t* \\1",                       // <dt> and </dt>
        "\n\t* ",                        // <li>
        "\n-------------------------\n", // <hr>
        "<div>\n",                       // <div>
        "\n\n",                          // <table> and </table>
        "\n",                            // <tr> and </tr>
        "\t\t\\1\n",                     // <td> and </td>
        "",                              // <span class="_html2text_ignore">...</span>
        '[\\2]',                         // <img> with alt tag
    );

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $entReplace.
     *
     * @var array $entSearch
     * @see $entReplace
     */
    protected $entSearch = array(
        '/&#153;/i',                                     // TM symbol in win-1252
        '/&#151;/i',                                     // m-dash in win-1252
        '/&(amp|#38);/i',                                // Ampersand: see converter()
        '/[ ]{2,}/',                                     // Runs of spaces, post-handling
        '/&#39;/i',                                      // The apostrophe symbol
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * The non-ASCII replacements are written as explicit UTF-8 byte escapes so
     * that they cannot be corrupted by a re-encoding of this source file.
     *
     * @var array $entReplace
     * @see $entSearch
     */
    protected $entReplace = array(
        "\xE2\x84\xA2", // TM symbol (U+2122)
        "\xE2\x80\x94", // m-dash (U+2014)
        '|+|amp|+|',    // Ampersand: see converter()
        ' ',            // Runs of spaces, post-handling
        '\'',           // Apostrophe
    );

    /**
     * List of preg* regular expression patterns to search for
     * and replace using callback function.
     *
     * The first capture group of every pattern is the tag name that
     * pregCallback() dispatches on.
     *
     * @var array $callbackSearch
     */
    protected $callbackSearch = array(
        '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',           // h1 - h6
        '/[ ]*<(p)( [^>]*)?>(.*?)<\/p>[ ]*/si',                  // <p> with surrounding whitespace.
        '/<(br)[^>]*>[ ]*/i',                                    // <br> with leading whitespace after the newline.
        '/<(b)( [^>]*)?>(.*?)<\/b>/i',                           // <b>
        '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',                 // <strong>
        '/<(del)( [^>]*)?>(.*?)<\/del>/i',                       // <del>
        '/<(th)( [^>]*)?>(.*?)<\/th>/i',                         // <th> and </th>
        '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i'  // <a href="">
    );

    /**
     * List of preg* regular expression patterns to search for in PRE body,
     * used in conjunction with $preReplace.
     *
     * @var array $preSearch
     * @see $preReplace
     */
    protected $preSearch = array(
        "/\n/",
        "/\t/",
        '/ /',
        '/<pre[^>]*>/',
        '/<\/pre>/'
    );

    /**
     * List of pattern replacements corresponding to patterns searched for PRE body.
     *
     * @var array $preReplace
     * @see $preSearch
     */
    protected $preReplace = array(
        '<br>',
        '&nbsp;&nbsp;&nbsp;&nbsp;',
        '&nbsp;',
        '',
        '',
    );

    /**
     * Temporary workspace used during PRE processing.
     *
     * @var string $preContent
     */
    protected $preContent = '';

    /**
     * Contains the base URL that relative links should resolve to.
     *
     * @var string $baseurl
     */
    protected $baseurl = '';

    /**
     * Indicates whether content in the $html variable has been converted yet.
     *
     * @var boolean $converted
     * @see $html, $text
     */
    protected $converted = false;

    /**
     * Contains URL addresses from links to be rendered in plain text.
     *
     * @var array $linkList
     * @see buildlinkList()
     */
    protected $linkList = array();

    /**
     * Various configuration options (able to be set in the constructor)
     *
     * @var array $options
     */
    protected $options = array(
        'do_links' => 'inline', // 'none'
                                // 'inline' (show links inline)
                                // 'nextline' (show links on the next line)
                                // 'table' (if a table of link URLs should be listed after the text)
                                // 'bbcode' (show links as bbcode)

        'width' => 70,          //  Maximum width of the formatted text, in columns.
                                //  Set this value to 0 (or less) to ignore word wrapping
                                //  and not constrain text to a fixed-width column.
    );

    /**
     * Implements the old three-argument constructor signature.
     *
     * @param string $html     Source HTML
     * @param bool   $fromFile Must be falsy; loading from a file is no longer supported
     * @param array  $options  Set configuration options
     *
     * @throws \InvalidArgumentException when $fromFile is truthy (raised by set_html())
     */
    private function legacyConstruct($html = '', $fromFile = false, array $options = array())
    {
        $this->set_html($html, $fromFile);
        $this->options = array_merge($this->options, $options);
    }

    /**
     * Creates a converter for the given HTML.
     *
     * For backwards compatibility the second argument may also be the legacy
     * "$fromFile" flag, optionally followed by the options array; any
     * non-array second argument selects that legacy signature.
     *
     * @param string $html    Source HTML
     * @param array  $options Set configuration options
     */
    public function __construct($html = '', $options = array())
    {
        // Set the entity flags before anything else so that BOTH construction
        // paths leave the object usable. Previously the legacy path returned
        // early with the flags unset, which broke entity decoding.
        $this->htmlFuncFlags = (PHP_VERSION_ID < 50400)
            ? ENT_COMPAT
            : ENT_COMPAT | ENT_HTML5;

        // for backwards compatibility
        if (!is_array($options)) {
            call_user_func_array(array($this, 'legacyConstruct'), func_get_args());

            return;
        }

        $this->html = $html;
        $this->options = array_merge($this->options, $options);
    }

    /**
     * Get the source HTML
     *
     * @return string
     */
    public function getHtml()
    {
        return $this->html;
    }

    /**
     * Set the source HTML and invalidate any previously converted text.
     *
     * @param string $html HTML source content
     */
    public function setHtml($html)
    {
        $this->html = $html;
        $this->converted = false;
    }

    /**
     * @deprecated Use setHtml() instead.
     *
     * @param string $html      HTML source content
     * @param bool   $from_file Must be falsy
     *
     * @throws \InvalidArgumentException when $from_file is truthy
     */
    public function set_html($html, $from_file = false)
    {
        if ($from_file) {
            throw new \InvalidArgumentException("Argument from_file no longer supported");
        }

        return $this->setHtml($html);
    }

    /**
     * Returns the text, converted from HTML.
     *
     * The conversion runs on first use and is cached afterwards.
     *
     * @return string Plain text
     */
    public function getText()
    {
        if (!$this->converted) {
            $this->convert();
        }

        return $this->text;
    }

    /**
     * @deprecated Use getText() instead.
     *
     * @return string Plain text
     */
    public function get_text()
    {
        return $this->getText();
    }

    /**
     * @deprecated Print the result of getText() instead.
     */
    public function print_text()
    {
        print $this->getText();
    }

    /**
     * @deprecated Print the result of getText() instead.
     */
    public function p()
    {
        return $this->print_text();
    }

    /**
     * Sets a base URL to handle relative links.
     *
     * The cached text embeds resolved link URLs, so changing the base URL
     * invalidates it.
     *
     * @param string $baseurl
     */
    public function setBaseUrl($baseurl)
    {
        $this->baseurl = $baseurl;
        $this->converted = false;
    }

    /**
     * @deprecated Use setBaseUrl() instead.
     *
     * @param string $baseurl
     */
    public function set_base_url($baseurl)
    {
        return $this->setBaseUrl($baseurl);
    }

    /**
     * Runs the conversion with the mbstring internal encoding temporarily
     * switched to self::ENCODING.
     *
     * The caller's encoding is restored even when the conversion throws.
     * Catch-and-rethrow is used instead of "finally" because this file still
     * carries a PHP < 5.4 compatibility check.
     */
    protected function convert()
    {
        $origEncoding = mb_internal_encoding();
        mb_internal_encoding(self::ENCODING);

        try {
            $this->doConvert();
        } catch (\Exception $e) {
            mb_internal_encoding($origEncoding);
            throw $e;
        } catch (\Throwable $e) {
            // PHP 7+ engine errors; this clause never matches on PHP 5.
            mb_internal_encoding($origEncoding);
            throw $e;
        }

        mb_internal_encoding($origEncoding);
    }

    /**
     * Converts $html into $text and, when links were collected in
     * $linkList, appends them as a numbered "Links:" section.
     */
    protected function doConvert()
    {
        $this->linkList = array();

        $text = trim($this->html);

        $this->converter($text);

        if ($this->linkList) {
            $text .= "\n\nLinks:\n------\n";
            foreach ($this->linkList as $i => $url) {
                $text .= '[' . ($i + 1) . '] ' . $url . "\n";
            }
        }

        $this->text = $text;

        $this->converted = true;
    }

    /**
     * Core conversion pipeline; rewrites $text in place.
     *
     * Also invoked recursively by convertBlockquotes() on blockquote bodies.
     *
     * @param string &$text HTML on input, plain text on output
     */
    protected function converter(&$text)
    {
        $this->convertBlockquotes($text);
        $this->convertPre($text);
        $text = preg_replace($this->search, $this->replace, $text);
        $text = preg_replace_callback($this->callbackSearch, array($this, 'pregCallback'), $text);
        $text = strip_tags($text);
        $text = preg_replace($this->entSearch, $this->entReplace, $text);
        $text = html_entity_decode($text, $this->htmlFuncFlags, self::ENCODING);

        // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
        $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);

        // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
        // This properly handles situation of "&amp;quot;" in input string
        $text = str_replace('|+|amp|+|', '&', $text);

        // Normalise empty lines
        $text = preg_replace("/\n\s+\n/", "\n\n", $text);
        $text = preg_replace("/[\n]{3,}/", "\n\n", $text);

        // remove leading empty lines (can be produced by eg. P tag on the beginning)
        $text = ltrim($text, "\n");

        if ($this->options['width'] > 0) {
            $text = wordwrap($text, $this->options['width']);
        }
    }

    /**
     * Helper function called by preg_replace() on link replacement.
     *
     * Maintains an internal list of links to be displayed at the end of the
     * text, with numeric indices to the original point in the text they
     * appeared. Also makes an effort at identifying and handling absolute
     * and relative links.
     *
     * @param  string      $link         URL of the link
     * @param  string      $display      Part of the text to associate number with
     * @param  string|null $linkOverride Link method to use instead of the 'do_links' option
     * @return string Text that replaces the whole anchor element
     */
    protected function buildlinkList($link, $display, $linkOverride = null)
    {
        $linkMethod = ($linkOverride) ? $linkOverride : $this->options['do_links'];
        if ($linkMethod == 'none') {
            return $display;
        }

        // Ignored link types. Decode with the same flags/encoding as the rest
        // of the class so entity-obfuscated schemes are recognised consistently.
        $decodedLink = html_entity_decode($link, $this->htmlFuncFlags, self::ENCODING);
        if (preg_match('!^(javascript:|mailto:|#)!i', $decodedLink)) {
            return $display;
        }

        if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
            // Link carries its own scheme: use it untouched.
            $url = $link;
        } else {
            // Relative link: prefix the base URL, adding a separator if needed.
            $url = $this->baseurl;
            if (mb_substr($link, 0, 1) != '/') {
                $url .= '/';
            }
            $url .= $link;
        }

        if ($linkMethod == 'table') {
            // Strict search: loose comparison would treat distinct
            // numeric-looking strings (e.g. "1e3" and "1000") as the same URL.
            if (($index = array_search($url, $this->linkList, true)) === false) {
                $index = count($this->linkList);
                $this->linkList[] = $url;
            }

            return $display . ' [' . ($index + 1) . ']';
        } elseif ($linkMethod == 'nextline') {
            if ($url === $display) {
                return $display;
            }

            return $display . "\n[" . $url . ']';
        } elseif ($linkMethod == 'bbcode') {
            return sprintf('[url=%s]%s[/url]', $url, $display);
        } else { // link_method defaults to inline
            if ($url === $display) {
                return $display;
            }

            return $display . ' [' . $url . ']';
        }
    }

    /**
     * Helper function for PRE body conversion.
     *
     * Each PRE element is replaced, one at a time, by a DIV whose whitespace
     * has been turned into <br> tags and &nbsp; entities so that it survives
     * the later whitespace-collapsing steps.
     *
     * @param string &$text HTML content
     */
    protected function convertPre(&$text)
    {
        // get the content of PRE element
        while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
            // Replace br tags with newlines to prevent the search-and-replace callback from killing whitespace
            $this->preContent = preg_replace('/(<br\b[^>]*>)/i', "\n", $matches[1]);

            // Run our defined tags search-and-replace with callback
            $this->preContent = preg_replace_callback(
                $this->callbackSearch,
                array($this, 'pregCallback'),
                $this->preContent
            );

            // convert the content
            $this->preContent = sprintf(
                '<div><br>%s<br></div>',
                preg_replace($this->preSearch, $this->preReplace, $this->preContent)
            );

            // replace the content (use callback because content can contain $0 variable)
            $text = preg_replace_callback(
                '/<pre[^>]*>.*<\/pre>/ismU',
                array($this, 'pregPreCallback'),
                $text,
                1
            );

            // free memory
            $this->preContent = '';
        }
    }

    /**
     * Helper function for BLOCKQUOTE body conversion.
     *
     * Every outermost blockquote is converted on its own (with the width
     * reduced by two columns for the "> " marker), prefixed with citation
     * markers and re-inserted as an escaped PRE block.
     *
     * @param string &$text HTML content
     */
    protected function convertBlockquotes(&$text)
    {
        if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
            $originalText = $text;
            $start = 0;
            $taglen = 0;
            $level = 0;
            // Characters by which $text has shrunk relative to $originalText.
            $diff = 0;
            foreach ($matches[0] as $m) {
                // PREG_OFFSET_CAPTURE reports byte offsets; convert to a character offset.
                $m[1] = mb_strlen(substr($originalText, 0, $m[1]));
                if ($m[0][0] == '<' && $m[0][1] == '/') {
                    $level--;
                    if ($level < 0) {
                        $level = 0; // malformed HTML: go to next blockquote
                    } elseif ($level > 0) {
                        // skip inner blockquote
                    } else {
                        $end = $m[1];
                        $len = $end - $taglen - $start;
                        // Get blockquote content
                        $body = mb_substr($text, $start + $taglen - $diff, $len);

                        // Set text width
                        $pWidth = $this->options['width'];
                        if ($this->options['width'] > 0) {
                            $this->options['width'] -= 2;
                        }
                        // Convert blockquote content
                        $body = trim($body);
                        $this->converter($body);
                        // Add citation markers and create PRE block
                        $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
                        $body = '<pre>' . htmlspecialchars($body, $this->htmlFuncFlags, self::ENCODING) . '</pre>';
                        // Re-set text width
                        $this->options['width'] = $pWidth;
                        // Replace content
                        $text = mb_substr($text, 0, $start - $diff)
                            . $body
                            . mb_substr($text, $end + mb_strlen($m[0]) - $diff);

                        $diff += $len + $taglen + mb_strlen($m[0]) - mb_strlen($body);
                        unset($body);
                    }
                } else {
                    if ($level == 0) {
                        $start = $m[1];
                        $taglen = mb_strlen($m[0]);
                    }
                    $level++;
                }
            }
        }
    }

    /**
     * Callback function for preg_replace_callback use.
     *
     * Dispatches on the tag name captured as group 1 by $callbackSearch.
     *
     * @param  array  $matches PREG matches
     * @return string Replacement text; empty string for an unhandled tag
     */
    protected function pregCallback($matches)
    {
        switch (mb_strtolower($matches[1])) {
            case 'p':
                // Replace newlines with spaces.
                $para = str_replace("\n", " ", $matches[3]);

                // Trim trailing and leading whitespace within the tag.
                $para = trim($para);

                // Add trailing newlines for this para.
                return "\n" . $para . "\n";
            case 'br':
                return "\n";
            case 'b':
            case 'strong':
                return $this->toupper($matches[3]);
            case 'del':
                return $this->tostrike($matches[3]);
            case 'th':
                return $this->toupper("\t\t" . $matches[3] . "\n");
            case 'h':
                return $this->toupper("\n\n" . $matches[3] . "\n\n");
            case 'a':
                // override the link method
                $linkOverride = null;
                if (preg_match('/_html2text_link_(\w+)/', $matches[4], $linkOverrideMatch)) {
                    $linkOverride = $linkOverrideMatch[1];
                }
                // Remove spaces in URL (#1487805)
                $url = str_replace(' ', '', $matches[3]);

                return $this->buildlinkList($url, $matches[5], $linkOverride);
        }

        return '';
    }

    /**
     * Callback function for preg_replace_callback use in PRE content handler.
     *
     * @param  array  $matches PREG matches (unused)
     * @return string The prepared PRE replacement held in $preContent
     */
    protected function pregPreCallback(/** @noinspection PhpUnusedParameterInspection */ $matches)
    {
        return $this->preContent;
    }

    /**
     * Strtoupper function with HTML tags and entities handling.
     *
     * @param  string $str Text to convert
     * @return string Converted text
     */
    protected function toupper($str)
    {
        // string can contain HTML tags
        $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        // convert toupper only the text between HTML tags
        foreach ($chunks as $i => $chunk) {
            if ($chunk[0] != '<') {
                $chunks[$i] = $this->strtoupper($chunk);
            }
        }

        return implode($chunks);
    }

    /**
     * Strtoupper multibyte wrapper function with HTML entities handling.
     *
     * Entities are decoded first so that they are upper-cased as characters
     * rather than as entity names, then special characters are re-escaped.
     *
     * @param  string $str Text to convert
     * @return string Converted text
     */
    protected function strtoupper($str)
    {
        $str = html_entity_decode($str, $this->htmlFuncFlags, self::ENCODING);
        $str = mb_strtoupper($str);
        $str = htmlspecialchars($str, $this->htmlFuncFlags, self::ENCODING);

        return $str;
    }

    /**
     * Helper function for DEL conversion.
     *
     * Appends U+0336 (combining long stroke overlay) to every character so
     * that the text renders as struck through.
     *
     * @param  string $str HTML content
     * @return string Converted text
     */
    protected function tostrike($str)
    {
        // UTF-8 byte sequence of U+0336; identical to the bytes the original
        // computed with chr() on each iteration.
        $combiningChr = "\xCC\xB6";
        // Measure once instead of on every loop iteration.
        $length = mb_strlen($str);

        $rtn = '';
        for ($i = 0; $i < $length; $i++) {
            $rtn .= mb_substr($str, $i, 1) . $combiningChr;
        }

        return $rtn;
    }
}