Proyectos de Subversion Moodle

Rev

| Ultima modificación | Ver Log |

Rev Autor Línea Nro. Línea
1441 ariadna 1
<?php
2
 
3
/*
4
 * Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>
5
 *
6
 * This script is free software; you can redistribute it and/or modify
7
 * it under the terms of the GNU General Public License as published by
8
 * the Free Software Foundation; either version 2 of the License, or
9
 * (at your option) any later version.
10
 *
11
 * The GNU General Public License can be found at
12
 * http://www.gnu.org/copyleft/gpl.html.
13
 *
14
 * This script is distributed in the hope that it will be useful,
15
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
 * GNU General Public License for more details.
18
 */
19
 
20
namespace Html2Text;
21
 
22
/**
 * Converts an HTML string into formatted plain text.
 *
 * The conversion is regex-driven: block-level constructs (blockquote, pre)
 * are rewritten first, then a list of search/replace patterns and callback
 * patterns is applied, remaining tags are stripped and entities are decoded.
 * The result is cached until the source HTML is replaced via setHtml().
 */
class Html2Text
{
    const ENCODING = 'UTF-8';

    /**
     * Flags passed to html_entity_decode() / htmlspecialchars().
     * Set in the constructor depending on the running PHP version.
     *
     * @var int $htmlFuncFlags
     */
    protected $htmlFuncFlags;

    /**
     * Contains the HTML content to convert.
     *
     * @var string $html
     */
    protected $html;

    /**
     * Contains the converted, formatted text.
     *
     * @var string $text
     */
    protected $text;

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $replace.
     *
     * The two arrays are matched up by position, so entries must be kept
     * in the same order in both.
     *
     * @var array $search
     * @see $replace
     */
    protected $search = array(
        "/\r/",                                           // Non-legal carriage return
        "/[\n\t]+/",                                      // Newlines and tabs
        '/<head\b[^>]*>.*?<\/head>/i',                    // <head>
        '/<script\b[^>]*>.*?<\/script>/i',                // <script>s -- which strip_tags supposedly has problems with
        '/<style\b[^>]*>.*?<\/style>/i',                  // <style>s -- which strip_tags supposedly has problems with
        '/<i\b[^>]*>(.*?)<\/i>/i',                        // <i>
        '/<em\b[^>]*>(.*?)<\/em>/i',                      // <em>
        '/<ins\b[^>]*>(.*?)<\/ins>/i',                    // <ins>
        '/(<ul\b[^>]*>|<\/ul>)/i',                        // <ul> and </ul>
        '/(<ol\b[^>]*>|<\/ol>)/i',                        // <ol> and </ol>
        '/(<dl\b[^>]*>|<\/dl>)/i',                        // <dl> and </dl>
        '/<li\b[^>]*>(.*?)<\/li>/i',                      // <li> and </li>
        '/<dd\b[^>]*>(.*?)<\/dd>/i',                      // <dd> and </dd>
        '/<dt\b[^>]*>(.*?)<\/dt>/i',                      // <dt> and </dt>
        '/<li\b[^>]*>/i',                                 // <li>
        '/<hr\b[^>]*>/i',                                 // <hr>
        '/<div\b[^>]*>/i',                                // <div>
        '/(<table\b[^>]*>|<\/table>)/i',                  // <table> and </table>
        '/(<tr\b[^>]*>|<\/tr>)/i',                        // <tr> and </tr>
        '/<td\b[^>]*>(.*?)<\/td>/i',                      // <td> and </td>
        '/<span class="_html2text_ignore">.+?<\/span>/i', // <span class="_html2text_ignore">...</span>
        '/<(img)\b[^>]*alt=\"([^>"]+)\"[^>]*>/i',         // <img> with alt tag
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * @var array $replace
     * @see $search
     */
    protected $replace = array(
        '',                              // Non-legal carriage return
        ' ',                             // Newlines and tabs
        '',                              // <head>
        '',                              // <script>s -- which strip_tags supposedly has problems with
        '',                              // <style>s -- which strip_tags supposedly has problems with
        '_\\1_',                         // <i>
        '_\\1_',                         // <em>
        '_\\1_',                         // <ins>
        "\n\n",                          // <ul> and </ul>
        "\n\n",                          // <ol> and </ol>
        "\n\n",                          // <dl> and </dl>
        "\t* \\1\n",                     // <li> and </li>
        " \\1\n",                        // <dd> and </dd>
        "\t* \\1",                       // <dt> and </dt>
        "\n\t* ",                        // <li>
        "\n-------------------------\n", // <hr>
        "<div>\n",                       // <div>
        "\n\n",                          // <table> and </table>
        "\n",                            // <tr> and </tr>
        "\t\t\\1\n",                     // <td> and </td>
        "",                              // <span class="_html2text_ignore">...</span>
        '[\\2]',                         // <img> with alt tag
    );

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $entReplace.
     *
     * @var array $entSearch
     * @see $entReplace
     */
    protected $entSearch = array(
        '/&#153;/i',                                     // TM symbol in win-1252
        '/&#151;/i',                                     // m-dash in win-1252
        '/&(amp|#38);/i',                                // Ampersand: see converter()
        '/[ ]{2,}/',                                     // Runs of spaces, post-handling
        '/&#39;/i',                                      // The apostrophe symbol
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * @var array $entReplace
     * @see $entSearch
     */
    protected $entReplace = array(
        '™',         // TM symbol
        '—',         // m-dash
        '|+|amp|+|', // Ampersand: see converter()
        ' ',         // Runs of spaces, post-handling
        '\'',        // Apostrophe
    );

    /**
     * List of preg* regular expression patterns to search for
     * and replace using callback function.
     *
     * The first capture group of every pattern is the tag name that
     * pregCallback() dispatches on.
     *
     * @var array $callbackSearch
     * @see pregCallback()
     */
    protected $callbackSearch = array(
        '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',           // h1 - h6
        '/[ ]*<(p)( [^>]*)?>(.*?)<\/p>[ ]*/si',                  // <p> with surrounding whitespace.
        '/<(br)[^>]*>[ ]*/i',                                    // <br> with leading whitespace after the newline.
        '/<(b)( [^>]*)?>(.*?)<\/b>/i',                           // <b>
        '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',                 // <strong>
        '/<(del)( [^>]*)?>(.*?)<\/del>/i',                       // <del>
        '/<(th)( [^>]*)?>(.*?)<\/th>/i',                         // <th> and </th>
        '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i'  // <a href="">
    );

    /**
     * List of preg* regular expression patterns to search for in PRE body,
     * used in conjunction with $preReplace.
     *
     * @var array $preSearch
     * @see $preReplace
     */
    protected $preSearch = array(
        "/\n/",
        "/\t/",
        '/ /',
        '/<pre[^>]*>/',
        '/<\/pre>/'
    );

    /**
     * List of pattern replacements corresponding to patterns searched for PRE body.
     *
     * @var array $preReplace
     * @see $preSearch
     */
    protected $preReplace = array(
        '<br>',
        '&nbsp;&nbsp;&nbsp;&nbsp;',
        '&nbsp;',
        '',
        '',
    );

    /**
     * Temporary workspace used during PRE processing.
     *
     * @var string $preContent
     */
    protected $preContent = '';

    /**
     * Contains the base URL that relative links should resolve to.
     *
     * @var string $baseurl
     */
    protected $baseurl = '';

    /**
     * Indicates whether content in the $html variable has been converted yet.
     *
     * @var boolean $converted
     * @see $html, $text
     */
    protected $converted = false;

    /**
     * Contains URL addresses from links to be rendered in plain text.
     *
     * @var array $linkList
     * @see buildlinkList()
     */
    protected $linkList = array();

    /**
     * Various configuration options (able to be set in the constructor)
     *
     * @var array $options
     */
    protected $options = array(
        'do_links' => 'inline', // 'none'
                                // 'inline' (show links inline)
                                // 'nextline' (show links on the next line)
                                // 'table' (list a table of link URLs after the text)
                                // 'bbcode' (show links as bbcode)

        'width' => 70,          //  Maximum width of the formatted text, in columns.
                                //  Set this value to 0 (or less) to ignore word wrapping
                                //  and not constrain text to a fixed-width column.
    );

    /**
     * Handles the legacy constructor signature ($html, $fromFile, $options).
     *
     * @param string $html     Source HTML
     * @param bool   $fromFile No longer supported; a truthy value makes set_html() throw
     * @param array  $options  Set configuration options
     */
    private function legacyConstruct($html = '', $fromFile = false, array $options = array())
    {
        $this->set_html($html, $fromFile);
        $this->options = array_merge($this->options, $options);
    }

    /**
     * @param string $html    Source HTML
     * @param array  $options Set configuration options
     */
    public function __construct($html = '', $options = array())
    {
        // ENT_HTML5 only exists from PHP 5.4 onwards.
        $this->htmlFuncFlags = (PHP_VERSION_ID < 50400)
            ? ENT_QUOTES
            : ENT_QUOTES | ENT_HTML5;

        // for backwards compatibility: a non-array second argument means the
        // caller used the old ($html, $fromFile, $options) signature.
        if (!is_array($options)) {
            // phpcs:ignore (PHPCompatibility.FunctionUse.ArgumentFunctionsReportCurrentValue.NeedsInspection
            call_user_func_array(array($this, 'legacyConstruct'), func_get_args());
            return;
        }

        $this->html = $html;
        $this->options = array_merge($this->options, $options);
    }

    /**
     * Get the source HTML
     *
     * @return string
     */
    public function getHtml()
    {
        return $this->html;
    }

    /**
     * Set the source HTML
     *
     * Also marks the cached text as stale so the next getText() reconverts.
     *
     * @param string $html HTML source content
     */
    public function setHtml($html)
    {
        $this->html = $html;
        $this->converted = false;
    }

    /**
     * @deprecated Use setHtml() instead.
     *
     * @param string $html      HTML source content
     * @param bool   $from_file Must be falsy
     * @throws \InvalidArgumentException When $from_file is truthy
     */
    public function set_html($html, $from_file = false)
    {
        if ($from_file) {
            throw new \InvalidArgumentException("Argument from_file no longer supported");
        }

        return $this->setHtml($html);
    }

    /**
     * Returns the text, converted from HTML.
     *
     * The conversion runs lazily, only when the source has not been
     * converted since it was last set.
     *
     * @return string Plain text
     */
    public function getText()
    {
        if (!$this->converted) {
            $this->convert();
        }

        return $this->text;
    }

    /**
     * @deprecated Use getText() instead.
     */
    public function get_text()
    {
        return $this->getText();
    }

    /**
     * Prints the converted text.
     *
     * @deprecated
     */
    public function print_text()
    {
        print $this->getText();
    }

    /**
     * Alias of print_text().
     *
     * @deprecated
     */
    public function p()
    {
        return $this->print_text();
    }

    /**
     * Sets a base URL to handle relative links.
     *
     * @param string $baseurl
     */
    public function setBaseUrl($baseurl)
    {
        $this->baseurl = $baseurl;
    }

    /**
     * @deprecated Use setBaseUrl() instead.
     */
    public function set_base_url($baseurl)
    {
        return $this->setBaseUrl($baseurl);
    }

    /**
     * Runs the conversion with the mbstring internal encoding switched to
     * self::ENCODING, and restores the previous encoding afterwards.
     *
     * The internal encoding is process-wide state, so it is restored even
     * when the conversion throws; the exception is then re-thrown unchanged.
     */
    protected function convert()
    {
        $origEncoding = mb_internal_encoding();
        mb_internal_encoding(self::ENCODING);

        // Catch-and-rethrow instead of `finally` so the file still parses on
        // the PHP 5.3/5.4 versions the constructor explicitly caters for.
        try {
            $this->doConvert();
        } catch (\Exception $e) {
            mb_internal_encoding($origEncoding);
            throw $e;
        } catch (\Throwable $e) {
            // PHP 7+ engine errors (e.g. TypeError); never matches on PHP 5.
            mb_internal_encoding($origEncoding);
            throw $e;
        }

        mb_internal_encoding($origEncoding);
    }

    /**
     * Converts $this->html and stores the result in $this->text.
     *
     * When links were collected into $linkList (the 'table' link method),
     * a numbered "Links:" section is appended after the converted text.
     */
    protected function doConvert()
    {
        $this->linkList = array();

        if ($this->html === null) {
            $text = '';
        } else {
            $text = trim($this->html);
        }

        $this->converter($text);

        if ($this->linkList) {
            $text .= "\n\nLinks:\n------\n";
            foreach ($this->linkList as $i => $url) {
                $text .= '[' . ($i + 1) . '] ' . $url . "\n";
            }
        }

        $this->text = $text;

        $this->converted = true;
    }

    /**
     * Converts an HTML fragment to plain text in place.
     *
     * Also called recursively by convertBlockquotes() for blockquote bodies.
     *
     * @param string &$text HTML on input, plain text on output
     */
    protected function converter(&$text)
    {
        $this->convertBlockquotes($text);
        $this->convertPre($text);
        $text = preg_replace($this->search, $this->replace, $text);
        $text = preg_replace_callback($this->callbackSearch, array($this, 'pregCallback'), $text);
        $text = strip_tags($text);
        $text = preg_replace($this->entSearch, $this->entReplace, $text);
        $text = html_entity_decode($text, $this->htmlFuncFlags, self::ENCODING);

        // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
        $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);

        // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
        // This properly handles situation of "&amp;quot;" in input string
        $text = str_replace('|+|amp|+|', '&', $text);

        // Normalise empty lines
        $text = preg_replace("/\n\s+\n/", "\n\n", $text);
        $text = preg_replace("/[\n]{3,}/", "\n\n", $text);

        // preg_replace() returns null on failure; fall back to an empty string.
        if ($text === null) {
            $text = '';
        }
        // remove leading empty lines (can be produced by eg. P tag on the beginning)
        $text = ltrim($text, "\n");

        if ($this->options['width'] > 0) {
            $text = wordwrap($text, $this->options['width']);
        }
    }

    /**
     * Helper function called by preg_replace() on link replacement.
     *
     * Maintains an internal list of links to be displayed at the end of the
     * text, with numeric indices to the original point in the text they
     * appeared. Also makes an effort at identifying and handling absolute
     * and relative links.
     *
     * @param  string      $link         URL of the link
     * @param  string      $display      Part of the text to associate number with
     * @param  string|null $linkOverride Link method to use instead of the 'do_links' option
     * @return string
     */
    protected function buildlinkList($link, $display, $linkOverride = null)
    {
        $linkMethod = ($linkOverride) ? $linkOverride : $this->options['do_links'];
        if ($linkMethod == 'none') {
            return $display;
        }

        // Ignored link types
        if (preg_match('!^(javascript:|mailto:|#)!i', html_entity_decode($link, $this->htmlFuncFlags, self::ENCODING))) {
            return $display;
        }

        if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
            // Link carries its own scheme: use it as-is.
            $url = $link;
        } else {
            // Relative link: resolve against the base URL.
            $url = $this->baseurl;
            if (mb_substr($link, 0, 1) != '/') {
                $url .= '/';
            }
            $url .= $link;
        }

        if ($linkMethod == 'table') {
            // Strict search: URLs are strings and must match exactly, without
            // PHP's loose numeric-string comparison.
            if (($index = array_search($url, $this->linkList, true)) === false) {
                $index = count($this->linkList);
                $this->linkList[] = $url;
            }

            return $display . ' [' . ($index + 1) . ']';
        } elseif ($linkMethod == 'nextline') {
            if ($url === $display) {
                return $display;
            }
            return $display . "\n[" . $url . ']';
        } elseif ($linkMethod == 'bbcode') {
            return sprintf('[url=%s]%s[/url]', $url, $display);
        } else { // link_method defaults to inline
            if ($url === $display) {
                return $display;
            }
            return $display . ' [' . $url . ']';
        }
    }

    /**
     * Helper function for PRE body conversion.
     *
     * Replaces each PRE element, one per loop iteration, with a
     * <div><br>...<br></div> wrapper whose newlines, tabs and spaces have
     * been turned into <br> / &nbsp; markup.
     *
     * @param string &$text HTML content
     */
    protected function convertPre(&$text)
    {
        // get the content of PRE element
        while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
            // Replace br tags with newlines to prevent the search-and-replace callback from killing whitespace
            $this->preContent = preg_replace('/(<br\b[^>]*>)/i', "\n", $matches[1]);

            // Run our defined tags search-and-replace with callback
            $this->preContent = preg_replace_callback(
                $this->callbackSearch,
                array($this, 'pregCallback'),
                $this->preContent
            );

            // convert the content
            $this->preContent = sprintf(
                '<div><br>%s<br></div>',
                preg_replace($this->preSearch, $this->preReplace, $this->preContent)
            );

            // replace the content (use callback because content can contain $0 variable)
            $text = preg_replace_callback(
                '/<pre[^>]*>.*<\/pre>/ismU',
                array($this, 'pregPreCallback'),
                $text,
                1
            );

            // free memory
            $this->preContent = '';
        }
    }

    /**
     * Helper function for BLOCKQUOTE body conversion.
     *
     * Each outermost blockquote is converted on its own via converter(),
     * every resulting line is prefixed with "> ", and the result is put back
     * wrapped in a PRE element.
     *
     * @param string &$text HTML content
     */
    protected function convertBlockquotes(&$text)
    {
        if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
            $originalText = $text;
            $start = 0;
            $taglen = 0;
            $level = 0;
            // Accumulated difference in length between the original text (which the
            // match offsets refer to) and the progressively rewritten $text.
            $diff = 0;
            foreach ($matches[0] as $m) {
                // PREG_OFFSET_CAPTURE reports byte offsets; convert to a character offset.
                $m[1] = mb_strlen(substr($originalText, 0, $m[1]));
                if ($m[0][0] == '<' && $m[0][1] == '/') {
                    $level--;
                    if ($level < 0) {
                        $level = 0; // malformed HTML: go to next blockquote
                    } elseif ($level > 0) {
                        // skip inner blockquote
                    } else {
                        $end = $m[1];
                        $len = $end - $taglen - $start;
                        // Get blockquote content
                        $body = mb_substr($text, $start + $taglen - $diff, $len);

                        // Set text width: leave room for the "> " marker
                        $pWidth = $this->options['width'];
                        if ($this->options['width'] > 0) {
                            $this->options['width'] -= 2;
                        }
                        // Convert blockquote content
                        $body = trim($body);
                        $this->converter($body);
                        // Add citation markers and create PRE block
                        $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
                        $body = '<pre>' . htmlspecialchars($body, $this->htmlFuncFlags, self::ENCODING) . '</pre>';
                        // Re-set text width
                        $this->options['width'] = $pWidth;
                        // Replace content
                        $text = mb_substr($text, 0, $start - $diff)
                            . $body
                            . mb_substr($text, $end + mb_strlen($m[0]) - $diff);

                        $diff += $len + $taglen + mb_strlen($m[0]) - mb_strlen($body);
                        unset($body);
                    }
                } else {
                    if ($level == 0) {
                        // Remember where the outermost blockquote starts.
                        $start = $m[1];
                        $taglen = mb_strlen($m[0]);
                    }
                    $level++;
                }
            }
        }
    }

    /**
     * Callback function for preg_replace_callback use.
     *
     * Dispatches on the tag name captured in $matches[1] by the
     * $callbackSearch patterns.
     *
     * @param  array  $matches PREG matches
     * @return string Replacement text; empty string for an unrecognised tag
     */
    protected function pregCallback($matches)
    {
        switch (mb_strtolower($matches[1])) {
            case 'p':
                // Replace newlines with spaces.
                $para = str_replace("\n", " ", $matches[3]);

                // Trim trailing and leading whitespace within the tag.
                $para = trim($para);

                // Add trailing newlines for this para.
                return "\n" . $para . "\n";
            case 'br':
                return "\n";
            case 'b':
            case 'strong':
                return $this->toupper($matches[3]);
            case 'del':
                return $this->tostrike($matches[3]);
            case 'th':
                return $this->toupper("\t\t" . $matches[3] . "\n");
            case 'h':
                return $this->toupper("\n\n" . $matches[3] . "\n\n");
            case 'a':
                // override the link method
                $linkOverride = null;
                if (preg_match('/_html2text_link_(\w+)/', $matches[4], $linkOverrideMatch)) {
                    $linkOverride = $linkOverrideMatch[1];
                }
                // Remove spaces in URL (#1487805)
                $url = str_replace(' ', '', $matches[3]);

                return $this->buildlinkList($url, $matches[5], $linkOverride);
        }

        return '';
    }

    /**
     * Callback function for preg_replace_callback use in PRE content handler.
     *
     * Ignores the match and returns the content prepared by convertPre().
     *
     * @param  array  $matches PREG matches
     * @return string
     */
    protected function pregPreCallback(/** @noinspection PhpUnusedParameterInspection */ $matches)
    {
        return $this->preContent;
    }

    /**
     * Strtoupper function with HTML tags and entities handling.
     *
     * @param  string $str Text to convert
     * @return string Converted text
     */
    protected function toupper($str)
    {
        // string can contain HTML tags
        $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        // convert toupper only the text between HTML tags
        foreach ($chunks as $i => $chunk) {
            if ($chunk[0] != '<') {
                $chunks[$i] = $this->strtoupper($chunk);
            }
        }

        return implode($chunks);
    }

    /**
     * Strtoupper multibyte wrapper function with HTML entities handling.
     *
     * Entities are decoded before upper-casing and special characters are
     * re-encoded afterwards.
     *
     * @param  string $str Text to convert
     * @return string Converted text
     */
    protected function strtoupper($str)
    {
        $str = html_entity_decode($str, $this->htmlFuncFlags, self::ENCODING);
        $str = mb_strtoupper($str);
        $str = htmlspecialchars($str, $this->htmlFuncFlags, self::ENCODING);

        return $str;
    }

    /**
     * Helper function for DEL conversion.
     *
     * Appends U+0336 (combining long stroke overlay) after every character.
     *
     * @param  string $str HTML content
     * @return string Converted text
     */
    protected function tostrike($str)
    {
        // UTF-8 encoding of U+0336; same bytes the previous chr() arithmetic produced.
        $combiningChr = "\xCC\xB6";
        // Measure once instead of on every loop iteration.
        $length = mb_strlen($str);

        $rtn = '';
        for ($i = 0; $i < $length; $i++) {
            $rtn .= mb_substr($str, $i, 1) . $combiningChr;
        }
        return $rtn;
    }
}