<?php
/**
 * Copyright 2010-2013 Horde LLC (http://www.horde.org/)
 *
 * See the enclosed file COPYING for license information (LGPL). If you
 * did not receive this file, see http://www.horde.org/licenses/lgpl21.
 *
 * @category  Horde
 * @copyright 2010-2013 Horde LLC
 * @package   Util
 * @license   http://www.horde.org/licenses/lgpl21 LGPL 2.1
 */
/**
 * Parse DOM data from HTML strings.
 *
 * Wraps a DOMDocument built from HTML text, remembers the charset the text
 * was supplied in, and can be traversed with foreach() to visit every node
 * of the document.
 *
 * @author    Michael Slusarz <slusarz@horde.org>
 * @category  Horde
 * @copyright 2010-2013 Horde LLC
 * @package   Util
 * @license   http://www.horde.org/licenses/lgpl21 LGPL 2.1
 */
class Horde_Domhtml implements Iterator
{
    /**
     * DOM object.
     *
     * @var DOMDocument
     */
    public $dom;

    /**
     * Iterator status.
     *
     * Null when iteration is not active/finished, the DOMDocument itself
     * right after rewind(), otherwise a stack of array entries with the keys
     * 'child' (boolean), 'i' (integer index) and 'list' (DOMNodeList).
     *
     * @var DOMDocument|array|null
     */
    protected $_iterator = null;

    /**
     * Original charset of data.
     *
     * @var string
     */
    protected $_origCharset;

    /**
     * Encoding tag added to beginning of output.
     *
     * @var string
     */
    protected $_xmlencoding = '';

    /**
     * Constructor.
     *
     * @param string $text     The text of the HTML document.
     * @param string $charset  The charset of the HTML document.
     *
     * @throws Exception  If the DOM extension is not loaded.
     */
    public function __construct($text, $charset = null)
    {
        if (!extension_loaded('dom')) {
            throw new Exception('DOM extension is not available.');
        }

        // Bug #9616: Make sure we have valid HTML input. The cast keeps the
        // emptiness check working for null without a strlen() deprecation.
        $text = (string) $text;
        if ($text === '') {
            $text = '<html></html>';
        }

        /* Buffer libxml parse errors internally while loading; remember the
         * previous setting so it can be restored afterwards. */
        $old_error = libxml_use_internal_errors(true);
        $doc = new DOMDocument();

        if (is_null($charset)) {
            /* If no charset given, charset is whatever libxml tells us the
             * encoding should be defaulting to 'iso-8859-1'. */
            $doc->loadHTML($text);
            $this->_origCharset = $doc->encoding
                ? $doc->encoding
                : 'iso-8859-1';
        } else {
            /* Convert/try with UTF-8 first. */
            $this->_origCharset = Horde_String::lower($charset);
            $this->_xmlencoding = '<?xml encoding="UTF-8"?>';
            $doc->loadHTML($this->_xmlencoding . Horde_String::convertCharset($text, $charset, 'UTF-8'));

            if ($doc->encoding &&
                (Horde_String::lower($doc->encoding) != 'utf-8')) {
                /* Convert charset to what the HTML document says it SHOULD
                 * be. */
                $doc->loadHTML(Horde_String::convertCharset($text, $charset, $doc->encoding));
                $this->_xmlencoding = '';
            }
        }

        /* Restore the caller's libxml error handling exactly as it was. If
         * buffering was only enabled by us, drop the errors collected while
         * parsing so they do not pile up; if the caller already had
         * buffering enabled, leave the buffer untouched. */
        if (!$old_error) {
            libxml_clear_errors();
        }
        libxml_use_internal_errors($old_error);

        $this->dom = $doc;

        /* Sanity checking: make sure we have the documentElement object. */
        if (!$this->dom->documentElement) {
            $this->dom->appendChild($this->dom->createElement('html'));
        }

        /* Remove old charset information. Walk the list backwards because
         * nodes are being removed from the document while looping. */
        $xpath = new DOMXPath($this->dom);
        $domlist = $xpath->query('/html/head/meta[@http-equiv="content-type"]');
        for ($i = $domlist->length; $i > 0; --$i) {
            $meta = $domlist->item($i - 1);
            $meta->parentNode->removeChild($meta);
        }
    }

    /**
     * Returns the HEAD element, or creates one if it doesn't exist.
     *
     * A newly created element is inserted as the first child of the
     * document element.
     *
     * @return DOMElement  HEAD element.
     */
    public function getHead()
    {
        $head = $this->dom->getElementsByTagName('head');
        if ($head->length) {
            return $head->item(0);
        }

        $headelt = $this->dom->createElement('head');
        $this->dom->documentElement->insertBefore($headelt, $this->dom->documentElement->firstChild);

        return $headelt;
    }

    /**
     * Returns the BODY element, or creates one if it doesn't exist.
     *
     * A newly created element is appended as the last child of the
     * document element.
     *
     * @since 2.2.0
     *
     * @return DOMElement  BODY element.
     */
    public function getBody()
    {
        $body = $this->dom->getElementsByTagName('body');
        if ($body->length) {
            return $body->item(0);
        }

        /* Attach the newly created element itself (not the empty node list
         * from the lookup above) so the returned node is part of the
         * document. */
        $bodyelt = $this->dom->createElement('body');
        $this->dom->documentElement->appendChild($bodyelt);

        return $bodyelt;
    }

    /**
     * Returns the full HTML text in the original charset.
     *
     * @param array $opts  Additional options: (since 2.1.0)
     *   - charset: (string) Return using this charset. If set but empty, will
     *              return as currently stored in the DOM object.
     *   - metacharset: (boolean) If true, will add a META tag containing the
     *                  charset information.
     *
     * @return string  HTML text.
     */
    public function returnHtml(array $opts = [])
    {
        $curr_charset = $this->getCharset();
        if (strcasecmp($curr_charset, 'US-ASCII') === 0) {
            $curr_charset = 'UTF-8';
        }

        $charset = array_key_exists('charset', $opts)
            ? (empty($opts['charset']) ? $curr_charset : $opts['charset'])
            : $this->_origCharset;

        if (empty($opts['metacharset'])) {
            $text = $this->dom->saveHTML();
        } else {
            /* Add placeholder for META tag. Can't add charset yet because DOM
             * extension will alter output if it exists. */
            $meta = $this->dom->createElement('meta');
            $meta->setAttribute('http-equiv', 'content-type');
            $meta->setAttribute('horde_dom_html_charset', '');

            $head = $this->getHead();
            $head->insertBefore($meta, $head->firstChild);

            $text = str_replace(
                'horde_dom_html_charset=""',
                'content="text/html; charset=' . $charset . '"',
                $this->dom->saveHTML()
            );

            /* The META element was only needed for serialization. */
            $head->removeChild($meta);
        }

        if (strcasecmp($curr_charset, $charset) !== 0) {
            $text = Horde_String::convertCharset($text, $curr_charset, $charset);
        }

        /* Strip the encoding tag the constructor may have prepended. */
        if (!$this->_xmlencoding ||
            (($pos = strpos($text, $this->_xmlencoding)) === false)) {
            return $text;
        }

        return substr_replace($text, '', $pos, strlen($this->_xmlencoding));
    }

    /**
     * Returns the body text in the original charset.
     *
     * Serializes each child of the first BODY element and converts the
     * result from UTF-8 to the original charset. Returns the conversion of
     * an empty string if there is no BODY element or it has no children.
     *
     * @return string  HTML text.
     */
    public function returnBody()
    {
        $body = $this->dom->getElementsByTagName('body')->item(0);
        $text = '';

        if ($body && $body->hasChildNodes()) {
            foreach ($body->childNodes as $child) {
                $text .= $this->dom->saveXML($child);
            }
        }

        return Horde_String::convertCharset($text, 'UTF-8', $this->_origCharset);
    }

    /**
     * Get the charset of the DOM data.
     *
     * Uses the encoding reported by the DOM object if there is one;
     * otherwise 'UTF-8' if the encoding tag was added on load, or else the
     * original charset.
     *
     * @since 2.1.0
     *
     * @return string  Charset of DOM data.
     */
    public function getCharset()
    {
        return $this->dom->encoding
            ? $this->dom->encoding
            : ($this->_xmlencoding ? 'UTF-8' : $this->_origCharset);
    }

    /* Iterator methods. */

    /**
     * Returns the node at the current iterator position.
     *
     * @return DOMNode  The DOMDocument itself directly after rewind(),
     *                  otherwise the node referenced by the top entry of
     *                  the iterator stack.
     */
    #[\ReturnTypeWillChange]
    public function current()
    {
        if ($this->_iterator instanceof DOMDocument) {
            return $this->_iterator;
        }

        $curr = end($this->_iterator);
        return $curr['list']->item($curr['i']);
    }

    /**
     * Returns the iterator key. No per-node key is tracked.
     *
     * @return integer  Always 0.
     */
    #[\ReturnTypeWillChange]
    public function key()
    {
        return 0;
    }

    /**
     * Advances to the next node: descends into the children of the current
     * node first, otherwise moves to the previous sibling, otherwise pops
     * back up to the parent level.
     */
    #[\ReturnTypeWillChange]
    public function next()
    {
        /* Iterate in the reverse direction through the node list. This allows
         * alteration of the original list without breaking things (foreach()
         * w/removeChild() may exit iteration after removal is complete. */
        if ($this->_iterator instanceof DOMDocument) {
            /* First step after rewind(): start the stack at the document. */
            $this->_iterator = [];
            $curr = [];
            $node = $this->dom;
        } elseif (empty($this->_iterator)) {
            /* Stack exhausted: iteration is finished. */
            $this->_iterator = null;
            return;
        } else {
            /* Reference, so the flag/index updates below are stored in the
             * stack entry itself. */
            $curr = &$this->_iterator[count($this->_iterator) - 1];
            $node = $curr['list']->item($curr['i']);
        }

        if (empty($curr['child']) &&
            ($node instanceof DOMNode) &&
            $node->hasChildNodes()) {
            /* Children not visited yet: push them, starting at the last. */
            $curr['child'] = true;
            $this->_iterator[] = [
                'child' => false,
                'i' => $node->childNodes->length - 1,
                'list' => $node->childNodes
            ];
        } elseif (--$curr['i'] < 0) {
            /* No siblings left on this level: go back up and continue. */
            array_pop($this->_iterator);
            $this->next();
        } else {
            $curr['child'] = false;
        }
    }

    /**
     * Resets iteration so that the DOMDocument is the current element.
     */
    #[\ReturnTypeWillChange]
    public function rewind()
    {
        $this->_iterator = $this->dom;
    }

    /**
     * Reports whether the iterator currently points at a node.
     *
     * @return boolean  False once iteration has finished (or never started).
     */
    #[\ReturnTypeWillChange]
    public function valid()
    {
        return !is_null($this->_iterator);
    }
}