Domhtml.php 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314
  1. <?php
  2. /**
  3. * Copyright 2010-2013 Horde LLC (http://www.horde.org/)
  4. *
  5. * See the enclosed file COPYING for license information (LGPL). If you
  6. * did not receive this file, see http://www.horde.org/licenses/lgpl21.
  7. *
  8. * @category Horde
  9. * @copyright 2010-2013 Horde LLC
  10. * @package Util
  11. * @license http://www.horde.org/licenses/lgpl21 LGPL 2.1
  12. */
  13. /**
  14. * Parse DOM data from HTML strings.
  15. *
  16. * @author Michael Slusarz <slusarz@horde.org>
  17. * @category Horde
  18. * @copyright 2010-2013 Horde LLC
  19. * @package Util
  20. * @license http://www.horde.org/licenses/lgpl21 LGPL 2.1
  21. */
  class Horde_Domhtml implements Iterator
  {
      /**
       * DOM object.
       *
       * @var DOMDocument
       */
      public $dom;

      /**
       * Iterator status.
       *
       * Holds the DOMDocument itself right after rewind(), a stack of
       * traversal frames (arrays with the keys 'child', 'i' and 'list')
       * while walking the tree, or null once iteration has finished.
       *
       * @var DOMDocument|array|null
       */
      protected $_iterator = null;

      /**
       * Original charset of data.
       *
       * @var string
       */
      protected $_origCharset;

      /**
       * Encoding tag added to beginning of output.
       *
       * @var string
       */
      protected $_xmlencoding = '';

      /**
       * Constructor.
       *
       * @param string $text     The text of the HTML document.
       * @param string $charset  The charset of the HTML document.
       *
       * @throws Exception
       */
      public function __construct($text, $charset = null)
      {
          if (!extension_loaded('dom')) {
              throw new Exception('DOM extension is not available.');
          }

          // Bug #9616: Make sure we have valid HTML input. Casting first
          // avoids the strlen(null) deprecation on PHP >= 8.1.
          $text = (string)$text;
          if ($text === '') {
              $text = '<html></html>';
          }

          $old_error = libxml_use_internal_errors(true);
          $doc = new DOMDocument();

          if (is_null($charset)) {
              /* If no charset given, charset is whatever libxml tells us the
               * encoding should be, defaulting to 'iso-8859-1'. */
              $doc->loadHTML($text);
              $this->_origCharset = $doc->encoding
                  ? $doc->encoding
                  : 'iso-8859-1';
          } else {
              /* Convert/try with UTF-8 first. */
              $this->_origCharset = Horde_String::lower($charset);
              $this->_xmlencoding = '<?xml encoding="UTF-8"?>';
              $doc->loadHTML($this->_xmlencoding . Horde_String::convertCharset($text, $charset, 'UTF-8'));

              if ($doc->encoding &&
                  (Horde_String::lower($doc->encoding) != 'utf-8')) {
                  /* Convert charset to what the HTML document says it SHOULD
                   * be. */
                  $doc->loadHTML(Horde_String::convertCharset($text, $charset, $doc->encoding));
                  $this->_xmlencoding = '';
              }
          }

          /* Restore the caller's libxml error handling exactly as we found
           * it. If buffering was only enabled by us, nobody will ever read
           * the parse errors collected above, so drop them instead of
           * letting them pile up in libxml's internal buffer. */
          if (!$old_error) {
              libxml_clear_errors();
          }
          libxml_use_internal_errors($old_error);

          $this->dom = $doc;

          /* Sanity checking: make sure we have the documentElement object. */
          if (!$this->dom->documentElement) {
              $this->dom->appendChild($this->dom->createElement('html'));
          }

          /* Remove old charset information. Walk the list backwards so that
           * removing a node does not shift the entries still to be visited. */
          $xpath = new DOMXPath($this->dom);
          $domlist = $xpath->query('/html/head/meta[@http-equiv="content-type"]');
          for ($i = $domlist->length; $i > 0; --$i) {
              $meta = $domlist->item($i - 1);
              $meta->parentNode->removeChild($meta);
          }
      }

      /**
       * Returns the HEAD element, or creates one if it doesn't exist.
       *
       * A newly created HEAD is inserted as the first child of the document
       * element.
       *
       * @return DOMElement  HEAD element.
       */
      public function getHead()
      {
          $head = $this->dom->getElementsByTagName('head');
          if ($head->length) {
              return $head->item(0);
          }

          $headelt = $this->dom->createElement('head');
          $this->dom->documentElement->insertBefore($headelt, $this->dom->documentElement->firstChild);

          return $headelt;
      }

      /**
       * Returns the BODY element, or creates one if it doesn't exist.
       *
       * A newly created BODY is appended as the last child of the document
       * element.
       *
       * @since 2.2.0
       *
       * @return DOMElement  BODY element.
       */
      public function getBody()
      {
          $body = $this->dom->getElementsByTagName('body');
          if ($body->length) {
              return $body->item(0);
          }

          $bodyelt = $this->dom->createElement('body');
          /* Attach the new element itself ($body is the empty node list). */
          $this->dom->documentElement->appendChild($bodyelt);

          return $bodyelt;
      }

      /**
       * Returns the full HTML text in the original charset.
       *
       * @param array $opts  Additional options: (since 2.1.0)
       *   - charset: (string) Return using this charset. If set but empty, will
       *              return as currently stored in the DOM object.
       *   - metacharset: (boolean) If true, will add a META tag containing the
       *                  charset information.
       *
       * @return string  HTML text.
       */
      public function returnHtml(array $opts = [])
      {
          $curr_charset = $this->getCharset();
          if (strcasecmp($curr_charset, 'US-ASCII') === 0) {
              $curr_charset = 'UTF-8';
          }
          $charset = array_key_exists('charset', $opts)
              ? (empty($opts['charset']) ? $curr_charset : $opts['charset'])
              : $this->_origCharset;

          if (empty($opts['metacharset'])) {
              $text = $this->dom->saveHTML();
          } else {
              /* Add placeholder for META tag. Can't add charset yet because DOM
               * extension will alter output if it exists. */
              $meta = $this->dom->createElement('meta');
              $meta->setAttribute('http-equiv', 'content-type');
              $meta->setAttribute('horde_dom_html_charset', '');

              $head = $this->getHead();
              $head->insertBefore($meta, $head->firstChild);

              $text = str_replace(
                  'horde_dom_html_charset=""',
                  'content="text/html; charset=' . $charset . '"',
                  $this->dom->saveHTML()
              );

              /* The META tag only belongs in the serialized output. */
              $head->removeChild($meta);
          }

          if (strcasecmp($curr_charset, $charset) !== 0) {
              $text = Horde_String::convertCharset($text, $curr_charset, $charset);
          }

          /* Strip the XML encoding tag the constructor prepended, if any. */
          if (!$this->_xmlencoding ||
              (($pos = strpos($text, $this->_xmlencoding)) === false)) {
              return $text;
          }

          return substr_replace($text, '', $pos, strlen($this->_xmlencoding));
      }

      /**
       * Returns the body text in the original charset.
       *
       * Only the children of the first BODY element are serialized; an empty
       * string is converted and returned if there is no BODY or it is empty.
       *
       * @return string  HTML text.
       */
      public function returnBody()
      {
          $body = $this->dom->getElementsByTagName('body')->item(0);
          $text = '';

          if ($body && $body->hasChildNodes()) {
              foreach ($body->childNodes as $child) {
                  $text .= $this->dom->saveXML($child);
              }
          }

          return Horde_String::convertCharset($text, 'UTF-8', $this->_origCharset);
      }

      /**
       * Get the charset of the DOM data.
       *
       * @since 2.1.0
       *
       * @return string  Charset of DOM data.
       */
      public function getCharset()
      {
          return $this->dom->encoding
              ? $this->dom->encoding
              : ($this->_xmlencoding ? 'UTF-8' : $this->_origCharset);
      }

      /* Iterator methods. */

      /**
       * Returns the node at the current iterator position.
       *
       * @return DOMNode  The document itself right after rewind(), otherwise
       *                  the node referenced by the innermost traversal frame.
       */
      #[\ReturnTypeWillChange]
      public function current()
      {
          if ($this->_iterator instanceof DOMDocument) {
              return $this->_iterator;
          }

          $curr = end($this->_iterator);
          return $curr['list']->item($curr['i']);
      }

      /**
       * Returns the iterator key.
       *
       * @return integer  Always 0.
       */
      #[\ReturnTypeWillChange]
      public function key()
      {
          return 0;
      }

      /**
       * Advances to the next node.
       *
       * Child lists are walked in the reverse direction. This allows
       * alteration of the original list without breaking things (foreach()
       * w/removeChild() may exit iteration after removal is complete).
       */
      #[\ReturnTypeWillChange]
      public function next()
      {
          if ($this->_iterator instanceof DOMDocument) {
              /* First step after rewind(): start descending from the
               * document node. */
              $this->_iterator = [];
              $curr = [];
              $node = $this->dom;
          } elseif (empty($this->_iterator)) {
              /* Frame stack exhausted: iteration is over. */
              $this->_iterator = null;
              return;
          } else {
              /* Reference the innermost frame so the updates below are
               * written back to the stack. */
              $curr = &$this->_iterator[count($this->_iterator) - 1];
              $node = $curr['list']->item($curr['i']);
          }

          if (empty($curr['child']) &&
              ($node instanceof DOMNode) &&
              $node->hasChildNodes()) {
              /* Descend: mark this node's children as visited and push a
               * frame positioned at its last child. */
              $curr['child'] = true;
              $this->_iterator[] = [
                  'child' => false,
                  'i' => $node->childNodes->length - 1,
                  'list' => $node->childNodes
              ];
          } elseif (--$curr['i'] < 0) {
              /* No siblings left at this level: return to the parent frame
               * and advance from there. */
              array_pop($this->_iterator);
              $this->next();
          } else {
              /* Moved to the previous sibling, whose children have not been
               * visited yet. */
              $curr['child'] = false;
          }
      }

      /**
       * Resets the iterator to the document node.
       */
      #[\ReturnTypeWillChange]
      public function rewind()
      {
          $this->_iterator = $this->dom;
      }

      /**
       * Reports whether the iterator currently points at a node.
       *
       * @return boolean  False once iteration has finished.
       */
      #[\ReturnTypeWillChange]
      public function valid()
      {
          return !is_null($this->_iterator);
      }
  }