String.php 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797
  1. <?php
  2. /**
  3. * Provides static methods for charset and locale safe string manipulation.
  4. *
  5. * Copyright 2003-2013 Horde LLC (http://www.horde.org/)
  6. *
  7. * See the enclosed file COPYING for license information (LGPL). If you
  8. * did not receive this file, see http://www.horde.org/licenses/lgpl21.
  9. *
  10. * @author Jan Schneider <jan@horde.org>
  11. * @category Horde
  12. * @license http://www.horde.org/licenses/lgpl21 LGPL 2.1
  13. * @package Util
  14. */
  class Horde_String
  {
      /**
       * lower() cache, keyed by the original string.
       *
       * @var array
       */
      protected static $_lowers = array();

      /**
       * upper() cache, keyed by the original string.
       *
       * @var array
       */
      protected static $_uppers = array();

      /**
       * Converts a string from one charset to another.
       *
       * Uses the iconv or the mbstring extensions.
       * The original string is returned if conversion failed or none
       * of the extensions were available.
       *
       * @param mixed $input    The data to be converted. If $input is an
       *                        array, the array's keys and values get
       *                        converted recursively. If it is an object, a
       *                        clone with converted public properties is
       *                        returned.
       * @param string $from    The string's current charset.
       * @param string $to      The charset to convert the string to.
       * @param boolean $force  Force conversion even if both charsets are
       *                        identical?
       *
       * @return mixed  The converted input data.
       */
      public static function convertCharset($input, $from, $to, $force = false)
      {
          /* Don't bother converting numbers. */
          if (is_numeric($input)) {
              return $input;
          }

          /* If the from and to character sets are identical, return now. */
          if (!$force && $from == $to) {
              return $input;
          }
          $from = self::lower($from);
          $to = self::lower($to);
          if (!$force && $from == $to) {
              return $input;
          }

          if (is_array($input)) {
              /* foreach instead of each(): each() was removed in PHP 8. */
              $tmp = array();
              foreach ($input as $key => $val) {
                  $tmp[self::_convertCharset($key, $from, $to)] = self::convertCharset($val, $from, $to, $force);
              }
              return $tmp;
          }

          if (is_object($input)) {
              // PEAR_Error/Exception objects are almost guaranteed to contain
              // recursion, which will cause a segfault in PHP. We should never
              // reach this line, but add a check. Throwable additionally
              // covers PHP 7+ Error objects, which cannot be cloned either.
              if (($input instanceof Exception) ||
                  ($input instanceof Throwable) ||
                  ($input instanceof PEAR_Error)) {
                  return '';
              }

              $input = clone $input;
              foreach (get_object_vars($input) as $key => $val) {
                  $input->$key = self::convertCharset($val, $from, $to, $force);
              }
              return $input;
          }

          if (!is_string($input)) {
              return $input;
          }

          return self::_convertCharset($input, $from, $to);
      }

      /**
       * Internal function used to do charset conversion.
       *
       * @param string $input  See self::convertCharset().
       * @param string $from   See self::convertCharset(). Expected to be
       *                       lowercase already.
       * @param string $to     See self::convertCharset(). Expected to be
       *                       lowercase already.
       *
       * @return string  The converted string, or $input if no backend could
       *                 convert it.
       */
      protected static function _convertCharset($input, $from, $to)
      {
          /* Use utf8_[en|de]code() if possible and if the string isn't too
           * large (less than 16 MB = 16 * 1024 * 1024 = 16777216 bytes) - these
           * functions use more memory.
           * NOTE(review): utf8_encode()/utf8_decode() are deprecated as of
           * PHP 8.2, and with a forced utf-8 -> utf-8 conversion this branch
           * re-encodes the input as if it were ISO-8859-1. Left unchanged
           * because callers may depend on it - confirm before touching. */
          if (Horde_Util::extensionExists('xml') &&
              ((strlen($input) < 16777216) ||
               !Horde_Util::extensionExists('iconv') ||
               !Horde_Util::extensionExists('mbstring'))) {
              if (($to == 'utf-8') &&
                  in_array($from, array('iso-8859-1', 'us-ascii', 'utf-8'), true)) {
                  return utf8_encode($input);
              }

              if (($from == 'utf-8') &&
                  in_array($to, array('iso-8859-1', 'us-ascii', 'utf-8'), true)) {
                  return utf8_decode($input);
              }
          }

          /* Try UTF7-IMAP conversions. */
          if (($from == 'utf7-imap') || ($to == 'utf7-imap')) {
              try {
                  if ($from == 'utf7-imap') {
                      return self::convertCharset(Horde_Imap_Client_Utf7imap::Utf7ImapToUtf8($input), 'UTF-8', $to);
                  } else {
                      if ($from == 'utf-8') {
                          $conv = $input;
                      } else {
                          $conv = self::convertCharset($input, $from, 'UTF-8');
                      }
                      return Horde_Imap_Client_Utf7imap::Utf8ToUtf7Imap($conv);
                  }
              } catch (Horde_Imap_Client_Exception $e) {
                  return $input;
              }
          }

          /* Try iconv with transliteration. Failure is detected through a
           * temporary error handler because $php_errormsg/track_errors no
           * longer exist in PHP 8; a false return is treated as failure
           * too. */
          if (Horde_Util::extensionExists('iconv')) {
              $res = self::_guardedCall('iconv', array($from, $to . '//TRANSLIT', $input));
              if ($res[0] && ($res[1] !== false)) {
                  return $res[1];
              }
          }

          /* Try mbstring. The guarded call also absorbs the ValueError that
           * PHP 8 throws for unknown encodings. */
          if (Horde_Util::extensionExists('mbstring')) {
              $res = self::_guardedCall(
                  'mb_convert_encoding',
                  array($input, self::_mbstringCharset($to), self::_mbstringCharset($from))
              );
              if (!empty($res[1])) {
                  return $res[1];
              }
          }

          return $input;
      }

      /**
       * Calls a PHP function while trapping any error it raises, so that the
       * caller can fall back to a different backend.
       *
       * PHP errors (other than deprecation notices) raised during the call
       * and thrown Error instances (e.g. PHP 8's ValueError) are both
       * reported as a failure instead of being propagated.
       *
       * @param callable $function  The function to call.
       * @param array $args         The positional arguments to pass.
       *
       * @return array  A two-element list: a boolean that is true if the call
       *                completed without error, and the function's return
       *                value (false if an Error was thrown).
       */
      protected static function _guardedCall($function, array $args)
      {
          $failed = false;
          set_error_handler(function ($errno) use (&$failed) {
              if (!($errno & (E_DEPRECATED | E_USER_DEPRECATED))) {
                  $failed = true;
              }
              /* Swallow the error, like the @ operator would. */
              return true;
          });

          try {
              $result = call_user_func_array($function, $args);
          } catch (Exception $e) {
              /* Not expected from the wrapped functions; make sure the
               * handler stack stays balanced and let it through. */
              restore_error_handler();
              throw $e;
          } catch (Error $e) {
              $failed = true;
              $result = false;
          }
          restore_error_handler();

          return array(!$failed, $result);
      }

      /**
       * Makes a string lowercase.
       *
       * @param string $string   The string to be converted.
       * @param boolean $locale  If true the string will be converted based on
       *                         a given charset, locale independent else.
       * @param string $charset  If $locale is true, the charset to use when
       *                         converting.
       *
       * @return string  The string with lowercase characters.
       * @throws InvalidArgumentException  If $locale is true, mbstring is
       *                                   available and $charset is null.
       */
      public static function lower($string, $locale = false, $charset = null)
      {
          if ($locale) {
              if (Horde_Util::extensionExists('mbstring')) {
                  if (is_null($charset)) {
                      throw new InvalidArgumentException('$charset argument must not be null');
                  }
                  $res = self::_guardedCall('mb_strtolower', array($string, self::_mbstringCharset($charset)));
                  if (!empty($res[1])) {
                      return $res[1];
                  }
              }
              return strtolower($string);
          }

          if (!isset(self::$_lowers[$string])) {
              /* Temporarily switch to the C locale so that the result does
               * not depend on the process locale; '0' only queries the
               * current setting. */
              $language = setlocale(LC_CTYPE, '0');
              setlocale(LC_CTYPE, 'C');
              self::$_lowers[$string] = strtolower($string);
              setlocale(LC_CTYPE, $language);
          }

          return self::$_lowers[$string];
      }

      /**
       * Makes a string uppercase.
       *
       * @param string $string   The string to be converted.
       * @param boolean $locale  If true the string will be converted based on
       *                         a given charset, locale independent else.
       * @param string $charset  If $locale is true, the charset to use when
       *                         converting.
       *
       * @return string  The string with uppercase characters.
       * @throws InvalidArgumentException  If $locale is true, mbstring is
       *                                   available and $charset is null.
       */
      public static function upper($string, $locale = false, $charset = null)
      {
          if ($locale) {
              if (Horde_Util::extensionExists('mbstring')) {
                  if (is_null($charset)) {
                      throw new InvalidArgumentException('$charset argument must not be null');
                  }
                  $res = self::_guardedCall('mb_strtoupper', array($string, self::_mbstringCharset($charset)));
                  if (!empty($res[1])) {
                      return $res[1];
                  }
              }
              return strtoupper($string);
          }

          if (!isset(self::$_uppers[$string])) {
              /* See lower() for the locale handling. */
              $language = setlocale(LC_CTYPE, '0');
              setlocale(LC_CTYPE, 'C');
              self::$_uppers[$string] = strtoupper($string);
              setlocale(LC_CTYPE, $language);
          }

          return self::$_uppers[$string];
      }

      /**
       * Returns a string with the first letter capitalized if it is
       * alphabetic.
       *
       * @param string $string   The string to be capitalized.
       * @param boolean $locale  If true the string will be converted based on
       *                         a given charset, locale independent else.
       * @param string $charset  The charset to use; required if $locale is
       *                         true.
       *
       * @return string  The capitalized string.
       * @throws InvalidArgumentException  If $locale is true and $charset is
       *                                   null.
       */
      public static function ucfirst($string, $locale = false, $charset = null)
      {
          if ($locale) {
              if (is_null($charset)) {
                  throw new InvalidArgumentException('$charset argument must not be null');
              }
              $first = self::substr($string, 0, 1, $charset);
              if (self::isAlpha($first, $charset)) {
                  $string = self::upper($first, true, $charset) . self::substr($string, 1, null, $charset);
              }
          } else {
              $string = self::upper(substr($string, 0, 1), false) . substr($string, 1);
          }

          return $string;
      }

      /**
       * Returns a string with the first letter of each word capitalized if it
       * is alphabetic.
       *
       * Sentences are split into words at whitespace.
       *
       * @param string $string   The string to be capitalized.
       * @param boolean $locale  If true the string will be converted based on
       *                         a given charset, locale independent else.
       * @param string $charset  The charset to use; required if $locale is
       *                         true.
       *
       * @return string  The capitalized string.
       */
      public static function ucwords($string, $locale = false, $charset = null)
      {
          /* Even indexes hold the words, odd indexes the captured
           * whitespace delimiters, which are passed through untouched. */
          $words = preg_split('/(\s+)/', $string, -1, PREG_SPLIT_DELIM_CAPTURE);
          for ($i = 0, $c = count($words); $i < $c; $i += 2) {
              $words[$i] = self::ucfirst($words[$i], $locale, $charset);
          }

          return implode('', $words);
      }

      /**
       * Returns part of a string.
       *
       * @param string $string   The string to be converted.
       * @param integer $start   The part's start position, zero based.
       * @param integer $length  The part's length. If null, everything from
       *                         $start on is returned.
       * @param string $charset  The charset to use when calculating the part's
       *                         position and length.
       *
       * @return string  The string's part.
       */
      public static function substr($string, $start, $length = null,
                                    $charset = 'UTF-8')
      {
          if (is_null($length)) {
              $length = self::length($string, $charset) - $start;
          }

          if ($length == 0) {
              return '';
          }

          /* Try mbstring. */
          if (Horde_Util::extensionExists('mbstring')) {
              $res = self::_guardedCall('mb_substr', array($string, $start, $length, self::_mbstringCharset($charset)));
              /* An empty result is treated as failure and handed to the
               * next backend. */
              if (is_string($res[1]) && strlen($res[1])) {
                  return $res[1];
              }
          }

          /* Try iconv. */
          if (Horde_Util::extensionExists('iconv')) {
              $res = self::_guardedCall('iconv_substr', array($string, $start, $length, $charset));
              /* iconv_substr() returns false on failure. */
              if ($res[1] !== false) {
                  return $res[1];
              }
          }

          return substr($string, $start, $length);
      }

      /**
       * Returns the character (not byte) length of a string.
       *
       * @param string $string   The string to return the length of.
       * @param string $charset  The charset to use when calculating the
       *                         string's length.
       *
       * @return integer  The string's length.
       */
      public static function length($string, $charset = 'UTF-8')
      {
          $charset = self::lower($charset);

          if ($charset == 'utf-8' || $charset == 'utf8') {
              /* Prefer mbstring; utf8_decode() is deprecated as of PHP 8.2
               * and is only kept as a fallback. */
              if (Horde_Util::extensionExists('mbstring')) {
                  $res = self::_guardedCall('mb_strlen', array($string, 'UTF-8'));
                  if ($res[0] && is_int($res[1])) {
                      return $res[1];
                  }
              }
              if (function_exists('utf8_decode')) {
                  return strlen(utf8_decode($string));
              }
              return strlen($string);
          }

          if (Horde_Util::extensionExists('mbstring')) {
              $res = self::_guardedCall('mb_strlen', array($string, self::_mbstringCharset($charset)));
              if (!empty($res[1])) {
                  return $res[1];
              }
          }

          return strlen($string);
      }

      /**
       * Returns the numeric position of the first occurrence of $needle
       * in the $haystack string.
       *
       * @param string $haystack  The string to search through.
       * @param string $needle    The string to search for.
       * @param integer $offset   Allows to specify which character in haystack
       *                          to start searching.
       * @param string $charset   The charset to use when searching for the
       *                          $needle string.
       *
       * @return integer|boolean  The position of the first occurrence, or
       *                          false if $needle was not found.
       */
      public static function pos($haystack, $needle, $offset = 0,
                                 $charset = 'UTF-8')
      {
          if (Horde_Util::extensionExists('mbstring')) {
              $res = self::_guardedCall('mb_strpos', array($haystack, $needle, $offset, self::_mbstringCharset($charset)));
              if ($res[0]) {
                  return $res[1];
              }
          }

          return strpos($haystack, $needle, $offset);
      }

      /**
       * Returns the numeric position of the last occurrence of $needle
       * in the $haystack string.
       *
       * @param string $haystack  The string to search through.
       * @param string $needle    The string to search for.
       * @param integer $offset   Allows to specify which character in haystack
       *                          to start searching.
       * @param string $charset   The charset to use when searching for the
       *                          $needle string.
       *
       * @return integer|boolean  The position of the last occurrence, or
       *                          false if $needle was not found.
       */
      public static function rpos($haystack, $needle, $offset = 0,
                                  $charset = 'UTF-8')
      {
          if (Horde_Util::extensionExists('mbstring')) {
              $res = self::_guardedCall('mb_strrpos', array($haystack, $needle, $offset, self::_mbstringCharset($charset)));
              if ($res[0]) {
                  return $res[1];
              }
          }

          return strrpos($haystack, $needle, $offset);
      }

      /**
       * Returns a string padded to a certain length with another string.
       * This method behaves like str_pad() but is multibyte safe.
       *
       * @param string $input    The string to be padded.
       * @param integer $length  The length of the resulting string.
       * @param string $pad      The string to pad the input string with. Must
       *                         be in the same charset like the input string.
       * @param const $type      The padding type. One of STR_PAD_LEFT,
       *                         STR_PAD_RIGHT, or STR_PAD_BOTH.
       * @param string $charset  The charset of the input and the padding
       *                         strings.
       *
       * @return string  The padded string.
       */
      public static function pad($input, $length, $pad = ' ',
                                 $type = STR_PAD_RIGHT, $charset = 'UTF-8')
      {
          $mb_length = self::length($input, $charset);
          $sb_length = strlen($input);
          $pad_length = self::length($pad, $charset);

          /* Return if we already have the length. */
          if ($mb_length >= $length) {
              return $input;
          }

          /* Shortcut for single byte strings. An empty pad string is
           * delegated as well, so that str_pad() reports it natively instead
           * of the code below dividing by zero. */
          if (($pad_length == 0) ||
              ($mb_length == $sb_length && $pad_length == strlen($pad))) {
              return str_pad($input, $length, $pad, $type);
          }

          $missing = $length - $mb_length;

          switch ($type) {
          case STR_PAD_LEFT:
              return self::_padding($pad, $missing, $pad_length, $charset) . $input;

          case STR_PAD_BOTH:
              /* Like str_pad(), the extra character of an odd difference
               * goes to the right side. */
              return self::_padding($pad, (int)floor($missing / 2), $pad_length, $charset) .
                  $input .
                  self::_padding($pad, (int)ceil($missing / 2), $pad_length, $charset);

          case STR_PAD_RIGHT:
              return $input . self::_padding($pad, $missing, $pad_length, $charset);

          default:
              /* Unknown padding type: let str_pad() report it. */
              return str_pad($input, $length, $pad, $type);
          }
      }

      /**
       * Builds a run of exactly $count characters by repeating $pad.
       *
       * @param string $pad          The (non-empty) padding string.
       * @param integer $count       The number of characters needed.
       * @param integer $pad_length  The character length of $pad.
       * @param string $charset      The charset of $pad.
       *
       * @return string  The padding run.
       */
      protected static function _padding($pad, $count, $pad_length, $charset)
      {
          return self::substr(
              str_repeat($pad, (int)ceil($count / $pad_length)),
              0,
              $count,
              $charset
          );
      }

      /**
       * Wraps the text of a message. The text is treated as UTF-8.
       *
       * @param string $string         String containing the text to wrap.
       * @param integer $width         Wrap the string at this number of
       *                               characters. Values below 1 are treated
       *                               as 1.
       * @param string $break          Character(s) to use when breaking lines.
       * @param boolean $cut           Whether to cut inside words if a line
       *                               can't be wrapped.
       * @param boolean $line_folding  Whether to apply line folding rules per
       *                               RFC 822 or similar. The correct break
       *                               characters including leading whitespace
       *                               have to be specified too.
       *
       * @return string  String containing the wrapped text.
       */
      public static function wordwrap($string, $width = 75, $break = "\n",
                                      $cut = false, $line_folding = false)
      {
          /* With a width below 1 no character would ever be consumed from
           * $string and the loop below would never terminate. */
          $width = max(1, (int)$width);

          $wrapped = '';

          while (self::length($string, 'UTF-8') > $width) {
              $line = self::substr($string, 0, $width, 'UTF-8');
              $string = self::substr($string, self::length($line, 'UTF-8'), null, 'UTF-8');

              // Make sure we didn't cut a word, unless we want hard breaks
              // anyway.
              if (!$cut && preg_match('/^(.+?)((\s|\r?\n).*)/us', $string, $match)) {
                  $line .= $match[1];
                  $string = $match[2];
              }

              // Wrap at existing line breaks.
              if (preg_match('/^(.*?)(\r?\n)(.*)$/su', $line, $match)) {
                  $wrapped .= $match[1] . $match[2];
                  $string = $match[3] . $string;
                  continue;
              }

              // Wrap at the first colon or semicolon followed by a whitespace
              // if doing line folding.
              if ($line_folding &&
                  preg_match('/^(.*?)(;|:)(\s+.*)$/u', $line, $match)) {
                  $wrapped .= $match[1] . $match[2] . $break;
                  $string = $match[3] . $string;
                  continue;
              }

              // Wrap at the last whitespace of $line.
              $sub = $line_folding
                  ? '(.+[^\s])'
                  : '(.*)';

              if (preg_match('/^' . $sub . '(\s+)(.*)$/u', $line, $match)) {
                  $wrapped .= $match[1] . $break;
                  $string = ($line_folding ? $match[2] : '') . $match[3] . $string;
                  continue;
              }

              // Hard wrap if necessary.
              if ($cut) {
                  $wrapped .= $line . $break;
                  continue;
              }

              $wrapped .= $line;
          }

          return $wrapped . $string;
      }

      /**
       * Wraps the text of a message, line by line.
       *
       * @param string $text        String containing the text to wrap.
       * @param integer $length     Wrap $text at this number of characters.
       * @param string $break_char  Character(s) to use when breaking lines.
       * @param boolean $quote      Ignore lines that are wrapped with the '>'
       *                            character (RFC 2646)? If true, we don't
       *                            remove any padding whitespace at the end of
       *                            those lines.
       *
       * @return string  String containing the wrapped text.
       */
      public static function wrap($text, $length = 80, $break_char = "\n",
                                  $quote = false)
      {
          $paragraphs = array();

          foreach (preg_split('/\r?\n/', $text) as $input) {
              if ($quote && (strpos($input, '>') === 0)) {
                  $line = $input;
              } else {
                  /* We need to handle the Usenet-style signature line
                   * separately; since the space after the two dashes is
                   * REQUIRED, we don't want to trim the line. */
                  if ($input != '-- ') {
                      $input = rtrim($input);
                  }
                  $line = self::wordwrap($input, $length, $break_char);
              }

              $paragraphs[] = $line;
          }

          return implode($break_char, $paragraphs);
      }

      /**
       * Return a truncated string, suitable for notifications.
       *
       * @param string $text     The original string (UTF-8).
       * @param integer $length  The maximum length, including the trailing
       *                         '...'.
       *
       * @return string  The truncated string, if longer than $length.
       */
      public static function truncate($text, $length = 100)
      {
          return (self::length($text) > $length)
              ? rtrim(self::substr($text, 0, $length - 3)) . '...'
              : $text;
      }

      /**
       * Return an abbreviated string, with characters in the middle of the
       * excessively long string replaced by '...'.
       *
       * @param string $text     The original string (UTF-8).
       * @param integer $length  The length at which to abbreviate.
       *
       * @return string  The abbreviated string, if longer than $length.
       */
      public static function abbreviate($text, $length = 20)
      {
          if (self::length($text) <= $length) {
              return $text;
          }

          /* Explicit integer casts: a fractional float is no longer
           * accepted silently as a string offset (PHP 8.1). The head is
           * rounded, the tail truncated, so together they fill the
           * $length - 3 characters next to the '...'. */
          $half = ($length - 3) / 2;
          $head = (int)round($half);
          $tail = -1 * (int)$half;

          return rtrim(self::substr($text, 0, $head)) . '...' . ltrim(self::substr($text, $tail));
      }

      /**
       * Returns the common leading part of two strings.
       *
       * The comparison is done byte by byte.
       *
       * @param string $str1  A string.
       * @param string $str2  Another string.
       *
       * @return string  The start of $str1 and $str2 that is identical in
       *                 both.
       */
      public static function common($str1, $str2)
      {
          $i = 0;
          while (isset($str1[$i]) && isset($str2[$i]) && $str1[$i] == $str2[$i]) {
              ++$i;
          }

          return ($i > 0) ? substr($str1, 0, $i) : '';
      }

      /**
       * Returns true if every character in the parameter is an alphabetic
       * character.
       *
       * @param string $string   The string to test.
       * @param string $charset  The charset to use when testing the string.
       *
       * @return boolean  True if the parameter was alphabetic only.
       */
      public static function isAlpha($string, $charset)
      {
          if (!Horde_Util::extensionExists('mbstring')) {
              return ctype_alpha($string);
          }

          $charset = self::_mbstringCharset($charset);
          $old_charset = mb_regex_encoding();
          $switch = ($charset != $old_charset);

          if ($switch) {
              self::_guardedCall('mb_regex_encoding', array($charset));
          }

          /* mb_ereg() searches the whole string for a non-alphabetic
           * character; mb_ereg_match() would only look at the very start of
           * the string. */
          $res = self::_guardedCall('mb_ereg', array('[^[:alpha:]]', $string));
          $alpha = !$res[1];

          if ($switch) {
              self::_guardedCall('mb_regex_encoding', array($old_charset));
          }

          return $alpha;
      }

      /**
       * Returns true if every character in the parameter is a lowercase
       * letter.
       *
       * @param string $string   The string to test.
       * @param string $charset  The charset to use when testing the string.
       *
       * @return boolean  True if the parameter was lowercase.
       */
      public static function isLower($string, $charset)
      {
          return ((self::lower($string, true, $charset) === $string) &&
                  self::isAlpha($string, $charset));
      }

      /**
       * Returns true if every character in the parameter is an uppercase
       * letter.
       *
       * @param string $string   The string to test.
       * @param string $charset  The charset to use when testing the string.
       *
       * @return boolean  True if the parameter was uppercase.
       */
      public static function isUpper($string, $charset)
      {
          return ((self::upper($string, true, $charset) === $string) &&
                  self::isAlpha($string, $charset));
      }

      /**
       * Performs a multibyte safe regex match search on the text provided.
       *
       * @param string $text     The text to search.
       * @param array $regex     The regular expressions to use, without perl
       *                         regex delimiters (e.g. '/' or '|'). A single
       *                         pattern string is accepted too.
       * @param string $charset  The character set of the text.
       *
       * @return array  The matches array from the first regex that matches;
       *                empty if none matched.
       */
      public static function regexMatch($text, $regex, $charset = null)
      {
          if (!empty($charset)) {
              $regex = self::convertCharset($regex, $charset, 'utf-8');
              $text = self::convertCharset($text, $charset, 'utf-8');
          }

          $matches = array();
          foreach ((array)$regex as $val) {
              if (preg_match('/' . $val . '/u', $text, $matches)) {
                  break;
              }
          }

          if (!empty($charset)) {
              $matches = self::convertCharset($matches, 'utf-8', $charset);
          }

          return $matches;
      }

      /**
       * Check to see if a string is valid UTF-8.
       *
       * Only the lead byte / continuation byte structure is checked.
       *
       * @param string $text  The text to check.
       *
       * @return boolean  True if valid UTF-8.
       */
      public static function validUtf8($text)
      {
          $text = strval($text);

          for ($i = 0, $len = strlen($text); $i < $len; ++$i) {
              $c = ord($text[$i]);

              /* 0x80 itself is a continuation byte and must not start a
               * character, hence >= instead of >. */
              if ($c >= 128) {
                  if ($c > 247) {
                      // STD 63 (RFC 3629) eliminates 5 & 6-byte characters.
                      return false;
                  } elseif ($c > 239) {
                      $j = 3;
                  } elseif ($c > 223) {
                      $j = 2;
                  } elseif ($c > 191) {
                      $j = 1;
                  } else {
                      return false;
                  }

                  /* The last continuation byte sits at offset $i + $j, which
                   * has to be inside the string. */
                  if (($i + $j) >= $len) {
                      return false;
                  }

                  do {
                      $c = ord($text[++$i]);
                      if (($c < 128) || ($c > 191)) {
                          return false;
                      }
                  } while (--$j);
              }
          }

          return true;
      }

      /**
       * Workaround charsets that don't work with mbstring functions.
       *
       * @param string $charset  The original charset.
       *
       * @return string  The charset to use with mbstring functions.
       */
      protected static function _mbstringCharset($charset)
      {
          /* mbstring functions do not handle the 'ks_c_5601-1987' &
           * 'ks_c_5601-1989' charsets. However, these charsets are used, for
           * example, by various versions of Outlook to send Korean characters.
           * Use UHC (CP949) encoding instead. See, e.g.,
           * http://lists.w3.org/Archives/Public/ietf-charsets/2001AprJun/0030.html */
          return in_array(self::lower($charset), array('ks_c_5601-1987', 'ks_c_5601-1989'), true)
              ? 'UHC'
              : $charset;
      }

      /**
       * Strip UTF-8 byte order mark (BOM) from string data.
       *
       * @param string $str  Input string (UTF-8).
       *
       * @return string  Stripped string (UTF-8).
       */
      public static function trimUtf8Bom($str)
      {
          return (substr($str, 0, 3) === "\xEF\xBB\xBF")
              ? substr($str, 3)
              : $str;
      }
  }